diff options
author | Junio C Hamano <junkio@cox.net> | 2006-04-27 12:48:26 -0700 |
---|---|---|
committer | Junio C Hamano <junkio@cox.net> | 2006-04-27 12:48:26 -0700 |
commit | b8c59e6ad30697e7313b28ec0f7c7c9f1e4c7b5e (patch) | |
tree | 3ddf449c06c3da3ab102f7747e365b574e77db94 /gsimm.c | |
parent | Merge branch 'master' into next (diff) | |
download | tgif-b8c59e6ad30697e7313b28ec0f7c7c9f1e4c7b5e.tar.xz |
Retire rabinpoly fingerprinting code
For now let's retire this and reintroduce it as part of the updated
pack-objects series from Geert when it is ready.
Signed-off-by: Junio C Hamano <junkio@cox.net>
Diffstat (limited to 'gsimm.c')
-rw-r--r-- | gsimm.c | 94 |
1 files changed, 0 insertions, 94 deletions
diff --git a/gsimm.c b/gsimm.c deleted file mode 100644 index 7024bf8f58..0000000000 --- a/gsimm.c +++ /dev/null @@ -1,94 +0,0 @@ -#include "rabinpoly.h" -#include "gsimm.h" - -/* Has to be power of two. Since the Rabin hash only has 63 - usable bits, the number of hashes is limited to 32. - Lower powers of two could be used for speeding up processing - of very large files. */ -#define NUM_HASHES_PER_CHAR 32 - -/* Size of cache used to eliminate duplicate substrings. - Make small enough to comfortably fit in L1 cache. */ -#define DUP_CACHE_SIZE 256 - -/* For the final counting, do not count each bit individually, but - group them. Must be power of two, at most NUM_HASHES_PER_CHAR. - However, larger sizes result in higher cache usage. Use 8 bits - per group for efficient processing of large files on fast machines - with decent caches, or 4 bits for faster processing of small files - and for machines with small caches. */ -#define GROUP_BITS 4 -#define GROUP_COUNTERS (1<<GROUP_BITS) - -static void freq_to_md(u_char *md, int *freq) -{ int j, k; - - for (j = 0; j < MD_LENGTH; j++) - { u_char ch = 0; - - for (k = 0; k < 8; k++) ch = 2*ch + (freq[8*j+k] > 0); - md[j] = ch; - } - bzero (freq, sizeof(freq[0]) * MD_BITS); -} - -void gb_simm_process(u_char *data, unsigned len, u_char *md) -{ size_t j = 0; - u_int32_t ofs; - u_int32_t dup_cache[DUP_CACHE_SIZE]; - u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)]; - int freq[MD_BITS]; - - bzero (freq, sizeof(freq[0]) * MD_BITS); - bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t)); - bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t))); - - /* Ignore incomplete substrings */ - while (j < len && j < RABIN_WINDOW_SIZE) rabin_slide8 (data[j++]); - - while (j < len) - { u_int64_t hash; - u_int32_t ofs, sum; - u_char idx; - int k; - - hash = rabin_slide8 (data[j++]); - - /* In order to update a much larger frequency table - with only 32 bits of checksum, randomly select a - part of the table to update. The selection should - only depend on the content of the represented data, - and be independent of the bits used for the update. - - Instead of updating 32 individual counters, process - the checksum in MD_BITS / GROUP_BITS groups of - GROUP_BITS bits, and count the frequency of each bit pattern. - */ - - idx = (hash >> 32); - sum = (u_int32_t) hash; - ofs = idx % (MD_BITS / NUM_HASHES_PER_CHAR) * NUM_HASHES_PER_CHAR; - idx %= DUP_CACHE_SIZE; - if (dup_cache[idx] != sum) - { dup_cache[idx] = sum; - for (k = 0; k < NUM_HASHES_PER_CHAR / GROUP_BITS; k++) - { count[ofs * GROUP_COUNTERS / GROUP_BITS + (sum % GROUP_COUNTERS)]++; - ofs += GROUP_BITS; - sum >>= GROUP_BITS; - } } } - - /* Distribute the occurrences of each bit group over the frequency table. */ - for (ofs = 0; ofs < MD_BITS; ofs += GROUP_BITS) - { int j; - for (j = 0; j < GROUP_COUNTERS; j++) - { int k; - for (k = 0; k < GROUP_BITS; k++) - { freq[ofs + k] += ((1<<k) & j) - ? count[ofs * GROUP_COUNTERS / GROUP_BITS + j] - : -count[ofs * GROUP_COUNTERS / GROUP_BITS + j]; - } } } - - if (md) - { rabin_reset(); - freq_to_md (md, freq); -} } |