author    Junio C Hamano <junkio@cox.net>    2006-04-27 12:48:26 -0700
committer Junio C Hamano <junkio@cox.net>    2006-04-27 12:48:26 -0700
commit    b8c59e6ad30697e7313b28ec0f7c7c9f1e4c7b5e (patch)
tree      3ddf449c06c3da3ab102f7747e365b574e77db94 /gsimm.c
parent    Merge branch 'master' into next (diff)
Retire rabinpoly fingerprinting code
For now let's retire this and reintroduce it as part of the updated pack-objects series from Geert when it is ready.

Signed-off-by: Junio C Hamano <junkio@cox.net>
Diffstat (limited to 'gsimm.c')
-rw-r--r--    gsimm.c    94
1 file changed, 0 insertions, 94 deletions
diff --git a/gsimm.c b/gsimm.c
deleted file mode 100644
index 7024bf8f58..0000000000
--- a/gsimm.c
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "rabinpoly.h"
-#include "gsimm.h"
-
-/* Has to be power of two. Since the Rabin hash only has 63
- usable bits, the number of hashes is limited to 32.
- Lower powers of two could be used for speeding up processing
- of very large files. */
-#define NUM_HASHES_PER_CHAR 32
-
-/* Size of cache used to eliminate duplicate substrings.
- Make small enough to comfortably fit in L1 cache. */
-#define DUP_CACHE_SIZE 256
-
-/* For the final counting, do not count each bit individually, but
- group them. Must be power of two, at most NUM_HASHES_PER_CHAR.
- However, larger sizes result in higher cache usage. Use 8 bits
- per group for efficient processing of large files on fast machines
- with decent caches, or 4 bits for faster processing of small files
- and for machines with small caches. */
-#define GROUP_BITS 4
-#define GROUP_COUNTERS (1<<GROUP_BITS)
-
-static void freq_to_md(u_char *md, int *freq)
-{ int j, k;
-
- for (j = 0; j < MD_LENGTH; j++)
- { u_char ch = 0;
-
- for (k = 0; k < 8; k++) ch = 2*ch + (freq[8*j+k] > 0);
- md[j] = ch;
- }
- bzero (freq, sizeof(freq[0]) * MD_BITS);
-}
-
-void gb_simm_process(u_char *data, unsigned len, u_char *md)
-{ size_t j = 0;
- u_int32_t ofs;
- u_int32_t dup_cache[DUP_CACHE_SIZE];
- u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
- int freq[MD_BITS];
-
- bzero (freq, sizeof(freq[0]) * MD_BITS);
- bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
- bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));
-
- /* Ignore incomplete substrings */
- while (j < len && j < RABIN_WINDOW_SIZE) rabin_slide8 (data[j++]);
-
- while (j < len)
- { u_int64_t hash;
- u_int32_t ofs, sum;
- u_char idx;
- int k;
-
- hash = rabin_slide8 (data[j++]);
-
- /* In order to update a much larger frequency table
- with only 32 bits of checksum, randomly select a
- part of the table to update. The selection should
- only depend on the content of the represented data,
- and be independent of the bits used for the update.
-
- Instead of updating 32 individual counters, process
- the checksum in MD_BITS / GROUP_BITS groups of
- GROUP_BITS bits, and count the frequency of each bit pattern.
- */
-
- idx = (hash >> 32);
- sum = (u_int32_t) hash;
- ofs = idx % (MD_BITS / NUM_HASHES_PER_CHAR) * NUM_HASHES_PER_CHAR;
- idx %= DUP_CACHE_SIZE;
- if (dup_cache[idx] != sum)
- { dup_cache[idx] = sum;
- for (k = 0; k < NUM_HASHES_PER_CHAR / GROUP_BITS; k++)
- { count[ofs * GROUP_COUNTERS / GROUP_BITS + (sum % GROUP_COUNTERS)]++;
- ofs += GROUP_BITS;
- sum >>= GROUP_BITS;
- } } }
-
- /* Distribute the occurrences of each bit group over the frequency table. */
- for (ofs = 0; ofs < MD_BITS; ofs += GROUP_BITS)
- { int j;
- for (j = 0; j < GROUP_COUNTERS; j++)
- { int k;
- for (k = 0; k < GROUP_BITS; k++)
- { freq[ofs + k] += ((1<<k) & j)
- ? count[ofs * GROUP_COUNTERS / GROUP_BITS + j]
- : -count[ofs * GROUP_COUNTERS / GROUP_BITS + j];
- } } }
-
- if (md)
- { rabin_reset();
- freq_to_md (md, freq);
-} }
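
The file removed above exposed a single entry point, gb_simm_process(data, len, md), which folds a buffer into an MD_LENGTH-byte similarity fingerprint whose bits record which Rabin-hash bit frequencies came out positive. As a rough picture of how such a fingerprint could be consumed, here is a minimal, hypothetical caller sketch; it is not part of the retired code. It assumes gsimm.h declares gb_simm_process(), MD_LENGTH and MD_BITS (all referenced in the deleted file), that u_char comes from <sys/types.h>, and that comparing two fingerprints by Hamming distance is a sensible similarity measure; md_distance() and the toy buffers are invented for illustration.

/* Hypothetical caller for the retired gb_simm_process() interface.
 * Not part of this commit; a sketch only. */
#include <stdio.h>
#include <sys/types.h>
#include "gsimm.h"

/* Count differing bits between two fingerprints; fewer differing
 * bits suggests more similar inputs (assumed comparison, not taken
 * from the deleted code). */
static unsigned md_distance(const u_char *a, const u_char *b)
{
	unsigned d = 0;
	int i, k;

	for (i = 0; i < MD_LENGTH; i++)
		for (k = 0; k < 8; k++)
			d += ((a[i] ^ b[i]) >> k) & 1;
	return d;
}

int main(void)
{
	u_char md1[MD_LENGTH], md2[MD_LENGTH];
	/* Toy inputs; anything shorter than RABIN_WINDOW_SIZE yields an
	 * all-zero fingerprint, so real callers pass whole blobs. */
	u_char buf1[] = "some blob contents ...";
	u_char buf2[] = "some blob contents, slightly edited ...";

	gb_simm_process(buf1, sizeof(buf1) - 1, md1);
	gb_simm_process(buf2, sizeof(buf2) - 1, md2);
	printf("differing fingerprint bits: %u of %d\n",
	       md_distance(md1, md2), MD_BITS);
	return 0;
}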