diff options
author | Junio C Hamano <junkio@cox.net> | 2006-04-16 21:07:32 -0700 |
---|---|---|
committer | Junio C Hamano <junkio@cox.net> | 2006-04-16 21:21:46 -0700 |
commit | 9a305b67f8055503a743e67f628400f094c169ee (patch) | |
tree | c7ed3e27ef9a7abea8ce897458d2a79cdf12d726 /gsimm.c | |
parent | Clean-up Geert's similarity fingerprint code. (diff) | |
download | tgif-9a305b67f8055503a743e67f628400f094c169ee.tar.xz |
Geert's similarity
Define a function to compute similarity score 0.0<=score<=1.0
Signed-off-by: Junio C Hamano <junkio@cox.net>
Diffstat (limited to 'gsimm.c')
-rw-r--r-- | gsimm.c | 29 |
1 files changed, 29 insertions, 0 deletions
```diff
@@ -1,3 +1,4 @@
+#include <string.h>
 #include "rabinpoly.h"
 #include "gsimm.h"
 
@@ -32,6 +33,29 @@ static void freq_to_md(u_char *md, int *freq)
   bzero (freq, sizeof(freq[0]) * MD_BITS);
 }
 
+static int dist (u_char *l, u_char *r)
+{ int j, k;
+  int d = 0;
+
+  for (j = 0; j < MD_LENGTH; j++)
+  { u_char ch = l[j] ^ r[j];
+
+    for (k = 0; k < 8; k++) d += ((ch & (1<<k)) > 0);
+  }
+
+  return d;
+}
+
+double gb_simm_score(u_char *l, u_char *r)
+{
+	int d = dist(l, r);
+	double sim = (double) (d) / (MD_LENGTH * 4 - 1);
+	if (1.0 < sim)
+		return 0;
+	else
+		return 1.0 - sim;
+}
+
 void gb_simm_process(u_char *data, unsigned len, u_char *md)
 { size_t j = 0;
   u_int32_t ofs;
@@ -39,6 +63,11 @@ void gb_simm_process(u_char *data, unsigned len, u_char *md)
   u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
   int freq[MD_BITS];
 
+  if (len < GB_SIMM_MIN_FILE_SIZE || GB_SIMM_MAX_FILE_SIZE < len) {
+    memset(md, 0, MD_LENGTH);
+    return;
+  }
+
   bzero (freq, sizeof(freq[0]) * MD_BITS);
   bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
   bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));
```