From 9a305b67f8055503a743e67f628400f094c169ee Mon Sep 17 00:00:00 2001
From: Junio C Hamano
Date: Sun, 16 Apr 2006 21:07:32 -0700
Subject: Geert's similarity

Define a function to compute similarity score 0.0<=score<=1.0

Signed-off-by: Junio C Hamano
---
 gsimm.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'gsimm.c')

diff --git a/gsimm.c b/gsimm.c
index 7024bf8f58..bd646eb3d2 100644
--- a/gsimm.c
+++ b/gsimm.c
@@ -1,3 +1,4 @@
+#include <string.h>
 #include "rabinpoly.h"
 #include "gsimm.h"
 
@@ -32,6 +33,29 @@ static void freq_to_md(u_char *md, int *freq)
   bzero (freq, sizeof(freq[0]) * MD_BITS);
 }
 
+static int dist (u_char *l, u_char *r)
+{ int j, k;
+  int d = 0;
+
+  for (j = 0; j < MD_LENGTH; j++)
+  { u_char ch = l[j] ^ r[j];
+
+    for (k = 0; k < 8; k++) d += ((ch & (1<<k)) > 0);
+  }
+
+  return d;
+}
+
+double gb_simm_score(u_char *l, u_char *r)
+{
+	int d = dist(l, r);
+	double sim = (double) (d) / (MD_LENGTH * 4 - 1);
+	if (1.0 < sim)
+		return 0;
+	else
+		return 1.0 - sim;
+}
+
 void gb_simm_process(u_char *data, unsigned len, u_char *md)
 { size_t j = 0;
   u_int32_t ofs;
@@ -39,6 +63,11 @@ void gb_simm_process(u_char *data, unsigned len, u_char *md)
   u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
   int freq[MD_BITS];
 
+  if (len < GB_SIMM_MIN_FILE_SIZE || GB_SIMM_MAX_FILE_SIZE < len) {
+    memset(md, 0, MD_LENGTH);
+    return;
+  }
+
   bzero (freq, sizeof(freq[0]) * MD_BITS);
   bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
   bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));
-- 
cgit v1.2.3