summary | refs | log | tree | commit | diff
path: root/gsimm.c
diff options
context:
space:
mode:
author: Junio C Hamano <junkio@cox.net> 2006-04-16 21:07:32 -0700
committer: Junio C Hamano <junkio@cox.net> 2006-04-16 21:21:46 -0700
commit: 9a305b67f8055503a743e67f628400f094c169ee (patch)
tree: c7ed3e27ef9a7abea8ce897458d2a79cdf12d726 /gsimm.c
parent: Clean-up Geert's similarity fingerprint code. (diff)
download: tgif-9a305b67f8055503a743e67f628400f094c169ee.tar.xz
Geert's similarity
Define a function to compute similarity score 0.0<=score<=1.0

Signed-off-by: Junio C Hamano <junkio@cox.net>
Diffstat (limited to 'gsimm.c')
-rw-r--r-- gsimm.c 29
1 files changed, 29 insertions, 0 deletions
diff --git a/gsimm.c b/gsimm.c
index 7024bf8f58..bd646eb3d2 100644
--- a/gsimm.c
+++ b/gsimm.c
@@ -1,3 +1,4 @@
+#include <string.h>
#include "rabinpoly.h"
#include "gsimm.h"
@@ -32,6 +33,29 @@ static void freq_to_md(u_char *md, int *freq)
bzero (freq, sizeof(freq[0]) * MD_BITS);
}
+/*
+ * Count the bit positions at which the first MD_LENGTH bytes of l and r
+ * differ.  Only the low 8 bits of each XORed byte are examined.
+ */
+static int dist (u_char *l, u_char *r)
+{
+	int differing = 0;
+	int i;
+
+	for (i = 0; i < MD_LENGTH; i++) {
+		u_char x = l[i] ^ r[i];
+		int mask;
+
+		/* Walk a single-bit mask across bits 0..7 of the XOR. */
+		for (mask = 1; mask < 0x100; mask <<= 1) {
+			if (x & mask)
+				differing++;
+		}
+	}
+
+	return differing;
+}
+
+/*
+ * Convert the bit distance between l and r into a score.
+ *
+ * The distance reported by dist() is divided by (MD_LENGTH * 4 - 1).
+ * A ratio above 1.0 gives a score of 0; otherwise the score is
+ * 1.0 minus the ratio, so identical inputs score 1.0.
+ */
+double gb_simm_score(u_char *l, u_char *r)
+{
+	double ratio = (double) dist(l, r) / (MD_LENGTH * 4 - 1);
+
+	if (1.0 < ratio)
+		return 0.0;
+	return 1.0 - ratio;
+}
+
void gb_simm_process(u_char *data, unsigned len, u_char *md)
{ size_t j = 0;
u_int32_t ofs;
@@ -39,6 +63,11 @@ void gb_simm_process(u_char *data, unsigned len, u_char *md)
u_int32_t count [MD_BITS * (GROUP_COUNTERS/GROUP_BITS)];
int freq[MD_BITS];
+ if (len < GB_SIMM_MIN_FILE_SIZE || GB_SIMM_MAX_FILE_SIZE < len) {
+ memset(md, 0, MD_LENGTH);
+ return;
+ }
+
bzero (freq, sizeof(freq[0]) * MD_BITS);
bzero (dup_cache, DUP_CACHE_SIZE * sizeof (u_int32_t));
bzero (count, (MD_BITS * (GROUP_COUNTERS/GROUP_BITS) * sizeof (u_int32_t)));