diff options
-rw-r--r-- | Makefile | 4 | ||||
-rw-r--r-- | count-delta.c | 72 | ||||
-rw-r--r-- | count-delta.h | 10 | ||||
-rw-r--r-- | diffcore-break.c | 45 | ||||
-rw-r--r-- | diffcore-delta.c | 96 | ||||
-rw-r--r-- | diffcore-rename.c | 20 | ||||
-rw-r--r-- | diffcore.h | 4 | ||||
-rw-r--r-- | pack-check.c | 20 | ||||
-rw-r--r-- | pack-objects.c | 38 |
9 files changed, 136 insertions, 173 deletions
@@ -190,7 +190,7 @@ PYMODULES = \ LIB_FILE=libgit.a LIB_H = \ - blob.h cache.h commit.h count-delta.h csum-file.h delta.h \ + blob.h cache.h commit.h csum-file.h delta.h \ diff.h object.h pack.h pkt-line.h quote.h refs.h \ run-command.h strbuf.h tag.h tree.h git-compat-util.h revision.h @@ -200,7 +200,7 @@ DIFF_OBJS = \ diffcore-delta.o LIB_OBJS = \ - blob.o commit.o connect.o count-delta.o csum-file.o \ + blob.o commit.o connect.o csum-file.o \ date.o diff-delta.o entry.o exec_cmd.o ident.o index.o \ object.o pack-check.o patch-delta.o path.o pkt-line.o \ quote.o read-cache.o refs.o run-command.o \ diff --git a/count-delta.c b/count-delta.c deleted file mode 100644 index 058a2aadb1..0000000000 --- a/count-delta.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (C) 2005 Junio C Hamano - * The delta-parsing part is almost straight copy of patch-delta.c - * which is (C) 2005 Nicolas Pitre <nico@cam.org>. - */ -#include <stdlib.h> -#include <string.h> -#include <limits.h> -#include "delta.h" -#include "count-delta.h" - -/* - * NOTE. We do not _interpret_ delta fully. As an approximation, we - * just count the number of bytes that are copied from the source, and - * the number of literal data bytes that are inserted. - * - * Number of bytes that are _not_ copied from the source is deletion, - * and number of inserted literal bytes are addition, so sum of them - * is the extent of damage. - */ -int count_delta(void *delta_buf, unsigned long delta_size, - unsigned long *src_copied, unsigned long *literal_added) -{ - unsigned long copied_from_source, added_literal; - const unsigned char *data, *top; - unsigned char cmd; - unsigned long src_size, dst_size, out; - - if (delta_size < DELTA_SIZE_MIN) - return -1; - - data = delta_buf; - top = delta_buf + delta_size; - - src_size = get_delta_hdr_size(&data); - dst_size = get_delta_hdr_size(&data); - - added_literal = copied_from_source = out = 0; - while (data < top) { - cmd = *data++; - if (cmd & 0x80) { - unsigned long cp_off = 0, cp_size = 0; - if (cmd & 0x01) cp_off = *data++; - if (cmd & 0x02) cp_off |= (*data++ << 8); - if (cmd & 0x04) cp_off |= (*data++ << 16); - if (cmd & 0x08) cp_off |= (*data++ << 24); - if (cmd & 0x10) cp_size = *data++; - if (cmd & 0x20) cp_size |= (*data++ << 8); - if (cmd & 0x40) cp_size |= (*data++ << 16); - if (cp_size == 0) cp_size = 0x10000; - - copied_from_source += cp_size; - out += cp_size; - } else { - /* write literal into dst */ - added_literal += cmd; - out += cmd; - data += cmd; - } - } - - /* sanity check */ - if (data != top || out != dst_size) - return -1; - - /* delete size is what was _not_ copied from source. - * edit size is that and literal additions. - */ - *src_copied = copied_from_source; - *literal_added = added_literal; - return 0; -} diff --git a/count-delta.h b/count-delta.h deleted file mode 100644 index 7359629827..0000000000 --- a/count-delta.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Copyright (C) 2005 Junio C Hamano - */ -#ifndef COUNT_DELTA_H -#define COUNT_DELTA_H - -int count_delta(void *, unsigned long, - unsigned long *src_copied, unsigned long *literal_added); - -#endif diff --git a/diffcore-break.c b/diffcore-break.c index 0fc2b860be..71ad58a25a 100644 --- a/diffcore-break.c +++ b/diffcore-break.c @@ -45,8 +45,8 @@ static int should_break(struct diff_filespec *src, * The value we return is 1 if we want the pair to be broken, * or 0 if we do not. */ - unsigned long delta_size, base_size, src_copied, literal_added; - int to_break = 0; + unsigned long delta_size, base_size, src_copied, literal_added, + src_removed; *merge_score_p = 0; /* assume no deletion --- "do not break" * is the default. @@ -72,33 +72,40 @@ static int should_break(struct diff_filespec *src, &src_copied, &literal_added)) return 0; + /* sanity */ + if (src->size < src_copied) + src_copied = src->size; + if (dst->size < literal_added + src_copied) { + if (src_copied < dst->size) + literal_added = dst->size - src_copied; + else + literal_added = 0; + } + src_removed = src->size - src_copied; + /* Compute merge-score, which is "how much is removed * from the source material". The clean-up stage will * merge the surviving pair together if the score is * less than the minimum, after rename/copy runs. */ - if (src->size <= src_copied) - ; /* all copied, nothing removed */ - else { - delta_size = src->size - src_copied; - *merge_score_p = delta_size * MAX_SCORE / src->size; - } - + *merge_score_p = src_removed * MAX_SCORE / src->size; + /* Extent of damage, which counts both inserts and * deletes. */ - if (src->size + literal_added <= src_copied) - delta_size = 0; /* avoid wrapping around */ - else - delta_size = (src->size - src_copied) + literal_added; - - /* We break if the edit exceeds the minimum. - * i.e. (break_score / MAX_SCORE < delta_size / base_size) + delta_size = src_removed + literal_added; + if (delta_size * MAX_SCORE / base_size < break_score) + return 0; + + /* If you removed a lot without adding new material, that is + * not really a rewrite. */ - if (break_score * base_size < delta_size * MAX_SCORE) - to_break = 1; + if ((src->size * break_score < src_removed * MAX_SCORE) && + (literal_added * 20 < src_removed) && + (literal_added * 20 < src_copied)) + return 0; - return to_break; + return 1; } void diffcore_break(int break_score) diff --git a/diffcore-delta.c b/diffcore-delta.c index 1e6a6911ec..70bacff837 100644 --- a/diffcore-delta.c +++ b/diffcore-delta.c @@ -1,32 +1,53 @@ #include "cache.h" #include "diff.h" #include "diffcore.h" -#include "delta.h" -#include "count-delta.h" - -static int diffcore_count_changes_1(void *src, unsigned long src_size, - void *dst, unsigned long dst_size, - unsigned long delta_limit, - unsigned long *src_copied, - unsigned long *literal_added) + +/* + * Idea here is very simple. + * + * We have total of (sz-N+1) N-byte overlapping sequences in buf whose + * size is sz. If the same N-byte sequence appears in both source and + * destination, we say the byte that starts that sequence is shared + * between them (i.e. copied from source to destination). + * + * For each possible N-byte sequence, if the source buffer has more + * instances of it than the destination buffer, that means the + * difference are the number of bytes not copied from source to + * destination. If the counts are the same, everything was copied + * from source to destination. If the destination has more, + * everything was copied, and destination added more. + * + * We are doing an approximation so we do not really have to waste + * memory by actually storing the sequence. We just hash them into + * somewhere around 2^16 hashbuckets and count the occurrences. + * + * The length of the sequence is arbitrarily set to 8 for now. + */ + +#define HASHBASE 65537 /* next_prime(2^16) */ + +static void hash_chars(unsigned char *buf, unsigned long sz, int *count) { - void *delta; - unsigned long delta_size; - - delta = diff_delta(src, src_size, - dst, dst_size, - &delta_size, delta_limit); - if (!delta) - /* If delta_limit is exceeded, we have too much differences */ - return -1; + unsigned int accum1, accum2, i; - /* Estimate the edit size by interpreting delta. */ - if (count_delta(delta, delta_size, src_copied, literal_added)) { - free(delta); - return -1; + /* an 8-byte shift register made of accum1 and accum2. New + * bytes come at LSB of accum2, and shifted up to accum1 + */ + for (i = accum1 = accum2 = 0; i < 7; i++, sz--) { + accum1 = (accum1 << 8) | (accum2 >> 24); + accum2 = (accum2 << 8) | *buf++; + } + while (sz) { + accum1 = (accum1 << 8) | (accum2 >> 24); + accum2 = (accum2 << 8) | *buf++; + /* We want something that hashes permuted byte + * sequences nicely; simpler hash like (accum1 ^ + * accum2) does not perform as well. + */ + i = (accum1 + accum2 * 0x61) % HASHBASE; + count[i]++; + sz--; } - free(delta); - return 0; } int diffcore_count_changes(void *src, unsigned long src_size, @@ -35,9 +56,28 @@ int diffcore_count_changes(void *src, unsigned long src_size, unsigned long *src_copied, unsigned long *literal_added) { - return diffcore_count_changes_1(src, src_size, - dst, dst_size, - delta_limit, - src_copied, - literal_added); + int *src_count, *dst_count, i; + unsigned long sc, la; + + if (src_size < 8 || dst_size < 8) + return -1; + + src_count = xcalloc(HASHBASE * 2, sizeof(int)); + dst_count = src_count + HASHBASE; + hash_chars(src, src_size, src_count); + hash_chars(dst, dst_size, dst_count); + + sc = la = 0; + for (i = 0; i < HASHBASE; i++) { + if (src_count[i] < dst_count[i]) { + la += dst_count[i] - src_count[i]; + sc += src_count[i]; + } + else /* i.e. if (dst_count[i] <= src_count[i]) */ + sc += dst_count[i]; + } + *src_copied = sc; + *literal_added = la; + free(src_count); + return 0; } diff --git a/diffcore-rename.c b/diffcore-rename.c index 55cf1c37f3..625b589fb7 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -170,19 +170,15 @@ static int estimate_similarity(struct diff_filespec *src, &src_copied, &literal_added)) return 0; - /* Extent of damage */ - if (src->size + literal_added < src_copied) - delta_size = 0; - else - delta_size = (src->size - src_copied) + literal_added; - - /* - * Now we will give some score to it. 100% edit gets 0 points - * and 0% edit gets MAX_SCORE points. + /* How similar are they? + * what percentage of material in dst are from source? */ - score = MAX_SCORE - (MAX_SCORE * delta_size / base_size); - if (score < 0) return 0; - if (MAX_SCORE < score) return MAX_SCORE; + if (dst->size < src_copied) + score = MAX_SCORE; + else if (!dst->size) + score = 0; /* should not happen */ + else + score = src_copied * MAX_SCORE / dst->size; return score; } diff --git a/diffcore.h b/diffcore.h index dba4f17658..d31b3b476c 100644 --- a/diffcore.h +++ b/diffcore.h @@ -17,8 +17,8 @@ */ #define MAX_SCORE 60000.0 #define DEFAULT_RENAME_SCORE 30000 /* rename/copy similarity minimum (50%) */ -#define DEFAULT_BREAK_SCORE 30000 /* minimum for break to happen (50%)*/ -#define DEFAULT_MERGE_SCORE 48000 /* maximum for break-merge to happen (80%)*/ +#define DEFAULT_BREAK_SCORE 30000 /* minimum for break to happen (50%) */ +#define DEFAULT_MERGE_SCORE 36000 /* maximum for break-merge to happen 60%) */ #define MINIMUM_BREAK_SIZE 400 /* do not break a file smaller than this */ diff --git a/pack-check.c b/pack-check.c index eca32b6cab..84ed90d369 100644 --- a/pack-check.c +++ b/pack-check.c @@ -70,13 +70,17 @@ static int verify_packfile(struct packed_git *p) } +#define MAX_CHAIN 40 + static void show_pack_info(struct packed_git *p) { struct pack_header *hdr; int nr_objects, i; + unsigned int chain_histogram[MAX_CHAIN]; hdr = p->pack_base; nr_objects = ntohl(hdr->hdr_entries); + memset(chain_histogram, 0, sizeof(chain_histogram)); for (i = 0; i < nr_objects; i++) { unsigned char sha1[20], base_sha1[20]; @@ -97,11 +101,25 @@ static void show_pack_info(struct packed_git *p) printf("%s ", sha1_to_hex(sha1)); if (!delta_chain_length) printf("%-6s %lu %u\n", type, size, e.offset); - else + else { printf("%-6s %lu %u %u %s\n", type, size, e.offset, delta_chain_length, sha1_to_hex(base_sha1)); + if (delta_chain_length < MAX_CHAIN) + chain_histogram[delta_chain_length]++; + else + chain_histogram[0]++; + } } + for (i = 0; i < MAX_CHAIN; i++) { + if (!chain_histogram[i]) + continue; + printf("chain length %s %d: %d object%s\n", + i ? "=" : ">=", + i ? i : MAX_CHAIN, + chain_histogram[i], + 1 < chain_histogram[i] ? "s" : ""); + } } int verify_pack(struct packed_git *p, int verbose) diff --git a/pack-objects.c b/pack-objects.c index 136a7f5aad..49357c6735 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -32,9 +32,6 @@ struct object_entry { * be used as the base objectto delta huge * objects against. */ - int based_on_preferred; /* current delta candidate is a preferred - * one, or delta against a preferred one. - */ }; /* @@ -824,8 +821,6 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de { struct object_entry *cur_entry = cur->entry; struct object_entry *old_entry = old->entry; - int old_preferred = (old_entry->preferred_base || - old_entry->based_on_preferred); unsigned long size, oldsize, delta_size, sizediff; long max_size; void *delta_buf; @@ -867,27 +862,8 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de * delete). */ max_size = size / 2 - 20; - if (cur_entry->delta) { - if (cur_entry->based_on_preferred) { - if (old_preferred) - max_size = cur_entry->delta_size-1; - else - /* trying with non-preferred one when we - * already have a delta based on preferred - * one is pointless. - */ - return -1; - } - else if (!old_preferred) - max_size = cur_entry->delta_size-1; - else - /* otherwise... even if delta with a - * preferred one produces a bigger result than - * what we currently have, which is based on a - * non-preferred one, it is OK. - */ - ; - } + if (cur_entry->delta) + max_size = cur_entry->delta_size-1; if (sizediff >= max_size) return -1; delta_buf = diff_delta(old->data, oldsize, @@ -897,7 +873,6 @@ static int try_delta(struct unpacked *cur, struct unpacked *old, unsigned max_de cur_entry->delta = old_entry; cur_entry->delta_size = delta_size; cur_entry->depth = old_entry->depth + 1; - cur_entry->based_on_preferred = old_preferred; free(delta_buf); return 0; } @@ -966,6 +941,15 @@ static void find_deltas(struct object_entry **list, int window, int depth) if (try_delta(n, m, depth) < 0) break; } +#if 0 + /* if we made n a delta, and if n is already at max + * depth, leaving it in the window is pointless. we + * should evict it first. + * ... in theory only; somehow this makes things worse. + */ + if (entry->delta && depth <= entry->depth) + continue; +#endif idx++; if (idx >= window) idx = 0; |