diff options
author | Barret Rhoden <brho@google.com> | 2019-06-20 12:38:19 -0400 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2019-06-20 13:38:09 -0700 |
commit | a07a97760cbf2e794600157a5a9f61a888a17941 (patch) | |
tree | ac22afcf779b28ef3456abd55a42656c37cd073f | |
parent | blame: add a fingerprint heuristic to match ignored lines (diff) | |
download | tgif-a07a97760cbf2e794600157a5a9f61a888a17941.tar.xz |
blame: use the fingerprint heuristic to match ignored lines
This commit integrates the fuzzy fingerprint heuristic into
guess_line_blames().
We actually make two passes. The first pass uses the fuzzy algorithm to
find a match within the current diff chunk. If that fails, the second
pass searches the entire parent file for the best match.
For an example of scanning the entire parent for a match, consider:
commit-a 30) #include <sys/header_a.h>
commit-b 31) #include <header_b.h>
commit-c 32) #include <header_c.h>
Then commit X alphabetizes them:
commit-X 30) #include <header_b.h>
commit-X 31) #include <header_c.h>
commit-X 32) #include <sys/header_a.h>
If we just check the parent's chunk (i.e. the first pass), we'd get:
commit-b 30) #include <header_b.h>
commit-c 31) #include <header_c.h>
commit-X 32) #include <sys/header_a.h>
That's because commit X actually consists of two chunks: one chunk is
removing sys/header_a.h, then some context, and the second chunk is
adding sys/header_a.h.
If we scan the entire parent file, we get:
commit-b 30) #include <header_b.h>
commit-c 31) #include <header_c.h>
commit-a 32) #include <sys/header_a.h>
Signed-off-by: Barret Rhoden <brho@google.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
-rw-r--r-- | blame.c | 60 | ||||
-rwxr-xr-x | t/t8014-blame-ignore-fuzzy.sh | 3 |
2 files changed, 55 insertions, 8 deletions
@@ -989,12 +989,19 @@ static void fill_origin_fingerprints(struct blame_origin *o, mmfile_t *file) return; o->num_lines = find_line_starts(&line_starts, o->file.ptr, o->file.size); - /* TODO: Will fill in fingerprints in a future commit */ + o->fingerprints = xcalloc(sizeof(struct fingerprint), o->num_lines); + get_line_fingerprints(o->fingerprints, o->file.ptr, line_starts, + 0, o->num_lines); free(line_starts); } static void drop_origin_fingerprints(struct blame_origin *o) { + if (o->fingerprints) { + free_line_fingerprints(o->fingerprints, o->num_lines); + o->num_lines = 0; + FREE_AND_NULL(o->fingerprints); + } } /* @@ -1572,9 +1579,34 @@ static int are_lines_adjacent(struct blame_line_tracker *first, first->s_lno + 1 == second->s_lno; } +static int scan_parent_range(struct fingerprint *p_fps, + struct fingerprint *t_fps, int t_idx, + int from, int nr_lines) +{ + int sim, p_idx; + #define FINGERPRINT_FILE_THRESHOLD 10 + int best_sim_val = FINGERPRINT_FILE_THRESHOLD; + int best_sim_idx = -1; + + for (p_idx = from; p_idx < from + nr_lines; p_idx++) { + sim = fingerprint_similarity(&t_fps[t_idx], &p_fps[p_idx]); + if (sim < best_sim_val) + continue; + /* Break ties with the closest-to-target line number */ + if (sim == best_sim_val && best_sim_idx != -1 && + abs(best_sim_idx - t_idx) < abs(p_idx - t_idx)) + continue; + best_sim_val = sim; + best_sim_idx = p_idx; + } + return best_sim_idx; +} + /* - * This cheap heuristic assigns lines in the chunk to their relative location in - * the parent's chunk. Any additional lines are left with the target. + * The first pass checks the blame entry (from the target) against the parent's + * diff chunk. If that fails for a line, the second pass tries to match that + * line to any part of parent file. That catches cases where a change was + * broken into two chunks by 'context.' */ static void guess_line_blames(struct blame_origin *parent, struct blame_origin *target, @@ -1583,11 +1615,22 @@ static void guess_line_blames(struct blame_origin *parent, { int i, best_idx, target_idx; int parent_slno = tlno + offset; + int *fuzzy_matches; + fuzzy_matches = fuzzy_find_matching_lines(parent, target, + tlno, parent_slno, same, + parent_len); for (i = 0; i < same - tlno; i++) { target_idx = tlno + i; - best_idx = target_idx + offset; - if (best_idx < parent_slno + parent_len) { + if (fuzzy_matches && fuzzy_matches[i] >= 0) { + best_idx = fuzzy_matches[i]; + } else { + best_idx = scan_parent_range(parent->fingerprints, + target->fingerprints, + target_idx, 0, + parent->num_lines); + } + if (best_idx >= 0) { line_blames[i].is_parent = 1; line_blames[i].s_lno = best_idx; } else { @@ -1595,6 +1638,7 @@ static void guess_line_blames(struct blame_origin *parent, line_blames[i].s_lno = target_idx; } } + free(fuzzy_matches); } /* @@ -2371,6 +2415,12 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin, if (!porigin) continue; pass_blame_to_parent(sb, origin, porigin, 1); + /* + * Preemptively drop porigin so we can refresh the + * fingerprints if we use the parent again, which can + * occur if you ignore back-to-back commits. + */ + drop_origin_blob(porigin); if (!origin->suspects) goto finish; } diff --git a/t/t8014-blame-ignore-fuzzy.sh b/t/t8014-blame-ignore-fuzzy.sh index 8443966152..6f1a94caef 100755 --- a/t/t8014-blame-ignore-fuzzy.sh +++ b/t/t8014-blame-ignore-fuzzy.sh @@ -3,9 +3,6 @@ test_description='git blame ignore fuzzy heuristic' . ./test-lib.sh -# short circuit until blame has the fuzzy capabilities -test_done - pick_author='s/^[0-9a-f^]* *(\([^ ]*\) .*/\1/' # Each test is composed of 4 variables: |