summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--blame.c642
-rwxr-xr-xt/t8014-blame-ignore-fuzzy.sh440
2 files changed, 1082 insertions, 0 deletions
diff --git a/blame.c b/blame.c
index 13e1e931fd..ad8810cb0e 100644
--- a/blame.c
+++ b/blame.c
@@ -339,6 +339,648 @@ static int find_line_starts(int **line_starts, const char *buf,
return num;
}
+struct fingerprint_entry;
+
+/* A fingerprint is intended to loosely represent a string, such that two
+ * fingerprints can be quickly compared to give an indication of the similarity
+ * of the strings that they represent.
+ *
+ * A fingerprint is represented as a multiset of the lower-cased byte pairs in
+ * the string that it represents. Whitespace is added at each end of the
+ * string. Whitespace pairs are ignored. Whitespace is converted to '\0'.
+ * For example, the string "Darth Radar" will be converted to the following
+ * fingerprint:
+ * {"\0d", "da", "da", "ar", "ar", "rt", "th", "h\0", "\0r", "ra", "ad", "r\0"}
+ *
+ * The similarity between two fingerprints is the size of the intersection of
+ * their multisets, including repeated elements. See fingerprint_similarity for
+ * examples.
+ *
+ * For ease of implementation, the fingerprint is implemented as a map
+ * of byte pairs to the count of that byte pair in the string, instead of
+ * allowing repeated elements in a set.
+ */
+struct fingerprint {
+ struct hashmap map;
+ /* As we know the maximum number of entries in advance, it's
+ * convenient to store the entries in a single array instead of having
+ * the hashmap manage the memory.
+ */
+ struct fingerprint_entry *entries;
+};
+
+/* A byte pair in a fingerprint. Stores the number of times the byte pair
+ * occurs in the string that the fingerprint represents.
+ */
+struct fingerprint_entry {
+ /* The hashmap entry - the hash represents the byte pair in its
+ * entirety so we don't need to store the byte pair separately.
+ */
+ struct hashmap_entry entry;
+ /* The number of times the byte pair occurs in the string that the
+ * fingerprint represents.
+ */
+ int count;
+};
+
+/* See `struct fingerprint` for an explanation of what a fingerprint is.
+ * \param result the fingerprint of the string is stored here. This must be
+ * freed later using free_fingerprint.
+ * \param line_begin the start of the string
+ * \param line_end the end of the string
+ */
+static void get_fingerprint(struct fingerprint *result,
+ const char *line_begin,
+ const char *line_end)
+{
+ unsigned int hash, c0 = 0, c1;
+ const char *p;
+ int max_map_entry_count = 1 + line_end - line_begin;
+ struct fingerprint_entry *entry = xcalloc(max_map_entry_count,
+ sizeof(struct fingerprint_entry));
+ struct fingerprint_entry *found_entry;
+
+ hashmap_init(&result->map, NULL, NULL, max_map_entry_count);
+ result->entries = entry;
+ for (p = line_begin; p <= line_end; ++p, c0 = c1) {
+ /* Always terminate the string with whitespace.
+ * Normalise whitespace to 0, and normalise letters to
+ * lower case. This won't work for multibyte characters but at
+ * worst will match some unrelated characters.
+ */
+ if ((p == line_end) || isspace(*p))
+ c1 = 0;
+ else
+ c1 = tolower(*p);
+ hash = c0 | (c1 << 8);
+ /* Ignore whitespace pairs */
+ if (hash == 0)
+ continue;
+ hashmap_entry_init(entry, hash);
+
+ found_entry = hashmap_get(&result->map, entry, NULL);
+ if (found_entry) {
+ found_entry->count += 1;
+ } else {
+ entry->count = 1;
+ hashmap_add(&result->map, entry);
+ ++entry;
+ }
+ }
+}
+
+static void free_fingerprint(struct fingerprint *f)
+{
+ hashmap_free(&f->map, 0);
+ free(f->entries);
+}
+
+/* Calculates the similarity between two fingerprints as the size of the
+ * intersection of their multisets, including repeated elements. See
+ * `struct fingerprint` for an explanation of the fingerprint representation.
+ * The similarity between "cat mat" and "father rather" is 2 because "at" is
+ * present twice in both strings while the similarity between "tim" and "mit"
+ * is 0.
+ */
+static int fingerprint_similarity(struct fingerprint *a, struct fingerprint *b)
+{
+ int intersection = 0;
+ struct hashmap_iter iter;
+ const struct fingerprint_entry *entry_a, *entry_b;
+
+ hashmap_iter_init(&b->map, &iter);
+
+ while ((entry_b = hashmap_iter_next(&iter))) {
+ if ((entry_a = hashmap_get(&a->map, entry_b, NULL))) {
+ intersection += entry_a->count < entry_b->count ?
+ entry_a->count : entry_b->count;
+ }
+ }
+ return intersection;
+}
+
+/* Subtracts byte-pair elements in B from A, modifying A in place.
+ */
+static void fingerprint_subtract(struct fingerprint *a, struct fingerprint *b)
+{
+ struct hashmap_iter iter;
+ struct fingerprint_entry *entry_a;
+ const struct fingerprint_entry *entry_b;
+
+ hashmap_iter_init(&b->map, &iter);
+
+ while ((entry_b = hashmap_iter_next(&iter))) {
+ if ((entry_a = hashmap_get(&a->map, entry_b, NULL))) {
+ if (entry_a->count <= entry_b->count)
+ hashmap_remove(&a->map, entry_b, NULL);
+ else
+ entry_a->count -= entry_b->count;
+ }
+ }
+}
+
+/* Calculate fingerprints for a series of lines.
+ * Puts the fingerprints in the fingerprints array, which must have been
+ * preallocated to allow storing line_count elements.
+ */
+static void get_line_fingerprints(struct fingerprint *fingerprints,
+ const char *content, const int *line_starts,
+ long first_line, long line_count)
+{
+ int i;
+ const char *linestart, *lineend;
+
+ line_starts += first_line;
+ for (i = 0; i < line_count; ++i) {
+ linestart = content + line_starts[i];
+ lineend = content + line_starts[i + 1];
+ get_fingerprint(fingerprints + i, linestart, lineend);
+ }
+}
+
+static void free_line_fingerprints(struct fingerprint *fingerprints,
+ int nr_fingerprints)
+{
+ int i;
+
+ for (i = 0; i < nr_fingerprints; i++)
+ free_fingerprint(&fingerprints[i]);
+}
+
+/* This contains the data necessary to linearly map a line number in one half
+ * of a diff chunk to the line in the other half of the diff chunk that is
+ * closest in terms of its position as a fraction of the length of the chunk.
+ */
+struct line_number_mapping {
+ int destination_start, destination_length,
+ source_start, source_length;
+};
+
+/* Given a line number in one range, offset and scale it to map it onto the
+ * other range.
+ * Essentially this mapping is a simple linear equation but the calculation is
+ * more complicated to allow performing it with integer operations.
+ * Another complication is that if a line could map onto many lines in the
+ * destination range then we want to choose the line at the center of those
+ * possibilities.
+ * Example: if the chunk is 2 lines long in A and 10 lines long in B then the
+ * first 5 lines in B will map onto the first line in the A chunk, while the
+ * last 5 lines will all map onto the second line in the A chunk.
+ * Example: if the chunk is 10 lines long in A and 2 lines long in B then line
+ * 0 in B will map onto line 2 in A, and line 1 in B will map onto line 7 in A.
+ */
+static int map_line_number(int line_number,
+ const struct line_number_mapping *mapping)
+{
+ return ((line_number - mapping->source_start) * 2 + 1) *
+ mapping->destination_length /
+ (mapping->source_length * 2) +
+ mapping->destination_start;
+}
+
+/* Get a pointer to the element storing the similarity between a line in A
+ * and a line in B.
+ *
+ * The similarities are stored in a 2-dimensional array. Each "row" in the
+ * array contains the similarities for a line in B. The similarities stored in
+ * a row are the similarities between the line in B and the nearby lines in A.
+ * To keep the length of each row the same, it is padded out with values of -1
+ * where the search range extends beyond the lines in A.
+ * For example, if max_search_distance_a is 2 and the two sides of a diff chunk
+ * look like this:
+ * a | m
+ * b | n
+ * c | o
+ * d | p
+ * e | q
+ * Then the similarity array will contain:
+ * [-1, -1, am, bm, cm,
+ * -1, an, bn, cn, dn,
+ * ao, bo, co, do, eo,
+ * bp, cp, dp, ep, -1,
+ * cq, dq, eq, -1, -1]
+ * Where similarities are denoted either by -1 for invalid, or the
+ * concatenation of the two lines in the diff being compared.
+ *
+ * \param similarities array of similarities between lines in A and B
+ * \param line_a the index of the line in A, in the same frame of reference as
+ * closest_line_a.
+ * \param local_line_b the index of the line in B, relative to the first line
+ * in B that similarities represents.
+ * \param closest_line_a the index of the line in A that is deemed to be
+ * closest to local_line_b. This must be in the same
+ * frame of reference as line_a. This value defines
+ * where similarities is centered for the line in B.
+ * \param max_search_distance_a maximum distance in lines from the closest line
+ * in A for other lines in A for which
+ * similarities may be calculated.
+ */
+static int *get_similarity(int *similarities,
+ int line_a, int local_line_b,
+ int closest_line_a, int max_search_distance_a)
+{
+ assert(abs(line_a - closest_line_a) <=
+ max_search_distance_a);
+ return similarities + line_a - closest_line_a +
+ max_search_distance_a +
+ local_line_b * (max_search_distance_a * 2 + 1);
+}
+
+#define CERTAIN_NOTHING_MATCHES -2
+#define CERTAINTY_NOT_CALCULATED -1
+
+/* Given a line in B, first calculate its similarities with nearby lines in A
+ * if not already calculated, then identify the most similar and second most
+ * similar lines. The "certainty" is calculated based on those two
+ * similarities.
+ *
+ * \param start_a the index of the first line of the chunk in A
+ * \param length_a the length in lines of the chunk in A
+ * \param local_line_b the index of the line in B, relative to the first line
+ * in the chunk.
+ * \param fingerprints_a array of fingerprints for the chunk in A
+ * \param fingerprints_b array of fingerprints for the chunk in B
+ * \param similarities 2-dimensional array of similarities between lines in A
+ * and B. See get_similarity() for more details.
+ * \param certainties array of values indicating how strongly a line in B is
+ * matched with some line in A.
+ * \param second_best_result array of absolute indices in A for the second
+ * closest match of a line in B.
+ * \param result array of absolute indices in A for the closest match of a line
+ * in B.
+ * \param max_search_distance_a maximum distance in lines from the closest line
+ * in A for other lines in A for which
+ * similarities may be calculated.
+ * \param map_line_number_in_b_to_a parameter to map_line_number().
+ */
+static void find_best_line_matches(
+ int start_a,
+ int length_a,
+ int start_b,
+ int local_line_b,
+ struct fingerprint *fingerprints_a,
+ struct fingerprint *fingerprints_b,
+ int *similarities,
+ int *certainties,
+ int *second_best_result,
+ int *result,
+ const int max_search_distance_a,
+ const struct line_number_mapping *map_line_number_in_b_to_a)
+{
+
+ int i, search_start, search_end, closest_local_line_a, *similarity,
+ best_similarity = 0, second_best_similarity = 0,
+ best_similarity_index = 0, second_best_similarity_index = 0;
+
+ /* certainty has already been calculated so no need to redo the work */
+ if (certainties[local_line_b] != CERTAINTY_NOT_CALCULATED)
+ return;
+
+ closest_local_line_a = map_line_number(
+ local_line_b + start_b, map_line_number_in_b_to_a) - start_a;
+
+ search_start = closest_local_line_a - max_search_distance_a;
+ if (search_start < 0)
+ search_start = 0;
+
+ search_end = closest_local_line_a + max_search_distance_a + 1;
+ if (search_end > length_a)
+ search_end = length_a;
+
+ for (i = search_start; i < search_end; ++i) {
+ similarity = get_similarity(similarities,
+ i, local_line_b,
+ closest_local_line_a,
+ max_search_distance_a);
+ if (*similarity == -1) {
+ /* This value will never exceed 10 but assert just in
+ * case
+ */
+ assert(abs(i - closest_local_line_a) < 1000);
+ /* scale the similarity by (1000 - distance from
+ * closest line) to act as a tie break between lines
+ * that otherwise are equally similar.
+ */
+ *similarity = fingerprint_similarity(
+ fingerprints_b + local_line_b,
+ fingerprints_a + i) *
+ (1000 - abs(i - closest_local_line_a));
+ }
+ if (*similarity > best_similarity) {
+ second_best_similarity = best_similarity;
+ second_best_similarity_index = best_similarity_index;
+ best_similarity = *similarity;
+ best_similarity_index = i;
+ } else if (*similarity > second_best_similarity) {
+ second_best_similarity = *similarity;
+ second_best_similarity_index = i;
+ }
+ }
+
+ if (best_similarity == 0) {
+ /* this line definitely doesn't match with anything. Mark it
+ * with this special value so it doesn't get invalidated and
+ * won't be recalculated.
+ */
+ certainties[local_line_b] = CERTAIN_NOTHING_MATCHES;
+ result[local_line_b] = -1;
+ } else {
+ /* Calculate the certainty with which this line matches.
+ * If the line matches well with two lines then that reduces
+ * the certainty. However we still want to prioritise matching
+ * a line that matches very well with two lines over matching a
+ * line that matches poorly with one line, hence doubling
+ * best_similarity.
+ * This means that if we have
+ * line X that matches only one line with a score of 3,
+ * line Y that matches two lines equally with a score of 5,
+ * and line Z that matches only one line with a score or 2,
+ * then the lines in order of certainty are X, Y, Z.
+ */
+ certainties[local_line_b] = best_similarity * 2 -
+ second_best_similarity;
+
+ /* We keep both the best and second best results to allow us to
+ * check at a later stage of the matching process whether the
+ * result needs to be invalidated.
+ */
+ result[local_line_b] = start_a + best_similarity_index;
+ second_best_result[local_line_b] =
+ start_a + second_best_similarity_index;
+ }
+}
+
+/*
+ * This finds the line that we can match with the most confidence, and
+ * uses it as a partition. It then calls itself on the lines on either side of
+ * that partition. In this way we avoid lines appearing out of order, and
+ * retain a sensible line ordering.
+ * \param start_a index of the first line in A with which lines in B may be
+ * compared.
+ * \param start_b index of the first line in B for which matching should be
+ * done.
+ * \param length_a number of lines in A with which lines in B may be compared.
+ * \param length_b number of lines in B for which matching should be done.
+ * \param fingerprints_a mutable array of fingerprints in A. The first element
+ * corresponds to the line at start_a.
+ * \param fingerprints_b array of fingerprints in B. The first element
+ * corresponds to the line at start_b.
+ * \param similarities 2-dimensional array of similarities between lines in A
+ * and B. See get_similarity() for more details.
+ * \param certainties array of values indicating how strongly a line in B is
+ * matched with some line in A.
+ * \param second_best_result array of absolute indices in A for the second
+ * closest match of a line in B.
+ * \param result array of absolute indices in A for the closest match of a line
+ * in B.
+ * \param max_search_distance_a maximum distance in lines from the closest line
+ * in A for other lines in A for which
+ * similarities may be calculated.
+ * \param max_search_distance_b an upper bound on the greatest possible
+ * distance between lines in B such that they will
+ * both be compared with the same line in A
+ * according to max_search_distance_a.
+ * \param map_line_number_in_b_to_a parameter to map_line_number().
+ */
+static void fuzzy_find_matching_lines_recurse(
+ int start_a, int start_b,
+ int length_a, int length_b,
+ struct fingerprint *fingerprints_a,
+ struct fingerprint *fingerprints_b,
+ int *similarities,
+ int *certainties,
+ int *second_best_result,
+ int *result,
+ int max_search_distance_a,
+ int max_search_distance_b,
+ const struct line_number_mapping *map_line_number_in_b_to_a)
+{
+ int i, invalidate_min, invalidate_max, offset_b,
+ second_half_start_a, second_half_start_b,
+ second_half_length_a, second_half_length_b,
+ most_certain_line_a, most_certain_local_line_b = -1,
+ most_certain_line_certainty = -1,
+ closest_local_line_a;
+
+ for (i = 0; i < length_b; ++i) {
+ find_best_line_matches(start_a,
+ length_a,
+ start_b,
+ i,
+ fingerprints_a,
+ fingerprints_b,
+ similarities,
+ certainties,
+ second_best_result,
+ result,
+ max_search_distance_a,
+ map_line_number_in_b_to_a);
+
+ if (certainties[i] > most_certain_line_certainty) {
+ most_certain_line_certainty = certainties[i];
+ most_certain_local_line_b = i;
+ }
+ }
+
+ /* No matches. */
+ if (most_certain_local_line_b == -1)
+ return;
+
+ most_certain_line_a = result[most_certain_local_line_b];
+
+ /*
+ * Subtract the most certain line's fingerprint in B from the matched
+ * fingerprint in A. This means that other lines in B can't also match
+ * the same parts of the line in A.
+ */
+ fingerprint_subtract(fingerprints_a + most_certain_line_a - start_a,
+ fingerprints_b + most_certain_local_line_b);
+
+ /* Invalidate results that may be affected by the choice of most
+ * certain line.
+ */
+ invalidate_min = most_certain_local_line_b - max_search_distance_b;
+ invalidate_max = most_certain_local_line_b + max_search_distance_b + 1;
+ if (invalidate_min < 0)
+ invalidate_min = 0;
+ if (invalidate_max > length_b)
+ invalidate_max = length_b;
+
+ /* As the fingerprint in A has changed, discard previously calculated
+ * similarity values with that fingerprint.
+ */
+ for (i = invalidate_min; i < invalidate_max; ++i) {
+ closest_local_line_a = map_line_number(
+ i + start_b, map_line_number_in_b_to_a) - start_a;
+
+ /* Check that the lines in A and B are close enough that there
+ * is a similarity value for them.
+ */
+ if (abs(most_certain_line_a - start_a - closest_local_line_a) >
+ max_search_distance_a) {
+ continue;
+ }
+
+ *get_similarity(similarities, most_certain_line_a - start_a,
+ i, closest_local_line_a,
+ max_search_distance_a) = -1;
+ }
+
+ /* More invalidating of results that may be affected by the choice of
+ * most certain line.
+ * Discard the matches for lines in B that are currently matched with a
+ * line in A such that their ordering contradicts the ordering imposed
+ * by the choice of most certain line.
+ */
+ for (i = most_certain_local_line_b - 1; i >= invalidate_min; --i) {
+ /* In this loop we discard results for lines in B that are
+ * before most-certain-line-B but are matched with a line in A
+ * that is after most-certain-line-A.
+ */
+ if (certainties[i] >= 0 &&
+ (result[i] >= most_certain_line_a ||
+ second_best_result[i] >= most_certain_line_a)) {
+ certainties[i] = CERTAINTY_NOT_CALCULATED;
+ }
+ }
+ for (i = most_certain_local_line_b + 1; i < invalidate_max; ++i) {
+ /* In this loop we discard results for lines in B that are
+ * after most-certain-line-B but are matched with a line in A
+ * that is before most-certain-line-A.
+ */
+ if (certainties[i] >= 0 &&
+ (result[i] <= most_certain_line_a ||
+ second_best_result[i] <= most_certain_line_a)) {
+ certainties[i] = CERTAINTY_NOT_CALCULATED;
+ }
+ }
+
+ /* Repeat the matching process for lines before the most certain line.
+ */
+ if (most_certain_local_line_b > 0) {
+ fuzzy_find_matching_lines_recurse(
+ start_a, start_b,
+ most_certain_line_a + 1 - start_a,
+ most_certain_local_line_b,
+ fingerprints_a, fingerprints_b, similarities,
+ certainties, second_best_result, result,
+ max_search_distance_a,
+ max_search_distance_b,
+ map_line_number_in_b_to_a);
+ }
+ /* Repeat the matching process for lines after the most certain line.
+ */
+ if (most_certain_local_line_b + 1 < length_b) {
+ second_half_start_a = most_certain_line_a;
+ offset_b = most_certain_local_line_b + 1;
+ second_half_start_b = start_b + offset_b;
+ second_half_length_a =
+ length_a + start_a - second_half_start_a;
+ second_half_length_b =
+ length_b + start_b - second_half_start_b;
+ fuzzy_find_matching_lines_recurse(
+ second_half_start_a, second_half_start_b,
+ second_half_length_a, second_half_length_b,
+ fingerprints_a + second_half_start_a - start_a,
+ fingerprints_b + offset_b,
+ similarities +
+ offset_b * (max_search_distance_a * 2 + 1),
+ certainties + offset_b,
+ second_best_result + offset_b, result + offset_b,
+ max_search_distance_a,
+ max_search_distance_b,
+ map_line_number_in_b_to_a);
+ }
+}
+
+/* Find the lines in the parent line range that most closely match the lines in
+ * the target line range. This is accomplished by matching fingerprints in each
+ * blame_origin, and choosing the best matches that preserve the line ordering.
+ * See struct fingerprint for details of fingerprint matching, and
+ * fuzzy_find_matching_lines_recurse for details of preserving line ordering.
+ *
+ * The performance is believed to be O(n log n) in the typical case and O(n^2)
+ * in a pathological case, where n is the number of lines in the target range.
+ */
+static int *fuzzy_find_matching_lines(struct blame_origin *parent,
+ struct blame_origin *target,
+ int tlno, int parent_slno, int same,
+ int parent_len)
+{
+ /* We use the terminology "A" for the left hand side of the diff AKA
+ * parent, and "B" for the right hand side of the diff AKA target. */
+ int start_a = parent_slno;
+ int length_a = parent_len;
+ int start_b = tlno;
+ int length_b = same - tlno;
+
+ struct line_number_mapping map_line_number_in_b_to_a = {
+ start_a, length_a, start_b, length_b
+ };
+
+ struct fingerprint *fingerprints_a = parent->fingerprints;
+ struct fingerprint *fingerprints_b = target->fingerprints;
+
+ int i, *result, *second_best_result,
+ *certainties, *similarities, similarity_count;
+
+ /*
+ * max_search_distance_a means that given a line in B, compare it to
+ * the line in A that is closest to its position, and the lines in A
+ * that are no greater than max_search_distance_a lines away from the
+ * closest line in A.
+ *
+ * max_search_distance_b is an upper bound on the greatest possible
+ * distance between lines in B such that they will both be compared
+ * with the same line in A according to max_search_distance_a.
+ */
+ int max_search_distance_a = 10, max_search_distance_b;
+
+ if (length_a <= 0)
+ return NULL;
+
+ if (max_search_distance_a >= length_a)
+ max_search_distance_a = length_a ? length_a - 1 : 0;
+
+ max_search_distance_b = ((2 * max_search_distance_a + 1) * length_b
+ - 1) / length_a;
+
+ result = xcalloc(sizeof(int), length_b);
+ second_best_result = xcalloc(sizeof(int), length_b);
+ certainties = xcalloc(sizeof(int), length_b);
+
+ /* See get_similarity() for details of similarities. */
+ similarity_count = length_b * (max_search_distance_a * 2 + 1);
+ similarities = xcalloc(sizeof(int), similarity_count);
+
+ for (i = 0; i < length_b; ++i) {
+ result[i] = -1;
+ second_best_result[i] = -1;
+ certainties[i] = CERTAINTY_NOT_CALCULATED;
+ }
+
+ for (i = 0; i < similarity_count; ++i)
+ similarities[i] = -1;
+
+ fuzzy_find_matching_lines_recurse(start_a, start_b,
+ length_a, length_b,
+ fingerprints_a + start_a,
+ fingerprints_b + start_b,
+ similarities,
+ certainties,
+ second_best_result,
+ result,
+ max_search_distance_a,
+ max_search_distance_b,
+ &map_line_number_in_b_to_a);
+
+ free(similarities);
+ free(certainties);
+ free(second_best_result);
+
+ return result;
+}
+
static void fill_origin_fingerprints(struct blame_origin *o, mmfile_t *file)
{
int *line_starts;
diff --git a/t/t8014-blame-ignore-fuzzy.sh b/t/t8014-blame-ignore-fuzzy.sh
new file mode 100755
index 0000000000..8443966152
--- /dev/null
+++ b/t/t8014-blame-ignore-fuzzy.sh
@@ -0,0 +1,440 @@
+#!/bin/sh
+
+test_description='git blame ignore fuzzy heuristic'
+. ./test-lib.sh
+
+# short circuit until blame has the fuzzy capabilities
+test_done
+
+pick_author='s/^[0-9a-f^]* *(\([^ ]*\) .*/\1/'
+
+# Each test is composed of 4 variables:
+# titleN - the test name
+# aN - the initial content
+# bN - the final content
+# expectedN - the line numbers from aN that we expect git blame
+# on bN to identify, or "Final" if bN itself should
+# be identified as the origin of that line.
+
+# We start at test 2 because setup will show as test 1
+title2="Regression test for partially overlapping search ranges"
+cat <<EOF >a2
+1
+2
+3
+abcdef
+5
+6
+7
+ijkl
+9
+10
+11
+pqrs
+13
+14
+15
+wxyz
+17
+18
+19
+EOF
+cat <<EOF >b2
+abcde
+ijk
+pqr
+wxy
+EOF
+cat <<EOF >expected2
+4
+8
+12
+16
+EOF
+
+title3="Combine 3 lines into 2"
+cat <<EOF >a3
+if ((maxgrow==0) ||
+ ( single_line_field && (field->dcols < maxgrow)) ||
+ (!single_line_field && (field->drows < maxgrow)))
+EOF
+cat <<EOF >b3
+if ((maxgrow == 0) || (single_line_field && (field->dcols < maxgrow)) ||
+ (!single_line_field && (field->drows < maxgrow))) {
+EOF
+cat <<EOF >expected3
+2
+3
+EOF
+
+title4="Add curly brackets"
+cat <<EOF >a4
+ if (rows) *rows = field->rows;
+ if (cols) *cols = field->cols;
+ if (frow) *frow = field->frow;
+ if (fcol) *fcol = field->fcol;
+EOF
+cat <<EOF >b4
+ if (rows) {
+ *rows = field->rows;
+ }
+ if (cols) {
+ *cols = field->cols;
+ }
+ if (frow) {
+ *frow = field->frow;
+ }
+ if (fcol) {
+ *fcol = field->fcol;
+ }
+EOF
+cat <<EOF >expected4
+1
+1
+Final
+2
+2
+Final
+3
+3
+Final
+4
+4
+Final
+EOF
+
+
+title5="Combine many lines and change case"
+cat <<EOF >a5
+for(row=0,pBuffer=field->buf;
+ row<height;
+ row++,pBuffer+=width )
+{
+ if ((len = (int)( After_End_Of_Data( pBuffer, width ) - pBuffer )) > 0)
+ {
+ wmove( win, row, 0 );
+ waddnstr( win, pBuffer, len );
+EOF
+cat <<EOF >b5
+for (Row = 0, PBuffer = field->buf; Row < Height; Row++, PBuffer += Width) {
+ if ((Len = (int)(afterEndOfData(PBuffer, Width) - PBuffer)) > 0) {
+ wmove(win, Row, 0);
+ waddnstr(win, PBuffer, Len);
+EOF
+cat <<EOF >expected5
+1
+5
+7
+8
+EOF
+
+title6="Rename and combine lines"
+cat <<EOF >a6
+bool need_visual_update = ((form != (FORM *)0) &&
+ (form->status & _POSTED) &&
+ (form->current==field));
+
+if (need_visual_update)
+ Synchronize_Buffer(form);
+
+if (single_line_field)
+{
+ growth = field->cols * amount;
+ if (field->maxgrow)
+ growth = Minimum(field->maxgrow - field->dcols,growth);
+ field->dcols += growth;
+ if (field->dcols == field->maxgrow)
+EOF
+cat <<EOF >b6
+bool NeedVisualUpdate = ((Form != (FORM *)0) && (Form->status & _POSTED) &&
+ (Form->current == field));
+
+if (NeedVisualUpdate) {
+ synchronizeBuffer(Form);
+}
+
+if (SingleLineField) {
+ Growth = field->cols * amount;
+ if (field->maxgrow) {
+ Growth = Minimum(field->maxgrow - field->dcols, Growth);
+ }
+ field->dcols += Growth;
+ if (field->dcols == field->maxgrow) {
+EOF
+cat <<EOF >expected6
+1
+3
+4
+5
+6
+Final
+7
+8
+10
+11
+12
+Final
+13
+14
+EOF
+
+# Both lines match identically so position must be used to tie-break.
+title7="Same line twice"
+cat <<EOF >a7
+abc
+abc
+EOF
+cat <<EOF >b7
+abcd
+abcd
+EOF
+cat <<EOF >expected7
+1
+2
+EOF
+
+title8="Enforce line order"
+cat <<EOF >a8
+abcdef
+ghijkl
+ab
+EOF
+cat <<EOF >b8
+ghijk
+abcd
+EOF
+cat <<EOF >expected8
+2
+3
+EOF
+
+title9="Expand lines and rename variables"
+cat <<EOF >a9
+int myFunction(int ArgumentOne, Thing *ArgTwo, Blah XuglyBug) {
+ Squiggle FabulousResult = squargle(ArgumentOne, *ArgTwo,
+ XuglyBug) + EwwwGlobalWithAReallyLongNameYepTooLong;
+ return FabulousResult * 42;
+}
+EOF
+cat <<EOF >b9
+int myFunction(int argument_one, Thing *arg_asdfgh,
+ Blah xugly_bug) {
+ Squiggle fabulous_result = squargle(argument_one,
+ *arg_asdfgh, xugly_bug)
+ + g_ewww_global_with_a_really_long_name_yep_too_long;
+ return fabulous_result * 42;
+}
+EOF
+cat <<EOF >expected9
+1
+1
+2
+3
+3
+4
+5
+EOF
+
+title10="Two close matches versus one less close match"
+cat <<EOF >a10
+abcdef
+abcdef
+ghijkl
+EOF
+cat <<EOF >b10
+gh
+abcdefx
+EOF
+cat <<EOF >expected10
+Final
+2
+EOF
+
+# The first line of b matches best with the last line of a, but the overall
+# match is better if we match it with the the first line of a.
+title11="Piggy in the middle"
+cat <<EOF >a11
+abcdefg
+ijklmn
+abcdefgh
+EOF
+cat <<EOF >b11
+abcdefghx
+ijklm
+EOF
+cat <<EOF >expected11
+1
+2
+EOF
+
+title12="No trailing newline"
+printf "abc\ndef" >a12
+printf "abx\nstu" >b12
+cat <<EOF >expected12
+1
+Final
+EOF
+
+title13="Reorder includes"
+cat <<EOF >a13
+#include "c.h"
+#include "b.h"
+#include "a.h"
+#include "e.h"
+#include "d.h"
+EOF
+cat <<EOF >b13
+#include "a.h"
+#include "b.h"
+#include "c.h"
+#include "d.h"
+#include "e.h"
+EOF
+cat <<EOF >expected13
+3
+2
+1
+5
+4
+EOF
+
+last_test=13
+
+test_expect_success setup '
+ { for i in $(test_seq 2 $last_test)
+ do
+ # Append each line in a separate commit to make it easy to
+ # check which original line the blame output relates to.
+
+ line_count=0 &&
+ { while IFS= read line
+ do
+ line_count=$((line_count+1)) &&
+ echo "$line" >>"$i" &&
+ git add "$i" &&
+ test_tick &&
+ GIT_AUTHOR_NAME="$line_count" git commit -m "$line_count"
+ done } <"a$i"
+ done } &&
+
+ { for i in $(test_seq 2 $last_test)
+ do
+ # Overwrite the files with the final content.
+ cp b$i $i &&
+ git add $i
+ done } &&
+ test_tick &&
+
+ # Commit the final content all at once so it can all be
+ # referred to with the same commit ID.
+ GIT_AUTHOR_NAME=Final git commit -m Final &&
+
+ IGNOREME=$(git rev-parse HEAD)
+'
+
+for i in $(test_seq 2 $last_test); do
+ eval title="\$title$i"
+ test_expect_success "$title" \
+ "git blame -M9 --ignore-rev $IGNOREME $i >output &&
+ sed -e \"$pick_author\" output >actual &&
+ test_cmp expected$i actual"
+done
+
+# This invoked a null pointer dereference when the chunk callback was called
+# with a zero length parent chunk and there were no more suspects.
+test_expect_success 'Diff chunks with no suspects' '
+ test_write_lines xy1 A B C xy1 >file &&
+ git add file &&
+ test_tick &&
+ GIT_AUTHOR_NAME=1 git commit -m 1 &&
+
+ test_write_lines xy2 A B xy2 C xy2 >file &&
+ git add file &&
+ test_tick &&
+ GIT_AUTHOR_NAME=2 git commit -m 2 &&
+ REV_2=$(git rev-parse HEAD) &&
+
+ test_write_lines xy3 A >file &&
+ git add file &&
+ test_tick &&
+ GIT_AUTHOR_NAME=3 git commit -m 3 &&
+ REV_3=$(git rev-parse HEAD) &&
+
+ test_write_lines 1 1 >expected &&
+
+ git blame --ignore-rev $REV_2 --ignore-rev $REV_3 file >output &&
+ sed -e "$pick_author" output >actual &&
+
+ test_cmp expected actual
+ '
+
+test_expect_success 'position matching' '
+ test_write_lines abc def >file2 &&
+ git add file2 &&
+ test_tick &&
+ GIT_AUTHOR_NAME=1 git commit -m 1 &&
+
+ test_write_lines abc def abc def >file2 &&
+ git add file2 &&
+ test_tick &&
+ GIT_AUTHOR_NAME=2 git commit -m 2 &&
+
+ test_write_lines abcx defx abcx defx >file2 &&
+ git add file2 &&
+ test_tick &&
+ GIT_AUTHOR_NAME=3 git commit -m 3 &&
+ REV_3=$(git rev-parse HEAD) &&
+
+ test_write_lines abcy defy abcx defx >file2 &&
+ git add file2 &&
+ test_tick &&
+ GIT_AUTHOR_NAME=4 git commit -m 4 &&
+ REV_4=$(git rev-parse HEAD) &&
+
+ test_write_lines 1 1 2 2 >expected &&
+
+ git blame --ignore-rev $REV_3 --ignore-rev $REV_4 file2 >output &&
+ sed -e "$pick_author" output >actual &&
+
+ test_cmp expected actual
+ '
+
+# This fails if each blame entry is processed independently instead of
+# processing each diff change in full.
+test_expect_success 'preserve order' '
+ test_write_lines bcde >file3 &&
+ git add file3 &&
+ test_tick &&
+ GIT_AUTHOR_NAME=1 git commit -m 1 &&
+
+ test_write_lines bcde fghij >file3 &&
+ git add file3 &&
+ test_tick &&
+ GIT_AUTHOR_NAME=2 git commit -m 2 &&
+
+ test_write_lines bcde fghij abcd >file3 &&
+ git add file3 &&
+ test_tick &&
+ GIT_AUTHOR_NAME=3 git commit -m 3 &&
+
+ test_write_lines abcdx fghijx bcdex >file3 &&
+ git add file3 &&
+ test_tick &&
+ GIT_AUTHOR_NAME=4 git commit -m 4 &&
+ REV_4=$(git rev-parse HEAD) &&
+
+ test_write_lines abcdx fghijy bcdex >file3 &&
+ git add file3 &&
+ test_tick &&
+ GIT_AUTHOR_NAME=5 git commit -m 5 &&
+ REV_5=$(git rev-parse HEAD) &&
+
+ test_write_lines 1 2 3 >expected &&
+
+ git blame --ignore-rev $REV_4 --ignore-rev $REV_5 file3 >output &&
+ sed -e "$pick_author" output >actual &&
+
+ test_cmp expected actual
+ '
+
+test_done