summaryrefslogtreecommitdiff
path: root/blame.c
diff options
context:
space:
mode:
Diffstat (limited to 'blame.c')
-rw-r--r--blame.c1275
1 files changed, 1154 insertions, 121 deletions
diff --git a/blame.c b/blame.c
index 08c0c6cf73..da7e28800e 100644
--- a/blame.c
+++ b/blame.c
@@ -9,6 +9,8 @@
#include "blame.h"
#include "alloc.h"
#include "commit-slab.h"
+#include "bloom.h"
+#include "commit-graph.h"
define_commit_slab(blame_suspects, struct blame_origin *);
static struct blame_suspects blame_suspects;
@@ -90,7 +92,7 @@ static struct blame_origin *get_origin(struct commit *commit, const char *path)
-static void verify_working_tree_path(struct repository *repo,
+static void verify_working_tree_path(struct repository *r,
struct commit *work_tree, const char *path)
{
struct commit_list *parents;
@@ -99,52 +101,55 @@ static void verify_working_tree_path(struct repository *repo,
for (parents = work_tree->parents; parents; parents = parents->next) {
const struct object_id *commit_oid = &parents->item->object.oid;
struct object_id blob_oid;
- unsigned mode;
+ unsigned short mode;
- if (!get_tree_entry(commit_oid, path, &blob_oid, &mode) &&
- oid_object_info(repo, &blob_oid, NULL) == OBJ_BLOB)
+ if (!get_tree_entry(r, commit_oid, path, &blob_oid, &mode) &&
+ oid_object_info(r, &blob_oid, NULL) == OBJ_BLOB)
return;
}
- pos = index_name_pos(repo->index, path, strlen(path));
+ pos = index_name_pos(r->index, path, strlen(path));
if (pos >= 0)
; /* path is in the index */
- else if (-1 - pos < repo->index->cache_nr &&
- !strcmp(repo->index->cache[-1 - pos]->name, path))
+ else if (-1 - pos < r->index->cache_nr &&
+ !strcmp(r->index->cache[-1 - pos]->name, path))
; /* path is in the index, unmerged */
else
die("no such path '%s' in HEAD", path);
}
-static struct commit_list **append_parent(struct commit_list **tail, const struct object_id *oid)
+static struct commit_list **append_parent(struct repository *r,
+ struct commit_list **tail,
+ const struct object_id *oid)
{
struct commit *parent;
- parent = lookup_commit_reference(the_repository, oid);
+ parent = lookup_commit_reference(r, oid);
if (!parent)
die("no such commit %s", oid_to_hex(oid));
return &commit_list_insert(parent, tail)->next;
}
-static void append_merge_parents(struct commit_list **tail)
+static void append_merge_parents(struct repository *r,
+ struct commit_list **tail)
{
int merge_head;
struct strbuf line = STRBUF_INIT;
- merge_head = open(git_path_merge_head(the_repository), O_RDONLY);
+ merge_head = open(git_path_merge_head(r), O_RDONLY);
if (merge_head < 0) {
if (errno == ENOENT)
return;
die("cannot open '%s' for reading",
- git_path_merge_head(the_repository));
+ git_path_merge_head(r));
}
while (!strbuf_getwholeline_fd(&line, merge_head, '\n')) {
struct object_id oid;
- if (line.len < GIT_SHA1_HEXSZ || get_oid_hex(line.buf, &oid))
+ if (get_oid_hex(line.buf, &oid))
die("unknown line in '%s': %s",
- git_path_merge_head(the_repository), line.buf);
- tail = append_parent(tail, &oid);
+ git_path_merge_head(r), line.buf);
+ tail = append_parent(r, tail, &oid);
}
close(merge_head);
strbuf_release(&line);
@@ -155,18 +160,20 @@ static void append_merge_parents(struct commit_list **tail)
* want to transfer ownership of the buffer to the commit (so we
* must use detach).
*/
-static void set_commit_buffer_from_strbuf(struct commit *c, struct strbuf *sb)
+static void set_commit_buffer_from_strbuf(struct repository *r,
+ struct commit *c,
+ struct strbuf *sb)
{
size_t len;
void *buf = strbuf_detach(sb, &len);
- set_commit_buffer(the_repository, c, buf, len);
+ set_commit_buffer(r, c, buf, len);
}
/*
* Prepare a dummy commit that represents the work tree (or staged) item.
* Note that annotating work tree item never works in the reverse.
*/
-static struct commit *fake_working_tree_commit(struct repository *repo,
+static struct commit *fake_working_tree_commit(struct repository *r,
struct diff_options *opt,
const char *path,
const char *contents_from)
@@ -183,9 +190,9 @@ static struct commit *fake_working_tree_commit(struct repository *repo,
unsigned mode;
struct strbuf msg = STRBUF_INIT;
- read_index(repo->index);
+ repo_read_index(r);
time(&now);
- commit = alloc_commit_node(the_repository);
+ commit = alloc_commit_node(r);
commit->object.parsed = 1;
commit->date = now;
parent_tail = &commit->parents;
@@ -193,13 +200,14 @@ static struct commit *fake_working_tree_commit(struct repository *repo,
if (!resolve_ref_unsafe("HEAD", RESOLVE_REF_READING, &head_oid, NULL))
die("no such ref: HEAD");
- parent_tail = append_parent(parent_tail, &head_oid);
- append_merge_parents(parent_tail);
- verify_working_tree_path(repo, commit, path);
+ parent_tail = append_parent(r, parent_tail, &head_oid);
+ append_merge_parents(r, parent_tail);
+ verify_working_tree_path(r, commit, path);
origin = make_origin(commit, path);
- ident = fmt_ident("Not Committed Yet", "not.committed.yet", NULL, 0);
+ ident = fmt_ident("Not Committed Yet", "not.committed.yet",
+ WANT_BLANK_IDENT, NULL, 0);
strbuf_addstr(&msg, "tree 0000000000000000000000000000000000000000\n");
for (parent = commit->parents; parent; parent = parent->next)
strbuf_addf(&msg, "parent %s\n",
@@ -211,7 +219,7 @@ static struct commit *fake_working_tree_commit(struct repository *repo,
ident, ident, path,
(!contents_from ? path :
(!strcmp(contents_from, "-") ? "standard input" : contents_from)));
- set_commit_buffer_from_strbuf(commit, &msg);
+ set_commit_buffer_from_strbuf(r, commit, &msg);
if (!contents_from || strcmp("-", contents_from)) {
struct stat st;
@@ -234,7 +242,7 @@ static struct commit *fake_working_tree_commit(struct repository *repo,
switch (st.st_mode & S_IFMT) {
case S_IFREG:
if (opt->flags.allow_textconv &&
- textconv_object(read_from, mode, &null_oid, 0, &buf_ptr, &buf_len))
+ textconv_object(r, read_from, mode, &null_oid, 0, &buf_ptr, &buf_len))
strbuf_attach(&buf, buf_ptr, buf_len, buf_len + 1);
else if (strbuf_read_file(&buf, read_from, st.st_size) != st.st_size)
die_errno("cannot open or read '%s'", read_from);
@@ -253,7 +261,7 @@ static struct commit *fake_working_tree_commit(struct repository *repo,
if (strbuf_read(&buf, 0, 0) < 0)
die_errno("failed to read from stdin");
}
- convert_to_git(repo->index, path, buf.buf, buf.len, &buf, 0);
+ convert_to_git(r->index, path, buf.buf, buf.len, &buf, 0);
origin->file.ptr = buf.buf;
origin->file.size = buf.len;
pretend_object_file(buf.buf, buf.len, OBJ_BLOB, &origin->blob_oid);
@@ -264,28 +272,28 @@ static struct commit *fake_working_tree_commit(struct repository *repo,
* bits; we are not going to write this index out -- we just
* want to run "diff-index --cached".
*/
- discard_index(repo->index);
- read_index(repo->index);
+ discard_index(r->index);
+ repo_read_index(r);
len = strlen(path);
if (!mode) {
- int pos = index_name_pos(repo->index, path, len);
+ int pos = index_name_pos(r->index, path, len);
if (0 <= pos)
- mode = repo->index->cache[pos]->ce_mode;
+ mode = r->index->cache[pos]->ce_mode;
else
/* Let's not bother reading from HEAD tree */
mode = S_IFREG | 0644;
}
- ce = make_empty_cache_entry(repo->index, len);
+ ce = make_empty_cache_entry(r->index, len);
oidcpy(&ce->oid, &origin->blob_oid);
memcpy(ce->name, path, len);
ce->ce_flags = create_ce_flags(0);
ce->ce_namelen = len;
ce->ce_mode = create_ce_mode(mode);
- add_index_entry(repo->index, ce,
+ add_index_entry(r->index, ce,
ADD_CACHE_OK_TO_ADD | ADD_CACHE_OK_TO_REPLACE);
- cache_tree_invalidate_path(repo->index, path);
+ cache_tree_invalidate_path(r->index, path);
return commit;
}
@@ -305,12 +313,710 @@ static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b,
return xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb);
}
+static const char *get_next_line(const char *start, const char *end)
+{
+ const char *nl = memchr(start, '\n', end - start);
+
+ return nl ? nl + 1 : end;
+}
+
+static int find_line_starts(int **line_starts, const char *buf,
+ unsigned long len)
+{
+ const char *end = buf + len;
+ const char *p;
+ int *lineno;
+ int num = 0;
+
+ for (p = buf; p < end; p = get_next_line(p, end))
+ num++;
+
+ ALLOC_ARRAY(*line_starts, num + 1);
+ lineno = *line_starts;
+
+ for (p = buf; p < end; p = get_next_line(p, end))
+ *lineno++ = p - buf;
+
+ *lineno = len;
+
+ return num;
+}
+
+struct fingerprint_entry;
+
+/* A fingerprint is intended to loosely represent a string, such that two
+ * fingerprints can be quickly compared to give an indication of the similarity
+ * of the strings that they represent.
+ *
+ * A fingerprint is represented as a multiset of the lower-cased byte pairs in
+ * the string that it represents. Whitespace is added at each end of the
+ * string. Whitespace pairs are ignored. Whitespace is converted to '\0'.
+ * For example, the string "Darth Radar" will be converted to the following
+ * fingerprint:
+ * {"\0d", "da", "da", "ar", "ar", "rt", "th", "h\0", "\0r", "ra", "ad", "r\0"}
+ *
+ * The similarity between two fingerprints is the size of the intersection of
+ * their multisets, including repeated elements. See fingerprint_similarity for
+ * examples.
+ *
+ * For ease of implementation, the fingerprint is implemented as a map
+ * of byte pairs to the count of that byte pair in the string, instead of
+ * allowing repeated elements in a set.
+ */
+struct fingerprint {
+ struct hashmap map;
+ /* As we know the maximum number of entries in advance, it's
+ * convenient to store the entries in a single array instead of having
+ * the hashmap manage the memory.
+ */
+ struct fingerprint_entry *entries;
+};
+
+/* A byte pair in a fingerprint. Stores the number of times the byte pair
+ * occurs in the string that the fingerprint represents.
+ */
+struct fingerprint_entry {
+ /* The hashmap entry - the hash represents the byte pair in its
+ * entirety so we don't need to store the byte pair separately.
+ */
+ struct hashmap_entry entry;
+ /* The number of times the byte pair occurs in the string that the
+ * fingerprint represents.
+ */
+ int count;
+};
+
+/* See `struct fingerprint` for an explanation of what a fingerprint is.
+ * \param result the fingerprint of the string is stored here. This must be
+ * freed later using free_fingerprint.
+ * \param line_begin the start of the string
+ * \param line_end the end of the string
+ */
+static void get_fingerprint(struct fingerprint *result,
+ const char *line_begin,
+ const char *line_end)
+{
+ unsigned int hash, c0 = 0, c1;
+ const char *p;
+ int max_map_entry_count = 1 + line_end - line_begin;
+ struct fingerprint_entry *entry = xcalloc(max_map_entry_count,
+ sizeof(struct fingerprint_entry));
+ struct fingerprint_entry *found_entry;
+
+ hashmap_init(&result->map, NULL, NULL, max_map_entry_count);
+ result->entries = entry;
+ for (p = line_begin; p <= line_end; ++p, c0 = c1) {
+ /* Always terminate the string with whitespace.
+ * Normalise whitespace to 0, and normalise letters to
+ * lower case. This won't work for multibyte characters but at
+ * worst will match some unrelated characters.
+ */
+ if ((p == line_end) || isspace(*p))
+ c1 = 0;
+ else
+ c1 = tolower(*p);
+ hash = c0 | (c1 << 8);
+ /* Ignore whitespace pairs */
+ if (hash == 0)
+ continue;
+ hashmap_entry_init(&entry->entry, hash);
+
+ found_entry = hashmap_get_entry(&result->map, entry,
+ /* member name */ entry, NULL);
+ if (found_entry) {
+ found_entry->count += 1;
+ } else {
+ entry->count = 1;
+ hashmap_add(&result->map, &entry->entry);
+ ++entry;
+ }
+ }
+}
+
+static void free_fingerprint(struct fingerprint *f)
+{
+ hashmap_free(&f->map);
+ free(f->entries);
+}
+
+/* Calculates the similarity between two fingerprints as the size of the
+ * intersection of their multisets, including repeated elements. See
+ * `struct fingerprint` for an explanation of the fingerprint representation.
+ * The similarity between "cat mat" and "father rather" is 2 because "at" is
+ * present twice in both strings while the similarity between "tim" and "mit"
+ * is 0.
+ */
+static int fingerprint_similarity(struct fingerprint *a, struct fingerprint *b)
+{
+ int intersection = 0;
+ struct hashmap_iter iter;
+ const struct fingerprint_entry *entry_a, *entry_b;
+
+ hashmap_for_each_entry(&b->map, &iter, entry_b,
+ entry /* member name */) {
+ entry_a = hashmap_get_entry(&a->map, entry_b, entry, NULL);
+ if (entry_a) {
+ intersection += entry_a->count < entry_b->count ?
+ entry_a->count : entry_b->count;
+ }
+ }
+ return intersection;
+}
+
+/* Subtracts byte-pair elements in B from A, modifying A in place.
+ */
+static void fingerprint_subtract(struct fingerprint *a, struct fingerprint *b)
+{
+ struct hashmap_iter iter;
+ struct fingerprint_entry *entry_a;
+ const struct fingerprint_entry *entry_b;
+
+ hashmap_iter_init(&b->map, &iter);
+
+ hashmap_for_each_entry(&b->map, &iter, entry_b,
+ entry /* member name */) {
+ entry_a = hashmap_get_entry(&a->map, entry_b, entry, NULL);
+ if (entry_a) {
+ if (entry_a->count <= entry_b->count)
+ hashmap_remove(&a->map, &entry_b->entry, NULL);
+ else
+ entry_a->count -= entry_b->count;
+ }
+ }
+}
+
+/* Calculate fingerprints for a series of lines.
+ * Puts the fingerprints in the fingerprints array, which must have been
+ * preallocated to allow storing line_count elements.
+ */
+static void get_line_fingerprints(struct fingerprint *fingerprints,
+ const char *content, const int *line_starts,
+ long first_line, long line_count)
+{
+ int i;
+ const char *linestart, *lineend;
+
+ line_starts += first_line;
+ for (i = 0; i < line_count; ++i) {
+ linestart = content + line_starts[i];
+ lineend = content + line_starts[i + 1];
+ get_fingerprint(fingerprints + i, linestart, lineend);
+ }
+}
+
+static void free_line_fingerprints(struct fingerprint *fingerprints,
+ int nr_fingerprints)
+{
+ int i;
+
+ for (i = 0; i < nr_fingerprints; i++)
+ free_fingerprint(&fingerprints[i]);
+}
+
+/* This contains the data necessary to linearly map a line number in one half
+ * of a diff chunk to the line in the other half of the diff chunk that is
+ * closest in terms of its position as a fraction of the length of the chunk.
+ */
+struct line_number_mapping {
+ int destination_start, destination_length,
+ source_start, source_length;
+};
+
+/* Given a line number in one range, offset and scale it to map it onto the
+ * other range.
+ * Essentially this mapping is a simple linear equation but the calculation is
+ * more complicated to allow performing it with integer operations.
+ * Another complication is that if a line could map onto many lines in the
+ * destination range then we want to choose the line at the center of those
+ * possibilities.
+ * Example: if the chunk is 2 lines long in A and 10 lines long in B then the
+ * first 5 lines in B will map onto the first line in the A chunk, while the
+ * last 5 lines will all map onto the second line in the A chunk.
+ * Example: if the chunk is 10 lines long in A and 2 lines long in B then line
+ * 0 in B will map onto line 2 in A, and line 1 in B will map onto line 7 in A.
+ */
+static int map_line_number(int line_number,
+ const struct line_number_mapping *mapping)
+{
+ return ((line_number - mapping->source_start) * 2 + 1) *
+ mapping->destination_length /
+ (mapping->source_length * 2) +
+ mapping->destination_start;
+}
+
+/* Get a pointer to the element storing the similarity between a line in A
+ * and a line in B.
+ *
+ * The similarities are stored in a 2-dimensional array. Each "row" in the
+ * array contains the similarities for a line in B. The similarities stored in
+ * a row are the similarities between the line in B and the nearby lines in A.
+ * To keep the length of each row the same, it is padded out with values of -1
+ * where the search range extends beyond the lines in A.
+ * For example, if max_search_distance_a is 2 and the two sides of a diff chunk
+ * look like this:
+ * a | m
+ * b | n
+ * c | o
+ * d | p
+ * e | q
+ * Then the similarity array will contain:
+ * [-1, -1, am, bm, cm,
+ * -1, an, bn, cn, dn,
+ * ao, bo, co, do, eo,
+ * bp, cp, dp, ep, -1,
+ * cq, dq, eq, -1, -1]
+ * Where similarities are denoted either by -1 for invalid, or the
+ * concatenation of the two lines in the diff being compared.
+ *
+ * \param similarities array of similarities between lines in A and B
+ * \param line_a the index of the line in A, in the same frame of reference as
+ * closest_line_a.
+ * \param local_line_b the index of the line in B, relative to the first line
+ * in B that similarities represents.
+ * \param closest_line_a the index of the line in A that is deemed to be
+ * closest to local_line_b. This must be in the same
+ * frame of reference as line_a. This value defines
+ * where similarities is centered for the line in B.
+ * \param max_search_distance_a maximum distance in lines from the closest line
+ * in A for other lines in A for which
+ * similarities may be calculated.
+ */
+static int *get_similarity(int *similarities,
+ int line_a, int local_line_b,
+ int closest_line_a, int max_search_distance_a)
+{
+ assert(abs(line_a - closest_line_a) <=
+ max_search_distance_a);
+ return similarities + line_a - closest_line_a +
+ max_search_distance_a +
+ local_line_b * (max_search_distance_a * 2 + 1);
+}
+
+#define CERTAIN_NOTHING_MATCHES -2
+#define CERTAINTY_NOT_CALCULATED -1
+
+/* Given a line in B, first calculate its similarities with nearby lines in A
+ * if not already calculated, then identify the most similar and second most
+ * similar lines. The "certainty" is calculated based on those two
+ * similarities.
+ *
+ * \param start_a the index of the first line of the chunk in A
+ * \param length_a the length in lines of the chunk in A
+ * \param local_line_b the index of the line in B, relative to the first line
+ * in the chunk.
+ * \param fingerprints_a array of fingerprints for the chunk in A
+ * \param fingerprints_b array of fingerprints for the chunk in B
+ * \param similarities 2-dimensional array of similarities between lines in A
+ * and B. See get_similarity() for more details.
+ * \param certainties array of values indicating how strongly a line in B is
+ * matched with some line in A.
+ * \param second_best_result array of absolute indices in A for the second
+ * closest match of a line in B.
+ * \param result array of absolute indices in A for the closest match of a line
+ * in B.
+ * \param max_search_distance_a maximum distance in lines from the closest line
+ * in A for other lines in A for which
+ * similarities may be calculated.
+ * \param map_line_number_in_b_to_a parameter to map_line_number().
+ */
+static void find_best_line_matches(
+ int start_a,
+ int length_a,
+ int start_b,
+ int local_line_b,
+ struct fingerprint *fingerprints_a,
+ struct fingerprint *fingerprints_b,
+ int *similarities,
+ int *certainties,
+ int *second_best_result,
+ int *result,
+ const int max_search_distance_a,
+ const struct line_number_mapping *map_line_number_in_b_to_a)
+{
+
+ int i, search_start, search_end, closest_local_line_a, *similarity,
+ best_similarity = 0, second_best_similarity = 0,
+ best_similarity_index = 0, second_best_similarity_index = 0;
+
+ /* certainty has already been calculated so no need to redo the work */
+ if (certainties[local_line_b] != CERTAINTY_NOT_CALCULATED)
+ return;
+
+ closest_local_line_a = map_line_number(
+ local_line_b + start_b, map_line_number_in_b_to_a) - start_a;
+
+ search_start = closest_local_line_a - max_search_distance_a;
+ if (search_start < 0)
+ search_start = 0;
+
+ search_end = closest_local_line_a + max_search_distance_a + 1;
+ if (search_end > length_a)
+ search_end = length_a;
+
+ for (i = search_start; i < search_end; ++i) {
+ similarity = get_similarity(similarities,
+ i, local_line_b,
+ closest_local_line_a,
+ max_search_distance_a);
+ if (*similarity == -1) {
+ /* This value will never exceed 10 but assert just in
+ * case
+ */
+ assert(abs(i - closest_local_line_a) < 1000);
+ /* scale the similarity by (1000 - distance from
+ * closest line) to act as a tie break between lines
+ * that otherwise are equally similar.
+ */
+ *similarity = fingerprint_similarity(
+ fingerprints_b + local_line_b,
+ fingerprints_a + i) *
+ (1000 - abs(i - closest_local_line_a));
+ }
+ if (*similarity > best_similarity) {
+ second_best_similarity = best_similarity;
+ second_best_similarity_index = best_similarity_index;
+ best_similarity = *similarity;
+ best_similarity_index = i;
+ } else if (*similarity > second_best_similarity) {
+ second_best_similarity = *similarity;
+ second_best_similarity_index = i;
+ }
+ }
+
+ if (best_similarity == 0) {
+ /* this line definitely doesn't match with anything. Mark it
+ * with this special value so it doesn't get invalidated and
+ * won't be recalculated.
+ */
+ certainties[local_line_b] = CERTAIN_NOTHING_MATCHES;
+ result[local_line_b] = -1;
+ } else {
+ /* Calculate the certainty with which this line matches.
+ * If the line matches well with two lines then that reduces
+ * the certainty. However we still want to prioritise matching
+ * a line that matches very well with two lines over matching a
+ * line that matches poorly with one line, hence doubling
+ * best_similarity.
+ * This means that if we have
+ * line X that matches only one line with a score of 3,
+ * line Y that matches two lines equally with a score of 5,
+ * and line Z that matches only one line with a score or 2,
+ * then the lines in order of certainty are X, Y, Z.
+ */
+ certainties[local_line_b] = best_similarity * 2 -
+ second_best_similarity;
+
+ /* We keep both the best and second best results to allow us to
+ * check at a later stage of the matching process whether the
+ * result needs to be invalidated.
+ */
+ result[local_line_b] = start_a + best_similarity_index;
+ second_best_result[local_line_b] =
+ start_a + second_best_similarity_index;
+ }
+}
+
+/*
+ * This finds the line that we can match with the most confidence, and
+ * uses it as a partition. It then calls itself on the lines on either side of
+ * that partition. In this way we avoid lines appearing out of order, and
+ * retain a sensible line ordering.
+ * \param start_a index of the first line in A with which lines in B may be
+ * compared.
+ * \param start_b index of the first line in B for which matching should be
+ * done.
+ * \param length_a number of lines in A with which lines in B may be compared.
+ * \param length_b number of lines in B for which matching should be done.
+ * \param fingerprints_a mutable array of fingerprints in A. The first element
+ * corresponds to the line at start_a.
+ * \param fingerprints_b array of fingerprints in B. The first element
+ * corresponds to the line at start_b.
+ * \param similarities 2-dimensional array of similarities between lines in A
+ * and B. See get_similarity() for more details.
+ * \param certainties array of values indicating how strongly a line in B is
+ * matched with some line in A.
+ * \param second_best_result array of absolute indices in A for the second
+ * closest match of a line in B.
+ * \param result array of absolute indices in A for the closest match of a line
+ * in B.
+ * \param max_search_distance_a maximum distance in lines from the closest line
+ * in A for other lines in A for which
+ * similarities may be calculated.
+ * \param max_search_distance_b an upper bound on the greatest possible
+ * distance between lines in B such that they will
+ * both be compared with the same line in A
+ * according to max_search_distance_a.
+ * \param map_line_number_in_b_to_a parameter to map_line_number().
+ */
+static void fuzzy_find_matching_lines_recurse(
+ int start_a, int start_b,
+ int length_a, int length_b,
+ struct fingerprint *fingerprints_a,
+ struct fingerprint *fingerprints_b,
+ int *similarities,
+ int *certainties,
+ int *second_best_result,
+ int *result,
+ int max_search_distance_a,
+ int max_search_distance_b,
+ const struct line_number_mapping *map_line_number_in_b_to_a)
+{
+ int i, invalidate_min, invalidate_max, offset_b,
+ second_half_start_a, second_half_start_b,
+ second_half_length_a, second_half_length_b,
+ most_certain_line_a, most_certain_local_line_b = -1,
+ most_certain_line_certainty = -1,
+ closest_local_line_a;
+
+ for (i = 0; i < length_b; ++i) {
+ find_best_line_matches(start_a,
+ length_a,
+ start_b,
+ i,
+ fingerprints_a,
+ fingerprints_b,
+ similarities,
+ certainties,
+ second_best_result,
+ result,
+ max_search_distance_a,
+ map_line_number_in_b_to_a);
+
+ if (certainties[i] > most_certain_line_certainty) {
+ most_certain_line_certainty = certainties[i];
+ most_certain_local_line_b = i;
+ }
+ }
+
+ /* No matches. */
+ if (most_certain_local_line_b == -1)
+ return;
+
+ most_certain_line_a = result[most_certain_local_line_b];
+
+ /*
+ * Subtract the most certain line's fingerprint in B from the matched
+ * fingerprint in A. This means that other lines in B can't also match
+ * the same parts of the line in A.
+ */
+ fingerprint_subtract(fingerprints_a + most_certain_line_a - start_a,
+ fingerprints_b + most_certain_local_line_b);
+
+ /* Invalidate results that may be affected by the choice of most
+ * certain line.
+ */
+ invalidate_min = most_certain_local_line_b - max_search_distance_b;
+ invalidate_max = most_certain_local_line_b + max_search_distance_b + 1;
+ if (invalidate_min < 0)
+ invalidate_min = 0;
+ if (invalidate_max > length_b)
+ invalidate_max = length_b;
+
+ /* As the fingerprint in A has changed, discard previously calculated
+ * similarity values with that fingerprint.
+ */
+ for (i = invalidate_min; i < invalidate_max; ++i) {
+ closest_local_line_a = map_line_number(
+ i + start_b, map_line_number_in_b_to_a) - start_a;
+
+ /* Check that the lines in A and B are close enough that there
+ * is a similarity value for them.
+ */
+ if (abs(most_certain_line_a - start_a - closest_local_line_a) >
+ max_search_distance_a) {
+ continue;
+ }
+
+ *get_similarity(similarities, most_certain_line_a - start_a,
+ i, closest_local_line_a,
+ max_search_distance_a) = -1;
+ }
+
+ /* More invalidating of results that may be affected by the choice of
+ * most certain line.
+ * Discard the matches for lines in B that are currently matched with a
+ * line in A such that their ordering contradicts the ordering imposed
+ * by the choice of most certain line.
+ */
+ for (i = most_certain_local_line_b - 1; i >= invalidate_min; --i) {
+ /* In this loop we discard results for lines in B that are
+ * before most-certain-line-B but are matched with a line in A
+ * that is after most-certain-line-A.
+ */
+ if (certainties[i] >= 0 &&
+ (result[i] >= most_certain_line_a ||
+ second_best_result[i] >= most_certain_line_a)) {
+ certainties[i] = CERTAINTY_NOT_CALCULATED;
+ }
+ }
+ for (i = most_certain_local_line_b + 1; i < invalidate_max; ++i) {
+ /* In this loop we discard results for lines in B that are
+ * after most-certain-line-B but are matched with a line in A
+ * that is before most-certain-line-A.
+ */
+ if (certainties[i] >= 0 &&
+ (result[i] <= most_certain_line_a ||
+ second_best_result[i] <= most_certain_line_a)) {
+ certainties[i] = CERTAINTY_NOT_CALCULATED;
+ }
+ }
+
+ /* Repeat the matching process for lines before the most certain line.
+ */
+ if (most_certain_local_line_b > 0) {
+ fuzzy_find_matching_lines_recurse(
+ start_a, start_b,
+ most_certain_line_a + 1 - start_a,
+ most_certain_local_line_b,
+ fingerprints_a, fingerprints_b, similarities,
+ certainties, second_best_result, result,
+ max_search_distance_a,
+ max_search_distance_b,
+ map_line_number_in_b_to_a);
+ }
+ /* Repeat the matching process for lines after the most certain line.
+ */
+ if (most_certain_local_line_b + 1 < length_b) {
+ second_half_start_a = most_certain_line_a;
+ offset_b = most_certain_local_line_b + 1;
+ second_half_start_b = start_b + offset_b;
+ second_half_length_a =
+ length_a + start_a - second_half_start_a;
+ second_half_length_b =
+ length_b + start_b - second_half_start_b;
+ fuzzy_find_matching_lines_recurse(
+ second_half_start_a, second_half_start_b,
+ second_half_length_a, second_half_length_b,
+ fingerprints_a + second_half_start_a - start_a,
+ fingerprints_b + offset_b,
+ similarities +
+ offset_b * (max_search_distance_a * 2 + 1),
+ certainties + offset_b,
+ second_best_result + offset_b, result + offset_b,
+ max_search_distance_a,
+ max_search_distance_b,
+ map_line_number_in_b_to_a);
+ }
+}
+
+/* Find the lines in the parent line range that most closely match the lines in
+ * the target line range. This is accomplished by matching fingerprints in each
+ * blame_origin, and choosing the best matches that preserve the line ordering.
+ * See struct fingerprint for details of fingerprint matching, and
+ * fuzzy_find_matching_lines_recurse for details of preserving line ordering.
+ *
+ * The performance is believed to be O(n log n) in the typical case and O(n^2)
+ * in a pathological case, where n is the number of lines in the target range.
+ */
+static int *fuzzy_find_matching_lines(struct blame_origin *parent,
+ struct blame_origin *target,
+ int tlno, int parent_slno, int same,
+ int parent_len)
+{
+ /* We use the terminology "A" for the left hand side of the diff AKA
+ * parent, and "B" for the right hand side of the diff AKA target. */
+ int start_a = parent_slno;
+ int length_a = parent_len;
+ int start_b = tlno;
+ int length_b = same - tlno;
+
+ struct line_number_mapping map_line_number_in_b_to_a = {
+ start_a, length_a, start_b, length_b
+ };
+
+ struct fingerprint *fingerprints_a = parent->fingerprints;
+ struct fingerprint *fingerprints_b = target->fingerprints;
+
+ int i, *result, *second_best_result,
+ *certainties, *similarities, similarity_count;
+
+ /*
+ * max_search_distance_a means that given a line in B, compare it to
+ * the line in A that is closest to its position, and the lines in A
+ * that are no greater than max_search_distance_a lines away from the
+ * closest line in A.
+ *
+ * max_search_distance_b is an upper bound on the greatest possible
+ * distance between lines in B such that they will both be compared
+ * with the same line in A according to max_search_distance_a.
+ */
+ int max_search_distance_a = 10, max_search_distance_b;
+
+ if (length_a <= 0)
+ return NULL;
+
+ if (max_search_distance_a >= length_a)
+ max_search_distance_a = length_a ? length_a - 1 : 0;
+
+ max_search_distance_b = ((2 * max_search_distance_a + 1) * length_b
+ - 1) / length_a;
+
+ result = xcalloc(sizeof(int), length_b);
+ second_best_result = xcalloc(sizeof(int), length_b);
+ certainties = xcalloc(sizeof(int), length_b);
+
+ /* See get_similarity() for details of similarities. */
+ similarity_count = length_b * (max_search_distance_a * 2 + 1);
+ similarities = xcalloc(sizeof(int), similarity_count);
+
+ for (i = 0; i < length_b; ++i) {
+ result[i] = -1;
+ second_best_result[i] = -1;
+ certainties[i] = CERTAINTY_NOT_CALCULATED;
+ }
+
+ for (i = 0; i < similarity_count; ++i)
+ similarities[i] = -1;
+
+ fuzzy_find_matching_lines_recurse(start_a, start_b,
+ length_a, length_b,
+ fingerprints_a + start_a,
+ fingerprints_b + start_b,
+ similarities,
+ certainties,
+ second_best_result,
+ result,
+ max_search_distance_a,
+ max_search_distance_b,
+ &map_line_number_in_b_to_a);
+
+ free(similarities);
+ free(certainties);
+ free(second_best_result);
+
+ return result;
+}
+
+static void fill_origin_fingerprints(struct blame_origin *o)
+{
+ int *line_starts;
+
+ if (o->fingerprints)
+ return;
+ o->num_lines = find_line_starts(&line_starts, o->file.ptr,
+ o->file.size);
+ o->fingerprints = xcalloc(sizeof(struct fingerprint), o->num_lines);
+ get_line_fingerprints(o->fingerprints, o->file.ptr, line_starts,
+ 0, o->num_lines);
+ free(line_starts);
+}
+
+static void drop_origin_fingerprints(struct blame_origin *o)
+{
+ if (o->fingerprints) {
+ free_line_fingerprints(o->fingerprints, o->num_lines);
+ o->num_lines = 0;
+ FREE_AND_NULL(o->fingerprints);
+ }
+}
+
/*
* Given an origin, prepare mmfile_t structure to be used by the
* diff machinery
*/
static void fill_origin_blob(struct diff_options *opt,
- struct blame_origin *o, mmfile_t *file, int *num_read_blob)
+ struct blame_origin *o, mmfile_t *file,
+ int *num_read_blob, int fill_fingerprints)
{
if (!o->file.ptr) {
enum object_type type;
@@ -318,7 +1024,8 @@ static void fill_origin_blob(struct diff_options *opt,
(*num_read_blob)++;
if (opt->flags.allow_textconv &&
- textconv_object(o->path, o->mode, &o->blob_oid, 1, &file->ptr, &file_size))
+ textconv_object(opt->repo, o->path, o->mode,
+ &o->blob_oid, 1, &file->ptr, &file_size))
;
else
file->ptr = read_object_file(&o->blob_oid, &type,
@@ -333,13 +1040,14 @@ static void fill_origin_blob(struct diff_options *opt,
}
else
*file = o->file;
+ if (fill_fingerprints)
+ fill_origin_fingerprints(o);
}
static void drop_origin_blob(struct blame_origin *o)
{
- if (o->file.ptr) {
- FREE_AND_NULL(o->file.ptr);
- }
+ FREE_AND_NULL(o->file.ptr);
+ drop_origin_fingerprints(o);
}
/*
@@ -475,7 +1183,9 @@ void blame_coalesce(struct blame_scoreboard *sb)
for (ent = sb->ent; ent && (next = ent->next); ent = next) {
if (ent->suspect == next->suspect &&
- ent->s_lno + ent->num_lines == next->s_lno) {
+ ent->s_lno + ent->num_lines == next->s_lno &&
+ ent->ignored == next->ignored &&
+ ent->unblamable == next->unblamable) {
ent->num_lines += next->num_lines;
ent->next = next->next;
blame_origin_decref(next->suspect);
@@ -522,14 +1232,14 @@ static void queue_blames(struct blame_scoreboard *sb, struct blame_origin *porig
*
* This also fills origin->mode for corresponding tree path.
*/
-static int fill_blob_sha1_and_mode(struct repository *repo,
+static int fill_blob_sha1_and_mode(struct repository *r,
struct blame_origin *origin)
{
if (!is_null_oid(&origin->blob_oid))
return 0;
- if (get_tree_entry(&origin->commit->object.oid, origin->path, &origin->blob_oid, &origin->mode))
+ if (get_tree_entry(r, &origin->commit->object.oid, origin->path, &origin->blob_oid, &origin->mode))
goto error_out;
- if (oid_object_info(repo, &origin->blob_oid, NULL) != OBJ_BLOB)
+ if (oid_object_info(r, &origin->blob_oid, NULL) != OBJ_BLOB)
goto error_out;
return 0;
error_out:
@@ -538,12 +1248,74 @@ static int fill_blob_sha1_and_mode(struct repository *repo,
return -1;
}
+struct blame_bloom_data {
+ /*
+ * Changed-path Bloom filter keys. These can help prevent
+ * computing diffs against first parents, but we need to
+ * expand the list as code is moved or files are renamed.
+ */
+ struct bloom_filter_settings *settings;
+ struct bloom_key **keys;
+ int nr;
+ int alloc;
+};
+
+static int bloom_count_queries = 0;
+static int bloom_count_no = 0;
+static int maybe_changed_path(struct repository *r,
+ struct blame_origin *origin,
+ struct blame_bloom_data *bd)
+{
+ int i;
+ struct bloom_filter *filter;
+
+ if (!bd)
+ return 1;
+
+ if (origin->commit->generation == GENERATION_NUMBER_INFINITY)
+ return 1;
+
+ filter = get_bloom_filter(r, origin->commit, 0);
+
+ if (!filter)
+ return 1;
+
+ bloom_count_queries++;
+ for (i = 0; i < bd->nr; i++) {
+ if (bloom_filter_contains(filter,
+ bd->keys[i],
+ bd->settings))
+ return 1;
+ }
+
+ bloom_count_no++;
+ return 0;
+}
+
+static void add_bloom_key(struct blame_bloom_data *bd,
+ const char *path)
+{
+ if (!bd)
+ return;
+
+ if (bd->nr >= bd->alloc) {
+ bd->alloc *= 2;
+ REALLOC_ARRAY(bd->keys, bd->alloc);
+ }
+
+ bd->keys[bd->nr] = xmalloc(sizeof(struct bloom_key));
+ fill_bloom_key(path, strlen(path), bd->keys[bd->nr], bd->settings);
+ bd->nr++;
+}
+
/*
* We have an origin -- check if the same path exists in the
* parent and return an origin structure to represent it.
*/
-static struct blame_origin *find_origin(struct commit *parent,
- struct blame_origin *origin)
+static struct blame_origin *find_origin(struct repository *r,
+ struct commit *parent,
+ struct blame_origin *origin,
+ struct blame_bloom_data *bd)
{
struct blame_origin *porigin;
struct diff_options diff_opts;
@@ -563,7 +1335,7 @@ static struct blame_origin *find_origin(struct commit *parent,
* and origin first. Most of the time they are the
* same and diff-tree is fairly efficient about this.
*/
- diff_setup(&diff_opts);
+ repo_diff_setup(r, &diff_opts);
diff_opts.flags.recursive = 1;
diff_opts.detect_rename = 0;
diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
@@ -577,10 +1349,18 @@ static struct blame_origin *find_origin(struct commit *parent,
if (is_null_oid(&origin->commit->object.oid))
do_diff_cache(get_commit_tree_oid(parent), &diff_opts);
- else
- diff_tree_oid(get_commit_tree_oid(parent),
- get_commit_tree_oid(origin->commit),
- "", &diff_opts);
+ else {
+ int compute_diff = 1;
+ if (origin->commit->parents &&
+ !oidcmp(&parent->object.oid,
+ &origin->commit->parents->item->object.oid))
+ compute_diff = maybe_changed_path(r, origin, bd);
+
+ if (compute_diff)
+ diff_tree_oid(get_commit_tree_oid(parent),
+ get_commit_tree_oid(origin->commit),
+ "", &diff_opts);
+ }
diffcore_std(&diff_opts);
if (!diff_queued_diff.nr) {
@@ -630,14 +1410,16 @@ static struct blame_origin *find_origin(struct commit *parent,
* We have an origin -- find the path that corresponds to it in its
* parent and return an origin structure to represent it.
*/
-static struct blame_origin *find_rename(struct commit *parent,
- struct blame_origin *origin)
+static struct blame_origin *find_rename(struct repository *r,
+ struct commit *parent,
+ struct blame_origin *origin,
+ struct blame_bloom_data *bd)
{
struct blame_origin *porigin = NULL;
struct diff_options diff_opts;
int i;
- diff_setup(&diff_opts);
+ repo_diff_setup(r, &diff_opts);
diff_opts.flags.recursive = 1;
diff_opts.detect_rename = DIFF_DETECT_RENAME;
diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
@@ -656,6 +1438,7 @@ static struct blame_origin *find_rename(struct commit *parent,
struct diff_filepair *p = diff_queued_diff.queue[i];
if ((p->status == 'R' || p->status == 'C') &&
!strcmp(p->two->path, origin->path)) {
+ add_bloom_key(bd, p->one->path);
porigin = get_origin(parent, p->one->path);
oidcpy(&porigin->blob_oid, &p->one->oid);
porigin->mode = p->one->mode;
@@ -723,8 +1506,14 @@ static void split_overlap(struct blame_entry *split,
struct blame_origin *parent)
{
int chunk_end_lno;
+ int i;
memset(split, 0, sizeof(struct blame_entry [3]));
+ for (i = 0; i < 3; i++) {
+ split[i].ignored = e->ignored;
+ split[i].unblamable = e->unblamable;
+ }
+
if (e->s_lno < tlno) {
/* there is a pre-chunk part not blamed on parent */
split[0].suspect = blame_origin_incref(e->suspect);
@@ -833,6 +1622,164 @@ static struct blame_entry *reverse_blame(struct blame_entry *head,
}
/*
+ * Splits a blame entry into two entries at 'len' lines. The original 'e'
+ * consists of len lines, i.e. [e->lno, e->lno + len), and the second part,
+ * which is returned, consists of the remainder: [e->lno + len, e->lno +
+ * e->num_lines). The caller needs to sort out the reference counting for the
+ * new entry's suspect.
+ */
+static struct blame_entry *split_blame_at(struct blame_entry *e, int len,
+ struct blame_origin *new_suspect)
+{
+ struct blame_entry *n = xcalloc(1, sizeof(struct blame_entry));
+
+ n->suspect = new_suspect;
+ n->ignored = e->ignored;
+ n->unblamable = e->unblamable;
+ n->lno = e->lno + len;
+ n->s_lno = e->s_lno + len;
+ n->num_lines = e->num_lines - len;
+ e->num_lines = len;
+ e->score = 0;
+ return n;
+}
+
+struct blame_line_tracker {
+ int is_parent;
+ int s_lno;
+};
+
+static int are_lines_adjacent(struct blame_line_tracker *first,
+ struct blame_line_tracker *second)
+{
+ return first->is_parent == second->is_parent &&
+ first->s_lno + 1 == second->s_lno;
+}
+
+static int scan_parent_range(struct fingerprint *p_fps,
+ struct fingerprint *t_fps, int t_idx,
+ int from, int nr_lines)
+{
+ int sim, p_idx;
+ #define FINGERPRINT_FILE_THRESHOLD 10
+ int best_sim_val = FINGERPRINT_FILE_THRESHOLD;
+ int best_sim_idx = -1;
+
+ for (p_idx = from; p_idx < from + nr_lines; p_idx++) {
+ sim = fingerprint_similarity(&t_fps[t_idx], &p_fps[p_idx]);
+ if (sim < best_sim_val)
+ continue;
+ /* Break ties with the closest-to-target line number */
+ if (sim == best_sim_val && best_sim_idx != -1 &&
+ abs(best_sim_idx - t_idx) < abs(p_idx - t_idx))
+ continue;
+ best_sim_val = sim;
+ best_sim_idx = p_idx;
+ }
+ return best_sim_idx;
+}
+
+/*
+ * The first pass checks the blame entry (from the target) against the parent's
+ * diff chunk. If that fails for a line, the second pass tries to match that
+ * line to any part of parent file. That catches cases where a change was
+ * broken into two chunks by 'context.'
+ */
+static void guess_line_blames(struct blame_origin *parent,
+ struct blame_origin *target,
+ int tlno, int offset, int same, int parent_len,
+ struct blame_line_tracker *line_blames)
+{
+ int i, best_idx, target_idx;
+ int parent_slno = tlno + offset;
+ int *fuzzy_matches;
+
+ fuzzy_matches = fuzzy_find_matching_lines(parent, target,
+ tlno, parent_slno, same,
+ parent_len);
+ for (i = 0; i < same - tlno; i++) {
+ target_idx = tlno + i;
+ if (fuzzy_matches && fuzzy_matches[i] >= 0) {
+ best_idx = fuzzy_matches[i];
+ } else {
+ best_idx = scan_parent_range(parent->fingerprints,
+ target->fingerprints,
+ target_idx, 0,
+ parent->num_lines);
+ }
+ if (best_idx >= 0) {
+ line_blames[i].is_parent = 1;
+ line_blames[i].s_lno = best_idx;
+ } else {
+ line_blames[i].is_parent = 0;
+ line_blames[i].s_lno = target_idx;
+ }
+ }
+ free(fuzzy_matches);
+}
+
+/*
+ * This decides which parts of a blame entry go to the parent (added to the
+ * ignoredp list) and which stay with the target (added to the diffp list). The
+ * actual decision was made in a separate heuristic function, and those answers
+ * for the lines in 'e' are in line_blames. This consumes e, essentially
+ * putting it on a list.
+ *
+ * Note that the blame entries on the ignoredp list are not necessarily sorted
+ * with respect to the parent's line numbers yet.
+ */
+static void ignore_blame_entry(struct blame_entry *e,
+ struct blame_origin *parent,
+ struct blame_entry **diffp,
+ struct blame_entry **ignoredp,
+ struct blame_line_tracker *line_blames)
+{
+ int entry_len, nr_lines, i;
+
+ /*
+ * We carve new entries off the front of e. Each entry comes from a
+ * contiguous chunk of lines: adjacent lines from the same origin
+ * (either the parent or the target).
+ */
+ entry_len = 1;
+ nr_lines = e->num_lines; /* e changes in the loop */
+ for (i = 0; i < nr_lines; i++) {
+ struct blame_entry *next = NULL;
+
+ /*
+ * We are often adjacent to the next line - only split the blame
+ * entry when we have to.
+ */
+ if (i + 1 < nr_lines) {
+ if (are_lines_adjacent(&line_blames[i],
+ &line_blames[i + 1])) {
+ entry_len++;
+ continue;
+ }
+ next = split_blame_at(e, entry_len,
+ blame_origin_incref(e->suspect));
+ }
+ if (line_blames[i].is_parent) {
+ e->ignored = 1;
+ blame_origin_decref(e->suspect);
+ e->suspect = blame_origin_incref(parent);
+ e->s_lno = line_blames[i - entry_len + 1].s_lno;
+ e->next = *ignoredp;
+ *ignoredp = e;
+ } else {
+ e->unblamable = 1;
+ /* e->s_lno is already in the target's address space. */
+ e->next = *diffp;
+ *diffp = e;
+ }
+ assert(e->num_lines == entry_len);
+ e = next;
+ entry_len = 1;
+ }
+ assert(!e);
+}
+
+/*
* Process one hunk from the patch between the current suspect for
* blame_entry e and its parent. This first blames any unfinished
* entries before the chunk (which is where target and parent start
@@ -841,13 +1788,20 @@ static struct blame_entry *reverse_blame(struct blame_entry *head,
* -C options may lead to overlapping/duplicate source line number
* ranges, all we can rely on from sorting/merging is the order of the
* first suspect line number.
+ *
+ * tlno: line number in the target where this chunk begins
+ * same: line number in the target where this chunk ends
+ * offset: add to tlno to get the chunk starting point in the parent
+ * parent_len: number of lines in the parent chunk
*/
static void blame_chunk(struct blame_entry ***dstq, struct blame_entry ***srcq,
- int tlno, int offset, int same,
- struct blame_origin *parent)
+ int tlno, int offset, int same, int parent_len,
+ struct blame_origin *parent,
+ struct blame_origin *target, int ignore_diffs)
{
struct blame_entry *e = **srcq;
- struct blame_entry *samep = NULL, *diffp = NULL;
+ struct blame_entry *samep = NULL, *diffp = NULL, *ignoredp = NULL;
+ struct blame_line_tracker *line_blames = NULL;
while (e && e->s_lno < tlno) {
struct blame_entry *next = e->next;
@@ -858,14 +1812,9 @@ static void blame_chunk(struct blame_entry ***dstq, struct blame_entry ***srcq,
*/
if (e->s_lno + e->num_lines > tlno) {
/* Move second half to a new record */
- int len = tlno - e->s_lno;
- struct blame_entry *n = xcalloc(1, sizeof (struct blame_entry));
- n->suspect = e->suspect;
- n->lno = e->lno + len;
- n->s_lno = e->s_lno + len;
- n->num_lines = e->num_lines - len;
- e->num_lines = len;
- e->score = 0;
+ struct blame_entry *n;
+
+ n = split_blame_at(e, tlno - e->s_lno, e->suspect);
/* Push new record to diffp */
n->next = diffp;
diffp = n;
@@ -901,6 +1850,14 @@ static void blame_chunk(struct blame_entry ***dstq, struct blame_entry ***srcq,
*/
samep = NULL;
diffp = NULL;
+
+ if (ignore_diffs && same - tlno > 0) {
+ line_blames = xcalloc(sizeof(struct blame_line_tracker),
+ same - tlno);
+ guess_line_blames(parent, target, tlno, offset, same,
+ parent_len, line_blames);
+ }
+
while (e && e->s_lno < same) {
struct blame_entry *next = e->next;
@@ -912,22 +1869,37 @@ static void blame_chunk(struct blame_entry ***dstq, struct blame_entry ***srcq,
* Move second half to a new record to be
* processed by later chunks
*/
- int len = same - e->s_lno;
- struct blame_entry *n = xcalloc(1, sizeof (struct blame_entry));
- n->suspect = blame_origin_incref(e->suspect);
- n->lno = e->lno + len;
- n->s_lno = e->s_lno + len;
- n->num_lines = e->num_lines - len;
- e->num_lines = len;
- e->score = 0;
+ struct blame_entry *n;
+
+ n = split_blame_at(e, same - e->s_lno,
+ blame_origin_incref(e->suspect));
/* Push new record to samep */
n->next = samep;
samep = n;
}
- e->next = diffp;
- diffp = e;
+ if (ignore_diffs) {
+ ignore_blame_entry(e, parent, &diffp, &ignoredp,
+ line_blames + e->s_lno - tlno);
+ } else {
+ e->next = diffp;
+ diffp = e;
+ }
e = next;
}
+ free(line_blames);
+ if (ignoredp) {
+ /*
+ * Note ignoredp is not sorted yet, and thus neither is dstq.
+ * That list must be sorted before we queue_blames(). We defer
+ * sorting until after all diff hunks are processed, so that
+ * guess_line_blames() can pick *any* line in the parent. The
+ * slight drawback is that we end up sorting all blame entries
+ * passed to the parent, including those that are unrelated to
+ * changes made by the ignored commit.
+ */
+ **dstq = reverse_blame(ignoredp, **dstq);
+ *dstq = &ignoredp->next;
+ }
**srcq = reverse_blame(diffp, reverse_blame(samep, e));
/* Move across elements that are in the unblamable portion */
if (diffp)
@@ -936,7 +1908,9 @@ static void blame_chunk(struct blame_entry ***dstq, struct blame_entry ***srcq,
struct blame_chunk_cb_data {
struct blame_origin *parent;
+ struct blame_origin *target;
long offset;
+ int ignore_diffs;
struct blame_entry **dstq;
struct blame_entry **srcq;
};
@@ -949,7 +1923,8 @@ static int blame_chunk_cb(long start_a, long count_a,
if (start_a - start_b != d->offset)
die("internal error in blame::blame_chunk_cb");
blame_chunk(&d->dstq, &d->srcq, start_b, start_a - start_b,
- start_b + count_b, d->parent);
+ start_b + count_b, count_a, d->parent, d->target,
+ d->ignore_diffs);
d->offset = start_a + count_a - (start_b + count_b);
return 0;
}
@@ -961,7 +1936,7 @@ static int blame_chunk_cb(long start_a, long count_a,
*/
static void pass_blame_to_parent(struct blame_scoreboard *sb,
struct blame_origin *target,
- struct blame_origin *parent)
+ struct blame_origin *parent, int ignore_diffs)
{
mmfile_t file_p, file_o;
struct blame_chunk_cb_data d;
@@ -971,11 +1946,15 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb,
return; /* nothing remains for this target */
d.parent = parent;
+ d.target = target;
d.offset = 0;
+ d.ignore_diffs = ignore_diffs;
d.dstq = &newdest; d.srcq = &target->suspects;
- fill_origin_blob(&sb->revs->diffopt, parent, &file_p, &sb->num_read_blob);
- fill_origin_blob(&sb->revs->diffopt, target, &file_o, &sb->num_read_blob);
+ fill_origin_blob(&sb->revs->diffopt, parent, &file_p,
+ &sb->num_read_blob, ignore_diffs);
+ fill_origin_blob(&sb->revs->diffopt, target, &file_o,
+ &sb->num_read_blob, ignore_diffs);
sb->num_get_patch++;
if (diff_hunks(&file_p, &file_o, blame_chunk_cb, &d, sb->xdl_opts))
@@ -983,8 +1962,13 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb,
oid_to_hex(&parent->commit->object.oid),
oid_to_hex(&target->commit->object.oid));
/* The rest are the same as the parent */
- blame_chunk(&d.dstq, &d.srcq, INT_MAX, d.offset, INT_MAX, parent);
+ blame_chunk(&d.dstq, &d.srcq, INT_MAX, d.offset, INT_MAX, 0,
+ parent, target, 0);
*d.dstq = NULL;
+ if (ignore_diffs)
+ newdest = llist_mergesort(newdest, get_next_blame,
+ set_next_blame,
+ compare_blame_suspect);
queue_blames(sb, parent, newdest);
return;
@@ -1181,7 +2165,8 @@ static void find_move_in_parent(struct blame_scoreboard *sb,
if (!unblamed)
return; /* nothing remains for this target */
- fill_origin_blob(&sb->revs->diffopt, parent, &file_p, &sb->num_read_blob);
+ fill_origin_blob(&sb->revs->diffopt, parent, &file_p,
+ &sb->num_read_blob, 0);
if (!file_p.ptr)
return;
@@ -1261,7 +2246,7 @@ static void find_copy_in_parent(struct blame_scoreboard *sb,
if (!unblamed)
return; /* nothing remains for this target */
- diff_setup(&diff_opts);
+ repo_diff_setup(sb->repo, &diff_opts);
diff_opts.flags.recursive = 1;
diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT;
@@ -1310,7 +2295,8 @@ static void find_copy_in_parent(struct blame_scoreboard *sb,
norigin = get_origin(parent, p->one->path);
oidcpy(&norigin->blob_oid, &p->one->oid);
norigin->mode = p->one->mode;
- fill_origin_blob(&sb->revs->diffopt, norigin, &file_p, &sb->num_read_blob);
+ fill_origin_blob(&sb->revs->diffopt, norigin, &file_p,
+ &sb->num_read_blob, 0);
if (!file_p.ptr)
continue;
@@ -1419,6 +2405,11 @@ static void distribute_blame(struct blame_scoreboard *sb, struct blame_entry *bl
#define MAXSG 16
+typedef struct blame_origin *(*blame_find_alg)(struct repository *,
+ struct commit *,
+ struct blame_origin *,
+ struct blame_bloom_data *);
+
static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin, int opt)
{
struct rev_info *revs = sb->revs;
@@ -1443,8 +2434,7 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
* common cases, then we look for renames in the second pass.
*/
for (pass = 0; pass < 2 - sb->no_whole_file_rename; pass++) {
- struct blame_origin *(*find)(struct commit *, struct blame_origin *);
- find = pass ? find_rename : find_origin;
+ blame_find_alg find = pass ? find_rename : find_origin;
for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
i < num_sg && sg;
@@ -1456,17 +2446,17 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
continue;
if (parse_commit(p))
continue;
- porigin = find(p, origin);
+ porigin = find(sb->repo, p, origin, sb->bloom_data);
if (!porigin)
continue;
- if (!oidcmp(&porigin->blob_oid, &origin->blob_oid)) {
+ if (oideq(&porigin->blob_oid, &origin->blob_oid)) {
pass_whole_blame(sb, origin, porigin);
blame_origin_decref(porigin);
goto finish;
}
for (j = same = 0; j < i; j++)
if (sg_origin[j] &&
- !oidcmp(&sg_origin[j]->blob_oid, &porigin->blob_oid)) {
+ oideq(&sg_origin[j]->blob_oid, &porigin->blob_oid)) {
same = 1;
break;
}
@@ -1488,12 +2478,35 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
blame_origin_incref(porigin);
origin->previous = porigin;
}
- pass_blame_to_parent(sb, origin, porigin);
+ pass_blame_to_parent(sb, origin, porigin, 0);
if (!origin->suspects)
goto finish;
}
/*
+ * Pass remaining suspects for ignored commits to their parents.
+ */
+ if (oidset_contains(&sb->ignore_list, &commit->object.oid)) {
+ for (i = 0, sg = first_scapegoat(revs, commit, sb->reverse);
+ i < num_sg && sg;
+ sg = sg->next, i++) {
+ struct blame_origin *porigin = sg_origin[i];
+
+ if (!porigin)
+ continue;
+ pass_blame_to_parent(sb, origin, porigin, 1);
+ /*
+ * Preemptively drop porigin so we can refresh the
+ * fingerprints if we use the parent again, which can
+ * occur if you ignore back-to-back commits.
+ */
+ drop_origin_blob(porigin);
+ if (!origin->suspects)
+ goto finish;
+ }
+ }
+
+ /*
* Optionally find moves in parents' files.
*/
if (opt & PICKAXE_BLAME_MOVE) {
@@ -1555,7 +2568,8 @@ finish:
}
for (i = 0; i < num_sg; i++) {
if (sg_origin[i]) {
- drop_origin_blob(sg_origin[i]);
+ if (!sg_origin[i]->suspects)
+ drop_origin_blob(sg_origin[i]);
blame_origin_decref(sg_origin[i]);
}
}
@@ -1632,37 +2646,14 @@ void assign_blame(struct blame_scoreboard *sb, int opt)
}
}
-static const char *get_next_line(const char *start, const char *end)
-{
- const char *nl = memchr(start, '\n', end - start);
- return nl ? nl + 1 : end;
-}
-
/*
* To allow quick access to the contents of nth line in the
* final image, prepare an index in the scoreboard.
*/
static int prepare_lines(struct blame_scoreboard *sb)
{
- const char *buf = sb->final_buf;
- unsigned long len = sb->final_buf_size;
- const char *end = buf + len;
- const char *p;
- int *lineno;
- int num = 0;
-
- for (p = buf; p < end; p = get_next_line(p, end))
- num++;
-
- ALLOC_ARRAY(sb->lineno, num + 1);
- lineno = sb->lineno;
-
- for (p = buf; p < end; p = get_next_line(p, end))
- *lineno++ = p - buf;
-
- *lineno = len;
-
- sb->num_lines = num;
+ sb->num_lines = find_line_starts(&sb->lineno, sb->final_buf,
+ sb->final_buf_size);
return sb->num_lines;
}
@@ -1677,7 +2668,7 @@ static struct commit *find_single_final(struct rev_info *revs,
struct object *obj = revs->pending.objects[i].item;
if (obj->flags & UNINTERESTING)
continue;
- obj = deref_tag(the_repository, obj, NULL, 0);
+ obj = deref_tag(revs->repo, obj, NULL, 0);
if (obj->type != OBJ_COMMIT)
die("Non commit %s?", revs->pending.objects[i].name);
if (found)
@@ -1708,14 +2699,14 @@ static struct commit *dwim_reverse_initial(struct rev_info *revs,
/* Is that sole rev a committish? */
obj = revs->pending.objects[0].item;
- obj = deref_tag(the_repository, obj, NULL, 0);
+ obj = deref_tag(revs->repo, obj, NULL, 0);
if (obj->type != OBJ_COMMIT)
return NULL;
/* Do we have HEAD? */
if (!resolve_ref_unsafe("HEAD", RESOLVE_REF_READING, &head_oid, NULL))
return NULL;
- head_commit = lookup_commit_reference_gently(the_repository,
+ head_commit = lookup_commit_reference_gently(revs->repo,
&head_oid, 1);
if (!head_commit)
return NULL;
@@ -1744,7 +2735,7 @@ static struct commit *find_single_initial(struct rev_info *revs,
struct object *obj = revs->pending.objects[i].item;
if (!(obj->flags & UNINTERESTING))
continue;
- obj = deref_tag(the_repository, obj, NULL, 0);
+ obj = deref_tag(revs->repo, obj, NULL, 0);
if (obj->type != OBJ_COMMIT)
die("Non commit %s?", revs->pending.objects[i].name);
if (found)
@@ -1834,7 +2825,7 @@ void setup_scoreboard(struct blame_scoreboard *sb,
sb->revs->children.name = "children";
while (c->parents &&
- oidcmp(&c->object.oid, &sb->final->object.oid)) {
+ !oideq(&c->object.oid, &sb->final->object.oid)) {
struct commit_list *l = xcalloc(1, sizeof(*l));
l->item = c;
@@ -1844,7 +2835,7 @@ void setup_scoreboard(struct blame_scoreboard *sb,
c = c->parents->item;
}
- if (oidcmp(&c->object.oid, &sb->final->object.oid))
+ if (!oideq(&c->object.oid, &sb->final->object.oid))
die(_("--reverse --first-parent together require range along first-parent chain"));
}
@@ -1859,7 +2850,7 @@ void setup_scoreboard(struct blame_scoreboard *sb,
die(_("no such path %s in %s"), path, final_commit_name);
if (sb->revs->diffopt.flags.allow_textconv &&
- textconv_object(path, o->mode, &o->blob_oid, 1, (char **) &sb->final_buf,
+ textconv_object(sb->repo, path, o->mode, &o->blob_oid, 1, (char **) &sb->final_buf,
&sb->final_buf_size))
;
else
@@ -1895,3 +2886,45 @@ struct blame_entry *blame_entry_prepend(struct blame_entry *head,
blame_origin_incref(o);
return new_head;
}
+
+void setup_blame_bloom_data(struct blame_scoreboard *sb,
+ const char *path)
+{
+ struct blame_bloom_data *bd;
+
+ if (!sb->repo->objects->commit_graph)
+ return;
+
+ if (!sb->repo->objects->commit_graph->bloom_filter_settings)
+ return;
+
+ bd = xmalloc(sizeof(struct blame_bloom_data));
+
+ bd->settings = sb->repo->objects->commit_graph->bloom_filter_settings;
+
+ bd->alloc = 4;
+ bd->nr = 0;
+ ALLOC_ARRAY(bd->keys, bd->alloc);
+
+ add_bloom_key(bd, path);
+
+ sb->bloom_data = bd;
+}
+
+void cleanup_scoreboard(struct blame_scoreboard *sb)
+{
+ if (sb->bloom_data) {
+ int i;
+ for (i = 0; i < sb->bloom_data->nr; i++) {
+ free(sb->bloom_data->keys[i]->hashes);
+ free(sb->bloom_data->keys[i]);
+ }
+ free(sb->bloom_data->keys);
+ FREE_AND_NULL(sb->bloom_data);
+
+ trace2_data_intmax("blame", sb->repo,
+ "bloom/queries", bloom_count_queries);
+ trace2_data_intmax("blame", sb->repo,
+ "bloom/response-no", bloom_count_no);
+ }
+}