summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLibravatar Elijah Newren <newren@gmail.com>2021-02-14 07:51:47 +0000
committerLibravatar Junio C Hamano <gitster@pobox.com>2021-02-15 18:02:16 -0800
commita35df3371c2e2e9b407ff8c950169e74f6bf4add (patch)
tree1fcbc5f408994502d22a84697e22ea44ae0010aa
parentt4001: add a test comparing basename similarity and content similarity (diff)
downloadtgif-a35df3371c2e2e9b407ff8c950169e74f6bf4add.tar.xz
diffcore-rename: compute basenames of source and dest candidates
We want to make use of unique basenames among remaining source and destination files to help inform rename detection, so that more likely pairings can be checked first. (src/moduleA/foo.txt and source/module/A/foo.txt are likely related if there are no other 'foo.txt' files among the remaining deleted and added files.) Add a new function, not yet used, which creates a map of the unique basenames within rename_src and another within rename_dst, together with the indices within rename_src/rename_dst where those basenames show up. Non-unique basenames still show up in the map, but have an invalid index (-1).

This function was inspired by the fact that in real world repositories, files are often moved across directories without changing names. Here are some sample repositories and the percentage of their historical renames (as of early 2020) that preserved basenames:

  * linux: 76%
  * gcc: 64%
  * gecko: 79%
  * webkit: 89%

These statistics alone don't prove that an optimization in this area will help or how much it will help, since there are also unpaired adds and deletes, restrictions on which basenames we consider, etc., but it certainly motivated the idea to try something in this area.

Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
-rw-r--r--diffcore-rename.c63
1 files changed, 63 insertions, 0 deletions
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 6fd0c4a2f4..e51f33a218 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -367,6 +367,69 @@ static int find_exact_renames(struct diff_options *options)
return renames;
}
+static const char *get_basename(const char *filename)
+{
+ /*
+ * Return what follows the last '/' in filename, or all of filename
+ * if it has none. gitbasename() must also cope with special drives,
+ * repeated separators, trailing slashes, NULL or empty strings, etc.;
+ * we only see filenames as stored in git, so we can skip all that.
+ */
+ const char *base = strrchr(filename, '/');
+ return base ? base + 1 : filename;
+}
+
+MAYBE_UNUSED
+static int find_basename_matches(struct diff_options *options,
+ int minimum_score)
+{
+ int i;
+ struct strintmap sources;
+ struct strintmap dests;
+
+ /*
+ * Map each basename to its index in rename_src / rename_dst, or to
+ * -1 once a second remaining path turns out to share that basename.
+ */
+ strintmap_init_with_options(&sources, -1, NULL, 0);
+ strintmap_init_with_options(&dests, -1, NULL, 0);
+ for (i = 0; i < rename_src_nr; ++i) {
+ char *filename = rename_src[i].p->one->path;
+ const char *base;
+
+ /* exact renames removed in remove_unneeded_paths_from_src() */
+ assert(!rename_src[i].p->one->rename_used);
+
+ /* Record index within rename_src (i) if basename is unique */
+ base = get_basename(filename);
+ if (strintmap_contains(&sources, base))
+ strintmap_set(&sources, base, -1);
+ else
+ strintmap_set(&sources, base, i);
+ }
+ for (i = 0; i < rename_dst_nr; ++i) {
+ char *filename = rename_dst[i].p->two->path;
+ const char *base;
+
+ if (rename_dst[i].is_rename)
+ continue; /* involved in exact match already. */
+
+ /* Record index within rename_dst (i) if basename is unique */
+ base = get_basename(filename);
+ if (strintmap_contains(&dests, base))
+ strintmap_set(&dests, base, -1);
+ else
+ strintmap_set(&dests, base, i);
+ }
+
+ /* TODO: Make use of the source and destination basename maps */
+
+ strintmap_clear(&sources);
+ strintmap_clear(&dests);
+
+ return 0;
+}
+
#define NUM_CANDIDATE_PER_DST 4
static void record_if_better(struct diff_score m[], struct diff_score *o)
{