summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--diffcore-rename.c230
-rw-r--r--diffcore.h19
-rw-r--r--merge-ort.c79
3 files changed, 281 insertions, 47 deletions
diff --git a/diffcore-rename.c b/diffcore-rename.c
index 36a98f9c49..963ca58221 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -371,7 +371,7 @@ struct dir_rename_info {
struct strintmap idx_map;
struct strmap dir_rename_guess;
struct strmap *dir_rename_count;
- struct strset *relevant_source_dirs;
+ struct strintmap *relevant_source_dirs;
unsigned setup;
};
@@ -407,6 +407,28 @@ static const char *get_highest_rename_path(struct strintmap *counts)
return highest_destination_dir;
}
+static char *UNKNOWN_DIR = "/"; /* placeholder -- short, illegal directory */
+
+static int dir_rename_already_determinable(struct strintmap *counts)
+{
+ struct hashmap_iter iter;
+ struct strmap_entry *entry;
+ int first = 0, second = 0, unknown = 0;
+ strintmap_for_each_entry(counts, &iter, entry) {
+ const char *destination_dir = entry->key;
+ intptr_t count = (intptr_t)entry->value;
+ if (!strcmp(destination_dir, UNKNOWN_DIR)) {
+ unknown = count;
+ } else if (count >= first) {
+ second = first;
+ first = count;
+ } else if (count >= second) {
+ second = count;
+ }
+ }
+ return first > second + unknown;
+}
+
static void increment_count(struct dir_rename_info *info,
char *old_dir,
char *new_dir)
@@ -429,7 +451,7 @@ static void increment_count(struct dir_rename_info *info,
}
static void update_dir_rename_counts(struct dir_rename_info *info,
- struct strset *dirs_removed,
+ struct strintmap *dirs_removed,
const char *oldname,
const char *newname)
{
@@ -461,10 +483,12 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
return;
while (1) {
+ int drd_flag = NOT_RELEVANT;
+
/* Get old_dir, skip if its directory isn't relevant. */
dirname_munge(old_dir);
if (info->relevant_source_dirs &&
- !strset_contains(info->relevant_source_dirs, old_dir))
+ !strintmap_contains(info->relevant_source_dirs, old_dir))
break;
/* Get new_dir */
@@ -509,16 +533,31 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
}
}
- if (strset_contains(dirs_removed, old_dir))
+ /*
+ * Above we suggested that we'd keep recording renames for
+ * all ancestor directories where the trailing directories
+ * matched, i.e. for
+ * "a/b/c/d/e/foo.c" -> "a/b/some/thing/else/e/foo.c"
+ * we'd increment rename counts for each of
+ * a/b/c/d/e/ => a/b/some/thing/else/e/
+ * a/b/c/d/ => a/b/some/thing/else/
+ * However, we only need the rename counts for directories
+ * in dirs_removed whose value is RELEVANT_FOR_SELF.
+ * However, we add one special case of also recording it for
+ * first_time_in_loop because find_basename_matches() can
+ * use that as a hint to find a good pairing.
+ */
+ if (dirs_removed)
+ drd_flag = strintmap_get(dirs_removed, old_dir);
+ if (drd_flag == RELEVANT_FOR_SELF || first_time_in_loop)
increment_count(info, old_dir, new_dir);
- else
- break;
+ first_time_in_loop = 0;
+ if (drd_flag == NOT_RELEVANT)
+ break;
/* If we hit toplevel directory ("") for old or new dir, quit */
if (!*old_dir || !*new_dir)
break;
-
- first_time_in_loop = 0;
}
/* Free resources we don't need anymore */
@@ -527,8 +566,8 @@ static void update_dir_rename_counts(struct dir_rename_info *info,
}
static void initialize_dir_rename_info(struct dir_rename_info *info,
- struct strset *relevant_sources,
- struct strset *dirs_removed,
+ struct strintmap *relevant_sources,
+ struct strintmap *dirs_removed,
struct strmap *dir_rename_count)
{
struct hashmap_iter iter;
@@ -555,12 +594,13 @@ static void initialize_dir_rename_info(struct dir_rename_info *info,
info->relevant_source_dirs = dirs_removed; /* might be NULL */
} else {
info->relevant_source_dirs = xmalloc(sizeof(struct strintmap));
- strset_init(info->relevant_source_dirs);
- strset_for_each_entry(relevant_sources, &iter, entry) {
+ strintmap_init(info->relevant_source_dirs, 0 /* unused */);
+ strintmap_for_each_entry(relevant_sources, &iter, entry) {
char *dirname = get_dirname(entry->key);
if (!dirs_removed ||
- strset_contains(dirs_removed, dirname))
- strset_add(info->relevant_source_dirs, dirname);
+ strintmap_contains(dirs_removed, dirname))
+ strintmap_set(info->relevant_source_dirs,
+ dirname, 0 /* value irrelevant */);
free(dirname);
}
}
@@ -624,7 +664,7 @@ void partial_clear_dir_rename_count(struct strmap *dir_rename_count)
}
static void cleanup_dir_rename_info(struct dir_rename_info *info,
- struct strset *dirs_removed,
+ struct strintmap *dirs_removed,
int keep_dir_rename_count)
{
struct hashmap_iter iter;
@@ -644,7 +684,7 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
/* relevant_source_dirs */
if (info->relevant_source_dirs &&
info->relevant_source_dirs != dirs_removed) {
- strset_clear(info->relevant_source_dirs);
+ strintmap_clear(info->relevant_source_dirs);
FREE_AND_NULL(info->relevant_source_dirs);
}
@@ -659,18 +699,22 @@ static void cleanup_dir_rename_info(struct dir_rename_info *info,
/*
* Although dir_rename_count was passed in
* diffcore_rename_extended() and we want to keep it around and
- * return it to that caller, we first want to remove any data
+ * return it to that caller, we first want to remove any counts in
+ * the maps associated with UNKNOWN_DIR entries and any data
* associated with directories that weren't renamed.
*/
strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
const char *source_dir = entry->key;
struct strintmap *counts = entry->value;
- if (!strset_contains(dirs_removed, source_dir)) {
+ if (!strintmap_get(dirs_removed, source_dir)) {
string_list_append(&to_remove, source_dir);
strintmap_clear(counts);
continue;
}
+
+ if (strintmap_contains(counts, UNKNOWN_DIR))
+ strintmap_remove(counts, UNKNOWN_DIR);
}
for (i = 0; i < to_remove.nr; ++i)
strmap_remove(info->dir_rename_count,
@@ -770,8 +814,8 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info)
static int find_basename_matches(struct diff_options *options,
int minimum_score,
struct dir_rename_info *info,
- struct strset *relevant_sources,
- struct strset *dirs_removed)
+ struct strintmap *relevant_sources,
+ struct strintmap *dirs_removed)
{
/*
* When I checked in early 2020, over 76% of file renames in linux
@@ -863,7 +907,7 @@ static int find_basename_matches(struct diff_options *options,
/* Skip irrelevant sources */
if (relevant_sources &&
- !strset_contains(relevant_sources, filename))
+ !strintmap_contains(relevant_sources, filename))
continue;
/*
@@ -994,7 +1038,7 @@ static int find_renames(struct diff_score *mx,
int minimum_score,
int copies,
struct dir_rename_info *info,
- struct strset *dirs_removed)
+ struct strintmap *dirs_removed)
{
int count = 0, i;
@@ -1019,7 +1063,7 @@ static int find_renames(struct diff_score *mx,
}
static void remove_unneeded_paths_from_src(int detecting_copies,
- struct strset *interesting)
+ struct strintmap *interesting)
{
int i, new_num_src;
@@ -1061,7 +1105,7 @@ static void remove_unneeded_paths_from_src(int detecting_copies,
continue;
/* If we don't care about the source path, skip it */
- if (interesting && !strset_contains(interesting, one->path))
+ if (interesting && !strintmap_contains(interesting, one->path))
continue;
if (new_num_src < i)
@@ -1073,9 +1117,136 @@ static void remove_unneeded_paths_from_src(int detecting_copies,
rename_src_nr = new_num_src;
}
+static void handle_early_known_dir_renames(struct dir_rename_info *info,
+ struct strintmap *relevant_sources,
+ struct strintmap *dirs_removed)
+{
+ /*
+ * Directory renames are determined via an aggregate of all renames
+ * under them and using a "majority wins" rule. The fact that
+ * "majority wins", though, means we don't need all the renames
+ * under the given directory, we only need enough to ensure we have
+ * a majority.
+ */
+
+ int i, new_num_src;
+ struct hashmap_iter iter;
+ struct strmap_entry *entry;
+
+ if (!dirs_removed || !relevant_sources)
+ return; /* nothing to cull */
+ if (break_idx)
+ return; /* culling incompatbile with break detection */
+
+ /*
+ * Supplement dir_rename_count with number of potential renames,
+ * marking all potential rename sources as mapping to UNKNOWN_DIR.
+ */
+ for (i = 0; i < rename_src_nr; i++) {
+ char *old_dir;
+ struct diff_filespec *one = rename_src[i].p->one;
+
+ /*
+ * sources that are part of a rename will have already been
+ * removed by a prior call to remove_unneeded_paths_from_src()
+ */
+ assert(!one->rename_used);
+
+ old_dir = get_dirname(one->path);
+ while (*old_dir != '\0' &&
+ NOT_RELEVANT != strintmap_get(dirs_removed, old_dir)) {
+ char *freeme = old_dir;
+
+ increment_count(info, old_dir, UNKNOWN_DIR);
+ old_dir = get_dirname(old_dir);
+
+ /* Free resources we don't need anymore */
+ free(freeme);
+ }
+ /*
+ * old_dir and new_dir free'd in increment_count, but
+ * get_dirname() gives us a new pointer we need to free for
+ * old_dir. Also, if the loop runs 0 times we need old_dir
+ * to be freed.
+ */
+ free(old_dir);
+ }
+
+ /*
+ * For any directory which we need a potential rename detected for
+ * (i.e. those marked as RELEVANT_FOR_SELF in dirs_removed), check
+ * whether we have enough renames to satisfy the "majority rules"
+ * requirement such that detecting any more renames of files under
+ * it won't change the result. For any such directory, mark that
+ * we no longer need to detect a rename for it. However, since we
+ * might need to still detect renames for an ancestor of that
+ * directory, use RELEVANT_FOR_ANCESTOR.
+ */
+ strmap_for_each_entry(info->dir_rename_count, &iter, entry) {
+ /* entry->key is source_dir */
+ struct strintmap *counts = entry->value;
+
+ if (strintmap_get(dirs_removed, entry->key) ==
+ RELEVANT_FOR_SELF &&
+ dir_rename_already_determinable(counts)) {
+ strintmap_set(dirs_removed, entry->key,
+ RELEVANT_FOR_ANCESTOR);
+ }
+ }
+
+ for (i = 0, new_num_src = 0; i < rename_src_nr; i++) {
+ struct diff_filespec *one = rename_src[i].p->one;
+ int val;
+
+ val = strintmap_get(relevant_sources, one->path);
+
+ /*
+ * sources that were not found in relevant_sources should
+ * have already been removed by a prior call to
+ * remove_unneeded_paths_from_src()
+ */
+ assert(val != -1);
+
+ if (val == RELEVANT_LOCATION) {
+ int removable = 1;
+ char *dir = get_dirname(one->path);
+ while (1) {
+ char *freeme = dir;
+ int res = strintmap_get(dirs_removed, dir);
+
+ /* Quit if not found or irrelevant */
+ if (res == NOT_RELEVANT)
+ break;
+ /* If RELEVANT_FOR_SELF, can't remove */
+ if (res == RELEVANT_FOR_SELF) {
+ removable = 0;
+ break;
+ }
+ /* Else continue searching upwards */
+ assert(res == RELEVANT_FOR_ANCESTOR);
+ dir = get_dirname(dir);
+ free(freeme);
+ }
+ free(dir);
+ if (removable) {
+ strintmap_set(relevant_sources, one->path,
+ RELEVANT_NO_MORE);
+ continue;
+ }
+ }
+
+ if (new_num_src < i)
+ memcpy(&rename_src[new_num_src], &rename_src[i],
+ sizeof(struct diff_rename_src));
+ new_num_src++;
+ }
+
+ rename_src_nr = new_num_src;
+}
+
void diffcore_rename_extended(struct diff_options *options,
- struct strset *relevant_sources,
- struct strset *dirs_removed,
+ struct strintmap *relevant_sources,
+ struct strintmap *dirs_removed,
struct strmap *dir_rename_count)
{
int detect_rename = options->detect_rename;
@@ -1208,9 +1379,16 @@ void diffcore_rename_extended(struct diff_options *options,
* Cull sources, again:
* - remove ones involved in renames (found via basenames)
* - remove ones not found in relevant_sources
+ * and
+ * - remove ones in relevant_sources which are needed only
+ * for directory renames IF no ancestory directory
+ * actually needs to know any more individual path
+ * renames under them
*/
trace2_region_enter("diff", "cull basename", options->repo);
remove_unneeded_paths_from_src(want_copies, relevant_sources);
+ handle_early_known_dir_renames(&info, relevant_sources,
+ dirs_removed);
trace2_region_leave("diff", "cull basename", options->repo);
}
diff --git a/diffcore.h b/diffcore.h
index d76982f220..f5c6de4841 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -8,8 +8,8 @@
struct diff_options;
struct repository;
+struct strintmap;
struct strmap;
-struct strset;
struct userdiff_driver;
/* This header file is internal between diff.c and its diff transformers
@@ -161,13 +161,26 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *,
struct diff_filespec *);
void diff_q(struct diff_queue_struct *, struct diff_filepair *);
+/* dir_rename_relevance: the reason we want rename information for a dir */
+enum dir_rename_relevance {
+ NOT_RELEVANT = 0,
+ RELEVANT_FOR_ANCESTOR = 1,
+ RELEVANT_FOR_SELF = 2
+};
+/* file_rename_relevance: the reason(s) we want rename information for a file */
+enum file_rename_relevance {
+ RELEVANT_NO_MORE = 0, /* i.e. NOT relevant */
+ RELEVANT_CONTENT = 1,
+ RELEVANT_LOCATION = 2
+};
+
void partial_clear_dir_rename_count(struct strmap *dir_rename_count);
void diffcore_break(struct repository *, int);
void diffcore_rename(struct diff_options *);
void diffcore_rename_extended(struct diff_options *options,
- struct strset *relevant_sources,
- struct strset *dirs_removed,
+ struct strintmap *relevant_sources,
+ struct strintmap *dirs_removed,
struct strmap *dir_rename_count);
void diffcore_merge_broken(void);
void diffcore_pickaxe(struct diff_options *);
diff --git a/merge-ort.c b/merge-ort.c
index 5e118a85ee..028d712f54 100644
--- a/merge-ort.c
+++ b/merge-ort.c
@@ -73,8 +73,12 @@ struct rename_info {
/*
* dirs_removed: directories removed on a given side of history.
+ *
+ * The keys of dirs_removed[side] are the directories that were removed
+ * on the given side of history. The value of the strintmap for each
+ * directory is a value from enum dir_rename_relevance.
*/
- struct strset dirs_removed[3];
+ struct strintmap dirs_removed[3];
/*
* dir_rename_count: tracking where parts of a directory were renamed to
@@ -95,18 +99,20 @@ struct rename_info {
struct strmap dir_renames[3];
/*
- * relevant_sources: deleted paths for which we need rename detection
+ * relevant_sources: deleted paths wanted in rename detection, and why
*
* relevant_sources is a set of deleted paths on each side of
* history for which we need rename detection. If a path is deleted
* on one side of history, we need to detect if it is part of a
* rename if either
- * * we need to detect renames for an ancestor directory
* * the file is modified/deleted on the other side of history
+ * * we need to detect renames for an ancestor directory
* If neither of those are true, we can skip rename detection for
- * that path.
+ * that path. The reason is stored as a value from enum
+ * file_rename_relevance, as the reason can inform the algorithm in
+ * diffcore_rename_extended().
*/
- struct strset relevant_sources[3];
+ struct strintmap relevant_sources[3];
/*
* dir_rename_mask:
@@ -362,8 +368,8 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
int i;
void (*strmap_func)(struct strmap *, int) =
reinitialize ? strmap_partial_clear : strmap_clear;
- void (*strset_func)(struct strset *) =
- reinitialize ? strset_partial_clear : strset_clear;
+ void (*strintmap_func)(struct strintmap *) =
+ reinitialize ? strintmap_partial_clear : strintmap_clear;
/*
* We marked opti->paths with strdup_strings = 0, so that we
@@ -395,7 +401,7 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
/* Free memory used by various renames maps */
for (i = MERGE_SIDE1; i <= MERGE_SIDE2; ++i) {
- strset_func(&renames->dirs_removed[i]);
+ strintmap_func(&renames->dirs_removed[i]);
partial_clear_dir_rename_count(&renames->dir_rename_count[i]);
if (!reinitialize)
@@ -403,7 +409,7 @@ static void clear_or_reinit_internal_opts(struct merge_options_internal *opti,
strmap_func(&renames->dir_renames[i], 0);
- strset_func(&renames->relevant_sources[i]);
+ strintmap_func(&renames->relevant_sources[i]);
}
if (!reinitialize) {
@@ -673,8 +679,11 @@ static void add_pair(struct merge_options *opt,
unsigned content_relevant = (match_mask == 0);
unsigned location_relevant = (dir_rename_mask == 0x07);
- if (content_relevant || location_relevant)
- strset_add(&renames->relevant_sources[side], pathname);
+ if (content_relevant || location_relevant) {
+ /* content_relevant trumps location_relevant */
+ strintmap_set(&renames->relevant_sources[side], pathname,
+ content_relevant ? RELEVANT_CONTENT : RELEVANT_LOCATION);
+ }
}
one = alloc_filespec(pathname);
@@ -729,10 +738,41 @@ static void collect_rename_info(struct merge_options *opt,
if (dirmask == 1 || dirmask == 3 || dirmask == 5) {
/* absent_mask = 0x07 - dirmask; sides = absent_mask/2 */
unsigned sides = (0x07 - dirmask)/2;
+ unsigned relevance = (renames->dir_rename_mask == 0x07) ?
+ RELEVANT_FOR_ANCESTOR : NOT_RELEVANT;
+ /*
+ * Record relevance of this directory. However, note that
+ * when collect_merge_info_callback() recurses into this
+ * directory and calls collect_rename_info() on paths
+ * within that directory, if we find a path that was added
+ * to this directory on the other side of history, we will
+ * upgrade this value to RELEVANT_FOR_SELF; see below.
+ */
if (sides & 1)
- strset_add(&renames->dirs_removed[1], fullname);
+ strintmap_set(&renames->dirs_removed[1], fullname,
+ relevance);
if (sides & 2)
- strset_add(&renames->dirs_removed[2], fullname);
+ strintmap_set(&renames->dirs_removed[2], fullname,
+ relevance);
+ }
+
+ /*
+ * Here's the block that potentially upgrades to RELEVANT_FOR_SELF.
+ * When we run across a file added to a directory. In such a case,
+ * find the directory of the file and upgrade its relevance.
+ */
+ if (renames->dir_rename_mask == 0x07 &&
+ (filemask == 2 || filemask == 4)) {
+ /*
+ * Need directory rename for parent directory on other side
+ * of history from added file. Thus
+ * side = (~filemask & 0x06) >> 1
+ * or
+ * side = 3 - (filemask/2).
+ */
+ unsigned side = 3 - (filemask >> 1);
+ strintmap_set(&renames->dirs_removed[side], dirname,
+ RELEVANT_FOR_SELF);
}
if (filemask == 0 || filemask == 7)
@@ -1511,6 +1551,9 @@ static void get_provisional_directory_renames(struct merge_options *opt,
}
}
+ if (max == 0)
+ continue;
+
if (bad_max == max) {
path_msg(opt, source_dir, 0,
_("CONFLICT (directory rename split): "
@@ -2160,7 +2203,7 @@ static inline int possible_side_renames(struct rename_info *renames,
unsigned side_index)
{
return renames->pairs[side_index].nr > 0 &&
- !strset_empty(&renames->relevant_sources[side_index]);
+ !strintmap_empty(&renames->relevant_sources[side_index]);
}
static inline int possible_renames(struct rename_info *renames)
@@ -3444,14 +3487,14 @@ static void merge_start(struct merge_options *opt, struct merge_result *result)
/* Initialization of various renames fields */
renames = &opt->priv->renames;
for (i = MERGE_SIDE1; i <= MERGE_SIDE2; i++) {
- strset_init_with_options(&renames->dirs_removed[i],
- NULL, 0);
+ strintmap_init_with_options(&renames->dirs_removed[i],
+ NOT_RELEVANT, NULL, 0);
strmap_init_with_options(&renames->dir_rename_count[i],
NULL, 1);
strmap_init_with_options(&renames->dir_renames[i],
NULL, 0);
- strset_init_with_options(&renames->relevant_sources[i],
- NULL, 0);
+ strintmap_init_with_options(&renames->relevant_sources[i],
+ 0, NULL, 0);
}
/*