From 446f46d8c7ea3b91f7afe4c5a46b5cd20cc40196 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 1 Apr 2020 04:17:37 +0000 Subject: dir: fix simple typo in comment Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index b460211e61..b505ba747b 100644 --- a/dir.c +++ b/dir.c @@ -2174,7 +2174,7 @@ static void add_path_to_appropriate_result_list(struct dir_struct *dir, * If 'stop_at_first_file' is specified, 'path_excluded' is returned * to signal that a file was found. This is the least significant value that * indicates that a file was encountered that does not depend on the order of - * whether an untracked or exluded path was encountered first. + * whether an untracked or excluded path was encountered first. * * Returns the most significant path_treatment value encountered in the scan. * If 'stop_at_first_file' is specified, `path_excluded` is the most -- cgit v1.2.3 From cd129eed986588907d1f1bd7d478a8d4f6d1dc1c Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 1 Apr 2020 04:17:38 +0000 Subject: dir: consolidate treat_path() and treat_one_path() Commit 16e2cfa90993 ("read_directory(): further split treat_path()", 2010-01-08) split treat_one_path() out of treat_path(), because treat_leading_path() would not have access to a dirent but wanted to re-use as much of treat_path() as possible. Not re-using all of treat_path() caused other bugs, as noted in commit b9670c1f5e6b ("dir: fix checks on common prefix directory", 2019-12-19). Finally, in commit ad6f2157f951 ("dir: restructure in a way to avoid passing around a struct dirent", 2020-01-16), dirents were removed from treat_path() and other functions entirely. 
Since the only reason for splitting these functions was the lack of a dirent -- which no longer applies to either function -- and since the split caused problems in the past resulting in us not using treat_one_path() separately anymore, just undo the split. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- dir.c | 121 ++++++++++++++++++++++++++++++------------------------------------ 1 file changed, 55 insertions(+), 66 deletions(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index b505ba747b..d0f3d66085 100644 --- a/dir.c +++ b/dir.c @@ -1863,21 +1863,65 @@ static int resolve_dtype(int dtype, struct index_state *istate, return dtype; } -static enum path_treatment treat_one_path(struct dir_struct *dir, - struct untracked_cache_dir *untracked, - struct index_state *istate, - struct strbuf *path, - int baselen, - const struct pathspec *pathspec, - int dtype) -{ - int exclude; - int has_path_in_index = !!index_file_exists(istate, path->buf, path->len, ignore_case); +static enum path_treatment treat_path_fast(struct dir_struct *dir, + struct untracked_cache_dir *untracked, + struct cached_dir *cdir, + struct index_state *istate, + struct strbuf *path, + int baselen, + const struct pathspec *pathspec) +{ + strbuf_setlen(path, baselen); + if (!cdir->ucd) { + strbuf_addstr(path, cdir->file); + return path_untracked; + } + strbuf_addstr(path, cdir->ucd->name); + /* treat_one_path() does this before it calls treat_directory() */ + strbuf_complete(path, '/'); + if (cdir->ucd->check_only) + /* + * check_only is set as a result of treat_directory() getting + * to its bottom. Verify again the same set of directories + * with check_only set. + */ + return read_directory_recursive(dir, istate, path->buf, path->len, + cdir->ucd, 1, 0, pathspec); + /* + * We get path_recurse in the first run when + * directory_exists_in_index() returns index_nonexistent. We + * are sure that new changes in the index does not impact the + * outcome. Return now. 
+ */ + return path_recurse; +} + +static enum path_treatment treat_path(struct dir_struct *dir, + struct untracked_cache_dir *untracked, + struct cached_dir *cdir, + struct index_state *istate, + struct strbuf *path, + int baselen, + const struct pathspec *pathspec) +{ + int has_path_in_index, dtype, exclude; enum path_treatment path_treatment; - dtype = resolve_dtype(dtype, istate, path->buf, path->len); + if (!cdir->d_name) + return treat_path_fast(dir, untracked, cdir, istate, path, + baselen, pathspec); + if (is_dot_or_dotdot(cdir->d_name) || !fspathcmp(cdir->d_name, ".git")) + return path_none; + strbuf_setlen(path, baselen); + strbuf_addstr(path, cdir->d_name); + if (simplify_away(path->buf, path->len, pathspec)) + return path_none; + + dtype = resolve_dtype(cdir->d_type, istate, path->buf, path->len); /* Always exclude indexed files */ + has_path_in_index = !!index_file_exists(istate, path->buf, path->len, + ignore_case); if (dtype != DT_DIR && has_path_in_index) return path_none; @@ -1942,61 +1986,6 @@ static enum path_treatment treat_one_path(struct dir_struct *dir, } } -static enum path_treatment treat_path_fast(struct dir_struct *dir, - struct untracked_cache_dir *untracked, - struct cached_dir *cdir, - struct index_state *istate, - struct strbuf *path, - int baselen, - const struct pathspec *pathspec) -{ - strbuf_setlen(path, baselen); - if (!cdir->ucd) { - strbuf_addstr(path, cdir->file); - return path_untracked; - } - strbuf_addstr(path, cdir->ucd->name); - /* treat_one_path() does this before it calls treat_directory() */ - strbuf_complete(path, '/'); - if (cdir->ucd->check_only) - /* - * check_only is set as a result of treat_directory() getting - * to its bottom. Verify again the same set of directories - * with check_only set. - */ - return read_directory_recursive(dir, istate, path->buf, path->len, - cdir->ucd, 1, 0, pathspec); - /* - * We get path_recurse in the first run when - * directory_exists_in_index() returns index_nonexistent. 
We - * are sure that new changes in the index does not impact the - * outcome. Return now. - */ - return path_recurse; -} - -static enum path_treatment treat_path(struct dir_struct *dir, - struct untracked_cache_dir *untracked, - struct cached_dir *cdir, - struct index_state *istate, - struct strbuf *path, - int baselen, - const struct pathspec *pathspec) -{ - if (!cdir->d_name) - return treat_path_fast(dir, untracked, cdir, istate, path, - baselen, pathspec); - if (is_dot_or_dotdot(cdir->d_name) || !fspathcmp(cdir->d_name, ".git")) - return path_none; - strbuf_setlen(path, baselen); - strbuf_addstr(path, cdir->d_name); - if (simplify_away(path->buf, path->len, pathspec)) - return path_none; - - return treat_one_path(dir, untracked, istate, path, baselen, pathspec, - cdir->d_type); -} - static void add_untracked(struct untracked_cache_dir *dir, const char *name) { if (!dir) -- cgit v1.2.3 From 0126d1415a63ed24764f5f87a10929bc6222bddd Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 1 Apr 2020 04:17:39 +0000 Subject: dir: fix broken comment Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index d0f3d66085..3a36768366 100644 --- a/dir.c +++ b/dir.c @@ -2259,7 +2259,7 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir, add_untracked(untracked, path.buf + baselen); break; } - /* skip the dir_add_* part */ + /* skip the add_path_to_appropriate_result_list() */ continue; } -- cgit v1.2.3 From 2df179d3dfeb431cc3030ac44e79b136debf1fd9 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 1 Apr 2020 04:17:40 +0000 Subject: dir: fix confusion based on variable tense Despite having contributed several fixes in this area, I have for months (years?) 
assumed that the "exclude" variable was a directive; this caused me to think of it as a different mode we operate in and left me confused as I tried to build up a mental model around why we'd need such a directive. I mostly tried to ignore it while focusing on the pieces I was trying to understand. Then I finally traced this variable all back to a call to is_excluded(), meaning it was actually functioning as an adjective. In particular, it was a checked property ("Does this path match a rule in .gitignore?"), rather than a mode passed in from the caller. Change the variable name to match the part of speech used by the function called to define it, which will hopefully make these bits of code slightly clearer to the next reader. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- dir.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index 3a36768366..8074e651e6 100644 --- a/dir.c +++ b/dir.c @@ -1656,7 +1656,7 @@ static enum exist_status directory_exists_in_index(struct index_state *istate, static enum path_treatment treat_directory(struct dir_struct *dir, struct index_state *istate, struct untracked_cache_dir *untracked, - const char *dirname, int len, int baselen, int exclude, + const char *dirname, int len, int baselen, int excluded, const struct pathspec *pathspec) { int nested_repo = 0; @@ -1679,13 +1679,13 @@ static enum path_treatment treat_directory(struct dir_struct *dir, } if (nested_repo) return ((dir->flags & DIR_SKIP_NESTED_GIT) ? path_none : - (exclude ? path_excluded : path_untracked)); + (excluded ? 
path_excluded : path_untracked)); if (dir->flags & DIR_SHOW_OTHER_DIRECTORIES) break; - if (exclude && - (dir->flags & DIR_SHOW_IGNORED_TOO) && - (dir->flags & DIR_SHOW_IGNORED_TOO_MODE_MATCHING)) { + if (excluded && + (dir->flags & DIR_SHOW_IGNORED_TOO) && + (dir->flags & DIR_SHOW_IGNORED_TOO_MODE_MATCHING)) { /* * This is an excluded directory and we are @@ -1713,7 +1713,7 @@ static enum path_treatment treat_directory(struct dir_struct *dir, /* This is the "show_other_directories" case */ if (!(dir->flags & DIR_HIDE_EMPTY_DIRECTORIES)) - return exclude ? path_excluded : path_untracked; + return excluded ? path_excluded : path_untracked; untracked = lookup_untracked(dir->untracked, untracked, dirname + baselen, len - baselen); @@ -1723,7 +1723,7 @@ static enum path_treatment treat_directory(struct dir_struct *dir, * the directory contains any files. */ return read_directory_recursive(dir, istate, dirname, len, - untracked, 1, exclude, pathspec); + untracked, 1, excluded, pathspec); } /* @@ -1904,7 +1904,7 @@ static enum path_treatment treat_path(struct dir_struct *dir, int baselen, const struct pathspec *pathspec) { - int has_path_in_index, dtype, exclude; + int has_path_in_index, dtype, excluded; enum path_treatment path_treatment; if (!cdir->d_name) @@ -1949,13 +1949,13 @@ static enum path_treatment treat_path(struct dir_struct *dir, (directory_exists_in_index(istate, path->buf, path->len) == index_nonexistent)) return path_none; - exclude = is_excluded(dir, istate, path->buf, &dtype); + excluded = is_excluded(dir, istate, path->buf, &dtype); /* * Excluded? 
If we don't explicitly want to show * ignored files, ignore it */ - if (exclude && !(dir->flags & (DIR_SHOW_IGNORED|DIR_SHOW_IGNORED_TOO))) + if (excluded && !(dir->flags & (DIR_SHOW_IGNORED|DIR_SHOW_IGNORED_TOO))) return path_excluded; switch (dtype) { @@ -1965,7 +1965,7 @@ static enum path_treatment treat_path(struct dir_struct *dir, strbuf_addch(path, '/'); path_treatment = treat_directory(dir, istate, untracked, path->buf, path->len, - baselen, exclude, pathspec); + baselen, excluded, pathspec); /* * If 1) we only want to return directories that * match an exclude pattern and 2) this directory does @@ -1974,7 +1974,7 @@ static enum path_treatment treat_path(struct dir_struct *dir, * recurse into this directory (instead of marking the * directory itself as an ignored path). */ - if (!exclude && + if (!excluded && path_treatment == path_excluded && (dir->flags & DIR_SHOW_IGNORED_TOO) && (dir->flags & DIR_SHOW_IGNORED_TOO_MODE_MATCHING)) @@ -1982,7 +1982,7 @@ static enum path_treatment treat_path(struct dir_struct *dir, return path_treatment; case DT_REG: case DT_LNK: - return exclude ? path_excluded : path_untracked; + return excluded ? path_excluded : path_untracked; } } -- cgit v1.2.3 From 0bbd0e8b5233b7cf66b846d4c1825a530d8402bd Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Wed, 1 Apr 2020 04:17:41 +0000 Subject: dir: refactor treat_directory to clarify control flow The logic in treat_directory() is handled by a multi-case switch statement, but this switch is very asymmetrical, as the first two cases are simple but the third is more complicated than the rest of the method. In fact, the third case includes a "break" statement that leads to the block of code outside the switch statement. That is the only way to reach that block, as the switch handles all possible values from directory_exists_in_index(). Extract the switch statement into a series of "if" statements. 
This simplifies the trivial cases, while clarifying how to reach the "show_other_directories" case. This is particularly important as the "show_other_directories" case will expand in a later change. Helped-by: Elijah Newren Signed-off-by: Derrick Stolee Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- dir.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index 8074e651e6..d9bcb7e19b 100644 --- a/dir.c +++ b/dir.c @@ -1660,29 +1660,28 @@ static enum path_treatment treat_directory(struct dir_struct *dir, const struct pathspec *pathspec) { int nested_repo = 0; - /* The "len-1" is to strip the final '/' */ - switch (directory_exists_in_index(istate, dirname, len-1)) { - case index_directory: - return path_recurse; + enum exist_status status = directory_exists_in_index(istate, dirname, len-1); - case index_gitdir: + if (status == index_directory) + return path_recurse; + if (status == index_gitdir) return path_none; + if (status != index_nonexistent) + BUG("Unhandled value for directory_exists_in_index: %d\n", status); - case index_nonexistent: - if ((dir->flags & DIR_SKIP_NESTED_GIT) || - !(dir->flags & DIR_NO_GITLINKS)) { - struct strbuf sb = STRBUF_INIT; - strbuf_addstr(&sb, dirname); - nested_repo = is_nonbare_repository_dir(&sb); - strbuf_release(&sb); - } - if (nested_repo) - return ((dir->flags & DIR_SKIP_NESTED_GIT) ? path_none : - (excluded ? path_excluded : path_untracked)); + if ((dir->flags & DIR_SKIP_NESTED_GIT) || + !(dir->flags & DIR_NO_GITLINKS)) { + struct strbuf sb = STRBUF_INIT; + strbuf_addstr(&sb, dirname); + nested_repo = is_nonbare_repository_dir(&sb); + strbuf_release(&sb); + } + if (nested_repo) + return ((dir->flags & DIR_SKIP_NESTED_GIT) ? path_none : + (excluded ? 
path_excluded : path_untracked)); - if (dir->flags & DIR_SHOW_OTHER_DIRECTORIES) - break; + if (!(dir->flags & DIR_SHOW_OTHER_DIRECTORIES)) { if (excluded && (dir->flags & DIR_SHOW_IGNORED_TOO) && (dir->flags & DIR_SHOW_IGNORED_TOO_MODE_MATCHING)) { -- cgit v1.2.3 From 8d92fb292706fd8d13cfe55353b2ec9345153a3e Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 1 Apr 2020 04:17:42 +0000 Subject: dir: replace exponential algorithm with a linear one dir's read_directory_recursive() naturally operates recursively in order to walk the directory tree. Treatment of directories is sometimes weird because there are so many different permutations about how to handle directories. Some examples: * 'git ls-files -o --directory' only needs to know that a directory itself is untracked; it doesn't need to recurse into it to see what is underneath. * 'git status' needs to recurse into an untracked directory, but only to determine whether or not it is empty. If there are no files underneath, the directory itself will be omitted from the output. If it is not empty, only the directory will be listed. * 'git status --ignored' needs to recurse into untracked directories and report all the ignored entries and then report the directory as untracked -- UNLESS all the entries under the directory are ignored, in which case we don't print any of the entries under the directory and just report the directory itself as ignored. (Note that although this forces us to walk all untracked files underneath the directory as well, we strip them from the output, except for users like 'git clean' who also set DIR_KEEP_UNTRACKED_CONTENTS.) * For 'git clean', we may need to recurse into a directory that doesn't match any specified pathspecs, if it's possible that there is an entry underneath the directory that can match one of the pathspecs. 
In such a case, we need to be careful to omit the directory itself from the list of paths (see commit 404ebceda01c ("dir: also check directories for matching pathspecs", 2019-09-17)) Part of the tension noted above is that the treatment of a directory can change based on the files within it, and based on the various settings in dir->flags. Trying to keep this in mind while reading over the code, it is easy to think in terms of "treat_directory() tells us what to do with a directory, and read_directory_recursive() is the thing that recurses". Since we need to look into a directory to know how to treat it, though, it is quite easy to decide to (also) recurse into the directory from treat_directory() by adding a read_directory_recursive() call. Adding such a call is actually fine, IF we make sure that read_directory_recursive() does not also recurse into that same directory. Unfortunately, commit df5bcdf83aeb ("dir: recurse into untracked dirs for ignored files", 2017-05-18), added exactly such a case to the code, meaning we'd have two calls to read_directory_recursive() for an untracked directory. So, if we had a file named one/two/three/four/five/somefile.txt and nothing in one/ was tracked, then 'git status --ignored' would call read_directory_recursive() twice on the directory 'one/', and each of those would call read_directory_recursive() twice on the directory 'one/two/', and so on until read_directory_recursive() was called 2^5 times for 'one/two/three/four/five/'. Avoid calling read_directory_recursive() twice per level by moving a lot of the special logic into treat_directory(). Since dir.c is somewhat complex, extra cruft built up around this over time. While trying to unravel it, I noticed several instances where the first call to read_directory_recursive() would return e.g. path_untracked for some directory and a later one would return e.g. path_none, despite the fact that the directory clearly should have been considered untracked. 
The code happened to work due to the side-effect from the first invocation of adding untracked entries to dir->entries; this allowed it to get the correct output despite the supposed override in return value by the later call. I am somewhat concerned that there are still bugs and maybe even testcases with the wrong expectation. I have tried to carefully document treat_directory() since it becomes more complex after this change (though much of this complexity came from elsewhere that probably deserved better comments to begin with). However, much of my work felt more like a game of whackamole while attempting to make the code match the existing regression tests than an attempt to create an implementation that matched some clear design. That seems wrong to me, but the rules of existing behavior had so many special cases that I had a hard time coming up with some overarching rules about what correct behavior is for all cases, forcing me to hope that the regression tests are correct and sufficient. 
Such a hope seems likely to be ill-founded, given my experience with dir.c-related testcases in the last few months: Examples where the documentation was hard to parse or even just wrong: * 3aca58045f4f (git-clean.txt: do not claim we will delete files with -n/--dry-run, 2019-09-17) * 09487f2cbad3 (clean: avoid removing untracked files in a nested git repository, 2019-09-17) * e86bbcf987fa (clean: disambiguate the definition of -d, 2019-09-17) Examples where testcases were declared wrong and changed: * 09487f2cbad3 (clean: avoid removing untracked files in a nested git repository, 2019-09-17) * e86bbcf987fa (clean: disambiguate the definition of -d, 2019-09-17) * a2b13367fe55 (Revert "dir.c: make 'git-status --ignored' work within leading directories", 2019-12-10) Examples where testcases were clearly inadequate: * 502c386ff944 (t7300-clean: demonstrate deleting nested repo with an ignored file breakage, 2019-08-25) * 7541cc530239 (t7300: add testcases showing failure to clean specified pathspecs, 2019-09-17) * a5e916c7453b (dir: fix off-by-one error in match_pathspec_item, 2019-09-17) * 404ebceda01c (dir: also check directories for matching pathspecs, 2019-09-17) * 09487f2cbad3 (clean: avoid removing untracked files in a nested git repository, 2019-09-17) * e86bbcf987fa (clean: disambiguate the definition of -d, 2019-09-17) * 452efd11fbf6 (t3011: demonstrate directory traversal failures, 2019-12-10) * b9670c1f5e6b (dir: fix checks on common prefix directory, 2019-12-19) Examples where "correct behavior" was unclear to everyone: https://lore.kernel.org/git/20190905154735.29784-1-newren@gmail.com/ Other commits of note: * 902b90cf42bc (clean: fix theoretical path corruption, 2019-09-17) However, on the positive side, it does make the code much faster. 
For the following simple shell loop in an empty repository: for depth in $(seq 10 25) do dirs=$(for i in $(seq 1 $depth) ; do printf 'dir/' ; done) rm -rf dir mkdir -p $dirs >$dirs/untracked-file /usr/bin/time --format="$depth: %e" git status --ignored >/dev/null done I saw the following timings, in seconds (note that the numbers are a little noisy from run-to-run, but the trend is very clear with every run): 10: 0.03 11: 0.05 12: 0.08 13: 0.19 14: 0.29 15: 0.50 16: 1.05 17: 2.11 18: 4.11 19: 8.60 20: 17.55 21: 33.87 22: 68.71 23: 140.05 24: 274.45 25: 551.15 For the above run, using strace I can look for the number of untracked directories opened and can verify that it matches the expected 2^($depth+1)-2 (the sum of 2^1 + 2^2 + 2^3 + ... + 2^$depth). After this fix, with strace I can verify that the number of untracked directories that are opened drops to just $depth, and the timings all drop to 0.00. In fact, it isn't until a depth of 190 nested directories that it sometimes starts reporting a time of 0.01 seconds and doesn't consistently report 0.01 seconds until there are 240 nested directories. The previous code would have taken 17.55 * 2^220 / (60*60*24*365) = 9.4 * 10^59 YEARS to have completed the 240 nested directories case. It's not often that you get to speed something up by a factor of 3*10^69. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- dir.c | 210 ++++++++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 147 insertions(+), 63 deletions(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index d9bcb7e19b..1b3c095b5a 100644 --- a/dir.c +++ b/dir.c @@ -1659,7 +1659,13 @@ static enum path_treatment treat_directory(struct dir_struct *dir, const char *dirname, int len, int baselen, int excluded, const struct pathspec *pathspec) { - int nested_repo = 0; + /* + * WARNING: From this function, you can return path_recurse or you + * can call read_directory_recursive() (or neither), but + * you CAN'T DO BOTH. 
+ */ + enum path_treatment state; + int nested_repo = 0, old_ignored_nr, check_only, stop_early; /* The "len-1" is to strip the final '/' */ enum exist_status status = directory_exists_in_index(istate, dirname, len-1); @@ -1711,18 +1717,135 @@ static enum path_treatment treat_directory(struct dir_struct *dir, /* This is the "show_other_directories" case */ - if (!(dir->flags & DIR_HIDE_EMPTY_DIRECTORIES)) + /* + * If we have a pathspec which could match something _below_ this + * directory (e.g. when checking 'subdir/' having a pathspec like + * 'subdir/some/deep/path/file' or 'subdir/widget-*.c'), then we + * need to recurse. + */ + if (pathspec) { + int ret = do_match_pathspec(istate, pathspec, dirname, len, + 0 /* prefix */, NULL /* seen */, + DO_MATCH_LEADING_PATHSPEC); + if (ret == MATCHED_RECURSIVELY_LEADING_PATHSPEC) + return path_recurse; + } + + /* + * Other than the path_recurse case immediately above, we only need + * to recurse into untracked/ignored directories if either of the + * following bits is set: + * - DIR_SHOW_IGNORED_TOO (because then we need to determine if + * there are ignored directories below) + * - DIR_HIDE_EMPTY_DIRECTORIES (because we have to determine if + * the directory is empty) + */ + if (!(dir->flags & (DIR_SHOW_IGNORED_TOO | DIR_HIDE_EMPTY_DIRECTORIES))) return excluded ? path_excluded : path_untracked; + /* + * ...and even if DIR_SHOW_IGNORED_TOO is set, we can still avoid + * recursing into ignored directories if the path is excluded and + * DIR_SHOW_IGNORED_TOO_MODE_MATCHING is also set. + */ + if (excluded && + (dir->flags & DIR_SHOW_IGNORED_TOO) && + (dir->flags & DIR_SHOW_IGNORED_TOO_MODE_MATCHING)) + return path_excluded; + + /* + * If we have we don't want to know the all the paths under an + * untracked or ignored directory, we still need to go into the + * directory to determine if it is empty (because an empty directory + * should be path_none instead of path_excluded or path_untracked). 
+ */ + check_only = ((dir->flags & DIR_HIDE_EMPTY_DIRECTORIES) && + !(dir->flags & DIR_SHOW_IGNORED_TOO)); + + /* + * However, there's another optimization possible as a subset of + * check_only, based on the cases we have to consider: + * A) Directory matches no exclude patterns: + * * Directory is empty => path_none + * * Directory has an untracked file under it => path_untracked + * * Directory has only ignored files under it => path_excluded + * B) Directory matches an exclude pattern: + * * Directory is empty => path_none + * * Directory has an untracked file under it => path_excluded + * * Directory has only ignored files under it => path_excluded + * In case A, we can exit as soon as we've found an untracked + * file but otherwise have to walk all files. In case B, though, + * we can stop at the first file we find under the directory. + */ + stop_early = check_only && excluded; + + /* + * If /every/ file within an untracked directory is ignored, then + * we want to treat the directory as ignored (for e.g. status + * --porcelain), without listing the individual ignored files + * underneath. To do so, we'll save the current ignored_nr, and + * pop all the ones added after it if it turns out the entire + * directory is ignored. + */ + old_ignored_nr = dir->ignored_nr; + + /* Actually recurse into dirname now, we'll fixup the state later. */ untracked = lookup_untracked(dir->untracked, untracked, dirname + baselen, len - baselen); + state = read_directory_recursive(dir, istate, dirname, len, untracked, + check_only, stop_early, pathspec); + + /* There are a variety of reasons we may need to fixup the state... */ + if (state == path_excluded) { + /* state == path_excluded implies all paths under + * dirname were ignored... + * + * if running e.g. `git status --porcelain --ignored=matching`, + * then we want to see the subpaths that are ignored. + * + * if running e.g. 
just `git status --porcelain`, then + * we just want the directory itself to be listed as ignored + * and not the individual paths underneath. + */ + int want_ignored_subpaths = + ((dir->flags & DIR_SHOW_IGNORED_TOO) && + (dir->flags & DIR_SHOW_IGNORED_TOO_MODE_MATCHING)); + + if (want_ignored_subpaths) { + /* + * with --ignored=matching, we want the subpaths + * INSTEAD of the directory itself. + */ + state = path_none; + } else { + int i; + for (i = old_ignored_nr + 1; i<dir->ignored_nr; ++i) + FREE_AND_NULL(dir->ignored[i]); + dir->ignored_nr = old_ignored_nr; + } + } /* - * If this is an excluded directory, then we only need to check if - * the directory contains any files. + * If there is nothing under the current directory and we are not + * hiding empty directories, then we need to report on the + * untracked or ignored status of the directory itself. */ - return read_directory_recursive(dir, istate, dirname, len, - untracked, 1, excluded, pathspec); + if (state == path_none && !(dir->flags & DIR_HIDE_EMPTY_DIRECTORIES)) + state = excluded ? path_excluded : path_untracked; + + /* + * We can recurse into untracked directories that don't match any + * of the given pathspecs when some file underneath the directory + * might match one of the pathspecs. If so, we should make sure + * to note that the directory itself did not match. + */ + if (pathspec && + !match_pathspec(istate, pathspec, dirname, len, + 0 /* prefix */, NULL, + 0 /* do NOT special case dirs */)) + state = path_none; + + return state; } /* @@ -1870,6 +1993,11 @@ static enum path_treatment treat_path_fast(struct dir_struct *dir, int baselen, const struct pathspec *pathspec) { + /* + * WARNING: From this function, you can return path_recurse or you + * can call read_directory_recursive() (or neither), but + * you CAN'T DO BOTH. 
+ */ strbuf_setlen(path, baselen); if (!cdir->ucd) { strbuf_addstr(path, cdir->file); @@ -1904,7 +2032,6 @@ static enum path_treatment treat_path(struct dir_struct *dir, const struct pathspec *pathspec) { int has_path_in_index, dtype, excluded; - enum path_treatment path_treatment; if (!cdir->d_name) return treat_path_fast(dir, untracked, cdir, istate, path, @@ -1961,24 +2088,16 @@ static enum path_treatment treat_path(struct dir_struct *dir, default: return path_none; case DT_DIR: - strbuf_addch(path, '/'); - path_treatment = treat_directory(dir, istate, untracked, - path->buf, path->len, - baselen, excluded, pathspec); /* - * If 1) we only want to return directories that - * match an exclude pattern and 2) this directory does - * not match an exclude pattern but all of its - * contents are excluded, then indicate that we should - * recurse into this directory (instead of marking the - * directory itself as an ignored path). + * WARNING: Do not ignore/amend the return value from + * treat_directory(), and especially do not change it to return + * path_recurse as that can cause exponential slowdown. + * Instead, modify treat_directory() to return the right value. */ - if (!excluded && - path_treatment == path_excluded && - (dir->flags & DIR_SHOW_IGNORED_TOO) && - (dir->flags & DIR_SHOW_IGNORED_TOO_MODE_MATCHING)) - return path_recurse; - return path_treatment; + strbuf_addch(path, '/'); + return treat_directory(dir, istate, untracked, + path->buf, path->len, + baselen, excluded, pathspec); case DT_REG: case DT_LNK: return excluded ? path_excluded : path_untracked; @@ -2175,14 +2294,10 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir, int stop_at_first_file, const struct pathspec *pathspec) { /* - * WARNING WARNING WARNING: - * - * Any updates to the traversal logic here may need corresponding - * updates in treat_leading_path(). 
See the commit message for the - * commit adding this warning as well as the commit preceding it - * for details. + * WARNING: Do NOT recurse unless path_recurse is returned from + * treat_path(). Recursing on any other return value + * can result in exponential slowdown. */ - struct cached_dir cdir; enum path_treatment state, subdir_state, dir_state = path_none; struct strbuf path = STRBUF_INIT; @@ -2204,13 +2319,7 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir, dir_state = state; /* recurse into subdir if instructed by treat_path */ - if ((state == path_recurse) || - ((state == path_untracked) && - (resolve_dtype(cdir.d_type, istate, path.buf, path.len) == DT_DIR) && - ((dir->flags & DIR_SHOW_IGNORED_TOO) || - (pathspec && - do_match_pathspec(istate, pathspec, path.buf, path.len, - baselen, NULL, DO_MATCH_LEADING_PATHSPEC) == MATCHED_RECURSIVELY_LEADING_PATHSPEC)))) { + if (state == path_recurse) { struct untracked_cache_dir *ud; ud = lookup_untracked(dir->untracked, untracked, path.buf + baselen, @@ -2294,15 +2403,6 @@ static int treat_leading_path(struct dir_struct *dir, const char *path, int len, const struct pathspec *pathspec) { - /* - * WARNING WARNING WARNING: - * - * Any updates to the traversal logic here may need corresponding - * updates in read_directory_recursive(). See 777b420347 (dir: - * synchronize treat_leading_path() and read_directory_recursive(), - * 2019-12-19) and its parent commit for details. 
- */ struct strbuf sb = STRBUF_INIT; struct strbuf subdir = STRBUF_INIT; int prevlen, baselen; @@ -2353,23 +2453,7 @@ static int treat_leading_path(struct dir_struct *dir, strbuf_reset(&subdir); strbuf_add(&subdir, path+prevlen, baselen-prevlen); cdir.d_name = subdir.buf; - state = treat_path(dir, NULL, &cdir, istate, &sb, prevlen, - pathspec); - if (state == path_untracked && - resolve_dtype(cdir.d_type, istate, sb.buf, sb.len) == DT_DIR && - (dir->flags & DIR_SHOW_IGNORED_TOO || - do_match_pathspec(istate, pathspec, sb.buf, sb.len, - baselen, NULL, DO_MATCH_LEADING_PATHSPEC) == MATCHED_RECURSIVELY_LEADING_PATHSPEC)) { - if (!match_pathspec(istate, pathspec, sb.buf, sb.len, - 0 /* prefix */, NULL, - 0 /* do NOT special case dirs */)) - state = path_none; - add_path_to_appropriate_result_list(dir, NULL, &cdir, - istate, - &sb, baselen, - pathspec, state); - state = path_recurse; - } + state = treat_path(dir, NULL, &cdir, istate, &sb, prevlen, pathspec); if (state != path_recurse) break; /* do not recurse into it */ -- cgit v1.2.3 From 1684644489fbd76d01fe3bb53c65df6856fd00c5 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 1 Apr 2020 04:17:43 +0000 Subject: dir: include DIR_KEEP_UNTRACKED_CONTENTS handling in treat_directory() Handling DIR_KEEP_UNTRACKED_CONTENTS within treat_directory() instead of as a post-processing step in read_directory(): * allows us to directly access and remove the relevant entries instead of needing to calculate which ones need to be removed * keeps the logic for directory handling in one location (and puts it closer to the logic for stripping out extra ignored entries, which seems logical).
Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- dir.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index 1b3c095b5a..8be31df58c 100644 --- a/dir.c +++ b/dir.c @@ -1665,7 +1665,8 @@ static enum path_treatment treat_directory(struct dir_struct *dir, * you CAN'T DO BOTH. */ enum path_treatment state; - int nested_repo = 0, old_ignored_nr, check_only, stop_early; + int nested_repo = 0, check_only, stop_early; + int old_ignored_nr, old_untracked_nr; /* The "len-1" is to strip the final '/' */ enum exist_status status = directory_exists_in_index(istate, dirname, len-1); @@ -1785,9 +1786,13 @@ static enum path_treatment treat_directory(struct dir_struct *dir, * --porcelain), without listing the individual ignored files * underneath. To do so, we'll save the current ignored_nr, and * pop all the ones added after it if it turns out the entire - * directory is ignored. + * directory is ignored. Also, when DIR_SHOW_IGNORED_TOO and + * !DIR_KEEP_UNTRACKED_CONTENTS then we don't want to show + * untracked paths so will need to pop all those off the last + * after we traverse. */ old_ignored_nr = dir->ignored_nr; + old_untracked_nr = dir->nr; /* Actually recurse into dirname now, we'll fixup the state later. */ untracked = lookup_untracked(dir->untracked, untracked, @@ -1825,6 +1830,18 @@ static enum path_treatment treat_directory(struct dir_struct *dir, } } + /* + * We may need to ignore some of the untracked paths we found while + * traversing subdirectories. 
+ */ + if ((dir->flags & DIR_SHOW_IGNORED_TOO) && + !(dir->flags & DIR_KEEP_UNTRACKED_CONTENTS)) { + int i; + for (i = old_untracked_nr + 1; i<dir->nr; ++i) + FREE_AND_NULL(dir->entries[i]); + dir->nr = old_untracked_nr; + } + /* * If there is nothing under the current directory and we are not * hiding empty directories, then we need to report on the @@ -2653,28 +2670,6 @@ int read_directory(struct dir_struct *dir, struct index_state *istate, QSORT(dir->entries, dir->nr, cmp_dir_entry); QSORT(dir->ignored, dir->ignored_nr, cmp_dir_entry); - /* - * If DIR_SHOW_IGNORED_TOO is set, read_directory_recursive() will - * also pick up untracked contents of untracked dirs; by default - * we discard these, but given DIR_KEEP_UNTRACKED_CONTENTS we do not. - */ - if ((dir->flags & DIR_SHOW_IGNORED_TOO) && - !(dir->flags & DIR_KEEP_UNTRACKED_CONTENTS)) { - int i, j; - - /* remove from dir->entries untracked contents of untracked dirs */ - for (i = j = 0; j < dir->nr; j++) { - if (i && - check_dir_entry_contains(dir->entries[i - 1], dir->entries[j])) { - FREE_AND_NULL(dir->entries[j]); - } else { - dir->entries[i++] = dir->entries[j]; - } - } - - dir->nr = i; - } - trace_performance_leave("read directory %.*s", len, path); if (dir->untracked) { static int force_untracked_cache = -1; -- cgit v1.2.3 From 7f45ab2dca04c5b76958843120c6bd6d3a033043 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 1 Apr 2020 04:17:44 +0000 Subject: dir: replace double pathspec matching with single in treat_directory() treat_directory() had a call to both do_match_pathspec() and match_pathspec(). These calls have migrated through the code somewhat since their introduction, but we don't actually need both. Replace the two calls with one, and while at it, move the check earlier in order to reduce the need for callers of fill_directory() to do post-filtering of results. The next patch will address post-filtering more forcefully and provide more relevant history and context.
Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- dir.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index 8be31df58c..a67930dcff 100644 --- a/dir.c +++ b/dir.c @@ -1665,6 +1665,7 @@ static enum path_treatment treat_directory(struct dir_struct *dir, * you CAN'T DO BOTH. */ enum path_treatment state; + int matches_how = 0; int nested_repo = 0, check_only, stop_early; int old_ignored_nr, old_untracked_nr; /* The "len-1" is to strip the final '/' */ @@ -1677,6 +1678,22 @@ static enum path_treatment treat_directory(struct dir_struct *dir, if (status != index_nonexistent) BUG("Unhandled value for directory_exists_in_index: %d\n", status); + /* + * We don't want to descend into paths that don't match the necessary + * patterns. Clearly, if we don't have a pathspec, then we can't check + * for matching patterns. Also, if (excluded) then we know we matched + * the exclusion patterns so as an optimization we can skip checking + * for matching patterns. + */ + if (pathspec && !excluded) { + matches_how = do_match_pathspec(istate, pathspec, dirname, len, + 0 /* prefix */, NULL /* seen */, + DO_MATCH_LEADING_PATHSPEC); + if (!matches_how) + return path_none; + } + + if ((dir->flags & DIR_SKIP_NESTED_GIT) || !(dir->flags & DIR_NO_GITLINKS)) { struct strbuf sb = STRBUF_INIT; @@ -1724,13 +1741,8 @@ static enum path_treatment treat_directory(struct dir_struct *dir, * 'subdir/some/deep/path/file' or 'subdir/widget-*.c'), then we * need to recurse. 
*/ - if (pathspec) { - int ret = do_match_pathspec(istate, pathspec, dirname, len, - 0 /* prefix */, NULL /* seen */, - DO_MATCH_LEADING_PATHSPEC); - if (ret == MATCHED_RECURSIVELY_LEADING_PATHSPEC) - return path_recurse; - } + if (matches_how == MATCHED_RECURSIVELY_LEADING_PATHSPEC) + return path_recurse; /* * Other than the path_recurse case immediately above, we only need @@ -1850,18 +1862,6 @@ static enum path_treatment treat_directory(struct dir_struct *dir, if (state == path_none && !(dir->flags & DIR_HIDE_EMPTY_DIRECTORIES)) state = excluded ? path_excluded : path_untracked; - /* - * We can recurse into untracked directories that don't match any - * of the given pathspecs when some file underneath the directory - * might match one of the pathspecs. If so, we should make sure - * to note that the directory itself did not match. - */ - if (pathspec && - !match_pathspec(istate, pathspec, dirname, len, - 0 /* prefix */, NULL, - 0 /* do NOT special case dirs */)) - state = path_none; - return state; } -- cgit v1.2.3 From 95c11ecc73f286e0a95d9591ae98f1221efe4633 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Wed, 1 Apr 2020 04:17:45 +0000 Subject: Fix error-prone fill_directory() API; make it only return matches Traditionally, the expected calling convention for the dir.c API was: fill_directory(&dir, ..., pathspec) foreach entry in dir->entries: if (dir_path_match(entry, pathspec)) process_or_display(entry) This may have made sense once upon a time, because the fill_directory() call could use cheap checks to avoid doing full pathspec matching, and an external caller may have wanted to do other post-processing of the results anyway. However: * this structure makes it easy for users of the API to get it wrong * this structure actually makes it harder to understand fill_directory() and the functions it uses internally. It has tripped me up several times while trying to fix bugs and restructure things. 
* relying on post-filtering was already found to produce wrong results; pathspec matching had to be added internally for multiple cases in order to get the right results (see commits 404ebceda01c (dir: also check directories for matching pathspecs, 2019-09-17) and 89a1f4aaf765 (dir: if our pathspec might match files under a dir, recurse into it, 2019-09-17)) * it's bad for performance: fill_directory() already has to do lots of checks and knows the subset of cases where it still needs to do more checks. Forcing external callers to do full pathspec matching means they must re-check _every_ path. So, add the pathspec matching within the fill_directory() internals, and remove it from external callers. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- dir.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'dir.c') diff --git a/dir.c b/dir.c index a67930dcff..2de6491040 100644 --- a/dir.c +++ b/dir.c @@ -2117,7 +2117,14 @@ static enum path_treatment treat_path(struct dir_struct *dir, baselen, excluded, pathspec); case DT_REG: case DT_LNK: - return excluded ? path_excluded : path_untracked; + if (excluded) + return path_excluded; + if (pathspec && + !do_match_pathspec(istate, pathspec, path->buf, path->len, + 0 /* prefix */, NULL /* seen */, + 0 /* flags */)) + return path_none; + return path_untracked; } } -- cgit v1.2.3