From 65acfeacaa6e50c92a6ac18dc08356026a99b3f3 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 30 Sep 2016 17:19:35 -0700 Subject: abbrev: add FALLBACK_DEFAULT_ABBREV to prepare for auto sizing We'll be introducing a new way to decide the default abbreviation length by initialising DEFAULT_ABBREV to -1 to signal the first call to the "find unique abbreviation" codepath to compute a reasonable value based on the number of objects we have to avoid collisions. We have long relied on DEFAULT_ABBREV being a positive concrete value that is used as the abbreviation length when no extra configuration or command line option has overridden it. Some codepaths want to use such a positive concrete default value even before making their first request to actually trigger the computation for the auto sized default. Introduce FALLBACK_DEFAULT_ABBREV and use it in the code that attempts to align the report from "git fetch". For now, this macro is also used to initialize the default_abbrev variable, but the auto-sizing code will use -1 and then use the value of FALLBACK_DEFAULT_ABBREV as the starting point of auto-sizing. 
Signed-off-by: Junio C Hamano --- builtin/fetch.c | 3 +++ cache.h | 3 +++ environment.c | 2 +- transport.h | 3 +-- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/builtin/fetch.c b/builtin/fetch.c index 164623bb6f..a9f12cc5cf 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -17,6 +17,9 @@ #include "argv-array.h" #include "utf8.h" +#define TRANSPORT_SUMMARY(x) \ + (int)(TRANSPORT_SUMMARY_WIDTH + strlen(x) - gettext_width(x)), (x) + static const char * const builtin_fetch_usage[] = { N_("git fetch [] [ [...]]"), N_("git fetch [] "), diff --git a/cache.h b/cache.h index f346c01708..5a651b8435 100644 --- a/cache.h +++ b/cache.h @@ -1183,6 +1183,9 @@ static inline int hex2chr(const char *s) #define MINIMUM_ABBREV minimum_abbrev #define DEFAULT_ABBREV default_abbrev +/* used when the code does not know or care what the default abbrev is */ +#define FALLBACK_DEFAULT_ABBREV 7 + struct object_context { unsigned char tree[20]; char path[PATH_MAX]; diff --git a/environment.c b/environment.c index cd5aa57179..44fb107b8a 100644 --- a/environment.c +++ b/environment.c @@ -16,7 +16,7 @@ int trust_executable_bit = 1; int trust_ctime = 1; int check_stat = 1; int has_symlinks = 1; -int minimum_abbrev = 4, default_abbrev = 7; +int minimum_abbrev = 4, default_abbrev = FALLBACK_DEFAULT_ABBREV; int ignore_case; int assume_unchanged; int prefer_symlink_refs; diff --git a/transport.h b/transport.h index 6fe3485325..e783377e40 100644 --- a/transport.h +++ b/transport.h @@ -142,8 +142,7 @@ struct transport { #define TRANSPORT_PUSH_ATOMIC 8192 #define TRANSPORT_PUSH_OPTIONS 16384 -#define TRANSPORT_SUMMARY_WIDTH (2 * DEFAULT_ABBREV + 3) -#define TRANSPORT_SUMMARY(x) (int)(TRANSPORT_SUMMARY_WIDTH + strlen(x) - gettext_width(x)), (x) +#define TRANSPORT_SUMMARY_WIDTH (2 * FALLBACK_DEFAULT_ABBREV + 3) /* Returns a transport suitable for the url */ struct transport *transport_get(struct remote *, const char *); -- cgit v1.2.3 From 7b5b7721affae7040cac77d647a5ec8628f0f845 Mon 
Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 30 Sep 2016 17:19:36 -0700 Subject: abbrev: prepare for new world order The code that sets a custom abbreviation length, in response to a command line argument, often does something like this: if (skip_prefix(arg, "--abbrev=", &arg)) abbrev = atoi(arg); else if (!strcmp("--abbrev", arg)) abbrev = DEFAULT_ABBREV; /* make the value sane */ if (abbrev < 0 || 40 < abbrev) abbrev = ... some sane value ... However, it is pointless to sanity-check and tweak the value obtained from DEFAULT_ABBREV. We are going to allow it to be initially set to -1 to signal that the default abbreviation length must be auto sized upon the first request to abbreviate, based on the number of objects in the repository, and when that happens, rejecting or tweaking a negative value to a "saner" one will negatively interfere with the auto sizing. The codepaths for "git rev-parse --short" and "git diff --raw --abbrev" do exactly that; allow them to pass possibly negative abbrevs intact, that will come from DEFAULT_ABBREV in the future. 
Signed-off-by: Junio C Hamano --- builtin/rev-parse.c | 5 +++-- diff.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/builtin/rev-parse.c b/builtin/rev-parse.c index 76cf05e2ad..17cbfabdde 100644 --- a/builtin/rev-parse.c +++ b/builtin/rev-parse.c @@ -643,8 +643,9 @@ int cmd_rev_parse(int argc, const char **argv, const char *prefix) filter &= ~(DO_FLAGS|DO_NOREV); verify = 1; abbrev = DEFAULT_ABBREV; - if (arg[7] == '=') - abbrev = strtoul(arg + 8, NULL, 10); + if (!arg[7]) + continue; + abbrev = strtoul(arg + 8, NULL, 10); if (abbrev < MINIMUM_ABBREV) abbrev = MINIMUM_ABBREV; else if (40 <= abbrev) diff --git a/diff.c b/diff.c index c6da383c56..cefc13eb8e 100644 --- a/diff.c +++ b/diff.c @@ -3399,7 +3399,7 @@ void diff_setup_done(struct diff_options *options) */ read_cache(); } - if (options->abbrev <= 0 || 40 < options->abbrev) + if (40 < options->abbrev) options->abbrev = 40; /* full */ /* -- cgit v1.2.3 From e6c587c733b4634030b353f4024794b08bc86892 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 30 Sep 2016 17:19:37 -0700 Subject: abbrev: auto size the default abbreviation In fairly early days we somehow decided to abbreviate object names down to 7-hexdigits, but as projects grow, it is becoming more and more likely to see such short object names made in earlier days and recorded in the log messages no longer unique. Currently the Linux kernel project needs 11 to 12 hexdigits, while Git itself needs 10 hexdigits to uniquely identify the objects they have, while many smaller projects may still be fine with the original 7-hexdigit default. One size does not fit all projects. Introduce a mechanism, where we estimate the number of objects in the repository upon the first request to abbreviate an object name with the default setting and come up with a sane default for the repository. 
Based on the expectation that we would see a collision in a repository with 2^(2N) objects when using object names shortened to the first N hexdigits, use a sufficient number of hexdigits to cover the number of objects in the repository. Each hexdigit (4-bits) we add to the shortened name allows us to have four times (2-bits) as many objects in the repository. Signed-off-by: Linus Torvalds Signed-off-by: Junio C Hamano --- cache.h | 1 + environment.c | 2 +- sha1_name.c | 28 +++++++++++++++++++++++++++- 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/cache.h b/cache.h index 5a651b8435..0e2a0595e5 100644 --- a/cache.h +++ b/cache.h @@ -1204,6 +1204,7 @@ struct object_context { #define GET_SHA1_TREEISH 020 #define GET_SHA1_BLOB 040 #define GET_SHA1_FOLLOW_SYMLINKS 0100 +#define GET_SHA1_AUTOMATIC 0200 #define GET_SHA1_ONLY_TO_DIE 04000 #define GET_SHA1_DISAMBIGUATORS \ diff --git a/environment.c b/environment.c index 44fb107b8a..6f9d290563 100644 --- a/environment.c +++ b/environment.c @@ -16,7 +16,7 @@ int trust_executable_bit = 1; int trust_ctime = 1; int check_stat = 1; int has_symlinks = 1; -int minimum_abbrev = 4, default_abbrev = FALLBACK_DEFAULT_ABBREV; +int minimum_abbrev = 4, default_abbrev = -1; int ignore_case; int assume_unchanged; int prefer_symlink_refs; diff --git a/sha1_name.c b/sha1_name.c index 3b647fd7cf..beb7ab588b 100644 --- a/sha1_name.c +++ b/sha1_name.c @@ -15,6 +15,7 @@ typedef int (*disambiguate_hint_fn)(const unsigned char *, void *); struct disambiguate_state { int len; /* length of prefix in hex chars */ + unsigned int nrobjects; char hex_pfx[GIT_SHA1_HEXSZ + 1]; unsigned char bin_pfx[GIT_SHA1_RAWSZ]; @@ -118,6 +119,14 @@ static void find_short_object_filename(struct disambiguate_state *ds) if (strlen(de->d_name) != 38) continue; + + /* + * We only look at the one subdirectory, and we assume + * each subdirectory is roughly similar, so each + * object we find probably has 255 other objects in + * the other fan-out directories. 
+ */ + ds->nrobjects += 256; if (memcmp(de->d_name, ds->hex_pfx + 2, ds->len - 2)) continue; memcpy(hex + 2, de->d_name, 38); @@ -151,6 +160,7 @@ static void unique_in_pack(struct packed_git *p, open_pack_index(p); num = p->num_objects; + ds->nrobjects += num; last = num; while (first < last) { uint32_t mid = (first + last) / 2; @@ -380,6 +390,9 @@ static int show_ambiguous_object(const unsigned char *sha1, void *data) return 0; } +/* start from our historical default before the automatic abbreviation */ +static int default_automatic_abbrev = FALLBACK_DEFAULT_ABBREV; + static int get_short_sha1(const char *name, int len, unsigned char *sha1, unsigned flags) { @@ -426,6 +439,14 @@ static int get_short_sha1(const char *name, int len, unsigned char *sha1, for_each_abbrev(ds.hex_pfx, show_ambiguous_object, &ds); } + if (len < 16 && !status && (flags & GET_SHA1_AUTOMATIC)) { + unsigned int expect_collision = 1 << (len * 2); + if (ds.nrobjects > expect_collision) { + default_automatic_abbrev = len+1; + return SHORT_NAME_AMBIGUOUS; + } + } + return status; } @@ -458,14 +479,19 @@ int for_each_abbrev(const char *prefix, each_abbrev_fn fn, void *cb_data) int find_unique_abbrev_r(char *hex, const unsigned char *sha1, int len) { int status, exists; + int flags = GET_SHA1_QUIETLY; + if (len < 0) { + flags |= GET_SHA1_AUTOMATIC; + len = default_automatic_abbrev; + } sha1_to_hex_r(hex, sha1); if (len == 40 || !len) return 40; exists = has_sha1_file(sha1); while (len < 40) { unsigned char sha1_ret[20]; - status = get_short_sha1(hex, len, sha1_ret, GET_SHA1_QUIETLY); + status = get_short_sha1(hex, len, sha1_ret, flags); if (exists ? !status : status == SHORT_NAME_NOT_FOUND) { -- cgit v1.2.3