From fe7df03a9a2fa434ebce38b2cd5e6da42f8b2692 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 1 Sep 2021 15:09:41 +0200 Subject: fetch: speed up lookup of want refs via commit-graph MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When updating our local refs based on the refs fetched from the remote, we need to iterate through all requested refs and load their respective commits such that we can determine whether they need to be appended to FETCH_HEAD or not. In cases where we're fetching from a remote with exceedingly many refs, resolving these refs can be quite expensive given that we repeatedly need to unpack object headers for each of the referenced objects. Speed this up by opportunistically trying to resolve object IDs via the commit graph. We only do so for any refs which are not in "refs/tags": more likely than not, these are going to be a commit anyway, and this lets us avoid having to unpack object headers completely in case the object is a commit that is part of the commit-graph. This significantly speeds up mirror-fetches in a real-world repository with 2.3M refs: Benchmark #1: HEAD~: git-fetch Time (mean ± σ): 56.482 s ± 0.384 s [User: 53.340 s, System: 5.365 s] Range (min … max): 56.050 s … 57.045 s 5 runs Benchmark #2: HEAD: git-fetch Time (mean ± σ): 33.727 s ± 0.170 s [User: 30.252 s, System: 5.194 s] Range (min … max): 33.452 s … 33.871 s 5 runs Summary 'HEAD: git-fetch' ran 1.67 ± 0.01 times faster than 'HEAD~: git-fetch' Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/fetch.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'builtin/fetch.c') diff --git a/builtin/fetch.c b/builtin/fetch.c index e064687dbd..91d1301613 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -1074,7 +1074,6 @@ static int store_updated_refs(const char *raw_url, const char *remote_name, int connectivity_checked, struct ref *ref_map) { struct fetch_head fetch_head; - struct commit *commit; int url_len, i, rc = 0; struct strbuf note = STRBUF_INIT, err = STRBUF_INIT; struct ref_transaction *transaction = NULL; @@ -1122,6 +1121,7 @@ static int store_updated_refs(const char *raw_url, const char *remote_name, want_status <= FETCH_HEAD_IGNORE; want_status++) { for (rm = ref_map; rm; rm = rm->next) { + struct commit *commit = NULL; struct ref *ref = NULL; if (rm->status == REF_STATUS_REJECT_SHALLOW) { @@ -1131,11 +1131,23 @@ static int store_updated_refs(const char *raw_url, const char *remote_name, continue; } - commit = lookup_commit_reference_gently(the_repository, - &rm->old_oid, - 1); - if (!commit) - rm->fetch_head_status = FETCH_HEAD_NOT_FOR_MERGE; + /* + * References in "refs/tags/" are often going to point + * to annotated tags, which are not part of the + * commit-graph. We thus only try to look up refs in + * the graph which are not in that namespace to not + * regress performance in repositories with many + * annotated tags. + */ + if (!starts_with(rm->name, "refs/tags/")) + commit = lookup_commit_in_graph(the_repository, &rm->old_oid); + if (!commit) { + commit = lookup_commit_reference_gently(the_repository, + &rm->old_oid, + 1); + if (!commit) + rm->fetch_head_status = FETCH_HEAD_NOT_FOR_MERGE; + } if (rm->fetch_head_status != want_status) continue; -- cgit v1.2.3 From 47c61004c7cfbb8662b13fac813b45e3fd214665 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 1 Sep 2021 15:09:45 +0200 Subject: fetch: avoid unpacking headers in object existence check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When updating local refs after the fetch has transferred all objects, we do an object existence test as a safety guard to avoid updating a ref to an object which we don't have. We do so via `oid_object_info()`: if it returns an error, then we know the object does not exist. One side effect of `oid_object_info()` is that it parses the object's type, and to do so it must unpack the object header. This is completely pointless: we don't care for the type, but only want to assert that the object exists. Refactor the code to use `repo_has_object_file()`, which both makes the code's intent clearer and is also faster because it does not unpack object headers. In a real-world repo with 2.3M refs, this results in a small speedup when doing a mirror-fetch: Benchmark #1: HEAD~: git-fetch Time (mean ± σ): 33.686 s ± 0.176 s [User: 30.119 s, System: 5.262 s] Range (min … max): 33.512 s … 33.944 s 5 runs Benchmark #2: HEAD: git-fetch Time (mean ± σ): 31.247 s ± 0.195 s [User: 28.135 s, System: 5.066 s] Range (min … max): 30.948 s … 31.472 s 5 runs Summary 'HEAD: git-fetch' ran 1.08 ± 0.01 times faster than 'HEAD~: git-fetch' Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/fetch.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'builtin/fetch.c') diff --git a/builtin/fetch.c b/builtin/fetch.c index 91d1301613..01513e6aea 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -846,13 +846,11 @@ static int update_local_ref(struct ref *ref, int summary_width) { struct commit *current = NULL, *updated; - enum object_type type; struct branch *current_branch = branch_get(NULL); const char *pretty_ref = prettify_refname(ref->name); int fast_forward = 0; - type = oid_object_info(the_repository, &ref->new_oid, NULL); - if (type < 0) + if (!repo_has_object_file(the_repository, &ref->new_oid)) die(_("object %s not found"), oid_to_hex(&ref->new_oid)); if (oideq(&ref->old_oid, &ref->new_oid)) { -- cgit v1.2.3 From 9fec7b213045135655354e864d15894175428d5a Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 1 Sep 2021 15:09:50 +0200 Subject: connected: refactor iterator to return next object ID directly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The object ID iterator used by the connectivity checks returns the next object ID via an out-parameter and then uses a return code to indicate whether an item was found. This is a bit roundabout: instead of a separate error code, we can just return the next object ID directly and use `NULL` pointers as indicator that the iterator got no items left. Furthermore, this avoids a copy of the object ID. Refactor the iterator and all its implementations to return object IDs directly. This brings a tiny performance improvement when doing a mirror-fetch of a repository with about 2.3M refs: Benchmark #1: 328dc58b49919c43897240f2eabfa30be2ce32a4~: git-fetch Time (mean ± σ): 30.110 s ± 0.148 s [User: 27.161 s, System: 5.075 s] Range (min … max): 29.934 s … 30.406 s 10 runs Benchmark #2: 328dc58b49919c43897240f2eabfa30be2ce32a4: git-fetch Time (mean ± σ): 29.899 s ± 0.109 s [User: 26.916 s, System: 5.104 s] Range (min … max): 29.696 s … 29.996 s 10 runs Summary '328dc58b49919c43897240f2eabfa30be2ce32a4: git-fetch' ran 1.01 ± 0.01 times faster than '328dc58b49919c43897240f2eabfa30be2ce32a4~: git-fetch' While this 1% speedup could be labelled as statistically insignificant, the speedup is consistent on my machine. Furthermore, this is an end to end test, so it is expected that the improvement in the connectivity check itself is more significant. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/fetch.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'builtin/fetch.c') diff --git a/builtin/fetch.c b/builtin/fetch.c index 01513e6aea..cdf0d0d671 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -962,7 +962,7 @@ static int update_local_ref(struct ref *ref, } } -static int iterate_ref_map(void *cb_data, struct object_id *oid) +static const struct object_id *iterate_ref_map(void *cb_data) { struct ref **rm = cb_data; struct ref *ref = *rm; @@ -970,10 +970,9 @@ static int iterate_ref_map(void *cb_data, struct object_id *oid) while (ref && ref->status == REF_STATUS_REJECT_SHALLOW) ref = ref->next; if (!ref) - return -1; /* end of the list */ + return NULL; *rm = ref->next; - oidcpy(oid, &ref->old_oid); - return 0; + return &ref->old_oid; } struct fetch_head { -- cgit v1.2.3 From 284b2ce8fcb100e7194b9cca6d9b99bca7da39b6 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 1 Sep 2021 15:09:58 +0200 Subject: fetch: refactor fetch refs to be more extendable Refactor `fetch_refs()` code to make it more extendable by explicitly handling error cases. The refactored code should behave the same. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/fetch.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'builtin/fetch.c') diff --git a/builtin/fetch.c b/builtin/fetch.c index cdf0d0d671..ef6f9b3a33 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -1293,18 +1293,28 @@ static int check_exist_and_connected(struct ref *ref_map) static int fetch_refs(struct transport *transport, struct ref *ref_map) { - int ret = check_exist_and_connected(ref_map); + int ret; + + /* + * We don't need to perform a fetch in case we can already satisfy all + * refs. + */ + ret = check_exist_and_connected(ref_map); if (ret) { trace2_region_enter("fetch", "fetch_refs", the_repository); ret = transport_fetch_refs(transport, ref_map); trace2_region_leave("fetch", "fetch_refs", the_repository); + if (ret) + goto out; } - if (!ret) - /* - * Keep the new pack's ".keep" file around to allow the caller - * time to update refs to reference the new objects. - */ - return 0; + + /* + * Keep the new pack's ".keep" file around to allow the caller + * time to update refs to reference the new objects. + */ + return ret; + +out: transport_unlock_pack(transport); return ret; } -- cgit v1.2.3 From 1c7d1ab6f4a79e44406f304ec01b0a143dae9abb Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 1 Sep 2021 15:10:02 +0200 Subject: fetch: merge fetching and consuming refs The functions `fetch_refs()` and `consume_refs()` must always be called together such that we first obtain all missing objects and then update our local refs to match the remote refs. In a subsequent patch, we'll further require that `fetch_refs()` must always be called before `consume_refs()` such that it can correctly assert that we have all objects after the fetch given that we're about to move the connectivity check. Make this requirement explicit by merging both functions into a single `fetch_and_consume_refs()` function. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/fetch.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) (limited to 'builtin/fetch.c') diff --git a/builtin/fetch.c b/builtin/fetch.c index ef6f9b3a33..a1e17edd8b 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -1291,8 +1291,9 @@ static int check_exist_and_connected(struct ref *ref_map) return check_connected(iterate_ref_map, &rm, &opt); } -static int fetch_refs(struct transport *transport, struct ref *ref_map) +static int fetch_and_consume_refs(struct transport *transport, struct ref *ref_map) { + int connectivity_checked; int ret; /* @@ -1308,30 +1309,18 @@ static int fetch_refs(struct transport *transport, struct ref *ref_map) goto out; } - /* - * Keep the new pack's ".keep" file around to allow the caller - * time to update refs to reference the new objects. - */ - return ret; - -out: - transport_unlock_pack(transport); - return ret; -} - -/* Update local refs based on the ref values fetched from a remote */ -static int consume_refs(struct transport *transport, struct ref *ref_map) -{ - int connectivity_checked = transport->smart_options + connectivity_checked = transport->smart_options ? transport->smart_options->connectivity_checked : 0; - int ret; + trace2_region_enter("fetch", "consume_refs", the_repository); ret = store_updated_refs(transport->url, transport->remote->name, connectivity_checked, ref_map); - transport_unlock_pack(transport); trace2_region_leave("fetch", "consume_refs", the_repository); + +out: + transport_unlock_pack(transport); return ret; } @@ -1520,8 +1509,7 @@ static void backfill_tags(struct transport *transport, struct ref *ref_map) transport_set_option(transport, TRANS_OPT_FOLLOWTAGS, NULL); transport_set_option(transport, TRANS_OPT_DEPTH, "0"); transport_set_option(transport, TRANS_OPT_DEEPEN_RELATIVE, NULL); - if (!fetch_refs(transport, ref_map)) - consume_refs(transport, ref_map); + fetch_and_consume_refs(transport, ref_map); if (gsecondary) { transport_disconnect(gsecondary); @@ -1612,7 +1600,7 @@ static int do_fetch(struct transport *transport, transport->url); } } - if (fetch_refs(transport, ref_map) || consume_refs(transport, ref_map)) { + if (fetch_and_consume_refs(transport, ref_map)) { free_refs(ref_map); retcode = 1; goto cleanup; -- cgit v1.2.3 From caff8b73402d4b5edb2c6c755506c5a90351b69a Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 1 Sep 2021 15:10:06 +0200 Subject: fetch: avoid second connectivity check if we already have all objects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When fetching refs, we are doing two connectivity checks: - The first one is done such that we can skip fetching refs in the case where we already have all objects referenced by the updated set of refs. - The second one verifies that we have all objects after we have fetched objects. We always execute both connectivity checks, but this is wasteful in case the first connectivity check already notices that we have all objects locally available. Skip the second connectivity check in case we already had all objects available. This gives us a nice speedup when doing a mirror-fetch in a repository with about 2.3M refs where the fetching repo already has all objects: Benchmark #1: HEAD~: git-fetch Time (mean ± σ): 30.025 s ± 0.081 s [User: 27.070 s, System: 4.933 s] Range (min … max): 29.900 s … 30.111 s 5 runs Benchmark #2: HEAD: git-fetch Time (mean ± σ): 25.574 s ± 0.177 s [User: 22.855 s, System: 4.683 s] Range (min … max): 25.399 s … 25.765 s 5 runs Summary 'HEAD: git-fetch' ran 1.17 ± 0.01 times faster than 'HEAD~: git-fetch' Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/fetch.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'builtin/fetch.c') diff --git a/builtin/fetch.c b/builtin/fetch.c index a1e17edd8b..e2c952ec67 100644 --- a/builtin/fetch.c +++ b/builtin/fetch.c @@ -1293,7 +1293,7 @@ static int check_exist_and_connected(struct ref *ref_map) static int fetch_and_consume_refs(struct transport *transport, struct ref *ref_map) { - int connectivity_checked; + int connectivity_checked = 1; int ret; /* @@ -1307,11 +1307,10 @@ static int fetch_and_consume_refs(struct transport *transport, struct ref *ref_m trace2_region_leave("fetch", "fetch_refs", the_repository); if (ret) goto out; + connectivity_checked = transport->smart_options ? + transport->smart_options->connectivity_checked : 0; } - connectivity_checked = transport->smart_options - ? transport->smart_options->connectivity_checked : 0; - trace2_region_enter("fetch", "consume_refs", the_repository); ret = store_updated_refs(transport->url, transport->remote->name, -- cgit v1.2.3