From b1adb38458679af98a057bcdc988a7f6ce1247d6 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 10 Aug 2018 19:17:14 -0400 Subject: cat-file: rename batch_{loose,packed}_object callbacks We're not really doing the batch-show operation in these callbacks, but just collecting the set of objects. That distinction will become more important in a future patch, so let's rename them now to avoid cluttering that diff. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'builtin') diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 4a44b2404f..2d34f3b867 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -420,18 +420,18 @@ static int batch_object_cb(const struct object_id *oid, void *vdata) return 0; } -static int batch_loose_object(const struct object_id *oid, - const char *path, - void *data) +static int collect_loose_object(const struct object_id *oid, + const char *path, + void *data) { oid_array_append(data, oid); return 0; } -static int batch_packed_object(const struct object_id *oid, - struct packed_git *pack, - uint32_t pos, - void *data) +static int collect_packed_object(const struct object_id *oid, + struct packed_git *pack, + uint32_t pos, + void *data) { oid_array_append(data, oid); return 0; @@ -476,8 +476,8 @@ static int batch_objects(struct batch_options *opt) struct oid_array sa = OID_ARRAY_INIT; struct object_cb_data cb; - for_each_loose_object(batch_loose_object, &sa, 0); - for_each_packed_object(batch_packed_object, &sa, 0); + for_each_loose_object(collect_loose_object, &sa, 0); + for_each_packed_object(collect_packed_object, &sa, 0); if (repository_format_partial_clone) warning("This repository has extensions.partialClone set. Some objects may not be loaded."); -- cgit v1.2.3 From 0750bb5b51f021ecad6f33b7ec88cdfc2a8cdff4 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Fri, 10 Aug 2018 19:24:57 -0400 Subject: cat-file: support "unordered" output for --batch-all-objects If you're going to access the contents of every object in a packfile, it's generally much more efficient to do so in pack order, rather than in hash order. That increases the locality of access within the packfile, which in turn is friendlier to the delta base cache, since the packfile puts related deltas next to each other. By contrast, hash order is effectively random, since the sha1 has no discernible relationship to the content. This patch introduces an "--unordered" option to cat-file which iterates over packs in pack-order under the hood. You can see the results when dumping all of the file content: $ time ./git cat-file --batch-all-objects --buffer --batch | wc -c 6883195596 real 0m44.491s user 0m42.902s sys 0m5.230s $ time ./git cat-file --unordered \ --batch-all-objects --buffer --batch | wc -c 6883195596 real 0m6.075s user 0m4.774s sys 0m3.548s Same output, different order, way faster. The same speed-up applies even if you end up accessing the object content in a different process, like: git cat-file --batch-all-objects --buffer --batch-check | grep blob | git cat-file --batch='%(objectname) %(rest)' | wc -c Adding "--unordered" to the first command drops the runtime in git.git from 24s to 3.5s. Side note: there are actually further speedups available for doing it all in-process now. Since we are outputting the object content during the actual pack iteration, we know where to find the object and could skip the extra lookup done by oid_object_info(). This patch stops short of that optimization since the underlying API isn't ready for us to make those sorts of direct requests. So if --unordered is so much better, why not make it the default? Two reasons: 1. We've promised in the documentation that --batch-all-objects outputs in hash order. Since cat-file is plumbing, people may be relying on that default, and we can't change it. 2. It's actually _slower_ for some cases. We have to compute the pack revindex to walk in pack order. And our de-duplication step uses an oidset, rather than a sort-and-dedup, which can end up being more expensive. If we're just accessing the type and size of each object, for example, like: git cat-file --batch-all-objects --buffer --batch-check my best-of-five warm cache timings go from 900ms to 1100ms using --unordered. Though it's possible in a cold-cache or under memory pressure that we could do better, since we'd have better locality within the packfile. And one final question: why is it "--unordered" and not "--pack-order"? The answer is again two-fold: 1. "pack order" isn't a well-defined thing across the whole set of objects. We're hitting loose objects, as well as objects in multiple packs, and the only ordering we're promising is _within_ a single pack. The rest is apparently random. 2. The point here is optimization. So we don't want to promise any particular ordering, but only to say that we will choose an ordering which is likely to be efficient for accessing the object content. That leaves the door open for further changes in the future without having to add another compatibility option. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 5 deletions(-) (limited to 'builtin') diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 2d34f3b867..45992c9be9 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -21,6 +21,7 @@ struct batch_options { int print_contents; int buffer_output; int all_objects; + int unordered; int cmdmode; /* may be 'w' or 'c' for --filters or --textconv */ const char *format; }; @@ -410,6 +411,7 @@ static void batch_one_object(const char *obj_name, struct batch_options *opt, struct object_cb_data { struct batch_options *opt; struct expand_data *expand; + struct oidset *seen; }; static int batch_object_cb(const struct object_id *oid, void *vdata) @@ -437,6 +439,32 @@ static int collect_packed_object(const struct object_id *oid, return 0; } +static int batch_unordered_object(const struct object_id *oid, void *vdata) +{ + struct object_cb_data *data = vdata; + + if (oidset_contains(data->seen, oid)) + return 0; + oidset_insert(data->seen, oid); + + return batch_object_cb(oid, data); +} + +static int batch_unordered_loose(const struct object_id *oid, + const char *path, + void *data) +{ + return batch_unordered_object(oid, data); +} + +static int batch_unordered_packed(const struct object_id *oid, + struct packed_git *pack, + uint32_t pos, + void *data) +{ + return batch_unordered_object(oid, data); +} + static int batch_objects(struct batch_options *opt) { struct strbuf buf = STRBUF_INIT; @@ -473,19 +501,35 @@ static int batch_objects(struct batch_options *opt) data.info.typep = &data.type; if (opt->all_objects) { - struct oid_array sa = OID_ARRAY_INIT; struct object_cb_data cb; - for_each_loose_object(collect_loose_object, &sa, 0); - for_each_packed_object(collect_packed_object, &sa, 0); if (repository_format_partial_clone) warning("This repository has extensions.partialClone set. Some objects may not be loaded."); cb.opt = opt; cb.expand = &data; - oid_array_for_each_unique(&sa, batch_object_cb, &cb); - oid_array_clear(&sa); + if (opt->unordered) { + struct oidset seen = OIDSET_INIT; + + cb.seen = &seen; + + for_each_loose_object(batch_unordered_loose, &cb, 0); + for_each_packed_object(batch_unordered_packed, &cb, + FOR_EACH_OBJECT_PACK_ORDER); + + oidset_clear(&seen); + } else { + struct oid_array sa = OID_ARRAY_INIT; + + for_each_loose_object(collect_loose_object, &sa, 0); + for_each_packed_object(collect_packed_object, &sa, 0); + + oid_array_for_each_unique(&sa, batch_object_cb, &cb); + + oid_array_clear(&sa); + } + return 0; } @@ -586,6 +630,8 @@ int cmd_cat_file(int argc, const char **argv, const char *prefix) N_("follow in-tree symlinks (used with --batch or --batch-check)")), OPT_BOOL(0, "batch-all-objects", &batch.all_objects, N_("show all objects with --batch or --batch-check")), + OPT_BOOL(0, "unordered", &batch.unordered, + N_("do not order --batch-all-objects output")), OPT_END() }; -- cgit v1.2.3 From ced9fff75dad2578d7583ba3085970b03c66c57b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 14 Aug 2018 14:14:27 -0400 Subject: cat-file: use oidset check-and-insert We don't need to check if the oidset has our object before we insert it; that's done as part of the insertion. We can just rely on the return value from oidset_insert(), which saves one hash lookup per object. This measurable speedup is tiny and within the run-to-run noise, but the result is simpler to read, too. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'builtin') diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 45992c9be9..04b5cda191 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -443,9 +443,8 @@ static int batch_unordered_object(const struct object_id *oid, void *vdata) { struct object_cb_data *data = vdata; - if (oidset_contains(data->seen, oid)) + if (oidset_insert(data->seen, oid)) return 0; - oidset_insert(data->seen, oid); return batch_object_cb(oid, data); } -- cgit v1.2.3 From 54d2f0d945abac2d8a8a1bcc258db937e597189e Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 14 Aug 2018 14:18:06 -0400 Subject: cat-file: split batch "buf" into two variables We use the "buf" strbuf for two things: to read incoming lines, and as a scratch space for test-expanding the user-provided format. Let's split this into two variables with descriptive names, which makes their purpose and lifetime more clear. It will also help in a future patch when we start using the "output" buffer for more expansions. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'builtin') diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 04b5cda191..3ed1d0be80 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -466,7 +466,8 @@ static int batch_unordered_packed(const struct object_id *oid, static int batch_objects(struct batch_options *opt) { - struct strbuf buf = STRBUF_INIT; + struct strbuf input = STRBUF_INIT; + struct strbuf output = STRBUF_INIT; struct expand_data data; int save_warning; int retval = 0; @@ -481,8 +482,9 @@ static int batch_objects(struct batch_options *opt) */ memset(&data, 0, sizeof(data)); data.mark_query = 1; - strbuf_expand(&buf, opt->format, expand_format, &data); + strbuf_expand(&output, opt->format, expand_format, &data); data.mark_query = 0; + strbuf_release(&output); if (opt->cmdmode) data.split_on_whitespace = 1; @@ -542,14 +544,14 @@ static int batch_objects(struct batch_options *opt) save_warning = warn_on_object_refname_ambiguity; warn_on_object_refname_ambiguity = 0; - while (strbuf_getline(&buf, stdin) != EOF) { + while (strbuf_getline(&input, stdin) != EOF) { if (data.split_on_whitespace) { /* * Split at first whitespace, tying off the beginning * of the string and saving the remainder (or NULL) in * data.rest. */ - char *p = strpbrk(buf.buf, " \t"); + char *p = strpbrk(input.buf, " \t"); if (p) { while (*p && strchr(" \t", *p)) *p++ = '\0'; @@ -557,10 +559,10 @@ static int batch_objects(struct batch_options *opt) data.rest = p; } - batch_one_object(buf.buf, opt, &data); + batch_one_object(input.buf, opt, &data); } - strbuf_release(&buf); + strbuf_release(&input); warn_on_object_refname_ambiguity = save_warning; return retval; } -- cgit v1.2.3 From 79ed0a5e2627a0e1eab0448e6f32d781e80bfafa Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 14 Aug 2018 14:20:22 -0400 Subject: cat-file: use a single strbuf for all output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we're in batch mode, we end up in batch_object_write() for each object, which allocates its own strbuf for each call. Instead, we can provide a single "scratch" buffer that gets reused for each output. When running: git cat-file --batch-all-objects --batch-check='%(objectname)' on git.git, my best-of-five time drops from: real 0m0.171s user 0m0.159s sys 0m0.012s to: real 0m0.133s user 0m0.121s sys 0m0.012s Note that we could do this just by putting the "scratch" pointer into "struct expand_data", but I chose instead to add an extra parameter to the callstack. That's more verbose, but it makes it a bit more obvious what is going on, which in turn makes it easy to see where we need to be releasing the string in the caller (right after the loop which uses it in each case). Based-on-a-patch-by: René Scharfe Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/cat-file.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) (limited to 'builtin') diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 3ed1d0be80..08dced2614 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -338,11 +338,11 @@ static void print_object_or_die(struct batch_options *opt, struct expand_data *d } } -static void batch_object_write(const char *obj_name, struct batch_options *opt, +static void batch_object_write(const char *obj_name, + struct strbuf *scratch, + struct batch_options *opt, struct expand_data *data) { - struct strbuf buf = STRBUF_INIT; - if (!data->skip_object_info && oid_object_info_extended(the_repository, &data->oid, &data->info, OBJECT_INFO_LOOKUP_REPLACE) < 0) { @@ -352,10 +352,10 @@ static void batch_object_write(const char *obj_name, struct batch_options *opt, return; } - strbuf_expand(&buf, opt->format, expand_format, data); - strbuf_addch(&buf, '\n'); - batch_write(opt, buf.buf, buf.len); - strbuf_release(&buf); + strbuf_reset(scratch); + strbuf_expand(scratch, opt->format, expand_format, data); + strbuf_addch(scratch, '\n'); + batch_write(opt, scratch->buf, scratch->len); if (opt->print_contents) { print_object_or_die(opt, data); @@ -363,7 +363,9 @@ static void batch_object_write(const char *obj_name, struct batch_options *opt, } } -static void batch_one_object(const char *obj_name, struct batch_options *opt, +static void batch_one_object(const char *obj_name, + struct strbuf *scratch, + struct batch_options *opt, struct expand_data *data) { struct object_context ctx; @@ -405,20 +407,21 @@ static void batch_one_object(const char *obj_name, struct batch_options *opt, return; } - batch_object_write(obj_name, opt, data); + batch_object_write(obj_name, scratch, opt, data); } struct object_cb_data { struct batch_options *opt; struct expand_data *expand; struct oidset *seen; + struct strbuf *scratch; }; static int batch_object_cb(const struct object_id *oid, void *vdata) { struct object_cb_data *data = vdata; oidcpy(&data->expand->oid, oid); - batch_object_write(NULL, data->opt, data->expand); + batch_object_write(NULL, data->scratch, data->opt, data->expand); return 0; } @@ -509,6 +512,7 @@ static int batch_objects(struct batch_options *opt) cb.opt = opt; cb.expand = &data; + cb.scratch = &output; if (opt->unordered) { struct oidset seen = OIDSET_INIT; @@ -531,6 +535,7 @@ static int batch_objects(struct batch_options *opt) oid_array_clear(&sa); } + strbuf_release(&output); return 0; } @@ -559,10 +564,11 @@ static int batch_objects(struct batch_options *opt) data.rest = p; } - batch_one_object(input.buf, opt, &data); + batch_one_object(input.buf, &output, opt, &data); } strbuf_release(&input); + strbuf_release(&output); warn_on_object_refname_ambiguity = save_warning; return retval; } -- cgit v1.2.3 From 0889aae1cd18c1804ba01c1a4229e516dfb9fe9b Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 14 Aug 2018 14:21:18 -0400 Subject: for_each_*_object: move declarations to object-store.h The for_each_loose_object() and for_each_packed_object() functions are meant to be part of a unified interface: they use the same set of for_each_object_flags, and it's not inconceivable that we might one day add a single for_each_object() wrapper around them. Let's put them together in a single file, so we can avoid awkwardness like saying "the flags for this function are over in cache.h". Moving the loose functions to packfile.h is silly. Moving the packed functions to cache.h works, but makes the "cache.h is a kitchen sink" problem worse. The best place is the recently-created object-store.h, since these are quite obviously related to object storage. The for_each_*_in_objdir() functions do not use the same flags, but they are logically part of the same interface as for_each_loose_object(), and share callback signatures. So we'll move those, as well, as they also make sense in object-store.h. Signed-off-by: Jeff King Signed-off-by: Junio C Hamano --- builtin/prune-packed.c | 1 + 1 file changed, 1 insertion(+) (limited to 'builtin') diff --git a/builtin/prune-packed.c b/builtin/prune-packed.c index 4ff525e50f..a9e7b552b9 100644 --- a/builtin/prune-packed.c +++ b/builtin/prune-packed.c @@ -3,6 +3,7 @@ #include "progress.h" #include "parse-options.h" #include "packfile.h" +#include "object-store.h" static const char * const prune_packed_usage[] = { N_("git prune-packed [-n | --dry-run] [-q | --quiet]"), -- cgit v1.2.3