diff options
author | Junio C Hamano <gitster@pobox.com> | 2021-03-24 14:36:27 -0700 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2021-03-24 14:36:27 -0700 |
commit | 2744383cbda9bbbe4219bd3532757ae6d28460e1 (patch) | |
tree | 8ca08ee3555ef97487136e9e9de5699ca19d8990 | |
parent | Merge branch 'tb/push-simple-uses-branch-merge-config' (diff) | |
parent | builtin/pack-objects.c: ignore missing links with --stdin-packs (diff) | |
download | tgif-2744383cbda9bbbe4219bd3532757ae6d28460e1.tar.xz |
Merge branch 'tb/geometric-repack'
"git repack" so far has been only capable of repacking everything
under the sun into a single pack (or split by size). A cleverer
strategy to reduce the cost of repacking a repository has been
introduced.
* tb/geometric-repack:
builtin/pack-objects.c: ignore missing links with --stdin-packs
builtin/repack.c: reword comment around pack-objects flags
builtin/repack.c: be more conservative with unsigned overflows
builtin/repack.c: assign pack split later
t7703: test --geometric repack with loose objects
builtin/repack.c: do not repack single packs with --geometric
builtin/repack.c: add '--geometric' option
packfile: add kept-pack cache for find_kept_pack_entry()
builtin/pack-objects.c: rewrite honor-pack-keep logic
p5303: measure time to repack with keep
p5303: add missing &&-chains
builtin/pack-objects.c: add '--stdin-packs' option
revision: learn '--no-kept-objects'
packfile: introduce 'find_kept_pack_entry()'
-rw-r--r-- | Documentation/git-pack-objects.txt | 10 | ||||
-rw-r--r-- | Documentation/git-repack.txt | 23 | ||||
-rw-r--r-- | builtin/pack-objects.c | 330 | ||||
-rw-r--r-- | builtin/repack.c | 207 | ||||
-rw-r--r-- | object-store.h | 5 | ||||
-rw-r--r-- | packfile.c | 67 | ||||
-rw-r--r-- | packfile.h | 5 | ||||
-rw-r--r-- | revision.c | 15 | ||||
-rw-r--r-- | revision.h | 4 | ||||
-rwxr-xr-x | t/perf/p5303-many-packs.sh | 36 | ||||
-rwxr-xr-x | t/t5300-pack-object.sh | 135 | ||||
-rwxr-xr-x | t/t6114-keep-packs.sh | 69 | ||||
-rwxr-xr-x | t/t7703-repack-geometric.sh | 183 |
13 files changed, 1029 insertions, 60 deletions
diff --git a/Documentation/git-pack-objects.txt b/Documentation/git-pack-objects.txt index f85cb7ea93..25d9fbe37a 100644 --- a/Documentation/git-pack-objects.txt +++ b/Documentation/git-pack-objects.txt @@ -85,6 +85,16 @@ base-name:: reference was included in the resulting packfile. This can be useful to send new tags to native Git clients. +--stdin-packs:: + Read the basenames of packfiles (e.g., `pack-1234abcd.pack`) + from the standard input, instead of object names or revision + arguments. The resulting pack contains all objects listed in the + included packs (those not beginning with `^`), excluding any + objects listed in the excluded packs (beginning with `^`). ++ +Incompatible with `--revs`, or options that imply `--revs` (such as +`--all`), with the exception of `--unpacked`, which is compatible. + --window=<n>:: --depth=<n>:: These two options affect how the objects contained in diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt index fbd4b4ae06..317d63cf0d 100644 --- a/Documentation/git-repack.txt +++ b/Documentation/git-repack.txt @@ -165,6 +165,29 @@ depth is 4095. Pass the `--delta-islands` option to `git-pack-objects`, see linkgit:git-pack-objects[1]. +-g=<factor>:: +--geometric=<factor>:: + Arrange resulting pack structure so that each successive pack + contains at least `<factor>` times the number of objects as the + next-largest pack. ++ +`git repack` ensures this by determining a "cut" of packfiles that need +to be repacked into one in order to ensure a geometric progression. It +picks the smallest set of packfiles such that as many of the larger +packfiles (by count of objects contained in that pack) may be left +intact. ++ +Unlike other repack modes, the set of objects to pack is determined +uniquely by the set of packs being "rolled-up"; in other words, the +packs determined to need to be combined in order to restore a geometric +progression. ++ +When `--unpacked` is specified, loose objects are implicitly included in +this "roll-up", without respect to their reachability. This is subject +to change in the future. This option (implying a drastically different +repack mode) is not guaranteed to work with all other combinations of +option to `git repack`). + CONFIGURATION ------------- diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 4bb602688c..8319831514 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -1188,7 +1188,8 @@ static int have_duplicate_entry(const struct object_id *oid, return 1; } -static int want_found_object(int exclude, struct packed_git *p) +static int want_found_object(const struct object_id *oid, int exclude, + struct packed_git *p) { if (exclude) return 1; @@ -1204,27 +1205,82 @@ static int want_found_object(int exclude, struct packed_git *p) * make sure no copy of this object appears in _any_ pack that makes us * to omit the object, so we need to check all the packs. * - * We can however first check whether these options can possible matter; + * We can however first check whether these options can possibly matter; * if they do not matter we know we want the object in generated pack. * Otherwise, we signal "-1" at the end to tell the caller that we do * not know either way, and it needs to check more packs. */ - if (!ignore_packed_keep_on_disk && - !ignore_packed_keep_in_core && - (!local || !have_non_local_packs)) - return 1; + /* + * Objects in packs borrowed from elsewhere are discarded regardless of + * if they appear in other packs that weren't borrowed. + */ if (local && !p->pack_local) return 0; - if (p->pack_local && - ((ignore_packed_keep_on_disk && p->pack_keep) || - (ignore_packed_keep_in_core && p->pack_keep_in_core))) - return 0; + + /* + * Then handle .keep first, as we have a fast(er) path there. + */ + if (ignore_packed_keep_on_disk || ignore_packed_keep_in_core) { + /* + * Set the flags for the kept-pack cache to be the ones we want + * to ignore. + * + * That is, if we are ignoring objects in on-disk keep packs, + * then we want to search through the on-disk keep and ignore + * the in-core ones. + */ + unsigned flags = 0; + if (ignore_packed_keep_on_disk) + flags |= ON_DISK_KEEP_PACKS; + if (ignore_packed_keep_in_core) + flags |= IN_CORE_KEEP_PACKS; + + if (ignore_packed_keep_on_disk && p->pack_keep) + return 0; + if (ignore_packed_keep_in_core && p->pack_keep_in_core) + return 0; + if (has_object_kept_pack(oid, flags)) + return 0; + } + + /* + * At this point we know definitively that either we don't care about + * keep-packs, or the object is not in one. Keep checking other + * conditions... + */ + if (!local || !have_non_local_packs) + return 1; /* we don't know yet; keep looking for more packs */ return -1; } +static int want_object_in_pack_one(struct packed_git *p, + const struct object_id *oid, + int exclude, + struct packed_git **found_pack, + off_t *found_offset) +{ + off_t offset; + + if (p == *found_pack) + offset = *found_offset; + else + offset = find_pack_entry_one(oid->hash, p); + + if (offset) { + if (!*found_pack) { + if (!is_pack_valid(p)) + return -1; + *found_offset = offset; + *found_pack = p; + } + return want_found_object(oid, exclude, p); + } + return -1; +} + /* * Check whether we want the object in the pack (e.g., we do not want * objects found in non-local stores if the "--local" option was used). @@ -1252,7 +1308,7 @@ static int want_object_in_pack(const struct object_id *oid, * are present we will determine the answer right now. */ if (*found_pack) { - want = want_found_object(exclude, *found_pack); + want = want_found_object(oid, exclude, *found_pack); if (want != -1) return want; } @@ -1260,51 +1316,20 @@ static int want_object_in_pack(const struct object_id *oid, for (m = get_multi_pack_index(the_repository); m; m = m->next) { struct pack_entry e; if (fill_midx_entry(the_repository, oid, &e, m)) { - struct packed_git *p = e.p; - off_t offset; - - if (p == *found_pack) - offset = *found_offset; - else - offset = find_pack_entry_one(oid->hash, p); - - if (offset) { - if (!*found_pack) { - if (!is_pack_valid(p)) - continue; - *found_offset = offset; - *found_pack = p; - } - want = want_found_object(exclude, p); - if (want != -1) - return want; - } + want = want_object_in_pack_one(e.p, oid, exclude, found_pack, found_offset); + if (want != -1) + return want; } } list_for_each(pos, get_packed_git_mru(the_repository)) { struct packed_git *p = list_entry(pos, struct packed_git, mru); - off_t offset; - - if (p == *found_pack) - offset = *found_offset; - else - offset = find_pack_entry_one(oid->hash, p); - - if (offset) { - if (!*found_pack) { - if (!is_pack_valid(p)) - continue; - *found_offset = offset; - *found_pack = p; - } - want = want_found_object(exclude, p); - if (!exclude && want > 0) - list_move(&p->mru, - get_packed_git_mru(the_repository)); - if (want != -1) - return want; - } + want = want_object_in_pack_one(p, oid, exclude, found_pack, found_offset); + if (!exclude && want > 0) + list_move(&p->mru, + get_packed_git_mru(the_repository)); + if (want != -1) + return want; } if (uri_protocols.nr) { @@ -2986,6 +3011,191 @@ static int git_pack_config(const char *k, const char *v, void *cb) return git_default_config(k, v, cb); } +/* Counters for trace2 output when in --stdin-packs mode. */ +static int stdin_packs_found_nr; +static int stdin_packs_hints_nr; + +static int add_object_entry_from_pack(const struct object_id *oid, + struct packed_git *p, + uint32_t pos, + void *_data) +{ + struct rev_info *revs = _data; + struct object_info oi = OBJECT_INFO_INIT; + off_t ofs; + enum object_type type; + + display_progress(progress_state, ++nr_seen); + + if (have_duplicate_entry(oid, 0)) + return 0; + + ofs = nth_packed_object_offset(p, pos); + if (!want_object_in_pack(oid, 0, &p, &ofs)) + return 0; + + oi.typep = &type; + if (packed_object_info(the_repository, p, ofs, &oi) < 0) + die(_("could not get type of object %s in pack %s"), + oid_to_hex(oid), p->pack_name); + else if (type == OBJ_COMMIT) { + /* + * commits in included packs are used as starting points for the + * subsequent revision walk + */ + add_pending_oid(revs, NULL, oid, 0); + } + + stdin_packs_found_nr++; + + create_object_entry(oid, type, 0, 0, 0, p, ofs); + + return 0; +} + +static void show_commit_pack_hint(struct commit *commit, void *_data) +{ + /* nothing to do; commits don't have a namehash */ +} + +static void show_object_pack_hint(struct object *object, const char *name, + void *_data) +{ + struct object_entry *oe = packlist_find(&to_pack, &object->oid); + if (!oe) + return; + + /* + * Our 'to_pack' list was constructed by iterating all objects packed in + * included packs, and so doesn't have a non-zero hash field that you + * would typically pick up during a reachability traversal. + * + * Make a best-effort attempt to fill in the ->hash and ->no_try_delta + * here using a now in order to perhaps improve the delta selection + * process. + */ + oe->hash = pack_name_hash(name); + oe->no_try_delta = name && no_try_delta(name); + + stdin_packs_hints_nr++; +} + +static int pack_mtime_cmp(const void *_a, const void *_b) +{ + struct packed_git *a = ((const struct string_list_item*)_a)->util; + struct packed_git *b = ((const struct string_list_item*)_b)->util; + + /* + * order packs by descending mtime so that objects are laid out + * roughly as newest-to-oldest + */ + if (a->mtime < b->mtime) + return 1; + else if (b->mtime < a->mtime) + return -1; + else + return 0; +} + +static void read_packs_list_from_stdin(void) +{ + struct strbuf buf = STRBUF_INIT; + struct string_list include_packs = STRING_LIST_INIT_DUP; + struct string_list exclude_packs = STRING_LIST_INIT_DUP; + struct string_list_item *item = NULL; + + struct packed_git *p; + struct rev_info revs; + + repo_init_revisions(the_repository, &revs, NULL); + /* + * Use a revision walk to fill in the namehash of objects in the include + * packs. To save time, we'll avoid traversing through objects that are + * in excluded packs. + * + * That may cause us to avoid populating all of the namehash fields of + * all included objects, but our goal is best-effort, since this is only + * an optimization during delta selection. + */ + revs.no_kept_objects = 1; + revs.keep_pack_cache_flags |= IN_CORE_KEEP_PACKS; + revs.blob_objects = 1; + revs.tree_objects = 1; + revs.tag_objects = 1; + revs.ignore_missing_links = 1; + + while (strbuf_getline(&buf, stdin) != EOF) { + if (!buf.len) + continue; + + if (*buf.buf == '^') + string_list_append(&exclude_packs, buf.buf + 1); + else + string_list_append(&include_packs, buf.buf); + + strbuf_reset(&buf); + } + + string_list_sort(&include_packs); + string_list_sort(&exclude_packs); + + for (p = get_all_packs(the_repository); p; p = p->next) { + const char *pack_name = pack_basename(p); + + item = string_list_lookup(&include_packs, pack_name); + if (!item) + item = string_list_lookup(&exclude_packs, pack_name); + + if (item) + item->util = p; + } + + /* + * First handle all of the excluded packs, marking them as kept in-core + * so that later calls to add_object_entry() discards any objects that + * are also found in excluded packs. + */ + for_each_string_list_item(item, &exclude_packs) { + struct packed_git *p = item->util; + if (!p) + die(_("could not find pack '%s'"), item->string); + p->pack_keep_in_core = 1; + } + + /* + * Order packs by ascending mtime; use QSORT directly to access the + * string_list_item's ->util pointer, which string_list_sort() does not + * provide. + */ + QSORT(include_packs.items, include_packs.nr, pack_mtime_cmp); + + for_each_string_list_item(item, &include_packs) { + struct packed_git *p = item->util; + if (!p) + die(_("could not find pack '%s'"), item->string); + for_each_object_in_pack(p, + add_object_entry_from_pack, + &revs, + FOR_EACH_OBJECT_PACK_ORDER); + } + + if (prepare_revision_walk(&revs)) + die(_("revision walk setup failed")); + traverse_commit_list(&revs, + show_commit_pack_hint, + show_object_pack_hint, + NULL); + + trace2_data_intmax("pack-objects", the_repository, "stdin_packs_found", + stdin_packs_found_nr); + trace2_data_intmax("pack-objects", the_repository, "stdin_packs_hints", + stdin_packs_hints_nr); + + strbuf_release(&buf); + string_list_clear(&include_packs, 0); + string_list_clear(&exclude_packs, 0); +} + static void read_object_list_from_stdin(void) { char line[GIT_MAX_HEXSZ + 1 + PATH_MAX + 2]; @@ -3489,6 +3699,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) struct strvec rp = STRVEC_INIT; int rev_list_unpacked = 0, rev_list_all = 0, rev_list_reflog = 0; int rev_list_index = 0; + int stdin_packs = 0; struct string_list keep_pack_list = STRING_LIST_INIT_NODUP; struct option pack_objects_options[] = { OPT_SET_INT('q', "quiet", &progress, @@ -3539,6 +3750,8 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) OPT_SET_INT_F(0, "indexed-objects", &rev_list_index, N_("include objects referred to by the index"), 1, PARSE_OPT_NONEG), + OPT_BOOL(0, "stdin-packs", &stdin_packs, + N_("read packs from stdin")), OPT_BOOL(0, "stdout", &pack_to_stdout, N_("output pack to stdout")), OPT_BOOL(0, "include-tag", &include_tag, @@ -3645,7 +3858,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) use_internal_rev_list = 1; strvec_push(&rp, "--indexed-objects"); } - if (rev_list_unpacked) { + if (rev_list_unpacked && !stdin_packs) { use_internal_rev_list = 1; strvec_push(&rp, "--unpacked"); } @@ -3690,8 +3903,13 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (filter_options.choice) { if (!pack_to_stdout) die(_("cannot use --filter without --stdout")); + if (stdin_packs) + die(_("cannot use --filter with --stdin-packs")); } + if (stdin_packs && use_internal_rev_list) + die(_("cannot use internal rev list with --stdin-packs")); + /* * "soft" reasons not to use bitmaps - for on-disk repack by default we want * @@ -3750,7 +3968,13 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (progress) progress_state = start_progress(_("Enumerating objects"), 0); - if (!use_internal_rev_list) + if (stdin_packs) { + /* avoids adding objects in excluded packs */ + ignore_packed_keep_in_core = 1; + read_packs_list_from_stdin(); + if (rev_list_unpacked) + add_unreachable_loose_objects(); + } else if (!use_internal_rev_list) read_object_list_from_stdin(); else { get_object_list(rp.nr, rp.v); diff --git a/builtin/repack.c b/builtin/repack.c index 01440de2d5..6ce2556c9e 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -297,6 +297,142 @@ static void repack_promisor_objects(const struct pack_objects_args *args, #define ALL_INTO_ONE 1 #define LOOSEN_UNREACHABLE 2 +struct pack_geometry { + struct packed_git **pack; + uint32_t pack_nr, pack_alloc; + uint32_t split; +}; + +static uint32_t geometry_pack_weight(struct packed_git *p) +{ + if (open_pack_index(p)) + die(_("cannot open index for %s"), p->pack_name); + return p->num_objects; +} + +static int geometry_cmp(const void *va, const void *vb) +{ + uint32_t aw = geometry_pack_weight(*(struct packed_git **)va), + bw = geometry_pack_weight(*(struct packed_git **)vb); + + if (aw < bw) + return -1; + if (aw > bw) + return 1; + return 0; +} + +static void init_pack_geometry(struct pack_geometry **geometry_p) +{ + struct packed_git *p; + struct pack_geometry *geometry; + + *geometry_p = xcalloc(1, sizeof(struct pack_geometry)); + geometry = *geometry_p; + + for (p = get_all_packs(the_repository); p; p = p->next) { + if (!pack_kept_objects && p->pack_keep) + continue; + + ALLOC_GROW(geometry->pack, + geometry->pack_nr + 1, + geometry->pack_alloc); + + geometry->pack[geometry->pack_nr] = p; + geometry->pack_nr++; + } + + QSORT(geometry->pack, geometry->pack_nr, geometry_cmp); +} + +static void split_pack_geometry(struct pack_geometry *geometry, int factor) +{ + uint32_t i; + uint32_t split; + off_t total_size = 0; + + if (!geometry->pack_nr) { + geometry->split = geometry->pack_nr; + return; + } + + /* + * First, count the number of packs (in descending order of size) which + * already form a geometric progression. + */ + for (i = geometry->pack_nr - 1; i > 0; i--) { + struct packed_git *ours = geometry->pack[i]; + struct packed_git *prev = geometry->pack[i - 1]; + + if (unsigned_mult_overflows(factor, geometry_pack_weight(prev))) + die(_("pack %s too large to consider in geometric " + "progression"), + prev->pack_name); + + if (geometry_pack_weight(ours) < factor * geometry_pack_weight(prev)) + break; + } + + split = i; + + if (split) { + /* + * Move the split one to the right, since the top element in the + * last-compared pair can't be in the progression. Only do this + * when we split in the middle of the array (otherwise if we got + * to the end, then the split is in the right place). + */ + split++; + } + + /* + * Then, anything to the left of 'split' must be in a new pack. But, + * creating that new pack may cause packs in the heavy half to no longer + * form a geometric progression. + * + * Compute an expected size of the new pack, and then determine how many + * packs in the heavy half need to be joined into it (if any) to restore + * the geometric progression. + */ + for (i = 0; i < split; i++) { + struct packed_git *p = geometry->pack[i]; + + if (unsigned_add_overflows(total_size, geometry_pack_weight(p))) + die(_("pack %s too large to roll up"), p->pack_name); + total_size += geometry_pack_weight(p); + } + for (i = split; i < geometry->pack_nr; i++) { + struct packed_git *ours = geometry->pack[i]; + + if (unsigned_mult_overflows(factor, total_size)) + die(_("pack %s too large to roll up"), ours->pack_name); + + if (geometry_pack_weight(ours) < factor * total_size) { + if (unsigned_add_overflows(total_size, + geometry_pack_weight(ours))) + die(_("pack %s too large to roll up"), + ours->pack_name); + + split++; + total_size += geometry_pack_weight(ours); + } else + break; + } + + geometry->split = split; +} + +static void clear_pack_geometry(struct pack_geometry *geometry) +{ + if (!geometry) + return; + + free(geometry->pack); + geometry->pack_nr = 0; + geometry->pack_alloc = 0; + geometry->split = 0; +} + int cmd_repack(int argc, const char **argv, const char *prefix) { struct child_process cmd = CHILD_PROCESS_INIT; @@ -304,6 +440,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) struct string_list names = STRING_LIST_INIT_DUP; struct string_list rollback = STRING_LIST_INIT_NODUP; struct string_list existing_packs = STRING_LIST_INIT_DUP; + struct pack_geometry *geometry = NULL; struct strbuf line = STRBUF_INIT; int i, ext, ret; FILE *out; @@ -316,6 +453,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) struct string_list keep_pack_list = STRING_LIST_INIT_NODUP; int no_update_server_info = 0; struct pack_objects_args po_args = {NULL}; + int geometric_factor = 0; struct option builtin_repack_options[] = { OPT_BIT('a', NULL, &pack_everything, @@ -356,6 +494,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix) N_("repack objects in packs marked with .keep")), OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"), N_("do not repack this pack")), + OPT_INTEGER('g', "geometric", &geometric_factor, + N_("find a geometric progression with factor <N>")), OPT_END() }; @@ -382,6 +522,13 @@ int cmd_repack(int argc, const char **argv, const char *prefix) if (write_bitmaps && !(pack_everything & ALL_INTO_ONE)) die(_(incremental_bitmap_conflict_error)); + if (geometric_factor) { + if (pack_everything) + die(_("--geometric is incompatible with -A, -a")); + init_pack_geometry(&geometry); + split_pack_geometry(geometry, geometric_factor); + } + packdir = mkpathdup("%s/pack", get_object_directory()); packtmp = mkpathdup("%s/.tmp-%d-pack", packdir, (int)getpid()); @@ -396,9 +543,21 @@ int cmd_repack(int argc, const char **argv, const char *prefix) strvec_pushf(&cmd.args, "--keep-pack=%s", keep_pack_list.items[i].string); strvec_push(&cmd.args, "--non-empty"); - strvec_push(&cmd.args, "--all"); - strvec_push(&cmd.args, "--reflog"); - strvec_push(&cmd.args, "--indexed-objects"); + if (!geometry) { + /* + * We need to grab all reachable objects, including those that + * are reachable from reflogs and the index. + * + * When repacking into a geometric progression of packs, + * however, we ask 'git pack-objects --stdin-packs', and it is + * not about packing objects based on reachability but about + * repacking all the objects in specified packs and loose ones + * (indeed, --stdin-packs is incompatible with these options). + */ + strvec_push(&cmd.args, "--all"); + strvec_push(&cmd.args, "--reflog"); + strvec_push(&cmd.args, "--indexed-objects"); + } if (has_promisor_remote()) strvec_push(&cmd.args, "--exclude-promisor-objects"); if (write_bitmaps > 0) @@ -429,17 +588,37 @@ int cmd_repack(int argc, const char **argv, const char *prefix) strvec_push(&cmd.env_array, "GIT_REF_PARANOIA=1"); } } + } else if (geometry) { + strvec_push(&cmd.args, "--stdin-packs"); + strvec_push(&cmd.args, "--unpacked"); } else { strvec_push(&cmd.args, "--unpacked"); strvec_push(&cmd.args, "--incremental"); } - cmd.no_stdin = 1; + if (geometry) + cmd.in = -1; + else + cmd.no_stdin = 1; ret = start_command(&cmd); if (ret) return ret; + if (geometry) { + FILE *in = xfdopen(cmd.in, "w"); + /* + * The resulting pack should contain all objects in packs that + * are going to be rolled up, but exclude objects in packs which + * are being left alone. + */ + for (i = 0; i < geometry->split; i++) + fprintf(in, "%s\n", pack_basename(geometry->pack[i])); + for (i = geometry->split; i < geometry->pack_nr; i++) + fprintf(in, "^%s\n", pack_basename(geometry->pack[i])); + fclose(in); + } + out = xfdopen(cmd.out, "r"); while (strbuf_getline_lf(&line, out) != EOF) { if (line.len != the_hash_algo->hexsz) @@ -507,6 +686,25 @@ int cmd_repack(int argc, const char **argv, const char *prefix) if (!string_list_has_string(&names, sha1)) remove_redundant_pack(packdir, item->string); } + + if (geometry) { + struct strbuf buf = STRBUF_INIT; + + uint32_t i; + for (i = 0; i < geometry->split; i++) { + struct packed_git *p = geometry->pack[i]; + if (string_list_has_string(&names, + hash_to_hex(p->hash))) + continue; + + strbuf_reset(&buf); + strbuf_addstr(&buf, pack_basename(p)); + strbuf_strip_suffix(&buf, ".pack"); + + remove_redundant_pack(packdir, buf.buf); + } + strbuf_release(&buf); + } if (!po_args.quiet && isatty(2)) opts |= PRUNE_PACKED_VERBOSE; prune_packed_objects(opts); @@ -528,6 +726,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) string_list_clear(&names, 0); string_list_clear(&rollback, 0); string_list_clear(&existing_packs, 0); + clear_pack_geometry(geometry); strbuf_release(&line); return 0; diff --git a/object-store.h b/object-store.h index 541dab0858..ec32c23dcb 100644 --- a/object-store.h +++ b/object-store.h @@ -153,6 +153,11 @@ struct raw_object_store { /* A most-recently-used ordered version of the packed_git list. */ struct list_head packed_git_mru; + struct { + struct packed_git **packs; + unsigned flags; + } kept_pack_cache; + /* * A map of packfiles to packed_git structs for tracking which * packs have been loaded already. diff --git a/packfile.c b/packfile.c index ea29f4ba77..6661f3325a 100644 --- a/packfile.c +++ b/packfile.c @@ -2066,12 +2066,79 @@ int find_pack_entry(struct repository *r, const struct object_id *oid, struct pa return 0; } +static void maybe_invalidate_kept_pack_cache(struct repository *r, + unsigned flags) +{ + if (!r->objects->kept_pack_cache.packs) + return; + if (r->objects->kept_pack_cache.flags == flags) + return; + FREE_AND_NULL(r->objects->kept_pack_cache.packs); + r->objects->kept_pack_cache.flags = 0; +} + +static struct packed_git **kept_pack_cache(struct repository *r, unsigned flags) +{ + maybe_invalidate_kept_pack_cache(r, flags); + + if (!r->objects->kept_pack_cache.packs) { + struct packed_git **packs = NULL; + size_t nr = 0, alloc = 0; + struct packed_git *p; + + /* + * We want "all" packs here, because we need to cover ones that + * are used by a midx, as well. We need to look in every one of + * them (instead of the midx itself) to cover duplicates. It's + * possible that an object is found in two packs that the midx + * covers, one kept and one not kept, but the midx returns only + * the non-kept version. + */ + for (p = get_all_packs(r); p; p = p->next) { + if ((p->pack_keep && (flags & ON_DISK_KEEP_PACKS)) || + (p->pack_keep_in_core && (flags & IN_CORE_KEEP_PACKS))) { + ALLOC_GROW(packs, nr + 1, alloc); + packs[nr++] = p; + } + } + ALLOC_GROW(packs, nr + 1, alloc); + packs[nr] = NULL; + + r->objects->kept_pack_cache.packs = packs; + r->objects->kept_pack_cache.flags = flags; + } + + return r->objects->kept_pack_cache.packs; +} + +int find_kept_pack_entry(struct repository *r, + const struct object_id *oid, + unsigned flags, + struct pack_entry *e) +{ + struct packed_git **cache; + + for (cache = kept_pack_cache(r, flags); *cache; cache++) { + struct packed_git *p = *cache; + if (fill_pack_entry(oid, e, p)) + return 1; + } + + return 0; +} + int has_object_pack(const struct object_id *oid) { struct pack_entry e; return find_pack_entry(the_repository, oid, &e); } +int has_object_kept_pack(const struct object_id *oid, unsigned flags) +{ + struct pack_entry e; + return find_kept_pack_entry(the_repository, oid, flags, &e); +} + int has_pack_index(const unsigned char *sha1) { struct stat st; diff --git a/packfile.h b/packfile.h index 4cfec9e8d3..3ae117a8ae 100644 --- a/packfile.h +++ b/packfile.h @@ -162,13 +162,18 @@ int packed_object_info(struct repository *r, void mark_bad_packed_object(struct packed_git *p, const unsigned char *sha1); const struct packed_git *has_packed_and_bad(struct repository *r, const unsigned char *sha1); +#define ON_DISK_KEEP_PACKS 1 +#define IN_CORE_KEEP_PACKS 2 + /* * Iff a pack file in the given repository contains the object named by sha1, * return true and store its location to e. */ int find_pack_entry(struct repository *r, const struct object_id *oid, struct pack_entry *e); +int find_kept_pack_entry(struct repository *r, const struct object_id *oid, unsigned flags, struct pack_entry *e); int has_object_pack(const struct object_id *oid); +int has_object_kept_pack(const struct object_id *oid, unsigned flags); int has_pack_index(const unsigned char *sha1); diff --git a/revision.c b/revision.c index 99c859f797..553c0faa9b 100644 --- a/revision.c +++ b/revision.c @@ -2336,6 +2336,16 @@ static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg revs->unpacked = 1; } else if (starts_with(arg, "--unpacked=")) { die(_("--unpacked=<packfile> no longer supported")); + } else if (!strcmp(arg, "--no-kept-objects")) { + revs->no_kept_objects = 1; + revs->keep_pack_cache_flags |= IN_CORE_KEEP_PACKS; + revs->keep_pack_cache_flags |= ON_DISK_KEEP_PACKS; + } else if (skip_prefix(arg, "--no-kept-objects=", &optarg)) { + revs->no_kept_objects = 1; + if (!strcmp(optarg, "in-core")) + revs->keep_pack_cache_flags |= IN_CORE_KEEP_PACKS; + if (!strcmp(optarg, "on-disk")) + revs->keep_pack_cache_flags |= ON_DISK_KEEP_PACKS; } else if (!strcmp(arg, "-r")) { revs->diff = 1; revs->diffopt.flags.recursive = 1; @@ -3795,6 +3805,11 @@ enum commit_action get_commit_action(struct rev_info *revs, struct commit *commi return commit_ignore; if (revs->unpacked && has_object_pack(&commit->object.oid)) return commit_ignore; + if (revs->no_kept_objects) { + if (has_object_kept_pack(&commit->object.oid, + revs->keep_pack_cache_flags)) + return commit_ignore; + } if (commit->object.flags & UNINTERESTING) return commit_ignore; if (revs->line_level_traverse && !want_ancestry(revs)) { diff --git a/revision.h b/revision.h index e6be3c845e..a20a530d52 100644 --- a/revision.h +++ b/revision.h @@ -148,6 +148,7 @@ struct rev_info { edge_hint_aggressive:1, limited:1, unpacked:1, + no_kept_objects:1, boundary:2, count:1, left_right:1, @@ -317,6 +318,9 @@ struct rev_info { * This is loaded from the commit-graph being used. */ struct bloom_filter_settings *bloom_filter_settings; + + /* misc. flags related to '--no-kept-objects' */ + unsigned keep_pack_cache_flags; }; int ref_excluded(struct string_list *, const char *path); diff --git a/t/perf/p5303-many-packs.sh b/t/perf/p5303-many-packs.sh index ce0c42cc9f..35c0cbdf49 100755 --- a/t/perf/p5303-many-packs.sh +++ b/t/perf/p5303-many-packs.sh @@ -28,11 +28,18 @@ repack_into_n () { push @commits, $_ if $. % 5 == 1; } print reverse @commits; - ' "$1" >pushes + ' "$1" >pushes && # create base packfile - head -n 1 pushes | - git pack-objects --delta-base-offset --revs staging/pack + base_pack=$( + head -n 1 pushes | + git pack-objects --delta-base-offset --revs staging/pack + ) && + test_export base_pack && + + # create an empty packfile + empty_pack=$(git pack-objects staging/pack </dev/null) && + test_export empty_pack && # and then incrementals between each pair of commits last= && @@ -49,6 +56,12 @@ repack_into_n () { last=$rev done <pushes && + ( + find staging -type f -name 'pack-*.pack' | + xargs -n 1 basename | grep -v "$base_pack" && + printf "^pack-%s.pack\n" $base_pack + ) >stdin.packs + # and install the whole thing rm -f .git/objects/pack/* && mv staging/* .git/objects/pack/ @@ -91,6 +104,23 @@ do --reflog --indexed-objects --delta-base-offset \ --stdout </dev/null >/dev/null ' + + test_perf "repack with kept ($nr_packs)" ' + git pack-objects --keep-true-parents \ + --keep-pack=pack-$empty_pack.pack \ + --honor-pack-keep --non-empty --all \ + --reflog --indexed-objects --delta-base-offset \ + --stdout </dev/null >/dev/null + ' + + test_perf "repack with --stdin-packs ($nr_packs)" ' + git pack-objects \ + --keep-true-parents \ + --stdin-packs \ + --non-empty \ + --delta-base-offset \ + --stdout <stdin.packs >/dev/null + ' done # Measure pack loading with 10,000 packs. diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh index d586fdc7a9..b5699e0b28 100755 --- a/t/t5300-pack-object.sh +++ b/t/t5300-pack-object.sh @@ -532,4 +532,139 @@ test_expect_success 'prefetch objects' ' test_line_count = 1 donelines ' +test_expect_success 'setup for --stdin-packs tests' ' + git init stdin-packs && + ( + cd stdin-packs && + + test_commit A && + test_commit B && + test_commit C && + + for id in A B C + do + git pack-objects .git/objects/pack/pack-$id \ + --incremental --revs <<-EOF + refs/tags/$id + EOF + done && + + ls -la .git/objects/pack + ) +' + +test_expect_success '--stdin-packs with excluded packs' ' + ( + cd stdin-packs && + + PACK_A="$(basename .git/objects/pack/pack-A-*.pack)" && + PACK_B="$(basename .git/objects/pack/pack-B-*.pack)" && + PACK_C="$(basename .git/objects/pack/pack-C-*.pack)" && + + git pack-objects test --stdin-packs <<-EOF && + $PACK_A + ^$PACK_B + $PACK_C + EOF + + ( + git show-index <$(ls .git/objects/pack/pack-A-*.idx) && + git show-index <$(ls .git/objects/pack/pack-C-*.idx) + ) >expect.raw && + git show-index <$(ls test-*.idx) >actual.raw && + + cut -d" " -f2 <expect.raw | sort >expect && + cut -d" " -f2 <actual.raw | sort >actual && + test_cmp expect actual + ) +' + +test_expect_success '--stdin-packs is incompatible with --filter' ' + ( + cd stdin-packs && + test_must_fail git pack-objects --stdin-packs --stdout \ + --filter=blob:none </dev/null 2>err && + test_i18ngrep "cannot use --filter with --stdin-packs" err + ) +' + +test_expect_success '--stdin-packs is incompatible with --revs' ' + ( + cd stdin-packs && + test_must_fail git pack-objects --stdin-packs --revs out \ + </dev/null 2>err && + test_i18ngrep "cannot use internal rev list with --stdin-packs" err + ) +' + +test_expect_success '--stdin-packs with loose objects' ' + ( + cd stdin-packs && + + PACK_A="$(basename .git/objects/pack/pack-A-*.pack)" && + PACK_B="$(basename .git/objects/pack/pack-B-*.pack)" && + PACK_C="$(basename .git/objects/pack/pack-C-*.pack)" && + + test_commit D && # loose + + git pack-objects test2 --stdin-packs --unpacked <<-EOF && + $PACK_A + ^$PACK_B + $PACK_C + EOF + + ( + git show-index <$(ls .git/objects/pack/pack-A-*.idx) && + git show-index <$(ls .git/objects/pack/pack-C-*.idx) && + git rev-list --objects --no-object-names \ + refs/tags/C..refs/tags/D + + ) >expect.raw && + ls -la . && + git show-index <$(ls test2-*.idx) >actual.raw && + + cut -d" " -f2 <expect.raw | sort >expect && + cut -d" " -f2 <actual.raw | sort >actual && + test_cmp expect actual + ) +' + +test_expect_success '--stdin-packs with broken links' ' + ( + cd stdin-packs && + + # make an unreachable object with a bogus parent + git cat-file -p HEAD >commit && + sed "s/$(git rev-parse HEAD^)/$(test_oid zero)/" <commit | + git hash-object -w -t commit --stdin >in && + + git pack-objects .git/objects/pack/pack-D <in && + + PACK_A="$(basename .git/objects/pack/pack-A-*.pack)" && + PACK_B="$(basename .git/objects/pack/pack-B-*.pack)" && + PACK_C="$(basename .git/objects/pack/pack-C-*.pack)" && + PACK_D="$(basename .git/objects/pack/pack-D-*.pack)" && + + git pack-objects test3 --stdin-packs --unpacked <<-EOF && + $PACK_A + ^$PACK_B + $PACK_C + $PACK_D + EOF + + ( + git show-index <$(ls .git/objects/pack/pack-A-*.idx) && + git show-index <$(ls .git/objects/pack/pack-C-*.idx) && + git show-index <$(ls .git/objects/pack/pack-D-*.idx) && + git rev-list --objects --no-object-names \ + refs/tags/C..refs/tags/D + ) >expect.raw && + git show-index <$(ls test3-*.idx) >actual.raw && + + cut -d" " -f2 <expect.raw | sort >expect && + cut -d" " -f2 <actual.raw | sort >actual && + test_cmp expect actual + ) +' + test_done diff --git a/t/t6114-keep-packs.sh b/t/t6114-keep-packs.sh new file mode 100755 index 0000000000..9239d8aa46 --- /dev/null +++ b/t/t6114-keep-packs.sh @@ -0,0 +1,69 @@ +#!/bin/sh + +test_description='rev-list with .keep packs' +. ./test-lib.sh + +test_expect_success 'setup' ' + test_commit loose && + test_commit packed && + test_commit kept && + + KEPT_PACK=$(git pack-objects --revs .git/objects/pack/pack <<-EOF + refs/tags/kept + ^refs/tags/packed + EOF + ) && + MISC_PACK=$(git pack-objects --revs .git/objects/pack/pack <<-EOF + refs/tags/packed + ^refs/tags/loose + EOF + ) && + + touch .git/objects/pack/pack-$KEPT_PACK.keep +' + +rev_list_objects () { + git rev-list "$@" >out && + sort out +} + +idx_objects () { + git show-index <$1 >expect-idx && + cut -d" " -f2 <expect-idx | sort +} + +test_expect_success '--no-kept-objects excludes trees and blobs in .keep packs' ' + rev_list_objects --objects --all --no-object-names >kept && + rev_list_objects --objects --all --no-object-names --no-kept-objects >no-kept && + + idx_objects .git/objects/pack/pack-$KEPT_PACK.idx >expect && + comm -3 kept no-kept >actual && + + test_cmp expect actual +' + +test_expect_success '--no-kept-objects excludes kept non-MIDX object' ' + test_config core.multiPackIndex true && + + # Create a pack with just the commit object in pack, and do not mark it + # as kept (even though it appears in $KEPT_PACK, which does have a .keep + # file). + MIDX_PACK=$(git pack-objects .git/objects/pack/pack <<-EOF + $(git rev-parse kept) + EOF + ) && + + # Write a MIDX containing all packs, but use the version of the commit + # at "kept" in a non-kept pack by touching $MIDX_PACK. + touch .git/objects/pack/pack-$MIDX_PACK.pack && + git multi-pack-index write && + + rev_list_objects --objects --no-object-names --no-kept-objects HEAD >actual && + ( + idx_objects .git/objects/pack/pack-$MISC_PACK.idx && + git rev-list --objects --no-object-names refs/tags/loose + ) | sort >expect && + test_cmp expect actual +' + +test_done diff --git a/t/t7703-repack-geometric.sh b/t/t7703-repack-geometric.sh new file mode 100755 index 0000000000..5ccaa440e0 --- /dev/null +++ b/t/t7703-repack-geometric.sh @@ -0,0 +1,183 @@ +#!/bin/sh + +test_description='git repack --geometric works correctly' + +. ./test-lib.sh + +GIT_TEST_MULTI_PACK_INDEX=0 + +objdir=.git/objects +midx=$objdir/pack/multi-pack-index + +test_expect_success '--geometric with no packs' ' + git init geometric && + test_when_finished "rm -fr geometric" && + ( + cd geometric && + + git repack --geometric 2 >out && + test_i18ngrep "Nothing new to pack" out + ) +' + +test_expect_success '--geometric with one pack' ' + git init geometric && + test_when_finished "rm -fr geometric" && + ( + cd geometric && + + test_commit "base" && + git repack -d && + + git repack --geometric 2 >out && + + test_i18ngrep "Nothing new to pack" out + ) +' + +test_expect_success '--geometric with an intact progression' ' + git init geometric && + test_when_finished "rm -fr geometric" && + ( + cd geometric && + + # These packs already form a geometric progression. + test_commit_bulk --start=1 1 && # 3 objects + test_commit_bulk --start=2 2 && # 6 objects + test_commit_bulk --start=4 4 && # 12 objects + + find $objdir/pack -name "*.pack" | sort >expect && + git repack --geometric 2 -d && + find $objdir/pack -name "*.pack" | sort >actual && + + test_cmp expect actual + ) +' + +test_expect_success '--geometric with loose objects' ' + git init geometric && + test_when_finished "rm -fr geometric" && + ( + cd geometric && + + # These packs already form a geometric progression. + test_commit_bulk --start=1 1 && # 3 objects + test_commit_bulk --start=2 2 && # 6 objects + # The loose objects are packed together, breaking the + # progression. + test_commit loose && # 3 objects + + find $objdir/pack -name "*.pack" | sort >before && + git repack --geometric 2 -d && + find $objdir/pack -name "*.pack" | sort >after && + + comm -13 before after >new && + comm -23 before after >removed && + + test_line_count = 1 new && + test_must_be_empty removed && + + git repack --geometric 2 -d && + find $objdir/pack -name "*.pack" | sort >after && + + # The progression (3, 3, 6) is combined into one new pack. + test_line_count = 1 after + ) +' + +test_expect_success '--geometric with small-pack rollup' ' + git init geometric && + test_when_finished "rm -fr geometric" && + ( + cd geometric && + + test_commit_bulk --start=1 1 && # 3 objects + test_commit_bulk --start=2 1 && # 3 objects + find $objdir/pack -name "*.pack" | sort >small && + test_commit_bulk --start=3 4 && # 12 objects + test_commit_bulk --start=7 8 && # 24 objects + find $objdir/pack -name "*.pack" | sort >before && + + git repack --geometric 2 -d && + + # Three packs in total; two of the existing large ones, and one + # new one. + find $objdir/pack -name "*.pack" | sort >after && + test_line_count = 3 after && + comm -3 small before | tr -d "\t" >large && + grep -qFf large after + ) +' + +test_expect_success '--geometric with small- and large-pack rollup' ' + git init geometric && + test_when_finished "rm -fr geometric" && + ( + cd geometric && + + # size(small1) + size(small2) > size(medium) / 2 + test_commit_bulk --start=1 1 && # 3 objects + test_commit_bulk --start=2 1 && # 3 objects + test_commit_bulk --start=2 3 && # 7 objects + test_commit_bulk --start=6 9 && # 27 objects && + + find $objdir/pack -name "*.pack" | sort >before && + + git repack --geometric 2 -d && + + find $objdir/pack -name "*.pack" | sort >after && + comm -12 before after >untouched && + + # Two packs in total; the largest pack from before running "git + # repack", and one new one. + test_line_count = 1 untouched && + test_line_count = 2 after + ) +' + +test_expect_success '--geometric ignores kept packs' ' + git init geometric && + test_when_finished "rm -fr geometric" && + ( + cd geometric && + + test_commit kept && # 3 objects + test_commit pack && # 3 objects + + KEPT=$(git pack-objects --revs $objdir/pack/pack <<-EOF + refs/tags/kept + EOF + ) && + PACK=$(git pack-objects --revs $objdir/pack/pack <<-EOF + refs/tags/pack + ^refs/tags/kept + EOF + ) && + + # neither pack contains more than twice the number of objects in + # the other, so they should be combined. but, marking one as + # .kept on disk will "freeze" it, so the pack structure should + # remain unchanged. + touch $objdir/pack/pack-$KEPT.keep && + + find $objdir/pack -name "*.pack" | sort >before && + git repack --geometric 2 -d && + find $objdir/pack -name "*.pack" | sort >after && + + # both packs should still exist + test_path_is_file $objdir/pack/pack-$KEPT.pack && + test_path_is_file $objdir/pack/pack-$PACK.pack && + + # and no new packs should be created + test_cmp before after && + + # Passing --pack-kept-objects causes packs with a .keep file to + # be repacked, too. + git repack --geometric 2 -d --pack-kept-objects && + + find $objdir/pack -name "*.pack" >after && + test_line_count = 1 after + ) +' + +test_done |