diff options
author | Junio C Hamano <gitster@pobox.com> | 2018-09-17 13:53:50 -0700 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2018-09-17 13:53:50 -0700 |
commit | 49f210fd5279eeb0106cd7e4383a1c4454d30428 (patch) | |
tree | 980ae4660060e0b6e8d10600bb71262d1b200166 | |
parent | Merge branch 'jk/branch-l-1-repurpose' (diff) | |
parent | pack-objects: consider packs in multi-pack-index (diff) | |
download | tgif-49f210fd5279eeb0106cd7e4383a1c4454d30428.tar.xz |
Merge branch 'ds/multi-pack-index'
When there are too many packfiles in a repository (which is not
recommended), looking up an object in these would require
consulting many pack .idx files; a new mechanism to have a single
file that consolidates all of these .idx files is introduced.
* ds/multi-pack-index: (32 commits)
pack-objects: consider packs in multi-pack-index
midx: test a few commands that use get_all_packs
treewide: use get_all_packs
packfile: add all_packs list
midx: fix bug that skips midx with alternates
midx: stop reporting garbage
midx: mark bad packed objects
multi-pack-index: store local property
multi-pack-index: provide more helpful usage info
midx: clear midx on repack
packfile: skip loading index if in multi-pack-index
midx: prevent duplicate packfile loads
midx: use midx in approximate_object_count
midx: use existing midx when writing new one
midx: use midx in abbreviation calculations
midx: read objects from multi-pack-index
config: create core.multiPackIndex setting
midx: write object offsets
midx: write object id fanout chunk
midx: write object ids in a chunk
...
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | Documentation/config.txt | 5 | ||||
-rw-r--r-- | Documentation/git-multi-pack-index.txt | 56 | ||||
-rw-r--r-- | Documentation/technical/multi-pack-index.txt | 109 | ||||
-rw-r--r-- | Documentation/technical/pack-format.txt | 77 | ||||
-rw-r--r-- | Makefile | 3 | ||||
-rw-r--r-- | builtin.h | 1 | ||||
-rw-r--r-- | builtin/count-objects.c | 2 | ||||
-rw-r--r-- | builtin/fsck.c | 4 | ||||
-rw-r--r-- | builtin/gc.c | 4 | ||||
-rw-r--r-- | builtin/multi-pack-index.c | 47 | ||||
-rw-r--r-- | builtin/pack-objects.c | 42 | ||||
-rw-r--r-- | builtin/pack-redundant.c | 4 | ||||
-rw-r--r-- | builtin/repack.c | 9 | ||||
-rw-r--r-- | command-list.txt | 1 | ||||
-rw-r--r-- | fast-import.c | 4 | ||||
-rw-r--r-- | git.c | 1 | ||||
-rw-r--r-- | http-backend.c | 4 | ||||
-rw-r--r-- | midx.c | 930 | ||||
-rw-r--r-- | midx.h | 47 | ||||
-rw-r--r-- | object-store.h | 15 | ||||
-rw-r--r-- | pack-bitmap.c | 2 | ||||
-rw-r--r-- | pack-objects.c | 2 | ||||
-rw-r--r-- | packfile.c | 199 | ||||
-rw-r--r-- | packfile.h | 10 | ||||
-rw-r--r-- | server-info.c | 4 | ||||
-rw-r--r-- | sha1-name.c | 70 | ||||
-rw-r--r-- | t/helper/test-read-midx.c | 51 | ||||
-rw-r--r-- | t/helper/test-tool.c | 1 | ||||
-rw-r--r-- | t/helper/test-tool.h | 1 | ||||
-rwxr-xr-x | t/t5319-multi-pack-index.sh | 217 |
31 files changed, 1859 insertions, 66 deletions
diff --git a/.gitignore b/.gitignore index ffceea7d59..9d1363a1eb 100644 --- a/.gitignore +++ b/.gitignore @@ -99,8 +99,9 @@ /git-mergetool--lib /git-mktag /git-mktree -/git-name-rev +/git-multi-pack-index /git-mv +/git-name-rev /git-notes /git-p4 /git-pack-redundant diff --git a/Documentation/config.txt b/Documentation/config.txt index 69a27eb688..6ecd70df0a 100644 --- a/Documentation/config.txt +++ b/Documentation/config.txt @@ -937,6 +937,11 @@ core.useReplaceRefs:: option was given on the command line. See linkgit:git[1] and linkgit:git-replace[1] for more information. +core.multiPackIndex:: + Use the multi-pack-index file to track multiple packfiles using a + single index. See link:technical/multi-pack-index.html[the + multi-pack-index design document]. + core.sparseCheckout:: Enable "sparse checkout" feature. See section "Sparse checkout" in linkgit:git-read-tree[1] for more information. diff --git a/Documentation/git-multi-pack-index.txt b/Documentation/git-multi-pack-index.txt new file mode 100644 index 0000000000..1f97e79912 --- /dev/null +++ b/Documentation/git-multi-pack-index.txt @@ -0,0 +1,56 @@ +git-multi-pack-index(1) +======================= + +NAME +---- +git-multi-pack-index - Write and verify multi-pack-indexes + + +SYNOPSIS +-------- +[verse] +'git multi-pack-index' [--object-dir=<dir>] <verb> + +DESCRIPTION +----------- +Write or verify a multi-pack-index (MIDX) file. + +OPTIONS +------- + +--object-dir=<dir>:: + Use given directory for the location of Git objects. We check + `<dir>/packs/multi-pack-index` for the current MIDX file, and + `<dir>/packs` for the pack-files to index. + +write:: + When given as the verb, write a new MIDX file to + `<dir>/packs/multi-pack-index`. + + +EXAMPLES +-------- + +* Write a MIDX file for the packfiles in the current .git folder. ++ +----------------------------------------------- +$ git multi-pack-index write +----------------------------------------------- + +* Write a MIDX file for the packfiles in an alternate object store. ++ +----------------------------------------------- +$ git multi-pack-index --object-dir <alt> write +----------------------------------------------- + + +SEE ALSO +-------- +See link:technical/multi-pack-index.html[The Multi-Pack-Index Design +Document] and link:technical/pack-format.html[The Multi-Pack-Index +Format] for more information on the multi-pack-index feature. + + +GIT +--- +Part of the linkgit:git[1] suite diff --git a/Documentation/technical/multi-pack-index.txt b/Documentation/technical/multi-pack-index.txt new file mode 100644 index 0000000000..d7e57639f7 --- /dev/null +++ b/Documentation/technical/multi-pack-index.txt @@ -0,0 +1,109 @@ +Multi-Pack-Index (MIDX) Design Notes +==================================== + +The Git object directory contains a 'pack' directory containing +packfiles (with suffix ".pack") and pack-indexes (with suffix +".idx"). The pack-indexes provide a way to lookup objects and +navigate to their offset within the pack, but these must come +in pairs with the packfiles. This pairing depends on the file +names, as the pack-index differs only in suffix with its pack- +file. While the pack-indexes provide fast lookup per packfile, +this performance degrades as the number of packfiles increases, +because abbreviations need to inspect every packfile and we are +more likely to have a miss on our most-recently-used packfile. +For some large repositories, repacking into a single packfile +is not feasible due to storage space or excessive repack times. + +The multi-pack-index (MIDX for short) stores a list of objects +and their offsets into multiple packfiles. It contains: + +- A list of packfile names. +- A sorted list of object IDs. +- A list of metadata for the ith object ID including: + - A value j referring to the jth packfile. + - An offset within the jth packfile for the object. +- If large offsets are required, we use another list of large + offsets similar to version 2 pack-indexes. + +Thus, we can provide O(log N) lookup time for any number +of packfiles. + +Design Details +-------------- + +- The MIDX is stored in a file named 'multi-pack-index' in the + .git/objects/pack directory. This could be stored in the pack + directory of an alternate. It refers only to packfiles in that + same directory. + +- The pack.multiIndex config setting must be on to consume MIDX files. + +- The file format includes parameters for the object ID hash + function, so a future change of hash algorithm does not require + a change in format. + +- The MIDX keeps only one record per object ID. If an object appears + in multiple packfiles, then the MIDX selects the copy in the most- + recently modified packfile. + +- If there exist packfiles in the pack directory not registered in + the MIDX, then those packfiles are loaded into the `packed_git` + list and `packed_git_mru` cache. + +- The pack-indexes (.idx files) remain in the pack directory so we + can delete the MIDX file, set core.midx to false, or downgrade + without any loss of information. + +- The MIDX file format uses a chunk-based approach (similar to the + commit-graph file) that allows optional data to be added. + +Future Work +----------- + +- Add a 'verify' subcommand to the 'git midx' builtin to verify the + contents of the multi-pack-index file match the offsets listed in + the corresponding pack-indexes. + +- The multi-pack-index allows many packfiles, especially in a context + where repacking is expensive (such as a very large repo), or + unexpected maintenance time is unacceptable (such as a high-demand + build machine). However, the multi-pack-index needs to be rewritten + in full every time. We can extend the format to be incremental, so + writes are fast. By storing a small "tip" multi-pack-index that + points to large "base" MIDX files, we can keep writes fast while + still reducing the number of binary searches required for object + lookups. + +- The reachability bitmap is currently paired directly with a single + packfile, using the pack-order as the object order to hopefully + compress the bitmaps well using run-length encoding. This could be + extended to pair a reachability bitmap with a multi-pack-index. If + the multi-pack-index is extended to store a "stable object order" + (a function Order(hash) = integer that is constant for a given hash, + even as the multi-pack-index is updated) then a reachability bitmap + could point to a multi-pack-index and be updated independently. + +- Packfiles can be marked as "special" using empty files that share + the initial name but replace ".pack" with ".keep" or ".promisor". + We can add an optional chunk of data to the multi-pack-index that + records flags of information about the packfiles. This allows new + states, such as 'repacked' or 'redeltified', that can help with + pack maintenance in a multi-pack environment. It may also be + helpful to organize packfiles by object type (commit, tree, blob, + etc.) and use this metadata to help that maintenance. + +- The partial clone feature records special "promisor" packs that + may point to objects that are not stored locally, but available + on request to a server. The multi-pack-index does not currently + track these promisor packs. + +Related Links +------------- +[0] https://bugs.chromium.org/p/git/issues/detail?id=6 + Chromium work item for: Multi-Pack Index (MIDX) + +[1] https://public-inbox.org/git/20180107181459.222909-1-dstolee@microsoft.com/ + An earlier RFC for the multi-pack-index feature + +[2] https://public-inbox.org/git/alpine.DEB.2.20.1803091557510.23109@alexmv-linux/ + Git Merge 2018 Contributor's summit notes (includes discussion of MIDX) diff --git a/Documentation/technical/pack-format.txt b/Documentation/technical/pack-format.txt index 70a99fd142..cab5bdd2ff 100644 --- a/Documentation/technical/pack-format.txt +++ b/Documentation/technical/pack-format.txt @@ -252,3 +252,80 @@ Pack file entry: <+ corresponding packfile. 20-byte SHA-1-checksum of all of the above. + +== multi-pack-index (MIDX) files have the following format: + +The multi-pack-index files refer to multiple pack-files and loose objects. + +In order to allow extensions that add extra data to the MIDX, we organize +the body into "chunks" and provide a lookup table at the beginning of the +body. The header includes certain length values, such as the number of packs, +the number of base MIDX files, hash lengths and types. + +All 4-byte numbers are in network order. + +HEADER: + + 4-byte signature: + The signature is: {'M', 'I', 'D', 'X'} + + 1-byte version number: + Git only writes or recognizes version 1. + + 1-byte Object Id Version + Git only writes or recognizes version 1 (SHA1). + + 1-byte number of "chunks" + + 1-byte number of base multi-pack-index files: + This value is currently always zero. + + 4-byte number of pack files + +CHUNK LOOKUP: + + (C + 1) * 12 bytes providing the chunk offsets: + First 4 bytes describe chunk id. Value 0 is a terminating label. + Other 8 bytes provide offset in current file for chunk to start. + (Chunks are provided in file-order, so you can infer the length + using the next chunk position if necessary.) + + The remaining data in the body is described one chunk at a time, and + these chunks may be given in any order. Chunks are required unless + otherwise specified. + +CHUNK DATA: + + Packfile Names (ID: {'P', 'N', 'A', 'M'}) + Stores the packfile names as concatenated, null-terminated strings. + Packfiles must be listed in lexicographic order for fast lookups by + name. This is the only chunk not guaranteed to be a multiple of four + bytes in length, so should be the last chunk for alignment reasons. + + OID Fanout (ID: {'O', 'I', 'D', 'F'}) + The ith entry, F[i], stores the number of OIDs with first + byte at most i. Thus F[255] stores the total + number of objects. + + OID Lookup (ID: {'O', 'I', 'D', 'L'}) + The OIDs for all objects in the MIDX are stored in lexicographic + order in this chunk. + + Object Offsets (ID: {'O', 'O', 'F', 'F'}) + Stores two 4-byte values for every object. + 1: The pack-int-id for the pack storing this object. + 2: The offset within the pack. + If all offsets are less than 2^31, then the large offset chunk + will not exist and offsets are stored as in IDX v1. + If there is at least one offset value larger than 2^32-1, then + the large offset chunk must exist. If the large offset chunk + exists and the 31st bit is on, then removing that bit reveals + the row in the large offsets containing the 8-byte offset of + this object. + + [Optional] Object Large Offsets (ID: {'L', 'O', 'F', 'F'}) + 8-byte offsets into large packfiles. + +TRAILER: + + 20-byte SHA1-checksum of the above contents. @@ -723,6 +723,7 @@ TEST_BUILTINS_OBJS += test-online-cpus.o TEST_BUILTINS_OBJS += test-path-utils.o TEST_BUILTINS_OBJS += test-prio-queue.o TEST_BUILTINS_OBJS += test-read-cache.o +TEST_BUILTINS_OBJS += test-read-midx.o TEST_BUILTINS_OBJS += test-ref-store.o TEST_BUILTINS_OBJS += test-regex.o TEST_BUILTINS_OBJS += test-repository.o @@ -900,6 +901,7 @@ LIB_OBJS += merge.o LIB_OBJS += merge-blobs.o LIB_OBJS += merge-recursive.o LIB_OBJS += mergesort.o +LIB_OBJS += midx.o LIB_OBJS += name-hash.o LIB_OBJS += negotiator/default.o LIB_OBJS += negotiator/skipping.o @@ -1060,6 +1062,7 @@ BUILTIN_OBJS += builtin/merge-recursive.o BUILTIN_OBJS += builtin/merge-tree.o BUILTIN_OBJS += builtin/mktag.o BUILTIN_OBJS += builtin/mktree.o +BUILTIN_OBJS += builtin/multi-pack-index.o BUILTIN_OBJS += builtin/mv.o BUILTIN_OBJS += builtin/name-rev.o BUILTIN_OBJS += builtin/notes.o @@ -191,6 +191,7 @@ extern int cmd_merge_recursive(int argc, const char **argv, const char *prefix); extern int cmd_merge_tree(int argc, const char **argv, const char *prefix); extern int cmd_mktag(int argc, const char **argv, const char *prefix); extern int cmd_mktree(int argc, const char **argv, const char *prefix); +extern int cmd_multi_pack_index(int argc, const char **argv, const char *prefix); extern int cmd_mv(int argc, const char **argv, const char *prefix); extern int cmd_name_rev(int argc, const char **argv, const char *prefix); extern int cmd_notes(int argc, const char **argv, const char *prefix); diff --git a/builtin/count-objects.c b/builtin/count-objects.c index d51e2ce1ec..a7cad052c6 100644 --- a/builtin/count-objects.c +++ b/builtin/count-objects.c @@ -123,7 +123,7 @@ int cmd_count_objects(int argc, const char **argv, const char *prefix) struct strbuf pack_buf = STRBUF_INIT; struct strbuf garbage_buf = STRBUF_INIT; - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { if (!p->pack_local) continue; if (open_pack_index(p)) diff --git a/builtin/fsck.c b/builtin/fsck.c index 250f5af118..63c8578cc1 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -740,7 +740,7 @@ int cmd_fsck(int argc, const char **argv, const char *prefix) struct progress *progress = NULL; if (show_progress) { - for (p = get_packed_git(the_repository); p; + for (p = get_all_packs(the_repository); p; p = p->next) { if (open_pack_index(p)) continue; @@ -749,7 +749,7 @@ int cmd_fsck(int argc, const char **argv, const char *prefix) progress = start_progress(_("Checking objects"), total); } - for (p = get_packed_git(the_repository); p; + for (p = get_all_packs(the_repository); p; p = p->next) { /* verify gives error messages itself */ if (verify_pack(p, fsck_obj_buffer, diff --git a/builtin/gc.c b/builtin/gc.c index 57069442b0..2b592260e9 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -183,7 +183,7 @@ static struct packed_git *find_base_packs(struct string_list *packs, { struct packed_git *p, *base = NULL; - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { if (!p->pack_local) continue; if (limit) { @@ -208,7 +208,7 @@ static int too_many_packs(void) if (gc_auto_pack_limit <= 0) return 0; - for (cnt = 0, p = get_packed_git(the_repository); p; p = p->next) { + for (cnt = 0, p = get_all_packs(the_repository); p; p = p->next) { if (!p->pack_local) continue; if (p->pack_keep) diff --git a/builtin/multi-pack-index.c b/builtin/multi-pack-index.c new file mode 100644 index 0000000000..2633efd95d --- /dev/null +++ b/builtin/multi-pack-index.c @@ -0,0 +1,47 @@ +#include "builtin.h" +#include "cache.h" +#include "config.h" +#include "parse-options.h" +#include "midx.h" + +static char const * const builtin_multi_pack_index_usage[] = { + N_("git multi-pack-index [--object-dir=<dir>] write"), + NULL +}; + +static struct opts_multi_pack_index { + const char *object_dir; +} opts; + +int cmd_multi_pack_index(int argc, const char **argv, + const char *prefix) +{ + static struct option builtin_multi_pack_index_options[] = { + OPT_FILENAME(0, "object-dir", &opts.object_dir, + N_("object directory containing set of packfile and pack-index pairs")), + OPT_END(), + }; + + git_config(git_default_config, NULL); + + argc = parse_options(argc, argv, prefix, + builtin_multi_pack_index_options, + builtin_multi_pack_index_usage, 0); + + if (!opts.object_dir) + opts.object_dir = get_object_directory(); + + if (argc == 0) + usage_with_options(builtin_multi_pack_index_usage, + builtin_multi_pack_index_options); + + if (argc > 1) { + die(_("too many arguments")); + return 1; + } + + if (!strcmp(argv[0], "write")) + return write_midx_file(opts.object_dir); + + die(_("unrecognized verb: %s"), argv[0]); +} diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index d1144a8f7e..caa4cd0211 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -31,6 +31,7 @@ #include "packfile.h" #include "object-store.h" #include "dir.h" +#include "midx.h" #define IN_PACK(obj) oe_in_pack(&to_pack, obj) #define SIZE(obj) oe_size(&to_pack, obj) @@ -1040,6 +1041,7 @@ static int want_object_in_pack(const struct object_id *oid, { int want; struct list_head *pos; + struct multi_pack_index *m; if (!exclude && local && has_loose_object_nonlocal(oid)) return 0; @@ -1054,6 +1056,32 @@ static int want_object_in_pack(const struct object_id *oid, if (want != -1) return want; } + + for (m = get_multi_pack_index(the_repository); m; m = m->next) { + struct pack_entry e; + if (fill_midx_entry(oid, &e, m)) { + struct packed_git *p = e.p; + off_t offset; + + if (p == *found_pack) + offset = *found_offset; + else + offset = find_pack_entry_one(oid->hash, p); + + if (offset) { + if (!*found_pack) { + if (!is_pack_valid(p)) + continue; + *found_offset = offset; + *found_pack = p; + } + want = want_found_object(exclude, p); + if (want != -1) + return want; + } + } + } + list_for_each(pos, get_packed_git_mru(the_repository)) { struct packed_git *p = list_entry(pos, struct packed_git, mru); off_t offset; @@ -2806,7 +2834,7 @@ static void add_objects_in_unpacked_packs(struct rev_info *revs) memset(&in_pack, 0, sizeof(in_pack)); - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { struct object_id oid; struct object *o; @@ -2870,7 +2898,7 @@ static int has_sha1_pack_kept_or_nonlocal(const struct object_id *oid) struct packed_git *p; p = (last_found != (void *)1) ? last_found : - get_packed_git(the_repository); + get_all_packs(the_repository); while (p) { if ((!p->pack_local || p->pack_keep || @@ -2880,7 +2908,7 @@ static int has_sha1_pack_kept_or_nonlocal(const struct object_id *oid) return 1; } if (p == last_found) - p = get_packed_git(the_repository); + p = get_all_packs(the_repository); else p = p->next; if (p == last_found) @@ -2916,7 +2944,7 @@ static void loosen_unused_packed_objects(struct rev_info *revs) uint32_t i; struct object_id oid; - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { if (!p->pack_local || p->pack_keep || p->pack_keep_in_core) continue; @@ -3063,7 +3091,7 @@ static void add_extra_kept_packs(const struct string_list *names) if (!names->nr) return; - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { const char *name = basename(p->pack_name); int i; @@ -3336,7 +3364,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) add_extra_kept_packs(&keep_pack_list); if (ignore_packed_keep_on_disk) { struct packed_git *p; - for (p = get_packed_git(the_repository); p; p = p->next) + for (p = get_all_packs(the_repository); p; p = p->next) if (p->pack_local && p->pack_keep) break; if (!p) /* no keep-able packs found */ @@ -3349,7 +3377,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) * it also covers non-local objects */ struct packed_git *p; - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { if (!p->pack_local) { have_non_local_packs = 1; break; diff --git a/builtin/pack-redundant.c b/builtin/pack-redundant.c index 0494dceff7..cf9a9aabd4 100644 --- a/builtin/pack-redundant.c +++ b/builtin/pack-redundant.c @@ -577,7 +577,7 @@ static struct pack_list * add_pack(struct packed_git *p) static struct pack_list * add_pack_file(const char *filename) { - struct packed_git *p = get_packed_git(the_repository); + struct packed_git *p = get_all_packs(the_repository); if (strlen(filename) < 40) die("Bad pack filename: %s", filename); @@ -592,7 +592,7 @@ static struct pack_list * add_pack_file(const char *filename) static void load_all(void) { - struct packed_git *p = get_packed_git(the_repository); + struct packed_git *p = get_all_packs(the_repository); while (p) { add_pack(p); diff --git a/builtin/repack.c b/builtin/repack.c index d5886039cc..42be88e86c 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -8,6 +8,7 @@ #include "strbuf.h" #include "string-list.h" #include "argv-array.h" +#include "midx.h" #include "packfile.h" #include "object-store.h" @@ -280,6 +281,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix) int keep_unreachable = 0; struct string_list keep_pack_list = STRING_LIST_INIT_NODUP; int no_update_server_info = 0; + int midx_cleared = 0; struct pack_objects_args po_args = {NULL}; struct option builtin_repack_options[] = { @@ -418,6 +420,13 @@ int cmd_repack(int argc, const char **argv, const char *prefix) for_each_string_list_item(item, &names) { for (ext = 0; ext < ARRAY_SIZE(exts); ext++) { char *fname, *fname_old; + + if (!midx_cleared) { + /* if we move a packfile, it will invalidated the midx */ + clear_midx_file(get_object_directory()); + midx_cleared = 1; + } + fname = mkpathdup("%s/pack-%s%s", packdir, item->string, exts[ext].name); if (!file_exists(fname)) { diff --git a/command-list.txt b/command-list.txt index a9dda3b8af..c36ea3c182 100644 --- a/command-list.txt +++ b/command-list.txt @@ -123,6 +123,7 @@ git-merge-index plumbingmanipulators git-merge-one-file purehelpers git-mergetool ancillarymanipulators complete git-merge-tree ancillaryinterrogators +git-multi-pack-index plumbingmanipulators git-mktag plumbingmanipulators git-mktree plumbingmanipulators git-mv mainporcelain worktree diff --git a/fast-import.c b/fast-import.c index 89bb0c9db3..f8c3acd3b5 100644 --- a/fast-import.c +++ b/fast-import.c @@ -1068,7 +1068,7 @@ static int store_object( duplicate_count_by_type[type]++; return 1; } else if (find_sha1_pack(oid.hash, - get_packed_git(the_repository))) { + get_all_packs(the_repository))) { e->type = type; e->pack_id = MAX_PACK_ID; e->idx.offset = 1; /* just not zero! */ @@ -1266,7 +1266,7 @@ static void stream_blob(uintmax_t len, struct object_id *oidout, uintmax_t mark) truncate_pack(&checkpoint); } else if (find_sha1_pack(oid.hash, - get_packed_git(the_repository))) { + get_all_packs(the_repository))) { e->type = OBJ_BLOB; e->pack_id = MAX_PACK_ID; e->idx.offset = 1; /* just not zero! */ @@ -508,6 +508,7 @@ static struct cmd_struct commands[] = { { "merge-tree", cmd_merge_tree, RUN_SETUP | NO_PARSEOPT }, { "mktag", cmd_mktag, RUN_SETUP | NO_PARSEOPT }, { "mktree", cmd_mktree, RUN_SETUP }, + { "multi-pack-index", cmd_multi_pack_index, RUN_SETUP_GENTLY }, { "mv", cmd_mv, RUN_SETUP | NEED_WORK_TREE }, { "name-rev", cmd_name_rev, RUN_SETUP }, { "notes", cmd_notes, RUN_SETUP }, diff --git a/http-backend.c b/http-backend.c index 458642ef72..9e894f197f 100644 --- a/http-backend.c +++ b/http-backend.c @@ -595,13 +595,13 @@ static void get_info_packs(struct strbuf *hdr, char *arg) size_t cnt = 0; select_getanyfile(hdr); - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { if (p->pack_local) cnt++; } strbuf_grow(&buf, cnt * 53 + 2); - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { if (p->pack_local) strbuf_addf(&buf, "P %s\n", p->pack_name + objdirlen + 6); } diff --git a/midx.c b/midx.c new file mode 100644 index 0000000000..f3e8dbc108 --- /dev/null +++ b/midx.c @@ -0,0 +1,930 @@ +#include "cache.h" +#include "config.h" +#include "csum-file.h" +#include "dir.h" +#include "lockfile.h" +#include "packfile.h" +#include "object-store.h" +#include "sha1-lookup.h" +#include "midx.h" + +#define MIDX_SIGNATURE 0x4d494458 /* "MIDX" */ +#define MIDX_VERSION 1 +#define MIDX_BYTE_FILE_VERSION 4 +#define MIDX_BYTE_HASH_VERSION 5 +#define MIDX_BYTE_NUM_CHUNKS 6 +#define MIDX_BYTE_NUM_PACKS 8 +#define MIDX_HASH_VERSION 1 +#define MIDX_HEADER_SIZE 12 +#define MIDX_HASH_LEN 20 +#define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN) + +#define MIDX_MAX_CHUNKS 5 +#define MIDX_CHUNK_ALIGNMENT 4 +#define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */ +#define MIDX_CHUNKID_OIDFANOUT 0x4f494446 /* "OIDF" */ +#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */ +#define MIDX_CHUNKID_OBJECTOFFSETS 0x4f4f4646 /* "OOFF" */ +#define MIDX_CHUNKID_LARGEOFFSETS 0x4c4f4646 /* "LOFF" */ +#define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t)) +#define MIDX_CHUNK_FANOUT_SIZE (sizeof(uint32_t) * 256) +#define MIDX_CHUNK_OFFSET_WIDTH (2 * sizeof(uint32_t)) +#define MIDX_CHUNK_LARGE_OFFSET_WIDTH (sizeof(uint64_t)) +#define MIDX_LARGE_OFFSET_NEEDED 0x80000000 + +static char *get_midx_filename(const char *object_dir) +{ + return xstrfmt("%s/pack/multi-pack-index", object_dir); +} + +struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local) +{ + struct multi_pack_index *m = NULL; + int fd; + struct stat st; + size_t midx_size; + void *midx_map = NULL; + uint32_t hash_version; + char *midx_name = get_midx_filename(object_dir); + uint32_t i; + const char *cur_pack_name; + + fd = git_open(midx_name); + + if (fd < 0) + goto cleanup_fail; + if (fstat(fd, &st)) { + error_errno(_("failed to read %s"), midx_name); + goto cleanup_fail; + } + + midx_size = xsize_t(st.st_size); + + if (midx_size < MIDX_MIN_SIZE) { + error(_("multi-pack-index file %s is too small"), midx_name); + goto cleanup_fail; + } + + FREE_AND_NULL(midx_name); + + midx_map = xmmap(NULL, midx_size, PROT_READ, MAP_PRIVATE, fd, 0); + + FLEX_ALLOC_MEM(m, object_dir, object_dir, strlen(object_dir)); + m->fd = fd; + m->data = midx_map; + m->data_len = midx_size; + m->local = local; + + m->signature = get_be32(m->data); + if (m->signature != MIDX_SIGNATURE) { + error(_("multi-pack-index signature 0x%08x does not match signature 0x%08x"), + m->signature, MIDX_SIGNATURE); + goto cleanup_fail; + } + + m->version = m->data[MIDX_BYTE_FILE_VERSION]; + if (m->version != MIDX_VERSION) { + error(_("multi-pack-index version %d not recognized"), + m->version); + goto cleanup_fail; + } + + hash_version = m->data[MIDX_BYTE_HASH_VERSION]; + if (hash_version != MIDX_HASH_VERSION) { + error(_("hash version %u does not match"), hash_version); + goto cleanup_fail; + } + m->hash_len = MIDX_HASH_LEN; + + m->num_chunks = m->data[MIDX_BYTE_NUM_CHUNKS]; + + m->num_packs = get_be32(m->data + MIDX_BYTE_NUM_PACKS); + + for (i = 0; i < m->num_chunks; i++) { + uint32_t chunk_id = get_be32(m->data + MIDX_HEADER_SIZE + + MIDX_CHUNKLOOKUP_WIDTH * i); + uint64_t chunk_offset = get_be64(m->data + MIDX_HEADER_SIZE + 4 + + MIDX_CHUNKLOOKUP_WIDTH * i); + + switch (chunk_id) { + case MIDX_CHUNKID_PACKNAMES: + m->chunk_pack_names = m->data + chunk_offset; + break; + + case MIDX_CHUNKID_OIDFANOUT: + m->chunk_oid_fanout = (uint32_t *)(m->data + chunk_offset); + break; + + case MIDX_CHUNKID_OIDLOOKUP: + m->chunk_oid_lookup = m->data + chunk_offset; + break; + + case MIDX_CHUNKID_OBJECTOFFSETS: + m->chunk_object_offsets = m->data + chunk_offset; + break; + + case MIDX_CHUNKID_LARGEOFFSETS: + m->chunk_large_offsets = m->data + chunk_offset; + break; + + case 0: + die(_("terminating multi-pack-index chunk id appears earlier than expected")); + break; + + default: + /* + * Do nothing on unrecognized chunks, allowing future + * extensions to add optional chunks. + */ + break; + } + } + + if (!m->chunk_pack_names) + die(_("multi-pack-index missing required pack-name chunk")); + if (!m->chunk_oid_fanout) + die(_("multi-pack-index missing required OID fanout chunk")); + if (!m->chunk_oid_lookup) + die(_("multi-pack-index missing required OID lookup chunk")); + if (!m->chunk_object_offsets) + die(_("multi-pack-index missing required object offsets chunk")); + + m->num_objects = ntohl(m->chunk_oid_fanout[255]); + + m->pack_names = xcalloc(m->num_packs, sizeof(*m->pack_names)); + m->packs = xcalloc(m->num_packs, sizeof(*m->packs)); + + cur_pack_name = (const char *)m->chunk_pack_names; + for (i = 0; i < m->num_packs; i++) { + m->pack_names[i] = cur_pack_name; + + cur_pack_name += strlen(cur_pack_name) + 1; + + if (i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0) { + error(_("multi-pack-index pack names out of order: '%s' before '%s'"), + m->pack_names[i - 1], + m->pack_names[i]); + goto cleanup_fail; + } + } + + return m; + +cleanup_fail: + free(m); + free(midx_name); + if (midx_map) + munmap(midx_map, midx_size); + if (0 <= fd) + close(fd); + return NULL; +} + +static void close_midx(struct multi_pack_index *m) +{ + uint32_t i; + munmap((unsigned char *)m->data, m->data_len); + close(m->fd); + m->fd = -1; + + for (i = 0; i < m->num_packs; i++) { + if (m->packs[i]) { + close_pack(m->packs[i]); + free(m->packs); + } + } + FREE_AND_NULL(m->packs); + FREE_AND_NULL(m->pack_names); +} + +int prepare_midx_pack(struct multi_pack_index *m, uint32_t pack_int_id) +{ + struct strbuf pack_name = STRBUF_INIT; + + if (pack_int_id >= m->num_packs) + BUG("bad pack-int-id"); + + if (m->packs[pack_int_id]) + return 0; + + strbuf_addf(&pack_name, "%s/pack/%s", m->object_dir, + m->pack_names[pack_int_id]); + + m->packs[pack_int_id] = add_packed_git(pack_name.buf, pack_name.len, m->local); + strbuf_release(&pack_name); + return !m->packs[pack_int_id]; +} + +int bsearch_midx(const struct object_id *oid, struct multi_pack_index *m, uint32_t *result) +{ + return bsearch_hash(oid->hash, m->chunk_oid_fanout, m->chunk_oid_lookup, + MIDX_HASH_LEN, result); +} + +struct object_id *nth_midxed_object_oid(struct object_id *oid, + struct multi_pack_index *m, + uint32_t n) +{ + if (n >= m->num_objects) + return NULL; + + hashcpy(oid->hash, m->chunk_oid_lookup + m->hash_len * n); + return oid; +} + +static off_t nth_midxed_offset(struct multi_pack_index *m, uint32_t pos) +{ + const unsigned char *offset_data; + uint32_t offset32; + + offset_data = m->chunk_object_offsets + pos * MIDX_CHUNK_OFFSET_WIDTH; + offset32 = get_be32(offset_data + sizeof(uint32_t)); + + if (m->chunk_large_offsets && offset32 & MIDX_LARGE_OFFSET_NEEDED) { + if (sizeof(offset32) < sizeof(uint64_t)) + die(_("multi-pack-index stores a 64-bit offset, but off_t is too small")); + + offset32 ^= MIDX_LARGE_OFFSET_NEEDED; + return get_be64(m->chunk_large_offsets + sizeof(uint64_t) * offset32); + } + + return offset32; +} + +static uint32_t nth_midxed_pack_int_id(struct multi_pack_index *m, uint32_t pos) +{ + return get_be32(m->chunk_object_offsets + pos * MIDX_CHUNK_OFFSET_WIDTH); +} + +static int nth_midxed_pack_entry(struct multi_pack_index *m, struct pack_entry *e, uint32_t pos) +{ + uint32_t pack_int_id; + struct packed_git *p; + + if (pos >= m->num_objects) + return 0; + + pack_int_id = nth_midxed_pack_int_id(m, pos); + + if (prepare_midx_pack(m, pack_int_id)) + die(_("error preparing packfile from multi-pack-index")); + p = m->packs[pack_int_id]; + + /* + * We are about to tell the caller where they can locate the + * requested object. We better make sure the packfile is + * still here and can be accessed before supplying that + * answer, as it may have been deleted since the MIDX was + * loaded! + */ + if (!is_pack_valid(p)) + return 0; + + if (p->num_bad_objects) { + uint32_t i; + struct object_id oid; + nth_midxed_object_oid(&oid, m, pos); + for (i = 0; i < p->num_bad_objects; i++) + if (!hashcmp(oid.hash, + p->bad_object_sha1 + the_hash_algo->rawsz * i)) + return 0; + } + + e->offset = nth_midxed_offset(m, pos); + e->p = p; + + return 1; +} + +int fill_midx_entry(const struct object_id *oid, struct pack_entry *e, struct multi_pack_index *m) +{ + uint32_t pos; + + if (!bsearch_midx(oid, m, &pos)) + return 0; + + return nth_midxed_pack_entry(m, e, pos); +} + +int midx_contains_pack(struct multi_pack_index *m, const char *idx_name) +{ + uint32_t first = 0, last = m->num_packs; + + while (first < last) { + uint32_t mid = first + (last - first) / 2; + const char *current; + int cmp; + + current = m->pack_names[mid]; + cmp = strcmp(idx_name, current); + if (!cmp) + return 1; + if (cmp > 0) { + first = mid + 1; + continue; + } + last = mid; + } + + return 0; +} + +int prepare_multi_pack_index_one(struct repository *r, const char *object_dir, int local) +{ + struct multi_pack_index *m; + struct multi_pack_index *m_search; + int config_value; + + if (repo_config_get_bool(r, "core.multipackindex", &config_value) || + !config_value) + return 0; + + for (m_search = r->objects->multi_pack_index; m_search; m_search = m_search->next) + if (!strcmp(object_dir, m_search->object_dir)) + return 1; + + m = load_multi_pack_index(object_dir, local); + + if (m) { + m->next = r->objects->multi_pack_index; + r->objects->multi_pack_index = m; + return 1; + } + + return 0; +} + +static size_t write_midx_header(struct hashfile *f, + unsigned char num_chunks, + uint32_t num_packs) +{ + unsigned char byte_values[4]; + + hashwrite_be32(f, MIDX_SIGNATURE); + byte_values[0] = MIDX_VERSION; + byte_values[1] = MIDX_HASH_VERSION; + byte_values[2] = num_chunks; + byte_values[3] = 0; /* unused */ + hashwrite(f, byte_values, sizeof(byte_values)); + hashwrite_be32(f, num_packs); + + return MIDX_HEADER_SIZE; +} + +struct pack_list { + struct packed_git **list; + char **names; + uint32_t nr; + uint32_t alloc_list; + uint32_t alloc_names; + size_t pack_name_concat_len; + struct multi_pack_index *m; +}; + +static void add_pack_to_midx(const char *full_path, size_t full_path_len, + const char *file_name, void *data) +{ + struct pack_list *packs = (struct pack_list *)data; + + if (ends_with(file_name, ".idx")) { + if (packs->m && midx_contains_pack(packs->m, file_name)) + return; + + ALLOC_GROW(packs->list, packs->nr + 1, packs->alloc_list); + ALLOC_GROW(packs->names, packs->nr + 1, packs->alloc_names); + + packs->list[packs->nr] = add_packed_git(full_path, + full_path_len, + 0); + + if (!packs->list[packs->nr]) { + warning(_("failed to add packfile '%s'"), + full_path); + return; + } + + if (open_pack_index(packs->list[packs->nr])) { + warning(_("failed to open pack-index '%s'"), + full_path); + close_pack(packs->list[packs->nr]); + FREE_AND_NULL(packs->list[packs->nr]); + return; + } + + packs->names[packs->nr] = xstrdup(file_name); + packs->pack_name_concat_len += strlen(file_name) + 1; + packs->nr++; + } +} + +struct pack_pair { + uint32_t pack_int_id; + char *pack_name; +}; + +static int pack_pair_compare(const void *_a, const void *_b) +{ + struct pack_pair *a = (struct pack_pair *)_a; + struct pack_pair *b = (struct pack_pair *)_b; + return strcmp(a->pack_name, b->pack_name); +} + +static void sort_packs_by_name(char **pack_names, uint32_t nr_packs, uint32_t *perm) +{ + uint32_t i; + struct pack_pair *pairs; + + ALLOC_ARRAY(pairs, nr_packs); + + for (i = 0; i < nr_packs; i++) { + pairs[i].pack_int_id = i; + pairs[i].pack_name = pack_names[i]; + } + + QSORT(pairs, nr_packs, pack_pair_compare); + + for (i = 0; i < nr_packs; i++) { + pack_names[i] = pairs[i].pack_name; + perm[pairs[i].pack_int_id] = i; + } + + free(pairs); +} + +struct pack_midx_entry { + struct object_id oid; + uint32_t pack_int_id; + time_t pack_mtime; + uint64_t offset; +}; + +static int midx_oid_compare(const void *_a, const void *_b) +{ + const struct pack_midx_entry *a = (const struct pack_midx_entry *)_a; + const struct pack_midx_entry *b = (const struct pack_midx_entry *)_b; + int cmp = oidcmp(&a->oid, &b->oid); + + if (cmp) + return cmp; + + if (a->pack_mtime > b->pack_mtime) + return -1; + else if (a->pack_mtime < b->pack_mtime) + return 1; + + return a->pack_int_id - b->pack_int_id; +} + +static int nth_midxed_pack_midx_entry(struct multi_pack_index *m, + uint32_t *pack_perm, + struct pack_midx_entry *e, + uint32_t pos) +{ + if (pos >= m->num_objects) + return 1; + + nth_midxed_object_oid(&e->oid, m, pos); + e->pack_int_id = pack_perm[nth_midxed_pack_int_id(m, pos)]; + e->offset = nth_midxed_offset(m, pos); + + /* consider objects in midx to be from "old" packs */ + e->pack_mtime = 0; + return 0; +} + +static void fill_pack_entry(uint32_t pack_int_id, + struct packed_git *p, + uint32_t cur_object, + struct pack_midx_entry *entry) +{ + if (!nth_packed_object_oid(&entry->oid, p, cur_object)) + die(_("failed to locate object %d in packfile"), cur_object); + + entry->pack_int_id = pack_int_id; + entry->pack_mtime = p->mtime; + + entry->offset = nth_packed_object_offset(p, cur_object); +} + +/* + * It is possible to artificially get into a state where there are many + * duplicate copies of objects. That can create high memory pressure if + * we are to create a list of all objects before de-duplication. To reduce + * this memory pressure without a significant performance drop, automatically + * group objects by the first byte of their object id. Use the IDX fanout + * tables to group the data, copy to a local array, then sort. + * + * Copy only the de-duplicated entries (selected by most-recent modified time + * of a packfile containing the object). + */ +static struct pack_midx_entry *get_sorted_entries(struct multi_pack_index *m, + struct packed_git **p, + uint32_t *perm, + uint32_t nr_packs, + uint32_t *nr_objects) +{ + uint32_t cur_fanout, cur_pack, cur_object; + uint32_t alloc_fanout, alloc_objects, total_objects = 0; + struct pack_midx_entry *entries_by_fanout = NULL; + struct pack_midx_entry *deduplicated_entries = NULL; + uint32_t start_pack = m ? m->num_packs : 0; + + for (cur_pack = start_pack; cur_pack < nr_packs; cur_pack++) + total_objects += p[cur_pack]->num_objects; + + /* + * As we de-duplicate by fanout value, we expect the fanout + * slices to be evenly distributed, with some noise. Hence, + * allocate slightly more than one 256th. + */ + alloc_objects = alloc_fanout = total_objects > 3200 ? total_objects / 200 : 16; + + ALLOC_ARRAY(entries_by_fanout, alloc_fanout); + ALLOC_ARRAY(deduplicated_entries, alloc_objects); + *nr_objects = 0; + + for (cur_fanout = 0; cur_fanout < 256; cur_fanout++) { + uint32_t nr_fanout = 0; + + if (m) { + uint32_t start = 0, end; + + if (cur_fanout) + start = ntohl(m->chunk_oid_fanout[cur_fanout - 1]); + end = ntohl(m->chunk_oid_fanout[cur_fanout]); + + for (cur_object = start; cur_object < end; cur_object++) { + ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout); + nth_midxed_pack_midx_entry(m, perm, + &entries_by_fanout[nr_fanout], + cur_object); + nr_fanout++; + } + } + + for (cur_pack = start_pack; cur_pack < nr_packs; cur_pack++) { + uint32_t start = 0, end; + + if (cur_fanout) + start = get_pack_fanout(p[cur_pack], cur_fanout - 1); + end = get_pack_fanout(p[cur_pack], cur_fanout); + + for (cur_object = start; cur_object < end; cur_object++) { + ALLOC_GROW(entries_by_fanout, nr_fanout + 1, alloc_fanout); + fill_pack_entry(perm[cur_pack], p[cur_pack], cur_object, &entries_by_fanout[nr_fanout]); + nr_fanout++; + } + } + + QSORT(entries_by_fanout, nr_fanout, midx_oid_compare); + + /* + * The batch is now sorted by OID and then mtime (descending). + * Take only the first duplicate. + */ + for (cur_object = 0; cur_object < nr_fanout; cur_object++) { + if (cur_object && !oidcmp(&entries_by_fanout[cur_object - 1].oid, + &entries_by_fanout[cur_object].oid)) + continue; + + ALLOC_GROW(deduplicated_entries, *nr_objects + 1, alloc_objects); + memcpy(&deduplicated_entries[*nr_objects], + &entries_by_fanout[cur_object], + sizeof(struct pack_midx_entry)); + (*nr_objects)++; + } + } + + free(entries_by_fanout); + return deduplicated_entries; +} + +static size_t write_midx_pack_names(struct hashfile *f, + char **pack_names, + uint32_t num_packs) +{ + uint32_t i; + unsigned char padding[MIDX_CHUNK_ALIGNMENT]; + size_t written = 0; + + for (i = 0; i < num_packs; i++) { + size_t writelen = strlen(pack_names[i]) + 1; + + if (i && strcmp(pack_names[i], pack_names[i - 1]) <= 0) + BUG("incorrect pack-file order: %s before %s", + pack_names[i - 1], + pack_names[i]); + + hashwrite(f, pack_names[i], writelen); + written += writelen; + } + + /* add padding to be aligned */ + i = MIDX_CHUNK_ALIGNMENT - (written % MIDX_CHUNK_ALIGNMENT); + if (i < MIDX_CHUNK_ALIGNMENT) { + memset(padding, 0, sizeof(padding)); + hashwrite(f, padding, i); + written += i; + } + + return written; +} + +static size_t write_midx_oid_fanout(struct hashfile *f, + struct pack_midx_entry *objects, + uint32_t nr_objects) +{ + struct pack_midx_entry *list = objects; + struct pack_midx_entry *last = objects + nr_objects; + uint32_t count = 0; + uint32_t i; + + /* + * Write the first-level table (the list is sorted, + * but we use a 256-entry lookup to be able to avoid + * having to do eight extra binary search iterations). + */ + for (i = 0; i < 256; i++) { + struct pack_midx_entry *next = list; + + while (next < last && next->oid.hash[0] == i) { + count++; + next++; + } + + hashwrite_be32(f, count); + list = next; + } + + return MIDX_CHUNK_FANOUT_SIZE; +} + +static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char hash_len, + struct pack_midx_entry *objects, + uint32_t nr_objects) +{ + struct pack_midx_entry *list = objects; + uint32_t i; + size_t written = 0; + + for (i = 0; i < nr_objects; i++) { + struct pack_midx_entry *obj = list++; + + if (i < nr_objects - 1) { + struct pack_midx_entry *next = list; + if (oidcmp(&obj->oid, &next->oid) >= 0) + BUG("OIDs not in order: %s >= %s", + oid_to_hex(&obj->oid), + oid_to_hex(&next->oid)); + } + + hashwrite(f, obj->oid.hash, (int)hash_len); + written += hash_len; + } + + return written; +} + +static size_t write_midx_object_offsets(struct hashfile *f, int large_offset_needed, + struct pack_midx_entry *objects, uint32_t nr_objects) +{ + struct pack_midx_entry *list = objects; + uint32_t i, nr_large_offset = 0; + size_t written = 0; + + for (i = 0; i < nr_objects; i++) { + struct pack_midx_entry *obj = list++; + + hashwrite_be32(f, obj->pack_int_id); + + if (large_offset_needed && obj->offset >> 31) + hashwrite_be32(f, MIDX_LARGE_OFFSET_NEEDED | nr_large_offset++); + else if (!large_offset_needed && obj->offset >> 32) + BUG("object %s requires a large offset (%"PRIx64") but the MIDX is not writing large offsets!", + oid_to_hex(&obj->oid), + obj->offset); + else + hashwrite_be32(f, (uint32_t)obj->offset); + + written += MIDX_CHUNK_OFFSET_WIDTH; + } + + return written; +} + +static size_t write_midx_large_offsets(struct hashfile *f, uint32_t nr_large_offset, + struct pack_midx_entry *objects, uint32_t nr_objects) +{ + struct pack_midx_entry *list = objects; + size_t written = 0; + + while (nr_large_offset) { + struct pack_midx_entry *obj = list++; + uint64_t offset = obj->offset; + + if (!(offset >> 31)) + continue; + + hashwrite_be32(f, offset >> 32); + hashwrite_be32(f, offset & 0xffffffffUL); + written += 2 * sizeof(uint32_t); + + nr_large_offset--; + } + + return written; +} + +int write_midx_file(const char *object_dir) +{ + unsigned char cur_chunk, num_chunks = 0; + char *midx_name; + uint32_t i; + struct hashfile *f = NULL; + struct lock_file lk; + struct pack_list packs; + uint32_t *pack_perm = NULL; + uint64_t written = 0; + uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1]; + uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1]; + uint32_t nr_entries, num_large_offsets = 0; + struct pack_midx_entry *entries = NULL; + int large_offsets_needed = 0; + + midx_name = get_midx_filename(object_dir); + if (safe_create_leading_directories(midx_name)) { + UNLEAK(midx_name); + die_errno(_("unable to create leading directories of %s"), + midx_name); + } + + packs.m = load_multi_pack_index(object_dir, 1); + + packs.nr = 0; + packs.alloc_list = packs.m ? packs.m->num_packs : 16; + packs.alloc_names = packs.alloc_list; + packs.list = NULL; + packs.names = NULL; + packs.pack_name_concat_len = 0; + ALLOC_ARRAY(packs.list, packs.alloc_list); + ALLOC_ARRAY(packs.names, packs.alloc_names); + + if (packs.m) { + for (i = 0; i < packs.m->num_packs; i++) { + ALLOC_GROW(packs.list, packs.nr + 1, packs.alloc_list); + ALLOC_GROW(packs.names, packs.nr + 1, packs.alloc_names); + + packs.list[packs.nr] = NULL; + packs.names[packs.nr] = xstrdup(packs.m->pack_names[i]); + packs.pack_name_concat_len += strlen(packs.names[packs.nr]) + 1; + packs.nr++; + } + } + + for_each_file_in_pack_dir(object_dir, add_pack_to_midx, &packs); + + if (packs.m && packs.nr == packs.m->num_packs) + goto cleanup; + + if (packs.pack_name_concat_len % MIDX_CHUNK_ALIGNMENT) + packs.pack_name_concat_len += MIDX_CHUNK_ALIGNMENT - + (packs.pack_name_concat_len % MIDX_CHUNK_ALIGNMENT); + + ALLOC_ARRAY(pack_perm, packs.nr); + sort_packs_by_name(packs.names, packs.nr, pack_perm); + + entries = get_sorted_entries(packs.m, packs.list, pack_perm, packs.nr, &nr_entries); + + for (i = 0; i < nr_entries; i++) { + if (entries[i].offset > 0x7fffffff) + num_large_offsets++; + if (entries[i].offset > 0xffffffff) + large_offsets_needed = 1; + } + + hold_lock_file_for_update(&lk, midx_name, LOCK_DIE_ON_ERROR); + f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf); + FREE_AND_NULL(midx_name); + + if (packs.m) + close_midx(packs.m); + + cur_chunk = 0; + num_chunks = large_offsets_needed ? 5 : 4; + + written = write_midx_header(f, num_chunks, packs.nr); + + chunk_ids[cur_chunk] = MIDX_CHUNKID_PACKNAMES; + chunk_offsets[cur_chunk] = written + (num_chunks + 1) * MIDX_CHUNKLOOKUP_WIDTH; + + cur_chunk++; + chunk_ids[cur_chunk] = MIDX_CHUNKID_OIDFANOUT; + chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + packs.pack_name_concat_len; + + cur_chunk++; + chunk_ids[cur_chunk] = MIDX_CHUNKID_OIDLOOKUP; + chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + MIDX_CHUNK_FANOUT_SIZE; + + cur_chunk++; + chunk_ids[cur_chunk] = MIDX_CHUNKID_OBJECTOFFSETS; + chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_HASH_LEN; + + cur_chunk++; + chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + nr_entries * MIDX_CHUNK_OFFSET_WIDTH; + if (large_offsets_needed) { + chunk_ids[cur_chunk] = MIDX_CHUNKID_LARGEOFFSETS; + + cur_chunk++; + chunk_offsets[cur_chunk] = chunk_offsets[cur_chunk - 1] + + num_large_offsets * MIDX_CHUNK_LARGE_OFFSET_WIDTH; + } + + chunk_ids[cur_chunk] = 0; + + for (i = 0; i <= num_chunks; i++) { + if (i && chunk_offsets[i] < chunk_offsets[i - 1]) + BUG("incorrect chunk offsets: %"PRIu64" before %"PRIu64, + chunk_offsets[i - 1], + chunk_offsets[i]); + + if (chunk_offsets[i] % MIDX_CHUNK_ALIGNMENT) + BUG("chunk offset %"PRIu64" is not properly aligned", + chunk_offsets[i]); + + hashwrite_be32(f, chunk_ids[i]); + hashwrite_be32(f, chunk_offsets[i] >> 32); + hashwrite_be32(f, chunk_offsets[i]); + + written += MIDX_CHUNKLOOKUP_WIDTH; + } + + for (i = 0; i < num_chunks; i++) { + if (written != chunk_offsets[i]) + BUG("incorrect chunk offset (%"PRIu64" != %"PRIu64") for chunk id %"PRIx32, + chunk_offsets[i], + written, + chunk_ids[i]); + + switch (chunk_ids[i]) { + case MIDX_CHUNKID_PACKNAMES: + written += write_midx_pack_names(f, packs.names, packs.nr); + break; + + case MIDX_CHUNKID_OIDFANOUT: + written += write_midx_oid_fanout(f, entries, nr_entries); + break; + + case MIDX_CHUNKID_OIDLOOKUP: + written += write_midx_oid_lookup(f, MIDX_HASH_LEN, entries, nr_entries); + break; + + case MIDX_CHUNKID_OBJECTOFFSETS: + written += write_midx_object_offsets(f, large_offsets_needed, entries, nr_entries); + break; + + case MIDX_CHUNKID_LARGEOFFSETS: + written += write_midx_large_offsets(f, num_large_offsets, entries, nr_entries); + break; + + default: + BUG("trying to write unknown chunk id %"PRIx32, + chunk_ids[i]); + } + } + + if (written != chunk_offsets[num_chunks]) + BUG("incorrect final offset %"PRIu64" != %"PRIu64, + written, + chunk_offsets[num_chunks]); + + finalize_hashfile(f, NULL, CSUM_FSYNC | CSUM_HASH_IN_STREAM); + commit_lock_file(&lk); + +cleanup: + for (i = 0; i < packs.nr; i++) { + if (packs.list[i]) { + close_pack(packs.list[i]); + free(packs.list[i]); + } + free(packs.names[i]); + } + + free(packs.list); + free(packs.names); + free(entries); + free(pack_perm); + free(midx_name); + return 0; +} + +void clear_midx_file(const char *object_dir) +{ + char *midx = get_midx_filename(object_dir); + + if (remove_path(midx)) { + UNLEAK(midx); + die(_("failed to clear multi-pack-index at %s"), midx); + } + + free(midx); +} diff --git a/midx.h b/midx.h new file mode 100644 index 0000000000..a210f1af2a --- /dev/null +++ b/midx.h @@ -0,0 +1,47 @@ +#ifndef __MIDX_H__ +#define __MIDX_H__ + +#include "repository.h" + +struct multi_pack_index { + struct multi_pack_index *next; + + int fd; + + const unsigned char *data; + size_t data_len; + + uint32_t signature; + unsigned char version; + unsigned char hash_len; + unsigned char num_chunks; + uint32_t num_packs; + uint32_t num_objects; + + int local; + + const unsigned char *chunk_pack_names; + const uint32_t *chunk_oid_fanout; + const unsigned char *chunk_oid_lookup; + const unsigned char *chunk_object_offsets; + const unsigned char *chunk_large_offsets; + + const char **pack_names; + struct packed_git **packs; + char object_dir[FLEX_ARRAY]; +}; + +struct multi_pack_index *load_multi_pack_index(const char *object_dir, int local); +int prepare_midx_pack(struct multi_pack_index *m, uint32_t pack_int_id); +int bsearch_midx(const struct object_id *oid, struct multi_pack_index *m, uint32_t *result); +struct object_id *nth_midxed_object_oid(struct object_id *oid, + struct multi_pack_index *m, + uint32_t n); +int fill_midx_entry(const struct object_id *oid, struct pack_entry *e, struct multi_pack_index *m); +int midx_contains_pack(struct multi_pack_index *m, const char *idx_name); +int prepare_multi_pack_index_one(struct repository *r, const char *object_dir, int local); + +int write_midx_file(const char *object_dir); +void clear_midx_file(const char *object_dir); + +#endif diff --git a/object-store.h b/object-store.h index 67e66227d9..63b7605a3e 100644 --- a/object-store.h +++ b/object-store.h @@ -88,6 +88,8 @@ struct packed_git { char pack_name[FLEX_ARRAY]; /* more */ }; +struct multi_pack_index; + struct raw_object_store { /* * Path to the repository's object store. @@ -113,6 +115,13 @@ struct raw_object_store { /* * private data * + * should only be accessed directly by packfile.c and midx.c + */ + struct multi_pack_index *multi_pack_index; + + /* + * private data + * * should only be accessed directly by packfile.c */ @@ -121,6 +130,12 @@ struct raw_object_store { struct list_head packed_git_mru; /* + * A linked list containing all packfiles, starting with those + * contained in the multi_pack_index. + */ + struct packed_git *all_packs; + + /* * A fast, rough count of the number of objects in the repository. * These two fields are not meant for direct access. Use * approximate_object_count() instead. diff --git a/pack-bitmap.c b/pack-bitmap.c index f0a1937a1c..4e50ab391f 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -335,7 +335,7 @@ static int open_pack_bitmap(struct bitmap_index *bitmap_git) assert(!bitmap_git->map && !bitmap_git->loaded); - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { if (open_pack_bitmap_1(bitmap_git, p) == 0) ret = 0; } diff --git a/pack-objects.c b/pack-objects.c index 6ef87e5683..d04cfa8e9f 100644 --- a/pack-objects.c +++ b/pack-objects.c @@ -99,7 +99,7 @@ static void prepare_in_pack_by_idx(struct packing_data *pdata) * (i.e. in_pack_idx also zero) should return NULL. */ mapping[cnt++] = NULL; - for (p = get_packed_git(the_repository); p; p = p->next, cnt++) { + for (p = get_all_packs(the_repository); p; p = p->next, cnt++) { if (cnt == nr) { free(mapping); return; diff --git a/packfile.c b/packfile.c index ebcb5742ec..cbef7033c3 100644 --- a/packfile.c +++ b/packfile.c @@ -15,6 +15,7 @@ #include "tree-walk.h" #include "tree.h" #include "object-store.h" +#include "midx.h" char *odb_pack_name(struct strbuf *buf, const unsigned char *sha1, @@ -196,6 +197,23 @@ int open_pack_index(struct packed_git *p) return ret; } +uint32_t get_pack_fanout(struct packed_git *p, uint32_t value) +{ + const uint32_t *level1_ofs = p->index_data; + + if (!level1_ofs) { + if (open_pack_index(p)) + return 0; + level1_ofs = p->index_data; + } + + if (p->index_version > 1) { + level1_ofs += 2; + } + + return ntohl(level1_ofs[value]); +} + static struct packed_git *alloc_packed_git(int extra) { struct packed_git *p = xmalloc(st_add(sizeof(*p), extra)); @@ -451,8 +469,19 @@ static int open_packed_git_1(struct packed_git *p) ssize_t read_result; const unsigned hashsz = the_hash_algo->rawsz; - if (!p->index_data && open_pack_index(p)) - return error("packfile %s index unavailable", p->pack_name); + if (!p->index_data) { + struct multi_pack_index *m; + const char *pack_name = strrchr(p->pack_name, '/'); + + for (m = the_repository->objects->multi_pack_index; + m; m = m->next) { + if (midx_contains_pack(m, pack_name)) + break; + } + + if (!m && open_pack_index(p)) + return error("packfile %s index unavailable", p->pack_name); + } if (!pack_max_fds) { unsigned int max_fds = get_max_fd_limit(); @@ -503,6 +532,10 @@ static int open_packed_git_1(struct packed_git *p) " supported (try upgrading GIT to a newer version)", p->pack_name, ntohl(hdr.hdr_version)); + /* Skip index checking if in multi-pack-index */ + if (!p->index_data) + return 0; + /* Verify the pack matches its index. */ if (p->num_objects != ntohl(hdr.hdr_entries)) return error("packfile %s claims to have %"PRIu32" objects" @@ -738,13 +771,14 @@ static void report_pack_garbage(struct string_list *list) report_helper(list, seen_bits, first, list->nr); } -static void prepare_packed_git_one(struct repository *r, char *objdir, int local) +void for_each_file_in_pack_dir(const char *objdir, + each_file_in_pack_dir_fn fn, + void *data) { struct strbuf path = STRBUF_INIT; size_t dirnamelen; DIR *dir; struct dirent *de; - struct string_list garbage = STRING_LIST_INIT_DUP; strbuf_addstr(&path, objdir); strbuf_addstr(&path, "/pack"); @@ -759,53 +793,87 @@ static void prepare_packed_git_one(struct repository *r, char *objdir, int local strbuf_addch(&path, '/'); dirnamelen = path.len; while ((de = readdir(dir)) != NULL) { - struct packed_git *p; - size_t base_len; - if (is_dot_or_dotdot(de->d_name)) continue; strbuf_setlen(&path, dirnamelen); strbuf_addstr(&path, de->d_name); - base_len = path.len; - if (strip_suffix_mem(path.buf, &base_len, ".idx")) { - /* Don't reopen a pack we already have. */ - for (p = r->objects->packed_git; p; - p = p->next) { - size_t len; - if (strip_suffix(p->pack_name, ".pack", &len) && - len == base_len && - !memcmp(p->pack_name, path.buf, len)) - break; - } - if (p == NULL && - /* - * See if it really is a valid .idx file with - * corresponding .pack file that we can map. - */ - (p = add_packed_git(path.buf, path.len, local)) != NULL) - install_packed_git(r, p); - } - - if (!report_garbage) - continue; - - if (ends_with(de->d_name, ".idx") || - ends_with(de->d_name, ".pack") || - ends_with(de->d_name, ".bitmap") || - ends_with(de->d_name, ".keep") || - ends_with(de->d_name, ".promisor")) - string_list_append(&garbage, path.buf); - else - report_garbage(PACKDIR_FILE_GARBAGE, path.buf); + fn(path.buf, path.len, de->d_name, data); } + closedir(dir); - report_pack_garbage(&garbage); - string_list_clear(&garbage, 0); strbuf_release(&path); } +struct prepare_pack_data { + struct repository *r; + struct string_list *garbage; + int local; + struct multi_pack_index *m; +}; + +static void prepare_pack(const char *full_name, size_t full_name_len, + const char *file_name, void *_data) +{ + struct prepare_pack_data *data = (struct prepare_pack_data *)_data; + struct packed_git *p; + size_t base_len = full_name_len; + + if (strip_suffix_mem(full_name, &base_len, ".idx") && + !(data->m && midx_contains_pack(data->m, file_name))) { + /* Don't reopen a pack we already have. */ + for (p = data->r->objects->packed_git; p; p = p->next) { + size_t len; + if (strip_suffix(p->pack_name, ".pack", &len) && + len == base_len && + !memcmp(p->pack_name, full_name, len)) + break; + } + + if (!p) { + p = add_packed_git(full_name, full_name_len, data->local); + if (p) + install_packed_git(data->r, p); + } + } + + if (!report_garbage) + return; + + if (!strcmp(file_name, "multi-pack-index")) + return; + if (ends_with(file_name, ".idx") || + ends_with(file_name, ".pack") || + ends_with(file_name, ".bitmap") || + ends_with(file_name, ".keep") || + ends_with(file_name, ".promisor")) + string_list_append(data->garbage, full_name); + else + report_garbage(PACKDIR_FILE_GARBAGE, full_name); +} + +static void prepare_packed_git_one(struct repository *r, char *objdir, int local) +{ + struct prepare_pack_data data; + struct string_list garbage = STRING_LIST_INIT_DUP; + + data.m = r->objects->multi_pack_index; + + /* look for the multi-pack-index for this object directory */ + while (data.m && strcmp(data.m->object_dir, objdir)) + data.m = data.m->next; + + data.r = r; + data.garbage = &garbage; + data.local = local; + + for_each_file_in_pack_dir(objdir, prepare_pack, &data); + + report_pack_garbage(data.garbage); + string_list_clear(data.garbage, 0); +} + static void prepare_packed_git(struct repository *r); /* * Give a fast, rough count of the number of objects in the repository. This @@ -818,10 +886,13 @@ unsigned long approximate_object_count(void) { if (!the_repository->objects->approximate_object_count_valid) { unsigned long count; + struct multi_pack_index *m; struct packed_git *p; prepare_packed_git(the_repository); count = 0; + for (m = get_multi_pack_index(the_repository); m; m = m->next) + count += m->num_objects; for (p = the_repository->objects->packed_git; p; p = p->next) { if (open_pack_index(p)) continue; @@ -893,11 +964,17 @@ static void prepare_packed_git(struct repository *r) if (r->objects->packed_git_initialized) return; + prepare_multi_pack_index_one(r, r->objects->objectdir, 1); prepare_packed_git_one(r, r->objects->objectdir, 1); prepare_alt_odb(r); - for (alt = r->objects->alt_odb_list; alt; alt = alt->next) + for (alt = r->objects->alt_odb_list; alt; alt = alt->next) { + prepare_multi_pack_index_one(r, alt->path, 0); prepare_packed_git_one(r, alt->path, 0); + } rearrange_packed_git(r); + + r->objects->all_packs = NULL; + prepare_packed_git_mru(r); r->objects->packed_git_initialized = 1; } @@ -915,6 +992,36 @@ struct packed_git *get_packed_git(struct repository *r) return r->objects->packed_git; } +struct multi_pack_index *get_multi_pack_index(struct repository *r) +{ + prepare_packed_git(r); + return r->objects->multi_pack_index; +} + +struct packed_git *get_all_packs(struct repository *r) +{ + prepare_packed_git(r); + + if (!r->objects->all_packs) { + struct packed_git *p = r->objects->packed_git; + struct multi_pack_index *m; + + for (m = r->objects->multi_pack_index; m; m = m->next) { + uint32_t i; + for (i = 0; i < m->num_packs; i++) { + if (!prepare_midx_pack(m, i)) { + m->packs[i]->next = p; + p = m->packs[i]; + } + } + } + + r->objects->all_packs = p; + } + + return r->objects->all_packs; +} + struct list_head *get_packed_git_mru(struct repository *r) { prepare_packed_git(r); @@ -1856,11 +1963,17 @@ static int fill_pack_entry(const struct object_id *oid, int find_pack_entry(struct repository *r, const struct object_id *oid, struct pack_entry *e) { struct list_head *pos; + struct multi_pack_index *m; prepare_packed_git(r); - if (!r->objects->packed_git) + if (!r->objects->packed_git && !r->objects->multi_pack_index) return 0; + for (m = r->objects->multi_pack_index; m; m = m->next) { + if (fill_midx_entry(oid, e, m)) + return 1; + } + list_for_each(pos, &r->objects->packed_git_mru) { struct packed_git *p = list_entry(pos, struct packed_git, mru); if (fill_pack_entry(oid, e, p)) { @@ -1923,7 +2036,7 @@ int for_each_packed_object(each_packed_object_fn cb, void *data, int pack_errors = 0; prepare_packed_git(the_repository); - for (p = the_repository->objects->packed_git; p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { if ((flags & FOR_EACH_OBJECT_LOCAL_ONLY) && !p->pack_local) continue; if ((flags & FOR_EACH_OBJECT_PROMISOR_ONLY) && diff --git a/packfile.h b/packfile.h index 630f35cf31..442625723d 100644 --- a/packfile.h +++ b/packfile.h @@ -33,6 +33,12 @@ extern char *sha1_pack_index_name(const unsigned char *sha1); extern struct packed_git *parse_pack_index(unsigned char *sha1, const char *idx_path); +typedef void each_file_in_pack_dir_fn(const char *full_path, size_t full_path_len, + const char *file_pach, void *data); +void for_each_file_in_pack_dir(const char *objdir, + each_file_in_pack_dir_fn fn, + void *data); + /* A hook to report invalid files in pack directory */ #define PACKDIR_FILE_PACK 1 #define PACKDIR_FILE_IDX 2 @@ -44,6 +50,8 @@ extern void install_packed_git(struct repository *r, struct packed_git *pack); struct packed_git *get_packed_git(struct repository *r); struct list_head *get_packed_git_mru(struct repository *r); +struct multi_pack_index *get_multi_pack_index(struct repository *r); +struct packed_git *get_all_packs(struct repository *r); /* * Give a rough count of objects in the repository. This sacrifices accuracy @@ -68,6 +76,8 @@ extern int open_pack_index(struct packed_git *); */ extern void close_pack_index(struct packed_git *); +extern uint32_t get_pack_fanout(struct packed_git *p, uint32_t value); + extern unsigned char *use_pack(struct packed_git *, struct pack_window **, off_t, unsigned long *); extern void close_pack_windows(struct packed_git *); extern void close_pack(struct packed_git *); diff --git a/server-info.c b/server-info.c index 41050c2449..e2b2d6a27a 100644 --- a/server-info.c +++ b/server-info.c @@ -199,7 +199,7 @@ static void init_pack_info(const char *infofile, int force) objdir = get_object_directory(); objdirlen = strlen(objdir); - for (p = get_packed_git(the_repository); p; p = p->next) { + for (p = get_all_packs(the_repository); p; p = p->next) { /* we ignore things on alternate path since they are * not available to the pullers in general. */ @@ -209,7 +209,7 @@ static void init_pack_info(const char *infofile, int force) } num_pack = i; info = xcalloc(num_pack, sizeof(struct pack_info *)); - for (i = 0, p = get_packed_git(the_repository); p; p = p->next) { + for (i = 0, p = get_all_packs(the_repository); p; p = p->next) { if (!p->pack_local) continue; info[i] = xcalloc(1, sizeof(struct pack_info)); diff --git a/sha1-name.c b/sha1-name.c index c9cc1318b7..6cccfbbfbf 100644 --- a/sha1-name.c +++ b/sha1-name.c @@ -12,6 +12,7 @@ #include "packfile.h" #include "object-store.h" #include "repository.h" +#include "midx.h" static int get_oid_oneline(const char *, struct object_id *, struct commit_list *); @@ -149,6 +150,32 @@ static int match_sha(unsigned len, const unsigned char *a, const unsigned char * return 1; } +static void unique_in_midx(struct multi_pack_index *m, + struct disambiguate_state *ds) +{ + uint32_t num, i, first = 0; + const struct object_id *current = NULL; + num = m->num_objects; + + if (!num) + return; + + bsearch_midx(&ds->bin_pfx, m, &first); + + /* + * At this point, "first" is the location of the lowest object + * with an object name that could match "bin_pfx". See if we have + * 0, 1 or more objects that actually match(es). + */ + for (i = first; i < num && !ds->ambiguous; i++) { + struct object_id oid; + current = nth_midxed_object_oid(&oid, m, i); + if (!match_sha(ds->len, ds->bin_pfx.hash, current->hash)) + break; + update_candidates(ds, current); + } +} + static void unique_in_pack(struct packed_git *p, struct disambiguate_state *ds) { @@ -177,8 +204,12 @@ static void unique_in_pack(struct packed_git *p, static void find_short_packed_object(struct disambiguate_state *ds) { + struct multi_pack_index *m; struct packed_git *p; + for (m = get_multi_pack_index(the_repository); m && !ds->ambiguous; + m = m->next) + unique_in_midx(m, ds); for (p = get_packed_git(the_repository); p && !ds->ambiguous; p = p->next) unique_in_pack(p, ds); @@ -529,6 +560,42 @@ static int extend_abbrev_len(const struct object_id *oid, void *cb_data) return 0; } +static void find_abbrev_len_for_midx(struct multi_pack_index *m, + struct min_abbrev_data *mad) +{ + int match = 0; + uint32_t num, first = 0; + struct object_id oid; + const struct object_id *mad_oid; + + if (!m->num_objects) + return; + + num = m->num_objects; + mad_oid = mad->oid; + match = bsearch_midx(mad_oid, m, &first); + + /* + * first is now the position in the packfile where we would insert + * mad->hash if it does not exist (or the position of mad->hash if + * it does exist). Hence, we consider a maximum of two objects + * nearby for the abbreviation length. + */ + mad->init_len = 0; + if (!match) { + if (nth_midxed_object_oid(&oid, m, first)) + extend_abbrev_len(&oid, mad); + } else if (first < num - 1) { + if (nth_midxed_object_oid(&oid, m, first + 1)) + extend_abbrev_len(&oid, mad); + } + if (first > 0) { + if (nth_midxed_object_oid(&oid, m, first - 1)) + extend_abbrev_len(&oid, mad); + } + mad->init_len = mad->cur_len; +} + static void find_abbrev_len_for_pack(struct packed_git *p, struct min_abbrev_data *mad) { @@ -567,8 +634,11 @@ static void find_abbrev_len_for_pack(struct packed_git *p, static void find_abbrev_len_packed(struct min_abbrev_data *mad) { + struct multi_pack_index *m; struct packed_git *p; + for (m = get_multi_pack_index(the_repository); m; m = m->next) + find_abbrev_len_for_midx(m, mad); for (p = get_packed_git(the_repository); p; p = p->next) find_abbrev_len_for_pack(p, mad); } diff --git a/t/helper/test-read-midx.c b/t/helper/test-read-midx.c new file mode 100644 index 0000000000..831b586d02 --- /dev/null +++ b/t/helper/test-read-midx.c @@ -0,0 +1,51 @@ +#include "test-tool.h" +#include "cache.h" +#include "midx.h" +#include "repository.h" +#include "object-store.h" + +static int read_midx_file(const char *object_dir) +{ + uint32_t i; + struct multi_pack_index *m = load_multi_pack_index(object_dir, 1); + + if (!m) + return 1; + + printf("header: %08x %d %d %d\n", + m->signature, + m->version, + m->num_chunks, + m->num_packs); + + printf("chunks:"); + + if (m->chunk_pack_names) + printf(" pack-names"); + if (m->chunk_oid_fanout) + printf(" oid-fanout"); + if (m->chunk_oid_lookup) + printf(" oid-lookup"); + if (m->chunk_object_offsets) + printf(" object-offsets"); + if (m->chunk_large_offsets) + printf(" large-offsets"); + + printf("\nnum_objects: %d\n", m->num_objects); + + printf("packs:\n"); + for (i = 0; i < m->num_packs; i++) + printf("%s\n", m->pack_names[i]); + + printf("object-dir: %s\n", m->object_dir); + + return 0; +} + +int cmd__read_midx(int argc, const char **argv) +{ + if (argc != 2) + usage("read-midx <object-dir>"); + + return read_midx_file(argv[1]); +} diff --git a/t/helper/test-tool.c b/t/helper/test-tool.c index 0edafcfd65..3276701710 100644 --- a/t/helper/test-tool.c +++ b/t/helper/test-tool.c @@ -28,6 +28,7 @@ static struct test_cmd cmds[] = { { "path-utils", cmd__path_utils }, { "prio-queue", cmd__prio_queue }, { "read-cache", cmd__read_cache }, + { "read-midx", cmd__read_midx }, { "ref-store", cmd__ref_store }, { "regex", cmd__regex }, { "repository", cmd__repository }, diff --git a/t/helper/test-tool.h b/t/helper/test-tool.h index e954e8c522..710fb1b286 100644 --- a/t/helper/test-tool.h +++ b/t/helper/test-tool.h @@ -24,6 +24,7 @@ int cmd__online_cpus(int argc, const char **argv); int cmd__path_utils(int argc, const char **argv); int cmd__prio_queue(int argc, const char **argv); int cmd__read_cache(int argc, const char **argv); +int cmd__read_midx(int argc, const char **argv); int cmd__ref_store(int argc, const char **argv); int cmd__regex(int argc, const char **argv); int cmd__repository(int argc, const char **argv); diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh new file mode 100755 index 0000000000..6f56b38674 --- /dev/null +++ b/t/t5319-multi-pack-index.sh @@ -0,0 +1,217 @@ +#!/bin/sh + +test_description='multi-pack-indexes' +. ./test-lib.sh + +objdir=.git/objects + +midx_read_expect () { + NUM_PACKS=$1 + NUM_OBJECTS=$2 + NUM_CHUNKS=$3 + OBJECT_DIR=$4 + EXTRA_CHUNKS="$5" + { + cat <<-EOF && + header: 4d494458 1 $NUM_CHUNKS $NUM_PACKS + chunks: pack-names oid-fanout oid-lookup object-offsets$EXTRA_CHUNKS + num_objects: $NUM_OBJECTS + packs: + EOF + if test $NUM_PACKS -ge 1 + then + ls $OBJECT_DIR/pack/ | grep idx | sort + fi && + printf "object-dir: $OBJECT_DIR\n" + } >expect && + test-tool read-midx $OBJECT_DIR >actual && + test_cmp expect actual +} + +test_expect_success 'write midx with no packs' ' + test_when_finished rm -f pack/multi-pack-index && + git multi-pack-index --object-dir=. write && + midx_read_expect 0 0 4 . +' + +generate_objects () { + i=$1 + iii=$(printf '%03i' $i) + { + test-tool genrandom "bar" 200 && + test-tool genrandom "baz $iii" 50 + } >wide_delta_$iii && + { + test-tool genrandom "foo"$i 100 && + test-tool genrandom "foo"$(( $i + 1 )) 100 && + test-tool genrandom "foo"$(( $i + 2 )) 100 + } >deep_delta_$iii && + { + echo $iii && + test-tool genrandom "$iii" 8192 + } >file_$iii && + git update-index --add file_$iii deep_delta_$iii wide_delta_$iii +} + +commit_and_list_objects () { + { + echo 101 && + test-tool genrandom 100 8192; + } >file_101 && + git update-index --add file_101 && + tree=$(git write-tree) && + commit=$(git commit-tree $tree -p HEAD</dev/null) && + { + echo $tree && + git ls-tree $tree | sed -e "s/.* \\([0-9a-f]*\\) .*/\\1/" + } >obj-list && + git reset --hard $commit +} + +test_expect_success 'create objects' ' + test_commit initial && + for i in $(test_seq 1 5) + do + generate_objects $i + done && + commit_and_list_objects +' + +test_expect_success 'write midx with one v1 pack' ' + pack=$(git pack-objects --index-version=1 $objdir/pack/test <obj-list) && + test_when_finished rm $objdir/pack/test-$pack.pack \ + $objdir/pack/test-$pack.idx $objdir/pack/multi-pack-index && + git multi-pack-index --object-dir=$objdir write && + midx_read_expect 1 18 4 $objdir +' + +midx_git_two_modes () { + if [ "$2" = "sorted" ] + then + git -c core.multiPackIndex=false $1 | sort >expect && + git -c core.multiPackIndex=true $1 | sort >actual + else + git -c core.multiPackIndex=false $1 >expect && + git -c core.multiPackIndex=true $1 >actual + fi && + test_cmp expect actual +} + +compare_results_with_midx () { + MSG=$1 + test_expect_success "check normal git operations: $MSG" ' + midx_git_two_modes "rev-list --objects --all" && + midx_git_two_modes "log --raw" && + midx_git_two_modes "count-objects --verbose" && + midx_git_two_modes "cat-file --batch-all-objects --buffer --batch-check" && + midx_git_two_modes "cat-file --batch-all-objects --buffer --batch-check --unsorted" sorted + ' +} + +test_expect_success 'write midx with one v2 pack' ' + git pack-objects --index-version=2,0x40 $objdir/pack/test <obj-list && + git multi-pack-index --object-dir=$objdir write && + midx_read_expect 1 18 4 $objdir +' + +compare_results_with_midx "one v2 pack" + +test_expect_success 'add more objects' ' + for i in $(test_seq 6 10) + do + generate_objects $i + done && + commit_and_list_objects +' + +test_expect_success 'write midx with two packs' ' + git pack-objects --index-version=1 $objdir/pack/test-2 <obj-list && + git multi-pack-index --object-dir=$objdir write && + midx_read_expect 2 34 4 $objdir +' + +compare_results_with_midx "two packs" + +test_expect_success 'add more packs' ' + for j in $(test_seq 11 20) + do + generate_objects $j && + commit_and_list_objects && + git pack-objects --index-version=2 $objdir/pack/test-pack <obj-list + done +' + +compare_results_with_midx "mixed mode (two packs + extra)" + +test_expect_success 'write midx with twelve packs' ' + git multi-pack-index --object-dir=$objdir write && + midx_read_expect 12 74 4 $objdir +' + +compare_results_with_midx "twelve packs" + +test_expect_success 'repack removes multi-pack-index' ' + test_path_is_file $objdir/pack/multi-pack-index && + git repack -adf && + test_path_is_missing $objdir/pack/multi-pack-index +' + +compare_results_with_midx "after repack" + +test_expect_success 'multi-pack-index and pack-bitmap' ' + git -c repack.writeBitmaps=true repack -ad && + git multi-pack-index write && + git rev-list --test-bitmap HEAD +' + +test_expect_success 'multi-pack-index and alternates' ' + git init --bare alt.git && + echo $(pwd)/alt.git/objects >.git/objects/info/alternates && + echo content1 >file1 && + altblob=$(GIT_DIR=alt.git git hash-object -w file1) && + git cat-file blob $altblob && + git rev-list --all +' + +compare_results_with_midx "with alternate (local midx)" + +test_expect_success 'multi-pack-index in an alternate' ' + mv .git/objects/pack/* alt.git/objects/pack && + test_commit add_local_objects && + git repack --local && + git multi-pack-index write && + midx_read_expect 1 3 4 $objdir && + git reset --hard HEAD~1 && + rm -f .git/objects/pack/* +' + +compare_results_with_midx "with alternate (remote midx)" + + +# usage: corrupt_data <file> <pos> [<data>] +corrupt_data () { + file=$1 + pos=$2 + data="${3:-\0}" + printf "$data" | dd of="$file" bs=1 seek="$pos" conv=notrunc +} + +# Force 64-bit offsets by manipulating the idx file. +# This makes the IDX file _incorrect_ so be careful to clean up after! +test_expect_success 'force some 64-bit offsets with pack-objects' ' + mkdir objects64 && + mkdir objects64/pack && + for i in $(test_seq 1 11) + do + generate_objects 11 + done && + commit_and_list_objects && + pack64=$(git pack-objects --index-version=2,0x40 objects64/pack/test-64 <obj-list) && + idx64=objects64/pack/test-64-$pack64.idx && + chmod u+w $idx64 && + corrupt_data $idx64 2999 "\02" && + midx64=$(git multi-pack-index --object-dir=objects64 write) && + midx_read_expect 1 63 5 objects64 " large-offsets" +' + +test_done |