diff options
author | Jeff King <peff@peff.net> | 2019-12-18 12:25:45 +0100 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2020-01-23 10:51:50 -0800 |
commit | bb514de356cfcfd314f7ac7ae1acfeede3fa4b1f (patch) | |
tree | dc222189645405ee1eab03d9493a553bf6f33b34 /builtin | |
parent | builtin/pack-objects: introduce obj_is_packed() (diff) | |
download | tgif-bb514de356cfcfd314f7ac7ae1acfeede3fa4b1f.tar.xz |
pack-objects: improve partial packfile reuse
The old code to reuse deltas from an existing packfile
just tried to dump a whole segment of the pack verbatim.
That's faster than the traditional way of actually adding
objects to the packing list, but it didn't kick in very
often. This new code is really going for a middle ground:
do _some_ per-object work, but way less than we'd
traditionally do.
The general strategy of the new code is to make a bitmap
of objects from the packfile we'll include, and then
iterate over it, writing out each object exactly as it is
in our on-disk pack, but _not_ adding it to our packlist
(which costs memory, and increases the search space for
deltas).
One complication is that if we're omitting some objects,
we can't set a delta against a base that we're not
sending. So we have to check each object in
try_partial_reuse() to make sure we have its delta.
About performance, in the worst case we might have
interleaved objects that we are sending or not sending,
and we'd have as many chunks as objects. But in practice
we send big chunks.
For instance, packing torvalds/linux on GitHub servers
now reused 6.5M objects, but only needed ~50k chunks.
Helped-by: Jonathan Tan <jonathantanmy@google.com>
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Christian Couder <chriscool@tuxfamily.org>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
Diffstat (limited to 'builtin')
-rw-r--r-- | builtin/pack-objects.c | 214 |
1 files changed, 170 insertions, 44 deletions
diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 08b562da72..28e56a94f7 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -92,7 +92,7 @@ static struct progress *progress_state; static struct packed_git *reuse_packfile; static uint32_t reuse_packfile_objects; -static off_t reuse_packfile_offset; +static struct bitmap *reuse_packfile_bitmap; static int use_bitmap_index_default = 1; static int use_bitmap_index = -1; @@ -785,57 +785,185 @@ static struct object_entry **compute_write_order(void) return wo; } -static off_t write_reused_pack(struct hashfile *f) + +/* + * A reused set of objects. All objects in a chunk have the same + * relative position in the original packfile and the generated + * packfile. + */ + +static struct reused_chunk { + /* The offset of the first object of this chunk in the original + * packfile. */ + off_t original; + /* The offset of the first object of this chunk in the generated + * packfile minus "original". */ + off_t difference; +} *reused_chunks; +static int reused_chunks_nr; +static int reused_chunks_alloc; + +static void record_reused_object(off_t where, off_t offset) +{ + if (reused_chunks_nr && reused_chunks[reused_chunks_nr-1].difference == offset) + return; + + ALLOC_GROW(reused_chunks, reused_chunks_nr + 1, + reused_chunks_alloc); + reused_chunks[reused_chunks_nr].original = where; + reused_chunks[reused_chunks_nr].difference = offset; + reused_chunks_nr++; +} + +/* + * Binary search to find the chunk that "where" is in. Note + * that we're not looking for an exact match, just the first + * chunk that contains it (which implicitly ends at the start + * of the next chunk. + */ +static off_t find_reused_offset(off_t where) { - unsigned char buffer[8192]; - off_t to_write, total; - int fd; + int lo = 0, hi = reused_chunks_nr; + while (lo < hi) { + int mi = lo + ((hi - lo) / 2); + if (where == reused_chunks[mi].original) + return reused_chunks[mi].difference; + if (where < reused_chunks[mi].original) + hi = mi; + else + lo = mi + 1; + } - if (!is_pack_valid(reuse_packfile)) - die(_("packfile is invalid: %s"), reuse_packfile->pack_name); + /* + * The first chunk starts at zero, so we can't have gone below + * there. + */ + assert(lo); + return reused_chunks[lo-1].difference; +} - fd = git_open(reuse_packfile->pack_name); - if (fd < 0) - die_errno(_("unable to open packfile for reuse: %s"), - reuse_packfile->pack_name); +static void write_reused_pack_one(size_t pos, struct hashfile *out, + struct pack_window **w_curs) +{ + off_t offset, next, cur; + enum object_type type; + unsigned long size; - if (lseek(fd, sizeof(struct pack_header), SEEK_SET) == -1) - die_errno(_("unable to seek in reused packfile")); + offset = reuse_packfile->revindex[pos].offset; + next = reuse_packfile->revindex[pos + 1].offset; - if (reuse_packfile_offset < 0) - reuse_packfile_offset = reuse_packfile->pack_size - the_hash_algo->rawsz; + record_reused_object(offset, offset - hashfile_total(out)); - total = to_write = reuse_packfile_offset - sizeof(struct pack_header); + cur = offset; + type = unpack_object_header(reuse_packfile, w_curs, &cur, &size); + assert(type >= 0); - while (to_write) { - int read_pack = xread(fd, buffer, sizeof(buffer)); + if (type == OBJ_OFS_DELTA) { + off_t base_offset; + off_t fixup; + + unsigned char header[MAX_PACK_OBJECT_HEADER]; + unsigned len; + + base_offset = get_delta_base(reuse_packfile, w_curs, &cur, type, offset); + assert(base_offset != 0); + + /* Convert to REF_DELTA if we must... */ + if (!allow_ofs_delta) { + int base_pos = find_revindex_position(reuse_packfile, base_offset); + const unsigned char *base_sha1 = + nth_packed_object_sha1(reuse_packfile, + reuse_packfile->revindex[base_pos].nr); + + len = encode_in_pack_object_header(header, sizeof(header), + OBJ_REF_DELTA, size); + hashwrite(out, header, len); + hashwrite(out, base_sha1, 20); + copy_pack_data(out, reuse_packfile, w_curs, cur, next - cur); + return; + } - if (read_pack <= 0) - die_errno(_("unable to read from reused packfile")); + /* Otherwise see if we need to rewrite the offset... */ + fixup = find_reused_offset(offset) - + find_reused_offset(base_offset); + if (fixup) { + unsigned char ofs_header[10]; + unsigned i, ofs_len; + off_t ofs = offset - base_offset - fixup; - if (read_pack > to_write) - read_pack = to_write; + len = encode_in_pack_object_header(header, sizeof(header), + OBJ_OFS_DELTA, size); - hashwrite(f, buffer, read_pack); - to_write -= read_pack; + i = sizeof(ofs_header) - 1; + ofs_header[i] = ofs & 127; + while (ofs >>= 7) + ofs_header[--i] = 128 | (--ofs & 127); + + ofs_len = sizeof(ofs_header) - i; + + hashwrite(out, header, len); + hashwrite(out, ofs_header + sizeof(ofs_header) - ofs_len, ofs_len); + copy_pack_data(out, reuse_packfile, w_curs, cur, next - cur); + return; + } + + /* ...otherwise we have no fixup, and can write it verbatim */ + } + + copy_pack_data(out, reuse_packfile, w_curs, offset, next - offset); +} + +static size_t write_reused_pack_verbatim(struct hashfile *out, + struct pack_window **w_curs) +{ + size_t pos = 0; + + while (pos < reuse_packfile_bitmap->word_alloc && + reuse_packfile_bitmap->words[pos] == (eword_t)~0) + pos++; + + if (pos) { + off_t to_write; + + written = (pos * BITS_IN_EWORD); + to_write = reuse_packfile->revindex[written].offset + - sizeof(struct pack_header); + + /* We're recording one chunk, not one object. */ + record_reused_object(sizeof(struct pack_header), 0); + hashflush(out); + copy_pack_data(out, reuse_packfile, w_curs, + sizeof(struct pack_header), to_write); - /* - * We don't know the actual number of objects written, - * only how many bytes written, how many bytes total, and - * how many objects total. So we can fake it by pretending all - * objects we are writing are the same size. This gives us a - * smooth progress meter, and at the end it matches the true - * answer. - */ - written = reuse_packfile_objects * - (((double)(total - to_write)) / total); display_progress(progress_state, written); } + return pos; +} + +static void write_reused_pack(struct hashfile *f) +{ + size_t i = 0; + uint32_t offset; + struct pack_window *w_curs = NULL; + + if (allow_ofs_delta) + i = write_reused_pack_verbatim(f, &w_curs); + + for (; i < reuse_packfile_bitmap->word_alloc; ++i) { + eword_t word = reuse_packfile_bitmap->words[i]; + size_t pos = (i * BITS_IN_EWORD); + + for (offset = 0; offset < BITS_IN_EWORD; ++offset) { + if ((word >> offset) == 0) + break; + + offset += ewah_bit_ctz64(word >> offset); + write_reused_pack_one(pos + offset, f, &w_curs); + display_progress(progress_state, ++written); + } + } - close(fd); - written = reuse_packfile_objects; - display_progress(progress_state, written); - return reuse_packfile_offset - sizeof(struct pack_header); + unuse_pack(&w_curs); } static const char no_split_warning[] = N_( @@ -868,11 +996,9 @@ static void write_pack_file(void) offset = write_pack_header(f, nr_remaining); if (reuse_packfile) { - off_t packfile_size; assert(pack_to_stdout); - - packfile_size = write_reused_pack(f); - offset += packfile_size; + write_reused_pack(f); + offset = hashfile_total(f); } nr_written = 0; @@ -2677,6 +2803,7 @@ static void prepare_pack(int window, int depth) if (nr_deltas && n > 1) { unsigned nr_done = 0; + if (progress) progress_state = start_progress(_("Compressing objects"), nr_deltas); @@ -3062,7 +3189,6 @@ static int pack_options_allow_reuse(void) { return allow_pack_reuse && pack_to_stdout && - allow_ofs_delta && !ignore_packed_keep_on_disk && !ignore_packed_keep_in_core && (!local || !have_non_local_packs) && @@ -3079,7 +3205,7 @@ static int get_object_list_from_bitmap(struct rev_info *revs) bitmap_git, &reuse_packfile, &reuse_packfile_objects, - &reuse_packfile_offset)) { + &reuse_packfile_bitmap)) { assert(reuse_packfile_objects); nr_result += reuse_packfile_objects; display_progress(progress_state, nr_result); |