From 3b1ca60f8f317b483c8c1805ab500ff2b014cbec Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 8 Dec 2020 17:03:14 -0500 Subject: ewah/ewah_bitmap.c: avoid open-coding ALLOC_GROW() 'ewah/ewah_bitmap.c:buffer_grow()' is responsible for growing the buffer used to store the bits of an EWAH bitmap. It is essentially doing the same task as the 'ALLOC_GROW()' macro, so use that instead. This simplifies the callers of 'buffer_grow()', who no longer have to ask for a specific size, but rather specify how much of the buffer they need. They also no longer need to guard 'buffer_grow()' behind an if statement, since 'ALLOC_GROW()' (and, by extension, 'buffer_grow()') is a noop if the buffer is already large enough. But, the most significant change is that this fixes a bug when calling buffer_grow() with both 'alloc_size' and 'new_size' set to 1. In this case, truncating integer math will leave the new size set to 1, causing the buffer to never grow. Instead, let alloc_nr() handle this, which asks for '(new_size + 16) * 3 / 2' instead of 'new_size * 3 / 2'. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ewah/ewah_bitmap.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/ewah/ewah_bitmap.c b/ewah/ewah_bitmap.c index d59b1afe3d..2a8c7c5c33 100644 --- a/ewah/ewah_bitmap.c +++ b/ewah/ewah_bitmap.c @@ -19,6 +19,7 @@ #include "git-compat-util.h" #include "ewok.h" #include "ewok_rlw.h" +#include "cache.h" static inline size_t min_size(size_t a, size_t b) { @@ -33,20 +34,13 @@ static inline size_t max_size(size_t a, size_t b) static inline void buffer_grow(struct ewah_bitmap *self, size_t new_size) { size_t rlw_offset = (uint8_t *)self->rlw - (uint8_t *)self->buffer; - - if (self->alloc_size >= new_size) - return; - - self->alloc_size = new_size; - REALLOC_ARRAY(self->buffer, self->alloc_size); + ALLOC_GROW(self->buffer, new_size, self->alloc_size); self->rlw = self->buffer + (rlw_offset / sizeof(eword_t)); } static inline void buffer_push(struct ewah_bitmap *self, eword_t value) { - if (self->buffer_size + 1 >= self->alloc_size) - buffer_grow(self, self->buffer_size * 3 / 2); - + buffer_grow(self, self->buffer_size + 1); self->buffer[self->buffer_size++] = value; } @@ -137,8 +131,7 @@ void ewah_add_dirty_words( rlw_set_literal_words(self->rlw, literals + can_add); - if (self->buffer_size + can_add >= self->alloc_size) - buffer_grow(self, (self->buffer_size + can_add) * 3 / 2); + buffer_grow(self, self->buffer_size + can_add); if (negate) { size_t i; -- cgit v1.2.3 From ca510902008e99168b4fca53660e10b7a68b37f1 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:19 -0500 Subject: pack-bitmap: fix header size check When we parse a .bitmap header, we first check that we have enough bytes to make a valid header. We do that based on sizeof(struct bitmap_disk_header). However, as of 0f4d6cada8 (pack-bitmap: make bitmap header handling hash agnostic, 2019-02-19), that struct oversizes its checksum member to GIT_MAX_RAWSZ. That means we need to adjust for the difference between that constant and the size of the actual hash we're using. That commit adjusted the code which moves our pointer forward, but forgot to update the size check. This meant we were overly strict about the header size (requiring room for a 32-byte worst-case hash, when sha1 is only 20 bytes). But in practice it didn't matter because bitmap files tend to have at least 12 bytes of actual data anyway, so it was unlikely for a valid file to be caught by this. 
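For concreteness, here is the arithmetic with the usual hash sizes (an illustration only, not part of the patch): GIT_MAX_RAWSZ is 32 bytes, while the_hash_algo->rawsz is 20 bytes for SHA-1, so:

    /*
     * The struct reserves GIT_MAX_RAWSZ (32) bytes for the checksum,
     * but a SHA-1 .bitmap only stores rawsz (20) of them on disk, so
     * the old check demanded 32 - 20 = 12 bytes more than a valid
     * header needs -- the same 12 bytes of real data that a non-empty
     * bitmap file tends to contain anyway.
     */
    size_t header_size = sizeof(*header) - GIT_MAX_RAWSZ + the_hash_algo->rawsz;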
Let's fix it by pulling the header size into a separate variable and using it in both spots. That fixes the bug and simplifies the code to make it harder to have a mismatch like this in the future. It will also come in handy in the next patch for more bounds checking. Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pack-bitmap.c b/pack-bitmap.c index 4077e731e8..fe5647e72e 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -138,9 +138,10 @@ static struct ewah_bitmap *read_bitmap_1(struct bitmap_index *index) static int load_bitmap_header(struct bitmap_index *index) { struct bitmap_disk_header *header = (void *)index->map; + size_t header_size = sizeof(*header) - GIT_MAX_RAWSZ + the_hash_algo->rawsz; - if (index->map_size < sizeof(*header) + the_hash_algo->rawsz) - return error("Corrupted bitmap index (missing header data)"); + if (index->map_size < header_size + the_hash_algo->rawsz) + return error("Corrupted bitmap index (too small)"); if (memcmp(header->magic, BITMAP_IDX_SIGNATURE, sizeof(BITMAP_IDX_SIGNATURE)) != 0) return error("Corrupted bitmap index file (wrong header)"); @@ -164,7 +165,7 @@ static int load_bitmap_header(struct bitmap_index *index) } index->entry_count = ntohl(header->entry_count); - index->map_pos += sizeof(*header) - GIT_MAX_RAWSZ + the_hash_algo->rawsz; + index->map_pos += header_size; return 0; } -- cgit v1.2.3 From ec6c7b43679fcd7db00019a850636abc5c94e44e Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:24 -0500 Subject: pack-bitmap: bounds-check size of cache extension A .bitmap file may have a "name hash cache" extension, which puts a sequence of uint32_t values (one per object) at the end of the file. When we see a flag indicating this extension, we blindly subtract the appropriate number of bytes from our available length. However, if the .bitmap file is too short, we'll underflow our length variable and wrap around, thinking we have a very large length. This can lead to reading out-of-bounds bytes while loading individual ewah bitmaps. We can fix this by checking the number of available bytes when we parse the header. The existing "truncated bitmap" test is now split into two tests: one where we don't have this extension at all (and hence actually do try to read a truncated ewah bitmap) and one where we realize up-front that we can't even fit in the cache structure. We'll check stderr in each case to make sure we hit the error we're expecting. 
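For intuition, a minimal sketch of the failure mode this guards against (the numbers are invented for illustration and do not come from the patch):

    /*
     * These lengths are unsigned (size_t), so subtracting a hash cache
     * that is larger than what remains of the file does not go
     * negative -- it wraps around to an enormous "remaining length".
     */
    size_t map_size = 512;                          /* truncated .bitmap */
    size_t cache_size = 100000 * sizeof(uint32_t);  /* num_objects * 4   */
    size_t remaining = map_size - cache_size;       /* wraps near SIZE_MAX */

With the explicit comparison added here, we instead report that the file is too short to fit the hash cache.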
Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap.c | 8 ++++++-- t/t5310-pack-bitmaps.sh | 17 +++++++++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/pack-bitmap.c b/pack-bitmap.c index fe5647e72e..074d9ac8f2 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -153,14 +153,18 @@ static int load_bitmap_header(struct bitmap_index *index) /* Parse known bitmap format options */ { uint32_t flags = ntohs(header->options); + size_t cache_size = st_mult(index->pack->num_objects, sizeof(uint32_t)); + unsigned char *index_end = index->map + index->map_size - the_hash_algo->rawsz; if ((flags & BITMAP_OPT_FULL_DAG) == 0) return error("Unsupported options for bitmap index file " "(Git requires BITMAP_OPT_FULL_DAG)"); if (flags & BITMAP_OPT_HASH_CACHE) { - unsigned char *end = index->map + index->map_size - the_hash_algo->rawsz; - index->hashes = ((uint32_t *)end) - index->pack->num_objects; + if (cache_size > index_end - index->map - header_size) + return error("corrupted bitmap index file (too short to fit hash cache)"); + index->hashes = (void *)(index_end - cache_size); + index_end -= cache_size; } } diff --git a/t/t5310-pack-bitmaps.sh b/t/t5310-pack-bitmaps.sh index 8318781d2b..e2c3907a68 100755 --- a/t/t5310-pack-bitmaps.sh +++ b/t/t5310-pack-bitmaps.sh @@ -343,7 +343,8 @@ test_expect_success 'pack reuse respects --incremental' ' test_must_be_empty actual ' -test_expect_success 'truncated bitmap fails gracefully' ' +test_expect_success 'truncated bitmap fails gracefully (ewah)' ' + test_config pack.writebitmaphashcache false && git repack -ad && git rev-list --use-bitmap-index --count --all >expect && bitmap=$(ls .git/objects/pack/*.bitmap) && @@ -352,7 +353,19 @@ test_expect_success 'truncated bitmap fails gracefully' ' mv -f $bitmap.tmp $bitmap && git rev-list --use-bitmap-index --count --all >actual 2>stderr && test_cmp expect actual && - test_i18ngrep corrupt stderr + test_i18ngrep corrupt.ewah.bitmap stderr +' + +test_expect_success 'truncated bitmap fails gracefully (cache)' ' + git repack -ad && + git rev-list --use-bitmap-index --count --all >expect && + bitmap=$(ls .git/objects/pack/*.bitmap) && + test_when_finished "rm -f $bitmap" && + test_copy_bytes 512 <$bitmap >$bitmap.tmp && + mv -f $bitmap.tmp $bitmap && + git rev-list --use-bitmap-index --count --all >actual 2>stderr && + test_cmp expect actual && + test_i18ngrep corrupted.bitmap.index stderr ' # have_delta -- cgit v1.2.3 From c5cd7490762b20cdec3844e99cfc908c38917906 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:28 -0500 Subject: t5310: drop size of truncated ewah bitmap We truncate the .bitmap file to 512 bytes and expect to run into problems reading an individual ewah file. But this length is somewhat arbitrary, and just happened to work when the test was added in 9d2e330b17 (ewah_read_mmap: bounds-check mmap reads, 2018-06-14). An upcoming commit will change the size of the history we create in the test repo, which will cause this test to fail. We can future-proof it a bit more by reducing the size of the truncated bitmap file. 
Signed-off-by: Jeff King Helped-by: Junio C Hamano Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/t5310-pack-bitmaps.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/t/t5310-pack-bitmaps.sh b/t/t5310-pack-bitmaps.sh index e2c3907a68..cdebcd79b6 100755 --- a/t/t5310-pack-bitmaps.sh +++ b/t/t5310-pack-bitmaps.sh @@ -22,10 +22,11 @@ has_any () { test_expect_success 'setup repo with moderate-sized history' ' test_commit_bulk --id=file 100 && + git branch -M second && git checkout -b other HEAD~5 && test_commit_bulk --id=side 10 && - git checkout master && - bitmaptip=$(git rev-parse master) && + git checkout second && + bitmaptip=$(git rev-parse second) && blob=$(echo tagged-blob | git hash-object -w --stdin) && git tag tagged-blob $blob && git config repack.writebitmaps true @@ -63,8 +64,8 @@ rev_list_tests() { ' test_expect_success "counting non-linear history ($state)" ' - git rev-list --count other...master >expect && - git rev-list --use-bitmap-index --count other...master >actual && + git rev-list --count other...second >expect && + git rev-list --use-bitmap-index --count other...second >actual && test_cmp expect actual ' @@ -128,7 +129,7 @@ test_expect_success 'setup further non-bitmapped commits' ' rev_list_tests 'partial bitmap' test_expect_success 'fetch (partial bitmap)' ' - git --git-dir=clone.git fetch origin master:master && + git --git-dir=clone.git fetch origin second:second && git rev-parse HEAD >expect && git --git-dir=clone.git rev-parse HEAD >actual && test_cmp expect actual @@ -230,7 +231,7 @@ test_expect_success 'full repack, reusing previous bitmaps' ' ' test_expect_success 'fetch (full bitmap)' ' - git --git-dir=clone.git fetch origin master:master && + git --git-dir=clone.git fetch origin second:second && git rev-parse HEAD >expect && git --git-dir=clone.git rev-parse HEAD >actual && test_cmp expect actual @@ -349,7 +350,7 @@ test_expect_success 'truncated bitmap fails gracefully (ewah)' ' git rev-list --use-bitmap-index --count --all >expect && bitmap=$(ls .git/objects/pack/*.bitmap) && test_when_finished "rm -f $bitmap" && - test_copy_bytes 512 <$bitmap >$bitmap.tmp && + test_copy_bytes 256 <$bitmap >$bitmap.tmp && mv -f $bitmap.tmp $bitmap && git rev-list --use-bitmap-index --count --all >actual 2>stderr && test_cmp expect actual && -- cgit v1.2.3 From 2978b00691c1246149892882265cd62a6921cbf5 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:33 -0500 Subject: rev-list: die when --test-bitmap detects a mismatch You can use "git rev-list --test-bitmap HEAD" to check that bitmaps produce the same answer we'd get from a regular traversal. But if we detect an error, we only print "mismatch", and still exit with a successful error code. That makes the uses of --test-bitmap in the test suite (e.g., in t5310) mostly pointless: even if we saw an error, the tests wouldn't notice. Let's instead call die(), which will let these tests work as designed, and alert us if the bitmaps are bogus. 
Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pack-bitmap.c b/pack-bitmap.c index 074d9ac8f2..4431f9f120 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -1328,7 +1328,7 @@ void test_bitmap_walk(struct rev_info *revs) if (bitmap_equals(result, tdata.base)) fprintf(stderr, "OK!\n"); else - fprintf(stderr, "Mismatch!\n"); + die("mismatch in bitmap results"); free_bitmap_index(bitmap_git); } -- cgit v1.2.3 From d574bf43e806e0d4d6cda7c2f5d016a87843078f Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:38 -0500 Subject: ewah: factor out bitmap growth We auto-grow bitmaps when somebody asks to set a bit whose position is outside of our currently allocated range. Other operations besides single bit-setting might need to do this, too, so let's pull it into its own function. Note that we change the semantics a little: you now ask for the number of words you'd like to have, not the id of the block you'd like to write to. Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ewah/bitmap.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index d8cec585af..7c1ecfa6fd 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -35,18 +35,22 @@ struct bitmap *bitmap_new(void) return bitmap_word_alloc(32); } -void bitmap_set(struct bitmap *self, size_t pos) +static void bitmap_grow(struct bitmap *self, size_t word_alloc) { - size_t block = EWAH_BLOCK(pos); - - if (block >= self->word_alloc) { + if (word_alloc > self->word_alloc) { size_t old_size = self->word_alloc; - self->word_alloc = block ? block * 2 : 1; + self->word_alloc = word_alloc * 2; REALLOC_ARRAY(self->words, self->word_alloc); memset(self->words + old_size, 0x0, (self->word_alloc - old_size) * sizeof(eword_t)); } +} + +void bitmap_set(struct bitmap *self, size_t pos) +{ + size_t block = EWAH_BLOCK(pos); + bitmap_grow(self, block + 1); self->words[block] |= EWAH_MASK(pos); } -- cgit v1.2.3 From 2e2d141afdaa2b086ac5d4228de0dd0003eb69ff Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:42 -0500 Subject: ewah: make bitmap growth less aggressive If you ask to set a bit in the Nth word and we haven't yet allocated that many slots in our array, we'll increase the bitmap size to 2*N. This means we might frequently end up with bitmaps that are twice the necessary size (as soon as you ask for the biggest bit, we'll size up to twice that). But if we just allocate as many words as were asked for, we may not grow fast enough. The worst case there is setting bit 0, then 1, etc. Each time we grow we'd just extend by one more word, giving us linear reallocations (and quadratic memory copies). A middle ground is relying on alloc_nr(), which causes us to grow by a factor of roughly 3/2 instead of 2. That's less aggressive than doubling, and it may help avoid fragmenting memory. (If we start with N, then grow twice, our total is N*(3/2)^2 = 9N/4. After growing twice, that array of size 9N/4 can fit into the space vacated by the original array and first growth, N+3N/2 = 10N/4 > 9N/4, leading to less fragmentation in memory). Our worst case is still 3/2N wasted bits (you set bit N-1, then setting bit N causes us to grow by 3/2), but our average should be much better. This isn't usually that big a deal, but it will matter as we shift the reachability bitmap generation code to store more bitmaps in memory. 
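For reference, a sketch of the growth policy this moves us toward (alloc_nr() as defined in cache.h; the numbers below are just an illustration, not output from this patch):

    #define alloc_nr(x) (((x)+16)*3/2)

    /*
     * Repeatedly asking ALLOC_GROW() for one more word grows the
     * allocation 0 -> 24 -> 60 -> 114 -> ..., i.e. roughly 3/2 per
     * step instead of doubling, and never the "grow by exactly one
     * word" quadratic worst case.
     */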
Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ewah/bitmap.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index 7c1ecfa6fd..6f9e5c529b 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -37,13 +37,10 @@ struct bitmap *bitmap_new(void) static void bitmap_grow(struct bitmap *self, size_t word_alloc) { - if (word_alloc > self->word_alloc) { - size_t old_size = self->word_alloc; - self->word_alloc = word_alloc * 2; - REALLOC_ARRAY(self->words, self->word_alloc); - memset(self->words + old_size, 0x0, - (self->word_alloc - old_size) * sizeof(eword_t)); - } + size_t old_size = self->word_alloc; + ALLOC_GROW(self->words, word_alloc, self->word_alloc); + memset(self->words + old_size, 0x0, + (self->word_alloc - old_size) * sizeof(eword_t)); } void bitmap_set(struct bitmap *self, size_t pos) -- cgit v1.2.3 From 3ed675101aad3dcc948ad0e1d41e55d65e693740 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:46 -0500 Subject: ewah: implement bitmap_or() We have a function to bitwise-OR an ewah into an uncompressed bitmap, but not to OR two uncompressed bitmaps. Let's add it. Interestingly, we have a public header declaration going back to e1273106f6 (ewah: compressed bitmap implementation, 2013-11-14), but the function was never implemented. That was all OK since there were no users of 'bitmap_or()', but a first caller will be added in a couple of patches. Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ewah/bitmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index 6f9e5c529b..0a3502603f 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -122,6 +122,15 @@ void bitmap_and_not(struct bitmap *self, struct bitmap *other) self->words[i] &= ~other->words[i]; } +void bitmap_or(struct bitmap *self, const struct bitmap *other) +{ + size_t i; + + bitmap_grow(self, other->word_alloc); + for (i = 0; i < other->word_alloc; i++) + self->words[i] |= other->words[i]; +} + void bitmap_or_ewah(struct bitmap *self, struct ewah_bitmap *other) { size_t original_size = self->word_alloc; -- cgit v1.2.3 From ccae08e822d71aaae1aa2660631d7ded8f4b97e7 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:50 -0500 Subject: ewah: add bitmap_dup() function There's no easy way to make a copy of a bitmap. Obviously a caller can iterate over the bits and set them one by one in a new bitmap, but we can go much faster by copying whole words with memcpy(). 
Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ewah/bitmap.c | 7 +++++++ ewah/ewok.h | 1 + 2 files changed, 8 insertions(+) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index 0a3502603f..b5f6376282 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -35,6 +35,13 @@ struct bitmap *bitmap_new(void) return bitmap_word_alloc(32); } +struct bitmap *bitmap_dup(const struct bitmap *src) +{ + struct bitmap *dst = bitmap_word_alloc(src->word_alloc); + COPY_ARRAY(dst->words, src->words, src->word_alloc); + return dst; +} + static void bitmap_grow(struct bitmap *self, size_t word_alloc) { size_t old_size = self->word_alloc; diff --git a/ewah/ewok.h b/ewah/ewok.h index 011852bef1..1fc555e672 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -173,6 +173,7 @@ struct bitmap { struct bitmap *bitmap_new(void); struct bitmap *bitmap_word_alloc(size_t word_alloc); +struct bitmap *bitmap_dup(const struct bitmap *src); void bitmap_set(struct bitmap *self, size_t pos); void bitmap_unset(struct bitmap *self, size_t pos); int bitmap_get(struct bitmap *self, size_t pos); -- cgit v1.2.3 From 4a9c5817290d347f59dd47a081be8404c3cedbe8 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:55 -0500 Subject: pack-bitmap-write: reimplement bitmap writing The bitmap generation code works by iterating over the set of commits for which we plan to write bitmaps, and then for each one performing a traditional traversal over the reachable commits and trees, filling in the bitmap. Between two traversals, we can often reuse the previous bitmap result as long as the first commit is an ancestor of the second. However, our worst case is that we may end up doing "n" complete traversals to the root in order to create "n" bitmaps. In a real-world case (the shared-storage repo consisting of all GitHub forks of chromium/chromium), we perform very poorly: generating bitmaps takes ~3 hours, whereas we can walk the whole object graph in ~3 minutes. This commit completely rewrites the algorithm, with the goal of accessing each object only once. It works roughly like this: - generate a list of commits in topo-order using a single traversal - invert the edges of the graph (so have parents point at their children) - make one pass in reverse topo-order, generating a bitmap for each commit and passing the result along to child nodes We generate correct results because each node we visit has already had all of its ancestors added to the bitmap. And we make only two linear passes over the commits. We also visit each tree usually only once. When filling in a bitmap, we don't bother to recurse into trees whose bit is already set in the bitmap (since we know we've already done so when setting their bit). That means that if commit A references tree T, none of its descendants will need to open T again. I say "usually", though, because it is possible for a given tree to be mentioned in unrelated parts of history (e.g., cherry-picking to a parallel branch). So we've accomplished our goal, and the resulting algorithm is pretty simple to understand. But there are some downsides, at least with this initial implementation: - we no longer reuse the results of any on-disk bitmaps when generating. So we'd expect to sometimes be slower than the original when bitmaps already exist. However, this is something we'll be able to add back in later. - we use much more memory. Instead of keeping one bitmap in memory at a time, we're passing them up through the graph.
So our memory use should scale with the graph width (times the size of a bitmap). So how does it perform? For a clone of linux.git, generating bitmaps from scratch with the old algorithm took 63s. Using this algorithm it takes 205s. Which is much worse, but _might_ be acceptable if it behaved linearly as the size grew. It also increases peak heap usage by ~1G. That's not impossibly large, but not encouraging. On the complete fork-network of torvalds/linux, it increases the peak RAM usage by 40GB. Yikes. (I forgot to record the time it took, but the memory usage was too much to consider this reasonable anyway). On the complete fork-network of chromium/chromium, I ran out of memory before succeeding. Some back-of-the-envelope calculations indicate it would need 80+GB to complete. So at this stage, we've managed to make things much worse. But because of the way this new algorithm is structured, there are a lot of opportunities for optimization on top. We'll start implementing those in the follow-on patches. Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap-write.c | 284 +++++++++++++++++++++++++++++----------------------- 1 file changed, 161 insertions(+), 123 deletions(-) diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index 5e998bdaa7..bcd059ccd9 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -110,8 +110,6 @@ void bitmap_writer_build_type_index(struct packing_data *to_pack, /** * Compute the actual bitmaps */ -static struct object **seen_objects; -static unsigned int seen_objects_nr, seen_objects_alloc; static inline void push_bitmapped_commit(struct commit *commit, struct ewah_bitmap *reused) { @@ -127,21 +125,6 @@ static inline void push_bitmapped_commit(struct commit *commit, struct ewah_bitm writer.selected_nr++; } -static inline void mark_as_seen(struct object *object) -{ - ALLOC_GROW(seen_objects, seen_objects_nr + 1, seen_objects_alloc); - seen_objects[seen_objects_nr++] = object; -} - -static inline void reset_all_seen(void) -{ - unsigned int i; - for (i = 0; i < seen_objects_nr; ++i) { - seen_objects[i]->flags &= ~(SEEN | ADDED | SHOWN); - } - seen_objects_nr = 0; -} - static uint32_t find_object_pos(const struct object_id *oid) { struct object_entry *entry = packlist_find(writer.to_pack, oid); @@ -154,60 +137,6 @@ static uint32_t find_object_pos(const struct object_id *oid) return oe_in_pack_pos(writer.to_pack, entry); } -static void show_object(struct object *object, const char *name, void *data) -{ - struct bitmap *base = data; - bitmap_set(base, find_object_pos(&object->oid)); - mark_as_seen(object); -} - -static void show_commit(struct commit *commit, void *data) -{ - mark_as_seen((struct object *)commit); -} - -static int -add_to_include_set(struct bitmap *base, struct commit *commit) -{ - khiter_t hash_pos; - uint32_t bitmap_pos = find_object_pos(&commit->object.oid); - - if (bitmap_get(base, bitmap_pos)) - return 0; - - hash_pos = kh_get_oid_map(writer.bitmaps, commit->object.oid); - if (hash_pos < kh_end(writer.bitmaps)) { - struct bitmapped_commit *bc = kh_value(writer.bitmaps, hash_pos); - bitmap_or_ewah(base, bc->bitmap); - return 0; - } - - bitmap_set(base, bitmap_pos); - return 1; -} - -static int -should_include(struct commit *commit, void *_data) -{ - struct bitmap *base = _data; - - if (!add_to_include_set(base, commit)) { - struct commit_list *parent = commit->parents; - - mark_as_seen((struct object *)commit); - - while (parent) { - parent->item->object.flags |= SEEN; - mark_as_seen((struct 
object *)parent->item); - parent = parent->next; - } - - return 0; - } - - return 1; -} - static void compute_xor_offsets(void) { static const int MAX_XOR_OFFSET_SEARCH = 10; @@ -248,79 +177,188 @@ static void compute_xor_offsets(void) } } -void bitmap_writer_build(struct packing_data *to_pack) -{ - static const double REUSE_BITMAP_THRESHOLD = 0.2; +struct bb_commit { + struct commit_list *children; + struct bitmap *bitmap; + unsigned selected:1; + unsigned idx; /* within selected array */ +}; - int i, reuse_after, need_reset; - struct bitmap *base = bitmap_new(); - struct rev_info revs; +define_commit_slab(bb_data, struct bb_commit); - writer.bitmaps = kh_init_oid_map(); - writer.to_pack = to_pack; +struct bitmap_builder { + struct bb_data data; + struct commit **commits; + size_t commits_nr, commits_alloc; +}; - if (writer.show_progress) - writer.progress = start_progress("Building bitmaps", writer.selected_nr); +static void bitmap_builder_init(struct bitmap_builder *bb, + struct bitmap_writer *writer) +{ + struct rev_info revs; + struct commit *commit; + unsigned int i; - repo_init_revisions(to_pack->repo, &revs, NULL); - revs.tag_objects = 1; - revs.tree_objects = 1; - revs.blob_objects = 1; - revs.no_walk = 0; + memset(bb, 0, sizeof(*bb)); + init_bb_data(&bb->data); - revs.include_check = should_include; reset_revision_walk(); + repo_init_revisions(writer->to_pack->repo, &revs, NULL); + revs.topo_order = 1; + + for (i = 0; i < writer->selected_nr; i++) { + struct commit *c = writer->selected[i].commit; + struct bb_commit *ent = bb_data_at(&bb->data, c); + ent->selected = 1; + ent->idx = i; + add_pending_object(&revs, &c->object, ""); + } - reuse_after = writer.selected_nr * REUSE_BITMAP_THRESHOLD; - need_reset = 0; + if (prepare_revision_walk(&revs)) + die("revision walk setup failed"); - for (i = writer.selected_nr - 1; i >= 0; --i) { - struct bitmapped_commit *stored; - struct object *object; + while ((commit = get_revision(&revs))) { + struct commit_list *p; - khiter_t hash_pos; - int hash_ret; + parse_commit_or_die(commit); - stored = &writer.selected[i]; - object = (struct object *)stored->commit; + ALLOC_GROW(bb->commits, bb->commits_nr + 1, bb->commits_alloc); + bb->commits[bb->commits_nr++] = commit; - if (stored->bitmap == NULL) { - if (i < writer.selected_nr - 1 && - (need_reset || - !in_merge_bases(writer.selected[i + 1].commit, - stored->commit))) { - bitmap_reset(base); - reset_all_seen(); - } + for (p = commit->parents; p; p = p->next) { + struct bb_commit *ent = bb_data_at(&bb->data, p->item); + commit_list_insert(commit, &ent->children); + } + } +} - add_pending_object(&revs, object, ""); - revs.include_check_data = base; +static void bitmap_builder_clear(struct bitmap_builder *bb) +{ + clear_bb_data(&bb->data); + free(bb->commits); + bb->commits_nr = bb->commits_alloc = 0; +} - if (prepare_revision_walk(&revs)) - die("revision walk setup failed"); +static void fill_bitmap_tree(struct bitmap *bitmap, + struct tree *tree) +{ + uint32_t pos; + struct tree_desc desc; + struct name_entry entry; - traverse_commit_list(&revs, show_commit, show_object, base); + /* + * If our bit is already set, then there is nothing to do. Both this + * tree and all of its children will be set. 
+ */ + pos = find_object_pos(&tree->object.oid); + if (bitmap_get(bitmap, pos)) + return; + bitmap_set(bitmap, pos); - object_array_clear(&revs.pending); + if (parse_tree(tree) < 0) + die("unable to load tree object %s", + oid_to_hex(&tree->object.oid)); + init_tree_desc(&desc, tree->buffer, tree->size); - stored->bitmap = bitmap_to_ewah(base); - need_reset = 0; - } else - need_reset = 1; + while (tree_entry(&desc, &entry)) { + switch (object_type(entry.mode)) { + case OBJ_TREE: + fill_bitmap_tree(bitmap, + lookup_tree(the_repository, &entry.oid)); + break; + case OBJ_BLOB: + bitmap_set(bitmap, find_object_pos(&entry.oid)); + break; + default: + /* Gitlink, etc; not reachable */ + break; + } + } + + free_tree_buffer(tree); +} - if (i >= reuse_after) - stored->flags |= BITMAP_FLAG_REUSE; +static void fill_bitmap_commit(struct bb_commit *ent, + struct commit *commit) +{ + if (!ent->bitmap) + ent->bitmap = bitmap_new(); - hash_pos = kh_put_oid_map(writer.bitmaps, object->oid, &hash_ret); - if (hash_ret == 0) - die("Duplicate entry when writing index: %s", - oid_to_hex(&object->oid)); + /* + * mark ourselves, but do not bother with parents; their values + * will already have been propagated to us + */ + bitmap_set(ent->bitmap, find_object_pos(&commit->object.oid)); + fill_bitmap_tree(ent->bitmap, get_commit_tree(commit)); +} - kh_value(writer.bitmaps, hash_pos) = stored; - display_progress(writer.progress, writer.selected_nr - i); +static void store_selected(struct bb_commit *ent, struct commit *commit) +{ + struct bitmapped_commit *stored = &writer.selected[ent->idx]; + khiter_t hash_pos; + int hash_ret; + + /* + * the "reuse bitmaps" phase may have stored something here, but + * our new algorithm doesn't use it. Drop it. + */ + if (stored->bitmap) + ewah_free(stored->bitmap); + + stored->bitmap = bitmap_to_ewah(ent->bitmap); + + hash_pos = kh_put_oid_map(writer.bitmaps, commit->object.oid, &hash_ret); + if (hash_ret == 0) + die("Duplicate entry when writing index: %s", + oid_to_hex(&commit->object.oid)); + kh_value(writer.bitmaps, hash_pos) = stored; +} + +void bitmap_writer_build(struct packing_data *to_pack) +{ + struct bitmap_builder bb; + size_t i; + int nr_stored = 0; /* for progress */ + + writer.bitmaps = kh_init_oid_map(); + writer.to_pack = to_pack; + + if (writer.show_progress) + writer.progress = start_progress("Building bitmaps", writer.selected_nr); + trace2_region_enter("pack-bitmap-write", "building_bitmaps_total", + the_repository); + + bitmap_builder_init(&bb, &writer); + for (i = bb.commits_nr; i > 0; i--) { + struct commit *commit = bb.commits[i-1]; + struct bb_commit *ent = bb_data_at(&bb.data, commit); + struct commit *child; + + fill_bitmap_commit(ent, commit); + + if (ent->selected) { + store_selected(ent, commit); + nr_stored++; + display_progress(writer.progress, nr_stored); + } + + while ((child = pop_commit(&ent->children))) { + struct bb_commit *child_ent = + bb_data_at(&bb.data, child); + + if (child_ent->bitmap) + bitmap_or(child_ent->bitmap, ent->bitmap); + else + child_ent->bitmap = bitmap_dup(ent->bitmap); + } + bitmap_free(ent->bitmap); + ent->bitmap = NULL; } + bitmap_builder_clear(&bb); + + trace2_region_leave("pack-bitmap-write", "building_bitmaps_total", + the_repository); - bitmap_free(base); stop_progress(&writer.progress); compute_xor_offsets(); -- cgit v1.2.3 From 010e5eacfb004218087da054a61772da8fc40463 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:03:59 -0500 Subject: pack-bitmap-write: pass ownership of intermediate bitmaps Our 
algorithm to generate reachability bitmaps walks through the commit graph from the bottom up, passing bitmap data from each commit to its descendants. For a linear stretch of history like: A -- B -- C our sequence of steps is: - compute the bitmap for A by walking its trees, etc - duplicate A's bitmap as a starting point for B; we can now free A's bitmap, since we only needed it as an intermediate result - OR in any extra objects that B can reach into its bitmap - duplicate B's bitmap as a starting point for C; likewise, free B's bitmap - OR in objects for C, and so on... Rather than duplicating bitmaps and immediately freeing the original, we can just pass ownership from commit to commit. Note that this doesn't always work: - the recipient may be a merge which already has an intermediate bitmap from its other ancestor. In that case we have to OR our result into it. Note that the first ancestor to reach the merge does get to pass ownership, though. - we may have multiple children; we can only pass ownership to one of them However, it happens often enough and copying bitmaps is expensive enough that this provides a noticeable speedup. On a clone of linux.git, this reduces the time to generate bitmaps from 205s to 70s. This is about the same amount of time it took to generate bitmaps using our old "many traversals" algorithm (the previous commit measures the identical scenario as taking 63s). It unfortunately provides only a very modest reduction in the peak memory usage, though. Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap-write.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index bcd059ccd9..1eb9615df8 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -333,6 +333,7 @@ void bitmap_writer_build(struct packing_data *to_pack) struct commit *commit = bb.commits[i-1]; struct bb_commit *ent = bb_data_at(&bb.data, commit); struct commit *child; + int reused = 0; fill_bitmap_commit(ent, commit); @@ -348,10 +349,15 @@ void bitmap_writer_build(struct packing_data *to_pack) if (child_ent->bitmap) bitmap_or(child_ent->bitmap, ent->bitmap); - else + else if (reused) child_ent->bitmap = bitmap_dup(ent->bitmap); + else { + child_ent->bitmap = ent->bitmap; + reused = 1; + } } - bitmap_free(ent->bitmap); + if (!reused) + bitmap_free(ent->bitmap); ent->bitmap = NULL; } bitmap_builder_clear(&bb); -- cgit v1.2.3 From 6dc5ef759f25fbded11d235362c59f59498809e6 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 8 Dec 2020 17:04:03 -0500 Subject: pack-bitmap-write: fill bitmap with commit history The current implementation of bitmap_writer_build() creates a reachability bitmap for every walked commit. After computing a bitmap for a commit, those bits are pushed to an in-progress bitmap for its children. fill_bitmap_commit() assumes the bits corresponding to objects reachable from the parents of a commit are already set. This means that when visiting a new commit, we only have to walk the objects reachable between it and any of its parents. A future change to bitmap_writer_build() will relax this condition so not all parents have their bits set. Prepare for that by having 'fill_bitmap_commit()' walk parents until reaching commits whose bits are already set. Then, walk the trees for these commits as well. This has no functional change with the current implementation of bitmap_writer_build(). 
Signed-off-by: Derrick Stolee Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap-write.c | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index 1eb9615df8..957639241e 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -12,6 +12,7 @@ #include "sha1-lookup.h" #include "pack-objects.h" #include "commit-reach.h" +#include "prio-queue.h" struct bitmapped_commit { struct commit *commit; @@ -279,17 +280,30 @@ static void fill_bitmap_tree(struct bitmap *bitmap, } static void fill_bitmap_commit(struct bb_commit *ent, - struct commit *commit) + struct commit *commit, + struct prio_queue *queue) { if (!ent->bitmap) ent->bitmap = bitmap_new(); - /* - * mark ourselves, but do not bother with parents; their values - * will already have been propagated to us - */ bitmap_set(ent->bitmap, find_object_pos(&commit->object.oid)); - fill_bitmap_tree(ent->bitmap, get_commit_tree(commit)); + prio_queue_put(queue, commit); + + while (queue->nr) { + struct commit_list *p; + struct commit *c = prio_queue_get(queue); + + bitmap_set(ent->bitmap, find_object_pos(&c->object.oid)); + fill_bitmap_tree(ent->bitmap, get_commit_tree(c)); + + for (p = c->parents; p; p = p->next) { + int pos = find_object_pos(&p->item->object.oid); + if (!bitmap_get(ent->bitmap, pos)) { + bitmap_set(ent->bitmap, pos); + prio_queue_put(queue, p->item); + } + } + } } static void store_selected(struct bb_commit *ent, struct commit *commit) @@ -319,6 +333,7 @@ void bitmap_writer_build(struct packing_data *to_pack) struct bitmap_builder bb; size_t i; int nr_stored = 0; /* for progress */ + struct prio_queue queue = { compare_commits_by_gen_then_commit_date }; writer.bitmaps = kh_init_oid_map(); writer.to_pack = to_pack; @@ -335,7 +350,7 @@ void bitmap_writer_build(struct packing_data *to_pack) struct commit *child; int reused = 0; - fill_bitmap_commit(ent, commit); + fill_bitmap_commit(ent, commit, &queue); if (ent->selected) { store_selected(ent, commit); @@ -360,6 +375,7 @@ void bitmap_writer_build(struct packing_data *to_pack) bitmap_free(ent->bitmap); ent->bitmap = NULL; } + clear_prio_queue(&queue); bitmap_builder_clear(&bb); trace2_region_leave("pack-bitmap-write", "building_bitmaps_total", -- cgit v1.2.3 From ed03a58b655f661ef6f9efd8816efe0c8cf07fa0 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 8 Dec 2020 17:04:08 -0500 Subject: bitmap: implement bitmap_is_subset() The bitmap_is_subset() function checks if the 'self' bitmap contains any bits that are not on in the 'other' bitmap. Up until this patch, it had a declaration, but no implementation or callers. A subsequent patch will want this function, so implement it here.
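Note the calling convention, since the name can be read backwards; a rough usage sketch with hypothetical bitmaps 'a' and 'b' (not taken from the patch):

    /*
     * Non-zero means 'a' has at least one bit set that 'b' does not,
     * i.e. 'a' is NOT a subset of 'b'; zero means every bit set in
     * 'a' is also set in 'b'.
     */
    int a_not_b = bitmap_is_subset(a, b);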
Helped-by: Junio C Hamano Signed-off-by: Derrick Stolee Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- ewah/bitmap.c | 21 +++++++++++++++++++++ ewah/ewok.h | 2 +- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/ewah/bitmap.c b/ewah/bitmap.c index b5f6376282..0d31cdc866 100644 --- a/ewah/bitmap.c +++ b/ewah/bitmap.c @@ -195,6 +195,27 @@ int bitmap_equals(struct bitmap *self, struct bitmap *other) return 1; } +int bitmap_is_subset(struct bitmap *self, struct bitmap *other) +{ + size_t common_size, i; + + if (self->word_alloc < other->word_alloc) + common_size = self->word_alloc; + else { + common_size = other->word_alloc; + for (i = common_size; i < self->word_alloc; i++) { + if (self->words[i]) + return 1; + } + } + + for (i = 0; i < common_size; i++) { + if (self->words[i] & ~other->words[i]) + return 1; + } + return 0; +} + void bitmap_reset(struct bitmap *bitmap) { memset(bitmap->words, 0x0, bitmap->word_alloc * sizeof(eword_t)); diff --git a/ewah/ewok.h b/ewah/ewok.h index 1fc555e672..66920965da 100644 --- a/ewah/ewok.h +++ b/ewah/ewok.h @@ -180,7 +180,7 @@ int bitmap_get(struct bitmap *self, size_t pos); void bitmap_reset(struct bitmap *self); void bitmap_free(struct bitmap *self); int bitmap_equals(struct bitmap *self, struct bitmap *other); -int bitmap_is_subset(struct bitmap *self, struct bitmap *super); +int bitmap_is_subset(struct bitmap *self, struct bitmap *other); struct ewah_bitmap * bitmap_to_ewah(struct bitmap *bitmap); struct bitmap *ewah_to_bitmap(struct ewah_bitmap *ewah); -- cgit v1.2.3 From 597b2c39af9ee65496591448715588b711e91947 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 8 Dec 2020 17:04:13 -0500 Subject: commit: implement commit_list_contains() It can be helpful to check if a commit_list contains a commit. Use pointer equality, assuming lookup_commit() was used. Signed-off-by: Derrick Stolee Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- commit.c | 11 +++++++++++ commit.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/commit.c b/commit.c index fe1fa3dc41..9a785bf906 100644 --- a/commit.c +++ b/commit.c @@ -544,6 +544,17 @@ struct commit_list *commit_list_insert(struct commit *item, struct commit_list * return new_list; } +int commit_list_contains(struct commit *item, struct commit_list *list) +{ + while (list) { + if (list->item == item) + return 1; + list = list->next; + } + + return 0; +} + unsigned commit_list_count(const struct commit_list *l) { unsigned c = 0; diff --git a/commit.h b/commit.h index 5467786c7b..742a6de460 100644 --- a/commit.h +++ b/commit.h @@ -167,6 +167,8 @@ int find_commit_subject(const char *commit_buffer, const char **subject); struct commit_list *commit_list_insert(struct commit *item, struct commit_list **list); +int commit_list_contains(struct commit *item, + struct commit_list *list); struct commit_list **commit_list_append(struct commit *commit, struct commit_list **next); unsigned commit_list_count(const struct commit_list *l); -- cgit v1.2.3 From 1467b9572aa57c1369aaac897fcbbddde4080d75 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 8 Dec 2020 17:04:17 -0500 Subject: t5310: add branch-based checks The current rev-list tests that check the bitmap data only work on HEAD instead of multiple branches. Expand the test cases to handle both 'master' and 'other' branches. 
Signed-off-by: Derrick Stolee Helped-by: Junio C Hamano Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- t/t5310-pack-bitmaps.sh | 61 +++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 27 deletions(-) diff --git a/t/t5310-pack-bitmaps.sh b/t/t5310-pack-bitmaps.sh index cdebcd79b6..1dd9723721 100755 --- a/t/t5310-pack-bitmaps.sh +++ b/t/t5310-pack-bitmaps.sh @@ -42,63 +42,70 @@ test_expect_success 'rev-list --test-bitmap verifies bitmaps' ' git rev-list --test-bitmap HEAD ' -rev_list_tests() { - state=$1 - - test_expect_success "counting commits via bitmap ($state)" ' - git rev-list --count HEAD >expect && - git rev-list --use-bitmap-index --count HEAD >actual && +rev_list_tests_head () { + test_expect_success "counting commits via bitmap ($state, $branch)" ' + git rev-list --count $branch >expect && + git rev-list --use-bitmap-index --count $branch >actual && test_cmp expect actual ' - test_expect_success "counting partial commits via bitmap ($state)" ' - git rev-list --count HEAD~5..HEAD >expect && - git rev-list --use-bitmap-index --count HEAD~5..HEAD >actual && + test_expect_success "counting partial commits via bitmap ($state, $branch)" ' + git rev-list --count $branch~5..$branch >expect && + git rev-list --use-bitmap-index --count $branch~5..$branch >actual && test_cmp expect actual ' - test_expect_success "counting commits with limit ($state)" ' - git rev-list --count -n 1 HEAD >expect && - git rev-list --use-bitmap-index --count -n 1 HEAD >actual && + test_expect_success "counting commits with limit ($state, $branch)" ' + git rev-list --count -n 1 $branch >expect && + git rev-list --use-bitmap-index --count -n 1 $branch >actual && test_cmp expect actual ' - test_expect_success "counting non-linear history ($state)" ' + test_expect_success "counting non-linear history ($state, $branch)" ' git rev-list --count other...second >expect && git rev-list --use-bitmap-index --count other...second >actual && test_cmp expect actual ' - test_expect_success "counting commits with limiting ($state)" ' - git rev-list --count HEAD -- 1.t >expect && - git rev-list --use-bitmap-index --count HEAD -- 1.t >actual && + test_expect_success "counting commits with limiting ($state, $branch)" ' + git rev-list --count $branch -- 1.t >expect && + git rev-list --use-bitmap-index --count $branch -- 1.t >actual && test_cmp expect actual ' - test_expect_success "counting objects via bitmap ($state)" ' - git rev-list --count --objects HEAD >expect && - git rev-list --use-bitmap-index --count --objects HEAD >actual && + test_expect_success "counting objects via bitmap ($state, $branch)" ' + git rev-list --count --objects $branch >expect && + git rev-list --use-bitmap-index --count --objects $branch >actual && test_cmp expect actual ' - test_expect_success "enumerate commits ($state)" ' - git rev-list --use-bitmap-index HEAD >actual && - git rev-list HEAD >expect && + test_expect_success "enumerate commits ($state, $branch)" ' + git rev-list --use-bitmap-index $branch >actual && + git rev-list $branch >expect && test_bitmap_traversal --no-confirm-bitmaps expect actual ' - test_expect_success "enumerate --objects ($state)" ' - git rev-list --objects --use-bitmap-index HEAD >actual && - git rev-list --objects HEAD >expect && + test_expect_success "enumerate --objects ($state, $branch)" ' + git rev-list --objects --use-bitmap-index $branch >actual && + git rev-list --objects $branch >expect && test_bitmap_traversal expect actual ' - test_expect_success "bitmap --objects 
handles non-commit objects ($state)" ' - git rev-list --objects --use-bitmap-index HEAD tagged-blob >actual && + test_expect_success "bitmap --objects handles non-commit objects ($state, $branch)" ' + git rev-list --objects --use-bitmap-index $branch tagged-blob >actual && grep $blob actual ' } +rev_list_tests () { + state=$1 + + for branch in "second" "other" + do + rev_list_tests_head + done +} + rev_list_tests 'full bitmap' test_expect_success 'clone from bitmapped repository' ' -- cgit v1.2.3 From 928e3f42adc032ce7b7c1323408648d2b6713509 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 8 Dec 2020 17:04:22 -0500 Subject: pack-bitmap-write: rename children to reverse_edges The bitmap_builder_init() method walks the reachable commits in topological order and constructs a "reverse graph" along the way. At the moment, this reverse graph contains an edge from commit A to commit B if and only if A is a parent of B. Thus, the name "children" is appropriate for for this reverse graph. In the next change, we will repurpose the reverse graph to not be directly-adjacent commits in the commit-graph, but instead a more abstract relationship. The previous changes have already incorporated the necessary updates to fill_bitmap_commit() that allow these edges to not be immediate children. Signed-off-by: Derrick Stolee Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap-write.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index 957639241e..7e218d02a6 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -179,7 +179,7 @@ static void compute_xor_offsets(void) } struct bb_commit { - struct commit_list *children; + struct commit_list *reverse_edges; struct bitmap *bitmap; unsigned selected:1; unsigned idx; /* within selected array */ @@ -228,7 +228,7 @@ static void bitmap_builder_init(struct bitmap_builder *bb, for (p = commit->parents; p; p = p->next) { struct bb_commit *ent = bb_data_at(&bb->data, p->item); - commit_list_insert(commit, &ent->children); + commit_list_insert(commit, &ent->reverse_edges); } } } @@ -358,7 +358,7 @@ void bitmap_writer_build(struct packing_data *to_pack) display_progress(writer.progress, nr_stored); } - while ((child = pop_commit(&ent->children))) { + while ((child = pop_commit(&ent->reverse_edges))) { struct bb_commit *child_ent = bb_data_at(&bb.data, child); -- cgit v1.2.3 From c6b0c3910c80a7b7a5c1da3b70d8054ade515692 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 8 Dec 2020 17:04:26 -0500 Subject: pack-bitmap.c: check reads more aggressively when loading Before 'load_bitmap_entries_v1()' reads an actual EWAH bitmap, it should check that it can safely do so by ensuring that there are at least 6 bytes available to be read (four for the commit's index position, and then two more for the xor offset and flags, respectively). Likewise, it should check that the commit index it read refers to a legitimate object in the pack. The first fix catches a truncation bug that was exposed when testing, and the second is purely precautionary. There are some possible future improvements, not pursued here. They are: - Computing the correct boundary of the bitmap itself in the caller and ensuring that we don't read past it. This may or may not be worth it, since in a truncation situation, all bets are off: (is the trailer still there and the bitmap entries malformed, or is the trailer truncated?). 
The best we can do is try to read what's there as if it's correct data (and protect ourselves when it's obviously bogus). - Avoid the magic "6" by teaching read_be32() and read_u8() (both of which are custom helpers for this function) to check sizes before advancing the pointers. - Adding more tests in this area. Testing these truncation situations is remarkably fragile to even subtle changes in the bitmap generation. So, the resulting tests are likely to be quite brittle. Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pack-bitmap.c b/pack-bitmap.c index 4431f9f120..60c781d100 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -229,11 +229,16 @@ static int load_bitmap_entries_v1(struct bitmap_index *index) uint32_t commit_idx_pos; struct object_id oid; + if (index->map_size - index->map_pos < 6) + return error("corrupt ewah bitmap: truncated header for entry %d", i); + commit_idx_pos = read_be32(index->map, &index->map_pos); xor_offset = read_u8(index->map, &index->map_pos); flags = read_u8(index->map, &index->map_pos); - nth_packed_object_id(&oid, index->pack, commit_idx_pos); + if (nth_packed_object_id(&oid, index->pack, commit_idx_pos) < 0) + return error("corrupt ewah bitmap: commit index %u out of range", + (unsigned)commit_idx_pos); bitmap = read_bitmap_1(index); if (!bitmap) -- cgit v1.2.3 From 089f751360f62545a8fa836ad8132eb1260d2723 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 8 Dec 2020 17:04:30 -0500 Subject: pack-bitmap-write: build fewer intermediate bitmaps The bitmap_writer_build() method calls bitmap_builder_init() to construct a list of commits reachable from the selected commits along with a "reverse graph". This reverse graph has edges pointing from a commit to other commits that can reach that commit. After computing a reachability bitmap for a commit, the values in that bitmap are then copied to the reachability bitmaps across the edges in the reverse graph. We can now relax the role of the reverse graph to greatly reduce the number of intermediate reachability bitmaps we compute during this reverse walk. The end result is that we walk objects the same number of times as before when constructing the reachability bitmaps, but we also spend much less time copying bits between bitmaps and have much lower memory pressure in the process. The core idea is to select a set of "important" commits based on interactions among the sets of commits reachable from each selected commit. The first technical concept is to create a new 'commit_mask' member in the bb_commit struct. Note that the selected commits are provided in an ordered array. The first thing to do is to mark the ith bit in the commit_mask for the ith selected commit. As we walk the commit-graph, we copy the bits in a commit's commit_mask to its parents. At the end of the walk, the ith bit in the commit_mask for a commit C stores a boolean representing "The ith selected commit can reach C." As we walk, we will discover non-selected commits that are important. We will get into this later, but those important commits must also receive bit positions, growing the width of the bitmasks as we walk. At the true end of the walk, the ith bit means "the ith _important_ commit can reach C." MAXIMAL COMMITS --------------- We use a new 'maximal' bit in the bb_commit struct to represent whether a commit is important or not.
The term "maximal" comes from the partially-ordered set of commits in the commit-graph where C >= P if P is a parent of C, and then extending the relationship transitively. Instead of taking the maximal commits across the entire commit-graph, we instead focus on selecting each commit that is maximal among commits with the same bits on in their commit_mask. This definition is important, so let's consider an example. Suppose we have three selected commits A, B, and C. These are assigned bitmasks 100, 010, and 001 to start. Each of these can be marked as maximal immediately because they each will be the uniquely maximal commit that contains their own bit. Keep in mind that these commits may have different bitmasks after the walk; for example, if B can reach C but A cannot, then the final bitmask for C is 011. Even in these cases, C would still be a maximal commit among all commits with the third bit on in their masks. Now define sets X, Y, and Z to be the sets of commits reachable from A, B, and C, respectively. The intersections of these sets correspond to different bitmasks: * 100: X - (Y union Z) * 010: Y - (X union Z) * 001: Z - (X union Y) * 110: (X intersect Y) - Z * 101: (X intersect Z) - Y * 011: (Y intersect Z) - X * 111: X intersect Y intersect Z This can be visualized with the following Hasse diagram: 100 010 001 | \ / \ / | | \/ \/ | | /\ /\ | | / \ / \ | 110 101 011 \___ | ___/ \ | / 111 Some of these bitmasks may not be represented, depending on the topology of the commit-graph. In fact, we are counting on it, since the number of possible bitmasks is exponential in the number of selected commits, but is also limited by the total number of commits. In practice, very few bitmasks are possible because most commits converge on a common "trunk" in the commit history. With this three-bit example, we wish to find commits that are maximal for each bitmask. How can we identify this as we are walking? As we walk, we visit a commit C. Since we are walking the commits in topo-order, we know that C is visited after all of its children are visited. Thus, when we get C from the revision walk we inspect the 'maximal' property of its bb_data and use that to determine if C is truly important. Its commit_mask is also nearly final. If C is not one of the originally-selected commits, then assign a bit position to C (by incrementing num_maximal) and set that bit on in commit_mask. See "MULTIPLE MAXIMAL COMMITS" below for more detail on this. Now that the commit C is known to be maximal or not, consider each parent P of C. Compute two new values: * c_not_p : true if and only if the commit_mask for C contains a bit that is not contained in the commit_mask for P. * p_not_c : true if and only if the commit_mask for P contains a bit that is not contained in the commit_mask for C. If c_not_p is false, then P already has all of the bits that C would provide to its commit_mask. In this case, move on to other parents as C has nothing to contribute to P's state that was not already provided by other children of P. We continue with the case that c_not_p is true. This means there are bits in C's commit_mask to copy to P's commit_mask, so use bitmap_or() to add those bits. If p_not_c is also true, then set the maximal bit for P to one. This means that if no other commit has P as a parent, then P is definitely maximal. This is because no child had the same bitmask. It is important to think about the maximal bit for P at this point as a temporary state: "P is maximal based on current information."
In contrast, if p_not_c is false, then set the maximal bit for P to zero. Further, clear all reverse_edges for P since any edges that were previously assigned to P are no longer important. P will gain all reverse edges based on C. The final thing we need to do is to update the reverse edges for P. These reverse edges represent "which closest maximal commits contributed bits to my commit_mask?" Since C contributed bits to P's commit_mask in this case, C must add to the reverse edges of P. If C is maximal, then C is a 'closest' maximal commit that contributed bits to P. Add C to P's reverse_edges list. Otherwise, C has a list of maximal commits that contributed bits to its bitmask (and this list is exactly one element). Add all of these items to P's reverse_edges list. Be careful to ignore duplicates here. After inspecting all parents P for a commit C, we can clear the commit_mask for C. This reduces the memory load to be limited to the "width" of the commit graph. Consider our ABC/XYZ example from earlier and let's inspect the state of the commits for an interesting bitmask, say 011. Suppose that D is the only maximal commit with this bitmask (in the first three bits). All other commits with bitmask 011 have D as the only entry in their reverse_edges list. D's reverse_edges list contains B and C. COMPUTING REACHABILITY BITMAPS ------------------------------ Now that we have our definition, let's zoom out and consider what happens with our new reverse graph when computing reachability bitmaps. We walk the reverse graph in reverse-topo-order, so we visit commits with largest commit_masks first. After we compute the reachability bitmap for a commit C, we push the bits in that bitmap to each commit D in the reverse edge list for C. Then, when we finally visit D we already have the bits for everything reachable from maximal commits that D can reach and we only need to walk the objects in the set-difference. In our ABC/XYZ example, when we finally walk for the commit A we only need to walk commits with bitmask equal to A's bitmask. If that bitmask is 100, then we are only walking commits in X - (Y union Z) because the bitmap already contains the bits for objects reachable from (X intersect Y) union (X intersect Z) (i.e. the bits from the reachability bitmaps for the maximal commits with bitmasks 110 and 101). The behavior is intended to walk each commit (and the trees that commit introduces) at most once while allocating and copying fewer reachability bitmaps. There is one caveat: what happens when there are multiple maximal commits with the same bitmask, with respect to the initial set of selected commits? MULTIPLE MAXIMAL COMMITS ------------------------ Earlier, we mentioned that when we discover a new maximal commit, we assign a new bit position to that commit and set that bit position to one for that commit. This is absolutely important for interesting commit-graphs such as git/git and torvalds/linux. The reason is due to the existence of "butterflies" in the commit-graph partial order. Here is an example of four commits forming a butterfly: I J |\ /| | \/ | | /\ | |/ \| M N \ / |/ Q Here, I and J both have parents M and N. In general, these do not need to be exact parent relationships, but reachability relationships. The most important part is that M and N cannot reach each other, so they are independent in the partial order. If I had commit_mask 10 and J had commit_mask 01, then M and N would both be assigned commit_mask 11 and be maximal commits with the bitmask 11.
Then, what happens when M and N can both reach a commit Q? If Q is also
assigned the bitmask 11, then it is not maximal but is reachable from both M
and N. While this is not necessarily a deal-breaker for our abstract
definition of finding maximal commits according to a given bitmask, we have a
few issues that can come up in our larger picture of constructing
reachability bitmaps.

In particular, if we do not also consider Q to be a "maximal" commit, then we
will walk commits reachable from Q twice: once when computing the
reachability bitmap for M and another time when computing the reachability
bitmap for N. This becomes much worse if the topology continues this pattern
with multiple butterflies.

The solution has already been mentioned: each of M and N is assigned its own
bit in the bitmask, and hence each becomes uniquely maximal for its bitmask.
Finally, Q also becomes maximal and thus we do not need to walk its commits
multiple times. The final bitmasks for these commits are as follows:

        I:10     J:01
         |\       /|
         | \     / |
         |  \   /  |
         |   \ /   |
         |    X    |
         |   / \   |
         |  /   \  |
         | /     \ |
         |/       \|
        M:111    N:1101
             \    /
             Q:1111

Further, Q's reverse edge list is { M, N }, while M and N both have reverse
edge list { I, J }.

PERFORMANCE MEASUREMENTS
------------------------

Now that we've spent a LOT of time on the theory of this algorithm, let's
show that this is actually worth all that effort.

To test the performance, use GIT_TRACE2_PERF=1 when running 'git repack -abd'
in a repository with no existing reachability bitmaps. This avoids having any
existing bitmaps skew the numbers. Inspect the "building_bitmaps_total"
region in the trace2 output to focus on the portion of work that is affected
by this change.

Here are the performance comparisons for a few repositories. The timings are
for the following versions of Git: "multi" is the timing from before any
reverse graph is constructed, where we might perform multiple traversals.
"reverse" is for the previous change where the reverse graph has every
reachable commit. Finally, "maximal" is the version introduced here where the
reverse graph only contains the maximal commits.

        Repository: git/git
         multi:   2.628 sec
         reverse: 2.344 sec
         maximal: 2.047 sec

        Repository: torvalds/linux
         multi:    64.7 sec
         reverse: 205.3 sec
         maximal:  44.7 sec

So we've not only recovered any time lost to switching to the reverse-edge
algorithm, we also come out ahead of "multi" in all cases. Likewise, peak
heap has gone back to something reasonable:

        Repository: torvalds/linux
         multi:   2.087 GB
         reverse: 3.141 GB
         maximal: 2.288 GB

While I do not have access to full fork networks on GitHub, Peff has run this
algorithm on the chromium/chromium fork network and reported a change from 3
hours to ~233 seconds. That network benefits particularly from this approach
because it has a long, linear history along with many tags. The "multi"
approach was obviously quadratic and the new approach is linear.
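For completeness, the fill phase described under COMPUTING REACHABILITY
BITMAPS can be sketched in the same simplified style. Again this is
illustration only with made-up names; the real implementation operates on
'struct bb_commit' and git's bitmap types, and walk_new_objects() below is a
hypothetical placeholder for the commit/tree walk.

    struct sketch_bb_commit {
            struct sketch_bb_commit **reverse_edges; /* closest maximal commits that can reach us */
            size_t reverse_nr;
            unsigned long reach;    /* toy stand-in for the object reachability bitmap */
    };

    /* hypothetical: bits for objects not already covered by c->reach */
    unsigned long walk_new_objects(struct sketch_bb_commit *c);

    /* 'commits' is in topo-order, newest first; visit oldest first */
    static void fill_phase(struct sketch_bb_commit **commits, size_t nr)
    {
            size_t i, j;

            for (i = nr; i > 0; i--) {
                    struct sketch_bb_commit *c = commits[i - 1];

                    /* only the set-difference remains to be walked */
                    c->reach |= walk_new_objects(c);

                    /* push our bits to the commits that can reach us */
                    for (j = 0; j < c->reverse_nr; j++)
                            c->reverse_edges[j]->reach |= c->reach;
            }
    }

Because bits are pushed along reverse edges before their targets are visited,
this is how the pass aims to cover each commit's new objects only once while
holding few full bitmaps at a time.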
Helped-by: Jeff King Signed-off-by: Derrick Stolee Helped-by: Johannes Schindelin Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap-write.c | 72 +++++++++++++++++++++++++++++++++++++---- t/t5310-pack-bitmaps.sh | 85 +++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 148 insertions(+), 9 deletions(-) diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index 7e218d02a6..0af93193d8 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -180,8 +180,10 @@ static void compute_xor_offsets(void) struct bb_commit { struct commit_list *reverse_edges; + struct bitmap *commit_mask; struct bitmap *bitmap; - unsigned selected:1; + unsigned selected:1, + maximal:1; unsigned idx; /* within selected array */ }; @@ -198,7 +200,7 @@ static void bitmap_builder_init(struct bitmap_builder *bb, { struct rev_info revs; struct commit *commit; - unsigned int i; + unsigned int i, num_maximal; memset(bb, 0, sizeof(*bb)); init_bb_data(&bb->data); @@ -210,27 +212,85 @@ static void bitmap_builder_init(struct bitmap_builder *bb, for (i = 0; i < writer->selected_nr; i++) { struct commit *c = writer->selected[i].commit; struct bb_commit *ent = bb_data_at(&bb->data, c); + ent->selected = 1; + ent->maximal = 1; ent->idx = i; + + ent->commit_mask = bitmap_new(); + bitmap_set(ent->commit_mask, i); + add_pending_object(&revs, &c->object, ""); } + num_maximal = writer->selected_nr; if (prepare_revision_walk(&revs)) die("revision walk setup failed"); while ((commit = get_revision(&revs))) { struct commit_list *p; + struct bb_commit *c_ent; parse_commit_or_die(commit); - ALLOC_GROW(bb->commits, bb->commits_nr + 1, bb->commits_alloc); - bb->commits[bb->commits_nr++] = commit; + c_ent = bb_data_at(&bb->data, commit); + + if (c_ent->maximal) { + if (!c_ent->selected) { + bitmap_set(c_ent->commit_mask, num_maximal); + num_maximal++; + } + + ALLOC_GROW(bb->commits, bb->commits_nr + 1, bb->commits_alloc); + bb->commits[bb->commits_nr++] = commit; + } for (p = commit->parents; p; p = p->next) { - struct bb_commit *ent = bb_data_at(&bb->data, p->item); - commit_list_insert(commit, &ent->reverse_edges); + struct bb_commit *p_ent = bb_data_at(&bb->data, p->item); + int c_not_p, p_not_c; + + if (!p_ent->commit_mask) { + p_ent->commit_mask = bitmap_new(); + c_not_p = 1; + p_not_c = 0; + } else { + c_not_p = bitmap_is_subset(c_ent->commit_mask, p_ent->commit_mask); + p_not_c = bitmap_is_subset(p_ent->commit_mask, c_ent->commit_mask); + } + + if (!c_not_p) + continue; + + bitmap_or(p_ent->commit_mask, c_ent->commit_mask); + + if (p_not_c) + p_ent->maximal = 1; + else { + p_ent->maximal = 0; + free_commit_list(p_ent->reverse_edges); + p_ent->reverse_edges = NULL; + } + + if (c_ent->maximal) { + commit_list_insert(commit, &p_ent->reverse_edges); + } else { + struct commit_list *cc = c_ent->reverse_edges; + + for (; cc; cc = cc->next) { + if (!commit_list_contains(cc->item, p_ent->reverse_edges)) + commit_list_insert(cc->item, &p_ent->reverse_edges); + } + } } + + bitmap_free(c_ent->commit_mask); + c_ent->commit_mask = NULL; } + + trace2_data_intmax("pack-bitmap-write", the_repository, + "num_selected_commits", writer->selected_nr); + trace2_data_intmax("pack-bitmap-write", the_repository, + "num_maximal_commits", num_maximal); } static void bitmap_builder_clear(struct bitmap_builder *bb) diff --git a/t/t5310-pack-bitmaps.sh b/t/t5310-pack-bitmaps.sh index 1dd9723721..7ba796b2c3 100755 --- a/t/t5310-pack-bitmaps.sh +++ b/t/t5310-pack-bitmaps.sh @@ -20,12 +20,88 @@ has_any () { grep -Ff "$1" "$2" } 
+# To ensure the logic for "maximal commits" is exercised, make +# the repository a bit more complicated. +# +# other master +# * * +# (99 commits) (99 commits) +# * * +# |\ /| +# | * octo-other octo-master * | +# |/|\_________ ____________/|\| +# | \ \/ __________/ | +# | | ________/\ / | +# * |/ * merge-right * +# | _|__________/ \____________ | +# |/ | \| +# (l1) * * merge-left * (r1) +# | / \________________________ | +# |/ \| +# (l2) * * (r2) +# \___________________________ | +# \| +# * (base) +# +# The important part for the maximal commit algorithm is how +# the bitmasks are extended. Assuming starting bit positions +# for master (bit 0) and other (bit 1), and some flexibility +# in the order that merge bases are visited, the bitmasks at +# the end should be: +# +# master: 1 (maximal, selected) +# other: 01 (maximal, selected) +# octo-master: 1 +# octo-other: 01 +# merge-right: 111 (maximal) +# (l1): 111 +# (r1): 111 +# merge-left: 1101 (maximal) +# (l2): 11111 (maximal) +# (r2): 111101 (maximal) +# (base): 1111111 (maximal) + test_expect_success 'setup repo with moderate-sized history' ' - test_commit_bulk --id=file 100 && + test_commit_bulk --id=file 10 && git branch -M second && git checkout -b other HEAD~5 && test_commit_bulk --id=side 10 && + + # add complicated history setup, including merges and + # ambiguous merge-bases + + git checkout -b merge-left other~2 && + git merge second~2 -m "merge-left" && + + git checkout -b merge-right second~1 && + git merge other~1 -m "merge-right" && + + git checkout -b octo-second second && + git merge merge-left merge-right -m "octopus-second" && + + git checkout -b octo-other other && + git merge merge-left merge-right -m "octopus-other" && + + git checkout other && + git merge octo-other -m "pull octopus" && + + git checkout second && + git merge octo-second -m "pull octopus" && + + # Remove these branches so they are not selected + # as bitmap tips + git branch -D merge-left && + git branch -D merge-right && + git branch -D octo-other && + git branch -D octo-second && + + # add padding to make these merges less interesting + # and avoid having them selected for bitmaps + test_commit_bulk --id=file 100 && + git checkout other && + test_commit_bulk --id=side 100 && git checkout second && + bitmaptip=$(git rev-parse second) && blob=$(echo tagged-blob | git hash-object -w --stdin) && git tag tagged-blob $blob && @@ -33,9 +109,12 @@ test_expect_success 'setup repo with moderate-sized history' ' ' test_expect_success 'full repack creates bitmaps' ' - git repack -ad && + GIT_TRACE2_EVENT_NESTING=4 GIT_TRACE2_EVENT="$(pwd)/trace" \ + git repack -ad && ls .git/objects/pack/ | grep bitmap >output && - test_line_count = 1 output + test_line_count = 1 output && + grep "\"key\":\"num_selected_commits\",\"value\":\"106\"" trace && + grep "\"key\":\"num_maximal_commits\",\"value\":\"111\"" trace ' test_expect_success 'rev-list --test-bitmap verifies bitmaps' ' -- cgit v1.2.3 From 449fa5ee06906ca6d109e06b14cb4f8ea60a6c88 Mon Sep 17 00:00:00 2001 From: Jeff King Date: Tue, 8 Dec 2020 17:04:34 -0500 Subject: pack-bitmap-write: ignore BITMAP_FLAG_REUSE The on-disk bitmap format has a flag to mark a bitmap to be "reused". 
This is a rather curious feature, and works like this: - a run of pack-objects would decide to mark the last 80% of the bitmaps it generates with the reuse flag - the next time we generate bitmaps, we'd see those reuse flags from the last run, and mark those commits as special: - we'd be more likely to select those commits to get bitmaps in the new output - when generating the bitmap for a selected commit, we'd reuse the old bitmap as-is (rearranging the bits to match the new pack, of course) However, neither of these behaviors particularly makes sense. Just because a commit happened to be bitmapped last time does not make it a good candidate for having a bitmap this time. In particular, we may choose bitmaps based on how recent they are in history, or whether a ref tip points to them, and those things will change. We're better off re-considering fresh which commits are good candidates. Reusing the existing bitmap _is_ a reasonable thing to do to save computation. But only reusing exact bitmaps is a weak form of this. If we have an old bitmap for A and now want a new bitmap for its child, we should be able to compute that only by looking at trees and that are new to the child. But this code would consider only exact reuse (which is perhaps why it was eager to select those commits in the first place). Furthermore, the recent switch to the reverse-edge algorithm for generating bitmaps dropped this optimization entirely (and yet still performs better). So let's do a few cleanups: - drop the whole "reusing bitmaps" phase of generating bitmaps. It's not helping anything, and is mostly unused code (or worse, code that is using CPU but not doing anything useful) - drop the use of the on-disk reuse flag to select commits to bitmap - stop setting the on-disk reuse flag in bitmaps we generate (since nothing respects it anymore) We will keep a few innards of the reuse code, which will help us implement a more capable version of the "reuse" optimization: - simplify rebuild_existing_bitmaps() into a function that only builds the mapping of bits between the old and new orders, but doesn't actually convert any bitmaps - make rebuild_bitmap() public; we'll call it lazily to convert bitmaps as we traverse (using the mapping created above) Signed-off-by: Jeff King Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 1 - pack-bitmap-write.c | 50 +++++--------------------------------------------- pack-bitmap.c | 46 ++++++---------------------------------------- pack-bitmap.h | 6 +++++- 4 files changed, 16 insertions(+), 87 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 5617c01b5a..2a00358f34 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -1104,7 +1104,6 @@ static void write_pack_file(void) stop_progress(&progress_state); bitmap_writer_show_progress(progress); - bitmap_writer_reuse_bitmaps(&to_pack); bitmap_writer_select_commits(indexed_commits, indexed_commits_nr, -1); bitmap_writer_build(&to_pack); bitmap_writer_finish(written_list, nr_written, diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index 0af93193d8..333058854d 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -30,7 +30,6 @@ struct bitmap_writer { struct ewah_bitmap *tags; kh_oid_map_t *bitmaps; - kh_oid_map_t *reused; struct packing_data *to_pack; struct bitmapped_commit *selected; @@ -112,7 +111,7 @@ void bitmap_writer_build_type_index(struct packing_data *to_pack, * Compute the actual bitmaps */ -static inline void 
push_bitmapped_commit(struct commit *commit, struct ewah_bitmap *reused) +static inline void push_bitmapped_commit(struct commit *commit) { if (writer.selected_nr >= writer.selected_alloc) { writer.selected_alloc = (writer.selected_alloc + 32) * 2; @@ -120,7 +119,7 @@ static inline void push_bitmapped_commit(struct commit *commit, struct ewah_bitm } writer.selected[writer.selected_nr].commit = commit; - writer.selected[writer.selected_nr].bitmap = reused; + writer.selected[writer.selected_nr].bitmap = NULL; writer.selected[writer.selected_nr].flags = 0; writer.selected_nr++; @@ -372,13 +371,6 @@ static void store_selected(struct bb_commit *ent, struct commit *commit) khiter_t hash_pos; int hash_ret; - /* - * the "reuse bitmaps" phase may have stored something here, but - * our new algorithm doesn't use it. Drop it. - */ - if (stored->bitmap) - ewah_free(stored->bitmap); - stored->bitmap = bitmap_to_ewah(ent->bitmap); hash_pos = kh_put_oid_map(writer.bitmaps, commit->object.oid, &hash_ret); @@ -480,35 +472,6 @@ static int date_compare(const void *_a, const void *_b) return (long)b->date - (long)a->date; } -void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack) -{ - struct bitmap_index *bitmap_git; - if (!(bitmap_git = prepare_bitmap_git(to_pack->repo))) - return; - - writer.reused = kh_init_oid_map(); - rebuild_existing_bitmaps(bitmap_git, to_pack, writer.reused, - writer.show_progress); - /* - * NEEDSWORK: rebuild_existing_bitmaps() makes writer.reused reference - * some bitmaps in bitmap_git, so we can't free the latter. - */ -} - -static struct ewah_bitmap *find_reused_bitmap(const struct object_id *oid) -{ - khiter_t hash_pos; - - if (!writer.reused) - return NULL; - - hash_pos = kh_get_oid_map(writer.reused, *oid); - if (hash_pos >= kh_end(writer.reused)) - return NULL; - - return kh_value(writer.reused, hash_pos); -} - void bitmap_writer_select_commits(struct commit **indexed_commits, unsigned int indexed_commits_nr, int max_bitmaps) @@ -522,12 +485,11 @@ void bitmap_writer_select_commits(struct commit **indexed_commits, if (indexed_commits_nr < 100) { for (i = 0; i < indexed_commits_nr; ++i) - push_bitmapped_commit(indexed_commits[i], NULL); + push_bitmapped_commit(indexed_commits[i]); return; } for (;;) { - struct ewah_bitmap *reused_bitmap = NULL; struct commit *chosen = NULL; next = next_commit_index(i); @@ -542,15 +504,13 @@ void bitmap_writer_select_commits(struct commit **indexed_commits, if (next == 0) { chosen = indexed_commits[i]; - reused_bitmap = find_reused_bitmap(&chosen->object.oid); } else { chosen = indexed_commits[i + next]; for (j = 0; j <= next; ++j) { struct commit *cm = indexed_commits[i + j]; - reused_bitmap = find_reused_bitmap(&cm->object.oid); - if (reused_bitmap || (cm->object.flags & NEEDS_BITMAP) != 0) { + if ((cm->object.flags & NEEDS_BITMAP) != 0) { chosen = cm; break; } @@ -560,7 +520,7 @@ void bitmap_writer_select_commits(struct commit **indexed_commits, } } - push_bitmapped_commit(chosen, reused_bitmap); + push_bitmapped_commit(chosen); i += next + 1; display_progress(writer.progress, i); diff --git a/pack-bitmap.c b/pack-bitmap.c index 60c781d100..d1368b69bb 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -1338,9 +1338,9 @@ void test_bitmap_walk(struct rev_info *revs) free_bitmap_index(bitmap_git); } -static int rebuild_bitmap(uint32_t *reposition, - struct ewah_bitmap *source, - struct bitmap *dest) +int rebuild_bitmap(const uint32_t *reposition, + struct ewah_bitmap *source, + struct bitmap *dest) { uint32_t pos = 0; struct ewah_iterator 
it; @@ -1369,19 +1369,11 @@ static int rebuild_bitmap(uint32_t *reposition, return 0; } -int rebuild_existing_bitmaps(struct bitmap_index *bitmap_git, - struct packing_data *mapping, - kh_oid_map_t *reused_bitmaps, - int show_progress) +uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git, + struct packing_data *mapping) { uint32_t i, num_objects; uint32_t *reposition; - struct bitmap *rebuild; - struct stored_bitmap *stored; - struct progress *progress = NULL; - - khiter_t hash_pos; - int hash_ret; num_objects = bitmap_git->pack->num_objects; reposition = xcalloc(num_objects, sizeof(uint32_t)); @@ -1399,33 +1391,7 @@ int rebuild_existing_bitmaps(struct bitmap_index *bitmap_git, reposition[i] = oe_in_pack_pos(mapping, oe) + 1; } - rebuild = bitmap_new(); - i = 0; - - if (show_progress) - progress = start_progress("Reusing bitmaps", 0); - - kh_foreach_value(bitmap_git->bitmaps, stored, { - if (stored->flags & BITMAP_FLAG_REUSE) { - if (!rebuild_bitmap(reposition, - lookup_stored_bitmap(stored), - rebuild)) { - hash_pos = kh_put_oid_map(reused_bitmaps, - stored->oid, - &hash_ret); - kh_value(reused_bitmaps, hash_pos) = - bitmap_to_ewah(rebuild); - } - bitmap_reset(rebuild); - display_progress(progress, ++i); - } - }); - - stop_progress(&progress); - - free(reposition); - bitmap_free(rebuild); - return 0; + return reposition; } void free_bitmap_index(struct bitmap_index *b) diff --git a/pack-bitmap.h b/pack-bitmap.h index 1203120c43..afa4115136 100644 --- a/pack-bitmap.h +++ b/pack-bitmap.h @@ -73,7 +73,11 @@ void bitmap_writer_set_checksum(unsigned char *sha1); void bitmap_writer_build_type_index(struct packing_data *to_pack, struct pack_idx_entry **index, uint32_t index_nr); -void bitmap_writer_reuse_bitmaps(struct packing_data *to_pack); +uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git, + struct packing_data *mapping); +int rebuild_bitmap(const uint32_t *reposition, + struct ewah_bitmap *source, + struct bitmap *dest); void bitmap_writer_select_commits(struct commit **indexed_commits, unsigned int indexed_commits_nr, int max_bitmaps); void bitmap_writer_build(struct packing_data *to_pack); -- cgit v1.2.3 From 98c31f366a1770fb7ea04125ff2d8b1ea1f7d0d7 Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 8 Dec 2020 17:05:09 -0500 Subject: pack-bitmap: factor out 'bitmap_for_commit()' A couple of callers within pack-bitmap.c duplicate logic to lookup a given object id in the bitamps khash. Factor this out into a new function, 'bitmap_for_commit()' to reduce some code duplication. Make this new function non-static, since it will be used in later commits from outside of pack-bitmap.c. 
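For illustration, a caller might wrap the new helper roughly like this (a
sketch with hypothetical names; the actual call sites appear in the diff
below and in later patches):

    /* Sketch: OR a commit's stored bitmap, if any, into 'result'. */
    static int or_stored_bitmap(struct bitmap_index *bitmap_git,
                                struct commit *commit,
                                struct bitmap *result)
    {
            struct ewah_bitmap *bm = bitmap_for_commit(bitmap_git, commit);

            if (!bm)
                    return 0;
            bitmap_or_ewah(result, bm);
            return 1;
    }

The khash probe and the lookup_stored_bitmap() call stay hidden behind
bitmap_for_commit(), which is exactly the duplication this patch removes.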
Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap.c | 33 +++++++++++++++++++-------------- pack-bitmap.h | 2 ++ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/pack-bitmap.c b/pack-bitmap.c index d1368b69bb..5efb8af121 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -380,6 +380,16 @@ struct include_data { struct bitmap *seen; }; +struct ewah_bitmap *bitmap_for_commit(struct bitmap_index *bitmap_git, + struct commit *commit) +{ + khiter_t hash_pos = kh_get_oid_map(bitmap_git->bitmaps, + commit->object.oid); + if (hash_pos >= kh_end(bitmap_git->bitmaps)) + return NULL; + return lookup_stored_bitmap(kh_value(bitmap_git->bitmaps, hash_pos)); +} + static inline int bitmap_position_extended(struct bitmap_index *bitmap_git, const struct object_id *oid) { @@ -465,10 +475,10 @@ static void show_commit(struct commit *commit, void *data) static int add_to_include_set(struct bitmap_index *bitmap_git, struct include_data *data, - const struct object_id *oid, + struct commit *commit, int bitmap_pos) { - khiter_t hash_pos; + struct ewah_bitmap *partial; if (data->seen && bitmap_get(data->seen, bitmap_pos)) return 0; @@ -476,10 +486,9 @@ static int add_to_include_set(struct bitmap_index *bitmap_git, if (bitmap_get(data->base, bitmap_pos)) return 0; - hash_pos = kh_get_oid_map(bitmap_git->bitmaps, *oid); - if (hash_pos < kh_end(bitmap_git->bitmaps)) { - struct stored_bitmap *st = kh_value(bitmap_git->bitmaps, hash_pos); - bitmap_or_ewah(data->base, lookup_stored_bitmap(st)); + partial = bitmap_for_commit(bitmap_git, commit); + if (partial) { + bitmap_or_ewah(data->base, partial); return 0; } @@ -498,8 +507,7 @@ static int should_include(struct commit *commit, void *_data) (struct object *)commit, NULL); - if (!add_to_include_set(data->bitmap_git, data, &commit->object.oid, - bitmap_pos)) { + if (!add_to_include_set(data->bitmap_git, data, commit, bitmap_pos)) { struct commit_list *parent = commit->parents; while (parent) { @@ -1282,10 +1290,10 @@ void test_bitmap_walk(struct rev_info *revs) { struct object *root; struct bitmap *result = NULL; - khiter_t pos; size_t result_popcnt; struct bitmap_test_data tdata; struct bitmap_index *bitmap_git; + struct ewah_bitmap *bm; if (!(bitmap_git = prepare_bitmap_git(revs->repo))) die("failed to load bitmap indexes"); @@ -1297,12 +1305,9 @@ void test_bitmap_walk(struct rev_info *revs) bitmap_git->version, bitmap_git->entry_count); root = revs->pending.objects[0].item; - pos = kh_get_oid_map(bitmap_git->bitmaps, root->oid); - - if (pos < kh_end(bitmap_git->bitmaps)) { - struct stored_bitmap *st = kh_value(bitmap_git->bitmaps, pos); - struct ewah_bitmap *bm = lookup_stored_bitmap(st); + bm = bitmap_for_commit(bitmap_git, (struct commit *)root); + if (bm) { fprintf(stderr, "Found bitmap for %s. 
%d bits / %08x checksum\n", oid_to_hex(&root->oid), (int)bm->bit_size, ewah_checksum(bm)); diff --git a/pack-bitmap.h b/pack-bitmap.h index afa4115136..25dfcf5615 100644 --- a/pack-bitmap.h +++ b/pack-bitmap.h @@ -78,6 +78,8 @@ uint32_t *create_bitmap_mapping(struct bitmap_index *bitmap_git, int rebuild_bitmap(const uint32_t *reposition, struct ewah_bitmap *source, struct bitmap *dest); +struct ewah_bitmap *bitmap_for_commit(struct bitmap_index *bitmap_git, + struct commit *commit); void bitmap_writer_select_commits(struct commit **indexed_commits, unsigned int indexed_commits_nr, int max_bitmaps); void bitmap_writer_build(struct packing_data *to_pack); -- cgit v1.2.3 From 83578051a9c4eca4e6bcd44687e84915e1064bba Mon Sep 17 00:00:00 2001 From: Taylor Blau Date: Tue, 8 Dec 2020 17:05:14 -0500 Subject: pack-bitmap: factor out 'add_commit_to_bitmap()' 'find_objects()' currently needs to interact with the bitmaps khash pretty closely. To make 'find_objects()' read a little more straightforwardly, remove some of the khash-level details into a new function that describes what it does: 'add_commit_to_bitmap()'. Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/pack-bitmap.c b/pack-bitmap.c index 5efb8af121..d88745fb02 100644 --- a/pack-bitmap.c +++ b/pack-bitmap.c @@ -521,6 +521,23 @@ static int should_include(struct commit *commit, void *_data) return 1; } +static int add_commit_to_bitmap(struct bitmap_index *bitmap_git, + struct bitmap **base, + struct commit *commit) +{ + struct ewah_bitmap *or_with = bitmap_for_commit(bitmap_git, commit); + + if (!or_with) + return 0; + + if (*base == NULL) + *base = ewah_to_bitmap(or_with); + else + bitmap_or_ewah(*base, or_with); + + return 1; +} + static struct bitmap *find_objects(struct bitmap_index *bitmap_git, struct rev_info *revs, struct object_list *roots, @@ -544,21 +561,10 @@ static struct bitmap *find_objects(struct bitmap_index *bitmap_git, struct object *object = roots->item; roots = roots->next; - if (object->type == OBJ_COMMIT) { - khiter_t pos = kh_get_oid_map(bitmap_git->bitmaps, object->oid); - - if (pos < kh_end(bitmap_git->bitmaps)) { - struct stored_bitmap *st = kh_value(bitmap_git->bitmaps, pos); - struct ewah_bitmap *or_with = lookup_stored_bitmap(st); - - if (base == NULL) - base = ewah_to_bitmap(or_with); - else - bitmap_or_ewah(base, or_with); - - object->flags |= SEEN; - continue; - } + if (object->type == OBJ_COMMIT && + add_commit_to_bitmap(bitmap_git, &base, (struct commit *)object)) { + object->flags |= SEEN; + continue; } object_list_insert(object, ¬_mapped); -- cgit v1.2.3 From 341fa34887630a7adf6b3a771ae866ce66e7967e Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 8 Dec 2020 17:05:21 -0500 Subject: pack-bitmap-write: use existing bitmaps When constructing new bitmaps, we perform a commit and tree walk in fill_bitmap_commit() and fill_bitmap_tree(). This walk would benefit from using existing bitmaps when available. We must track the existing bitmaps and translate them into the new object order, but this is generally faster than parsing trees. In fill_bitmap_commit(), we must reorder thing somewhat. The priority queue walks commits from newest-to-oldest, which means we correctly stop walking when reaching a commit with a bitmap. However, if we walk trees interleaved with the commits, then we might be parsing trees that are actually part of a re-used bitmap. 
To avoid over-walking trees, add them to a LIFO queue and walk them after exploring commits completely. On git.git, this reduces a second immediate bitmap computation from 2.0s to 1.0s. On linux.git, we go from 32s to 22s. On chromium's fork network, we go from 227s to 198s. Signed-off-by: Derrick Stolee Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap-write.c | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index 333058854d..76c8236f94 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -340,20 +340,37 @@ static void fill_bitmap_tree(struct bitmap *bitmap, static void fill_bitmap_commit(struct bb_commit *ent, struct commit *commit, - struct prio_queue *queue) + struct prio_queue *queue, + struct prio_queue *tree_queue, + struct bitmap_index *old_bitmap, + const uint32_t *mapping) { if (!ent->bitmap) ent->bitmap = bitmap_new(); - bitmap_set(ent->bitmap, find_object_pos(&commit->object.oid)); prio_queue_put(queue, commit); while (queue->nr) { struct commit_list *p; struct commit *c = prio_queue_get(queue); + if (old_bitmap && mapping) { + struct ewah_bitmap *old = bitmap_for_commit(old_bitmap, c); + /* + * If this commit has an old bitmap, then translate that + * bitmap and add its bits to this one. No need to walk + * parents or the tree for this commit. + */ + if (old && !rebuild_bitmap(mapping, old, ent->bitmap)) + continue; + } + + /* + * Mark ourselves and queue our tree. The commit + * walk ensures we cover all parents. + */ bitmap_set(ent->bitmap, find_object_pos(&c->object.oid)); - fill_bitmap_tree(ent->bitmap, get_commit_tree(c)); + prio_queue_put(tree_queue, get_commit_tree(c)); for (p = c->parents; p; p = p->next) { int pos = find_object_pos(&p->item->object.oid); @@ -363,6 +380,9 @@ static void fill_bitmap_commit(struct bb_commit *ent, } } } + + while (tree_queue->nr) + fill_bitmap_tree(ent->bitmap, prio_queue_get(tree_queue)); } static void store_selected(struct bb_commit *ent, struct commit *commit) @@ -386,6 +406,9 @@ void bitmap_writer_build(struct packing_data *to_pack) size_t i; int nr_stored = 0; /* for progress */ struct prio_queue queue = { compare_commits_by_gen_then_commit_date }; + struct prio_queue tree_queue = { NULL }; + struct bitmap_index *old_bitmap; + uint32_t *mapping; writer.bitmaps = kh_init_oid_map(); writer.to_pack = to_pack; @@ -395,6 +418,12 @@ void bitmap_writer_build(struct packing_data *to_pack) trace2_region_enter("pack-bitmap-write", "building_bitmaps_total", the_repository); + old_bitmap = prepare_bitmap_git(to_pack->repo); + if (old_bitmap) + mapping = create_bitmap_mapping(old_bitmap, to_pack); + else + mapping = NULL; + bitmap_builder_init(&bb, &writer); for (i = bb.commits_nr; i > 0; i--) { struct commit *commit = bb.commits[i-1]; @@ -402,7 +431,8 @@ void bitmap_writer_build(struct packing_data *to_pack) struct commit *child; int reused = 0; - fill_bitmap_commit(ent, commit, &queue); + fill_bitmap_commit(ent, commit, &queue, &tree_queue, + old_bitmap, mapping); if (ent->selected) { store_selected(ent, commit); @@ -428,7 +458,9 @@ void bitmap_writer_build(struct packing_data *to_pack) ent->bitmap = NULL; } clear_prio_queue(&queue); + clear_prio_queue(&tree_queue); bitmap_builder_clear(&bb); + free(mapping); trace2_region_leave("pack-bitmap-write", "building_bitmaps_total", the_repository); -- cgit v1.2.3 From 45f4eeb2919e2c802a6c1f64a2b1c299f7434938 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 8 Dec 
2020 17:05:26 -0500 Subject: pack-bitmap-write: relax unique revwalk condition The previous commits improved the bitmap computation process for very long, linear histories with many refs by removing quadratic growth in how many objects were walked. The strategy of computing "intermediate commits" using bitmasks for which refs can reach those commits partitioned the poset of reachable objects so each part could be walked exactly once. This was effective for linear histories. However, there was a (significant) drawback: wide histories with many refs had an explosion of memory costs to compute the commit bitmasks during the exploration that discovers these intermediate commits. Since these wide histories are unlikely to repeat walking objects, the benefit of walking objects multiple times was not expensive before. But now, the commit walk *before computing bitmaps* is incredibly expensive. In an effort to discover a happy medium, this change reduces the walk for intermediate commits to only the first-parent history. This focuses the walk on how the histories converge, which still has significant reduction in repeat object walks. It is still possible to create quadratic behavior in this version, but it is probably less likely in realistic data shapes. Here is some data taken on a fresh clone of the kernel: | runtime (sec) | peak heap (GB) | | | | | from | with | from | with | | scratch | existing | scratch | existing | -----------+---------+----------+---------+----------- original | 64.044 | 83.241 | 2.088 | 2.194 | last patch | 45.049 | 37.624 | 2.267 | 2.334 | this patch | 88.478 | 53.218 | 2.157 | 2.224 | Signed-off-by: Derrick Stolee Helped-by: Johannes Schindelin Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap-write.c | 14 +++++--------- t/t5310-pack-bitmaps.sh | 33 +++++++++++++++++---------------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index 76c8236f94..d2af4a974f 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -199,7 +199,7 @@ static void bitmap_builder_init(struct bitmap_builder *bb, { struct rev_info revs; struct commit *commit; - unsigned int i, num_maximal; + unsigned int i, num_maximal = 0; memset(bb, 0, sizeof(*bb)); init_bb_data(&bb->data); @@ -207,6 +207,7 @@ static void bitmap_builder_init(struct bitmap_builder *bb, reset_revision_walk(); repo_init_revisions(writer->to_pack->repo, &revs, NULL); revs.topo_order = 1; + revs.first_parent_only = 1; for (i = 0; i < writer->selected_nr; i++) { struct commit *c = writer->selected[i].commit; @@ -221,13 +222,12 @@ static void bitmap_builder_init(struct bitmap_builder *bb, add_pending_object(&revs, &c->object, ""); } - num_maximal = writer->selected_nr; if (prepare_revision_walk(&revs)) die("revision walk setup failed"); while ((commit = get_revision(&revs))) { - struct commit_list *p; + struct commit_list *p = commit->parents; struct bb_commit *c_ent; parse_commit_or_die(commit); @@ -235,16 +235,12 @@ static void bitmap_builder_init(struct bitmap_builder *bb, c_ent = bb_data_at(&bb->data, commit); if (c_ent->maximal) { - if (!c_ent->selected) { - bitmap_set(c_ent->commit_mask, num_maximal); - num_maximal++; - } - + num_maximal++; ALLOC_GROW(bb->commits, bb->commits_nr + 1, bb->commits_alloc); bb->commits[bb->commits_nr++] = commit; } - for (p = commit->parents; p; p = p->next) { + if (p) { struct bb_commit *p_ent = bb_data_at(&bb->data, p->item); int c_not_p, p_not_c; diff --git a/t/t5310-pack-bitmaps.sh b/t/t5310-pack-bitmaps.sh 
index 7ba796b2c3..aa2ccdb78b 100755 --- a/t/t5310-pack-bitmaps.sh +++ b/t/t5310-pack-bitmaps.sh @@ -23,12 +23,12 @@ has_any () { # To ensure the logic for "maximal commits" is exercised, make # the repository a bit more complicated. # -# other master +# other second # * * # (99 commits) (99 commits) # * * # |\ /| -# | * octo-other octo-master * | +# | * octo-other octo-second * | # |/|\_________ ____________/|\| # | \ \/ __________/ | # | | ________/\ / | @@ -43,23 +43,24 @@ has_any () { # \| # * (base) # +# We only push bits down the first-parent history, which +# makes some of these commits unimportant! +# # The important part for the maximal commit algorithm is how # the bitmasks are extended. Assuming starting bit positions -# for master (bit 0) and other (bit 1), and some flexibility -# in the order that merge bases are visited, the bitmasks at -# the end should be: +# for second (bit 0) and other (bit 1), the bitmasks at the +# end should be: # -# master: 1 (maximal, selected) +# second: 1 (maximal, selected) # other: 01 (maximal, selected) -# octo-master: 1 -# octo-other: 01 -# merge-right: 111 (maximal) -# (l1): 111 -# (r1): 111 -# merge-left: 1101 (maximal) -# (l2): 11111 (maximal) -# (r2): 111101 (maximal) -# (base): 1111111 (maximal) +# (base): 11 (maximal) +# +# This complicated history was important for a previous +# version of the walk that guarantees never walking a +# commit multiple times. That goal might be important +# again, so preserve this complicated case. For now, this +# test will guarantee that the bitmaps are computed +# correctly, even with the repeat calculations. test_expect_success 'setup repo with moderate-sized history' ' test_commit_bulk --id=file 10 && @@ -114,7 +115,7 @@ test_expect_success 'full repack creates bitmaps' ' ls .git/objects/pack/ | grep bitmap >output && test_line_count = 1 output && grep "\"key\":\"num_selected_commits\",\"value\":\"106\"" trace && - grep "\"key\":\"num_maximal_commits\",\"value\":\"111\"" trace + grep "\"key\":\"num_maximal_commits\",\"value\":\"107\"" trace ' test_expect_success 'rev-list --test-bitmap verifies bitmaps' ' -- cgit v1.2.3 From f077b0a9860f383a3cd0ce3a0e2a11ff2f27fc65 Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Tue, 8 Dec 2020 17:05:30 -0500 Subject: pack-bitmap-write: better reuse bitmaps If the old bitmap file contains a bitmap for a given commit, then that commit does not need help from intermediate commits in its history to compute its final bitmap. Eject that commit from the walk and insert it into a separate list of reusable commits that are eventually stored in the list of commits for computing bitmaps. This helps the repeat bitmap computation task, even if the selected commits shift drastically. This helps when a previously-bitmapped commit exists in the first-parent history of a newly-selected commit. Since we stop the walk at these commits and we use a first-parent walk, it is harder to walk "around" these bitmapped commits. It's not impossible, but we can greatly reduce the computation time for many selected commits. 
| runtime (sec) | peak heap (GB) | | | | | from | with | from | with | | scratch | existing | scratch | existing | -----------+---------+----------+---------+----------- last patch | 88.478 | 53.218 | 2.157 | 2.224 | this patch | 86.681 | 16.164 | 2.157 | 2.222 | Signed-off-by: Derrick Stolee Signed-off-by: Taylor Blau Signed-off-by: Junio C Hamano --- pack-bitmap-write.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/pack-bitmap-write.c b/pack-bitmap-write.c index d2af4a974f..cc5ead9990 100644 --- a/pack-bitmap-write.c +++ b/pack-bitmap-write.c @@ -195,10 +195,13 @@ struct bitmap_builder { }; static void bitmap_builder_init(struct bitmap_builder *bb, - struct bitmap_writer *writer) + struct bitmap_writer *writer, + struct bitmap_index *old_bitmap) { struct rev_info revs; struct commit *commit; + struct commit_list *reusable = NULL; + struct commit_list *r; unsigned int i, num_maximal = 0; memset(bb, 0, sizeof(*bb)); @@ -234,6 +237,31 @@ static void bitmap_builder_init(struct bitmap_builder *bb, c_ent = bb_data_at(&bb->data, commit); + /* + * If there is no commit_mask, there is no reason to iterate + * over this commit; it is not selected (if it were, it would + * not have a blank commit mask) and all its children have + * existing bitmaps (see the comment starting with "This commit + * has an existing bitmap" below), so it does not contribute + * anything to the final bitmap file or its descendants. + */ + if (!c_ent->commit_mask) + continue; + + if (old_bitmap && bitmap_for_commit(old_bitmap, commit)) { + /* + * This commit has an existing bitmap, so we can + * get its bits immediately without an object + * walk. That is, it is reusable as-is and there is no + * need to continue walking beyond it. + * + * Mark it as such and add it to bb->commits separately + * to avoid allocating a position in the commit mask. + */ + commit_list_insert(commit, &reusable); + goto next; + } + if (c_ent->maximal) { num_maximal++; ALLOC_GROW(bb->commits, bb->commits_nr + 1, bb->commits_alloc); @@ -278,14 +306,22 @@ static void bitmap_builder_init(struct bitmap_builder *bb, } } +next: bitmap_free(c_ent->commit_mask); c_ent->commit_mask = NULL; } + for (r = reusable; r; r = r->next) { + ALLOC_GROW(bb->commits, bb->commits_nr + 1, bb->commits_alloc); + bb->commits[bb->commits_nr++] = r->item; + } + trace2_data_intmax("pack-bitmap-write", the_repository, "num_selected_commits", writer->selected_nr); trace2_data_intmax("pack-bitmap-write", the_repository, "num_maximal_commits", num_maximal); + + free_commit_list(reusable); } static void bitmap_builder_clear(struct bitmap_builder *bb) @@ -420,7 +456,7 @@ void bitmap_writer_build(struct packing_data *to_pack) else mapping = NULL; - bitmap_builder_init(&bb, &writer); + bitmap_builder_init(&bb, &writer, old_bitmap); for (i = bb.commits_nr; i > 0; i--) { struct commit *commit = bb.commits[i-1]; struct bb_commit *ent = bb_data_at(&bb.data, commit); -- cgit v1.2.3