From 57059091fad25427bce9b3d47e073ce0518d164b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:28 -0400 Subject: get rid of num_packed_objects() The coming index format change doesn't allow for the number of objects to be determined from the size of the index file directly. Instead, Let's initialize a field in the packed_git structure with the object count when the index is validated since the count is always known at that point. While at it let's reorder some struct packed_git fields to avoid padding due to needed 64-bit alignment for some of them. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 45ac3e482a..6bff17b130 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -164,7 +164,7 @@ static int cmp_offset(const void *a_, const void *b_) static void prepare_pack_revindex(struct pack_revindex *rix) { struct packed_git *p = rix->p; - int num_ent = num_packed_objects(p); + int num_ent = p->num_objects; int i; const char *index = p->index_data; @@ -198,7 +198,7 @@ static struct revindex_entry * find_packed_object(struct packed_git *p, prepare_pack_revindex(rix); revindex = rix->revindex; lo = 0; - hi = num_packed_objects(p) + 1; + hi = p->num_objects + 1; do { int mi = (lo + hi) / 2; if (revindex[mi].offset == ofs) { -- cgit v1.2.3 From 8723f216263ba4a0f06be7b93fada863c0931e09 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:29 -0400 Subject: make overflow test on delta base offset work regardless of variable size This patch introduces the MSB() macro to obtain the desired number of most significant bits from a given variable independently of the variable type. It is then used to better implement the overflow test on the OBJ_OFS_DELTA base offset variable with the property of always working correctly regardless of the type/size of that variable. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 6bff17b130..ee607a0d2c 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -1014,7 +1014,7 @@ static void check_object(struct object_entry *entry) ofs = c & 127; while (c & 128) { ofs += 1; - if (!ofs || ofs & ~(~0UL >> 7)) + if (!ofs || MSB(ofs, 7)) die("delta base offset overflow in pack for %s", sha1_to_hex(entry->sha1)); c = buf[used_0++]; -- cgit v1.2.3 From d7dd02231f75604e388afb905f7bf8afd1bf4b24 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:30 -0400 Subject: add overflow tests on pack offset variables Change a few size and offset variables to more appropriate type, then add overflow tests on those offsets. This prevents any bad data to be generated/processed if off_t happens to not be large enough to handle some big packs. Better be safe than sorry. 
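To make these overflow guards concrete, here is a small stand-alone sketch of the two checks involved: the MSB() test that keeps the OBJ_OFS_DELTA base-offset varint from silently dropping bits on its next 7-bit shift, and a wrap test in the spirit of the offset-plus-size check added below. The bitsizeof()/MSB() definitions are only illustrative (they rely on the GCC-style __typeof__ extension) and are not necessarily the exact ones Git uses:

#include <limits.h>
#include <stdint.h>

/* Width of x in bits, whatever its type. */
#define bitsizeof(x)  (CHAR_BIT * sizeof(x))

/* Non-zero if any of the top `bits` bits of x are set, i.e. another
 * left shift by `bits` would lose information. */
#define MSB(x, bits)  ((x) & ((__typeof__(x))~0 << (bitsizeof(x) - (bits))))

/* Decode an OBJ_OFS_DELTA base offset: 7 bits per byte, MSB first,
 * with the +1 bias per continuation byte, as in check_object(). */
static int decode_base_offset(const unsigned char *buf, uintmax_t *out)
{
    int used = 0;
    unsigned char c = buf[used++];
    uintmax_t ofs = c & 127;

    while (c & 128) {
        ofs += 1;
        if (!ofs || MSB(ofs, 7))
            return -1;        /* the next shift would overflow */
        c = buf[used++];
        ofs = (ofs << 7) + (c & 127);
    }
    *out = ofs;
    return used;              /* number of header bytes consumed */
}

/* Same idea for the running pack offset: with an unsigned offset,
 * wrap-around shows up as the sum becoming smaller than either term. */
static int offset_would_wrap(uintmax_t offset, uintmax_t size)
{
    return offset + size < offset;
}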
Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index ee607a0d2c..d0be879443 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -369,7 +369,7 @@ static int revalidate_loose_object(struct object_entry *entry, return check_loose_inflate(map, mapsize, size); } -static off_t write_object(struct sha1file *f, +static unsigned long write_object(struct sha1file *f, struct object_entry *entry) { unsigned long size; @@ -503,16 +503,23 @@ static off_t write_one(struct sha1file *f, struct object_entry *e, off_t offset) { + unsigned long size; + + /* offset is non zero if object is written already. */ if (e->offset || e->preferred_base) - /* offset starts from header size and cannot be zero - * if it is written already. - */ return offset; - /* if we are deltified, write out its base object first. */ + + /* if we are deltified, write out base object first. */ if (e->delta) offset = write_one(f, e->delta, offset); + e->offset = offset; - return offset + write_object(f, e); + size = write_object(f, e); + + /* make sure off_t is sufficiently large not to wrap */ + if (offset > offset + size) + die("pack too large for current definition of off_t"); + return offset + size; } static void write_pack_file(void) -- cgit v1.2.3 From 78d1e84fe5586ed45740b915b52e6856bb3f1c44 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:31 -0400 Subject: compute a CRC32 for each object as stored in a pack The most important optimization for performance when repacking is the ability to reuse data from a previous pack as is and bypass any delta or even SHA1 computation by simply copying the raw data from one pack to another directly. The problem with this is that any data corruption within a copied object would go unnoticed and the new (repacked) pack would be self-consistent with its own checksum despite containing a corrupted object. This is a real issue that already happened at least once in the past. In some attempt to prevent this, we validate the copied data by inflating it and making sure no error is signaled by zlib. But this is still not perfect as a significant portion of a pack content is made of object headers and references to delta base objects which are not deflated and therefore not validated when repacking actually making the pack data reuse still not as safe as it could be. Of course a full SHA1 validation could be performed, but that implies full data inflating and delta replaying which is extremely costly, which cost the data reuse optimization was designed to avoid in the first place. So the best solution to this is simply to store a CRC32 of the raw pack data for each object in the pack index. This way any object in a pack can be validated before being copied as is in another pack, including header and any other non deflated data. Why CRC32 instead of a faster checksum like Adler32? Quoting Wikipedia: Jonathan Stone discovered in 2001 that Adler-32 has a weakness for very short messages. He wrote "Briefly, the problem is that, for very short packets, Adler32 is guaranteed to give poor coverage of the available bits. Don't take my word for it, ask Mark Adler. :-)" The problem is that sum A does not wrap for short messages. The maximum value of A for a 128-byte message is 32640, which is below the value 65521 used by the modulo operation. 
An extended explanation can be found in RFC 3309, which mandates the use of CRC32 instead of Adler-32 for SCTP, the Stream Control Transmission Protocol. In the context of a GIT pack, we have lots of small objects, especially deltas, which are likely to be quite small and in a size range for which Adler32 is dimed not to be sufficient. Another advantage of CRC32 is the possibility for recovery from certain types of small corruptions like single bit errors which are the most probable type of corruptions. OK what this patch does is to compute the CRC32 of each object written to a pack within pack-objects. It is not written to the index yet and it is obviously not validated when reusing pack data yet either. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index d0be879443..03e36f0183 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -44,6 +44,7 @@ struct object_entry { * be used as the base objectto delta huge * objects against. */ + uint32_t crc32; /* crc of raw pack data for this object */ }; /* @@ -381,6 +382,9 @@ static unsigned long write_object(struct sha1file *f, enum object_type obj_type; int to_reuse = 0; + if (!pack_to_stdout) + crc32_begin(f); + obj_type = entry->type; if (! entry->in_pack) to_reuse = 0; /* can't reuse what we don't have */ @@ -496,6 +500,8 @@ static unsigned long write_object(struct sha1file *f, if (entry->delta) written_delta++; written++; + if (!pack_to_stdout) + entry->crc32 = crc32_end(f); return hdrlen + datalen; } -- cgit v1.2.3 From c553ca25bd60dc9fd50b8bc7bd329601b81cee66 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 01:06:33 -0400 Subject: pack-objects: learn about pack index version 2 Pack index version 2 goes as follows: - 8 bytes of header with signature and version. - 256 entries of 4-byte first-level fan-out table. - Table of sorted 20-byte SHA1 records for each object in pack. - Table of 4-byte CRC32 entries for raw pack object data. - Table of 4-byte offset entries for objects in the pack if offset is representable with 31 bits or less, otherwise it is an index in the next table with top bit set. - Table of 8-byte offset entries indexed from previous table for offsets which are 32 bits or more (optional). - 20-byte SHA1 checksum of sorted object names. - 20-byte SHA1 checksum of the above. The object SHA1 table is all contiguous so future pack format that would contain this table directly won't require big changes to the code. It is also tighter for slightly better cache locality when looking up entries. Support for large packs exceeding 31 bits in size won't impose an index size bloat for packs within that range that don't need a 64-bit offset. And because newer objects which are likely to be the most frequently used are located at the beginning of the pack, they won't pay the 64-bit offset lookup at run time either even if the pack is large. Right now an index version 2 is created only when the biggest offset in a pack reaches 31 bits. It might be a good idea to always use index version 2 eventually to benefit from the CRC32 it contains when reusing pack data while repacking. 
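A minimal sketch of how one object's offset is recovered from an index with the layout described above, assuming the whole .idx file is mapped at `index` and the object count is already known (illustrative only; the real lookups live in the revindex and index-reading code touched by the diff below):

#include <stdint.h>
#include <arpa/inet.h>    /* ntohl() */

/*
 * Pack offset of the i-th object (in SHA1 order) from a version 2
 * index.  The 4-byte offset table follows the 8-byte header, the
 * 256-entry fan-out, the SHA1 table and the CRC32 table; an entry
 * with its top bit set is an index into the table of 8-byte offsets.
 */
static uint64_t v2_pack_offset(const unsigned char *index,
                               uint32_t nr_objects, uint32_t i)
{
    const uint32_t *off32 = (const uint32_t *)
        (index + 8 + 256 * 4 + nr_objects * (20 + 4));
    const uint32_t *off64 = off32 + nr_objects;
    uint32_t off = ntohl(off32[i]);

    if (!(off & 0x80000000))
        return off;                  /* representable in 31 bits */
    off &= 0x7fffffff;               /* position in the 64-bit table */
    return ((uint64_t)ntohl(off64[2 * off]) << 32) | ntohl(off64[2 * off + 1]);
}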
[jc: with the "oops" fix to keep track of the last offset correctly] Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 99 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 87 insertions(+), 12 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 03e36f0183..8cf2871751 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -169,13 +169,33 @@ static void prepare_pack_revindex(struct pack_revindex *rix) int i; const char *index = p->index_data; - index += 4 * 256; rix->revindex = xmalloc(sizeof(*rix->revindex) * (num_ent + 1)); - for (i = 0; i < num_ent; i++) { - uint32_t hl = *((uint32_t *)(index + 24 * i)); - rix->revindex[i].offset = ntohl(hl); - rix->revindex[i].nr = i; + index += 4 * 256; + + if (p->index_version > 1) { + const uint32_t *off_32 = + (uint32_t *)(index + 8 + p->num_objects * (20 + 4)); + const uint32_t *off_64 = off_32 + p->num_objects; + for (i = 0; i < num_ent; i++) { + uint32_t off = ntohl(*off_32++); + if (!(off & 0x80000000)) { + rix->revindex[i].offset = off; + } else { + rix->revindex[i].offset = + ((uint64_t)ntohl(*off_64++)) << 32; + rix->revindex[i].offset |= + ntohl(*off_64++); + } + rix->revindex[i].nr = i; + } + } else { + for (i = 0; i < num_ent; i++) { + uint32_t hl = *((uint32_t *)(index + 24 * i)); + rix->revindex[i].offset = ntohl(hl); + rix->revindex[i].nr = i; + } } + /* This knows the pack format -- the 20-byte trailer * follows immediately after the last object data. */ @@ -528,11 +548,11 @@ static off_t write_one(struct sha1file *f, return offset + size; } -static void write_pack_file(void) +static off_t write_pack_file(void) { uint32_t i; struct sha1file *f; - off_t offset; + off_t offset, last_obj_offset = 0; struct pack_header hdr; unsigned last_percent = 999; int do_progress = progress; @@ -555,6 +575,7 @@ static void write_pack_file(void) if (!nr_result) goto done; for (i = 0; i < nr_objects; i++) { + last_obj_offset = offset; offset = write_one(f, objects + i, offset); if (do_progress) { unsigned percent = written * 100 / nr_result; @@ -572,9 +593,11 @@ static void write_pack_file(void) if (written != nr_result) die("wrote %u objects while expecting %u", written, nr_result); sha1close(f, pack_file_sha1, 1); + + return last_obj_offset; } -static void write_index_file(void) +static void write_index_file(off_t last_obj_offset) { uint32_t i; struct sha1file *f = sha1create("%s-%s.%s", base_name, @@ -582,6 +605,18 @@ static void write_index_file(void) struct object_entry **list = sorted_by_sha; struct object_entry **last = list + nr_result; uint32_t array[256]; + uint32_t index_version; + + /* if last object's offset is >= 2^31 we should use index V2 */ + index_version = (last_obj_offset >> 31) ? 
2 : 1; + + /* index versions 2 and above need a header */ + if (index_version >= 2) { + struct pack_idx_header hdr; + hdr.idx_signature = htonl(PACK_IDX_SIGNATURE); + hdr.idx_version = htonl(index_version); + sha1write(f, &hdr, sizeof(hdr)); + } /* * Write the first-level table (the list is sorted, @@ -607,10 +642,49 @@ static void write_index_file(void) list = sorted_by_sha; for (i = 0; i < nr_result; i++) { struct object_entry *entry = *list++; - uint32_t offset = htonl(entry->offset); - sha1write(f, &offset, 4); + if (index_version < 2) { + uint32_t offset = htonl(entry->offset); + sha1write(f, &offset, 4); + } sha1write(f, entry->sha1, 20); } + + if (index_version >= 2) { + unsigned int nr_large_offset = 0; + + /* write the crc32 table */ + list = sorted_by_sha; + for (i = 0; i < nr_objects; i++) { + struct object_entry *entry = *list++; + uint32_t crc32_val = htonl(entry->crc32); + sha1write(f, &crc32_val, 4); + } + + /* write the 32-bit offset table */ + list = sorted_by_sha; + for (i = 0; i < nr_objects; i++) { + struct object_entry *entry = *list++; + uint32_t offset = (entry->offset <= 0x7fffffff) ? + entry->offset : (0x80000000 | nr_large_offset++); + offset = htonl(offset); + sha1write(f, &offset, 4); + } + + /* write the large offset table */ + list = sorted_by_sha; + while (nr_large_offset) { + struct object_entry *entry = *list++; + uint64_t offset = entry->offset; + if (offset > 0x7fffffff) { + uint32_t split[2]; + split[0] = htonl(offset >> 32); + split[1] = htonl(offset & 0xffffffff); + sha1write(f, split, 8); + nr_large_offset--; + } + } + } + sha1write(f, pack_file_sha1, 20); sha1close(f, NULL, 1); } @@ -1698,6 +1772,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (reuse_cached_pack(object_list_sha1)) ; else { + off_t last_obj_offset; if (nr_result) prepare_pack(window, depth); if (progress == 1 && pack_to_stdout) { @@ -1707,9 +1782,9 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) signal(SIGALRM, SIG_IGN ); progress_update = 0; } - write_pack_file(); + last_obj_offset = write_pack_file(); if (!pack_to_stdout) { - write_index_file(); + write_index_file(last_obj_offset); puts(sha1_to_hex(object_list_sha1)); } } -- cgit v1.2.3 From 4ba7d711539122c21bd44af06b6cab4fc4f65a74 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 9 Apr 2007 17:32:03 -0400 Subject: allow forcing index v2 and 64-bit offset treshold This is necessary for testing the new capabilities in some automated way without having an actual 4GB+ pack. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 8cf2871751..099dea0e1e 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -597,6 +597,9 @@ static off_t write_pack_file(void) return last_obj_offset; } +static uint32_t index_default_version = 1; +static uint32_t index_off32_limit = 0x7fffffff; + static void write_index_file(off_t last_obj_offset) { uint32_t i; @@ -608,7 +611,7 @@ static void write_index_file(off_t last_obj_offset) uint32_t index_version; /* if last object's offset is >= 2^31 we should use index V2 */ - index_version = (last_obj_offset >> 31) ? 2 : 1; + index_version = (last_obj_offset >> 31) ? 
2 : index_default_version; /* index versions 2 and above need a header */ if (index_version >= 2) { @@ -664,7 +667,7 @@ static void write_index_file(off_t last_obj_offset) list = sorted_by_sha; for (i = 0; i < nr_objects; i++) { struct object_entry *entry = *list++; - uint32_t offset = (entry->offset <= 0x7fffffff) ? + uint32_t offset = (entry->offset <= index_off32_limit) ? entry->offset : (0x80000000 | nr_large_offset++); offset = htonl(offset); sha1write(f, &offset, 4); @@ -675,7 +678,7 @@ static void write_index_file(off_t last_obj_offset) while (nr_large_offset) { struct object_entry *entry = *list++; uint64_t offset = entry->offset; - if (offset > 0x7fffffff) { + if (offset > index_off32_limit) { uint32_t split[2]; split[0] = htonl(offset >> 32); split[1] = htonl(offset & 0xffffffff); @@ -1714,6 +1717,17 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) rp_av[1] = "--objects-edge"; continue; } + if (!prefixcmp(arg, "--index-version=")) { + char *c; + index_default_version = strtoul(arg + 16, &c, 10); + if (index_default_version > 2) + die("bad %s", arg); + if (*c == ',') + index_off32_limit = strtoul(c+1, &c, 0); + if (*c || index_off32_limit & 0x80000000) + die("bad %s", arg); + continue; + } usage(pack_usage); } -- cgit v1.2.3 From 91ecbeca48e675fef8141451edb2c86577f9d63c Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 10 Apr 2007 00:15:41 -0400 Subject: validate reused pack data with CRC when possible This replaces the inflate validation with a CRC validation when reusing data from a pack which uses index version 2. That makes repacking much safer against corruptions, and it should be a bit faster too. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 46 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 12 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 099dea0e1e..687b4b5a99 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -233,12 +233,6 @@ static struct revindex_entry * find_packed_object(struct packed_git *p, die("internal error: pack revindex corrupt"); } -static off_t find_packed_object_size(struct packed_git *p, off_t ofs) -{ - struct revindex_entry *entry = find_packed_object(p, ofs); - return entry[1].offset - ofs; -} - static const unsigned char *find_packed_object_name(struct packed_git *p, off_t ofs) { @@ -321,6 +315,28 @@ static int check_pack_inflate(struct packed_git *p, stream.total_in == len) ? 
0 : -1; } +static int check_pack_crc(struct packed_git *p, struct pack_window **w_curs, + off_t offset, off_t len, unsigned int nr) +{ + const uint32_t *index_crc; + uint32_t data_crc = crc32(0, Z_NULL, 0); + + do { + unsigned int avail; + void *data = use_pack(p, w_curs, offset, &avail); + if (avail > len) + avail = len; + data_crc = crc32(data_crc, data, avail); + offset += avail; + len -= avail; + } while (len); + + index_crc = p->index_data; + index_crc += 2 + 256 + p->num_objects * (20/4) + nr; + + return data_crc != ntohl(*index_crc); +} + static void copy_pack_data(struct sha1file *f, struct packed_git *p, struct pack_window **w_curs, @@ -485,6 +501,7 @@ static unsigned long write_object(struct sha1file *f, else { struct packed_git *p = entry->in_pack; struct pack_window *w_curs = NULL; + struct revindex_entry *revidx; off_t offset; if (entry->delta) { @@ -507,12 +524,17 @@ static unsigned long write_object(struct sha1file *f, hdrlen += 20; } - offset = entry->in_pack_offset + entry->in_pack_header_size; - datalen = find_packed_object_size(p, entry->in_pack_offset) - - entry->in_pack_header_size; - if (!pack_to_stdout && check_pack_inflate(p, &w_curs, - offset, datalen, entry->size)) - die("corrupt delta in pack %s", sha1_to_hex(entry->sha1)); + offset = entry->in_pack_offset; + revidx = find_packed_object(p, offset); + datalen = revidx[1].offset - offset; + if (!pack_to_stdout && p->index_version > 1 && + check_pack_crc(p, &w_curs, offset, datalen, revidx->nr)) + die("bad packed object CRC for %s", sha1_to_hex(entry->sha1)); + offset += entry->in_pack_header_size; + datalen -= entry->in_pack_header_size; + if (!pack_to_stdout && p->index_version == 1 && + check_pack_inflate(p, &w_curs, offset, datalen, entry->size)) + die("corrupt packed object for %s", sha1_to_hex(entry->sha1)); copy_pack_data(f, p, &w_curs, offset, datalen); unuse_pack(&w_curs); reused++; -- cgit v1.2.3 From 29b734e4788143603adf8046174219bac67794e0 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 10 Apr 2007 22:54:36 -0400 Subject: clean up add_object_entry() This function used to call locate_object_entry_hash() _twice_ per added object while only once should suffice. Let's reorganize that code a bit. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 52 ++++++++++++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 687b4b5a99..bc5f2329a8 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -781,12 +781,19 @@ static unsigned name_hash(const char *name) static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude) { - uint32_t idx = nr_objects; struct object_entry *entry; - struct packed_git *p; + struct packed_git *p, *found_pack = NULL; off_t found_offset = 0; - struct packed_git *found_pack = NULL; - int ix, status = 0; + int ix; + + ix = nr_objects ? 
locate_object_entry_hash(sha1) : -1; + if (ix >= 0) { + if (exclude) { + entry = objects + object_ix[ix] - 1; + entry->preferred_base = 1; + } + return 0; + } if (!exclude) { for (p = packed_git; p; p = p->next) { @@ -803,43 +810,34 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud } } } - if ((entry = locate_object_entry(sha1)) != NULL) - goto already_added; - if (idx >= nr_alloc) { - nr_alloc = (idx + 1024) * 3 / 2; + if (nr_objects >= nr_alloc) { + nr_alloc = (nr_alloc + 1024) * 3 / 2; objects = xrealloc(objects, nr_alloc * sizeof(*entry)); } - entry = objects + idx; - nr_objects = idx + 1; + + entry = objects + nr_objects++; memset(entry, 0, sizeof(*entry)); hashcpy(entry->sha1, sha1); entry->hash = hash; + if (exclude) + entry->preferred_base = 1; + if (found_pack) { + entry->in_pack = found_pack; + entry->in_pack_offset = found_offset; + } if (object_ix_hashsz * 3 <= nr_objects * 4) rehash_objects(); - else { - ix = locate_object_entry_hash(entry->sha1); - if (0 <= ix) - die("internal error in object hashing."); - object_ix[-1 - ix] = idx + 1; - } - status = 1; + else + object_ix[-1 - ix] = nr_objects; - already_added: if (progress_update) { fprintf(stderr, "Counting objects...%u\r", nr_objects); progress_update = 0; } - if (exclude) - entry->preferred_base = 1; - else { - if (found_pack) { - entry->in_pack = found_pack; - entry->in_pack_offset = found_offset; - } - } - return status; + + return 1; } struct pbase_tree_cache { -- cgit v1.2.3 From 8a5a8d6c97e36dbd95361eab1109e4380fe45df4 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:28:10 -0400 Subject: pack-objects: optimize preferred base handling a bit Let's avoid some cycles when there is no base to test against, and avoid unnecessary object lookups. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index bc5f2329a8..62a011e2e9 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -959,22 +959,21 @@ static void add_pbase_object(struct tree_desc *tree, const char *fullname) { struct name_entry entry; + int cmp; while (tree_entry(tree,&entry)) { - unsigned long size; - enum object_type type; - - if (tree_entry_len(entry.path, entry.sha1) != cmplen || - memcmp(entry.path, name, cmplen) || - !has_sha1_file(entry.sha1) || - (type = sha1_object_info(entry.sha1, &size)) < 0) + cmp = tree_entry_len(entry.path, entry.sha1) != cmplen ? 
1 : + memcmp(name, entry.path, cmplen); + if (cmp > 0) continue; + if (cmp < 0) + return; if (name[cmplen] != '/') { unsigned hash = name_hash(fullname); add_object_entry(entry.sha1, hash, 1); return; } - if (type == OBJ_TREE) { + if (S_ISDIR(entry.mode)) { struct tree_desc sub; struct pbase_tree_cache *tree; const char *down = name+cmplen+1; @@ -1034,15 +1033,15 @@ static int check_pbase_path(unsigned hash) static void add_preferred_base_object(const char *name, unsigned hash) { struct pbase_tree *it; - int cmplen = name_cmp_len(name); + int cmplen; - if (check_pbase_path(hash)) + if (!num_preferred_base || check_pbase_path(hash)) return; + cmplen = name_cmp_len(name); for (it = pbase_tree; it; it = it->next) { if (cmplen == 0) { - hash = name_hash(""); - add_object_entry(it->pcache.sha1, hash, 1); + add_object_entry(it->pcache.sha1, 0, 1); } else { struct tree_desc tree; @@ -1587,9 +1586,7 @@ static void read_object_list_from_stdin(void) static void show_commit(struct commit *commit) { - unsigned hash = name_hash(""); - add_preferred_base_object("", hash); - add_object_entry(commit->object.sha1, hash, 0); + add_object_entry(commit->object.sha1, 0, 0); } static void show_object(struct object_array_entry *p) -- cgit v1.2.3 From adcc70950e594065050c375ace8a039678d2e31f Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:28:52 -0400 Subject: pack-objects: equal objects in size should delta against newer objects Before finding best delta combinations, we sort objects by name hash, then by size, then by their position in memory. Then we walk the list backwards to test delta candidates. We hope that a bigger size usually means a newer objects. But a bigger address in memory does not mean a newer object. So the last comparison must be reversed. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 62a011e2e9..869ca1ab26 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -1276,7 +1276,7 @@ static int type_size_sort(const struct object_entry *a, const struct object_entr return -1; if (a->size > b->size) return 1; - return a < b ? -1 : (a > b); + return a > b ? -1 : (a < b); /* newest last */ } struct unpacked { -- cgit v1.2.3 From 898b14cedc353de95945fcc56e14f463c3066bf0 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:29:16 -0400 Subject: pack-objects: rework check_delta_limit usage Objects that have delta "children" from pack data reuse must consider the depth of their deepest child when they try to deltify themselves for those children not to become too deep. However, in the context of a "thin" pack, the delta children depth was skipped entirely on the presumption that the pack was always going to be exploded on the receiving end, hence the delta length wasn't an issue. Now that we keep received packs as is and reuse pack data when repacking, those packs do contain delta chains that are longer than expected. Worse, those delta chain may even grow longer when the pack is further repacked into another thin pack for a subsequent transmission. So this patch restores strict delta length even for thin packs, and it moves check_delta_limit() usage directly in the delta loop where it is needed. This way the delta_limit can be removed from struct object_entry as well. Oh and the initial value was wrong too. 
The progress_interval() function was moved to a more logical location in the process. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 76 +++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 44 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 869ca1ab26..d44b8f4c01 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -27,7 +27,6 @@ struct object_entry { * nonzero if already written. */ unsigned int depth; /* delta depth */ - unsigned int delta_limit; /* base adjustment for in-pack delta */ unsigned int hash; /* name hint hash */ enum object_type type; enum object_type in_pack_type; /* could be delta */ @@ -1172,19 +1171,6 @@ static void check_object(struct object_entry *entry) sha1_to_hex(entry->sha1)); } -static unsigned int check_delta_limit(struct object_entry *me, unsigned int n) -{ - struct object_entry *child = me->delta_child; - unsigned int m = n; - while (child) { - unsigned int c = check_delta_limit(child, n + 1); - if (m < c) - m = c; - child = child->delta_sibling; - } - return m; -} - static void get_object_details(void) { uint32_t i; @@ -1193,23 +1179,6 @@ static void get_object_details(void) prepare_pack_ix(); for (i = 0, entry = objects; i < nr_objects; i++, entry++) check_object(entry); - - if (nr_objects == nr_result) { - /* - * Depth of objects that depend on the entry -- this - * is subtracted from depth-max to break too deep - * delta chain because of delta data reusing. - * However, we loosen this restriction when we know we - * are creating a thin pack -- it will have to be - * expanded on the other end anyway, so do not - * artificially cut the delta chain and let it go as - * deep as it wants. - */ - for (i = 0, entry = objects; i < nr_objects; i++, entry++) - if (!entry->delta && entry->delta_child) - entry->delta_limit = - check_delta_limit(entry, 1); - } } typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *); @@ -1322,16 +1291,7 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, trg_entry->in_pack_type != OBJ_OFS_DELTA) return 0; - /* - * If the current object is at pack edge, take the depth the - * objects that depend on the current object into account -- - * otherwise they would become too deep. - */ - if (trg_entry->delta_child) { - if (max_depth <= trg_entry->delta_limit) - return 0; - max_depth -= trg_entry->delta_limit; - } + /* Let's not bust the allowed depth. 
*/ if (src_entry->depth >= max_depth) return 0; @@ -1378,9 +1338,17 @@ static int try_delta(struct unpacked *trg, struct unpacked *src, return 1; } -static void progress_interval(int signum) +static unsigned int check_delta_limit(struct object_entry *me, unsigned int n) { - progress_update = 1; + struct object_entry *child = me->delta_child; + unsigned int m = n; + while (child) { + unsigned int c = check_delta_limit(child, n + 1); + if (m < c) + m = c; + child = child->delta_sibling; + } + return m; } static void find_deltas(struct object_entry **list, int window, int depth) @@ -1389,6 +1357,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) unsigned int array_size = window * sizeof(struct unpacked); struct unpacked *array; unsigned last_percent = 999; + int max_depth; if (!nr_objects) return; @@ -1429,6 +1398,18 @@ static void find_deltas(struct object_entry **list, int window, int depth) n->data = NULL; n->entry = entry; + /* + * If the current object is at pack edge, take the depth the + * objects that depend on the current object into account + * otherwise they would become too deep. + */ + max_depth = depth; + if (entry->delta_child) { + max_depth -= check_delta_limit(entry, 0); + if (max_depth <= 0) + goto next; + } + j = window; while (--j > 0) { uint32_t other_idx = idx + j; @@ -1438,9 +1419,10 @@ static void find_deltas(struct object_entry **list, int window, int depth) m = array + other_idx; if (!m->entry) break; - if (try_delta(n, m, depth) < 0) + if (try_delta(n, m, max_depth) < 0) break; } + /* if we made n a delta, and if n is already at max * depth, leaving it in the window is pointless. we * should evict it first. @@ -1448,6 +1430,7 @@ static void find_deltas(struct object_entry **list, int window, int depth) if (entry->delta && depth <= entry->depth) continue; + next: idx++; if (idx >= window) idx = 0; @@ -1525,6 +1508,11 @@ static int reuse_cached_pack(unsigned char *sha1) return 1; } +static void progress_interval(int signum) +{ + progress_update = 1; +} + static void setup_progress_signal(void) { struct sigaction sa; -- cgit v1.2.3 From 9668cf59a83d1aa881036818abf29cc2ea9e291b Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:29:54 -0400 Subject: pack-objects: clean up list sorting Get rid of sort_comparator() as it impose a run time double indirect function call for little compile time type checking gain. Also get rid of create_sorted_list() as it only has one user which would as well be just fine doing its sorting locally. Eventually the list of deltifiable objects might be shorter than the whole object list. 
Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 53 +++++++++++++++++++++----------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index d44b8f4c01..15119d63d9 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -66,7 +66,7 @@ static int local; static int incremental; static int allow_ofs_delta; -static struct object_entry **sorted_by_sha, **sorted_by_type; +static struct object_entry **sorted_by_sha; static struct object_entry *objects; static uint32_t nr_objects, nr_alloc, nr_result; static const char *base_name; @@ -1181,31 +1181,10 @@ static void get_object_details(void) check_object(entry); } -typedef int (*entry_sort_t)(const struct object_entry *, const struct object_entry *); - -static entry_sort_t current_sort; - -static int sort_comparator(const void *_a, const void *_b) -{ - struct object_entry *a = *(struct object_entry **)_a; - struct object_entry *b = *(struct object_entry **)_b; - return current_sort(a,b); -} - -static struct object_entry **create_sorted_list(entry_sort_t sort) -{ - struct object_entry **list = xmalloc(nr_objects * sizeof(struct object_entry *)); - uint32_t i; - - for (i = 0; i < nr_objects; i++) - list[i] = objects + i; - current_sort = sort; - qsort(list, nr_objects, sizeof(struct object_entry *), sort_comparator); - return list; -} - -static int sha1_sort(const struct object_entry *a, const struct object_entry *b) +static int sha1_sort(const void *_a, const void *_b) { + const struct object_entry *a = *(struct object_entry **)_a; + const struct object_entry *b = *(struct object_entry **)_b; return hashcmp(a->sha1, b->sha1); } @@ -1222,13 +1201,15 @@ static struct object_entry **create_final_object_list(void) if (!objects[i].preferred_base) list[j++] = objects + i; } - current_sort = sha1_sort; - qsort(list, nr_result, sizeof(struct object_entry *), sort_comparator); + qsort(list, nr_result, sizeof(struct object_entry *), sha1_sort); return list; } -static int type_size_sort(const struct object_entry *a, const struct object_entry *b) +static int type_size_sort(const void *_a, const void *_b) { + const struct object_entry *a = *(struct object_entry **)_a; + const struct object_entry *b = *(struct object_entry **)_b; + if (a->type < b->type) return -1; if (a->type > b->type) @@ -1448,10 +1429,20 @@ static void find_deltas(struct object_entry **list, int window, int depth) static void prepare_pack(int window, int depth) { + struct object_entry **delta_list; + uint32_t i; + get_object_details(); - sorted_by_type = create_sorted_list(type_size_sort); - if (window && depth) - find_deltas(sorted_by_type, window+1, depth); + + if (!window || !depth) + return; + + delta_list = xmalloc(nr_objects * sizeof(*delta_list)); + for (i = 0; i < nr_objects; i++) + delta_list[i] = objects + i; + qsort(delta_list, nr_objects, sizeof(*delta_list), type_size_sort); + find_deltas(delta_list, window+1, depth); + free(delta_list); } static int reuse_cached_pack(unsigned char *sha1) -- cgit v1.2.3 From f7ae6a930a3c5d501439cdba98417af747350738 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:30:15 -0400 Subject: pack-objects: get rid of reuse_cached_pack This capability is practically never useful, and therefore never tested, because it is fairly unlikely that the requested pack will be already available. Furthermore it is of little gain over the ability to reuse existing pack data. 
In fact the ability to change delta type on the fly when reusing delta data is a nice thing that has almost no cost and allows greater backward compatibility with a client's capabilities than if the client is blindly sent a whole pack without any discrimination. And this "feature" is simply in the way of other cleanups. Let's get rid of it. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 86 ++++++++------------------------------------------ 1 file changed, 14 insertions(+), 72 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 15119d63d9..c2f7c30817 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -1445,60 +1445,6 @@ static void prepare_pack(int window, int depth) free(delta_list); } -static int reuse_cached_pack(unsigned char *sha1) -{ - static const char cache[] = "pack-cache/pack-%s.%s"; - char *cached_pack, *cached_idx; - int ifd, ofd, ifd_ix = -1; - - cached_pack = git_path(cache, sha1_to_hex(sha1), "pack"); - ifd = open(cached_pack, O_RDONLY); - if (ifd < 0) - return 0; - - if (!pack_to_stdout) { - cached_idx = git_path(cache, sha1_to_hex(sha1), "idx"); - ifd_ix = open(cached_idx, O_RDONLY); - if (ifd_ix < 0) { - close(ifd); - return 0; - } - } - - if (progress) - fprintf(stderr, "Reusing %u objects pack %s\n", nr_objects, - sha1_to_hex(sha1)); - - if (pack_to_stdout) { - if (copy_fd(ifd, 1)) - exit(1); - close(ifd); - } - else { - char name[PATH_MAX]; - snprintf(name, sizeof(name), - "%s-%s.%s", base_name, sha1_to_hex(sha1), "pack"); - ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666); - if (ofd < 0) - die("unable to open %s (%s)", name, strerror(errno)); - if (copy_fd(ifd, ofd)) - exit(1); - close(ifd); - - snprintf(name, sizeof(name), - "%s-%s.%s", base_name, sha1_to_hex(sha1), "idx"); - ofd = open(name, O_CREAT | O_EXCL | O_WRONLY, 0666); - if (ofd < 0) - die("unable to open %s (%s)", name, strerror(errno)); - if (copy_fd(ifd_ix, ofd)) - exit(1); - close(ifd_ix); - puts(sha1_to_hex(sha1)); - } - - return 1; -} - static void progress_interval(int signum) { progress_update = 1; @@ -1618,6 +1564,7 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) SHA_CTX ctx; int depth = 10; struct object_entry **list; + off_t last_obj_offset; int use_internal_rev_list = 0; int thin = 0; uint32_t i; @@ -1779,24 +1726,19 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (progress && (nr_objects != nr_result)) fprintf(stderr, "Result has %u objects.\n", nr_result); - if (reuse_cached_pack(object_list_sha1)) - ; - else { - off_t last_obj_offset; - if (nr_result) - prepare_pack(window, depth); - if (progress == 1 && pack_to_stdout) { - /* the other end usually displays progress itself */ - struct itimerval v = {{0,},}; - setitimer(ITIMER_REAL, &v, NULL); - signal(SIGALRM, SIG_IGN ); - progress_update = 0; - } - last_obj_offset = write_pack_file(); - if (!pack_to_stdout) { - write_index_file(last_obj_offset); - puts(sha1_to_hex(object_list_sha1)); - } + if (nr_result) + prepare_pack(window, depth); + if (progress == 1 && pack_to_stdout) { + /* the other end usually displays progress itself */ + struct itimerval v = {{0,},}; + setitimer(ITIMER_REAL, &v, NULL); + signal(SIGALRM, SIG_IGN ); + progress_update = 0; + } + last_obj_offset = write_pack_file(); + if (!pack_to_stdout) { + write_index_file(last_obj_offset); + puts(sha1_to_hex(object_list_sha1)); } if (progress) fprintf(stderr, "Total %u (delta %u), reused %u (delta %u)\n", 
-- cgit v1.2.3 From 81a216a5d6a12976b20d9a39829562f280ae96f2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:31:05 -0400 Subject: pack-objects: get rid of create_final_object_list() Because we don't have to know the SHA1 h(hence the name) of the pack up front anymore, let's get rid of yet another global sorted object list and sort them only in write_index_file(), then compute the object list SHA1 on the fly. This has the advantage of saving another chunk of memory, and the sorted list SHA1 won't be computed needlessly on servers during a fetch. Of course the cunning plan is also to make write_index_file() much like the function with the same name in index-pack.c for an eventual easy sharing. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 127 ++++++++++++++++++++++++++++--------------------- 1 file changed, 72 insertions(+), 55 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index c2f7c30817..7af1776673 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -59,17 +59,16 @@ struct object_entry { * heuristics. */ -static unsigned char object_list_sha1[20]; static int non_empty; static int no_reuse_delta; static int local; static int incremental; static int allow_ofs_delta; -static struct object_entry **sorted_by_sha; static struct object_entry *objects; static uint32_t nr_objects, nr_alloc, nr_result; -static const char *base_name; +static const char *pack_tmp_name, *idx_tmp_name; +static char tmpname[PATH_MAX]; static unsigned char pack_file_sha1[20]; static int progress = 1; static volatile sig_atomic_t progress_update; @@ -578,13 +577,19 @@ static off_t write_pack_file(void) unsigned last_percent = 999; int do_progress = progress; - if (!base_name) { + if (pack_to_stdout) { f = sha1fd(1, ""); do_progress >>= 1; + } else { + int fd; + snprintf(tmpname, sizeof(tmpname), "tmp_pack_XXXXXX"); + fd = mkstemp(tmpname); + if (fd < 0) + die("unable to create %s: %s\n", tmpname, strerror(errno)); + pack_tmp_name = xstrdup(tmpname); + f = sha1fd(fd, pack_tmp_name); } - else - f = sha1create("%s-%s.%s", base_name, - sha1_to_hex(object_list_sha1), "pack"); + if (do_progress) fprintf(stderr, "Writing %u objects.\n", nr_result); @@ -618,18 +623,46 @@ static off_t write_pack_file(void) return last_obj_offset; } +static int sha1_sort(const void *_a, const void *_b) +{ + const struct object_entry *a = *(struct object_entry **)_a; + const struct object_entry *b = *(struct object_entry **)_b; + return hashcmp(a->sha1, b->sha1); +} + static uint32_t index_default_version = 1; static uint32_t index_off32_limit = 0x7fffffff; -static void write_index_file(off_t last_obj_offset) +static void write_index_file(off_t last_obj_offset, unsigned char *sha1) { - uint32_t i; - struct sha1file *f = sha1create("%s-%s.%s", base_name, - sha1_to_hex(object_list_sha1), "idx"); - struct object_entry **list = sorted_by_sha; - struct object_entry **last = list + nr_result; + struct sha1file *f; + struct object_entry **sorted_by_sha, **list, **last; uint32_t array[256]; - uint32_t index_version; + uint32_t i, index_version; + SHA_CTX ctx; + int fd; + + snprintf(tmpname, sizeof(tmpname), "tmp_idx_XXXXXX"); + fd = mkstemp(tmpname); + if (fd < 0) + die("unable to create %s: %s\n", tmpname, strerror(errno)); + idx_tmp_name = xstrdup(tmpname); + f = sha1fd(fd, idx_tmp_name); + + if (nr_result) { + uint32_t j = 0; + sorted_by_sha = + xcalloc(nr_result, sizeof(struct object_entry *)); + for (i = 0; 
i < nr_objects; i++) + if (!objects[i].preferred_base) + sorted_by_sha[j++] = objects + i; + if (j != nr_result) + die("listed %u objects while expecting %u", j, nr_result); + qsort(sorted_by_sha, nr_result, sizeof(*sorted_by_sha), sha1_sort); + list = sorted_by_sha; + last = sorted_by_sha + nr_result; + } else + sorted_by_sha = list = last = NULL; /* if last object's offset is >= 2^31 we should use index V2 */ index_version = (last_obj_offset >> 31) ? 2 : index_default_version; @@ -660,9 +693,10 @@ static void write_index_file(off_t last_obj_offset) } sha1write(f, array, 256 * 4); - /* - * Write the actual SHA1 entries.. - */ + /* Compute the SHA1 hash of sorted object names. */ + SHA1_Init(&ctx); + + /* Write the actual SHA1 entries. */ list = sorted_by_sha; for (i = 0; i < nr_result; i++) { struct object_entry *entry = *list++; @@ -671,6 +705,7 @@ static void write_index_file(off_t last_obj_offset) sha1write(f, &offset, 4); } sha1write(f, entry->sha1, 20); + SHA1_Update(&ctx, entry->sha1, 20); } if (index_version >= 2) { @@ -711,6 +746,8 @@ static void write_index_file(off_t last_obj_offset) sha1write(f, pack_file_sha1, 20); sha1close(f, NULL, 1); + free(sorted_by_sha); + SHA1_Final(sha1, &ctx); } static int locate_object_entry_hash(const unsigned char *sha1) @@ -789,6 +826,8 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud if (ix >= 0) { if (exclude) { entry = objects + object_ix[ix] - 1; + if (!entry->preferred_base) + nr_result--; entry->preferred_base = 1; } return 0; @@ -821,6 +860,8 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud entry->hash = hash; if (exclude) entry->preferred_base = 1; + else + nr_result++; if (found_pack) { entry->in_pack = found_pack; entry->in_pack_offset = found_offset; @@ -1181,30 +1222,6 @@ static void get_object_details(void) check_object(entry); } -static int sha1_sort(const void *_a, const void *_b) -{ - const struct object_entry *a = *(struct object_entry **)_a; - const struct object_entry *b = *(struct object_entry **)_b; - return hashcmp(a->sha1, b->sha1); -} - -static struct object_entry **create_final_object_list(void) -{ - struct object_entry **list; - uint32_t i, j; - - for (i = nr_result = 0; i < nr_objects; i++) - if (!objects[i].preferred_base) - nr_result++; - list = xmalloc(nr_result * sizeof(struct object_entry *)); - for (i = j = 0; i < nr_objects; i++) { - if (!objects[i].preferred_base) - list[j++] = objects + i; - } - qsort(list, nr_result, sizeof(struct object_entry *), sha1_sort); - return list; -} - static int type_size_sort(const void *_a, const void *_b) { const struct object_entry *a = *(struct object_entry **)_a; @@ -1561,13 +1578,12 @@ static void get_object_list(int ac, const char **av) int cmd_pack_objects(int argc, const char **argv, const char *prefix) { - SHA_CTX ctx; int depth = 10; - struct object_entry **list; - off_t last_obj_offset; int use_internal_rev_list = 0; int thin = 0; uint32_t i; + off_t last_obj_offset; + const char *base_name = NULL; const char **rp_av; int rp_ac_alloc = 64; int rp_ac; @@ -1712,20 +1728,10 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) if (progress) fprintf(stderr, "Done counting %u objects.\n", nr_objects); - sorted_by_sha = create_final_object_list(); if (non_empty && !nr_result) return 0; - - SHA1_Init(&ctx); - list = sorted_by_sha; - for (i = 0; i < nr_result; i++) { - struct object_entry *entry = *list++; - SHA1_Update(&ctx, entry->sha1, 20); - } - SHA1_Final(object_list_sha1, &ctx); if 
(progress && (nr_objects != nr_result)) fprintf(stderr, "Result has %u objects.\n", nr_result); - if (nr_result) prepare_pack(window, depth); if (progress == 1 && pack_to_stdout) { @@ -1737,7 +1743,18 @@ int cmd_pack_objects(int argc, const char **argv, const char *prefix) } last_obj_offset = write_pack_file(); if (!pack_to_stdout) { - write_index_file(last_obj_offset); + unsigned char object_list_sha1[20]; + write_index_file(last_obj_offset, object_list_sha1); + snprintf(tmpname, sizeof(tmpname), "%s-%s.pack", + base_name, sha1_to_hex(object_list_sha1)); + if (rename(pack_tmp_name, tmpname)) + die("unable to rename temporary pack file: %s", + strerror(errno)); + snprintf(tmpname, sizeof(tmpname), "%s-%s.idx", + base_name, sha1_to_hex(object_list_sha1)); + if (rename(idx_tmp_name, tmpname)) + die("unable to rename temporary index file: %s", + strerror(errno)); puts(sha1_to_hex(object_list_sha1)); } if (progress) -- cgit v1.2.3 From a3fbf4dfe1bf2386add261dc7c2809b652b5f9ae Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:31:31 -0400 Subject: pack-objects: make in_pack_header_size a variable of its own It currently aliases delta_size on the principle that reused deltas won't go through the whole delta matching loop hence delta_size was unused. This is not true if given delta doesn't find its base in the pack though. But we need that information even for whole object data reuse. Well in short the current state looks awful and is prone to bugs. It just works fine now because try_delta() tests trg_entry->delta before using trg_entry->delta_size, but that is a bit subtle and I was wondering for a while why things just worked fine... even if I'm guilty of having introduced this abomination myself in the first place. Let's do the sensible thing instead with no ambiguity, which is to have a separate variable for in_pack_header_size. This might even help future optimizations. While at it, let's reorder some struct object_entry members so they all align well with their own width, regardless of the architecture or the size of off_t. Some memory saving is to be expected with this alone. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 7af1776673..7100a76cd2 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -22,28 +22,26 @@ git-pack-objects [{ -q | --progress | --all-progress }] \n\ struct object_entry { unsigned char sha1[20]; + uint32_t crc32; /* crc of raw pack data for this object */ + off_t offset; /* offset into the final pack file */ unsigned long size; /* uncompressed size */ - off_t offset; /* offset into the final pack file; - * nonzero if already written. 
- */ - unsigned int depth; /* delta depth */ unsigned int hash; /* name hint hash */ - enum object_type type; - enum object_type in_pack_type; /* could be delta */ - unsigned long delta_size; /* delta data size (uncompressed) */ -#define in_pack_header_size delta_size /* only when reusing pack data */ - struct object_entry *delta; /* delta base object */ + unsigned int depth; /* delta depth */ struct packed_git *in_pack; /* already in pack */ off_t in_pack_offset; + struct object_entry *delta; /* delta base object */ struct object_entry *delta_child; /* deltified objects who bases me */ struct object_entry *delta_sibling; /* other deltified objects who * uses the same base as me */ - int preferred_base; /* we do not pack this, but is encouraged to - * be used as the base objectto delta huge - * objects against. - */ - uint32_t crc32; /* crc of raw pack data for this object */ + unsigned long delta_size; /* delta data size (uncompressed) */ + enum object_type type; + enum object_type in_pack_type; /* could be delta */ + unsigned char in_pack_header_size; + unsigned char preferred_base; /* we do not pack this, but is available + * to be used as the base objectto delta + * objects against. + */ }; /* -- cgit v1.2.3 From 5c49c11686df9d1c27a194349d0b2092e6446f42 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 16 Apr 2007 12:32:13 -0400 Subject: pack-objects: better check_object() performances With large amount of objects, check_object() is really trashing the pack sliding map and the filesystem cache. It has a completely random access pattern especially with old objects where delta replay jumps back and forth all over the pack. This patch improves things by: 1) sorting objects by their offset in pack before calling check_object() so the pack access pattern is linear; 2) recording the object type at add_object_entry() time since it is already known in most cases; 3) recording the pack offset even for preferred_base objects; 4) avoid calling sha1_object_info() if all possible. This limits pack accesses to the bare minimum and makes them perfectly linear. In the process check_object() was made more clear (to me at least). Note: I thought about walking the sorted_by_offset list backward in get_object_details() so if a pack happens to be larger than the available file cache, then the cache would have been populated with useful data from the beginning of the pack already when find_deltas() is called. Strangely, testing (on Linux) showed absolutely no performance difference. 
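For reference, the in-pack header that check_object() reads via unpack_object_header_gently() stores the object type in bits 4-6 of the first byte and the uncompressed size as a little-endian base-128 quantity. A self-contained sketch of that decoding, as an illustrative reimplementation rather than the actual helper from sha1_file.c:

#include <stddef.h>

/*
 * Decode a pack entry header: byte 0 carries the type in bits 4-6 and
 * the low 4 bits of the size; each following byte adds 7 more size
 * bits while the continuation bit (0x80) is set.  Returns the header
 * length in bytes, or 0 if the header runs past `len`.
 */
static unsigned long decode_pack_header(const unsigned char *buf,
                                        unsigned long len,
                                        unsigned *type,
                                        unsigned long *sizep)
{
    unsigned long size, used = 0;
    unsigned shift = 4;
    unsigned char c;

    if (!len)
        return 0;
    c = buf[used++];
    *type = (c >> 4) & 7;
    size = c & 15;
    while (c & 0x80) {
        if (used >= len)
            return 0;
        c = buf[used++];
        size += (unsigned long)(c & 0x7f) << shift;
        shift += 7;
    }
    *sizep = size;
    return used;
}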
Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 206 ++++++++++++++++++++++++++++++------------------- 1 file changed, 126 insertions(+), 80 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 7100a76cd2..19fae4c917 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -813,7 +813,8 @@ static unsigned name_hash(const char *name) return hash; } -static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude) +static int add_object_entry(const unsigned char *sha1, enum object_type type, + unsigned hash, int exclude) { struct object_entry *entry; struct packed_git *p, *found_pack = NULL; @@ -831,19 +832,19 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud return 0; } - if (!exclude) { - for (p = packed_git; p; p = p->next) { - off_t offset = find_pack_entry_one(sha1, p); - if (offset) { - if (incremental) - return 0; - if (local && !p->pack_local) - return 0; - if (!found_pack) { - found_offset = offset; - found_pack = p; - } + for (p = packed_git; p; p = p->next) { + off_t offset = find_pack_entry_one(sha1, p); + if (offset) { + if (!found_pack) { + found_offset = offset; + found_pack = p; } + if (exclude) + break; + if (incremental) + return 0; + if (local && !p->pack_local) + return 0; } } @@ -856,6 +857,8 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud memset(entry, 0, sizeof(*entry)); hashcpy(entry->sha1, sha1); entry->hash = hash; + if (type) + entry->type = type; if (exclude) entry->preferred_base = 1; else @@ -1008,7 +1011,9 @@ static void add_pbase_object(struct tree_desc *tree, return; if (name[cmplen] != '/') { unsigned hash = name_hash(fullname); - add_object_entry(entry.sha1, hash, 1); + add_object_entry(entry.sha1, + S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB, + hash, 1); return; } if (S_ISDIR(entry.mode)) { @@ -1079,7 +1084,7 @@ static void add_preferred_base_object(const char *name, unsigned hash) cmplen = name_cmp_len(name); for (it = pbase_tree; it; it = it->next) { if (cmplen == 0) { - add_object_entry(it->pcache.sha1, 0, 1); + add_object_entry(it->pcache.sha1, OBJ_TREE, 0, 1); } else { struct tree_desc tree; @@ -1121,87 +1126,105 @@ static void add_preferred_base(unsigned char *sha1) static void check_object(struct object_entry *entry) { - if (entry->in_pack && !entry->preferred_base) { + if (entry->in_pack) { struct packed_git *p = entry->in_pack; struct pack_window *w_curs = NULL; - unsigned long size, used; + const unsigned char *base_ref = NULL; + struct object_entry *base_entry; + unsigned long used, used_0; unsigned int avail; - unsigned char *buf; - struct object_entry *base_entry = NULL; + off_t ofs; + unsigned char *buf, c; buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail); - /* We want in_pack_type even if we do not reuse delta. + /* + * We want in_pack_type even if we do not reuse delta. * There is no point not reusing non-delta representations. */ used = unpack_object_header_gently(buf, avail, - &entry->in_pack_type, &size); + &entry->in_pack_type, + &entry->size); - /* Check if it is delta, and the base is also an object - * we are going to pack. If so we will reuse the existing - * delta. + /* + * Determine if this is a delta and if so whether we can + * reuse it or not. Otherwise let's find out as cheaply as + * possible what the actual type and size for this object is. 
*/ - if (!no_reuse_delta) { - unsigned char c; - const unsigned char *base_name; - off_t ofs; - unsigned long used_0; - /* there is at least 20 bytes left in the pack */ - switch (entry->in_pack_type) { - case OBJ_REF_DELTA: - base_name = use_pack(p, &w_curs, - entry->in_pack_offset + used, NULL); - used += 20; - break; - case OBJ_OFS_DELTA: - buf = use_pack(p, &w_curs, - entry->in_pack_offset + used, NULL); - used_0 = 0; - c = buf[used_0++]; - ofs = c & 127; - while (c & 128) { - ofs += 1; - if (!ofs || MSB(ofs, 7)) - die("delta base offset overflow in pack for %s", - sha1_to_hex(entry->sha1)); - c = buf[used_0++]; - ofs = (ofs << 7) + (c & 127); - } - if (ofs >= entry->in_pack_offset) - die("delta base offset out of bound for %s", + switch (entry->in_pack_type) { + default: + /* Not a delta hence we've already got all we need. */ + entry->type = entry->in_pack_type; + entry->in_pack_header_size = used; + unuse_pack(&w_curs); + return; + case OBJ_REF_DELTA: + if (!no_reuse_delta && !entry->preferred_base) + base_ref = use_pack(p, &w_curs, + entry->in_pack_offset + used, NULL); + entry->in_pack_header_size = used + 20; + break; + case OBJ_OFS_DELTA: + buf = use_pack(p, &w_curs, + entry->in_pack_offset + used, NULL); + used_0 = 0; + c = buf[used_0++]; + ofs = c & 127; + while (c & 128) { + ofs += 1; + if (!ofs || MSB(ofs, 7)) + die("delta base offset overflow in pack for %s", sha1_to_hex(entry->sha1)); - ofs = entry->in_pack_offset - ofs; - base_name = find_packed_object_name(p, ofs); - used += used_0; - break; - default: - base_name = NULL; + c = buf[used_0++]; + ofs = (ofs << 7) + (c & 127); } - if (base_name) - base_entry = locate_object_entry(base_name); + if (ofs >= entry->in_pack_offset) + die("delta base offset out of bound for %s", + sha1_to_hex(entry->sha1)); + ofs = entry->in_pack_offset - ofs; + if (!no_reuse_delta && !entry->preferred_base) + base_ref = find_packed_object_name(p, ofs); + entry->in_pack_header_size = used + used_0; + break; } - unuse_pack(&w_curs); - entry->in_pack_header_size = used; - if (base_entry) { - - /* Depth value does not matter - find_deltas() - * will never consider reused delta as the - * base object to deltify other objects - * against, in order to avoid circular deltas. + if (base_ref && (base_entry = locate_object_entry(base_ref))) { + /* + * If base_ref was set above that means we wish to + * reuse delta data, and we even found that base + * in the list of objects we want to pack. Goodie! + * + * Depth value does not matter - find_deltas() will + * never consider reused delta as the base object to + * deltify other objects against, in order to avoid + * circular deltas. */ - - /* uncompressed size of the delta data */ - entry->size = size; - entry->delta = base_entry; entry->type = entry->in_pack_type; - + entry->delta = base_entry; entry->delta_sibling = base_entry->delta_child; base_entry->delta_child = entry; + unuse_pack(&w_curs); + return; + } + if (entry->type) { + /* + * This must be a delta and we already know what the + * final object type is. Let's extract the actual + * object size from the delta header. + */ + entry->size = get_size_from_delta(p, &w_curs, + entry->in_pack_offset + entry->in_pack_header_size); + unuse_pack(&w_curs); return; } - /* Otherwise we would do the usual */ + + /* + * No choice but to fall back to the recursive delta walk + * with sha1_object_info() to find about the object type + * at this point... 
+ */ + unuse_pack(&w_curs); } entry->type = sha1_object_info(entry->sha1, &entry->size); @@ -1210,14 +1233,37 @@ static void check_object(struct object_entry *entry) sha1_to_hex(entry->sha1)); } +static int pack_offset_sort(const void *_a, const void *_b) +{ + const struct object_entry *a = *(struct object_entry **)_a; + const struct object_entry *b = *(struct object_entry **)_b; + + /* avoid filesystem trashing with loose objects */ + if (!a->in_pack && !b->in_pack) + return hashcmp(a->sha1, b->sha1); + + if (a->in_pack < b->in_pack) + return -1; + if (a->in_pack > b->in_pack) + return 1; + return a->in_pack_offset < b->in_pack_offset ? -1 : + (a->in_pack_offset > b->in_pack_offset); +} + static void get_object_details(void) { uint32_t i; - struct object_entry *entry; + struct object_entry **sorted_by_offset; + + sorted_by_offset = xcalloc(nr_objects, sizeof(struct object_entry *)); + for (i = 0; i < nr_objects; i++) + sorted_by_offset[i] = objects + i; + qsort(sorted_by_offset, nr_objects, sizeof(*sorted_by_offset), pack_offset_sort); prepare_pack_ix(); - for (i = 0, entry = objects; i < nr_objects; i++, entry++) - check_object(entry); + for (i = 0; i < nr_objects; i++) + check_object(sorted_by_offset[i]); + free(sorted_by_offset); } static int type_size_sort(const void *_a, const void *_b) @@ -1520,20 +1566,20 @@ static void read_object_list_from_stdin(void) hash = name_hash(line+41); add_preferred_base_object(line+41, hash); - add_object_entry(sha1, hash, 0); + add_object_entry(sha1, 0, hash, 0); } } static void show_commit(struct commit *commit) { - add_object_entry(commit->object.sha1, 0, 0); + add_object_entry(commit->object.sha1, OBJ_COMMIT, 0, 0); } static void show_object(struct object_array_entry *p) { unsigned hash = name_hash(p->name); add_preferred_base_object(p->name, hash); - add_object_entry(p->item->sha1, hash, 0); + add_object_entry(p->item->sha1, p->item->type, hash, 0); } static void show_edge(struct commit *commit) -- cgit v1.2.3 From e4d58311ba1a6cefa2ac5a3d95918af0d9b43588 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 19 Apr 2007 22:28:02 -0400 Subject: pack-objects: remove obsolete comments The sorted-by-sha ans sorted-by-type arrays are no more. Signed-off-by: Nicolas Pitre Signed-off-by: Junio C Hamano --- builtin-pack-objects.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'builtin-pack-objects.c') diff --git a/builtin-pack-objects.c b/builtin-pack-objects.c index 19fae4c917..c72e07a2bb 100644 --- a/builtin-pack-objects.c +++ b/builtin-pack-objects.c @@ -49,22 +49,15 @@ struct object_entry { * expanded). nr_objects & nr_alloc controls this array. They are stored * in the order we see -- typically rev-list --objects order that gives us * nice "minimum seek" order. - * - * sorted-by-sha ans sorted-by-type are arrays of pointers that point at - * elements in the objects array. The former is used to build the pack - * index (lists object names in the ascending order to help offset lookup), - * and the latter is used to group similar things together by try_delta() - * heuristics. 
*/ +static struct object_entry *objects; +static uint32_t nr_objects, nr_alloc, nr_result; static int non_empty; static int no_reuse_delta; static int local; static int incremental; static int allow_ofs_delta; - -static struct object_entry *objects; -static uint32_t nr_objects, nr_alloc, nr_result; static const char *pack_tmp_name, *idx_tmp_name; static char tmpname[PATH_MAX]; static unsigned char pack_file_sha1[20]; @@ -76,8 +69,7 @@ static int num_preferred_base; /* * The object names in objects array are hashed with this hashtable, - * to help looking up the entry by object name. Binary search from - * sorted_by_sha is also possible but this was easier to code and faster. + * to help looking up the entry by object name. * This hashtable is built after all the objects are seen. */ static int *object_ix; -- cgit v1.2.3
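One piece of pointer arithmetic from earlier in the series is worth spelling out: check_pack_crc() feeds the object's raw bytes, window by window, into zlib's crc32() and then compares the result with the entry stored in the version 2 index, which it locates by counting 32-bit words past the header, fan-out and SHA1 table. A stand-alone sketch of that comparison (chunked input stands in for use_pack(); the names here are illustrative):

#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>    /* ntohl() */
#include <zlib.h>         /* crc32() */

/*
 * Compare the CRC32 of an object's raw pack data, delivered in
 * chunks, against the value recorded in a version 2 index mapped at
 * `index`.  The CRC table starts 2 (header) + 256 (fan-out) +
 * nr_objects * 5 (20-byte SHA1s) 32-bit words into the file, and
 * `nr` is the object's position in SHA1 order.
 */
static int crc_matches(const unsigned char *index, uint32_t nr_objects,
                       uint32_t nr, const unsigned char **chunks,
                       const size_t *lens, int n_chunks)
{
    const uint32_t *index_crc =
        (const uint32_t *)index + 2 + 256 + nr_objects * (20 / 4) + nr;
    uLong crc = crc32(0L, Z_NULL, 0);    /* zlib's initial CRC value */
    int i;

    for (i = 0; i < n_chunks; i++)
        crc = crc32(crc, chunks[i], (uInt)lens[i]);
    return (uint32_t)crc == ntohl(*index_crc);
}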