diff options
-rw-r--r-- | sha1-lookup.c | 47 | ||||
-rw-r--r-- | t/lib-pack.sh | 100 | ||||
-rwxr-xr-x | t/t5308-pack-detect-duplicates.sh | 80 | ||||
-rwxr-xr-x | t/t5309-pack-delta-cycles.sh | 77 | ||||
-rw-r--r-- | test-sha1.c | 15 |
5 files changed, 316 insertions, 3 deletions
diff --git a/sha1-lookup.c b/sha1-lookup.c index c4dc55d1f5..2dd851598a 100644 --- a/sha1-lookup.c +++ b/sha1-lookup.c @@ -204,7 +204,54 @@ int sha1_entry_pos(const void *table, * byte 0 thru (ofs-1) are the same between * lo and hi; ofs is the first byte that is * different. + * + * If ofs==20, then no bytes are different, + * meaning we have entries with duplicate + * keys. We know that we are in a solid run + * of this entry (because the entries are + * sorted, and our lo and hi are the same, + * there can be nothing but this single key + * in between). So we can stop the search. + * Either one of these entries is it (and + * we do not care which), or we do not have + * it. + * + * Furthermore, we know that one of our + * endpoints must be the edge of the run of + * duplicates. For example, given this + * sequence: + * + * idx 0 1 2 3 4 5 + * key A C C C C D + * + * If we are searching for "B", we might + * hit the duplicate run at lo=1, hi=3 + * (e.g., by first mi=3, then mi=0). But we + * can never have lo > 1, because B < C. + * That is, if our key is less than the + * run, we know that "lo" is the edge, but + * we can say nothing of "hi". Similarly, + * if our key is greater than the run, we + * know that "hi" is the edge, but we can + * say nothing of "lo". + * + * Therefore if we do not find it, we also + * know where it would go if it did exist: + * just on the far side of the edge that we + * know about. */ + if (ofs == 20) { + mi = lo; + mi_key = base + elem_size * mi + key_offset; + cmp = memcmp(mi_key, key, 20); + if (!cmp) + return mi; + if (cmp < 0) + return -1 - hi; + else + return -1 - lo; + } + hiv = hi_key[ofs_0]; if (ofs_0 < 19) hiv = (hiv << 8) | hi_key[ofs_0+1]; diff --git a/t/lib-pack.sh b/t/lib-pack.sh new file mode 100644 index 0000000000..7e8685b44c --- /dev/null +++ b/t/lib-pack.sh @@ -0,0 +1,100 @@ +#!/bin/sh +# +# Support routines for hand-crafting weird or malicious packs. +# +# You can make a complete pack like: +# +# pack_header 2 >foo.pack && +# pack_obj e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 >>foo.pack && +# pack_obj e68fe8129b546b101aee9510c5328e7f21ca1d18 >>foo.pack && +# pack_trailer foo.pack + +# Print the big-endian 4-byte octal representation of $1 +uint32_octal () { + n=$1 + printf '\%o' $(($n / 16777216)); n=$((n % 16777216)) + printf '\%o' $(($n / 65536)); n=$((n % 65536)) + printf '\%o' $(($n / 256)); n=$((n % 256)) + printf '\%o' $(($n )); +} + +# Print the big-endian 4-byte binary representation of $1 +uint32_binary () { + printf "$(uint32_octal "$1")" +} + +# Print a pack header, version 2, for a pack with $1 objects +pack_header () { + printf 'PACK' && + printf '\0\0\0\2' && + uint32_binary "$1" +} + +# Print the pack data for object $1, as a delta against object $2 (or as a full +# object if $2 is missing or empty). The output is suitable for including +# directly in the packfile, and represents the entirety of the object entry. +# Doing this on the fly (especially picking your deltas) is quite tricky, so we +# have hardcoded some well-known objects. See the case statements below for the +# complete list. +pack_obj () { + case "$1" in + # empty blob + e69de29bb2d1d6434b8b29ae775ad8c2e48c5391) + case "$2" in + '') + printf '\060\170\234\003\0\0\0\0\1' + return + ;; + esac + ;; + + # blob containing "\7\76" + e68fe8129b546b101aee9510c5328e7f21ca1d18) + case "$2" in + '') + printf '\062\170\234\143\267\3\0\0\116\0\106' + return + ;; + 01d7713666f4de822776c7622c10f1b07de280dc) + printf '\165\1\327\161\66\146\364\336\202\47\166' && + printf '\307\142\54\20\361\260\175\342\200\334\170' && + printf '\234\143\142\142\142\267\003\0\0\151\0\114' + return + ;; + esac + ;; + + # blob containing "\7\0" + 01d7713666f4de822776c7622c10f1b07de280dc) + case "$2" in + '') + printf '\062\170\234\143\147\0\0\0\20\0\10' + return + ;; + e68fe8129b546b101aee9510c5328e7f21ca1d18) + printf '\165\346\217\350\22\233\124\153\20\32\356' && + printf '\225\20\305\62\216\177\41\312\35\30\170\234' && + printf '\143\142\142\142\147\0\0\0\53\0\16' + return + ;; + esac + ;; + esac + + echo >&2 "BUG: don't know how to print $1${2:+ (from $2)}" + return 1 +} + +# Compute and append pack trailer to "$1" +pack_trailer () { + test-sha1 -b <"$1" >trailer.tmp && + cat trailer.tmp >>"$1" && + rm -f trailer.tmp +} + +# Remove any existing packs to make sure that +# whatever we index next will be the pack that we +# actually use. +clear_packs () { + rm -f .git/objects/pack/* +} diff --git a/t/t5308-pack-detect-duplicates.sh b/t/t5308-pack-detect-duplicates.sh new file mode 100755 index 0000000000..9c5a8766ab --- /dev/null +++ b/t/t5308-pack-detect-duplicates.sh @@ -0,0 +1,80 @@ +#!/bin/sh + +test_description='handling of duplicate objects in incoming packfiles' +. ./test-lib.sh +. "$TEST_DIRECTORY"/lib-pack.sh + +# The sha1s we have in our pack. It's important that these have the same +# starting byte, so that they end up in the same fanout section of the index. +# That lets us make sure we are exercising the binary search with both sets. +LO_SHA1=e68fe8129b546b101aee9510c5328e7f21ca1d18 +HI_SHA1=e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 + +# And here's a "missing sha1" which will produce failed lookups. It must also +# be in the same fanout section, and should be between the two (so that during +# our binary search, we are sure to end up looking at one or the other of the +# duplicate runs). +MISSING_SHA1='e69d000000000000000000000000000000000000' + +# git will never intentionally create packfiles with +# duplicate objects, so we have to construct them by hand. +# +# $1 is the name of the packfile to create +# +# $2 is the number of times to duplicate each object +create_pack () { + pack_header "$((2 * $2))" >"$1" && + for i in $(test_seq 1 "$2"); do + pack_obj $LO_SHA1 && + pack_obj $HI_SHA1 + done >>"$1" && + pack_trailer "$1" +} + +# double-check that create_pack actually works +test_expect_success 'pack with no duplicates' ' + create_pack no-dups.pack 1 && + git index-pack --stdin <no-dups.pack +' + +test_expect_success 'index-pack will allow duplicate objects by default' ' + clear_packs && + create_pack dups.pack 100 && + git index-pack --stdin <dups.pack +' + +test_expect_success 'create batch-check test vectors' ' + cat >input <<-EOF && + $LO_SHA1 + $HI_SHA1 + $MISSING_SHA1 + EOF + cat >expect <<-EOF + $LO_SHA1 blob 2 + $HI_SHA1 blob 0 + $MISSING_SHA1 missing + EOF +' + +test_expect_success 'lookup in duplicated pack (binary search)' ' + git cat-file --batch-check <input >actual && + test_cmp expect actual +' + +test_expect_success 'lookup in duplicated pack (GIT_USE_LOOKUP)' ' + ( + GIT_USE_LOOKUP=1 && + export GIT_USE_LOOKUP && + git cat-file --batch-check <input >actual + ) && + test_cmp expect actual +' + +test_expect_success 'index-pack can reject packs with duplicates' ' + clear_packs && + create_pack dups.pack 2 && + test_must_fail git index-pack --strict --stdin <dups.pack && + test_expect_code 1 git cat-file -e $LO_SHA1 +' + +test_done diff --git a/t/t5309-pack-delta-cycles.sh b/t/t5309-pack-delta-cycles.sh new file mode 100755 index 0000000000..3e7861b075 --- /dev/null +++ b/t/t5309-pack-delta-cycles.sh @@ -0,0 +1,77 @@ +#!/bin/sh + +test_description='test index-pack handling of delta cycles in packfiles' +. ./test-lib.sh +. "$TEST_DIRECTORY"/lib-pack.sh + +# Two similar-ish objects that we have computed deltas between. +A=01d7713666f4de822776c7622c10f1b07de280dc +B=e68fe8129b546b101aee9510c5328e7f21ca1d18 + +# double-check our hand-constucted packs +test_expect_success 'index-pack works with a single delta (A->B)' ' + clear_packs && + { + pack_header 2 && + pack_obj $A $B && + pack_obj $B + } >ab.pack && + pack_trailer ab.pack && + git index-pack --stdin <ab.pack && + git cat-file -t $A && + git cat-file -t $B +' + +test_expect_success 'index-pack works with a single delta (B->A)' ' + clear_packs && + { + pack_header 2 && + pack_obj $A && + pack_obj $B $A + } >ba.pack && + pack_trailer ba.pack && + git index-pack --stdin <ba.pack && + git cat-file -t $A && + git cat-file -t $B +' + +test_expect_success 'index-pack detects missing base objects' ' + clear_packs && + { + pack_header 1 && + pack_obj $A $B + } >missing.pack && + pack_trailer missing.pack && + test_must_fail git index-pack --fix-thin --stdin <missing.pack +' + +test_expect_success 'index-pack detects REF_DELTA cycles' ' + clear_packs && + { + pack_header 2 && + pack_obj $A $B && + pack_obj $B $A + } >cycle.pack && + pack_trailer cycle.pack && + test_must_fail git index-pack --fix-thin --stdin <cycle.pack +' + +test_expect_failure 'failover to an object in another pack' ' + clear_packs && + git index-pack --stdin <ab.pack && + git index-pack --stdin --fix-thin <cycle.pack +' + +test_expect_failure 'failover to a duplicate object in the same pack' ' + clear_packs && + { + pack_header 3 && + pack_obj $A $B && + pack_obj $B $A && + pack_obj $A + } >recoverable.pack && + pack_trailer recoverable.pack && + git index-pack --fix-thin --stdin <recoverable.pack +' + +test_done diff --git a/test-sha1.c b/test-sha1.c index 80daba980e..e57eae10bf 100644 --- a/test-sha1.c +++ b/test-sha1.c @@ -5,10 +5,15 @@ int main(int ac, char **av) git_SHA_CTX ctx; unsigned char sha1[20]; unsigned bufsz = 8192; + int binary = 0; char *buffer; - if (ac == 2) - bufsz = strtoul(av[1], NULL, 10) * 1024 * 1024; + if (ac == 2) { + if (!strcmp(av[1], "-b")) + binary = 1; + else + bufsz = strtoul(av[1], NULL, 10) * 1024 * 1024; + } if (!bufsz) bufsz = 8192; @@ -42,6 +47,10 @@ int main(int ac, char **av) git_SHA1_Update(&ctx, buffer, this_sz); } git_SHA1_Final(sha1, &ctx); - puts(sha1_to_hex(sha1)); + + if (binary) + fwrite(sha1, 1, 20, stdout); + else + puts(sha1_to_hex(sha1)); exit(0); } |