5 files changed, 316 insertions, 3 deletions
diff --git a/sha1-lookup.c b/sha1-lookup.c
index c4dc55d1f5..2dd851598a 100644
--- a/sha1-lookup.c
+++ b/sha1-lookup.c
@@ -204,7 +204,54 @@ int sha1_entry_pos(const void *table,
 			 * byte 0 thru (ofs-1) are the same between
 			 * lo and hi; ofs is the first byte that is
 			 * different.
+			 *
+			 * If ofs==20, then no bytes are different,
+			 * meaning we have entries with duplicate
+			 * keys. We know that we are in a solid run
+			 * of this entry (because the entries are
+			 * sorted, and our lo and hi are the same,
+			 * there can be nothing but this single key
+			 * in between). So we can stop the search.
+			 * Either one of these entries is it (and
+			 * we do not care which), or we do not have
+			 * it.
+			 *
+			 * Furthermore, we know that one of our
+			 * endpoints must be the edge of the run of
+			 * duplicates. For example, given this
+			 * sequence:
+			 *
+			 *     idx 0 1 2 3 4 5
+			 *     key A C C C C D
+			 *
+			 * If we are searching for "B", we might
+			 * hit the duplicate run at lo=1, hi=3
+			 * (e.g., by first mi=3, then mi=0). But we
+			 * can never have lo > 1, because B < C.
+			 * That is, if our key is less than the
+			 * run, we know that "lo" is the edge, but
+			 * we can say nothing of "hi". Similarly,
+			 * if our key is greater than the run, we
+			 * know that "hi" is the edge, but we can
+			 * say nothing of "lo".
+			 *
+			 * Therefore if we do not find it, we also
+			 * know where it would go if it did exist:
+			 * just on the far side of the edge that we
+			 * know about.
 			 */
+			if (ofs == 20) {
+				mi = lo;
+				mi_key = base + elem_size * mi + key_offset;
+				cmp = memcmp(mi_key, key, 20);
+				if (!cmp)
+					return mi;
+				if (cmp < 0)
+					return -1 - hi;
+				else
+					return -1 - lo;
+			}
+
 			hiv = hi_key[ofs_0];
 			if (ofs_0 < 19)
 				hiv = (hiv << 8) | hi_key[ofs_0+1];
diff --git a/t/lib-pack.sh b/t/lib-pack.sh
new file mode 100644
index 0000000000..7e8685b44c
--- /dev/null
+++ b/t/lib-pack.sh
@@ -0,0 +1,100 @@
+#!/bin/sh
+#
+# Support routines for hand-crafting weird or malicious packs.
+#
+# You can make a complete pack like:
+#
+#   pack_header 2 >foo.pack &&
+#   pack_obj e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 >>foo.pack &&
+#   pack_obj e68fe8129b546b101aee9510c5328e7f21ca1d18 >>foo.pack &&
+#   pack_trailer foo.pack
+
+# Print the big-endian 4-byte octal representation of $1
+uint32_octal () {
+	n=$1
+	printf '\%o' $(($n / 16777216)); n=$((n % 16777216))
+	printf '\%o' $(($n /    65536)); n=$((n %    65536))
+	printf '\%o' $(($n /      256)); n=$((n %      256))
+	printf '\%o' $(($n           ));
+}
+
+# Print the big-endian 4-byte binary representation of $1
+uint32_binary () {
+	printf "$(uint32_octal "$1")"
+}
+
+# Print a pack header, version 2, for a pack with $1 objects
+pack_header () {
+	printf 'PACK' &&
+	printf '\0\0\0\2' &&
+	uint32_binary "$1"
+}
+
+# Print the pack data for object $1, as a delta against object $2 (or as a full
+# object if $2 is missing or empty). The output is suitable for including
+# directly in the packfile, and represents the entirety of the object entry.
+# Doing this on the fly (especially picking your deltas) is quite tricky, so we
+# have hardcoded some well-known objects. See the case statements below for the
+# complete list.
+pack_obj () {
+	case "$1" in
+	# empty blob
+	e69de29bb2d1d6434b8b29ae775ad8c2e48c5391)
+		case "$2" in
+		'')
+			printf '\060\170\234\003\0\0\0\0\1'
+			return
+			;;
+		esac
+		;;
+
+	# blob containing "\7\76"
+	e68fe8129b546b101aee9510c5328e7f21ca1d18)
+		case "$2" in
+		'')
+			printf '\062\170\234\143\267\3\0\0\116\0\106'
+			return
+			;;
+		01d7713666f4de822776c7622c10f1b07de280dc)
+			printf '\165\1\327\161\66\146\364\336\202\47\166' &&
+			printf '\307\142\54\20\361\260\175\342\200\334\170' &&
+			printf '\234\143\142\142\142\267\003\0\0\151\0\114'
+			return
+			;;
+		esac
+		;;
+
+	# blob containing "\7\0"
+	01d7713666f4de822776c7622c10f1b07de280dc)
+		case "$2" in
+		'')
+			printf '\062\170\234\143\147\0\0\0\20\0\10'
+			return
+			;;
+		e68fe8129b546b101aee9510c5328e7f21ca1d18)
+			printf '\165\346\217\350\22\233\124\153\20\32\356' &&
+			printf '\225\20\305\62\216\177\41\312\35\30\170\234' &&
+			printf '\143\142\142\142\147\0\0\0\53\0\16'
+			return
+			;;
+		esac
+		;;
+	esac
+
+	echo >&2 "BUG: don't know how to print $1${2:+ (from $2)}"
+	return 1
+}
+
+# Compute and append pack trailer to "$1"
+pack_trailer () {
+	test-sha1 -b <"$1" >trailer.tmp &&
+	cat trailer.tmp >>"$1" &&
+	rm -f trailer.tmp
+}
+
+# Remove any existing packs to make sure that
+# whatever we index next will be the pack that we
+# actually use.
+clear_packs () {
+	rm -f .git/objects/pack/*
+}
diff --git a/t/t5308-pack-detect-duplicates.sh b/t/t5308-pack-detect-duplicates.sh
new file mode 100755
index 0000000000..9c5a8766ab
--- /dev/null
+++ b/t/t5308-pack-detect-duplicates.sh
@@ -0,0 +1,80 @@
+#!/bin/sh
+
+test_description='handling of duplicate objects in incoming packfiles'
+. ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-pack.sh
+
+# The sha1s we have in our pack. It's important that these have the same
+# starting byte, so that they end up in the same fanout section of the index.
+# That lets us make sure we are exercising the binary search with both sets.
+LO_SHA1=e68fe8129b546b101aee9510c5328e7f21ca1d18
+HI_SHA1=e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
+
+# And here's a "missing sha1" which will produce failed lookups. It must also
+# be in the same fanout section, and should be between the two (so that during
+# our binary search, we are sure to end up looking at one or the other of the
+# duplicate runs).
+MISSING_SHA1='e69d000000000000000000000000000000000000'
+
+# git will never intentionally create packfiles with
+# duplicate objects, so we have to construct them by hand.
+#
+# $1 is the name of the packfile to create
+#
+# $2 is the number of times to duplicate each object
+create_pack () {
+	pack_header "$((2 * $2))" >"$1" &&
+	for i in $(test_seq 1 "$2"); do
+		pack_obj $LO_SHA1 &&
+		pack_obj $HI_SHA1
+	done >>"$1" &&
+	pack_trailer "$1"
+}
+
+# double-check that create_pack actually works
+test_expect_success 'pack with no duplicates' '
+	create_pack no-dups.pack 1 &&
+	git index-pack --stdin <no-dups.pack
+'
+
+test_expect_success 'index-pack will allow duplicate objects by default' '
+	clear_packs &&
+	create_pack dups.pack 100 &&
+	git index-pack --stdin <dups.pack
+'
+
+test_expect_success 'create batch-check test vectors' '
+	cat >input <<-EOF &&
+	$LO_SHA1
+	$HI_SHA1
+	$MISSING_SHA1
+	EOF
+	cat >expect <<-EOF
+	$LO_SHA1 blob 2
+	$HI_SHA1 blob 0
+	$MISSING_SHA1 missing
+	EOF
+'
+
+test_expect_success 'lookup in duplicated pack (binary search)' '
+	git cat-file --batch-check <input >actual &&
+	test_cmp expect actual
+'
+
+test_expect_success 'lookup in duplicated pack (GIT_USE_LOOKUP)' '
+	(
+		GIT_USE_LOOKUP=1 &&
+		export GIT_USE_LOOKUP &&
+		git cat-file --batch-check <input >actual
+	) &&
+	test_cmp expect actual
+'
+
+test_expect_success 'index-pack can reject packs with duplicates' '
+	clear_packs &&
+	create_pack dups.pack 2 &&
+	test_must_fail git index-pack --strict --stdin <dups.pack &&
+	test_expect_code 1 git cat-file -e $LO_SHA1
+'
+
+test_done
diff --git a/t/t5309-pack-delta-cycles.sh b/t/t5309-pack-delta-cycles.sh
new file mode 100755
index 0000000000..3e7861b075
--- /dev/null
+++ b/t/t5309-pack-delta-cycles.sh
@@ -0,0 +1,77 @@
+#!/bin/sh
+
+test_description='test index-pack handling of delta cycles in packfiles'
+. ./test-lib.sh
+. "$TEST_DIRECTORY"/lib-pack.sh
+
+# Two similar-ish objects that we have computed deltas between.
+A=01d7713666f4de822776c7622c10f1b07de280dc
+B=e68fe8129b546b101aee9510c5328e7f21ca1d18
+
+# double-check our hand-constucted packs
+test_expect_success 'index-pack works with a single delta (A->B)' '
+	clear_packs &&
+	{
+		pack_header 2 &&
+		pack_obj $A $B &&
+		pack_obj $B
+	} >ab.pack &&
+	pack_trailer ab.pack &&
+	git index-pack --stdin <ab.pack &&
+	git cat-file -t $A &&
+	git cat-file -t $B
+'
+
+test_expect_success 'index-pack works with a single delta (B->A)' '
+	clear_packs &&
+	{
+		pack_header 2 &&
+		pack_obj $A &&
+		pack_obj $B $A
+	} >ba.pack &&
+	pack_trailer ba.pack &&
+	git index-pack --stdin <ba.pack &&
+	git cat-file -t $A &&
+	git cat-file -t $B
+'
+
+test_expect_success 'index-pack detects missing base objects' '
+	clear_packs &&
+	{
+		pack_header 1 &&
+		pack_obj $A $B
+	} >missing.pack &&
+	pack_trailer missing.pack &&
+	test_must_fail git index-pack --fix-thin --stdin <missing.pack
+'
+
+test_expect_success 'index-pack detects REF_DELTA cycles' '
+	clear_packs &&
+	{
+		pack_header 2 &&
+		pack_obj $A $B &&
+		pack_obj $B $A
+	} >cycle.pack &&
+	pack_trailer cycle.pack &&
+	test_must_fail git index-pack --fix-thin --stdin <cycle.pack
+'
+
+test_expect_failure 'failover to an object in another pack' '
+	clear_packs &&
+	git index-pack --stdin <ab.pack &&
+	git index-pack --stdin --fix-thin <cycle.pack
+'
+
+test_expect_failure 'failover to a duplicate object in the same pack' '
+	clear_packs &&
+	{
+		pack_header 3 &&
+		pack_obj $A $B &&
+		pack_obj $B $A &&
+		pack_obj $A
+	} >recoverable.pack &&
+	pack_trailer recoverable.pack &&
+	git index-pack --fix-thin --stdin <recoverable.pack
+'
+
+test_done
diff --git a/test-sha1.c b/test-sha1.c
index 80daba980e..e57eae10bf 100644
--- a/test-sha1.c
+++ b/test-sha1.c
@@ -5,10 +5,15 @@ int main(int ac, char **av)
 	git_SHA_CTX ctx;
 	unsigned char sha1[20];
 	unsigned bufsz = 8192;
+	int binary = 0;
 	char *buffer;
 
-	if (ac == 2)
-		bufsz = strtoul(av[1], NULL, 10) * 1024 * 1024;
+	if (ac == 2) {
+		if (!strcmp(av[1], "-b"))
+			binary = 1;
+		else
+			bufsz = strtoul(av[1], NULL, 10) * 1024 * 1024;
+	}
 
 	if (!bufsz)
 		bufsz = 8192;
@@ -42,6 +47,10 @@ int main(int ac, char **av)
 		git_SHA1_Update(&ctx, buffer, this_sz);
 	}
 	git_SHA1_Final(sha1, &ctx);
-	puts(sha1_to_hex(sha1));
+
+	if (binary)
+		fwrite(sha1, 1, 20, stdout);
+	else
+		puts(sha1_to_hex(sha1));
 	exit(0);
 }