From 77f6f4406fef43130e9c72e32ac5e30c5bbe0b9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Sat, 20 May 2017 21:42:15 +0000 Subject: grep: add a test helper function for less verbose -f \0 tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a helper function to make the tests which check for patterns with \0 in them more succinct. Right now this isn't a big win, but subsequent commits will add a lot more of these tests. The helper is based on the match() function in t3070-wildmatch.sh. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t7008-grep-binary.sh | 58 +++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 't/t7008-grep-binary.sh') diff --git a/t/t7008-grep-binary.sh b/t/t7008-grep-binary.sh index 9c9c378119..df93d8e44c 100755 --- a/t/t7008-grep-binary.sh +++ b/t/t7008-grep-binary.sh @@ -4,6 +4,29 @@ test_description='git grep in binary files' . ./test-lib.sh +nul_match () { + matches=$1 + flags=$2 + pattern=$3 + pattern_human=$(echo "$pattern" | sed 's/Q//g') + + if test "$matches" = 1 + then + test_expect_success "git grep -f f $flags '$pattern_human' a" " + printf '$pattern' | q_to_nul >f && + git grep -f f $flags a + " + elif test "$matches" = 0 + then + test_expect_success "git grep -f f $flags '$pattern_human' a" " + printf '$pattern' | q_to_nul >f && + test_must_fail git grep -f f $flags a + " + else + test_expect_success "PANIC: Test framework error. Unknown matches value $matches" 'false' + fi +} + test_expect_success 'setup' " echo 'binaryQfile' | q_to_nul >a && git add a && @@ -69,35 +92,12 @@ test_expect_failure 'git grep .fi a' ' git grep .fi a ' -test_expect_success 'git grep -F yf a' " - printf 'yQf' | q_to_nul >f && - git grep -f f -F a -" - -test_expect_success 'git grep -F yx a' " - printf 'yQx' | q_to_nul >f && - test_must_fail git grep -f f -F a -" - -test_expect_success 'git grep -Fi Yf a' " - printf 'YQf' | q_to_nul >f && - git grep -f f -Fi a -" - -test_expect_success 'git grep -Fi Yx a' " - printf 'YQx' | q_to_nul >f && - test_must_fail git grep -f f -Fi a -" - -test_expect_success 'git grep yf a' " - printf 'yQf' | q_to_nul >f && - git grep -f f a -" - -test_expect_success 'git grep yx a' " - printf 'yQx' | q_to_nul >f && - test_must_fail git grep -f f a -" +nul_match 1 '-F' 'yQf' +nul_match 0 '-F' 'yQx' +nul_match 1 '-Fi' 'YQf' +nul_match 0 '-Fi' 'YQx' +nul_match 1 '' 'yQf' +nul_match 0 '' 'yQx' test_expect_success 'grep respects binary diff attribute' ' echo text >t && -- cgit v1.2.3 From 12fc32faa8b51d33f3793c45ea20ab02b9717574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Sat, 20 May 2017 21:42:16 +0000 Subject: grep: prepare for testing binary regexes containing rx metacharacters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add setup code needed for testing regexes that contain both binary data and regex metacharacters. The POSIX regcomp() function inherently can't support that, because it takes a \0-delimited char *, but other regex engines APIs like PCRE v2 take a pattern/length pair, and are thus able to handle \0s in patterns as well as any other character. When kwset was imported in commit 9eceddeec6 ("Use kwset in grep", 2011-08-21) this limitation was fixed, but at the expense of introducing the undocumented limitation that any pattern containing \0 implicitly becomes a fixed match (equivalent to -F having been provided). That's not something we'd like to keep in the future. The inability to match patterns containing \0 is a leaky implementation detail. So add tests as a first step towards changing that. In order to test that \0-patterns can properly match as regexes the test string needs to have some regex metacharacters in it. There were other blind spots in the tests. The code around kwset specially handles case-insensitive & non-ASCII data, but there were no tests for this. Fix all of that by amending the text being matched to contain both regex metacharacters & non-ASCII data. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t7008-grep-binary.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 't/t7008-grep-binary.sh') diff --git a/t/t7008-grep-binary.sh b/t/t7008-grep-binary.sh index df93d8e44c..20370d6e0c 100755 --- a/t/t7008-grep-binary.sh +++ b/t/t7008-grep-binary.sh @@ -28,7 +28,7 @@ nul_match () { } test_expect_success 'setup' " - echo 'binaryQfile' | q_to_nul >a && + echo 'binaryQfileQm[*]cQ*æQð' | q_to_nul >a && git add a && git commit -m. " @@ -162,7 +162,7 @@ test_expect_success 'grep does not honor textconv' ' ' test_expect_success 'grep --textconv honors textconv' ' - echo "a:binaryQfile" >expect && + echo "a:binaryQfileQm[*]cQ*æQð" >expect && git grep --textconv Qfile >actual && test_cmp expect actual ' @@ -172,7 +172,7 @@ test_expect_success 'grep --no-textconv does not honor textconv' ' ' test_expect_success 'grep --textconv blob honors textconv' ' - echo "HEAD:a:binaryQfile" >expect && + echo "HEAD:a:binaryQfileQm[*]cQ*æQð" >expect && git grep --textconv Qfile HEAD:a >actual && test_cmp expect actual ' -- cgit v1.2.3 From 966be95549374916fc2736897b99c4918899b8ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=86var=20Arnfj=C3=B6r=C3=B0=20Bjarmason?= Date: Sat, 20 May 2017 21:42:17 +0000 Subject: grep: add tests to fix blind spots with \0 patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address a big blind spot in the tests for patterns containing \0. The is_fixed() function considers any string that contains \0 fixed, even if it contains regular expression metacharacters, those patterns are currently matched with kwset. Before this change removing that memchr(s, 0, len) check from is_fixed() wouldn't change the result of any of the tests, since regcomp() will happily match the part before the \0. The kwset path is dependent on whether the the -i flag is on, and whether the pattern has any non-ASCII characters, but none of this was tested for. Signed-off-by: Ævar Arnfjörð Bjarmason Signed-off-by: Junio C Hamano --- t/t7008-grep-binary.sh | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) (limited to 't/t7008-grep-binary.sh') diff --git a/t/t7008-grep-binary.sh b/t/t7008-grep-binary.sh index 20370d6e0c..615e7e0162 100755 --- a/t/t7008-grep-binary.sh +++ b/t/t7008-grep-binary.sh @@ -22,6 +22,18 @@ nul_match () { printf '$pattern' | q_to_nul >f && test_must_fail git grep -f f $flags a " + elif test "$matches" = T1 + then + test_expect_failure "git grep -f f $flags '$pattern_human' a" " + printf '$pattern' | q_to_nul >f && + git grep -f f $flags a + " + elif test "$matches" = T0 + then + test_expect_failure "git grep -f f $flags '$pattern_human' a" " + printf '$pattern' | q_to_nul >f && + test_must_fail git grep -f f $flags a + " else test_expect_success "PANIC: Test framework error. Unknown matches value $matches" 'false' fi @@ -98,6 +110,65 @@ nul_match 1 '-Fi' 'YQf' nul_match 0 '-Fi' 'YQx' nul_match 1 '' 'yQf' nul_match 0 '' 'yQx' +nul_match 1 '' 'æQð' +nul_match 1 '-F' 'eQm[*]c' +nul_match 1 '-Fi' 'EQM[*]C' + +# Regex patterns that would match but shouldn't with -F +nul_match 0 '-F' 'yQ[f]' +nul_match 0 '-F' '[y]Qf' +nul_match 0 '-Fi' 'YQ[F]' +nul_match 0 '-Fi' '[Y]QF' +nul_match 0 '-F' 'æQ[ð]' +nul_match 0 '-F' '[æ]Qð' +nul_match 0 '-Fi' 'ÆQ[Ð]' +nul_match 0 '-Fi' '[Æ]QÐ' + +# kwset is disabled on -i & non-ASCII. No way to match non-ASCII \0 +# patterns case-insensitively. +nul_match T1 '-i' 'ÆQÐ' + +# \0 implicitly disables regexes. This is an undocumented internal +# limitation. +nul_match T1 '' 'yQ[f]' +nul_match T1 '' '[y]Qf' +nul_match T1 '-i' 'YQ[F]' +nul_match T1 '-i' '[Y]Qf' +nul_match T1 '' 'æQ[ð]' +nul_match T1 '' '[æ]Qð' +nul_match T1 '-i' 'ÆQ[Ð]' + +# ... because of \0 implicitly disabling regexes regexes that +# should/shouldn't match don't do the right thing. +nul_match T1 '' 'eQm.*cQ' +nul_match T1 '-i' 'EQM.*cQ' +nul_match T0 '' 'eQm[*]c' +nul_match T0 '-i' 'EQM[*]C' + +# Due to the REG_STARTEND extension when kwset() is disabled on -i & +# non-ASCII the string will be matched in its entirety, but the +# pattern will be cut off at the first \0. +nul_match 0 '-i' 'NOMATCHQð' +nul_match T0 '-i' '[Æ]QNOMATCH' +nul_match T0 '-i' '[æ]QNOMATCH' +# Matches, but for the wrong reasons, just stops at [æ] +nul_match 1 '-i' '[Æ]Qð' +nul_match 1 '-i' '[æ]Qð' + +# Ensure that the matcher doesn't regress to something that stops at +# \0 +nul_match 0 '-F' 'yQ[f]' +nul_match 0 '-Fi' 'YQ[F]' +nul_match 0 '' 'yQNOMATCH' +nul_match 0 '' 'QNOMATCH' +nul_match 0 '-i' 'YQNOMATCH' +nul_match 0 '-i' 'QNOMATCH' +nul_match 0 '-F' 'æQ[ð]' +nul_match 0 '-Fi' 'ÆQ[Ð]' +nul_match 0 '' 'yQNÓMATCH' +nul_match 0 '' 'QNÓMATCH' +nul_match 0 '-i' 'YQNÓMATCH' +nul_match 0 '-i' 'QNÓMATCH' test_expect_success 'grep respects binary diff attribute' ' echo text >t && -- cgit v1.2.3