From 949782d860889cbd50edfdf8c00e91d7a4f6cb05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:28 +0200 Subject: test-regex: isolate the bug test code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is in preparation to turn test-regex into some generic regex testing command. Helped-by: Eric Sunshine Helped-by: Ramsay Jones Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- t/t0070-fundamental.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 't') diff --git a/t/t0070-fundamental.sh b/t/t0070-fundamental.sh index 5ed69a6f56..991ed2a48d 100755 --- a/t/t0070-fundamental.sh +++ b/t/t0070-fundamental.sh @@ -31,7 +31,7 @@ test_expect_success 'git_mkstemps_mode does not fail if fd 0 is not open' ' test_expect_success 'check for a bug in the regex routines' ' # if this test fails, re-build git with NO_REGEX=1 - test-regex + test-regex --bug ' test_done -- cgit v1.2.3 From 5c1ebcca4d1df3b2a84d01b3f32ec13bf8f760f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:30 +0200 Subject: grep/icase: avoid kwsset on literal non-ascii strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we detect the pattern is just a literal string, we avoid heavy regex engine and use fast substring search implemented in kwsset.c. But kws uses git-ctype which is locale-independent so it does not know how to fold case properly outside ascii range. Let regcomp or pcre take care of this case instead. Slower, but accurate. Noticed-by: Plamen Totev Helped-by: René Scharfe Helped-by: Eric Sunshine Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- t/t7812-grep-icase-non-ascii.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100755 t/t7812-grep-icase-non-ascii.sh (limited to 't') diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh new file mode 100755 index 0000000000..b78a774dab --- /dev/null +++ b/t/t7812-grep-icase-non-ascii.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +test_description='grep icase on non-English locales' + +. ./lib-gettext.sh + +test_expect_success GETTEXT_LOCALE 'setup' ' + test_write_lines "TILRAUN: Halló Heimur!" >file && + git add file && + LC_ALL="$is_IS_locale" && + export LC_ALL +' + +test_have_prereq GETTEXT_LOCALE && +test-regex "HALLÓ" "Halló" ICASE && +test_set_prereq REGEX_LOCALE + +test_expect_success REGEX_LOCALE 'grep literal string, no -F' ' + git grep -i "TILRAUN: Halló Heimur!" && + git grep -i "TILRAUN: HALLÓ HEIMUR!" +' + +test_done -- cgit v1.2.3 From 793dc676e08394ed15bffe0ed66606ff9ced1c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:31 +0200 Subject: grep/icase: avoid kwsset when -F is specified MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to the previous commit, we can't use kws on icase search outside ascii range. But we can't simply pass the pattern to regcomp/pcre like the previous commit because it may contain regex special characters, so we need to quote the regex first. To avoid misquote traps that could lead to undefined behavior, we always stick to basic regex engine in this case. We don't need fancy features for grepping a literal string anyway. basic_regex_quote_buf() assumes that if the pattern is in a multibyte encoding, ascii chars must be unambiguously encoded as single bytes. This is true at least for UTF-8. For others, let's wait until people yell up. Chances are nobody uses multibyte, non utf-8 charsets anymore. Noticed-by: Plamen Totev Helped-by: René Scharfe Helped-by: Eric Sunshine Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- t/t7812-grep-icase-non-ascii.sh | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 't') diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index b78a774dab..1929809d4a 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -20,4 +20,30 @@ test_expect_success REGEX_LOCALE 'grep literal string, no -F' ' git grep -i "TILRAUN: HALLÓ HEIMUR!" ' +test_expect_success REGEX_LOCALE 'grep literal string, with -F' ' + git grep --debug -i -F "TILRAUN: Halló Heimur!" 2>&1 >/dev/null | + grep fixed >debug1 && + test_write_lines "fixed TILRAUN: Halló Heimur!" >expect1 && + test_cmp expect1 debug1 && + + git grep --debug -i -F "TILRAUN: HALLÓ HEIMUR!" 2>&1 >/dev/null | + grep fixed >debug2 && + test_write_lines "fixed TILRAUN: HALLÓ HEIMUR!" >expect2 && + test_cmp expect2 debug2 +' + +test_expect_success REGEX_LOCALE 'grep string with regex, with -F' ' + test_write_lines "^*TILR^AUN:.* \\Halló \$He[]imur!\$" >file && + + git grep --debug -i -F "^*TILR^AUN:.* \\Halló \$He[]imur!\$" 2>&1 >/dev/null | + grep fixed >debug1 && + test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\Halló \$He\\[]imur!\\\$" >expect1 && + test_cmp expect1 debug1 && + + git grep --debug -i -F "^*TILR^AUN:.* \\HALLÓ \$HE[]IMUR!\$" 2>&1 >/dev/null | + grep fixed >debug2 && + test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\HALLÓ \$HE\\[]IMUR!\\\$" >expect2 && + test_cmp expect2 debug2 +' + test_done -- cgit v1.2.3 From 9d9babb84d45234132f3cb1f4527ce1106e3d2ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:33 +0200 Subject: grep/pcre: prepare locale-dependent tables for icase matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default tables are usually built with C locale and only suitable for LANG=C or similar. This should make case insensitive search work correctly for all single-byte charsets. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- t/t7813-grep-icase-iso.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 t/t7813-grep-icase-iso.sh (limited to 't') diff --git a/t/t7813-grep-icase-iso.sh b/t/t7813-grep-icase-iso.sh new file mode 100755 index 0000000000..efef7fb81f --- /dev/null +++ b/t/t7813-grep-icase-iso.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +test_description='grep icase on non-English locales' + +. ./lib-gettext.sh + +test_expect_success GETTEXT_ISO_LOCALE 'setup' ' + printf "TILRAUN: Hall Heimur!" >file && + git add file && + LC_ALL="$is_IS_iso_locale" && + export LC_ALL +' + +test_expect_success GETTEXT_ISO_LOCALE,LIBPCRE 'grep pcre string' ' + git grep --perl-regexp -i "TILRAUN: H.ll Heimur!" && + git grep --perl-regexp -i "TILRAUN: H.LL HEIMUR!" +' + +test_done -- cgit v1.2.3 From 18547aacf5ced88bf83d85b03b2b7b3fc6aa3f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:35 +0200 Subject: grep/pcre: support utf-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the previous change in this function, we add locale support for single-byte encodings only. It looks like pcre only supports utf-* as multibyte encodings, the others are left in the cold (which is fine). We need to enable PCRE_UTF8 so pcre can find character boundary correctly. It's needed for case folding (when --ignore-case is used) or '*', '+' or similar syntax is used. The "has_non_ascii()" check is to be on the conservative side. If there's non-ascii in the pattern, the searched content could still be in utf-8, but we can treat it just like a byte stream and everything should work. If we force utf-8 based on locale only and pcre validates utf-8 and the file content is in non-utf8 encoding, things break. Noticed-by: Plamen Totev Helped-by: Plamen Totev Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- t/t7812-grep-icase-non-ascii.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 't') diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index 1929809d4a..08ae4c945d 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -20,6 +20,21 @@ test_expect_success REGEX_LOCALE 'grep literal string, no -F' ' git grep -i "TILRAUN: HALLÓ HEIMUR!" ' +test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 icase' ' + git grep --perl-regexp "TILRAUN: H.lló Heimur!" && + git grep --perl-regexp -i "TILRAUN: H.lló Heimur!" && + git grep --perl-regexp -i "TILRAUN: H.LLÓ HEIMUR!" +' + +test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 string with "+"' ' + test_write_lines "TILRAUN: Hallóó Heimur!" >file2 && + git add file2 && + git grep -l --perl-regexp "TILRAUN: H.lló+ Heimur!" >actual && + echo file >expected && + echo file2 >>expected && + test_cmp expected actual +' + test_expect_success REGEX_LOCALE 'grep literal string, with -F' ' git grep --debug -i -F "TILRAUN: Halló Heimur!" 2>&1 >/dev/null | grep fixed >debug1 && -- cgit v1.2.3 From b51a9c1479645b3e0c7d5156d027a97a4bb87977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:37 +0200 Subject: diffcore-pickaxe: support case insensitive match on non-ascii MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to the "grep -F -i" case, we can't use kws on icase search outside ascii range, so we quote the string and pass it to regcomp as a basic regexp and let regex engine deal with case sensitivity. The new test is put in t7812 instead of t4209-log-pickaxe because lib-gettext.sh might cause problems elsewhere, probably. Noticed-by: Plamen Totev Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano --- t/t7812-grep-icase-non-ascii.sh | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 't') diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index 08ae4c945d..169fd8d706 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -61,4 +61,11 @@ test_expect_success REGEX_LOCALE 'grep string with regex, with -F' ' test_cmp expect2 debug2 ' +test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' ' + git commit -m first && + git log --format=%f -i -S"TILRAUN: HALLÓ HEIMUR!" >actual && + echo first >expected && + test_cmp expected actual +' + test_done -- cgit v1.2.3