diff options
author | 2019-06-13 13:19:39 -0700 | |
---|---|---|
committer | 2019-06-13 13:19:40 -0700 | |
commit | 66dc7b68e4a332c662980b0266b9a1cb545f40ff (patch) | |
tree | b0a77642e65e7dbb8eff9e6ed2b4c9b065e0c144 | |
parent | Merge branch 'jk/unused-params-final-batch' (diff) | |
parent | fast-export: do automatic reencoding of commit messages only if requested (diff) | |
download | tgif-66dc7b68e4a332c662980b0266b9a1cb545f40ff.tar.xz |
Merge branch 'en/fast-export-encoding'
The "git fast-export/import" pair has been taught to handle commits
with log messages in encoding other than UTF-8 better.
* en/fast-export-encoding:
fast-export: do automatic reencoding of commit messages only if requested
fast-export: differentiate between explicitly UTF-8 and implicitly UTF-8
fast-export: avoid stripping encoding header if we cannot reencode
fast-import: support 'encoding' commit header
t9350: fix encoding test to actually test reencoding
-rw-r--r-- | Documentation/git-fast-export.txt | 7 | ||||
-rw-r--r-- | Documentation/git-fast-import.txt | 7 | ||||
-rw-r--r-- | builtin/fast-export.c | 55 | ||||
-rw-r--r-- | fast-import.c | 11 | ||||
-rwxr-xr-x | t/t9300-fast-import.sh | 20 | ||||
-rwxr-xr-x | t/t9350-fast-export.sh | 78 | ||||
-rw-r--r-- | t/t9350/broken-iso-8859-7-commit-message.txt | 1 | ||||
-rw-r--r-- | t/t9350/simple-iso-8859-7-commit-message.txt | 1 |
8 files changed, 163 insertions, 17 deletions
diff --git a/Documentation/git-fast-export.txt b/Documentation/git-fast-export.txt index 64c01ba918..11427acdde 100644 --- a/Documentation/git-fast-export.txt +++ b/Documentation/git-fast-export.txt @@ -129,6 +129,13 @@ marks the same across runs. for intermediary filters (e.g. for rewriting commit messages which refer to older commits, or for stripping blobs by id). +--reencode=(yes|no|abort):: + Specify how to handle `encoding` header in commit objects. When + asking to 'abort' (which is the default), this program will die + when encountering such a commit object. With 'yes', the commit + message will be reencoded into UTF-8. With 'no', the original + encoding will be preserved. + --refspec:: Apply the specified refspec to each ref exported. Multiple of them can be specified. diff --git a/Documentation/git-fast-import.txt b/Documentation/git-fast-import.txt index d65cdb3d08..7baf9e47b5 100644 --- a/Documentation/git-fast-import.txt +++ b/Documentation/git-fast-import.txt @@ -388,6 +388,7 @@ change to the project. original-oid? ('author' (SP <name>)? SP LT <email> GT SP <when> LF)? 'committer' (SP <name>)? SP LT <email> GT SP <when> LF + ('encoding' SP <encoding>)? data ('from' SP <commit-ish> LF)? ('merge' SP <commit-ish> LF)? @@ -455,6 +456,12 @@ that was selected by the --date-format=<fmt> command-line option. See ``Date Formats'' above for the set of supported formats, and their syntax. +`encoding` +^^^^^^^^^^ +The optional `encoding` command indicates the encoding of the commit +message. Most commits are UTF-8 and the encoding is omitted, but this +allows importing commit messages into git without first reencoding them. + `from` ^^^^^^ The `from` command is used to specify the commit to initialize diff --git a/builtin/fast-export.c b/builtin/fast-export.c index 9e283482ef..c22cef3b2f 100644 --- a/builtin/fast-export.c +++ b/builtin/fast-export.c @@ -33,6 +33,7 @@ static const char *fast_export_usage[] = { static int progress; static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT; static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT; +static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT; static int fake_missing_tagger; static int use_done_feature; static int no_data; @@ -77,6 +78,31 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt, return 0; } +static int parse_opt_reencode_mode(const struct option *opt, + const char *arg, int unset) +{ + if (unset) { + reencode_mode = REENCODE_ABORT; + return 0; + } + + switch (git_parse_maybe_bool(arg)) { + case 0: + reencode_mode = REENCODE_NO; + break; + case 1: + reencode_mode = REENCODE_YES; + break; + default: + if (!strcasecmp(arg, "abort")) + reencode_mode = REENCODE_ABORT; + else + return error("Unknown reencoding mode: %s", arg); + } + + return 0; +} + static struct decoration idnums; static uint32_t last_idnum; @@ -453,7 +479,7 @@ static const char *find_encoding(const char *begin, const char *end) bol = memmem(begin, end ? end - begin : strlen(begin), needle, strlen(needle)); if (!bol) - return git_commit_encoding; + return NULL; bol += strlen(needle); eol = strchrnul(bol, '\n'); *eol = '\0'; @@ -633,18 +659,32 @@ static void handle_commit(struct commit *commit, struct rev_info *rev, } mark_next_object(&commit->object); - if (anonymize) + if (anonymize) { reencoded = anonymize_commit_message(message); - else if (!is_encoding_utf8(encoding)) - reencoded = reencode_string(message, "UTF-8", encoding); + } else if (encoding) { + switch(reencode_mode) { + case REENCODE_YES: + reencoded = reencode_string(message, "UTF-8", encoding); + break; + case REENCODE_NO: + break; + case REENCODE_ABORT: + die("Encountered commit-specific encoding %s in commit " + "%s; use --reencode=[yes|no] to handle it", + encoding, oid_to_hex(&commit->object.oid)); + } + } if (!commit->parents) printf("reset %s\n", refname); printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum); if (show_original_ids) printf("original-oid %s\n", oid_to_hex(&commit->object.oid)); - printf("%.*s\n%.*s\ndata %u\n%s", + printf("%.*s\n%.*s\n", (int)(author_end - author), author, - (int)(committer_end - committer), committer, + (int)(committer_end - committer), committer); + if (!reencoded && encoding) + printf("encoding %s\n", encoding); + printf("data %u\n%s", (unsigned)(reencoded ? strlen(reencoded) : message ? strlen(message) : 0), @@ -1088,6 +1128,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix) OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"), N_("select handling of tags that tag filtered objects"), parse_opt_tag_of_filtered_mode), + OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"), + N_("select handling of commit messages in an alternate encoding"), + parse_opt_reencode_mode), OPT_STRING(0, "export-marks", &export_filename, N_("file"), N_("Dump marks to this file")), OPT_STRING(0, "import-marks", &import_filename, N_("file"), diff --git a/fast-import.c b/fast-import.c index f38d04fa58..76a7bd3699 100644 --- a/fast-import.c +++ b/fast-import.c @@ -2585,6 +2585,7 @@ static void parse_new_commit(const char *arg) struct branch *b; char *author = NULL; char *committer = NULL; + const char *encoding = NULL; struct hash_list *merge_list = NULL; unsigned int merge_count; unsigned char prev_fanout, new_fanout; @@ -2607,6 +2608,8 @@ static void parse_new_commit(const char *arg) } if (!committer) die("Expected committer but didn't get one"); + if (skip_prefix(command_buf.buf, "encoding ", &encoding)) + read_next_command(); parse_data(&msg, 0, NULL); read_next_command(); parse_from(b); @@ -2670,9 +2673,13 @@ static void parse_new_commit(const char *arg) } strbuf_addf(&new_data, "author %s\n" - "committer %s\n" - "\n", + "committer %s\n", author ? author : committer, committer); + if (encoding) + strbuf_addf(&new_data, + "encoding %s\n", + encoding); + strbuf_addch(&new_data, '\n'); strbuf_addbuf(&new_data, &msg); free(author); free(committer); diff --git a/t/t9300-fast-import.sh b/t/t9300-fast-import.sh index 3668263c40..141b7fa35e 100755 --- a/t/t9300-fast-import.sh +++ b/t/t9300-fast-import.sh @@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import ' +### +### series X (other new features) +### + +test_expect_success 'X: handling encoding' ' + test_tick && + cat >input <<-INPUT_END && + commit refs/heads/encoding + committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE + encoding iso-8859-7 + data <<COMMIT + INPUT_END + + printf "Pi: \360\nCOMMIT\n" >>input && + + git fast-import <input && + git cat-file -p encoding | grep $(printf "\360") && + git log -1 --format=%B encoding | grep $(printf "\317\200") +' + test_done diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh index 5690fe2810..b4004e05c2 100755 --- a/t/t9350-fast-export.sh +++ b/t/t9350-fast-export.sh @@ -94,22 +94,83 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' ' test $MUSS = $(git rev-parse --verify refs/tags/muss) ' -test_expect_success 'iso-8859-1' ' +test_expect_success 'reencoding iso-8859-7' ' - git config i18n.commitencoding ISO8859-1 && - # use author and committer name in ISO-8859-1 to match it. - . "$TEST_DIRECTORY"/t3901/8859-1.txt && + test_when_finished "git reset --hard HEAD~1" && + test_config i18n.commitencoding iso-8859-7 && test_tick && echo rosten >file && - git commit -s -m den file && - git fast-export wer^..wer >iso8859-1.fi && - sed "s/wer/i18n/" iso8859-1.fi | + git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file && + git fast-export --reencode=yes wer^..wer >iso-8859-7.fi && + sed "s/wer/i18n/" iso-8859-7.fi | (cd new && git fast-import && + # The commit object, if not re-encoded, would be 240 bytes. + # Removing the "encoding iso-8859-7\n" header drops 20 bytes. + # Re-encoding the Pi character from \xF0 (\360) in iso-8859-7 + # to \xCF\x80 (\317\200) in UTF-8 adds a byte. Check for + # the expected size. + test 221 -eq "$(git cat-file -s i18n)" && + # ...and for the expected translation of bytes. git cat-file commit i18n >actual && - grep "Áéí óú" actual) + grep $(printf "\317\200") actual && + # Also make sure the commit does not have the "encoding" header + ! grep ^encoding actual) +' + +test_expect_success 'aborting on iso-8859-7' ' + test_when_finished "git reset --hard HEAD~1" && + test_config i18n.commitencoding iso-8859-7 && + echo rosten >file && + git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file && + test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi ' + +test_expect_success 'preserving iso-8859-7' ' + + test_when_finished "git reset --hard HEAD~1" && + test_config i18n.commitencoding iso-8859-7 && + echo rosten >file && + git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file && + git fast-export --reencode=no wer^..wer >iso-8859-7.fi && + sed "s/wer/i18n-no-recoding/" iso-8859-7.fi | + (cd new && + git fast-import && + # The commit object, if not re-encoded, is 240 bytes. + # Removing the "encoding iso-8859-7\n" header would drops 20 + # bytes. Re-encoding the Pi character from \xF0 (\360) in + # iso-8859-7 to \xCF\x80 (\317\200) in UTF-8 adds a byte. + # Check for the expected size... + test 240 -eq "$(git cat-file -s i18n-no-recoding)" && + # ...as well as the expected byte. + git cat-file commit i18n-no-recoding >actual && + grep $(printf "\360") actual && + # Also make sure the commit has the "encoding" header + grep ^encoding actual) +' + +test_expect_success 'encoding preserved if reencoding fails' ' + + test_when_finished "git reset --hard HEAD~1" && + test_config i18n.commitencoding iso-8859-7 && + echo rosten >file && + git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file && + git fast-export --reencode=yes wer^..wer >iso-8859-7.fi && + sed "s/wer/i18n-invalid/" iso-8859-7.fi | + (cd new && + git fast-import && + git cat-file commit i18n-invalid >actual && + # Make sure the commit still has the encoding header + grep ^encoding actual && + # Verify that the commit has the expected size; i.e. + # that no bytes were re-encoded to a different encoding. + test 252 -eq "$(git cat-file -s i18n-invalid)" && + # ...and check for the original special bytes + grep $(printf "\360") actual && + grep $(printf "\377") actual) +' + test_expect_success 'import/export-marks' ' git checkout -b marks master && @@ -224,7 +285,6 @@ GIT_COMMITTER_NAME='C O Mitter'; export GIT_COMMITTER_NAME test_expect_success 'setup copies' ' - git config --unset i18n.commitencoding && git checkout -b copy rein && git mv file file3 && git commit -m move1 && diff --git a/t/t9350/broken-iso-8859-7-commit-message.txt b/t/t9350/broken-iso-8859-7-commit-message.txt new file mode 100644 index 0000000000..d06ad75b44 --- /dev/null +++ b/t/t9350/broken-iso-8859-7-commit-message.txt @@ -0,0 +1 @@ +Pi: ; Invalid:
\ No newline at end of file diff --git a/t/t9350/simple-iso-8859-7-commit-message.txt b/t/t9350/simple-iso-8859-7-commit-message.txt new file mode 100644 index 0000000000..8b3f0c3dba --- /dev/null +++ b/t/t9350/simple-iso-8859-7-commit-message.txt @@ -0,0 +1 @@ +Pi:
\ No newline at end of file |