diff options
author | Junio C Hamano <gitster@pobox.com> | 2018-05-08 15:59:22 +0900 |
---|---|---|
committer | Junio C Hamano <gitster@pobox.com> | 2018-05-08 15:59:22 +0900 |
commit | 1ac0ce4d32ab7a3546e7e84a562625576208c7db (patch) | |
tree | 23ebffb680b2fe7b69453bb55d8170f69d767a69 /convert.c | |
parent | Merge branch 'ab/nuke-emacs-contrib' (diff) | |
parent | convert: add round trip check based on 'core.checkRoundtripEncoding' (diff) | |
download | tgif-1ac0ce4d32ab7a3546e7e84a562625576208c7db.tar.xz |
Merge branch 'ls/checkout-encoding'
The new "working-tree-encoding" attribute (developed on the
"ls/checkout-encoding" topic branch) can ask Git to convert the
contents to the specified encoding when checking out to the working
tree (and the other way around when checking in).
* ls/checkout-encoding:
convert: add round trip check based on 'core.checkRoundtripEncoding'
convert: add tracing for 'working-tree-encoding' attribute
convert: check for detectable errors in UTF encodings
convert: add 'working-tree-encoding' attribute
utf8: add function to detect a missing UTF-16/32 BOM
utf8: add function to detect prohibited UTF-16/32 BOM
utf8: teach same_encoding() alternative UTF encoding names
strbuf: add a case insensitive starts_with()
strbuf: add xstrdup_toupper()
strbuf: remove unnecessary NUL assignment in xstrdup_tolower()
Diffstat (limited to 'convert.c')
-rw-r--r-- | convert.c | 330 |
1 files changed, 329 insertions, 1 deletions
@@ -7,6 +7,7 @@
 #include "sigchain.h"
 #include "pkt-line.h"
 #include "sub-process.h"
+#include "utf8.h"
 
 /*
  * convert.c - convert a file when checking it out and checking it in.
@@ -265,6 +266,290 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
 
 }
 
+/*
+ * Check content that is claimed to be encoded as `enc` for detectable
+ * errors. Only encodings whose name starts with "UTF" are examined, and
+ * only for a BOM that has_prohibited_utf_bom() rejects or one that
+ * is_missing_required_utf_bom() reports as absent.
+ *
+ * When a problem is found, advice is printed and then either die() is
+ * called (die_on_error is set) or the value of error() is returned.
+ * Returns 0 when nothing was detected.
+ */
+static int validate_encoding(const char *path, const char *enc,
+		      const char *data, size_t len, int die_on_error)
+{
+	/* We only check for UTF here as UTF?? can be an alias for UTF-?? */
+	if (istarts_with(enc, "UTF")) {
+		/*
+		 * Check for detectable errors in UTF encodings
+		 */
+		if (has_prohibited_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is prohibited in '%s' if encoded as %s");
+			/*
+			 * This advice is shown for UTF-??BE and UTF-??LE encodings.
+			 * We cut off the last two characters of the encoding name
+			 * to generate the encoding name suitable for BOMs.
+			 */
+			const char *advise_msg = _(
+				"The file '%s' contains a byte order "
+				"mark (BOM). Please use UTF-%s as "
+				"working-tree-encoding.");
+			char *upper = xstrdup_toupper(enc);
+			const char *stripped = upper;
+			upper[strlen(upper)-2] = '\0';
+			/*
+			 * Drop "UTF" and then an optional "-". Starting
+			 * from `upper` keeps `stripped` valid even when a
+			 * prefix does not match (e.g. dashless "UTF16").
+			 */
+			if (skip_prefix(stripped, "UTF", &stripped))
+				skip_prefix(stripped, "-", &stripped);
+			advise(advise_msg, path, stripped);
+			free(upper);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+
+		} else if (is_missing_required_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is required in '%s' if encoded as %s");
+			const char *advise_msg = _(
+				"The file '%s' is missing a byte order "
+				"mark (BOM). Please use UTF-%sBE or UTF-%sLE "
+				"(depending on the byte order) as "
+				"working-tree-encoding.");
+			char *upper = xstrdup_toupper(enc);
+			const char *stripped = upper;
+			/* Same prefix handling as in the branch above. */
+			if (skip_prefix(stripped, "UTF", &stripped))
+				skip_prefix(stripped, "-", &stripped);
+			advise(advise_msg, path, stripped, stripped);
+			free(upper);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+		}
+
+	}
+	return 0;
+}
+
+/*
+ * Send a dump of `buf` (index, hex value and printable character for
+ * each byte, eight bytes per row) to the trace key initialized from
+ * WORKING_TREE_ENCODING. `context`, `path` and `encoding` only label
+ * the dump. No bytes are dumped for a NULL `buf`.
+ */
+static void trace_encoding(const char *context, const char *path,
+			   const char *encoding, const char *buf, size_t len)
+{
+	static struct trace_key coe = TRACE_KEY_INIT(WORKING_TREE_ENCODING);
+	struct strbuf trace = STRBUF_INIT;
+	int i;
+
+	strbuf_addf(&trace, "%s (%s, considered %s):\n", context, path, encoding);
+	for (i = 0; i < len && buf; ++i) {
+		strbuf_addf(
+			&trace,"| \e[2m%2i:\e[0m %2x \e[2m%c\e[0m%c",
+			i,
+			(unsigned char) buf[i],
+			(buf[i] > 32 && buf[i] < 127 ? buf[i] : ' '),
+			((i+1) % 8 && (i+1) < len ? ' ' : '\n')
+		);
+	}
+	strbuf_addchars(&trace, '\n', 1);
+
+	trace_strbuf(&coe, &trace);
+	strbuf_release(&trace);
+}
+
+/*
+ * Tell whether `enc_name` is one of the comma and/or space separated
+ * encodings in check_roundtrip_encoding (eg. "UTF-16, ASCII, CP1125").
+ * Names are compared case-insensitively. Returns 1 if listed, else 0.
+ */
+static int check_roundtrip(const char *enc_name)
+{
+	const char *list = check_roundtrip_encoding;
+	const char *end = list + strlen(list);
+	size_t name_len = strlen(enc_name);
+	const char *found;
+
+	/* An empty name matches at every position; it is never listed. */
+	if (!name_len)
+		return 0;
+
+	/*
+	 * Look at every occurrence, not just the first: in a list such
+	 * as "UTF-16LE, UTF-16" the first hit for "UTF-16" is part of
+	 * "UTF-16LE" and must not hide the real entry that follows.
+	 */
+	for (found = strcasestr(list, enc_name); found;
+	     found = strcasestr(found + 1, enc_name)) {
+		const char *next = found + name_len;
+
+		/*
+		 * check that the found encoding is at the
+		 * beginning of check_roundtrip_encoding or
+		 * that it is prefixed with a space or comma
+		 */
+		if (found != list &&
+		    !isspace(found[-1]) && found[-1] != ',')
+			continue;
+
+		/*
+		 * check that the found encoding is at the
+		 * end of check_roundtrip_encoding or
+		 * that it is suffixed with a space or comma
+		 */
+		if (next == end || isspace(next[0]) || next[0] == ',')
+			return 1;
+	}
+	return 0;
+}
+
+static const char *default_encoding = "UTF-8";
+
+/*
+ * Convert `src` from the working tree encoding `enc` to default_encoding
+ * for storing in Git and hand the result to `buf`.
+ *
+ * Returns 1 if the content was (or, when called with neither `buf` nor
+ * `src`, would be) converted, and 0 if it was left alone: no encoding
+ * given, empty content, failed validation or a failed conversion.
+ * With CONV_WRITE_OBJECT set in conv_flags, failures die() instead.
+ */
+static int encode_to_git(const char *path, const char *src, size_t src_len,
+			 struct strbuf *buf, const char *enc, int conv_flags)
+{
+	char *dst;
+	int dst_len;
+	int die_on_error = conv_flags & CONV_WRITE_OBJECT;
+
+	/*
+	 * No encoding is specified or there is nothing to encode.
+	 * Tell the caller that the content was not modified.
+	 */
+	if (!enc || (src && !src_len))
+		return 0;
+
+	/*
+	 * Looks like we got called from "would_convert_to_git()".
+	 * This means Git wants to know if it would encode (= modify!)
+	 * the content. Let's answer with "yes", since an encoding was
+	 * specified.
+	 */
+	if (!buf && !src)
+		return 1;
+
+	if (validate_encoding(path, enc, src, src_len, die_on_error))
+		return 0;
+
+	trace_encoding("source", path, enc, src, src_len);
+	dst = reencode_string_len(src, src_len, default_encoding, enc,
+				  &dst_len);
+	if (!dst) {
+		/*
+		 * We could add the blob "as-is" to Git. However, on checkout
+		 * we would try to reencode to the original encoding. This
+		 * would fail and we would leave the user with a messed-up
+		 * working tree. Let's try to avoid this by screaming loud.
+		 */
+		const char *msg = _("failed to encode '%s' from %s to %s");
+		if (die_on_error)
+			die(msg, path, enc, default_encoding);
+		else {
+			error(msg, path, enc, default_encoding);
+			return 0;
+		}
+	}
+	trace_encoding("destination", path, default_encoding, dst, dst_len);
+
+	/*
+	 * UTF supports lossless conversion round tripping [1] and conversions
+	 * between UTF and other encodings are mostly round trip safe as
+	 * Unicode aims to be a superset of all other character encodings.
+	 * However, certain encodings (e.g. SHIFT-JIS) are known to have round
+	 * trip issues [2]. Check the round trip conversion for all encodings
+	 * listed in core.checkRoundtripEncoding.
+	 *
+	 * The round trip check is only performed if content is written to Git.
+	 * This ensures that no information is lost during conversion to/from
+	 * the internal UTF-8 representation.
+	 *
+	 * Please note, the code below is not tested because I was not able to
+	 * generate a faulty round trip without an iconv error. Iconv errors
+	 * are already caught above.
+	 *
+	 * [1] http://unicode.org/faq/utf_bom.html#gen2
+	 * [2] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode
+	 */
+	if (die_on_error && check_roundtrip(enc)) {
+		char *re_src;
+		/* defined even if reencoding fails without setting it */
+		int re_src_len = 0;
+
+		re_src = reencode_string_len(dst, dst_len,
+					     enc, default_encoding,
+					     &re_src_len);
+
+		trace_printf("Checking roundtrip encoding for %s...\n", enc);
+		trace_encoding("reencoded source", path, enc,
+			       re_src, re_src_len);
+
+		if (!re_src || src_len != re_src_len ||
+		    memcmp(src, re_src, src_len)) {
+			const char *msg = _("encoding '%s' from %s to %s and "
+					    "back is not the same");
+			die(msg, path, enc, default_encoding);
+		}
+
+		free(re_src);
+	}
+
+	strbuf_attach(buf, dst, dst_len, dst_len + 1);
+	return 1;
+}
+
+/*
+ * Convert `src` from default_encoding to the working tree encoding
+ * `enc` and hand the result to `buf`.
+ *
+ * Returns 1 if the content was converted, and 0 if it was left alone:
+ * no encoding given, empty content or a failed conversion (which is
+ * reported with error()).
+ */
+static int encode_to_worktree(const char *path, const char *src, size_t src_len,
+			      struct strbuf *buf, const char *enc)
+{
+	char *dst;
+	int dst_len;
+
+	/*
+	 * No encoding is specified or there is nothing to encode.
+	 * Tell the caller that the content was not modified.
+	 */
+	if (!enc || (src && !src_len))
+		return 0;
+
+	dst = reencode_string_len(src, src_len, enc, default_encoding,
+				  &dst_len);
+	if (!dst) {
+		error(_("failed to encode '%s' from %s to %s"),
+			path, default_encoding, enc);
+		return 0;
+	}
+
+	strbuf_attach(buf, dst, dst_len, dst_len + 1);
+	return 1;
+}
+
 static int crlf_to_git(const struct index_state *istate,
 		       const char *path, const char *src, size_t len,
 		       struct strbuf *buf,
@@ -978,6 +1263,29 @@ static int ident_to_worktree(const char *path, const char *src, size_t len,
 	return 1;
 }
 
+/*
+ * Return the value of the "working-tree-encoding" attribute, or NULL
+ * if no conversion is needed: the attribute is unset or empty, or it
+ * names the default encoding. Dies if it is set as a plain true/false.
+ */
+static const char *git_path_check_encoding(struct attr_check_item *check)
+{
+	const char *value = check->value;
+
+	if (ATTR_UNSET(value) || !strlen(value))
+		return NULL;
+
+	if (ATTR_TRUE(value) || ATTR_FALSE(value)) {
+		die(_("true/false are no valid working-tree-encodings"));
+	}
+
+	/* Don't encode to the default encoding */
+	if (same_encoding(value, default_encoding))
+		return NULL;
+
+	return value;
+}
+
 static enum crlf_action git_path_check_crlf(struct attr_check_item *check)
 {
 	const char *value = check->value;
@@ -1033,6 +1341,7 @@ struct conv_attrs {
 	enum crlf_action attr_action; /* What attr says */
 	enum crlf_action crlf_action; /* When no attr is set, use core.autocrlf */
 	int ident;
+	const char *working_tree_encoding; /* Supported encoding or default encoding if NULL */
 };
 
 static void convert_attrs(struct conv_attrs *ca, const char *path)
@@ -1041,7 +1350,8 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
 
 	if (!check) {
 		check = attr_check_initl("crlf", "ident", "filter",
-					 "eol", "text", NULL);
+					 "eol", "text", "working-tree-encoding",
+					 NULL);
 		user_convert_tail = &user_convert;
 		git_config(read_convert_config, NULL);
 	}
@@ -1064,6 +1374,7 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
 			else if (eol_attr == EOL_CRLF)
 				ca->crlf_action = CRLF_TEXT_CRLF;
 		}
+		ca->working_tree_encoding = git_path_check_encoding(ccheck + 5);
 	} else {
 		ca->drv = NULL;
 		ca->crlf_action = CRLF_UNDEFINED;
@@ -1144,6 +1455,13 @@ int convert_to_git(const struct index_state *istate,
 		src = dst->buf;
 		len = dst->len;
 	}
+
+	ret |= encode_to_git(path, src, len, dst, ca.working_tree_encoding, conv_flags);
+	if (ret && dst) {
+		src = dst->buf;
+		len = dst->len;
+	}
+
 	if (!(conv_flags & CONV_EOL_KEEP_CRLF)) {
 		ret |= crlf_to_git(istate, path, src, len, dst, ca.crlf_action, conv_flags);
 		if (ret && dst) {
@@ -1167,6 +1485,7 @@ void convert_to_git_filter_fd(const struct index_state *istate,
 	if (!apply_filter(path, NULL, 0, fd, dst, ca.drv, CAP_CLEAN, NULL))
 		die("%s: clean filter '%s' failed", path, ca.drv->name);
 
+	encode_to_git(path, dst->buf, dst->len, dst, ca.working_tree_encoding, conv_flags);
 	crlf_to_git(istate, path, dst->buf, dst->len, dst, ca.crlf_action, conv_flags);
 	ident_to_git(path, dst->buf, dst->len, dst, ca.ident);
 }
@@ -1198,6 +1517,12 @@ static int convert_to_working_tree_internal(const char *path, const char *src,
 		}
 	}
 
+	ret |= encode_to_worktree(path, src, len, dst, ca.working_tree_encoding);
+	if (ret) {
+		src = dst->buf;
+		len = dst->len;
+	}
+
 	ret_filter = apply_filter(
 		path, src, len, -1, dst, ca.drv, CAP_SMUDGE, dco);
 	if (!ret_filter && ca.drv && ca.drv->required)
@@ -1664,6 +1989,9 @@ struct stream_filter *get_stream_filter(const char *path, const struct object_id
 	if (ca.drv && (ca.drv->process || ca.drv->smudge || ca.drv->clean))
 		return NULL;
 
+	if (ca.working_tree_encoding)
+		return NULL;
+
 	if (ca.crlf_action == CRLF_AUTO || ca.crlf_action == CRLF_AUTO_CRLF)
 		return NULL;
 