diff options
Diffstat (limited to 'utf8.c')
-rw-r--r-- | utf8.c | 140 |
1 files changed, 129 insertions, 11 deletions
@@ -489,6 +489,28 @@ char *reencode_string_iconv(const char *in, size_t insz, iconv_t conv, int *outs return out; } +static const char *fallback_encoding(const char *name) +{ + /* + * Some platforms do not have the variously spelled variants of + * UTF-8, so let's fall back to trying the most official + * spelling. We do so only as a fallback in case the platform + * does understand the user's spelling, but not our official + * one. + */ + if (is_encoding_utf8(name)) + return "UTF-8"; + + /* + * Even though latin-1 is still seen in e-mail + * headers, some platforms only install ISO-8859-1. + */ + if (!strcasecmp(name, "latin-1")) + return "ISO-8859-1"; + + return name; +} + char *reencode_string_len(const char *in, int insz, const char *out_encoding, const char *in_encoding, int *outsz) @@ -501,17 +523,9 @@ char *reencode_string_len(const char *in, int insz, conv = iconv_open(out_encoding, in_encoding); if (conv == (iconv_t) -1) { - /* - * Some platforms do not have the variously spelled variants of - * UTF-8, so let's fall back to trying the most official - * spelling. We do so only as a fallback in case the platform - * does understand the user's spelling, but not our official - * one. - */ - if (is_encoding_utf8(in_encoding)) - in_encoding = "UTF-8"; - if (is_encoding_utf8(out_encoding)) - out_encoding = "UTF-8"; + in_encoding = fallback_encoding(in_encoding); + out_encoding = fallback_encoding(out_encoding); + conv = iconv_open(out_encoding, in_encoding); if (conv == (iconv_t) -1) return NULL; @@ -561,3 +575,107 @@ int mbs_chrlen(const char **text, size_t *remainder_p, const char *encoding) return chrlen; } + +/* + * Pick the next char from the stream, ignoring codepoints an HFS+ would. + * Note that this is _not_ complete by any means. It's just enough + * to make is_hfs_dotgit() work, and should not be used otherwise. + */ +static ucs_char_t next_hfs_char(const char **in) +{ + while (1) { + ucs_char_t out = pick_one_utf8_char(in, NULL); + /* + * check for malformed utf8. Technically this + * gets converted to a percent-sequence, but + * returning 0 is good enough for is_hfs_dotgit + * to realize it cannot be .git + */ + if (!*in) + return 0; + + /* these code points are ignored completely */ + switch (out) { + case 0x200c: /* ZERO WIDTH NON-JOINER */ + case 0x200d: /* ZERO WIDTH JOINER */ + case 0x200e: /* LEFT-TO-RIGHT MARK */ + case 0x200f: /* RIGHT-TO-LEFT MARK */ + case 0x202a: /* LEFT-TO-RIGHT EMBEDDING */ + case 0x202b: /* RIGHT-TO-LEFT EMBEDDING */ + case 0x202c: /* POP DIRECTIONAL FORMATTING */ + case 0x202d: /* LEFT-TO-RIGHT OVERRIDE */ + case 0x202e: /* RIGHT-TO-LEFT OVERRIDE */ + case 0x206a: /* INHIBIT SYMMETRIC SWAPPING */ + case 0x206b: /* ACTIVATE SYMMETRIC SWAPPING */ + case 0x206c: /* INHIBIT ARABIC FORM SHAPING */ + case 0x206d: /* ACTIVATE ARABIC FORM SHAPING */ + case 0x206e: /* NATIONAL DIGIT SHAPES */ + case 0x206f: /* NOMINAL DIGIT SHAPES */ + case 0xfeff: /* ZERO WIDTH NO-BREAK SPACE */ + continue; + } + + return out; + } +} + +int is_hfs_dotgit(const char *path) +{ + ucs_char_t c; + + c = next_hfs_char(&path); + if (c != '.') + return 0; + c = next_hfs_char(&path); + + /* + * there's a great deal of other case-folding that occurs + * in HFS+, but this is enough to catch anything that will + * convert to ".git" + */ + if (c != 'g' && c != 'G') + return 0; + c = next_hfs_char(&path); + if (c != 'i' && c != 'I') + return 0; + c = next_hfs_char(&path); + if (c != 't' && c != 'T') + return 0; + c = next_hfs_char(&path); + if (c && !is_dir_sep(c)) + return 0; + + return 1; +} + +const char utf8_bom[] = "\357\273\277"; + +int skip_utf8_bom(char **text, size_t len) +{ + if (len < strlen(utf8_bom) || + memcmp(*text, utf8_bom, strlen(utf8_bom))) + return 0; + *text += strlen(utf8_bom); + return 1; +} + +void strbuf_utf8_align(struct strbuf *buf, align_type position, unsigned int width, + const char *s) +{ + int slen = strlen(s); + int display_len = utf8_strnwidth(s, slen, 0); + int utf8_compensation = slen - display_len; + + if (display_len >= width) { + strbuf_addstr(buf, s); + return; + } + + if (position == ALIGN_LEFT) + strbuf_addf(buf, "%-*s", width + utf8_compensation, s); + else if (position == ALIGN_MIDDLE) { + int left = (width - display_len) / 2; + strbuf_addf(buf, "%*s%-*s", left, "", width - left + utf8_compensation, s); + } else if (position == ALIGN_RIGHT) + strbuf_addf(buf, "%*s", width + utf8_compensation, s); +} |