diff options
Diffstat (limited to 'vendor/github.com/buger/jsonparser/escape.go')
-rw-r--r-- | vendor/github.com/buger/jsonparser/escape.go | 173 |
1 files changed, 173 insertions, 0 deletions
diff --git a/vendor/github.com/buger/jsonparser/escape.go b/vendor/github.com/buger/jsonparser/escape.go new file mode 100644 index 000000000..49669b942 --- /dev/null +++ b/vendor/github.com/buger/jsonparser/escape.go @@ -0,0 +1,173 @@ +package jsonparser + +import ( + "bytes" + "unicode/utf8" +) + +// JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7 + +const supplementalPlanesOffset = 0x10000 +const highSurrogateOffset = 0xD800 +const lowSurrogateOffset = 0xDC00 + +const basicMultilingualPlaneReservedOffset = 0xDFFF +const basicMultilingualPlaneOffset = 0xFFFF + +func combineUTF16Surrogates(high, low rune) rune { + return supplementalPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset) +} + +const badHex = -1 + +func h2I(c byte) int { + switch { + case c >= '0' && c <= '9': + return int(c - '0') + case c >= 'A' && c <= 'F': + return int(c - 'A' + 10) + case c >= 'a' && c <= 'f': + return int(c - 'a' + 10) + } + return badHex +} + +// decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. The prefix \u is assumed to be present and +// is not checked. +// In JSON, these escapes can either come alone or as part of "UTF16 surrogate pairs" that must be handled together. +// This function only handles one; decodeUnicodeEscape handles this more complex case. +func decodeSingleUnicodeEscape(in []byte) (rune, bool) { + // We need at least 6 characters total + if len(in) < 6 { + return utf8.RuneError, false + } + + // Convert hex to decimal + h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5]) + if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex { + return utf8.RuneError, false + } + + // Compose the hex digits + return rune(h1<<12 + h2<<8 + h3<<4 + h4), true +} + +// isUTF16EncodedRune checks if a rune is in the range for non-BMP characters, +// which is used to describe UTF16 chars. +// Source: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane +func isUTF16EncodedRune(r rune) bool { + return highSurrogateOffset <= r && r <= basicMultilingualPlaneReservedOffset +} + +func decodeUnicodeEscape(in []byte) (rune, int) { + if r, ok := decodeSingleUnicodeEscape(in); !ok { + // Invalid Unicode escape + return utf8.RuneError, -1 + } else if r <= basicMultilingualPlaneOffset && !isUTF16EncodedRune(r) { + // Valid Unicode escape in Basic Multilingual Plane + return r, 6 + } else if r2, ok := decodeSingleUnicodeEscape(in[6:]); !ok { // Note: previous decodeSingleUnicodeEscape success guarantees at least 6 bytes remain + // UTF16 "high surrogate" without manditory valid following Unicode escape for the "low surrogate" + return utf8.RuneError, -1 + } else if r2 < lowSurrogateOffset { + // Invalid UTF16 "low surrogate" + return utf8.RuneError, -1 + } else { + // Valid UTF16 surrogate pair + return combineUTF16Surrogates(r, r2), 12 + } +} + +// backslashCharEscapeTable: when '\X' is found for some byte X, it is to be replaced with backslashCharEscapeTable[X] +var backslashCharEscapeTable = [...]byte{ + '"': '"', + '\\': '\\', + '/': '/', + 'b': '\b', + 'f': '\f', + 'n': '\n', + 'r': '\r', + 't': '\t', +} + +// unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns +// how many characters were consumed from 'in' and emitted into 'out'. +// If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) to signal the error. +func unescapeToUTF8(in, out []byte) (inLen int, outLen int) { + if len(in) < 2 || in[0] != '\\' { + // Invalid escape due to insufficient characters for any escape or no initial backslash + return -1, -1 + } + + // https://tools.ietf.org/html/rfc7159#section-7 + switch e := in[1]; e { + case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': + // Valid basic 2-character escapes (use lookup table) + out[0] = backslashCharEscapeTable[e] + return 2, 1 + case 'u': + // Unicode escape + if r, inLen := decodeUnicodeEscape(in); inLen == -1 { + // Invalid Unicode escape + return -1, -1 + } else { + // Valid Unicode escape; re-encode as UTF8 + outLen := utf8.EncodeRune(out, r) + return inLen, outLen + } + } + + return -1, -1 +} + +// unescape unescapes the string contained in 'in' and returns it as a slice. +// If 'in' contains no escaped characters: +// Returns 'in'. +// Else, if 'out' is of sufficient capacity (guaranteed if cap(out) >= len(in)): +// 'out' is used to build the unescaped string and is returned with no extra allocation +// Else: +// A new slice is allocated and returned. +func Unescape(in, out []byte) ([]byte, error) { + firstBackslash := bytes.IndexByte(in, '\\') + if firstBackslash == -1 { + return in, nil + } + + // Get a buffer of sufficient size (allocate if needed) + if cap(out) < len(in) { + out = make([]byte, len(in)) + } else { + out = out[0:len(in)] + } + + // Copy the first sequence of unescaped bytes to the output and obtain a buffer pointer (subslice) + copy(out, in[:firstBackslash]) + in = in[firstBackslash:] + buf := out[firstBackslash:] + + for len(in) > 0 { + // Unescape the next escaped character + inLen, bufLen := unescapeToUTF8(in, buf) + if inLen == -1 { + return nil, MalformedStringEscapeError + } + + in = in[inLen:] + buf = buf[bufLen:] + + // Copy everything up until the next backslash + nextBackslash := bytes.IndexByte(in, '\\') + if nextBackslash == -1 { + copy(buf, in) + buf = buf[len(in):] + break + } else { + copy(buf, in[:nextBackslash]) + buf = buf[nextBackslash:] + in = in[nextBackslash:] + } + } + + // Trim the out buffer to the amount that was actually emitted + return out[:len(out)-len(buf)], nil +} |