summaryrefslogtreecommitdiff
path: root/vendor/github.com/buger/jsonparser/escape.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/buger/jsonparser/escape.go')
-rw-r--r--vendor/github.com/buger/jsonparser/escape.go173
1 files changed, 173 insertions, 0 deletions
diff --git a/vendor/github.com/buger/jsonparser/escape.go b/vendor/github.com/buger/jsonparser/escape.go
new file mode 100644
index 000000000..49669b942
--- /dev/null
+++ b/vendor/github.com/buger/jsonparser/escape.go
@@ -0,0 +1,173 @@
+package jsonparser
+
+import (
+ "bytes"
+ "unicode/utf8"
+)
+
+// JSON Unicode stuff: see https://tools.ietf.org/html/rfc7159#section-7
+
// Code-point boundaries used while decoding JSON \uXXXX escapes and UTF-16 surrogate pairs.
const (
	// supplementalPlanesOffset is the first code point above the Basic Multilingual Plane.
	supplementalPlanesOffset = 0x10000
	// highSurrogateOffset is the lowest UTF-16 high (leading) surrogate value.
	highSurrogateOffset = 0xD800
	// lowSurrogateOffset is the lowest UTF-16 low (trailing) surrogate value.
	lowSurrogateOffset = 0xDC00

	// basicMultilingualPlaneReservedOffset is the highest value of the surrogate range.
	basicMultilingualPlaneReservedOffset = 0xDFFF
	// basicMultilingualPlaneOffset is the highest code point of the Basic Multilingual Plane.
	basicMultilingualPlaneOffset = 0xFFFF
)
+
+func combineUTF16Surrogates(high, low rune) rune {
+ return supplementalPlanesOffset + (high-highSurrogateOffset)<<10 + (low - lowSurrogateOffset)
+}
+
// badHex is the sentinel h2I returns for a byte that is not a hexadecimal digit.
const badHex = -1

// h2I converts a single ASCII hexadecimal digit ('0'-'9', 'A'-'F', 'a'-'f')
// to its numeric value in the range 0..15. Any other byte yields badHex.
func h2I(c byte) int {
	if '0' <= c && c <= '9' {
		return int(c - '0')
	}
	if 'A' <= c && c <= 'F' {
		return int(c-'A') + 10
	}
	if 'a' <= c && c <= 'f' {
		return int(c-'a') + 10
	}
	return badHex
}
+
+// decodeSingleUnicodeEscape decodes a single \uXXXX escape sequence. The prefix \u is assumed to be present and
+// is not checked.
+// In JSON, these escapes can either come alone or as part of "UTF16 surrogate pairs" that must be handled together.
+// This function only handles one; decodeUnicodeEscape handles this more complex case.
+func decodeSingleUnicodeEscape(in []byte) (rune, bool) {
+ // We need at least 6 characters total
+ if len(in) < 6 {
+ return utf8.RuneError, false
+ }
+
+ // Convert hex to decimal
+ h1, h2, h3, h4 := h2I(in[2]), h2I(in[3]), h2I(in[4]), h2I(in[5])
+ if h1 == badHex || h2 == badHex || h3 == badHex || h4 == badHex {
+ return utf8.RuneError, false
+ }
+
+ // Compose the hex digits
+ return rune(h1<<12 + h2<<8 + h3<<4 + h4), true
+}
+
+// isUTF16EncodedRune checks if a rune is in the range for non-BMP characters,
+// which is used to describe UTF16 chars.
+// Source: https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane
+func isUTF16EncodedRune(r rune) bool {
+ return highSurrogateOffset <= r && r <= basicMultilingualPlaneReservedOffset
+}
+
+func decodeUnicodeEscape(in []byte) (rune, int) {
+ if r, ok := decodeSingleUnicodeEscape(in); !ok {
+ // Invalid Unicode escape
+ return utf8.RuneError, -1
+ } else if r <= basicMultilingualPlaneOffset && !isUTF16EncodedRune(r) {
+ // Valid Unicode escape in Basic Multilingual Plane
+ return r, 6
+ } else if r2, ok := decodeSingleUnicodeEscape(in[6:]); !ok { // Note: previous decodeSingleUnicodeEscape success guarantees at least 6 bytes remain
+ // UTF16 "high surrogate" without manditory valid following Unicode escape for the "low surrogate"
+ return utf8.RuneError, -1
+ } else if r2 < lowSurrogateOffset {
+ // Invalid UTF16 "low surrogate"
+ return utf8.RuneError, -1
+ } else {
+ // Valid UTF16 surrogate pair
+ return combineUTF16Surrogates(r, r2), 12
+ }
+}
+
// backslashCharEscapeTable maps the byte that follows a backslash in a
// two-character JSON escape to the byte that escape stands for, so that
// '\X' is replaced by backslashCharEscapeTable[X]. Indexes that are not
// listed hold the zero byte; callers only index it with the listed bytes.
var backslashCharEscapeTable = [...]byte{
	// Escapes that produce a control character.
	't': '\t',
	'r': '\r',
	'n': '\n',
	'f': '\f',
	'b': '\b',

	// Escapes that produce the escaped byte itself.
	'/':  '/',
	'\\': '\\',
	'"':  '"',
}
+
+// unescapeToUTF8 unescapes the single escape sequence starting at 'in' into 'out' and returns
+// how many characters were consumed from 'in' and emitted into 'out'.
+// If a valid escape sequence does not appear as a prefix of 'in', (-1, -1) to signal the error.
+func unescapeToUTF8(in, out []byte) (inLen int, outLen int) {
+ if len(in) < 2 || in[0] != '\\' {
+ // Invalid escape due to insufficient characters for any escape or no initial backslash
+ return -1, -1
+ }
+
+ // https://tools.ietf.org/html/rfc7159#section-7
+ switch e := in[1]; e {
+ case '"', '\\', '/', 'b', 'f', 'n', 'r', 't':
+ // Valid basic 2-character escapes (use lookup table)
+ out[0] = backslashCharEscapeTable[e]
+ return 2, 1
+ case 'u':
+ // Unicode escape
+ if r, inLen := decodeUnicodeEscape(in); inLen == -1 {
+ // Invalid Unicode escape
+ return -1, -1
+ } else {
+ // Valid Unicode escape; re-encode as UTF8
+ outLen := utf8.EncodeRune(out, r)
+ return inLen, outLen
+ }
+ }
+
+ return -1, -1
+}
+
+// unescape unescapes the string contained in 'in' and returns it as a slice.
+// If 'in' contains no escaped characters:
+// Returns 'in'.
+// Else, if 'out' is of sufficient capacity (guaranteed if cap(out) >= len(in)):
+// 'out' is used to build the unescaped string and is returned with no extra allocation
+// Else:
+// A new slice is allocated and returned.
+func Unescape(in, out []byte) ([]byte, error) {
+ firstBackslash := bytes.IndexByte(in, '\\')
+ if firstBackslash == -1 {
+ return in, nil
+ }
+
+ // Get a buffer of sufficient size (allocate if needed)
+ if cap(out) < len(in) {
+ out = make([]byte, len(in))
+ } else {
+ out = out[0:len(in)]
+ }
+
+ // Copy the first sequence of unescaped bytes to the output and obtain a buffer pointer (subslice)
+ copy(out, in[:firstBackslash])
+ in = in[firstBackslash:]
+ buf := out[firstBackslash:]
+
+ for len(in) > 0 {
+ // Unescape the next escaped character
+ inLen, bufLen := unescapeToUTF8(in, buf)
+ if inLen == -1 {
+ return nil, MalformedStringEscapeError
+ }
+
+ in = in[inLen:]
+ buf = buf[bufLen:]
+
+ // Copy everything up until the next backslash
+ nextBackslash := bytes.IndexByte(in, '\\')
+ if nextBackslash == -1 {
+ copy(buf, in)
+ buf = buf[len(in):]
+ break
+ } else {
+ copy(buf, in[:nextBackslash])
+ buf = buf[nextBackslash:]
+ in = in[nextBackslash:]
+ }
+ }
+
+ // Trim the out buffer to the amount that was actually emitted
+ return out[:len(out)-len(buf)], nil
+}