Diffstat (limited to 'vendor/github.com/tdewolff/parse')
24 files changed, 2843 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/v2/.gitattributes b/vendor/github.com/tdewolff/parse/v2/.gitattributes new file mode 100644 index 000000000..9f4b74c09 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/.gitattributes @@ -0,0 +1 @@ +tests/*/corpus/* linguist-generated diff --git a/vendor/github.com/tdewolff/parse/v2/.gitignore b/vendor/github.com/tdewolff/parse/v2/.gitignore new file mode 100644 index 000000000..6144b690b --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/.gitignore @@ -0,0 +1,5 @@ +tests/*/fuzz-fuzz.zip +tests/*/crashers +tests/*/suppressions +tests/*/corpus/* +!tests/*/corpus/*.* diff --git a/vendor/github.com/tdewolff/parse/v2/.golangci.yml b/vendor/github.com/tdewolff/parse/v2/.golangci.yml new file mode 100644 index 000000000..7009f9201 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/.golangci.yml @@ -0,0 +1,16 @@ +linters: + enable: + - depguard + - dogsled + - gofmt + - goimports + - golint + - gosec + - govet + - megacheck + - misspell + - nakedret + - prealloc + - unconvert + - unparam + - wastedassign diff --git a/vendor/github.com/tdewolff/parse/v2/LICENSE.md b/vendor/github.com/tdewolff/parse/v2/LICENSE.md new file mode 100644 index 000000000..41677de41 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/LICENSE.md @@ -0,0 +1,22 @@ +Copyright (c) 2015 Taco de Wolff + + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following + conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file diff --git a/vendor/github.com/tdewolff/parse/v2/README.md b/vendor/github.com/tdewolff/parse/v2/README.md new file mode 100644 index 000000000..837c281ad --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/README.md @@ -0,0 +1,64 @@ +# Parse [](https://pkg.go.dev/github.com/tdewolff/parse/v2?tab=doc) [](https://goreportcard.com/report/github.com/tdewolff/parse) [](https://coveralls.io/github/tdewolff/parse?branch=master) [](https://www.patreon.com/tdewolff) + +This package contains several lexers and parsers written in [Go][1]. All subpackages are built to be streaming and high performance, and to be in accordance with the official (latest) specifications. + +The lexers are implemented using `buffer.Lexer` in https://github.com/tdewolff/parse/buffer and the parsers work on top of the lexers. Some subpackages have hashes defined (using [Hasher](https://github.com/tdewolff/hasher)) that speed up common byte-slice comparisons. + +## Buffer +### Reader +Reader is a wrapper around a `[]byte` that implements the `io.Reader` interface. It is comparable to `bytes.Reader` but has slightly different semantics (and a slightly smaller memory footprint). + +### Writer +Writer is a buffer that implements the `io.Writer` interface and expands the buffer as needed. The reset functionality allows for better memory reuse. After calling `Reset`, it will overwrite the current buffer and thus reduce allocations. + +### Lexer +Lexer is a read buffer specifically designed for building lexers. It keeps track of two positions: a start and end position. The start position is the beginning of the current token being parsed; the end position is moved forward until a valid token is found. Calling `Shift` will collapse the positions to the end and return the parsed `[]byte`. + +The end position is moved with `Move(int)`, which also accepts negative integers. One can also save a position with `Pos() int` before trying to parse a token, and rewind with `Rewind(int)` if parsing fails, passing the previously saved position. + +`Peek(int) byte` will peek forward (relative to the end position) and return the byte at that location. `PeekRune(int) (rune, int)` returns the UTF-8 rune and its length at the given **byte** position. Upon an error `Peek` will return `0`; the **user must peek at every character** and not skip any, otherwise it may skip a `0` and panic on out-of-bounds indexing. + +`Lexeme() []byte` will return the currently selected bytes, while `Skip()` will collapse the selection. `Shift() []byte` is a combination of `Lexeme() []byte` and `Skip()`. + +When the passed `io.Reader` returns an error, `Err() error` will return that error even if not at the end of the buffer. + +### StreamLexer +StreamLexer behaves like Lexer but uses a buffer pool to read in chunks from `io.Reader`, retaining old buffers in memory that are still in use, and re-using old buffers otherwise. Calling `Free(n int)` frees up `n` bytes from the internal buffer(s). It holds an array of buffers to accommodate keeping everything in memory. Calling `ShiftLen() int` returns the number of bytes that have been shifted since the previous call to `ShiftLen`, which can be used to specify how many bytes need to be freed up from the buffer. If you don't need to keep returned byte slices around, call `Free(ShiftLen())` after every `Shift` call. + +## Strconv +This package contains string conversion functions much like the standard library's `strconv` package, but it is specifically tailored for the performance needs within the `minify` package.
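For illustration, a short sketch (not part of the upstream README) of the byte-slice API these conversion functions share: each returns the parsed value together with the number of bytes consumed, rather than an error.

``` go
package main

import (
	"fmt"

	"github.com/tdewolff/parse/v2/strconv"
)

func main() {
	// ParseFloat stops at the first invalid character and reports how many
	// bytes it consumed; a length of zero means no number was found.
	f, n := strconv.ParseFloat([]byte("3.14px"))
	fmt.Println(f, n) // 3.14 4

	// AppendFloat appends f to the slice with the given number of decimals
	// and reports whether the conversion succeeded (it fails on NaN or Inf).
	if b, ok := strconv.AppendFloat(nil, 12.5, 2); ok {
		fmt.Println(string(b))
	}
}
```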
+ +For example, the floating-point to string conversion function is approximately twice as fast as the standard library, but it is not as precise. + +## CSS +This package is a CSS3 lexer and parser. Both follow the specification at [CSS Syntax Module Level 3](http://www.w3.org/TR/css-syntax-3/). The lexer takes an io.Reader and converts it into tokens until the EOF. The parser returns a parse tree of the full io.Reader input stream, but the low-level `Next` function can be used for stream parsing to return grammar units until the EOF. + +[See README here](https://github.com/tdewolff/parse/tree/master/css). + +## HTML +This package is an HTML5 lexer. It follows the specification at [The HTML syntax](http://www.w3.org/TR/html5/syntax.html). The lexer takes an io.Reader and converts it into tokens until the EOF. + +[See README here](https://github.com/tdewolff/parse/tree/master/html). + +## JS +This package is a JS lexer (ECMA-262, edition 6.0). It follows the specification at [ECMAScript Language Specification](http://www.ecma-international.org/ecma-262/6.0/). The lexer takes an io.Reader and converts it into tokens until the EOF. + +[See README here](https://github.com/tdewolff/parse/tree/master/js). + +## JSON +This package is a JSON parser (ECMA-404). It follows the specification at [JSON](http://json.org/). The parser takes an io.Reader and converts it into tokens until the EOF. + +[See README here](https://github.com/tdewolff/parse/tree/master/json). + +## SVG +This package contains common hashes for SVG1.1 tags and attributes. + +## XML +This package is an XML1.0 lexer. It follows the specification at [Extensible Markup Language (XML) 1.0 (Fifth Edition)](http://www.w3.org/TR/xml/). The lexer takes an io.Reader and converts it into tokens until the EOF. + +[See README here](https://github.com/tdewolff/parse/tree/master/xml). + +## License +Released under the [MIT license](LICENSE.md). + +[1]: http://golang.org/ "Go Language" diff --git a/vendor/github.com/tdewolff/parse/v2/buffer/buffer.go b/vendor/github.com/tdewolff/parse/v2/buffer/buffer.go new file mode 100644 index 000000000..671b380d6 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/buffer/buffer.go @@ -0,0 +1,12 @@ +// Package buffer contains buffer and wrapper types for byte slices. It is useful for writing lexers or other high-performance byte slice handling. +// The `Reader` and `Writer` types implement the `io.Reader` and `io.Writer` interfaces respectively and provide a thinner and faster interface than `bytes.Buffer`. +// The `Lexer` type is useful for building lexers because it keeps track of the start and end position of a byte selection, and shifts the bytes whenever a valid token is found. +// The `StreamLexer` does the same, but keeps a buffer pool so that it reads a limited amount at a time, allowing parsing from streaming sources. +package buffer + +// defaultBufSize specifies the default initial length of internal buffers. +var defaultBufSize = 4096 + +// MinBuf specifies the default initial length of internal buffers. +// Solely here to support old versions of parse.
+var MinBuf = defaultBufSize diff --git a/vendor/github.com/tdewolff/parse/v2/buffer/lexer.go b/vendor/github.com/tdewolff/parse/v2/buffer/lexer.go new file mode 100644 index 000000000..46e6bdafd --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/buffer/lexer.go @@ -0,0 +1,164 @@ +package buffer + +import ( + "io" + "io/ioutil" +) + +var nullBuffer = []byte{0} + +// Lexer is a buffered reader that allows peeking forward and shifting, taking an io.Reader. +// It keeps data in-memory until Free, taking a byte length, is called to move beyond the data. +type Lexer struct { + buf []byte + pos int // index in buf + start int // index in buf + err error + + restore func() +} + +// NewLexer returns a new Lexer for a given io.Reader, and uses ioutil.ReadAll to read it into a byte slice. +// If the io.Reader implements Bytes, that is used instead. +// It will append a NULL at the end of the buffer. +func NewLexer(r io.Reader) *Lexer { + var b []byte + if r != nil { + if buffer, ok := r.(interface { + Bytes() []byte + }); ok { + b = buffer.Bytes() + } else { + var err error + b, err = ioutil.ReadAll(r) + if err != nil { + return &Lexer{ + buf: nullBuffer, + err: err, + } + } + } + } + return NewLexerBytes(b) +} + +// NewLexerBytes returns a new Lexer for a given byte slice, and appends NULL at the end. +// To avoid reallocation, make sure the capacity has room for one more byte. +func NewLexerBytes(b []byte) *Lexer { + z := &Lexer{ + buf: b, + } + + n := len(b) + if n == 0 { + z.buf = nullBuffer + } else { + // Append NULL to buffer, but try to avoid reallocation + if cap(b) > n { + // Overwrite next byte but restore when done + b = b[:n+1] + c := b[n] + b[n] = 0 + + z.buf = b + z.restore = func() { + b[n] = c + } + } else { + z.buf = append(b, 0) + } + } + return z +} + +// Restore restores the byte past the end of the buffer that was replaced by NULL. +func (z *Lexer) Restore() { + if z.restore != nil { + z.restore() + z.restore = nil + } +} + +// Err returns the error returned from io.Reader or io.EOF when the end has been reached. +func (z *Lexer) Err() error { + return z.PeekErr(0) +} + +// PeekErr returns the error at position pos. When pos is zero, this is the same as calling Err(). +func (z *Lexer) PeekErr(pos int) error { + if z.err != nil { + return z.err + } else if z.pos+pos >= len(z.buf)-1 { + return io.EOF + } + return nil +} + +// Peek returns the ith byte relative to the end position. +// Peek returns 0 when an error has occurred; Err returns the error. +func (z *Lexer) Peek(pos int) byte { + pos += z.pos + return z.buf[pos] +} + +// PeekRune returns the rune and rune length of the ith byte relative to the end position. +func (z *Lexer) PeekRune(pos int) (rune, int) { + // from unicode/utf8 + c := z.Peek(pos) + if c < 0xC0 || z.Peek(pos+1) == 0 { + return rune(c), 1 + } else if c < 0xE0 || z.Peek(pos+2) == 0 { + return rune(c&0x1F)<<6 | rune(z.Peek(pos+1)&0x3F), 2 + } else if c < 0xF0 || z.Peek(pos+3) == 0 { + return rune(c&0x0F)<<12 | rune(z.Peek(pos+1)&0x3F)<<6 | rune(z.Peek(pos+2)&0x3F), 3 + } + return rune(c&0x07)<<18 | rune(z.Peek(pos+1)&0x3F)<<12 | rune(z.Peek(pos+2)&0x3F)<<6 | rune(z.Peek(pos+3)&0x3F), 4 +} + +// Move advances the position. +func (z *Lexer) Move(n int) { + z.pos += n +} + +// Pos returns a mark to which the position can be rewound. +func (z *Lexer) Pos() int { + return z.pos - z.start +} + +// Rewind rewinds the position to the given position. +func (z *Lexer) Rewind(pos int) { + z.pos = z.start + pos +} + +// Lexeme returns the bytes of the current selection.
+func (z *Lexer) Lexeme() []byte { + return z.buf[z.start:z.pos:z.pos] +} + +// Skip collapses the position to the end of the selection. +func (z *Lexer) Skip() { + z.start = z.pos +} + +// Shift returns the bytes of the current selection and collapses the position to the end of the selection. +func (z *Lexer) Shift() []byte { + b := z.buf[z.start:z.pos:z.pos] + z.start = z.pos + return b +} + +// Offset returns the character position in the buffer. +func (z *Lexer) Offset() int { + return z.pos +} + +// Bytes returns the underlying buffer. +func (z *Lexer) Bytes() []byte { + return z.buf[: len(z.buf)-1 : len(z.buf)-1] +} + +// Reset resets the position to the start of the underlying buffer. +func (z *Lexer) Reset() { + z.start = 0 + z.pos = 0 +} diff --git a/vendor/github.com/tdewolff/parse/v2/buffer/reader.go b/vendor/github.com/tdewolff/parse/v2/buffer/reader.go new file mode 100644 index 000000000..9926eef66 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/buffer/reader.go @@ -0,0 +1,44 @@ +package buffer + +import "io" + +// Reader implements an io.Reader over a byte slice. +type Reader struct { + buf []byte + pos int +} + +// NewReader returns a new Reader for a given byte slice. +func NewReader(buf []byte) *Reader { + return &Reader{ + buf: buf, + } +} + +// Read reads bytes into the given byte slice and returns the number of bytes read and an error if one occurred. +func (r *Reader) Read(b []byte) (n int, err error) { + if len(b) == 0 { + return 0, nil + } + if r.pos >= len(r.buf) { + return 0, io.EOF + } + n = copy(b, r.buf[r.pos:]) + r.pos += n + return +} + +// Bytes returns the underlying byte slice. +func (r *Reader) Bytes() []byte { + return r.buf +} + +// Reset resets the position of the read pointer to the beginning of the underlying byte slice. +func (r *Reader) Reset() { + r.pos = 0 +} + +// Len returns the length of the buffer. +func (r *Reader) Len() int { + return len(r.buf) +} diff --git a/vendor/github.com/tdewolff/parse/v2/buffer/streamlexer.go b/vendor/github.com/tdewolff/parse/v2/buffer/streamlexer.go new file mode 100644 index 000000000..5ea2dd58d --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/buffer/streamlexer.go @@ -0,0 +1,223 @@ +package buffer + +import ( + "io" +) + +type block struct { + buf []byte + next int // index in pool plus one + active bool +} + +type bufferPool struct { + pool []block + head int // index in pool plus one + tail int // index in pool plus one + + pos int // byte pos in tail +} + +func (z *bufferPool) swap(oldBuf []byte, size int) []byte { + // find new buffer that can be reused + swap := -1 + for i := 0; i < len(z.pool); i++ { + if !z.pool[i].active && size <= cap(z.pool[i].buf) { + swap = i + break + } + } + if swap == -1 { // no free buffer found for reuse + if z.tail == 0 && z.pos >= len(oldBuf) && size <= cap(oldBuf) { // but we can reuse the current buffer!
+ z.pos -= len(oldBuf) + return oldBuf[:0] + } + // allocate new + z.pool = append(z.pool, block{make([]byte, 0, size), 0, true}) + swap = len(z.pool) - 1 + } + + newBuf := z.pool[swap].buf + + // put current buffer into pool + z.pool[swap] = block{oldBuf, 0, true} + if z.head != 0 { + z.pool[z.head-1].next = swap + 1 + } + z.head = swap + 1 + if z.tail == 0 { + z.tail = swap + 1 + } + + return newBuf[:0] +} + +func (z *bufferPool) free(n int) { + z.pos += n + // move the tail over to next buffers + for z.tail != 0 && z.pos >= len(z.pool[z.tail-1].buf) { + z.pos -= len(z.pool[z.tail-1].buf) + newTail := z.pool[z.tail-1].next + z.pool[z.tail-1].active = false // after this, any thread may pick up the inactive buffer, so it can't be used anymore + z.tail = newTail + } + if z.tail == 0 { + z.head = 0 + } +} + +// StreamLexer is a buffered reader that allows peeking forward and shifting, taking an io.Reader. +// It keeps data in-memory until Free, taking a byte length, is called to move beyond the data. +type StreamLexer struct { + r io.Reader + err error + + pool bufferPool + + buf []byte + start int // index in buf + pos int // index in buf + prevStart int + + free int +} + +// NewStreamLexer returns a new StreamLexer for a given io.Reader with a 4kB estimated buffer size. +// If the io.Reader implements Bytes, that buffer is used instead. +func NewStreamLexer(r io.Reader) *StreamLexer { + return NewStreamLexerSize(r, defaultBufSize) +} + +// NewStreamLexerSize returns a new StreamLexer for a given io.Reader and estimated required buffer size. +// If the io.Reader implements Bytes, that buffer is used instead. +func NewStreamLexerSize(r io.Reader, size int) *StreamLexer { + // if reader has the bytes in memory already, use that instead + if buffer, ok := r.(interface { + Bytes() []byte + }); ok { + return &StreamLexer{ + err: io.EOF, + buf: buffer.Bytes(), + } + } + return &StreamLexer{ + r: r, + buf: make([]byte, 0, size), + } +} + +func (z *StreamLexer) read(pos int) byte { + if z.err != nil { + return 0 + } + + // free unused bytes + z.pool.free(z.free) + z.free = 0 + + // get new buffer + c := cap(z.buf) + p := pos - z.start + 1 + if 2*p > c { // if the token is larger than half the buffer, increase buffer size + c = 2*c + p + } + d := len(z.buf) - z.start + buf := z.pool.swap(z.buf[:z.start], c) + copy(buf[:d], z.buf[z.start:]) // copy the left-overs (unfinished token) from the old buffer + + // read in new data for the rest of the buffer + var n int + for pos-z.start >= d && z.err == nil { + n, z.err = z.r.Read(buf[d:cap(buf)]) + d += n + } + pos -= z.start + z.pos -= z.start + z.start, z.buf = 0, buf[:d] + if pos >= d { + return 0 + } + return z.buf[pos] +} + +// Err returns the error returned from io.Reader. It may still return valid bytes for a while though. +func (z *StreamLexer) Err() error { + if z.err == io.EOF && z.pos < len(z.buf) { + return nil + } + return z.err +} + +// Free frees up bytes of length n from previously shifted tokens. +// Each call to Shift should at one point be followed by a call to Free with a length returned by ShiftLen. +func (z *StreamLexer) Free(n int) { + z.free += n +} + +// Peek returns the ith byte relative to the end position and possibly does an allocation. +// Peek returns zero when an error has occurred, Err returns the error. 
+// TODO: inline function +func (z *StreamLexer) Peek(pos int) byte { + pos += z.pos + if uint(pos) < uint(len(z.buf)) { // uint for BCE + return z.buf[pos] + } + return z.read(pos) +} + +// PeekRune returns the rune and rune length of the ith byte relative to the end position. +func (z *StreamLexer) PeekRune(pos int) (rune, int) { + // from unicode/utf8 + c := z.Peek(pos) + if c < 0xC0 { + return rune(c), 1 + } else if c < 0xE0 { + return rune(c&0x1F)<<6 | rune(z.Peek(pos+1)&0x3F), 2 + } else if c < 0xF0 { + return rune(c&0x0F)<<12 | rune(z.Peek(pos+1)&0x3F)<<6 | rune(z.Peek(pos+2)&0x3F), 3 + } + return rune(c&0x07)<<18 | rune(z.Peek(pos+1)&0x3F)<<12 | rune(z.Peek(pos+2)&0x3F)<<6 | rune(z.Peek(pos+3)&0x3F), 4 +} + +// Move advances the position. +func (z *StreamLexer) Move(n int) { + z.pos += n +} + +// Pos returns a mark to which the position can be rewound. +func (z *StreamLexer) Pos() int { + return z.pos - z.start +} + +// Rewind rewinds the position to the given position. +func (z *StreamLexer) Rewind(pos int) { + z.pos = z.start + pos +} + +// Lexeme returns the bytes of the current selection. +func (z *StreamLexer) Lexeme() []byte { + return z.buf[z.start:z.pos] +} + +// Skip collapses the position to the end of the selection. +func (z *StreamLexer) Skip() { + z.start = z.pos +} + +// Shift returns the bytes of the current selection and collapses the position to the end of the selection. +// Use ShiftLen to obtain the number of bytes moved since the last call, for use in calls to Free. +func (z *StreamLexer) Shift() []byte { + if z.pos > len(z.buf) { // make sure we peeked at least as much as we shift + z.read(z.pos - 1) + } + b := z.buf[z.start:z.pos] + z.start = z.pos + return b +} + +// ShiftLen returns the number of bytes moved since the last call to ShiftLen. This can be used in calls to Free because it takes into account multiple Shifts or Skips. +func (z *StreamLexer) ShiftLen() int { + n := z.start - z.prevStart + z.prevStart = z.start + return n +} diff --git a/vendor/github.com/tdewolff/parse/v2/buffer/writer.go b/vendor/github.com/tdewolff/parse/v2/buffer/writer.go new file mode 100644 index 000000000..b3c9990d9 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/buffer/writer.go @@ -0,0 +1,41 @@ +package buffer + +// Writer implements an io.Writer over a byte slice. +type Writer struct { + buf []byte +} + +// NewWriter returns a new Writer for a given byte slice. +func NewWriter(buf []byte) *Writer { + return &Writer{ + buf: buf, + } +} + +// Write writes bytes from the given byte slice and returns the number of bytes written and an error if one occurred. When err != nil, n == 0. +func (w *Writer) Write(b []byte) (int, error) { + n := len(b) + end := len(w.buf) + if end+n > cap(w.buf) { + buf := make([]byte, end, 2*cap(w.buf)+n) + copy(buf, w.buf) + w.buf = buf + } + w.buf = w.buf[:end+n] + return copy(w.buf[end:], b), nil +} + +// Len returns the length of the underlying byte slice. +func (w *Writer) Len() int { + return len(w.buf) +} + +// Bytes returns the underlying byte slice. +func (w *Writer) Bytes() []byte { + return w.buf +} + +// Reset empties and reuses the current buffer. Subsequent writes will overwrite the buffer, so any reference to the underlying slice is invalidated after this call.
+func (w *Writer) Reset() { + w.buf = w.buf[:0] +} diff --git a/vendor/github.com/tdewolff/parse/v2/common.go b/vendor/github.com/tdewolff/parse/v2/common.go new file mode 100644 index 000000000..da46cc3df --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/common.go @@ -0,0 +1,237 @@ +// Package parse contains a collection of parsers for various formats in its subpackages. +package parse + +import ( + "bytes" + "encoding/base64" + "errors" +) + +var ( + dataSchemeBytes = []byte("data:") + base64Bytes = []byte("base64") + textMimeBytes = []byte("text/plain") +) + +// ErrBadDataURI is returned by DataURI when the byte slice does not start with 'data:' or is too short. +var ErrBadDataURI = errors.New("not a data URI") + +// Number returns the number of bytes that parse as a number of the regex format (+|-)?([0-9]+(\.[0-9]+)?|\.[0-9]+)((e|E)(+|-)?[0-9]+)?. +func Number(b []byte) int { + if len(b) == 0 { + return 0 + } + i := 0 + if b[i] == '+' || b[i] == '-' { + i++ + if i >= len(b) { + return 0 + } + } + firstDigit := (b[i] >= '0' && b[i] <= '9') + if firstDigit { + i++ + for i < len(b) && b[i] >= '0' && b[i] <= '9' { + i++ + } + } + if i < len(b) && b[i] == '.' { + i++ + if i < len(b) && b[i] >= '0' && b[i] <= '9' { + i++ + for i < len(b) && b[i] >= '0' && b[i] <= '9' { + i++ + } + } else if firstDigit { + // . could belong to the next token + i-- + return i + } else { + return 0 + } + } else if !firstDigit { + return 0 + } + iOld := i + if i < len(b) && (b[i] == 'e' || b[i] == 'E') { + i++ + if i < len(b) && (b[i] == '+' || b[i] == '-') { + i++ + } + if i >= len(b) || b[i] < '0' || b[i] > '9' { + // e could belong to next token + return iOld + } + for i < len(b) && b[i] >= '0' && b[i] <= '9' { + i++ + } + } + return i +} + +// Dimension parses a byte-slice and returns the length of the number and its unit. +func Dimension(b []byte) (int, int) { + num := Number(b) + if num == 0 || num == len(b) { + return num, 0 + } else if b[num] == '%' { + return num, 1 + } else if b[num] >= 'a' && b[num] <= 'z' || b[num] >= 'A' && b[num] <= 'Z' { + i := num + 1 + for i < len(b) && (b[i] >= 'a' && b[i] <= 'z' || b[i] >= 'A' && b[i] <= 'Z') { + i++ + } + return num, i - num + } + return num, 0 +} + +// Mediatype parses a given mediatype and splits the mimetype from the parameters. +// It works similarly to mime.ParseMediaType but is faster. +func Mediatype(b []byte) ([]byte, map[string]string) { + i := 0 + for i < len(b) && b[i] == ' ' { + i++ + } + b = b[i:] + n := len(b) + mimetype := b + var params map[string]string + for i := 3; i < n; i++ { // mimetype is at least three characters long + if b[i] == ';' || b[i] == ' ' { + mimetype = b[:i] + if b[i] == ' ' { + i++ // space + for i < n && b[i] == ' ' { + i++ + } + if n <= i || b[i] != ';' { + break + } + } + params = map[string]string{} + s := string(b) + PARAM: + i++ // semicolon + for i < n && s[i] == ' ' { + i++ + } + start := i + for i < n && s[i] != '=' && s[i] != ';' && s[i] != ' ' { + i++ + } + key := s[start:i] + for i < n && s[i] == ' ' { + i++ + } + if i < n && s[i] == '=' { + i++ + for i < n && s[i] == ' ' { + i++ + } + start = i + for i < n && s[i] != ';' && s[i] != ' ' { + i++ + } + } else { + start = i + } + params[key] = s[start:i] + for i < n && s[i] == ' ' { + i++ + } + if i < n && s[i] == ';' { + goto PARAM + } + break + } + } + return mimetype, params +} + +// DataURI parses the given data URI and returns the mediatype, the decoded data, and an error.
+func DataURI(dataURI []byte) ([]byte, []byte, error) { + if len(dataURI) > 5 && bytes.Equal(dataURI[:5], dataSchemeBytes) { + dataURI = dataURI[5:] + inBase64 := false + var mediatype []byte + i := 0 + for j := 0; j < len(dataURI); j++ { + c := dataURI[j] + if c == '=' || c == ';' || c == ',' { + if c != '=' && bytes.Equal(TrimWhitespace(dataURI[i:j]), base64Bytes) { + if len(mediatype) > 0 { + mediatype = mediatype[:len(mediatype)-1] + } + inBase64 = true + i = j + } else if c != ',' { + mediatype = append(append(mediatype, TrimWhitespace(dataURI[i:j])...), c) + i = j + 1 + } else { + mediatype = append(mediatype, TrimWhitespace(dataURI[i:j])...) + } + if c == ',' { + if len(mediatype) == 0 || mediatype[0] == ';' { + mediatype = textMimeBytes + } + data := dataURI[j+1:] + if inBase64 { + decoded := make([]byte, base64.StdEncoding.DecodedLen(len(data))) + n, err := base64.StdEncoding.Decode(decoded, data) + if err != nil { + return nil, nil, err + } + data = decoded[:n] + } else { + data = DecodeURL(data) + } + return mediatype, data, nil + } + } + } + } + return nil, nil, ErrBadDataURI +} + +// QuoteEntity parses the given byte slice and returns the quote that got matched (' or ") and its entity length. +// TODO: deprecated +func QuoteEntity(b []byte) (quote byte, n int) { + if len(b) < 5 || b[0] != '&' { + return 0, 0 + } + if b[1] == '#' { + if b[2] == 'x' { + i := 3 + for i < len(b) && b[i] == '0' { + i++ + } + if i+2 < len(b) && b[i] == '2' && b[i+2] == ';' { + if b[i+1] == '2' { + return '"', i + 3 // &#x22; + } else if b[i+1] == '7' { + return '\'', i + 3 // &#x27; + } + } + } else { + i := 2 + for i < len(b) && b[i] == '0' { + i++ + } + if i+2 < len(b) && b[i] == '3' && b[i+2] == ';' { + if b[i+1] == '4' { + return '"', i + 3 // &#34; + } else if b[i+1] == '9' { + return '\'', i + 3 // &#39; + } + } + } + } else if len(b) >= 6 && b[5] == ';' { + if bytes.Equal(b[1:5], []byte{'q', 'u', 'o', 't'}) { + return '"', 6 // &quot; + } else if bytes.Equal(b[1:5], []byte{'a', 'p', 'o', 's'}) { + return '\'', 6 // &apos; + } + } + return 0, 0 +} diff --git a/vendor/github.com/tdewolff/parse/v2/error.go b/vendor/github.com/tdewolff/parse/v2/error.go new file mode 100644 index 000000000..f6657f711 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/error.go @@ -0,0 +1,47 @@ +package parse + +import ( + "bytes" + "fmt" + "io" +) + +// Error is a parsing error returned by the parser. It contains a message and the line, column, and context at which the error occurred. +type Error struct { + Message string + Line int + Column int + Context string +} + +// NewError creates a new error. +func NewError(r io.Reader, offset int, message string, a ...interface{}) *Error { + line, column, context := Position(r, offset) + if 0 < len(a) { + message = fmt.Sprintf(message, a...) + } + return &Error{ + Message: message, + Line: line, + Column: column, + Context: context, + } +} + +// NewErrorLexer creates a new error from an active Input (lexer). +func NewErrorLexer(l *Input, message string, a ...interface{}) *Error { + r := bytes.NewBuffer(l.Bytes()) + offset := l.Offset() + return NewError(r, offset, message, a...) +} + +// Position returns the line, column, and context of the error. +// Context is the entire line at which the error occurred. +func (e *Error) Position() (int, int, string) { + return e.Line, e.Column, e.Context +} + +// Error returns the error string, containing the context and the line and column numbers.
+func (e *Error) Error() string { + return fmt.Sprintf("%s on line %d and column %d\n%s", e.Message, e.Line, e.Column, e.Context) +} diff --git a/vendor/github.com/tdewolff/parse/v2/go.mod b/vendor/github.com/tdewolff/parse/v2/go.mod new file mode 100644 index 000000000..6432178e8 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/go.mod @@ -0,0 +1,5 @@ +module github.com/tdewolff/parse/v2 + +go 1.13 + +require github.com/tdewolff/test v1.0.6 diff --git a/vendor/github.com/tdewolff/parse/v2/go.sum b/vendor/github.com/tdewolff/parse/v2/go.sum new file mode 100644 index 000000000..7893d5c89 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/go.sum @@ -0,0 +1,2 @@ +github.com/tdewolff/test v1.0.6 h1:76mzYJQ83Op284kMT+63iCNCI7NEERsIN8dLM+RiKr4= +github.com/tdewolff/test v1.0.6/go.mod h1:6DAvZliBAAnD7rhVgwaM7DE5/d9NMOAJ09SqYqeK4QE= diff --git a/vendor/github.com/tdewolff/parse/v2/html/README.md b/vendor/github.com/tdewolff/parse/v2/html/README.md new file mode 100644 index 000000000..53145dbd9 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/html/README.md @@ -0,0 +1,98 @@ +# HTML [](https://pkg.go.dev/github.com/tdewolff/parse/v2/html?tab=doc) + +This package is an HTML5 lexer written in [Go][1]. It follows the specification at [The HTML syntax](http://www.w3.org/TR/html5/syntax.html). The lexer takes an io.Reader and converts it into tokens until the EOF. + +## Installation +Run the following command + + go get -u github.com/tdewolff/parse/v2/html + +or add the following import and run the project with `go get` + + import "github.com/tdewolff/parse/v2/html" + +## Lexer +### Usage +The following initializes a new Lexer with io.Reader `r`: +``` go +l := html.NewLexer(parse.NewInput(r)) +``` + +To tokenize until EOF or an error occurs, use: +``` go +for { + tt, data := l.Next() + switch tt { + case html.ErrorToken: + // error or EOF set in l.Err() + return + case html.StartTagToken: + // ... + for { + ttAttr, dataAttr := l.Next() + if ttAttr != html.AttributeToken { + break + } + // ... + } + // ... + } +} +``` + +All tokens: +``` go +ErrorToken TokenType = iota // extra token when errors occur +CommentToken +DoctypeToken +StartTagToken +StartTagCloseToken +StartTagVoidToken +EndTagToken +AttributeToken +TextToken +``` + +### Examples +``` go +package main + +import ( + "fmt" + "io" + "os" + + "github.com/tdewolff/parse/v2" + "github.com/tdewolff/parse/v2/html" +) + +// Tokenize HTML from stdin. +func main() { + l := html.NewLexer(parse.NewInput(os.Stdin)) + for { + tt, data := l.Next() + switch tt { + case html.ErrorToken: + if l.Err() != io.EOF { + fmt.Println("Error:", l.Err()) + } + return + case html.StartTagToken: + fmt.Println("Tag", string(data)) + for { + ttAttr, dataAttr := l.Next() + if ttAttr != html.AttributeToken { + break + } + + key := dataAttr + val := l.AttrVal() + fmt.Println("Attribute", string(key), "=", string(val)) + } + // ... + } + } +} +``` + +## License +Released under the [MIT license](https://github.com/tdewolff/parse/blob/master/LICENSE.md).
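As an aside on the `hash.go` file vendored next: it defines perfect-hash constants for the raw-text tag names, so a tag can be tested with a single integer comparison instead of byte-slice comparisons. A brief sketch (not from the upstream README), assuming a lexer `l` positioned on a start tag:

``` go
// After a StartTagToken, l.Text() holds the lowercased tag name;
// hashing it once avoids repeated byte-slice comparisons.
if h := html.ToHash(l.Text()); h == html.Script || h == html.Style {
	// raw-text element: its contents arrive as a single TextToken
}
```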
+ +[1]: http://golang.org/ "Go Language" diff --git a/vendor/github.com/tdewolff/parse/v2/html/hash.go b/vendor/github.com/tdewolff/parse/v2/html/hash.go new file mode 100644 index 000000000..16432ade1 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/html/hash.go @@ -0,0 +1,81 @@ +package html + +// generated by hasher -type=Hash -file=hash.go; DO NOT EDIT, except for adding more constants to the list and rerun go generate + +// uses github.com/tdewolff/hasher +//go:generate hasher -type=Hash -file=hash.go + +// Hash defines perfect hashes for a predefined list of strings +type Hash uint32 + +// Unique hash definitions to be used instead of strings +const ( + Iframe Hash = 0x6 // iframe + Math Hash = 0x604 // math + Plaintext Hash = 0x1e09 // plaintext + Script Hash = 0xa06 // script + Style Hash = 0x1405 // style + Svg Hash = 0x1903 // svg + Textarea Hash = 0x2308 // textarea + Title Hash = 0xf05 // title + Xmp Hash = 0x1c03 // xmp +) + +// String returns the hash's name. +func (i Hash) String() string { + start := uint32(i >> 8) + n := uint32(i & 0xff) + if start+n > uint32(len(_Hash_text)) { + return "" + } + return _Hash_text[start : start+n] +} + +// ToHash returns the hash whose name is s. It returns zero if there is no +// such hash. It is case sensitive. +func ToHash(s []byte) Hash { + if len(s) == 0 || len(s) > _Hash_maxLen { + return 0 + } + h := uint32(_Hash_hash0) + for i := 0; i < len(s); i++ { + h ^= uint32(s[i]) + h *= 16777619 + } + if i := _Hash_table[h&uint32(len(_Hash_table)-1)]; int(i&0xff) == len(s) { + t := _Hash_text[i>>8 : i>>8+i&0xff] + for i := 0; i < len(s); i++ { + if t[i] != s[i] { + goto NEXT + } + } + return i + } +NEXT: + if i := _Hash_table[(h>>16)&uint32(len(_Hash_table)-1)]; int(i&0xff) == len(s) { + t := _Hash_text[i>>8 : i>>8+i&0xff] + for i := 0; i < len(s); i++ { + if t[i] != s[i] { + return 0 + } + } + return i + } + return 0 +} + +const _Hash_hash0 = 0x9acb0442 +const _Hash_maxLen = 9 +const _Hash_text = "iframemathscriptitlestylesvgxmplaintextarea" + +var _Hash_table = [1 << 4]Hash{ + 0x0: 0x2308, // textarea + 0x2: 0x6, // iframe + 0x4: 0xf05, // title + 0x5: 0x1e09, // plaintext + 0x7: 0x1405, // style + 0x8: 0x604, // math + 0x9: 0xa06, // script + 0xa: 0x1903, // svg + 0xb: 0x1c03, // xmp +} diff --git a/vendor/github.com/tdewolff/parse/v2/html/lex.go b/vendor/github.com/tdewolff/parse/v2/html/lex.go new file mode 100644 index 000000000..2c47d2527 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/html/lex.go @@ -0,0 +1,493 @@ +// Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html. +package html + +import ( + "strconv" + + "github.com/tdewolff/parse/v2" +) + +// TokenType determines the type of token, e.g. a number or a semicolon. +type TokenType uint32 + +// TokenType values. +const ( + ErrorToken TokenType = iota // extra token when errors occur + CommentToken + DoctypeToken + StartTagToken + StartTagCloseToken + StartTagVoidToken + EndTagToken + AttributeToken + TextToken + SvgToken + MathToken +) + +// String returns the string representation of a TokenType.
+func (tt TokenType) String() string { + switch tt { + case ErrorToken: + return "Error" + case CommentToken: + return "Comment" + case DoctypeToken: + return "Doctype" + case StartTagToken: + return "StartTag" + case StartTagCloseToken: + return "StartTagClose" + case StartTagVoidToken: + return "StartTagVoid" + case EndTagToken: + return "EndTag" + case AttributeToken: + return "Attribute" + case TextToken: + return "Text" + case SvgToken: + return "Svg" + case MathToken: + return "Math" + } + return "Invalid(" + strconv.Itoa(int(tt)) + ")" +} + +//////////////////////////////////////////////////////////////// + +// Lexer is the state for the lexer. +type Lexer struct { + r *parse.Input + err error + + rawTag Hash + inTag bool + + text []byte + attrVal []byte +} + +// NewLexer returns a new Lexer for a given io.Reader. +func NewLexer(r *parse.Input) *Lexer { + return &Lexer{ + r: r, + } +} + +// Err returns the error encountered during lexing; this is often io.EOF, but other errors can be returned as well. +func (l *Lexer) Err() error { + if l.err != nil { + return l.err + } + return l.r.Err() +} + +// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters. +func (l *Lexer) Text() []byte { + return l.text +} + +// AttrVal returns the attribute value when an AttributeToken was returned from Next. +func (l *Lexer) AttrVal() []byte { + return l.attrVal +} + +// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message. +func (l *Lexer) Next() (TokenType, []byte) { + l.text = nil + var c byte + if l.inTag { + l.attrVal = nil + for { // before attribute name state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + continue + } + break + } + if c == 0 && l.r.Err() != nil { + return ErrorToken, nil + } else if c != '>' && (c != '/' || l.r.Peek(1) != '>') { + return AttributeToken, l.shiftAttribute() + } + l.r.Skip() + l.inTag = false + if c == '/' { + l.r.Move(2) + return StartTagVoidToken, l.r.Shift() + } + l.r.Move(1) + return StartTagCloseToken, l.r.Shift() + } + + if l.rawTag != 0 { + if rawText := l.shiftRawText(); len(rawText) > 0 { + l.rawTag = 0 + return TextToken, rawText + } + l.rawTag = 0 + } + + for { + c = l.r.Peek(0) + if c == '<' { + c = l.r.Peek(1) + isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil) + if l.r.Pos() > 0 { + if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' { + // return the currently buffered text token so that we can return the tag next iteration + l.text = l.r.Shift() + return TextToken, l.text + } + } else if isEndTag { + l.r.Move(2) + // only end tags that are not followed by > or EOF arrive here + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + return CommentToken, l.shiftBogusComment() + } + return EndTagToken, l.shiftEndTag() + } else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + l.r.Move(1) + l.inTag = true + return l.shiftStartTag() + } else if c == '!' { + l.r.Move(2) + return l.readMarkup() + } else if c == '?'
{ + l.r.Move(1) + return CommentToken, l.shiftBogusComment() + } + } else if c == 0 && l.r.Err() != nil { + if l.r.Pos() > 0 { + l.text = l.r.Shift() + return TextToken, l.text + } + return ErrorToken, nil + } + l.r.Move(1) + } +} + +//////////////////////////////////////////////////////////////// + +// The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html + +func (l *Lexer) shiftRawText() []byte { + if l.rawTag == Plaintext { + for { + if l.r.Peek(0) == 0 && l.r.Err() != nil { + return l.r.Shift() + } + l.r.Move(1) + } + } else { // RCDATA, RAWTEXT and SCRIPT + for { + c := l.r.Peek(0) + if c == '<' { + if l.r.Peek(1) == '/' { + mark := l.r.Pos() + l.r.Move(2) + for { + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + break + } + l.r.Move(1) + } + if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice + l.r.Rewind(mark) + return l.r.Shift() + } + } else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' { + l.r.Move(4) + inScript := false + for { + c := l.r.Peek(0) + if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' { + l.r.Move(3) + break + } else if c == '<' { + isEnd := l.r.Peek(1) == '/' + if isEnd { + l.r.Move(2) + } else { + l.r.Move(1) + } + mark := l.r.Pos() + for { + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + break + } + l.r.Move(1) + } + if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice + if !isEnd { + inScript = true + } else { + if !inScript { + l.r.Rewind(mark - 2) + return l.r.Shift() + } + inScript = false + } + } + } else if c == 0 && l.r.Err() != nil { + return l.r.Shift() + } else { + l.r.Move(1) + } + } + } else { + l.r.Move(1) + } + } else if c == 0 && l.r.Err() != nil { + return l.r.Shift() + } else { + l.r.Move(1) + } + } + } +} + +func (l *Lexer) readMarkup() (TokenType, []byte) { + if l.at('-', '-') { + l.r.Move(2) + for { + if l.r.Peek(0) == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[4:] + return CommentToken, l.r.Shift() + } else if l.at('-', '-', '>') { + l.text = l.r.Lexeme()[4:] + l.r.Move(3) + return CommentToken, l.r.Shift() + } else if l.at('-', '-', '!', '>') { + l.text = l.r.Lexeme()[4:] + l.r.Move(4) + return CommentToken, l.r.Shift() + } + l.r.Move(1) + } + } else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') { + l.r.Move(7) + for { + if l.r.Peek(0) == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[9:] + return TextToken, l.r.Shift() + } else if l.at(']', ']', '>') { + l.text = l.r.Lexeme()[9:] + l.r.Move(3) + return TextToken, l.r.Shift() + } + l.r.Move(1) + } + } else { + if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') { + l.r.Move(7) + if l.r.Peek(0) == ' ' { + l.r.Move(1) + } + for { + if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[9:] + if c == '>' { + l.r.Move(1) + } + return DoctypeToken, l.r.Shift() + } + l.r.Move(1) + } + } + } + return CommentToken, l.shiftBogusComment() +} + +func (l *Lexer) shiftBogusComment() []byte { + for { + c := l.r.Peek(0) + if c == '>' { + l.text = l.r.Lexeme()[2:] + l.r.Move(1) + return l.r.Shift() + } else if c == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[2:] + return l.r.Shift() + } + l.r.Move(1) + } +} + +func (l *Lexer) shiftStartTag() (TokenType, []byte) { + for { + if c := l.r.Peek(0); c == ' ' || c == '>' || c == 
'/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { + break + } + l.r.Move(1) + } + l.text = parse.ToLower(l.r.Lexeme()[1:]) + if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math { + if h == Svg || h == Math { + data := l.shiftXML(h) + if l.err != nil { + return ErrorToken, nil + } + + l.inTag = false + if h == Svg { + return SvgToken, data + } + return MathToken, data + } + l.rawTag = h + } + return StartTagToken, l.r.Shift() +} + +func (l *Lexer) shiftAttribute() []byte { + nameStart := l.r.Pos() + var c byte + for { // attribute name state + if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { + break + } + l.r.Move(1) + } + nameEnd := l.r.Pos() + for { // after attribute name state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + continue + } + break + } + if c == '=' { + l.r.Move(1) + for { // before attribute value state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + continue + } + break + } + attrPos := l.r.Pos() + delim := c + if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state + l.r.Move(1) + for { + c := l.r.Peek(0) + if c == delim { + l.r.Move(1) + break + } else if c == 0 && l.r.Err() != nil { + break + } + l.r.Move(1) + } + } else { // attribute value unquoted state + for { + if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { + break + } + l.r.Move(1) + } + } + l.attrVal = l.r.Lexeme()[attrPos:] + } else { + l.r.Rewind(nameEnd) + l.attrVal = nil + } + l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd]) + return l.r.Shift() +} + +func (l *Lexer) shiftEndTag() []byte { + for { + c := l.r.Peek(0) + if c == '>' { + l.text = l.r.Lexeme()[2:] + l.r.Move(1) + break + } else if c == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[2:] + break + } + l.r.Move(1) + } + + end := len(l.text) + for end > 0 { + if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' { + end-- + continue + } + break + } + l.text = l.text[:end] + return parse.ToLower(l.r.Shift()) +} + +// shiftXML parses the content of an svg or math tag according to the XML 1.1 specification, including the tag itself. +// So far we have already parsed `<svg` or `<math`.
+func (l *Lexer) shiftXML(rawTag Hash) []byte { + inQuote := false + for { + c := l.r.Peek(0) + if c == '"' { + inQuote = !inQuote + l.r.Move(1) + } else if c == '<' && !inQuote && l.r.Peek(1) == '/' { + mark := l.r.Pos() + l.r.Move(2) + for { + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + break + } + l.r.Move(1) + } + if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice + break + } + } else if c == 0 { + if l.r.Err() == nil { + l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character") + } + return l.r.Shift() + } else { + l.r.Move(1) + } + } + + for { + c := l.r.Peek(0) + if c == '>' { + l.r.Move(1) + break + } else if c == 0 { + if l.r.Err() == nil { + l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character") + } + return l.r.Shift() + } + l.r.Move(1) + } + return l.r.Shift() +} + +//////////////////////////////////////////////////////////////// + +func (l *Lexer) at(b ...byte) bool { + for i, c := range b { + if l.r.Peek(i) != c { + return false + } + } + return true +} + +func (l *Lexer) atCaseInsensitive(b ...byte) bool { + for i, c := range b { + if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c { + return false + } + } + return true +} diff --git a/vendor/github.com/tdewolff/parse/v2/html/util.go b/vendor/github.com/tdewolff/parse/v2/html/util.go new file mode 100644 index 000000000..fe12f17ff --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/html/util.go @@ -0,0 +1,103 @@ +package html + +var ( + singleQuoteEntityBytes = []byte("&#39;") + doubleQuoteEntityBytes = []byte("&#34;") +) + +// EscapeAttrVal returns the escaped attribute value bytes, including quotes when quoting is necessary. +func EscapeAttrVal(buf *[]byte, orig, b []byte, isXML bool) []byte { + singles := 0 + doubles := 0 + unquoted := true + entities := false + for _, c := range b { + if charTable[c] { + unquoted = false + if c == '"' { + doubles++ + } else if c == '\'' { + singles++ + } + } + } + if unquoted && !isXML { + return b + } else if !entities && len(orig) == len(b)+2 && (singles == 0 && orig[0] == '\'' || doubles == 0 && orig[0] == '"') { + return orig + } + + n := len(b) + 2 + var quote byte + var escapedQuote []byte + if singles >= doubles || isXML { + n += doubles * 4 + quote = '"' + escapedQuote = doubleQuoteEntityBytes + } else { + n += singles * 4 + quote = '\'' + escapedQuote = singleQuoteEntityBytes + } + if n > cap(*buf) { + *buf = make([]byte, 0, n) // maximum size, not actual size + } + t := (*buf)[:n] // maximum size, not actual size + t[0] = quote + j := 1 + start := 0 + for i, c := range b { + if c == quote { + j += copy(t[j:], b[start:i]) + j += copy(t[j:], escapedQuote) + start = i + 1 + } + } + j += copy(t[j:], b[start:]) + t[j] = quote + return t[:j+1] +} + +var charTable = [256]bool{ + // ASCII + false, false, false, false, false, false, false, false, + false, true, true, false, true, true, false, false, // tab, line feed, form feed, carriage return + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + true, false, true, false, false, false, false, true, // space, ", ' + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, true, true, true, false, // <, =, > + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false,
false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + true, false, false, false, false, false, false, false, // ` + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + // non-ASCII + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, +} diff --git a/vendor/github.com/tdewolff/parse/v2/input.go b/vendor/github.com/tdewolff/parse/v2/input.go new file mode 100644 index 000000000..5b6d8f547 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/input.go @@ -0,0 +1,173 @@ +package parse + +import ( + "io" + "io/ioutil" +) + +var nullBuffer = []byte{0} + +// Input is a buffered reader that allows peeking forward and shifting, taking an io.Reader. +// It keeps data in-memory until Free, taking a byte length, is called to move beyond the data. +type Input struct { + buf []byte + pos int // index in buf + start int // index in buf + err error + + restore func() +} + +// NewInput returns a new Input for a given io.Reader and uses ioutil.ReadAll to read it into a byte slice. +// If the io.Reader implements Bytes, that is used instead. It will append a NULL at the end of the buffer. +func NewInput(r io.Reader) *Input { + var b []byte + if r != nil { + if buffer, ok := r.(interface { + Bytes() []byte + }); ok { + b = buffer.Bytes() + } else { + var err error + b, err = ioutil.ReadAll(r) + if err != nil { + return &Input{ + buf: nullBuffer, + err: err, + } + } + } + } + return NewInputBytes(b) +} + +// NewInputString returns a new Input for a given string and appends NULL at the end. +func NewInputString(s string) *Input { + return NewInputBytes([]byte(s)) +} + +// NewInputBytes returns a new Input for a given byte slice and appends NULL at the end. +// To avoid reallocation, make sure the capacity has room for one more byte. +func NewInputBytes(b []byte) *Input { + z := &Input{ + buf: b, + } + + n := len(b) + if n == 0 { + z.buf = nullBuffer + } else { + // Append NULL to buffer, but try to avoid reallocation + if cap(b) > n { + // Overwrite next byte but restore when done + b = b[:n+1] + c := b[n] + b[n] = 0 + + z.buf = b + z.restore = func() { + b[n] = c + } + } else { + z.buf = append(b, 0) + } + } + return z +} + +// Restore restores the byte past the end of the buffer that was replaced by NULL. +func (z *Input) Restore() { + if z.restore != nil { + z.restore() + z.restore = nil + } +} + +// Err returns the error returned from the io.Reader or io.EOF when the end has been reached.
+func (z *Input) Err() error { + return z.PeekErr(0) +} + +// PeekErr returns the error at position pos. When pos is zero, this is the same as calling Err(). +func (z *Input) PeekErr(pos int) error { + if z.err != nil { + return z.err + } else if z.pos+pos >= len(z.buf)-1 { + return io.EOF + } + return nil +} + +// Peek returns the ith byte relative to the end position. +// Peek returns 0 when an error has occurred; Err returns the error. +func (z *Input) Peek(pos int) byte { + pos += z.pos + return z.buf[pos] +} + +// PeekRune returns the rune and rune length of the ith byte relative to the end position. +func (z *Input) PeekRune(pos int) (rune, int) { + // from unicode/utf8 + c := z.Peek(pos) + if c < 0xC0 || z.Peek(pos+1) == 0 { + return rune(c), 1 + } else if c < 0xE0 || z.Peek(pos+2) == 0 { + return rune(c&0x1F)<<6 | rune(z.Peek(pos+1)&0x3F), 2 + } else if c < 0xF0 || z.Peek(pos+3) == 0 { + return rune(c&0x0F)<<12 | rune(z.Peek(pos+1)&0x3F)<<6 | rune(z.Peek(pos+2)&0x3F), 3 + } + return rune(c&0x07)<<18 | rune(z.Peek(pos+1)&0x3F)<<12 | rune(z.Peek(pos+2)&0x3F)<<6 | rune(z.Peek(pos+3)&0x3F), 4 +} + +// Move advances the position. +func (z *Input) Move(n int) { + z.pos += n +} + +// Pos returns a mark to which the position can be rewound. +func (z *Input) Pos() int { + return z.pos - z.start +} + +// Rewind rewinds the position to the given position. +func (z *Input) Rewind(pos int) { + z.pos = z.start + pos +} + +// Lexeme returns the bytes of the current selection. +func (z *Input) Lexeme() []byte { + return z.buf[z.start:z.pos:z.pos] +} + +// Skip collapses the position to the end of the selection. +func (z *Input) Skip() { + z.start = z.pos +} + +// Shift returns the bytes of the current selection and collapses the position to the end of the selection. +func (z *Input) Shift() []byte { + b := z.buf[z.start:z.pos:z.pos] + z.start = z.pos + return b +} + +// Offset returns the character position in the buffer. +func (z *Input) Offset() int { + return z.pos +} + +// Bytes returns the underlying buffer. +func (z *Input) Bytes() []byte { + return z.buf[: len(z.buf)-1 : len(z.buf)-1] +} + +// Len returns the length of the underlying buffer. +func (z *Input) Len() int { + return len(z.buf) - 1 +} + +// Reset resets the position to the start of the underlying buffer. +func (z *Input) Reset() { + z.start = 0 + z.pos = 0 +} diff --git a/vendor/github.com/tdewolff/parse/v2/position.go b/vendor/github.com/tdewolff/parse/v2/position.go new file mode 100644 index 000000000..38e38cee4 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/position.go @@ -0,0 +1,95 @@ +package parse + +import ( + "fmt" + "io" + "strings" + "unicode" +) + +// Position returns the line and column number for a certain position in a file. It is useful for recovering the position in a file that caused an error. +// It treats \n, \r, \r\n, \u2028, and \u2029 as newlines, which might be different from some languages that also recognize \f as a newline.
+func Position(r io.Reader, offset int) (line, col int, context string) { + l := NewInput(r) + line = 1 + for l.Pos() < offset { + c := l.Peek(0) + n := 1 + newline := false + if c == '\n' { + newline = true + } else if c == '\r' { + if l.Peek(1) == '\n' { + newline = true + n = 2 + } else { + newline = true + } + } else if c >= 0xC0 { + var r rune + if r, n = l.PeekRune(0); r == '\u2028' || r == '\u2029' { + newline = true + } + } else if c == 0 && l.Err() != nil { + break + } + + if 1 < n && offset < l.Pos()+n { + break + } + l.Move(n) + + if newline { + line++ + offset -= l.Pos() + l.Skip() + } + } + + col = len([]rune(string(l.Lexeme()))) + 1 + context = positionContext(l, line, col) + return +} + +func positionContext(l *Input, line, col int) (context string) { + for { + c := l.Peek(0) + if c == 0 && l.Err() != nil || c == '\n' || c == '\r' { + break + } + l.Move(1) + } + rs := []rune(string(l.Lexeme())) + + // cut off front or rear of context to stay within 60 characters + limit := 60 + offset := 20 + ellipsisFront := "" + ellipsisRear := "" + if limit < len(rs) { + if col <= limit-offset { + ellipsisRear = "..." + rs = rs[:limit-3] + } else if col >= len(rs)-offset-3 { + ellipsisFront = "..." + col -= len(rs) - offset - offset - 7 + rs = rs[len(rs)-offset-offset-4:] + } else { + ellipsisFront = "..." + ellipsisRear = "..." + rs = rs[col-offset-1 : col+offset] + col = offset + 4 + } + } + + // replace unprintable characters by a middle dot + for i, r := range rs { + if !unicode.IsGraphic(r) { + rs[i] = '·' + } + } + + context += fmt.Sprintf("%5d: %s%s%s\n", line, ellipsisFront, string(rs), ellipsisRear) + context += fmt.Sprintf("%s^", strings.Repeat(" ", 6+col)) + return +} diff --git a/vendor/github.com/tdewolff/parse/v2/strconv/float.go b/vendor/github.com/tdewolff/parse/v2/strconv/float.go new file mode 100644 index 000000000..c89bdb29d --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/strconv/float.go @@ -0,0 +1,257 @@ +package strconv + +import ( + "math" +) + +var float64pow10 = []float64{ + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, + 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, + 1e20, 1e21, 1e22, +} + +// ParseFloat parses a byte-slice and returns the float it represents. +// If an invalid character is encountered, it will stop there. +func ParseFloat(b []byte) (float64, int) { + i := 0 + neg := false + if i < len(b) && (b[i] == '+' || b[i] == '-') { + neg = b[i] == '-' + i++ + } + start := i + dot := -1 + trunk := -1 + n := uint64(0) + for ; i < len(b); i++ { + c := b[i] + if c >= '0' && c <= '9' { + if trunk == -1 { + if n > math.MaxUint64/10 { + trunk = i + } else { + n *= 10 + n += uint64(c - '0') + } + } + } else if dot == -1 && c == '.' { + dot = i + } else { + break + } + } + if i == start || i == start+1 && dot == start { + return 0.0, 0 + } + + f := float64(n) + if neg { + f = -f + } + + mantExp := int64(0) + if dot != -1 { + if trunk == -1 { + trunk = i + } + mantExp = int64(trunk - dot - 1) + } else if trunk != -1 { + mantExp = int64(trunk - i) + } + expExp := int64(0) + if i < len(b) && (b[i] == 'e' || b[i] == 'E') { + startExp := i + i++ + if e, expLen := ParseInt(b[i:]); expLen > 0 { + expExp = e + i += expLen + } else { + i = startExp + } + } + exp := expExp - mantExp + + // copied from strconv/atof.go + if exp == 0 { + return f, i + } else if exp > 0 && exp <= 15+22 { // int * 10^k + // If exponent is big but number of digits is not, + // can move a few zeros into the integer part.
+		if exp > 22 {
+			f *= float64pow10[exp-22]
+			exp = 22
+		}
+		if f <= 1e15 && f >= -1e15 {
+			return f * float64pow10[exp], i
+		}
+	} else if exp < 0 && exp >= -22 { // int / 10^k
+		return f / float64pow10[-exp], i
+	}
+	f *= math.Pow10(int(-mantExp))
+	return f * math.Pow10(int(expExp)), i
+}
+
+const log2 = 0.3010299956639812
+
+func float64exp(f float64) int {
+	exp2 := 0
+	if f != 0.0 {
+		x := math.Float64bits(f)
+		exp2 = int(x>>(64-11-1))&0x7FF - 1023 + 1
+	}
+
+	exp10 := float64(exp2) * log2
+	if exp10 < 0 {
+		exp10 -= 1.0
+	}
+	return int(exp10)
+}
+
+// AppendFloat appends a float to `b` with precision `prec`. It returns the new slice and whether the operation was successful. Precision is the number of decimals to display, thus prec + 1 == number of significant digits.
+func AppendFloat(b []byte, f float64, prec int) ([]byte, bool) {
+	if math.IsNaN(f) || math.IsInf(f, 0) {
+		return b, false
+	}
+
+	neg := false
+	if f < 0.0 {
+		f = -f
+		neg = true
+	}
+	if prec < 0 || 17 < prec {
+		prec = 17 // maximum number of significant digits in double
+	}
+	prec -= float64exp(f) // number of digits in front of the dot
+	f *= math.Pow10(prec)
+
+	// calculate mantissa and exponent
+	mant := int64(f)
+	mantLen := LenInt(mant)
+	mantExp := mantLen - prec - 1
+	if mant == 0 {
+		return append(b, '0'), true
+	}
+
+	// exp is zero for positive exponents, because positive exponents are determined later on in the big conversion loop
+	exp := 0
+	expLen := 0
+	if mantExp > 0 {
+		// positive exponent is determined in the loop below
+		// but if we initially decreased the exponent to fit in an integer, we can't set the new exponent in the loop alone,
+		// since the number of zeros at the end determines the positive exponent in the loop, and we just artificially lost zeros
+		if prec < 0 {
+			exp = mantExp
+		}
+		expLen = 1 + LenInt(int64(exp)) // e + digits
+	} else if mantExp < -3 {
+		exp = mantExp
+		expLen = 2 + LenInt(int64(exp)) // e + minus + digits
+	} else if mantExp < -1 {
+		mantLen += -mantExp - 1 // extra zero between dot and first digit
+	}
+
+	// reserve space in b
+	i := len(b)
+	maxLen := 1 + mantLen + expLen // dot + mantissa digits + exponent
+	if neg {
+		maxLen++
+	}
+	if i+maxLen > cap(b) {
+		b = append(b, make([]byte, maxLen)...)
+	} else {
+		b = b[:i+maxLen]
+	}
+
+	// write to string representation
+	if neg {
+		b[i] = '-'
+		i++
+	}
+
+	// big conversion loop, start at the end and move to the front
+	// initially print trailing zeros and remove them later on
+	// for example if the first non-zero digit is three positions in front of the dot, it will overwrite the zeros with a positive exponent
+	zero := true
+	last := i + mantLen      // right-most position of digit that is non-zero + dot
+	dot := last - prec - exp // position of dot
+	j := last
+	for mant > 0 {
+		if j == dot {
+			b[j] = '.'
+			j--
+		}
+		newMant := mant / 10
+		digit := mant - 10*newMant
+		if zero && digit > 0 {
+			// first non-zero digit, if we are still behind the dot we can trim the end to this position
+			// otherwise trim to the dot (including the dot)
+			if j > dot {
+				i = j + 1
+				// decrease negative exponent further to get rid of dot
+				if exp < 0 {
+					newExp := exp - (j - dot)
+					// getting rid of the dot shouldn't lower the exponent to more digits (e.g. -9 -> -10)
+					if LenInt(int64(newExp)) == LenInt(int64(exp)) {
+						exp = newExp
+						dot = j
+						j--
+						i--
+					}
+				}
+			} else {
+				i = dot
+			}
+			last = j
+			zero = false
+		}
+		b[j] = '0' + byte(digit)
+		j--
+		mant = newMant
+	}
+
+	if j > dot {
+		// extra zeros behind the dot
+		for j > dot {
+			b[j] = '0'
+			j--
+		}
+		b[j] = '.'
+	} else if last+3 < dot {
+		// add positive exponent because we have 3 or more zeros in front of the dot
+		i = last + 1
+		exp = dot - last - 1
+	} else if j == dot {
+		// handle 0.1
+		b[j] = '.'
+	}
+
+	// exponent
+	if exp != 0 {
+		if exp == 1 {
+			b[i] = '0'
+			i++
+		} else if exp == 2 {
+			b[i] = '0'
+			b[i+1] = '0'
+			i += 2
+		} else {
+			b[i] = 'e'
+			i++
+			if exp < 0 {
+				b[i] = '-'
+				i++
+				exp = -exp
+			}
+			i += LenInt(int64(exp))
+			j := i
+			for exp > 0 {
+				newExp := exp / 10
+				digit := exp - 10*newExp
+				j--
+				b[j] = '0' + byte(digit)
+				exp = newExp
+			}
+		}
+	}
+	return b[:i], true
+}
diff --git a/vendor/github.com/tdewolff/parse/v2/strconv/int.go b/vendor/github.com/tdewolff/parse/v2/strconv/int.go
new file mode 100644
index 000000000..d8df0fd68
--- /dev/null
+++ b/vendor/github.com/tdewolff/parse/v2/strconv/int.go
@@ -0,0 +1,88 @@
+package strconv
+
+import (
+	"math"
+)
+
+// ParseInt parses a byte-slice and returns the integer it represents.
+// If an invalid character is encountered, it will stop there.
+func ParseInt(b []byte) (int64, int) {
+	i := 0
+	neg := false
+	if len(b) > 0 && (b[0] == '+' || b[0] == '-') {
+		neg = b[0] == '-'
+		i++
+	}
+	start := i
+	n := uint64(0)
+	for i < len(b) {
+		c := b[i]
+		if n > math.MaxUint64/10 {
+			return 0, 0
+		} else if c >= '0' && c <= '9' {
+			n *= 10
+			n += uint64(c - '0')
+		} else {
+			break
+		}
+		i++
+	}
+	if i == start {
+		return 0, 0
+	}
+	if !neg && n > uint64(math.MaxInt64) || n > uint64(math.MaxInt64)+1 {
+		return 0, 0
+	} else if neg {
+		return -int64(n), i
+	}
+	return int64(n), i
+}
+
+// LenInt returns the written length of an integer.
+func LenInt(i int64) int {
+	if i < 0 {
+		if i == -9223372036854775808 {
+			return 19
+		}
+		i = -i
+	}
+	switch {
+	case i < 10:
+		return 1
+	case i < 100:
+		return 2
+	case i < 1000:
+		return 3
+	case i < 10000:
+		return 4
+	case i < 100000:
+		return 5
+	case i < 1000000:
+		return 6
+	case i < 10000000:
+		return 7
+	case i < 100000000:
+		return 8
+	case i < 1000000000:
+		return 9
+	case i < 10000000000:
+		return 10
+	case i < 100000000000:
+		return 11
+	case i < 1000000000000:
+		return 12
+	case i < 10000000000000:
+		return 13
+	case i < 100000000000000:
+		return 14
+	case i < 1000000000000000:
+		return 15
+	case i < 10000000000000000:
+		return 16
+	case i < 100000000000000000:
+		return 17
+	case i < 1000000000000000000:
+		return 18
+	}
+	return 19
+}
diff --git a/vendor/github.com/tdewolff/parse/v2/strconv/price.go b/vendor/github.com/tdewolff/parse/v2/strconv/price.go
new file mode 100644
index 000000000..94b38343e
--- /dev/null
+++ b/vendor/github.com/tdewolff/parse/v2/strconv/price.go
@@ -0,0 +1,83 @@
+package strconv
+
+// AppendPrice will append an int64 formatted as a price, where the int64 is the price in cents.
+// It does not print a sign for negative prices.
+func AppendPrice(b []byte, price int64, dec bool, milSeparator byte, decSeparator byte) []byte {
+	if price < 0 {
+		if price == -9223372036854775808 {
+			x := []byte("92 233 720 368 547 758 08")
+			x[2] = milSeparator
+			x[6] = milSeparator
+			x[10] = milSeparator
+			x[14] = milSeparator
+			x[18] = milSeparator
+			x[22] = decSeparator
+			return append(b, x...)
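+			// the digit string above spells out 9223372036854775808, i.e.
+			// -math.MinInt64, which cannot be negated in int64; the separator
+			// bytes are patched in at fixed offsets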
+		}
+		price = -price
+	}
+
+	// rounding
+	if !dec {
+		firstDec := (price / 10) % 10
+		if firstDec >= 5 {
+			price += 100
+		}
+	}
+
+	// calculate size
+	n := LenInt(price) - 2
+	if n > 0 {
+		n += (n - 1) / 3 // mil separator
+	} else {
+		n = 1
+	}
+	if dec {
+		n += 2 + 1 // decimals + dec separator
+	}
+
+	// resize byte slice
+	i := len(b)
+	if i+n > cap(b) {
+		b = append(b, make([]byte, n)...)
+	} else {
+		b = b[:i+n]
+	}
+
+	// print the fractional part
+	i += n - 1
+	if dec {
+		for j := 0; j < 2; j++ {
+			c := byte(price%10) + '0'
+			price /= 10
+			b[i] = c
+			i--
+		}
+		b[i] = decSeparator
+		i--
+	} else {
+		price /= 100
+	}
+
+	if price == 0 {
+		b[i] = '0'
+		return b
+	}
+
+	// print the integer part
+	j := 0
+	for price > 0 {
+		if j == 3 {
+			b[i] = milSeparator
+			i--
+			j = 0
+		}
+
+		c := byte(price%10) + '0'
+		price /= 10
+		b[i] = c
+		i--
+		j++
+	}
+	return b
+}
diff --git a/vendor/github.com/tdewolff/parse/v2/util.go b/vendor/github.com/tdewolff/parse/v2/util.go
new file mode 100644
index 000000000..07101f467
--- /dev/null
+++ b/vendor/github.com/tdewolff/parse/v2/util.go
@@ -0,0 +1,489 @@
+package parse
+
+import (
+	"bytes"
+	"fmt"
+	"strconv"
+	"unicode"
+)
+
+// Copy returns a copy of the given byte slice.
+func Copy(src []byte) (dst []byte) {
+	dst = make([]byte, len(src))
+	copy(dst, src)
+	return
+}
+
+// ToLower converts all characters in the byte slice from A-Z to a-z.
+func ToLower(src []byte) []byte {
+	for i, c := range src {
+		if c >= 'A' && c <= 'Z' {
+			src[i] = c + ('a' - 'A')
+		}
+	}
+	return src
+}
+
+// EqualFold returns true when s matches targetLower case-insensitively (targetLower must be lowercase).
+func EqualFold(s, targetLower []byte) bool {
+	if len(s) != len(targetLower) {
+		return false
+	}
+	for i, c := range targetLower {
+		d := s[i]
+		if d != c && (d < 'A' || d > 'Z' || d+('a'-'A') != c) {
+			return false
+		}
+	}
+	return true
+}
+
+// Printable returns a printable string for a given rune.
+func Printable(r rune) string {
+	if unicode.IsGraphic(r) {
+		return fmt.Sprintf("%c", r)
+	} else if r < 128 {
+		return fmt.Sprintf("0x%02X", r)
+	}
+	return fmt.Sprintf("%U", r)
+}
+
+var whitespaceTable = [256]bool{
+	// ASCII
+	false, false, false, false, false, false, false, false,
+	false, true, true, false, true, true, false, false, // tab, new line, form feed, carriage return
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	true, false, false, false, false, false, false, false, // space
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	// non-ASCII
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+}
+
+// IsWhitespace returns true for space, \n, \r, \t, \f.
+func IsWhitespace(c byte) bool {
+	return whitespaceTable[c]
+}
+
+var newlineTable = [256]bool{
+	// ASCII
+	false, false, false, false, false, false, false, false,
+	false, false, true, false, false, true, false, false, // new line, carriage return
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	// non-ASCII
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+}
+
+// IsNewline returns true for \n, \r.
+func IsNewline(c byte) bool {
+	return newlineTable[c]
+}
+
+// IsAllWhitespace returns true when the entire byte slice consists of space, \n, \r, \t, \f.
+func IsAllWhitespace(b []byte) bool {
+	for _, c := range b {
+		if !IsWhitespace(c) {
+			return false
+		}
+	}
+	return true
+}
+
+// TrimWhitespace removes any leading and trailing whitespace characters.
+func TrimWhitespace(b []byte) []byte {
+	n := len(b)
+	start := n
+	for i := 0; i < n; i++ {
+		if !IsWhitespace(b[i]) {
+			start = i
+			break
+		}
+	}
+	end := n
+	for i := n - 1; i >= start; i-- {
+		if !IsWhitespace(b[i]) {
+			end = i + 1
+			break
+		}
+	}
+	return b[start:end]
+}
+
+// ReplaceMultipleWhitespace collapses runs of whitespace characters (space, \n, \t, \f, \r) into a single space, or into a single newline when the run contains a \n or \r.
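+//
+// For example (an illustrative sketch of the in-place behaviour):
+//
+//	b := []byte("a  \t b \r\n c")
+//	b = ReplaceMultipleWhitespace(b) // b now reads "a b\nc"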
+func ReplaceMultipleWhitespace(b []byte) []byte {
+	j, k := 0, 0 // j is write position, k is start of next text section
+	for i := 0; i < len(b); i++ {
+		if IsWhitespace(b[i]) {
+			start := i
+			newline := IsNewline(b[i])
+			i++
+			for ; i < len(b) && IsWhitespace(b[i]); i++ {
+				if IsNewline(b[i]) {
+					newline = true
+				}
+			}
+			if newline {
+				b[start] = '\n'
+			} else {
+				b[start] = ' '
+			}
+			if 1 < i-start { // more than one whitespace
+				if j == 0 {
+					j = start + 1
+				} else {
+					j += copy(b[j:], b[k:start+1])
+				}
+				k = i
+			}
+		}
+	}
+	if j == 0 {
+		return b
+	} else if j == 1 { // only if starts with whitespace
+		b[k-1] = b[0]
+		return b[k-1:]
+	} else if k < len(b) {
+		j += copy(b[j:], b[k:])
+	}
+	return b[:j]
+}
+
+// replaceEntities will replace in b at index i, assuming that b[i] == '&' and that i+3<len(b). The returned int will be the index of the last character of the entity, so that the next iteration can safely do i++ to continue and not miss any entities.
+func replaceEntities(b []byte, i int, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) ([]byte, int) {
+	const MaxEntityLength = 31 // longest HTML entity: CounterClockwiseContourIntegral
+	var r []byte
+	j := i + 1
+	if b[j] == '#' {
+		j++
+		if b[j] == 'x' {
+			j++
+			c := 0
+			for ; j < len(b) && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
+				if b[j] <= '9' {
+					c = c<<4 + int(b[j]-'0')
+				} else if b[j] <= 'F' {
+					c = c<<4 + int(b[j]-'A') + 10
+				} else if b[j] <= 'f' {
+					c = c<<4 + int(b[j]-'a') + 10
+				}
+			}
+			if j <= i+3 || 10000 <= c {
+				return b, j - 1
+			}
+			if c < 128 {
+				r = []byte{byte(c)}
+			} else {
+				r = append(r, '&', '#')
+				r = strconv.AppendInt(r, int64(c), 10)
+				r = append(r, ';')
+			}
+		} else {
+			c := 0
+			for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ {
+				c = c*10 + int(b[j]-'0')
+			}
+			if j <= i+2 || 128 <= c {
+				return b, j - 1
+			}
+			r = []byte{byte(c)}
+		}
+	} else {
+		for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ {
+		}
+		if j <= i+1 || len(b) <= j {
+			return b, j - 1
+		}
+
+		var ok bool
+		r, ok = entitiesMap[string(b[i+1:j])]
+		if !ok {
+			return b, j
+		}
+	}
+
+	// j is at semicolon
+	n := j + 1 - i
+	if j < len(b) && b[j] == ';' && 2 < n {
+		if len(r) == 1 {
+			if q, ok := revEntitiesMap[r[0]]; ok {
+				if len(q) == len(b[i:j+1]) && bytes.Equal(q, b[i:j+1]) {
+					return b, j
+				}
+				r = q
+			} else if r[0] == '&' {
+				// check if for example &amp; is followed by something that could potentially be an entity
+				k := j + 1
+				if k < len(b) && b[k] == '#' {
+					k++
+				}
+				for ; k < len(b) && k-j <= MaxEntityLength && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z'); k++ {
+				}
+				if k < len(b) && b[k] == ';' {
+					return b, k
+				}
+			}
+		}
+
+		copy(b[i:], r)
+		copy(b[i+len(r):], b[j+1:])
+		b = b[:len(b)-n+len(r)]
+		return b, i + len(r) - 1
+	}
+	return b, i
+}
+
+// ReplaceEntities replaces all occurrences of entities (such as &quot;) with their respective unencoded bytes.
+func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
+	for i := 0; i < len(b); i++ {
+		if b[i] == '&' && i+3 < len(b) {
+			b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
+		}
+	}
+	return b
+}
+
+// ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially.
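+//
+// For example (an illustrative sketch; assumes an entitiesMap that maps "quot"
+// to a double quote and passes no reverse map):
+//
+//	b := []byte("a  &quot;b&quot;")
+//	b = ReplaceMultipleWhitespaceAndEntities(b, entitiesMap, nil)
+//	// b now reads: a "b"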
+func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
+	j, k := 0, 0 // j is write position, k is start of next text section
+	for i := 0; i < len(b); i++ {
+		if IsWhitespace(b[i]) {
+			start := i
+			newline := IsNewline(b[i])
+			i++
+			for ; i < len(b) && IsWhitespace(b[i]); i++ {
+				if IsNewline(b[i]) {
+					newline = true
+				}
+			}
+			if newline {
+				b[start] = '\n'
+			} else {
+				b[start] = ' '
+			}
+			if 1 < i-start { // more than one whitespace
+				if j == 0 {
+					j = start + 1
+				} else {
+					j += copy(b[j:], b[k:start+1])
+				}
+				k = i
+			}
+		}
+		if i+3 < len(b) && b[i] == '&' {
+			b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
+		}
+	}
+	if j == 0 {
+		return b
+	} else if j == 1 { // only if starts with whitespace
+		b[k-1] = b[0]
+		return b[k-1:]
+	} else if k < len(b) {
+		j += copy(b[j:], b[k:])
+	}
+	return b[:j]
+}
+
+// URLEncodingTable is a charmap indicating which characters need escaping in the URL encoding scheme.
+var URLEncodingTable = [256]bool{
+	// ASCII
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+
+	true, false, true, true, true, true, true, false, // space, ", #, $, %, &
+	false, false, false, true, true, false, false, true, // +, comma, /
+	false, false, false, false, false, false, false, false,
+	false, false, true, true, true, true, true, true, // :, ;, <, =, >, ?
+
+	true, false, false, false, false, false, false, false, // @
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, true, true, true, true, false, // [, \, ], ^
+
+	true, false, false, false, false, false, false, false, // `
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, true, true, true, false, true, // {, |, }, DEL
+
+	// non-ASCII
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+}
+
+// DataURIEncodingTable is a charmap indicating which characters need escaping in the Data URI encoding scheme.
+// Escape only non-printable characters, unicode, and %, #, &. IE11 additionally requires encoding of
+// \, [, ], ", <, >, `, {, }, |, ^, which is not required by Chrome, Firefox, Opera, Edge, Safari, Yandex.
+var DataURIEncodingTable = [256]bool{
+	// ASCII
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+
+	false, false, true, true, false, true, true, false, // ", #, %, &
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, true, false, true, false, // <, >
+
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, true, true, true, true, false, // [, \, ], ^
+
+	true, false, false, false, false, false, false, false, // `
+	false, false, false, false, false, false, false, false,
+	false, false, false, false, false, false, false, false,
+	false, false, false, true, true, true, false, true, // {, |, }, DEL
+
+	// non-ASCII
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+	true, true, true, true, true, true, true, true,
+}
+
+// EncodeURL encodes bytes using the URL encoding scheme.
+func EncodeURL(b []byte, table [256]bool) []byte {
+	for i := 0; i < len(b); i++ {
+		c := b[i]
+		if table[c] {
+			if c == ' ' {
+				b[i] = '+'
+			} else {
+				b = append(b, 0, 0)
+				copy(b[i+3:], b[i+1:])
+				b[i+0] = '%'
+				b[i+1] = "0123456789ABCDEF"[c>>4]
+				b[i+2] = "0123456789ABCDEF"[c&15]
+			}
+		}
+	}
+	return b
+}
+
+// DecodeURL decodes a URL encoded using the URL encoding scheme.
+func DecodeURL(b []byte) []byte {
+	for i := 0; i < len(b); i++ {
+		if b[i] == '%' && i+2 < len(b) {
+			j := i + 1
+			c := 0
+			for ; j < i+3 && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
+				if b[j] <= '9' {
+					c = c<<4 + int(b[j]-'0')
+				} else if b[j] <= 'F' {
+					c = c<<4 + int(b[j]-'A') + 10
+				} else if b[j] <= 'f' {
+					c = c<<4 + int(b[j]-'a') + 10
+				}
+			}
+			if j == i+3 && c < 128 {
+				b[i] = byte(c)
+				b = append(b[:i+1], b[i+3:]...)
+			}
+		} else if b[i] == '+' {
+			b[i] = ' '
+		}
+	}
+	return b
+}
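A minimal usage sketch for the two URL coders above (illustrative only, not part of the vendored files; both functions may modify their input in place, so pass a copy, e.g. via parse.Copy, when the original must be kept):

	package main

	import (
		"fmt"

		"github.com/tdewolff/parse/v2"
	)

	func main() {
		// space becomes '+', '&' becomes a percent escape
		enc := parse.EncodeURL([]byte("a b&c"), parse.URLEncodingTable)
		fmt.Println(string(enc)) // a+b%26c

		// decoding the copy restores the original bytes
		dec := parse.DecodeURL(parse.Copy(enc))
		fmt.Println(string(dec)) // a b&c
	}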