diff options
Diffstat (limited to 'vendor/github.com/tdewolff/parse/v2/html')
-rw-r--r-- | vendor/github.com/tdewolff/parse/v2/html/README.md | 98 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/v2/html/hash.go | 81 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/v2/html/lex.go | 493 | ||||
-rw-r--r-- | vendor/github.com/tdewolff/parse/v2/html/util.go | 103 |
4 files changed, 775 insertions, 0 deletions
diff --git a/vendor/github.com/tdewolff/parse/v2/html/README.md b/vendor/github.com/tdewolff/parse/v2/html/README.md new file mode 100644 index 000000000..53145dbd9 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/html/README.md @@ -0,0 +1,98 @@ +# HTML [API reference](https://pkg.go.dev/github.com/tdewolff/parse/v2/html?tab=doc) + +This package is an HTML5 lexer written in [Go][1]. It follows the specification at [The HTML syntax](http://www.w3.org/TR/html5/syntax.html). The lexer takes an io.Reader and converts it into tokens until the EOF. + +## Installation +Run the following command + + go get -u github.com/tdewolff/parse/v2/html + +or add the following import and run project with `go get` + + import "github.com/tdewolff/parse/v2/html" + +## Lexer +### Usage +The following initializes a new Lexer with io.Reader `r`: +``` go +l := html.NewLexer(parse.NewInput(r)) +``` + +To tokenize until EOF or an error, use: +``` go +for { + tt, data := l.Next() + switch tt { + case html.ErrorToken: + // error or EOF set in l.Err() + return + case html.StartTagToken: + // ... + for { + ttAttr, dataAttr := l.Next() + if ttAttr != html.AttributeToken { + break + } + // ... + } + // ... + } +} +``` + +All tokens: +``` go +ErrorToken TokenType = iota // extra token when errors occur +CommentToken +DoctypeToken +StartTagToken +StartTagCloseToken +StartTagVoidToken +EndTagToken +AttributeToken +TextToken +``` + +### Examples +``` go +package main + +import ( + "os" + + "github.com/tdewolff/parse/v2/html" +) + +// Tokenize HTML from stdin. 
+func main() { + l := html.NewLexer(parse.NewInput(os.Stdin)) + for { + tt, data := l.Next() + switch tt { + case html.ErrorToken: + if l.Err() != io.EOF { + fmt.Println("Error on line", l.Line(), ":", l.Err()) + } + return + case html.StartTagToken: + fmt.Println("Tag", string(data)) + for { + ttAttr, dataAttr := l.Next() + if ttAttr != html.AttributeToken { + break + } + + key := dataAttr + val := l.AttrVal() + fmt.Println("Attribute", string(key), "=", string(val)) + } + // ... + } + } +} +``` + +## License +Released under the [MIT license](https://github.com/tdewolff/parse/blob/master/LICENSE.md). + +[1]: http://golang.org/ "Go Language" diff --git a/vendor/github.com/tdewolff/parse/v2/html/hash.go b/vendor/github.com/tdewolff/parse/v2/html/hash.go new file mode 100644 index 000000000..16432ade1 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/html/hash.go @@ -0,0 +1,81 @@ +package html + +// generated by hasher -type=Hash -file=hash.go; DO NOT EDIT, except for adding more constants to the list and rerun go generate + +// uses github.com/tdewolff/hasher +//go:generate hasher -type=Hash -file=hash.go + +// Hash defines perfect hashes for a predefined list of strings +type Hash uint32 + +// Unique hash definitions to be used instead of strings +const ( + Iframe Hash = 0x6 // iframe + Math Hash = 0x604 // math + Plaintext Hash = 0x1e09 // plaintext + Script Hash = 0xa06 // script + Style Hash = 0x1405 // style + Svg Hash = 0x1903 // svg + Textarea Hash = 0x2308 // textarea + Title Hash = 0xf05 // title + Xmp Hash = 0x1c03 // xmp +) + +// String returns the hash' name. +func (i Hash) String() string { + start := uint32(i >> 8) + n := uint32(i & 0xff) + if start+n > uint32(len(_Hash_text)) { + return "" + } + return _Hash_text[start : start+n] +} + +// ToHash returns the hash whose name is s. It returns zero if there is no +// such hash. It is case sensitive. 
+func ToHash(s []byte) Hash { + if len(s) == 0 || len(s) > _Hash_maxLen { + return 0 + } + h := uint32(_Hash_hash0) + for i := 0; i < len(s); i++ { + h ^= uint32(s[i]) + h *= 16777619 + } + if i := _Hash_table[h&uint32(len(_Hash_table)-1)]; int(i&0xff) == len(s) { + t := _Hash_text[i>>8 : i>>8+i&0xff] + for i := 0; i < len(s); i++ { + if t[i] != s[i] { + goto NEXT + } + } + return i + } +NEXT: + if i := _Hash_table[(h>>16)&uint32(len(_Hash_table)-1)]; int(i&0xff) == len(s) { + t := _Hash_text[i>>8 : i>>8+i&0xff] + for i := 0; i < len(s); i++ { + if t[i] != s[i] { + return 0 + } + } + return i + } + return 0 +} + +const _Hash_hash0 = 0x9acb0442 +const _Hash_maxLen = 9 +const _Hash_text = "iframemathscriptitlestylesvgxmplaintextarea" + +var _Hash_table = [1 << 4]Hash{ + 0x0: 0x2308, // textarea + 0x2: 0x6, // iframe + 0x4: 0xf05, // title + 0x5: 0x1e09, // plaintext + 0x7: 0x1405, // style + 0x8: 0x604, // math + 0x9: 0xa06, // script + 0xa: 0x1903, // svg + 0xb: 0x1c03, // xmp +} diff --git a/vendor/github.com/tdewolff/parse/v2/html/lex.go b/vendor/github.com/tdewolff/parse/v2/html/lex.go new file mode 100644 index 000000000..2c47d2527 --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/html/lex.go @@ -0,0 +1,493 @@ +// Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html. +package html + +import ( + "strconv" + + "github.com/tdewolff/parse/v2" +) + +// TokenType determines the type of token, eg. a number or a semicolon. +type TokenType uint32 + +// TokenType values. +const ( + ErrorToken TokenType = iota // extra token when errors occur + CommentToken + DoctypeToken + StartTagToken + StartTagCloseToken + StartTagVoidToken + EndTagToken + AttributeToken + TextToken + SvgToken + MathToken +) + +// String returns the string representation of a TokenType. 
+func (tt TokenType) String() string { + switch tt { + case ErrorToken: + return "Error" + case CommentToken: + return "Comment" + case DoctypeToken: + return "Doctype" + case StartTagToken: + return "StartTag" + case StartTagCloseToken: + return "StartTagClose" + case StartTagVoidToken: + return "StartTagVoid" + case EndTagToken: + return "EndTag" + case AttributeToken: + return "Attribute" + case TextToken: + return "Text" + case SvgToken: + return "Svg" + case MathToken: + return "Math" + } + return "Invalid(" + strconv.Itoa(int(tt)) + ")" +} + +//////////////////////////////////////////////////////////////// + +// Lexer is the state for the lexer. +type Lexer struct { + r *parse.Input + err error + + rawTag Hash + inTag bool + + text []byte + attrVal []byte +} + +// NewLexer returns a new Lexer for a given parse.Input. +func NewLexer(r *parse.Input) *Lexer { + return &Lexer{ + r: r, + } +} + +// Err returns the error encountered during lexing; this is often io.EOF, but other errors can also be returned. +func (l *Lexer) Err() error { + if l.err != nil { + return l.err + } + return l.r.Err() +} + +// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters. +func (l *Lexer) Text() []byte { + return l.text +} + +// AttrVal returns the attribute value when an AttributeToken was returned from Next. +func (l *Lexer) AttrVal() []byte { + return l.attrVal +} + +// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message. 
+func (l *Lexer) Next() (TokenType, []byte) { + l.text = nil + var c byte + if l.inTag { + l.attrVal = nil + for { // before attribute name state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + continue + } + break + } + if c == 0 && l.r.Err() != nil { + return ErrorToken, nil + } else if c != '>' && (c != '/' || l.r.Peek(1) != '>') { + return AttributeToken, l.shiftAttribute() + } + l.r.Skip() + l.inTag = false + if c == '/' { + l.r.Move(2) + return StartTagVoidToken, l.r.Shift() + } + l.r.Move(1) + return StartTagCloseToken, l.r.Shift() + } + + if l.rawTag != 0 { + if rawText := l.shiftRawText(); len(rawText) > 0 { + l.rawTag = 0 + return TextToken, rawText + } + l.rawTag = 0 + } + + for { + c = l.r.Peek(0) + if c == '<' { + c = l.r.Peek(1) + isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil) + if l.r.Pos() > 0 { + if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' { + // return currently buffered texttoken so that we can return tag next iteration + l.text = l.r.Shift() + return TextToken, l.text + } + } else if isEndTag { + l.r.Move(2) + // only endtags that are not followed by > or EOF arrive here + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + return CommentToken, l.shiftBogusComment() + } + return EndTagToken, l.shiftEndTag() + } else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { + l.r.Move(1) + l.inTag = true + return l.shiftStartTag() + } else if c == '!' { + l.r.Move(2) + return l.readMarkup() + } else if c == '?' 
{ + l.r.Move(1) + return CommentToken, l.shiftBogusComment() + } + } else if c == 0 && l.r.Err() != nil { + if l.r.Pos() > 0 { + l.text = l.r.Shift() + return TextToken, l.text + } + return ErrorToken, nil + } + l.r.Move(1) + } +} + +//////////////////////////////////////////////////////////////// + +// The following functions follow the specifications at https://html.spec.whatwg.org/multipage/parsing.html + +func (l *Lexer) shiftRawText() []byte { + if l.rawTag == Plaintext { + for { + if l.r.Peek(0) == 0 && l.r.Err() != nil { + return l.r.Shift() + } + l.r.Move(1) + } + } else { // RCDATA, RAWTEXT and SCRIPT + for { + c := l.r.Peek(0) + if c == '<' { + if l.r.Peek(1) == '/' { + mark := l.r.Pos() + l.r.Move(2) + for { + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + break + } + l.r.Move(1) + } + if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice + l.r.Rewind(mark) + return l.r.Shift() + } + } else if l.rawTag == Script && l.r.Peek(1) == '!' 
&& l.r.Peek(2) == '-' && l.r.Peek(3) == '-' { + l.r.Move(4) + inScript := false + for { + c := l.r.Peek(0) + if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' { + l.r.Move(3) + break + } else if c == '<' { + isEnd := l.r.Peek(1) == '/' + if isEnd { + l.r.Move(2) + } else { + l.r.Move(1) + } + mark := l.r.Pos() + for { + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + break + } + l.r.Move(1) + } + if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice + if !isEnd { + inScript = true + } else { + if !inScript { + l.r.Rewind(mark - 2) + return l.r.Shift() + } + inScript = false + } + } + } else if c == 0 && l.r.Err() != nil { + return l.r.Shift() + } else { + l.r.Move(1) + } + } + } else { + l.r.Move(1) + } + } else if c == 0 && l.r.Err() != nil { + return l.r.Shift() + } else { + l.r.Move(1) + } + } + } +} + +func (l *Lexer) readMarkup() (TokenType, []byte) { + if l.at('-', '-') { + l.r.Move(2) + for { + if l.r.Peek(0) == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[4:] + return CommentToken, l.r.Shift() + } else if l.at('-', '-', '>') { + l.text = l.r.Lexeme()[4:] + l.r.Move(3) + return CommentToken, l.r.Shift() + } else if l.at('-', '-', '!', '>') { + l.text = l.r.Lexeme()[4:] + l.r.Move(4) + return CommentToken, l.r.Shift() + } + l.r.Move(1) + } + } else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') { + l.r.Move(7) + for { + if l.r.Peek(0) == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[9:] + return TextToken, l.r.Shift() + } else if l.at(']', ']', '>') { + l.text = l.r.Lexeme()[9:] + l.r.Move(3) + return TextToken, l.r.Shift() + } + l.r.Move(1) + } + } else { + if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') { + l.r.Move(7) + if l.r.Peek(0) == ' ' { + l.r.Move(1) + } + for { + if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[9:] + if c == '>' { + l.r.Move(1) + } + return DoctypeToken, 
l.r.Shift() + } + l.r.Move(1) + } + } + } + return CommentToken, l.shiftBogusComment() +} + +func (l *Lexer) shiftBogusComment() []byte { + for { + c := l.r.Peek(0) + if c == '>' { + l.text = l.r.Lexeme()[2:] + l.r.Move(1) + return l.r.Shift() + } else if c == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[2:] + return l.r.Shift() + } + l.r.Move(1) + } +} + +func (l *Lexer) shiftStartTag() (TokenType, []byte) { + for { + if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { + break + } + l.r.Move(1) + } + l.text = parse.ToLower(l.r.Lexeme()[1:]) + if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math { + if h == Svg || h == Math { + data := l.shiftXML(h) + if l.err != nil { + return ErrorToken, nil + } + + l.inTag = false + if h == Svg { + return SvgToken, data + } + return MathToken, data + } + l.rawTag = h + } + return StartTagToken, l.r.Shift() +} + +func (l *Lexer) shiftAttribute() []byte { + nameStart := l.r.Pos() + var c byte + for { // attribute name state + if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { + break + } + l.r.Move(1) + } + nameEnd := l.r.Pos() + for { // after attribute name state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + continue + } + break + } + if c == '=' { + l.r.Move(1) + for { // before attribute value state + if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' { + l.r.Move(1) + continue + } + break + } + attrPos := l.r.Pos() + delim := c + if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state + l.r.Move(1) + for { + c := l.r.Peek(0) + if c == delim { + l.r.Move(1) + break + } else if c == 0 && 
l.r.Err() != nil { + break + } + l.r.Move(1) + } + } else { // attribute value unquoted state + for { + if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil { + break + } + l.r.Move(1) + } + } + l.attrVal = l.r.Lexeme()[attrPos:] + } else { + l.r.Rewind(nameEnd) + l.attrVal = nil + } + l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd]) + return l.r.Shift() +} + +func (l *Lexer) shiftEndTag() []byte { + for { + c := l.r.Peek(0) + if c == '>' { + l.text = l.r.Lexeme()[2:] + l.r.Move(1) + break + } else if c == 0 && l.r.Err() != nil { + l.text = l.r.Lexeme()[2:] + break + } + l.r.Move(1) + } + + end := len(l.text) + for end > 0 { + if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' { + end-- + continue + } + break + } + l.text = l.text[:end] + return parse.ToLower(l.r.Shift()) +} + +// shiftXML parses the content of a svg or math tag according to the XML 1.1 specifications, including the tag itself. +// So far we have already parsed `<svg` or `<math`. 
+func (l *Lexer) shiftXML(rawTag Hash) []byte { + inQuote := false + for { + c := l.r.Peek(0) + if c == '"' { + inQuote = !inQuote + l.r.Move(1) + } else if c == '<' && !inQuote && l.r.Peek(1) == '/' { + mark := l.r.Pos() + l.r.Move(2) + for { + if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { + break + } + l.r.Move(1) + } + if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice + break + } + } else if c == 0 { + if l.r.Err() == nil { + l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character") + } + return l.r.Shift() + } else { + l.r.Move(1) + } + } + + for { + c := l.r.Peek(0) + if c == '>' { + l.r.Move(1) + break + } else if c == 0 { + if l.r.Err() == nil { + l.err = parse.NewErrorLexer(l.r, "HTML parse error: unexpected NULL character") + } + return l.r.Shift() + } + l.r.Move(1) + } + return l.r.Shift() +} + +//////////////////////////////////////////////////////////////// + +func (l *Lexer) at(b ...byte) bool { + for i, c := range b { + if l.r.Peek(i) != c { + return false + } + } + return true +} + +func (l *Lexer) atCaseInsensitive(b ...byte) bool { + for i, c := range b { + if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c { + return false + } + } + return true +} diff --git a/vendor/github.com/tdewolff/parse/v2/html/util.go b/vendor/github.com/tdewolff/parse/v2/html/util.go new file mode 100644 index 000000000..fe12f17ff --- /dev/null +++ b/vendor/github.com/tdewolff/parse/v2/html/util.go @@ -0,0 +1,103 @@ +package html + +var ( + singleQuoteEntityBytes = []byte("&#39;") + doubleQuoteEntityBytes = []byte("&#34;") +) + +// EscapeAttrVal returns the escaped attribute value bytes, wrapped in quotes unless the value can be left unquoted. 
+func EscapeAttrVal(buf *[]byte, orig, b []byte, isXML bool) []byte { + singles := 0 + doubles := 0 + unquoted := true + entities := false + for _, c := range b { + if charTable[c] { + unquoted = false + if c == '"' { + doubles++ + } else if c == '\'' { + singles++ + } + } + } + if unquoted && !isXML { + return b + } else if !entities && len(orig) == len(b)+2 && (singles == 0 && orig[0] == '\'' || doubles == 0 && orig[0] == '"') { + return orig + } + + n := len(b) + 2 + var quote byte + var escapedQuote []byte + if singles >= doubles || isXML { + n += doubles * 4 + quote = '"' + escapedQuote = doubleQuoteEntityBytes + } else { + n += singles * 4 + quote = '\'' + escapedQuote = singleQuoteEntityBytes + } + if n > cap(*buf) { + *buf = make([]byte, 0, n) // maximum size, not actual size + } + t := (*buf)[:n] // maximum size, not actual size + t[0] = quote + j := 1 + start := 0 + for i, c := range b { + if c == quote { + j += copy(t[j:], b[start:i]) + j += copy(t[j:], escapedQuote) + start = i + 1 + } + } + j += copy(t[j:], b[start:]) + t[j] = quote + return t[:j+1] +} + +var charTable = [256]bool{ + // ASCII + false, false, false, false, false, false, false, false, + false, true, true, false, true, true, false, false, // tab, line feed, form feed, carriage return + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + true, false, true, false, false, false, false, true, // space, "), ' + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, true, true, true, false, // <, =, > + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + true, false, false, false, false, false, false, false, // ` + false, false, false, false, false, false, 
false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + // non-ASCII + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, + false, false, false, false, false, false, false, false, +} |