// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package html
import (
	"bytes"
	"errors"
	"io"
	"strconv"
	"strings"
	"golang.org/x/net/html/atom"
)
// A TokenType is the type of a Token.
type TokenType uint32
const (
	// ErrorToken means that an error occurred during tokenization.
	ErrorToken TokenType = iota
	// TextToken means a text node.
	TextToken
	// A StartTagToken looks like .
	StartTagToken
	// An EndTagToken looks like .
	EndTagToken
	// A SelfClosingTagToken tag looks like 
.
	SelfClosingTagToken
	// A CommentToken looks like .
	CommentToken
	// A DoctypeToken looks like 
	DoctypeToken
)
// ErrBufferExceeded means that the buffering limit was exceeded.
var ErrBufferExceeded = errors.New("max buffer exceeded")
// String returns a string representation of the TokenType.
func (t TokenType) String() string {
	switch t {
	case ErrorToken:
		return "Error"
	case TextToken:
		return "Text"
	case StartTagToken:
		return "StartTag"
	case EndTagToken:
		return "EndTag"
	case SelfClosingTagToken:
		return "SelfClosingTag"
	case CommentToken:
		return "Comment"
	case DoctypeToken:
		return "Doctype"
	}
	return "Invalid(" + strconv.Itoa(int(t)) + ")"
}
// An Attribute is an attribute namespace-key-value triple. Namespace is
// non-empty for foreign attributes like xlink, Key is alphabetic (and hence
// does not contain escapable characters like '&', '<' or '>'), and Val is
// unescaped (it looks like "a"
	case EndTagToken:
		return "" + t.tagString() + ">"
	case SelfClosingTagToken:
		return "<" + t.tagString() + "/>"
	case CommentToken:
		return ""
	case DoctypeToken:
		return ""
	}
	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
}
// span is a range of bytes in a Tokenizer's buffer. The start is inclusive,
// the end is exclusive.
type span struct {
	start, end int
}
// A Tokenizer returns a stream of HTML Tokens.
type Tokenizer struct {
	// r is the source of the HTML text.
	r io.Reader
	// tt is the TokenType of the current token.
	tt TokenType
	// err is the first error encountered during tokenization. It is possible
	// for tt != Error && err != nil to hold: this means that Next returned a
	// valid token but the subsequent Next call will return an error token.
	// For example, if the HTML text input was just "plain", then the first
	// Next call would set z.err to io.EOF but return a TextToken, and all
	// subsequent Next calls would return an ErrorToken.
	// err is never reset. Once it becomes non-nil, it stays non-nil.
	err error
	// readErr is the error returned by the io.Reader r. It is separate from
	// err because it is valid for an io.Reader to return (n int, err1 error)
	// such that n > 0 && err1 != nil, and callers should always process the
	// n > 0 bytes before considering the error err1.
	readErr error
	// buf[raw.start:raw.end] holds the raw bytes of the current token.
	// buf[raw.end:] is buffered input that will yield future tokens.
	raw span
	buf []byte
	// maxBuf limits the data buffered in buf. A value of 0 means unlimited.
	maxBuf int
	// buf[data.start:data.end] holds the raw bytes of the current token's data:
	// a text token's text, a tag token's tag name, etc.
	data span
	// pendingAttr is the attribute key and value currently being tokenized.
	// When complete, pendingAttr is pushed onto attr. nAttrReturned is
	// incremented on each call to TagAttr.
	pendingAttr   [2]span
	attr          [][2]span
	nAttrReturned int
	// rawTag is the "script" in "" that closes the next token. If
	// non-empty, the subsequent call to Next will return a raw or RCDATA text
	// token: one that treats "
" as text instead of an element. // rawTag's contents are lower-cased. rawTag string // textIsRaw is whether the current text token's data is not escaped. textIsRaw bool // convertNUL is whether NUL bytes in the current token's data should // be converted into \ufffd replacement characters. convertNUL bool // allowCDATA is whether CDATA sections are allowed in the current context. allowCDATA bool } // AllowCDATA sets whether or not the tokenizer recognizes as // the text "foo". The default value is false, which means to recognize it as // a bogus comment "" instead. // // Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and // only if tokenizing foreign content, such as MathML and SVG. However, // tracking foreign-contentness is difficult to do purely in the tokenizer, // as opposed to the parser, due to HTML integration points: an