diff options
Diffstat (limited to 'vendor/github.com/pelletier/go-toml/lexer.go')
-rw-r--r-- | vendor/github.com/pelletier/go-toml/lexer.go | 1031 |
1 files changed, 1031 insertions, 0 deletions
diff --git a/vendor/github.com/pelletier/go-toml/lexer.go b/vendor/github.com/pelletier/go-toml/lexer.go new file mode 100644 index 000000000..313908e3e --- /dev/null +++ b/vendor/github.com/pelletier/go-toml/lexer.go @@ -0,0 +1,1031 @@ +// TOML lexer. +// +// Written using the principles developed by Rob Pike in +// http://www.youtube.com/watch?v=HxaD_trXwRE + +package toml + +import ( + "bytes" + "errors" + "fmt" + "strconv" + "strings" +) + +// Define state functions +type tomlLexStateFn func() tomlLexStateFn + +// Define lexer +type tomlLexer struct { + inputIdx int + input []rune // Textual source + currentTokenStart int + currentTokenStop int + tokens []token + brackets []rune + line int + col int + endbufferLine int + endbufferCol int +} + +// Basic read operations on input + +func (l *tomlLexer) read() rune { + r := l.peek() + if r == '\n' { + l.endbufferLine++ + l.endbufferCol = 1 + } else { + l.endbufferCol++ + } + l.inputIdx++ + return r +} + +func (l *tomlLexer) next() rune { + r := l.read() + + if r != eof { + l.currentTokenStop++ + } + return r +} + +func (l *tomlLexer) ignore() { + l.currentTokenStart = l.currentTokenStop + l.line = l.endbufferLine + l.col = l.endbufferCol +} + +func (l *tomlLexer) skip() { + l.next() + l.ignore() +} + +func (l *tomlLexer) fastForward(n int) { + for i := 0; i < n; i++ { + l.next() + } +} + +func (l *tomlLexer) emitWithValue(t tokenType, value string) { + l.tokens = append(l.tokens, token{ + Position: Position{l.line, l.col}, + typ: t, + val: value, + }) + l.ignore() +} + +func (l *tomlLexer) emit(t tokenType) { + l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop])) +} + +func (l *tomlLexer) peek() rune { + if l.inputIdx >= len(l.input) { + return eof + } + return l.input[l.inputIdx] +} + +func (l *tomlLexer) peekString(size int) string { + maxIdx := len(l.input) + upperIdx := l.inputIdx + size // FIXME: potential overflow + if upperIdx > maxIdx { + upperIdx = maxIdx + } + return string(l.input[l.inputIdx:upperIdx]) +} + +func (l *tomlLexer) follow(next string) bool { + return next == l.peekString(len(next)) +} + +// Error management + +func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn { + l.tokens = append(l.tokens, token{ + Position: Position{l.line, l.col}, + typ: tokenError, + val: fmt.Sprintf(format, args...), + }) + return nil +} + +// State functions + +func (l *tomlLexer) lexVoid() tomlLexStateFn { + for { + next := l.peek() + switch next { + case '}': // after '{' + return l.lexRightCurlyBrace + case '[': + return l.lexTableKey + case '#': + return l.lexComment(l.lexVoid) + case '=': + return l.lexEqual + case '\r': + fallthrough + case '\n': + l.skip() + continue + } + + if isSpace(next) { + l.skip() + } + + if isKeyStartChar(next) { + return l.lexKey + } + + if next == eof { + l.next() + break + } + } + + l.emit(tokenEOF) + return nil +} + +func (l *tomlLexer) lexRvalue() tomlLexStateFn { + for { + next := l.peek() + switch next { + case '.': + return l.errorf("cannot start float with a dot") + case '=': + return l.lexEqual + case '[': + return l.lexLeftBracket + case ']': + return l.lexRightBracket + case '{': + return l.lexLeftCurlyBrace + case '}': + return l.lexRightCurlyBrace + case '#': + return l.lexComment(l.lexRvalue) + case '"': + return l.lexString + case '\'': + return l.lexLiteralString + case ',': + return l.lexComma + case '\r': + fallthrough + case '\n': + l.skip() + if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '[' { + return l.lexRvalue + } + return l.lexVoid + } + + if l.follow("true") { + return l.lexTrue + } + + if l.follow("false") { + return l.lexFalse + } + + if l.follow("inf") { + return l.lexInf + } + + if l.follow("nan") { + return l.lexNan + } + + if isSpace(next) { + l.skip() + continue + } + + if next == eof { + l.next() + break + } + + if next == '+' || next == '-' { + return l.lexNumber + } + + if isDigit(next) { + return l.lexDateTimeOrNumber + } + + return l.errorf("no value can start with %c", next) + } + + l.emit(tokenEOF) + return nil +} + +func (l *tomlLexer) lexDateTimeOrNumber() tomlLexStateFn { + // Could be either a date/time, or a digit. + // The options for date/times are: + // YYYY-... => date or date-time + // HH:... => time + // Anything else should be a number. + + lookAhead := l.peekString(5) + if len(lookAhead) < 3 { + return l.lexNumber() + } + + for idx, r := range lookAhead { + if !isDigit(r) { + if idx == 2 && r == ':' { + return l.lexDateTimeOrTime() + } + if idx == 4 && r == '-' { + return l.lexDateTimeOrTime() + } + return l.lexNumber() + } + } + return l.lexNumber() +} + +func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn { + l.next() + l.emit(tokenLeftCurlyBrace) + l.brackets = append(l.brackets, '{') + return l.lexVoid +} + +func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn { + l.next() + l.emit(tokenRightCurlyBrace) + if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '{' { + return l.errorf("cannot have '}' here") + } + l.brackets = l.brackets[:len(l.brackets)-1] + return l.lexRvalue +} + +func (l *tomlLexer) lexDateTimeOrTime() tomlLexStateFn { + // Example matches: + // 1979-05-27T07:32:00Z + // 1979-05-27T00:32:00-07:00 + // 1979-05-27T00:32:00.999999-07:00 + // 1979-05-27 07:32:00Z + // 1979-05-27 00:32:00-07:00 + // 1979-05-27 00:32:00.999999-07:00 + // 1979-05-27T07:32:00 + // 1979-05-27T00:32:00.999999 + // 1979-05-27 07:32:00 + // 1979-05-27 00:32:00.999999 + // 1979-05-27 + // 07:32:00 + // 00:32:00.999999 + + // we already know those two are digits + l.next() + l.next() + + // Got 2 digits. At that point it could be either a time or a date(-time). + + r := l.next() + if r == ':' { + return l.lexTime() + } + + return l.lexDateTime() +} + +func (l *tomlLexer) lexDateTime() tomlLexStateFn { + // This state accepts an offset date-time, a local date-time, or a local date. + // + // v--- cursor + // 1979-05-27T07:32:00Z + // 1979-05-27T00:32:00-07:00 + // 1979-05-27T00:32:00.999999-07:00 + // 1979-05-27 07:32:00Z + // 1979-05-27 00:32:00-07:00 + // 1979-05-27 00:32:00.999999-07:00 + // 1979-05-27T07:32:00 + // 1979-05-27T00:32:00.999999 + // 1979-05-27 07:32:00 + // 1979-05-27 00:32:00.999999 + // 1979-05-27 + + // date + + // already checked by lexRvalue + l.next() // digit + l.next() // - + + for i := 0; i < 2; i++ { + r := l.next() + if !isDigit(r) { + return l.errorf("invalid month digit in date: %c", r) + } + } + + r := l.next() + if r != '-' { + return l.errorf("expected - to separate month of a date, not %c", r) + } + + for i := 0; i < 2; i++ { + r := l.next() + if !isDigit(r) { + return l.errorf("invalid day digit in date: %c", r) + } + } + + l.emit(tokenLocalDate) + + r = l.peek() + + if r == eof { + + return l.lexRvalue + } + + if r != ' ' && r != 'T' { + return l.errorf("incorrect date/time separation character: %c", r) + } + + if r == ' ' { + lookAhead := l.peekString(3)[1:] + if len(lookAhead) < 2 { + return l.lexRvalue + } + for _, r := range lookAhead { + if !isDigit(r) { + return l.lexRvalue + } + } + } + + l.skip() // skip the T or ' ' + + // time + + for i := 0; i < 2; i++ { + r := l.next() + if !isDigit(r) { + return l.errorf("invalid hour digit in time: %c", r) + } + } + + r = l.next() + if r != ':' { + return l.errorf("time hour/minute separator should be :, not %c", r) + } + + for i := 0; i < 2; i++ { + r := l.next() + if !isDigit(r) { + return l.errorf("invalid minute digit in time: %c", r) + } + } + + r = l.next() + if r != ':' { + return l.errorf("time minute/second separator should be :, not %c", r) + } + + for i := 0; i < 2; i++ { + r := l.next() + if !isDigit(r) { + return l.errorf("invalid second digit in time: %c", r) + } + } + + r = l.peek() + if r == '.' { + l.next() + r := l.next() + if !isDigit(r) { + return l.errorf("expected at least one digit in time's fraction, not %c", r) + } + + for { + r := l.peek() + if !isDigit(r) { + break + } + l.next() + } + } + + l.emit(tokenLocalTime) + + return l.lexTimeOffset + +} + +func (l *tomlLexer) lexTimeOffset() tomlLexStateFn { + // potential offset + + // Z + // -07:00 + // +07:00 + // nothing + + r := l.peek() + + if r == 'Z' { + l.next() + l.emit(tokenTimeOffset) + } else if r == '+' || r == '-' { + l.next() + + for i := 0; i < 2; i++ { + r := l.next() + if !isDigit(r) { + return l.errorf("invalid hour digit in time offset: %c", r) + } + } + + r = l.next() + if r != ':' { + return l.errorf("time offset hour/minute separator should be :, not %c", r) + } + + for i := 0; i < 2; i++ { + r := l.next() + if !isDigit(r) { + return l.errorf("invalid minute digit in time offset: %c", r) + } + } + + l.emit(tokenTimeOffset) + } + + return l.lexRvalue +} + +func (l *tomlLexer) lexTime() tomlLexStateFn { + // v--- cursor + // 07:32:00 + // 00:32:00.999999 + + for i := 0; i < 2; i++ { + r := l.next() + if !isDigit(r) { + return l.errorf("invalid minute digit in time: %c", r) + } + } + + r := l.next() + if r != ':' { + return l.errorf("time minute/second separator should be :, not %c", r) + } + + for i := 0; i < 2; i++ { + r := l.next() + if !isDigit(r) { + return l.errorf("invalid second digit in time: %c", r) + } + } + + r = l.peek() + if r == '.' { + l.next() + r := l.next() + if !isDigit(r) { + return l.errorf("expected at least one digit in time's fraction, not %c", r) + } + + for { + r := l.peek() + if !isDigit(r) { + break + } + l.next() + } + } + + l.emit(tokenLocalTime) + return l.lexRvalue + +} + +func (l *tomlLexer) lexTrue() tomlLexStateFn { + l.fastForward(4) + l.emit(tokenTrue) + return l.lexRvalue +} + +func (l *tomlLexer) lexFalse() tomlLexStateFn { + l.fastForward(5) + l.emit(tokenFalse) + return l.lexRvalue +} + +func (l *tomlLexer) lexInf() tomlLexStateFn { + l.fastForward(3) + l.emit(tokenInf) + return l.lexRvalue +} + +func (l *tomlLexer) lexNan() tomlLexStateFn { + l.fastForward(3) + l.emit(tokenNan) + return l.lexRvalue +} + +func (l *tomlLexer) lexEqual() tomlLexStateFn { + l.next() + l.emit(tokenEqual) + return l.lexRvalue +} + +func (l *tomlLexer) lexComma() tomlLexStateFn { + l.next() + l.emit(tokenComma) + if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '{' { + return l.lexVoid + } + return l.lexRvalue +} + +// Parse the key and emits its value without escape sequences. +// bare keys, basic string keys and literal string keys are supported. +func (l *tomlLexer) lexKey() tomlLexStateFn { + var sb strings.Builder + + for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() { + if r == '"' { + l.next() + str, err := l.lexStringAsString(`"`, false, true) + if err != nil { + return l.errorf(err.Error()) + } + sb.WriteString("\"") + sb.WriteString(str) + sb.WriteString("\"") + l.next() + continue + } else if r == '\'' { + l.next() + str, err := l.lexLiteralStringAsString(`'`, false) + if err != nil { + return l.errorf(err.Error()) + } + sb.WriteString("'") + sb.WriteString(str) + sb.WriteString("'") + l.next() + continue + } else if r == '\n' { + return l.errorf("keys cannot contain new lines") + } else if isSpace(r) { + var str strings.Builder + str.WriteString(" ") + + // skip trailing whitespace + l.next() + for r = l.peek(); isSpace(r); r = l.peek() { + str.WriteRune(r) + l.next() + } + // break loop if not a dot + if r != '.' { + break + } + str.WriteString(".") + // skip trailing whitespace after dot + l.next() + for r = l.peek(); isSpace(r); r = l.peek() { + str.WriteRune(r) + l.next() + } + sb.WriteString(str.String()) + continue + } else if r == '.' { + // skip + } else if !isValidBareChar(r) { + return l.errorf("keys cannot contain %c character", r) + } + sb.WriteRune(r) + l.next() + } + l.emitWithValue(tokenKey, sb.String()) + return l.lexVoid +} + +func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn { + return func() tomlLexStateFn { + for next := l.peek(); next != '\n' && next != eof; next = l.peek() { + if next == '\r' && l.follow("\r\n") { + break + } + l.next() + } + l.ignore() + return previousState + } +} + +func (l *tomlLexer) lexLeftBracket() tomlLexStateFn { + l.next() + l.emit(tokenLeftBracket) + l.brackets = append(l.brackets, '[') + return l.lexRvalue +} + +func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) { + var sb strings.Builder + + if discardLeadingNewLine { + if l.follow("\r\n") { + l.skip() + l.skip() + } else if l.peek() == '\n' { + l.skip() + } + } + + // find end of string + for { + if l.follow(terminator) { + return sb.String(), nil + } + + next := l.peek() + if next == eof { + break + } + sb.WriteRune(l.next()) + } + + return "", errors.New("unclosed string") +} + +func (l *tomlLexer) lexLiteralString() tomlLexStateFn { + l.skip() + + // handle special case for triple-quote + terminator := "'" + discardLeadingNewLine := false + if l.follow("''") { + l.skip() + l.skip() + terminator = "'''" + discardLeadingNewLine = true + } + + str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine) + if err != nil { + return l.errorf(err.Error()) + } + + l.emitWithValue(tokenString, str) + l.fastForward(len(terminator)) + l.ignore() + return l.lexRvalue +} + +// Lex a string and return the results as a string. +// Terminator is the substring indicating the end of the token. +// The resulting string does not include the terminator. +func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) { + var sb strings.Builder + + if discardLeadingNewLine { + if l.follow("\r\n") { + l.skip() + l.skip() + } else if l.peek() == '\n' { + l.skip() + } + } + + for { + if l.follow(terminator) { + return sb.String(), nil + } + + if l.follow("\\") { + l.next() + switch l.peek() { + case '\r': + fallthrough + case '\n': + fallthrough + case '\t': + fallthrough + case ' ': + // skip all whitespace chars following backslash + for strings.ContainsRune("\r\n\t ", l.peek()) { + l.next() + } + case '"': + sb.WriteString("\"") + l.next() + case 'n': + sb.WriteString("\n") + l.next() + case 'b': + sb.WriteString("\b") + l.next() + case 'f': + sb.WriteString("\f") + l.next() + case '/': + sb.WriteString("/") + l.next() + case 't': + sb.WriteString("\t") + l.next() + case 'r': + sb.WriteString("\r") + l.next() + case '\\': + sb.WriteString("\\") + l.next() + case 'u': + l.next() + var code strings.Builder + for i := 0; i < 4; i++ { + c := l.peek() + if !isHexDigit(c) { + return "", errors.New("unfinished unicode escape") + } + l.next() + code.WriteRune(c) + } + intcode, err := strconv.ParseInt(code.String(), 16, 32) + if err != nil { + return "", errors.New("invalid unicode escape: \\u" + code.String()) + } + sb.WriteRune(rune(intcode)) + case 'U': + l.next() + var code strings.Builder + for i := 0; i < 8; i++ { + c := l.peek() + if !isHexDigit(c) { + return "", errors.New("unfinished unicode escape") + } + l.next() + code.WriteRune(c) + } + intcode, err := strconv.ParseInt(code.String(), 16, 64) + if err != nil { + return "", errors.New("invalid unicode escape: \\U" + code.String()) + } + sb.WriteRune(rune(intcode)) + default: + return "", errors.New("invalid escape sequence: \\" + string(l.peek())) + } + } else { + r := l.peek() + + if 0x00 <= r && r <= 0x1F && r != '\t' && !(acceptNewLines && (r == '\n' || r == '\r')) { + return "", fmt.Errorf("unescaped control character %U", r) + } + l.next() + sb.WriteRune(r) + } + + if l.peek() == eof { + break + } + } + + return "", errors.New("unclosed string") +} + +func (l *tomlLexer) lexString() tomlLexStateFn { + l.skip() + + // handle special case for triple-quote + terminator := `"` + discardLeadingNewLine := false + acceptNewLines := false + if l.follow(`""`) { + l.skip() + l.skip() + terminator = `"""` + discardLeadingNewLine = true + acceptNewLines = true + } + + str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines) + if err != nil { + return l.errorf(err.Error()) + } + + l.emitWithValue(tokenString, str) + l.fastForward(len(terminator)) + l.ignore() + return l.lexRvalue +} + +func (l *tomlLexer) lexTableKey() tomlLexStateFn { + l.next() + + if l.peek() == '[' { + // token '[[' signifies an array of tables + l.next() + l.emit(tokenDoubleLeftBracket) + return l.lexInsideTableArrayKey + } + // vanilla table key + l.emit(tokenLeftBracket) + return l.lexInsideTableKey +} + +// Parse the key till "]]", but only bare keys are supported +func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn { + for r := l.peek(); r != eof; r = l.peek() { + switch r { + case ']': + if l.currentTokenStop > l.currentTokenStart { + l.emit(tokenKeyGroupArray) + } + l.next() + if l.peek() != ']' { + break + } + l.next() + l.emit(tokenDoubleRightBracket) + return l.lexVoid + case '[': + return l.errorf("table array key cannot contain ']'") + default: + l.next() + } + } + return l.errorf("unclosed table array key") +} + +// Parse the key till "]" but only bare keys are supported +func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn { + for r := l.peek(); r != eof; r = l.peek() { + switch r { + case ']': + if l.currentTokenStop > l.currentTokenStart { + l.emit(tokenKeyGroup) + } + l.next() + l.emit(tokenRightBracket) + return l.lexVoid + case '[': + return l.errorf("table key cannot contain ']'") + default: + l.next() + } + } + return l.errorf("unclosed table key") +} + +func (l *tomlLexer) lexRightBracket() tomlLexStateFn { + l.next() + l.emit(tokenRightBracket) + if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '[' { + return l.errorf("cannot have ']' here") + } + l.brackets = l.brackets[:len(l.brackets)-1] + return l.lexRvalue +} + +type validRuneFn func(r rune) bool + +func isValidHexRune(r rune) bool { + return r >= 'a' && r <= 'f' || + r >= 'A' && r <= 'F' || + r >= '0' && r <= '9' || + r == '_' +} + +func isValidOctalRune(r rune) bool { + return r >= '0' && r <= '7' || r == '_' +} + +func isValidBinaryRune(r rune) bool { + return r == '0' || r == '1' || r == '_' +} + +func (l *tomlLexer) lexNumber() tomlLexStateFn { + r := l.peek() + + if r == '0' { + follow := l.peekString(2) + if len(follow) == 2 { + var isValidRune validRuneFn + switch follow[1] { + case 'x': + isValidRune = isValidHexRune + case 'o': + isValidRune = isValidOctalRune + case 'b': + isValidRune = isValidBinaryRune + default: + if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' { + return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1])) + } + } + + if isValidRune != nil { + l.next() + l.next() + digitSeen := false + for { + next := l.peek() + if !isValidRune(next) { + break + } + digitSeen = true + l.next() + } + + if !digitSeen { + return l.errorf("number needs at least one digit") + } + + l.emit(tokenInteger) + + return l.lexRvalue + } + } + } + + if r == '+' || r == '-' { + l.next() + if l.follow("inf") { + return l.lexInf + } + if l.follow("nan") { + return l.lexNan + } + } + + pointSeen := false + expSeen := false + digitSeen := false + for { + next := l.peek() + if next == '.' { + if pointSeen { + return l.errorf("cannot have two dots in one float") + } + l.next() + if !isDigit(l.peek()) { + return l.errorf("float cannot end with a dot") + } + pointSeen = true + } else if next == 'e' || next == 'E' { + expSeen = true + l.next() + r := l.peek() + if r == '+' || r == '-' { + l.next() + } + } else if isDigit(next) { + digitSeen = true + l.next() + } else if next == '_' { + l.next() + } else { + break + } + if pointSeen && !digitSeen { + return l.errorf("cannot start float with a dot") + } + } + + if !digitSeen { + return l.errorf("no digit in that number") + } + if pointSeen || expSeen { + l.emit(tokenFloat) + } else { + l.emit(tokenInteger) + } + return l.lexRvalue +} + +func (l *tomlLexer) run() { + for state := l.lexVoid; state != nil; { + state = state() + } +} + +// Entry point +func lexToml(inputBytes []byte) []token { + runes := bytes.Runes(inputBytes) + l := &tomlLexer{ + input: runes, + tokens: make([]token, 0, 256), + line: 1, + col: 1, + endbufferLine: 1, + endbufferCol: 1, + } + l.run() + return l.tokens +} |