path: root/vendor/github.com/gorilla/css/scanner/scanner.go
// Copyright 2012 The Gorilla Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package scanner

import (
	"fmt"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// tokenType identifies the type of lexical tokens.
type tokenType int

// String returns a string representation of the token type.
func (t tokenType) String() string {
	return tokenNames[t]
}

// Token represents a token and the corresponding string.
type Token struct {
	Type   tokenType
	Value  string
	Line   int
	Column int
}

// String returns a string representation of the token.
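//
// For example, a TokenIdent with value "body" at line 1, column 1 renders as
//
//	IDENT (line: 1, column: 1): "body"
//
// Values longer than ten characters are truncated and suffixed with "...".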
func (t *Token) String() string {
	if len(t.Value) > 10 {
		return fmt.Sprintf("%s (line: %d, column: %d): %.10q...",
			t.Type, t.Line, t.Column, t.Value)
	}
	return fmt.Sprintf("%s (line: %d, column: %d): %q",
		t.Type, t.Line, t.Column, t.Value)
}

// All tokens -----------------------------------------------------------------

// The complete list of tokens in CSS3.
const (
	// Scanner flags.
	TokenError tokenType = iota
	TokenEOF
	// From now on, only tokens from the CSS specification.
	TokenIdent
	TokenAtKeyword
	TokenString
	TokenHash
	TokenNumber
	TokenPercentage
	TokenDimension
	TokenURI
	TokenUnicodeRange
	TokenCDO
	TokenCDC
	TokenS
	TokenComment
	TokenFunction
	TokenIncludes
	TokenDashMatch
	TokenPrefixMatch
	TokenSuffixMatch
	TokenSubstringMatch
	TokenChar
	TokenBOM
)

// tokenNames maps tokenType values to their names. Used for conversion to string.
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,

	// urlchar should accept [(ASCII characters minus those that need escaping)|{nonascii}|{escape}]
	// ASCII character range = `[\u0020-\u007e]`
	// Skip space \u0020 = `[\u0021-\u007e]`
	// Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]`
	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]`
	// Finally, the left square bracket (\u005b) and the right one (\u005d) need escaping themselves.
	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
}

// productions maps the list of tokens to patterns to be expanded.
var productions = map[tokenType]string{
	// Unused regexps (matched using other methods) are commented out.
	TokenIdent:        `{ident}`,
	TokenAtKeyword:    `@{ident}`,
	TokenString:       `{string}`,
	TokenHash:         `#{name}`,
	TokenNumber:       `{num}`,
	TokenPercentage:   `{num}%`,
	TokenDimension:    `{num}{ident}`,
	TokenURI:          `url\({w}(?:{string}|{urlchar}*?){w}\)`,
	TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`,
	//TokenCDO:            `<!--`,
	TokenCDC:      `-->`,
	TokenS:        `{wc}+`,
	TokenComment:  `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`,
	TokenFunction: `{ident}\(`,
	//TokenIncludes:       `~=`,
	//TokenDashMatch:      `\|=`,
	//TokenPrefixMatch:    `\^=`,
	//TokenSuffixMatch:    `\$=`,
	//TokenSubstringMatch: `\*=`,
	//TokenChar:           `[^"']`,
	//TokenBOM:            "\uFEFF",
}

// matchers maps the list of tokens to compiled regular expressions.
//
// The map is filled on init() using the macros and productions defined in
// the CSS specification.
var matchers = map[tokenType]*regexp.Regexp{}

// matchOrder is the order to test regexps when first-char shortcuts
// can't be used.
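//
// For example, TokenURI and TokenFunction are tested before TokenIdent so
// that "url(...)" and "rgb(" are not reported as plain identifiers.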
var matchOrder = []tokenType{
	TokenURI,
	TokenFunction,
	TokenUnicodeRange,
	TokenIdent,
	TokenDimension,
	TokenPercentage,
	TokenNumber,
	TokenCDC,
}

func init() {
	// replace macros and compile regexps for productions.
	replaceMacro := func(s string) string {
		return "(?:" + macros[s[1:len(s)-1]] + ")"
	}
	for t, s := range productions {
		for macroRegexp.MatchString(s) {
			s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro)
		}
		matchers[t] = regexp.MustCompile("^(?:" + s + ")")
	}
}
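
// As an illustration of the expansion above, the TokenPercentage production
// `{num}%` becomes the anchored pattern `^(?:(?:[0-9]*\.[0-9]+|[0-9]+)%)`.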

// Scanner --------------------------------------------------------------------

// New returns a new CSS scanner for the given input.
func New(input string) *Scanner {
	// Normalize newlines.
	input = strings.Replace(input, "\r\n", "\n", -1)
	return &Scanner{
		input: input,
		row:   1,
		col:   1,
	}
}

// Scanner scans an input and emits tokens following the CSS3 specification.
type Scanner struct {
	input string
	pos   int
	row   int
	col   int
	err   *Token
}

// Next returns the next token from the input.
//
// At the end of the input the token type is TokenEOF.
//
// If the input can't be tokenized the token type is TokenError. This occurs
// in the case of an unclosed quotation mark or comment.
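//
// A minimal usage sketch (the CSS input is only illustrative):
//
//	s := scanner.New("a { color: red }")
//	for {
//		tok := s.Next()
//		if tok.Type == scanner.TokenEOF || tok.Type == scanner.TokenError {
//			break
//		}
//		fmt.Println(tok)
//	}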
func (s *Scanner) Next() *Token {
	if s.err != nil {
		return s.err
	}
	if s.pos >= len(s.input) {
		s.err = &Token{TokenEOF, "", s.row, s.col}
		return s.err
	}
	if s.pos == 0 {
		// Test BOM only once, at the beginning of the file.
		if strings.HasPrefix(s.input, "\uFEFF") {
			return s.emitSimple(TokenBOM, "\uFEFF")
		}
	}
	// There's a lot we can guess based on the first byte, so we'll take a
	// shortcut before testing multiple regexps.
	input := s.input[s.pos:]
	switch input[0] {
	case '\t', '\n', '\f', '\r', ' ':
		// Whitespace.
		return s.emitToken(TokenS, matchers[TokenS].FindString(input))
	case '.':
		// Dot is too common not to have a quick check.
		// If the dot is not followed by a digit it is a Char; otherwise it
		// starts a number/percentage/dimension, which is matched below.
		if len(input) > 1 && !unicode.IsDigit(rune(input[1])) {
			return s.emitSimple(TokenChar, ".")
		}
	case '#':
		// Another common one: Hash or Char.
		if match := matchers[TokenHash].FindString(input); match != "" {
			return s.emitToken(TokenHash, match)
		}
		return s.emitSimple(TokenChar, "#")
	case '@':
		// Another common one: AtKeyword or Char.
		if match := matchers[TokenAtKeyword].FindString(input); match != "" {
			return s.emitSimple(TokenAtKeyword, match)
		}
		return s.emitSimple(TokenChar, "@")
	case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}':
		// More common chars.
		return s.emitSimple(TokenChar, string(input[0]))
	case '"', '\'':
		// String or error.
		match := matchers[TokenString].FindString(input)
		if match != "" {
			return s.emitToken(TokenString, match)
		}

		s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col}
		return s.err
	case '/':
		// Comment, error or Char.
		if len(input) > 1 && input[1] == '*' {
			match := matchers[TokenComment].FindString(input)
			if match != "" {
				return s.emitToken(TokenComment, match)
			} else {
				s.err = &Token{TokenError, "unclosed comment", s.row, s.col}
				return s.err
			}
		}
		return s.emitSimple(TokenChar, "/")
	case '~':
		// Includes or Char.
		return s.emitPrefixOrChar(TokenIncludes, "~=")
	case '|':
		// DashMatch or Char.
		return s.emitPrefixOrChar(TokenDashMatch, "|=")
	case '^':
		// PrefixMatch or Char.
		return s.emitPrefixOrChar(TokenPrefixMatch, "^=")
	case '$':
		// SuffixMatch or Char.
		return s.emitPrefixOrChar(TokenSuffixMatch, "$=")
	case '*':
		// SubstringMatch or Char.
		return s.emitPrefixOrChar(TokenSubstringMatch, "*=")
	case '<':
		// CDO or Char.
		return s.emitPrefixOrChar(TokenCDO, "<!--")
	}
	// Test all regexps, in order.
	for _, token := range matchOrder {
		if match := matchers[token].FindString(input); match != "" {
			return s.emitToken(token, match)
		}
	}
	// We already handled unclosed quotation marks and comments,
	// so this can only be a Char.
	r, width := utf8.DecodeRuneInString(input)
	token := &Token{TokenChar, string(r), s.row, s.col}
	s.col += width
	s.pos += width
	return token
}

// updatePosition updates input coordinates based on the consumed text.
func (s *Scanner) updatePosition(text string) {
	width := utf8.RuneCountInString(text)
	lines := strings.Count(text, "\n")
	s.row += lines
	if lines == 0 {
		s.col += width
	} else {
		s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):])
	}
	s.pos += len(text) // while col is a rune index, pos is a byte index
}

// emitToken returns a Token for the string v and updates the scanner position.
func (s *Scanner) emitToken(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.updatePosition(v)
	return token
}

// emitSimple returns a Token for the string v and updates the scanner
// position in a simplified manner.
//
// The string is known to have only ASCII characters and to not have a newline.
func (s *Scanner) emitSimple(t tokenType, v string) *Token {
	token := &Token{t, v, s.row, s.col}
	s.col += len(v)
	s.pos += len(v)
	return token
}

// emitPrefixOrChar returns a Token for type t if the current position
// matches the given prefix. Otherwise it returns a Char token using the
// first character from the prefix.
//
// The prefix is known to have only ASCII characters and to not have a newline.
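//
// For example, at a '~' the scanner calls emitPrefixOrChar(TokenIncludes, "~="):
// an input starting with "~=" yields an INCLUDES token, while a lone "~"
// yields a CHAR token.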
func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token {
	if strings.HasPrefix(s.input[s.pos:], prefix) {
		return s.emitSimple(t, prefix)
	}
	return s.emitSimple(TokenChar, string(prefix[0]))
}