Diffstat (limited to 'vendor/github.com/temoto/robotstxt')
-rw-r--r--  vendor/github.com/temoto/robotstxt/.gitignore     |  15
-rw-r--r--  vendor/github.com/temoto/robotstxt/.golangci.yml  |  20
-rw-r--r--  vendor/github.com/temoto/robotstxt/.travis.yml    |  30
-rw-r--r--  vendor/github.com/temoto/robotstxt/LICENSE        |  21
-rw-r--r--  vendor/github.com/temoto/robotstxt/README.rst     | 115
-rw-r--r--  vendor/github.com/temoto/robotstxt/codecov.yml    |   2
-rw-r--r--  vendor/github.com/temoto/robotstxt/fuzz.go        |  29
-rw-r--r--  vendor/github.com/temoto/robotstxt/parser.go      | 271
-rw-r--r--  vendor/github.com/temoto/robotstxt/robotstxt.go   | 227
-rw-r--r--  vendor/github.com/temoto/robotstxt/scanner.go     | 185
10 files changed, 0 insertions, 915 deletions
diff --git a/vendor/github.com/temoto/robotstxt/.gitignore b/vendor/github.com/temoto/robotstxt/.gitignore
deleted file mode 100644
index 6205f9eae..000000000
--- a/vendor/github.com/temoto/robotstxt/.gitignore
+++ /dev/null
@@ -1,15 +0,0 @@
-*.cgo?.*
-*.o
-*.so
-*.sublime-*
-*.zip
-.DS_Store
-.idea/
-.tags*
-_cgo_*
-_gofuzz/crashers/
-_gofuzz/suppressions/
-_obj
-_test
-coverage.txt
-robots.txt-check/robots.txt-check
diff --git a/vendor/github.com/temoto/robotstxt/.golangci.yml b/vendor/github.com/temoto/robotstxt/.golangci.yml
deleted file mode 100644
index 24e5858fa..000000000
--- a/vendor/github.com/temoto/robotstxt/.golangci.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-linters:
-  enable:
-    - goconst
-    - gofmt
-    - gosec
-    - maligned
-    - prealloc
-    - staticcheck
-  disable:
-    - deadcode
-    - structcheck
-    - varcheck
-
-linters-settings:
-  gofmt:
-    simplify: true
-  govet:
-    check-shadowing: true
-  maligned:
-    suggest-new: true
diff --git a/vendor/github.com/temoto/robotstxt/.travis.yml b/vendor/github.com/temoto/robotstxt/.travis.yml
deleted file mode 100644
index ad90dac37..000000000
--- a/vendor/github.com/temoto/robotstxt/.travis.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-cache:
-  go: true
-  directories:
-  - $HOME/.cache
-  - $HOME/bin
-  - $HOME/gopath/pkg/mod
-language: go
-go:
-- 1.11
-- 1.12
-- 1.13
-- 1.14
-- 1.x
-- master
-install: true
-script: GO111MODULE=on go test -race
-
-matrix:
-  include:
-    - go: 1.x
-      env: task=coverage
-      script: GO111MODULE=on go test -race -covermode=atomic -coverprofile=coverage.txt
-      after_success: bash <(curl -s https://codecov.io/bash)
-    - go: 1.x
-      env: task=bench
-      script: GO111MODULE=on ./script/bench
-    - go: 1.x
-      install: curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | bash -s -- -b $HOME/bin v1.19.1
-      env: task=clean
-      script: GO111MODULE=on ./script/clean
diff --git a/vendor/github.com/temoto/robotstxt/LICENSE b/vendor/github.com/temoto/robotstxt/LICENSE
deleted file mode 100644
index c125145b6..000000000
--- a/vendor/github.com/temoto/robotstxt/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-The MIT License
-
-Copyright (c) 2010 Sergey Shepelev <temotor@gmail.com>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
diff --git a/vendor/github.com/temoto/robotstxt/README.rst b/vendor/github.com/temoto/robotstxt/README.rst
deleted file mode 100644
index 92f1ae161..000000000
--- a/vendor/github.com/temoto/robotstxt/README.rst
+++ /dev/null
@@ -1,115 +0,0 @@
-What
-====
-
-This is a robots.txt exclusion protocol implementation for Go language (golang).
-
-
-Build
-=====
-
-To build and run tests run `go test` in source directory.
-
-
-Contribute
-==========
-
-Warm welcome.
-
-* If desired, add your name in README.rst, section Who.
-* Run `script/test && script/clean && echo ok`
-* You can ignore linter warnings, but everything else must pass.
-* Send your change as pull request or just a regular patch to current maintainer (see section Who).
-
-Thank you.
-
-
-Usage
-=====
-
-As usual, no special installation is required, just
-
-    import "github.com/temoto/robotstxt"
-
-run `go get` and you're ready.
-
-1. Parse
-^^^^^^^^
-
-First of all, you need to parse robots.txt data. You can do it with
-functions `FromBytes(body []byte) (*RobotsData, error)` or same for `string`::
-
-    robots, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow:"))
-    robots, err := robotstxt.FromString("User-agent: *\nDisallow:")
-
-As of 2012-10-03, `FromBytes` is the most efficient method, everything else
-is a wrapper for this core function.
-
-There are few convenient constructors for various purposes:
-
-* `FromResponse(*http.Response) (*RobotsData, error)` to init robots data
-from HTTP response. It *does not* call `response.Body.Close()`::
-
-    robots, err := robotstxt.FromResponse(resp)
-    resp.Body.Close()
-    if err != nil {
-        log.Println("Error parsing robots.txt:", err.Error())
-    }
-
-* `FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error)` or
-`FromStatusAndString` if you prefer to read bytes (string) yourself.
-Passing status code applies following logic in line with Google's interpretation
-of robots.txt files:
-
-  * status 2xx -> parse body with `FromBytes` and apply rules listed there.
-  * status 4xx -> allow all (even 401/403, as recommended by Google).
-  * other (5xx) -> disallow all, consider this a temporary unavailability.
-
-2. Query
-^^^^^^^^
-
-Parsing robots.txt content builds a kind of logic database, which you can
-query with `(r *RobotsData) TestAgent(url, agent string) (bool)`.
-
-Explicit passing of agent is useful if you want to query for different agents. For
-single agent users there is an efficient option: `RobotsData.FindGroup(userAgent string)`
-returns a structure with `.Test(path string)` method and `.CrawlDelay time.Duration`.
-
-Simple query with explicit user agent. Each call will scan all rules.
-
-::
-
-    allow := robots.TestAgent("/", "FooBot")
-
-Or query several paths against same user agent for performance.
-
-::

-    group := robots.FindGroup("BarBot")
-    group.Test("/")
-    group.Test("/download.mp3")
-    group.Test("/news/article-2012-1")
-
-
-Who
-===
-
-Honorable contributors (in undefined order):
-
-  * Ilya Grigorik (igrigorik)
-  * Martin Angers (PuerkitoBio)
-  * Micha Gorelick (mynameisfiber)
-
-Initial commit and other: Sergey Shepelev temotor@gmail.com
-
-
-Flair
-=====
-
-.. image:: https://travis-ci.org/temoto/robotstxt.svg?branch=master
-    :target: https://travis-ci.org/temoto/robotstxt
-
-.. image:: https://codecov.io/gh/temoto/robotstxt/branch/master/graph/badge.svg
-    :target: https://codecov.io/gh/temoto/robotstxt
-
-.. image:: https://goreportcard.com/badge/github.com/temoto/robotstxt
-    :target: https://goreportcard.com/report/github.com/temoto/robotstxt
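The README's parse and query steps combine into a complete program. The following is a minimal sketch based only on the API shown in the README above (`FromString`, `TestAgent`, `FindGroup`); it assumes the module is still fetched via `go get github.com/temoto/robotstxt` now that this vendored copy is removed, and the bot names and paths are illustrative:

    package main

    import (
        "fmt"

        "github.com/temoto/robotstxt"
    )

    func main() {
        // Parse once; FromString is a thin wrapper around FromBytes.
        robots, err := robotstxt.FromString("User-agent: *\nDisallow: /private\nCrawl-delay: 1.5\n")
        if err != nil {
            panic(err)
        }

        // One-off queries: each TestAgent call scans all rules for the agent's group.
        fmt.Println(robots.TestAgent("/public/index.html", "FooBot")) // true
        fmt.Println(robots.TestAgent("/private/data", "FooBot"))      // false

        // Reuse a group when testing many paths against the same user-agent.
        group := robots.FindGroup("BarBot")
        fmt.Println(group.Test("/private/data")) // false
        fmt.Println(group.CrawlDelay)            // 1.5s
    }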
diff --git a/vendor/github.com/temoto/robotstxt/codecov.yml b/vendor/github.com/temoto/robotstxt/codecov.yml
deleted file mode 100644
index b80be28f6..000000000
--- a/vendor/github.com/temoto/robotstxt/codecov.yml
+++ /dev/null
@@ -1,2 +0,0 @@
-codecov:
-  token: 6bf9c7eb-69ff-4b74-8464-e2fb452d0f04
diff --git a/vendor/github.com/temoto/robotstxt/fuzz.go b/vendor/github.com/temoto/robotstxt/fuzz.go
deleted file mode 100644
index de4b0587a..000000000
--- a/vendor/github.com/temoto/robotstxt/fuzz.go
+++ /dev/null
@@ -1,29 +0,0 @@
-// +build gofuzz
-
-package robotstxt
-
-import "testing/quick"
-
-func Fuzz(data []byte) int {
-    r, err := FromBytes(data)
-    if err != nil {
-        if r != nil {
-            panic("r != nil on error")
-        }
-        return 0
-    }
-
-    // FindGroup must never return nil
-    f1 := func(agent string) bool { return r.FindGroup(agent) != nil }
-    if err := quick.Check(f1, nil); err != nil {
-        panic(err)
-    }
-
-    // just check TestAgent doesn't panic
-    f2 := func(path, agent string) bool { r.TestAgent(path, agent); return true }
-    if err := quick.Check(f2, nil); err != nil {
-        panic(err)
-    }
-
-    return 1
-}
diff --git a/vendor/github.com/temoto/robotstxt/parser.go b/vendor/github.com/temoto/robotstxt/parser.go
deleted file mode 100644
index 46eb6b184..000000000
--- a/vendor/github.com/temoto/robotstxt/parser.go
+++ /dev/null
@@ -1,271 +0,0 @@
-package robotstxt
-
-// Comments explaining the logic are taken from either the google's spec:
-// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
-//
-// or the Wikipedia's entry on robots.txt:
-// http://en.wikipedia.org/wiki/Robots.txt
-
-import (
-    "fmt"
-    "io"
-    "math"
-    "regexp"
-    "strconv"
-    "strings"
-    "time"
-)
-
-type lineType uint
-
-const (
-    lIgnore lineType = iota
-    lUnknown
-    lUserAgent
-    lAllow
-    lDisallow
-    lCrawlDelay
-    lSitemap
-    lHost
-)
-
-type parser struct {
-    tokens []string
-    pos    int
-}
-
-type lineInfo struct {
-    t  lineType       // Type of line key
-    k  string         // String representation of the type of key
-    vs string         // String value of the key
-    vf float64        // Float value of the key
-    vr *regexp.Regexp // Regexp value of the key
-}
-
-func newParser(tokens []string) *parser {
-    return &parser{tokens: tokens}
-}
-
-func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) {
-    var g *Group
-    for _, a := range agents {
-        if g = groups[a]; g == nil {
-            g = new(Group)
-            groups[a] = g
-        }
-        fun(g)
-    }
-}
-
-func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) {
-    groups = make(map[string]*Group, 16)
-    agents := make([]string, 0, 4)
-    isEmptyGroup := true
-
-    // Reset internal fields, tokens are assigned at creation time, never change
-    p.pos = 0
-
-    for {
-        if li, err := p.parseLine(); err != nil {
-            if err == io.EOF {
-                break
-            }
-            errs = append(errs, err)
-        } else {
-            switch li.t {
-            case lUserAgent:
-                // Two successive user-agent lines are part of the same group.
-                if !isEmptyGroup {
-                    // End previous group
-                    agents = make([]string, 0, 4)
-                }
-                if len(agents) == 0 {
-                    isEmptyGroup = true
-                }
-                agents = append(agents, li.vs)
-
-            case lDisallow:
-                // Error if no current group
-                if len(agents) == 0 {
-                    errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos))
-                } else {
-                    isEmptyGroup = false
-                    var r *rule
-                    if li.vr != nil {
-                        r = &rule{"", false, li.vr}
-                    } else {
-                        r = &rule{li.vs, false, nil}
-                    }
-                    parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
-                }
-
-            case lAllow:
-                // Error if no current group
-                if len(agents) == 0 {
-                    errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos))
-                } else {
-                    isEmptyGroup = false
-                    var r *rule
-                    if li.vr != nil {
-                        r = &rule{"", true, li.vr}
-                    } else {
-                        r = &rule{li.vs, true, nil}
-                    }
-                    parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
-                }
-
-            case lHost:
-                host = li.vs
-
-            case lSitemap:
-                sitemaps = append(sitemaps, li.vs)
-
-            case lCrawlDelay:
-                if len(agents) == 0 {
-                    errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos))
-                } else {
-                    isEmptyGroup = false
-                    delay := time.Duration(li.vf * float64(time.Second))
-                    parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay })
-                }
-            }
-        }
-    }
-    return
-}
-
-func (p *parser) parseLine() (li *lineInfo, err error) {
-    t1, ok1 := p.popToken()
-    if !ok1 {
-        // proper EOF
-        return nil, io.EOF
-    }
-
-    t2, ok2 := p.peekToken()
-    if !ok2 {
-        // EOF, no value associated with the token, so ignore token and return
-        return nil, io.EOF
-    }
-
-    // Helper closure for all string-based tokens, common behaviour:
-    // - Consume t2 token
-    // - If empty, return unknown line info
-    // - Otherwise return the specified line info
-    returnStringVal := func(t lineType) (*lineInfo, error) {
-        p.popToken()
-        if t2 != "" {
-            return &lineInfo{t: t, k: t1, vs: t2}, nil
-        }
-        return &lineInfo{t: lIgnore}, nil
-    }
-
-    // Helper closure for all path tokens (allow/disallow), common behaviour:
-    // - Consume t2 token
-    // - If empty, return unknown line info
-    // - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*")
-    // - Detect if wildcards are present, if so, compile into a regexp
-    // - Return the specified line info
-    returnPathVal := func(t lineType) (*lineInfo, error) {
-        p.popToken()
-        if t2 != "" {
-            if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") {
-                t2 = "/" + t2
-            }
-            t2 = strings.TrimRightFunc(t2, isAsterisk)
-            // From google's spec:
-            // Google, Bing, Yahoo, and Ask support a limited form of
-            // "wildcards" for path values. These are:
-            // * designates 0 or more instances of any valid character
-            // $ designates the end of the URL
-            if strings.ContainsAny(t2, "*$") {
-                // Must compile a regexp, this is a pattern.
-                // Escape string before compile.
-                t2 = regexp.QuoteMeta(t2)
-                t2 = strings.Replace(t2, `\*`, `.*`, -1)
-                t2 = strings.Replace(t2, `\$`, `$`, -1)
-                if r, e := regexp.Compile(t2); e != nil {
-                    return nil, e
-                } else {
-                    return &lineInfo{t: t, k: t1, vr: r}, nil
-                }
-            } else {
-                // Simple string path
-                return &lineInfo{t: t, k: t1, vs: t2}, nil
-            }
-        }
-        return &lineInfo{t: lIgnore}, nil
-    }
-
-    switch strings.ToLower(t1) {
-    case tokEOL:
-        // Don't consume t2 and continue parsing
-        return &lineInfo{t: lIgnore}, nil
-
-    case "user-agent", "useragent":
-        // From google's spec:
-        // Handling of <field> elements with simple errors / typos (eg "useragent"
-        // instead of "user-agent") is undefined and may be interpreted as correct
-        // directives by some user-agents.
-        // The user-agent is non-case-sensitive.
-        t2 = strings.ToLower(t2)
-        return returnStringVal(lUserAgent)
-
-    case "disallow":
-        // From google's spec:
-        // When no path is specified, the directive is ignored (so an empty Disallow
-        // CAN be an allow, since allow is the default. The actual result depends
-        // on the other rules in the group).
-        return returnPathVal(lDisallow)
-
-    case "allow":
-        // From google's spec:
-        // When no path is specified, the directive is ignored.
-        return returnPathVal(lAllow)
-
-    case "host":
-        // Host directive to specify main site mirror
-        // Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
-        return returnStringVal(lHost)
-
-    case "sitemap":
-        // Non-group field, applies to the host as a whole, not to a specific user-agent
-        return returnStringVal(lSitemap)
-
-    case "crawl-delay", "crawldelay":
-        // From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions
-        // Several major crawlers support a Crawl-delay parameter, set to the
-        // number of seconds to wait between successive requests to the same server.
-        p.popToken()
-        if cd, e := strconv.ParseFloat(t2, 64); e != nil {
-            return nil, e
-        } else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) {
-            return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2)
-        } else {
-            return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil
-        }
-    }
-
-    // Consume t2 token
-    p.popToken()
-    return &lineInfo{t: lUnknown, k: t1}, nil
-}
-
-func (p *parser) popToken() (tok string, ok bool) {
-    tok, ok = p.peekToken()
-    if !ok {
-        return
-    }
-    p.pos++
-    return tok, true
-}
-
-func (p *parser) peekToken() (tok string, ok bool) {
-    if p.pos >= len(p.tokens) {
-        return "", false
-    }
-    return p.tokens[p.pos], true
-}
-
-func isAsterisk(r rune) bool {
-    return r == '*'
-}
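The wildcard translation in `returnPathVal` above (escape with `regexp.QuoteMeta`, then map `\*` to `.*` and `\$` to `$`) gives `Allow`/`Disallow` values regexp semantics. A small sketch of the resulting behaviour through the public API; the rule, paths, and bot name are invented for illustration:

    package main

    import (
        "fmt"

        "github.com/temoto/robotstxt"
    )

    func main() {
        // "*" matches any run of characters and "$" anchors the end of the URL,
        // so "/files/*.gif$" is compiled internally to the pattern /files/.*\.gif$.
        robots, err := robotstxt.FromString("User-agent: *\nDisallow: /files/*.gif$\n")
        if err != nil {
            panic(err)
        }
        fmt.Println(robots.TestAgent("/files/pic.gif", "AnyBot"))     // false: pattern matches
        fmt.Println(robots.TestAgent("/files/pic.gif?v=2", "AnyBot")) // true: "$" stops the match
        fmt.Println(robots.TestAgent("/files/doc.pdf", "AnyBot"))     // true: no rule applies
    }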
diff --git a/vendor/github.com/temoto/robotstxt/robotstxt.go b/vendor/github.com/temoto/robotstxt/robotstxt.go
deleted file mode 100644
index 52d3637c6..000000000
--- a/vendor/github.com/temoto/robotstxt/robotstxt.go
+++ /dev/null
@@ -1,227 +0,0 @@
-// Package robotstxt implements the robots.txt Exclusion Protocol
-// as specified in http://www.robotstxt.org/wc/robots.html
-// with various extensions.
-package robotstxt
-
-// Comments explaining the logic are taken from either the Google's spec:
-// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
-
-import (
-    "bytes"
-    "errors"
-    "io/ioutil"
-    "net/http"
-    "regexp"
-    "strconv"
-    "strings"
-    "time"
-)
-
-type RobotsData struct {
-    // private
-    groups      map[string]*Group
-    allowAll    bool
-    disallowAll bool
-    Host        string
-    Sitemaps    []string
-}
-
-type Group struct {
-    rules      []*rule
-    Agent      string
-    CrawlDelay time.Duration
-}
-
-type rule struct {
-    path    string
-    allow   bool
-    pattern *regexp.Regexp
-}
-
-type ParseError struct {
-    Errs []error
-}
-
-func newParseError(errs []error) *ParseError {
-    return &ParseError{errs}
-}
-
-func (e ParseError) Error() string {
-    var b bytes.Buffer
-
-    b.WriteString("Parse error(s): " + "\n")
-    for _, er := range e.Errs {
-        b.WriteString(er.Error() + "\n")
-    }
-    return b.String()
-}
-
-var allowAll = &RobotsData{allowAll: true}
-var disallowAll = &RobotsData{disallowAll: true}
-var emptyGroup = &Group{}
-
-func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {
-    switch {
-    case statusCode >= 200 && statusCode < 300:
-        return FromBytes(body)
-
-    // From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
-    //
-    // Google treats all 4xx errors in the same way and assumes that no valid
-    // robots.txt file exists. It is assumed that there are no restrictions.
-    // This is a "full allow" for crawling. Note: this includes 401
-    // "Unauthorized" and 403 "Forbidden" HTTP result codes.
-    case statusCode >= 400 && statusCode < 500:
-        return allowAll, nil
-
-    // From Google's spec:
-    // Server errors (5xx) are seen as temporary errors that result in a "full
-    // disallow" of crawling.
-    case statusCode >= 500 && statusCode < 600:
-        return disallowAll, nil
-    }
-
-    return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode))
-}
-
-func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {
-    return FromStatusAndBytes(statusCode, []byte(body))
-}
-
-func FromResponse(res *http.Response) (*RobotsData, error) {
-    if res == nil {
-        // Edge case, if res is nil, return nil data
-        return nil, nil
-    }
-    buf, e := ioutil.ReadAll(res.Body)
-    if e != nil {
-        return nil, e
-    }
-    return FromStatusAndBytes(res.StatusCode, buf)
-}
-
-func FromBytes(body []byte) (r *RobotsData, err error) {
-    var errs []error
-
-    // special case (probably not worth optimization?)
-    trimmed := bytes.TrimSpace(body)
-    if len(trimmed) == 0 {
-        return allowAll, nil
-    }
-
-    sc := newByteScanner("bytes", true)
-    //sc.Quiet = !print_errors
-    sc.feed(body, true)
-    tokens := sc.scanAll()
-
-    // special case worth optimization
-    if len(tokens) == 0 {
-        return allowAll, nil
-    }
-
-    r = &RobotsData{}
-    parser := newParser(tokens)
-    r.groups, r.Host, r.Sitemaps, errs = parser.parseAll()
-    if len(errs) > 0 {
-        return nil, newParseError(errs)
-    }
-
-    return r, nil
-}
-
-func FromString(body string) (r *RobotsData, err error) {
-    return FromBytes([]byte(body))
-}
-
-func (r *RobotsData) TestAgent(path, agent string) bool {
-    if r.allowAll {
-        return true
-    }
-    if r.disallowAll {
-        return false
-    }
-
-    // Find a group of rules that applies to this agent
-    // From Google's spec:
-    // The user-agent is non-case-sensitive.
-    g := r.FindGroup(agent)
-    return g.Test(path)
-}
-
-// FindGroup searches block of declarations for specified user-agent.
-// From Google's spec:
-// Only one group of group-member records is valid for a particular crawler.
-// The crawler must determine the correct group of records by finding the group
-// with the most specific user-agent that still matches. All other groups of
-// records are ignored by the crawler. The user-agent is non-case-sensitive.
-// The order of the groups within the robots.txt file is irrelevant.
-func (r *RobotsData) FindGroup(agent string) (ret *Group) {
-    var prefixLen int
-
-    agent = strings.ToLower(agent)
-    if ret = r.groups["*"]; ret != nil {
-        // Weakest match possible
-        prefixLen = 1
-    }
-    for a, g := range r.groups {
-        if a != "*" && strings.HasPrefix(agent, a) {
-            if l := len(a); l > prefixLen {
-                prefixLen = l
-                ret = g
-            }
-        }
-    }
-
-    if ret == nil {
-        return emptyGroup
-    }
-    return
-}
-
-func (g *Group) Test(path string) bool {
-    if r := g.findRule(path); r != nil {
-        return r.allow
-    }
-
-    // From Google's spec:
-    // By default, there are no restrictions for crawling for the designated crawlers.
-    return true
-}
-
-// From Google's spec:
-// The path value is used as a basis to determine whether or not a rule applies
-// to a specific URL on a site. With the exception of wildcards, the path is
-// used to match the beginning of a URL (and any valid URLs that start with the
-// same path).
-//
-// At a group-member level, in particular for allow and disallow directives,
-// the most specific rule based on the length of the [path] entry will trump
-// the less specific (shorter) rule. The order of precedence for rules with
-// wildcards is undefined.
-func (g *Group) findRule(path string) (ret *rule) {
-    var prefixLen int
-
-    for _, r := range g.rules {
-        if r.pattern != nil {
-            if r.pattern.MatchString(path) {
-                // Consider this a match equal to the length of the pattern.
-                // From Google's spec:
-                // The order of precedence for rules with wildcards is undefined.
-                if l := len(r.pattern.String()); l > prefixLen {
-                    prefixLen = l
-                    ret = r
-                }
-            }
-        } else if r.path == "/" && prefixLen == 0 {
-            // Weakest match possible
-            prefixLen = 1
-            ret = r
-        } else if strings.HasPrefix(path, r.path) {
-            if l := len(r.path); l > prefixLen {
-                prefixLen = l
-                ret = r
-            }
-        }
-    }
-    return
-}
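The status-code branches in `FromStatusAndBytes` above (2xx parse, 4xx full allow, 5xx full disallow) can be exercised directly. A hedged sketch using the same public API; the status codes, body, and paths are illustrative:

    package main

    import (
        "fmt"

        "github.com/temoto/robotstxt"
    )

    func main() {
        // 404: treated as "no robots.txt exists", a full allow (this includes 401/403).
        r4, _ := robotstxt.FromStatusAndString(404, "")
        fmt.Println(r4.TestAgent("/anything", "FooBot")) // true

        // 503: treated as temporary unavailability, a full disallow.
        r5, _ := robotstxt.FromStatusAndString(503, "")
        fmt.Println(r5.TestAgent("/anything", "FooBot")) // false

        // 200: the body is parsed normally.
        r2, _ := robotstxt.FromStatusAndString(200, "User-agent: *\nDisallow: /tmp\n")
        fmt.Println(r2.TestAgent("/tmp/cache", "FooBot")) // false
    }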
diff --git a/vendor/github.com/temoto/robotstxt/scanner.go b/vendor/github.com/temoto/robotstxt/scanner.go
deleted file mode 100644
index 6bd98c2ec..000000000
--- a/vendor/github.com/temoto/robotstxt/scanner.go
+++ /dev/null
@@ -1,185 +0,0 @@
-package robotstxt
-
-import (
-    "bytes"
-    "fmt"
-    "go/token"
-    "os"
-    "sync"
-    "unicode/utf8"
-)
-
-type byteScanner struct {
-    pos           token.Position
-    buf           []byte
-    ErrorCount    int
-    ch            rune
-    Quiet         bool
-    keyTokenFound bool
-    lastChunk     bool
-}
-
-const tokEOL = "\n"
-
-var WhitespaceChars = []rune{' ', '\t', '\v'}
-var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }}
-
-func newByteScanner(srcname string, quiet bool) *byteScanner {
-    return &byteScanner{
-        Quiet: quiet,
-        ch:    -1,
-        pos:   token.Position{Filename: srcname},
-    }
-}
-
-func (s *byteScanner) feed(input []byte, end bool) {
-    s.buf = input
-    s.pos.Offset = 0
-    s.pos.Line = 1
-    s.pos.Column = 1
-    s.lastChunk = end
-
-    // Read first char into look-ahead buffer `s.ch`.
-    if !s.nextChar() {
-        return
-    }
-
-    // Skip UTF-8 byte order mark
-    if s.ch == 65279 {
-        s.nextChar()
-        s.pos.Column = 1
-    }
-}
-
-func (s *byteScanner) GetPosition() token.Position {
-    return s.pos
-}
-
-func (s *byteScanner) scan() string {
-    // Note Offset > len, not >=, so we can scan last character.
-    if s.lastChunk && s.pos.Offset > len(s.buf) {
-        return ""
-    }
-
-    s.skipSpace()
-
-    if s.ch == -1 {
-        return ""
-    }
-
-    // EOL
-    if s.isEol() {
-        s.keyTokenFound = false
-        // skip subsequent newline chars
-        for s.ch != -1 && s.isEol() {
-            s.nextChar()
-        }
-        // emit newline as separate token
-        return tokEOL
-    }
-
-    // skip comments
-    if s.ch == '#' {
-        s.keyTokenFound = false
-        s.skipUntilEol()
-        if s.ch == -1 {
-            return ""
-        }
-        // emit newline as separate token
-        return tokEOL
-    }
-
-    // else we found something
-    tok := tokBuffers.Get().(*bytes.Buffer)
-    defer tokBuffers.Put(tok)
-    tok.Reset()
-    tok.WriteRune(s.ch)
-    s.nextChar()
-    for s.ch != -1 && !s.isSpace() && !s.isEol() {
-        // Do not consider ":" to be a token separator if a first key token
-        // has already been found on this line (avoid cutting an absolute URL
-        // after the "http:")
-        if s.ch == ':' && !s.keyTokenFound {
-            s.nextChar()
-            s.keyTokenFound = true
-            break
-        }
-
-        tok.WriteRune(s.ch)
-        s.nextChar()
-    }
-    return tok.String()
-}
-
-func (s *byteScanner) scanAll() []string {
-    results := make([]string, 0, 64) // random guess of average tokens length
-    for {
-        token := s.scan()
-        if token != "" {
-            results = append(results, token)
-        } else {
-            break
-        }
-    }
-    return results
-}
-
-func (s *byteScanner) error(pos token.Position, msg string) {
-    s.ErrorCount++
-    if !s.Quiet {
-        fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
-    }
-}
-
-func (s *byteScanner) isEol() bool {
-    return s.ch == '\n' || s.ch == '\r'
-}
-
-func (s *byteScanner) isSpace() bool {
-    for _, r := range WhitespaceChars {
-        if s.ch == r {
-            return true
-        }
-    }
-    return false
-}
-
-func (s *byteScanner) skipSpace() {
-    for s.ch != -1 && s.isSpace() {
-        s.nextChar()
-    }
-}
-
-func (s *byteScanner) skipUntilEol() {
-    for s.ch != -1 && !s.isEol() {
-        s.nextChar()
-    }
-    // skip subsequent newline chars
-    for s.ch != -1 && s.isEol() {
-        s.nextChar()
-    }
-}
-
-// Reads next Unicode char.
-func (s *byteScanner) nextChar() bool {
-    if s.pos.Offset >= len(s.buf) {
-        s.ch = -1
-        return false
-    }
-    s.pos.Column++
-    if s.ch == '\n' {
-        s.pos.Line++
-        s.pos.Column = 1
-    }
-    r, w := rune(s.buf[s.pos.Offset]), 1
-    if r >= 0x80 {
-        r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
-        if r == utf8.RuneError && w == 1 {
-            s.error(s.pos, "illegal UTF-8 encoding")
-        }
-    }
-    s.pos.Column++
-    s.pos.Offset += w
-    s.ch = r
-    return true
-}
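`FindGroup` above picks the longest user-agent prefix that matches the lower-cased agent string, with "*" as the weakest match, and `findRule` then applies the longest matching path rule within that group. A final sketch of that precedence; the agent names follow Google's documentation examples and are illustrative here:

    package main

    import (
        "fmt"

        "github.com/temoto/robotstxt"
    )

    func main() {
        body := "User-agent: *\nDisallow: /\n\nUser-agent: googlebot-news\nAllow: /\n"
        robots, err := robotstxt.FromString(body)
        if err != nil {
            panic(err)
        }

        // "Googlebot-News/2.1" is lower-cased and matched by its longest
        // group prefix, so the specific group beats the "*" group.
        fmt.Println(robots.TestAgent("/page", "Googlebot-News/2.1")) // true
        fmt.Println(robots.TestAgent("/page", "OtherBot"))           // false: falls back to "*"
    }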