diff options
Diffstat (limited to 'vendor/github.com/temoto/robotstxt/parser.go')
-rw-r--r-- | vendor/github.com/temoto/robotstxt/parser.go | 271 |
1 files changed, 0 insertions, 271 deletions
diff --git a/vendor/github.com/temoto/robotstxt/parser.go b/vendor/github.com/temoto/robotstxt/parser.go deleted file mode 100644 index 46eb6b184..000000000 --- a/vendor/github.com/temoto/robotstxt/parser.go +++ /dev/null @@ -1,271 +0,0 @@ -package robotstxt - -// Comments explaining the logic are taken from either the google's spec: -// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt -// -// or the Wikipedia's entry on robots.txt: -// http://en.wikipedia.org/wiki/Robots.txt - -import ( - "fmt" - "io" - "math" - "regexp" - "strconv" - "strings" - "time" -) - -type lineType uint - -const ( - lIgnore lineType = iota - lUnknown - lUserAgent - lAllow - lDisallow - lCrawlDelay - lSitemap - lHost -) - -type parser struct { - tokens []string - pos int -} - -type lineInfo struct { - t lineType // Type of line key - k string // String representation of the type of key - vs string // String value of the key - vf float64 // Float value of the key - vr *regexp.Regexp // Regexp value of the key -} - -func newParser(tokens []string) *parser { - return &parser{tokens: tokens} -} - -func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) { - var g *Group - for _, a := range agents { - if g = groups[a]; g == nil { - g = new(Group) - groups[a] = g - } - fun(g) - } -} - -func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) { - groups = make(map[string]*Group, 16) - agents := make([]string, 0, 4) - isEmptyGroup := true - - // Reset internal fields, tokens are assigned at creation time, never change - p.pos = 0 - - for { - if li, err := p.parseLine(); err != nil { - if err == io.EOF { - break - } - errs = append(errs, err) - } else { - switch li.t { - case lUserAgent: - // Two successive user-agent lines are part of the same group. - if !isEmptyGroup { - // End previous group - agents = make([]string, 0, 4) - } - if len(agents) == 0 { - isEmptyGroup = true - } - agents = append(agents, li.vs) - - case lDisallow: - // Error if no current group - if len(agents) == 0 { - errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos)) - } else { - isEmptyGroup = false - var r *rule - if li.vr != nil { - r = &rule{"", false, li.vr} - } else { - r = &rule{li.vs, false, nil} - } - parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) }) - } - - case lAllow: - // Error if no current group - if len(agents) == 0 { - errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos)) - } else { - isEmptyGroup = false - var r *rule - if li.vr != nil { - r = &rule{"", true, li.vr} - } else { - r = &rule{li.vs, true, nil} - } - parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) }) - } - - case lHost: - host = li.vs - - case lSitemap: - sitemaps = append(sitemaps, li.vs) - - case lCrawlDelay: - if len(agents) == 0 { - errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos)) - } else { - isEmptyGroup = false - delay := time.Duration(li.vf * float64(time.Second)) - parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay }) - } - } - } - } - return -} - -func (p *parser) parseLine() (li *lineInfo, err error) { - t1, ok1 := p.popToken() - if !ok1 { - // proper EOF - return nil, io.EOF - } - - t2, ok2 := p.peekToken() - if !ok2 { - // EOF, no value associated with the token, so ignore token and return - return nil, io.EOF - } - - // Helper closure for all string-based tokens, common behaviour: - // - Consume t2 token - // - If empty, return unknown line info - // - Otherwise return the specified line info - returnStringVal := func(t lineType) (*lineInfo, error) { - p.popToken() - if t2 != "" { - return &lineInfo{t: t, k: t1, vs: t2}, nil - } - return &lineInfo{t: lIgnore}, nil - } - - // Helper closure for all path tokens (allow/disallow), common behaviour: - // - Consume t2 token - // - If empty, return unknown line info - // - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*") - // - Detect if wildcards are present, if so, compile into a regexp - // - Return the specified line info - returnPathVal := func(t lineType) (*lineInfo, error) { - p.popToken() - if t2 != "" { - if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") { - t2 = "/" + t2 - } - t2 = strings.TrimRightFunc(t2, isAsterisk) - // From google's spec: - // Google, Bing, Yahoo, and Ask support a limited form of - // "wildcards" for path values. These are: - // * designates 0 or more instances of any valid character - // $ designates the end of the URL - if strings.ContainsAny(t2, "*$") { - // Must compile a regexp, this is a pattern. - // Escape string before compile. - t2 = regexp.QuoteMeta(t2) - t2 = strings.Replace(t2, `\*`, `.*`, -1) - t2 = strings.Replace(t2, `\$`, `$`, -1) - if r, e := regexp.Compile(t2); e != nil { - return nil, e - } else { - return &lineInfo{t: t, k: t1, vr: r}, nil - } - } else { - // Simple string path - return &lineInfo{t: t, k: t1, vs: t2}, nil - } - } - return &lineInfo{t: lIgnore}, nil - } - - switch strings.ToLower(t1) { - case tokEOL: - // Don't consume t2 and continue parsing - return &lineInfo{t: lIgnore}, nil - - case "user-agent", "useragent": - // From google's spec: - // Handling of <field> elements with simple errors / typos (eg "useragent" - // instead of "user-agent") is undefined and may be interpreted as correct - // directives by some user-agents. - // The user-agent is non-case-sensitive. - t2 = strings.ToLower(t2) - return returnStringVal(lUserAgent) - - case "disallow": - // From google's spec: - // When no path is specified, the directive is ignored (so an empty Disallow - // CAN be an allow, since allow is the default. The actual result depends - // on the other rules in the group). - return returnPathVal(lDisallow) - - case "allow": - // From google's spec: - // When no path is specified, the directive is ignored. - return returnPathVal(lAllow) - - case "host": - // Host directive to specify main site mirror - // Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host - return returnStringVal(lHost) - - case "sitemap": - // Non-group field, applies to the host as a whole, not to a specific user-agent - return returnStringVal(lSitemap) - - case "crawl-delay", "crawldelay": - // From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions - // Several major crawlers support a Crawl-delay parameter, set to the - // number of seconds to wait between successive requests to the same server. - p.popToken() - if cd, e := strconv.ParseFloat(t2, 64); e != nil { - return nil, e - } else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) { - return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2) - } else { - return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil - } - } - - // Consume t2 token - p.popToken() - return &lineInfo{t: lUnknown, k: t1}, nil -} - -func (p *parser) popToken() (tok string, ok bool) { - tok, ok = p.peekToken() - if !ok { - return - } - p.pos++ - return tok, true -} - -func (p *parser) peekToken() (tok string, ok bool) { - if p.pos >= len(p.tokens) { - return "", false - } - return p.tokens[p.pos], true -} - -func isAsterisk(r rune) bool { - return r == '*' -} |