summaryrefslogtreecommitdiff
path: root/vendor/github.com/temoto/robotstxt/parser.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/temoto/robotstxt/parser.go')
-rw-r--r--vendor/github.com/temoto/robotstxt/parser.go271
1 files changed, 0 insertions, 271 deletions
diff --git a/vendor/github.com/temoto/robotstxt/parser.go b/vendor/github.com/temoto/robotstxt/parser.go
deleted file mode 100644
index 46eb6b184..000000000
--- a/vendor/github.com/temoto/robotstxt/parser.go
+++ /dev/null
@@ -1,271 +0,0 @@
-package robotstxt
-
-// Comments explaining the logic are taken from either the google's spec:
-// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
-//
-// or the Wikipedia's entry on robots.txt:
-// http://en.wikipedia.org/wiki/Robots.txt
-
-import (
- "fmt"
- "io"
- "math"
- "regexp"
- "strconv"
- "strings"
- "time"
-)
-
// lineType identifies which robots.txt directive a parsed line carries.
type lineType uint

const (
	lIgnore     lineType = iota // line has no usable directive (blank line, comment, or empty value)
	lUnknown                    // key was present but not recognized
	lUserAgent                  // "User-agent" / "useragent" directive
	lAllow                      // "Allow" path directive
	lDisallow                   // "Disallow" path directive
	lCrawlDelay                 // "Crawl-delay" / "crawldelay" numeric directive (seconds)
	lSitemap                    // "Sitemap" URL directive (applies host-wide, not per group)
	lHost                       // "Host" directive (Yandex main-mirror extension)
)
-
// parser consumes a flat token stream (produced upstream of this type —
// tokens are assigned once at construction and never mutated) and turns
// it into directive lines and agent groups.
type parser struct {
	tokens []string // complete token stream to parse
	pos    int      // index of the next token to consume
}

// lineInfo is the parsed representation of a single robots.txt line.
// Exactly one of vs / vf / vr is meaningful, depending on t.
type lineInfo struct {
	t  lineType       // Type of line key
	k  string         // String representation of the type of key
	vs string         // String value of the key
	vf float64        // Float value of the key
	vr *regexp.Regexp // Regexp value of the key
}
-
-func newParser(tokens []string) *parser {
- return &parser{tokens: tokens}
-}
-
-func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) {
- var g *Group
- for _, a := range agents {
- if g = groups[a]; g == nil {
- g = new(Group)
- groups[a] = g
- }
- fun(g)
- }
-}
-
// parseAll walks the entire token stream and assembles the parsed
// robots.txt data: per-agent rule groups, the optional Host value, all
// Sitemap URLs, and every non-fatal parse error encountered. Errors are
// accumulated rather than aborting, so a partially malformed robots.txt
// still yields usable groups.
func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) {
	groups = make(map[string]*Group, 16)
	agents := make([]string, 0, 4)
	isEmptyGroup := true

	// Reset internal fields, tokens are assigned at creation time, never change
	p.pos = 0

	for {
		if li, err := p.parseLine(); err != nil {
			if err == io.EOF {
				break
			}
			errs = append(errs, err)
		} else {
			switch li.t {
			case lUserAgent:
				// Two successive user-agent lines are part of the same group.
				// A user-agent line after a rule has been seen starts a NEW
				// group, so the accumulated agent list is discarded.
				if !isEmptyGroup {
					// End previous group
					agents = make([]string, 0, 4)
				}
				if len(agents) == 0 {
					isEmptyGroup = true
				}
				agents = append(agents, li.vs)

			case lDisallow:
				// Error if no current group
				if len(agents) == 0 {
					errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos))
				} else {
					isEmptyGroup = false
					var r *rule
					// A compiled regexp (wildcard pattern) takes precedence
					// over the plain string path.
					if li.vr != nil {
						r = &rule{"", false, li.vr}
					} else {
						r = &rule{li.vs, false, nil}
					}
					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
				}

			case lAllow:
				// Error if no current group
				if len(agents) == 0 {
					errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos))
				} else {
					isEmptyGroup = false
					var r *rule
					if li.vr != nil {
						r = &rule{"", true, li.vr}
					} else {
						r = &rule{li.vs, true, nil}
					}
					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
				}

			case lHost:
				// Host and Sitemap are host-wide directives, not tied to the
				// current agent group.
				host = li.vs

			case lSitemap:
				sitemaps = append(sitemaps, li.vs)

			case lCrawlDelay:
				if len(agents) == 0 {
					errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos))
				} else {
					isEmptyGroup = false
					delay := time.Duration(li.vf * float64(time.Second))
					parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay })
				}
			}
		}
	}
	return
}
-
-func (p *parser) parseLine() (li *lineInfo, err error) {
- t1, ok1 := p.popToken()
- if !ok1 {
- // proper EOF
- return nil, io.EOF
- }
-
- t2, ok2 := p.peekToken()
- if !ok2 {
- // EOF, no value associated with the token, so ignore token and return
- return nil, io.EOF
- }
-
- // Helper closure for all string-based tokens, common behaviour:
- // - Consume t2 token
- // - If empty, return unknown line info
- // - Otherwise return the specified line info
- returnStringVal := func(t lineType) (*lineInfo, error) {
- p.popToken()
- if t2 != "" {
- return &lineInfo{t: t, k: t1, vs: t2}, nil
- }
- return &lineInfo{t: lIgnore}, nil
- }
-
- // Helper closure for all path tokens (allow/disallow), common behaviour:
- // - Consume t2 token
- // - If empty, return unknown line info
- // - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*")
- // - Detect if wildcards are present, if so, compile into a regexp
- // - Return the specified line info
- returnPathVal := func(t lineType) (*lineInfo, error) {
- p.popToken()
- if t2 != "" {
- if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") {
- t2 = "/" + t2
- }
- t2 = strings.TrimRightFunc(t2, isAsterisk)
- // From google's spec:
- // Google, Bing, Yahoo, and Ask support a limited form of
- // "wildcards" for path values. These are:
- // * designates 0 or more instances of any valid character
- // $ designates the end of the URL
- if strings.ContainsAny(t2, "*$") {
- // Must compile a regexp, this is a pattern.
- // Escape string before compile.
- t2 = regexp.QuoteMeta(t2)
- t2 = strings.Replace(t2, `\*`, `.*`, -1)
- t2 = strings.Replace(t2, `\$`, `$`, -1)
- if r, e := regexp.Compile(t2); e != nil {
- return nil, e
- } else {
- return &lineInfo{t: t, k: t1, vr: r}, nil
- }
- } else {
- // Simple string path
- return &lineInfo{t: t, k: t1, vs: t2}, nil
- }
- }
- return &lineInfo{t: lIgnore}, nil
- }
-
- switch strings.ToLower(t1) {
- case tokEOL:
- // Don't consume t2 and continue parsing
- return &lineInfo{t: lIgnore}, nil
-
- case "user-agent", "useragent":
- // From google's spec:
- // Handling of <field> elements with simple errors / typos (eg "useragent"
- // instead of "user-agent") is undefined and may be interpreted as correct
- // directives by some user-agents.
- // The user-agent is non-case-sensitive.
- t2 = strings.ToLower(t2)
- return returnStringVal(lUserAgent)
-
- case "disallow":
- // From google's spec:
- // When no path is specified, the directive is ignored (so an empty Disallow
- // CAN be an allow, since allow is the default. The actual result depends
- // on the other rules in the group).
- return returnPathVal(lDisallow)
-
- case "allow":
- // From google's spec:
- // When no path is specified, the directive is ignored.
- return returnPathVal(lAllow)
-
- case "host":
- // Host directive to specify main site mirror
- // Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
- return returnStringVal(lHost)
-
- case "sitemap":
- // Non-group field, applies to the host as a whole, not to a specific user-agent
- return returnStringVal(lSitemap)
-
- case "crawl-delay", "crawldelay":
- // From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions
- // Several major crawlers support a Crawl-delay parameter, set to the
- // number of seconds to wait between successive requests to the same server.
- p.popToken()
- if cd, e := strconv.ParseFloat(t2, 64); e != nil {
- return nil, e
- } else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) {
- return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2)
- } else {
- return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil
- }
- }
-
- // Consume t2 token
- p.popToken()
- return &lineInfo{t: lUnknown, k: t1}, nil
-}
-
-func (p *parser) popToken() (tok string, ok bool) {
- tok, ok = p.peekToken()
- if !ok {
- return
- }
- p.pos++
- return tok, true
-}
-
-func (p *parser) peekToken() (tok string, ok bool) {
- if p.pos >= len(p.tokens) {
- return "", false
- }
- return p.tokens[p.pos], true
-}
-
// isAsterisk reports whether r is the wildcard rune '*'; it is the
// predicate used to trim trailing wildcards off path values.
func isAsterisk(r rune) bool {
	switch r {
	case '*':
		return true
	}
	return false
}