Diffstat (limited to 'vendor/github.com/temoto/robotstxt/parser.go')
-rw-r--r--  vendor/github.com/temoto/robotstxt/parser.go  271
1 file changed, 271 insertions, 0 deletions
diff --git a/vendor/github.com/temoto/robotstxt/parser.go b/vendor/github.com/temoto/robotstxt/parser.go
new file mode 100644
index 000000000..46eb6b184
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/parser.go
@@ -0,0 +1,271 @@
+package robotstxt
+
+// Comments explaining the logic are taken either from Google's spec:
+// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
+//
+// or from Wikipedia's entry on robots.txt:
+// http://en.wikipedia.org/wiki/Robots.txt
+
+import (
+ "fmt"
+ "io"
+ "math"
+ "regexp"
+ "strconv"
+ "strings"
+ "time"
+)
+
+type lineType uint
+
+const (
+ lIgnore lineType = iota
+ lUnknown
+ lUserAgent
+ lAllow
+ lDisallow
+ lCrawlDelay
+ lSitemap
+ lHost
+)
+
+type parser struct {
+ tokens []string
+ pos int
+}
+
+type lineInfo struct {
+ t lineType // Type of line key
+ k string // String representation of the type of key
+ vs string // String value of the key
+ vf float64 // Float value of the key
+ vr *regexp.Regexp // Regexp value of the key
+}
+
+func newParser(tokens []string) *parser {
+ return &parser{tokens: tokens}
+}
+
+func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) {
+ var g *Group
+ for _, a := range agents {
+ if g = groups[a]; g == nil {
+ g = new(Group)
+ groups[a] = g
+ }
+ fun(g)
+ }
+}
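+
+// For example, given agents = []string{"googlebot", "bingbot"}, parseGroupMap
+// lazily creates a *Group for any name not yet in the map and applies fun to
+// each one, so a single rule line is shared by every listed user-agent.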
+
+func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) {
+ groups = make(map[string]*Group, 16)
+ agents := make([]string, 0, 4)
+ isEmptyGroup := true
+
+ // Reset the parse position; tokens are assigned at creation time and never change
+ p.pos = 0
+
+ for {
+ if li, err := p.parseLine(); err != nil {
+ if err == io.EOF {
+ break
+ }
+ errs = append(errs, err)
+ } else {
+ switch li.t {
+ case lUserAgent:
+ // Two successive user-agent lines are part of the same group.
+ if !isEmptyGroup {
+ // End previous group
+ agents = make([]string, 0, 4)
+ }
+ if len(agents) == 0 {
+ isEmptyGroup = true
+ }
+ agents = append(agents, li.vs)
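+ // For example, the following lines form a single group that applies
+ // to both agents:
+ //
+ //   User-agent: googlebot
+ //   User-agent: bingbot
+ //   Disallow: /private
+ //
+ // while a User-agent line that appears after any rule starts a new group.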
+
+ case lDisallow:
+ // Error if no current group
+ if len(agents) == 0 {
+ errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos))
+ } else {
+ isEmptyGroup = false
+ var r *rule
+ if li.vr != nil {
+ r = &rule{"", false, li.vr}
+ } else {
+ r = &rule{li.vs, false, nil}
+ }
+ parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
+ }
+
+ case lAllow:
+ // Error if no current group
+ if len(agents) == 0 {
+ errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos))
+ } else {
+ isEmptyGroup = false
+ var r *rule
+ if li.vr != nil {
+ r = &rule{"", true, li.vr}
+ } else {
+ r = &rule{li.vs, true, nil}
+ }
+ parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) })
+ }
+
+ case lHost:
+ host = li.vs
+
+ case lSitemap:
+ sitemaps = append(sitemaps, li.vs)
+
+ case lCrawlDelay:
+ if len(agents) == 0 {
+ errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos))
+ } else {
+ isEmptyGroup = false
+ delay := time.Duration(li.vf * float64(time.Second))
+ parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay })
+ }
+ }
+ }
+ }
+ return
+}
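+
+// A minimal sketch of parseAll's result for a simple file (illustrative
+// input; assumes the scanner elsewhere in this package already produced
+// the token stream):
+//
+//   User-agent: foobot
+//   Disallow: /tmp/
+//   Crawl-delay: 1.5
+//   Sitemap: http://example.com/sitemap.xml
+//
+// yields groups["foobot"] with one disallow rule and a 1.5s CrawlDelay,
+// sitemaps holding the single URL, an empty host, and no errors.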
+
+func (p *parser) parseLine() (li *lineInfo, err error) {
+ t1, ok1 := p.popToken()
+ if !ok1 {
+ // proper EOF
+ return nil, io.EOF
+ }
+
+ t2, ok2 := p.peekToken()
+ if !ok2 {
+ // EOF, no value associated with the token, so ignore token and return
+ return nil, io.EOF
+ }
+
+ // Helper closure for all string-based tokens, common behaviour:
+ // - Consume t2 token
+ // - If empty, return ignore line info (lIgnore)
+ // - Otherwise return the specified line info
+ returnStringVal := func(t lineType) (*lineInfo, error) {
+ p.popToken()
+ if t2 != "" {
+ return &lineInfo{t: t, k: t1, vs: t2}, nil
+ }
+ return &lineInfo{t: lIgnore}, nil
+ }
+
+ // Helper closure for all path tokens (allow/disallow), common behaviour:
+ // - Consume t2 token
+ // - If empty, return ignore line info (lIgnore)
+ // - Otherwise, normalize the path (add a leading "/" unless the path starts
+ //   with "/" or "*", and trim trailing "*" characters)
+ // - Detect if wildcards ("*" or "$") are present; if so, compile into a regexp
+ // - Return the specified line info
+ returnPathVal := func(t lineType) (*lineInfo, error) {
+ p.popToken()
+ if t2 != "" {
+ if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") {
+ t2 = "/" + t2
+ }
+ t2 = strings.TrimRightFunc(t2, isAsterisk)
+ // From google's spec:
+ // Google, Bing, Yahoo, and Ask support a limited form of
+ // "wildcards" for path values. These are:
+ // * designates 0 or more instances of any valid character
+ // $ designates the end of the URL
+ if strings.ContainsAny(t2, "*$") {
+ // This is a pattern, so it must be compiled into a regexp.
+ // Escape the string before compiling.
+ t2 = regexp.QuoteMeta(t2)
+ t2 = strings.Replace(t2, `\*`, `.*`, -1)
+ t2 = strings.Replace(t2, `\$`, `$`, -1)
+ if r, e := regexp.Compile(t2); e != nil {
+ return nil, e
+ } else {
+ return &lineInfo{t: t, k: t1, vr: r}, nil
+ }
+ } else {
+ // Simple string path
+ return &lineInfo{t: t, k: t1, vs: t2}, nil
+ }
+ }
+ return &lineInfo{t: lIgnore}, nil
+ }
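+
+ // Worked examples of the normalization above:
+ //   "fish*"       -> "/fish" (leading "/" added, trailing "*" trimmed;
+ //                    a plain prefix string, no regexp needed)
+ //   "/fish*.php$" -> regexp "/fish.*\.php$" (QuoteMeta escapes ".",
+ //                    then "\*" becomes ".*" and "\$" becomes "$")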
+
+ switch strings.ToLower(t1) {
+ case tokEOL:
+ // Don't consume t2; skip the EOL token and continue parsing
+ return &lineInfo{t: lIgnore}, nil
+
+ case "user-agent", "useragent":
+ // From google's spec:
+ // Handling of <field> elements with simple errors / typos (eg "useragent"
+ // instead of "user-agent") is undefined and may be interpreted as correct
+ // directives by some user-agents.
+ // The user-agent value is case-insensitive.
+ t2 = strings.ToLower(t2)
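+ // For example, "User-Agent: GoogleBot" is stored under the key "googlebot".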
+ return returnStringVal(lUserAgent)
+
+ case "disallow":
+ // From google's spec:
+ // When no path is specified, the directive is ignored (so an empty Disallow
+ // CAN act as an allow, since allow is the default; the actual result depends
+ // on the other rules in the group).
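+ // In this parser that is handled by the empty-value check in returnPathVal,
+ // which returns lIgnore instead of a deny-all rule.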
+ return returnPathVal(lDisallow)
+
+ case "allow":
+ // From google's spec:
+ // When no path is specified, the directive is ignored.
+ return returnPathVal(lAllow)
+
+ case "host":
+ // The Host directive specifies the main mirror of the site.
+ // Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host
+ return returnStringVal(lHost)
+
+ case "sitemap":
+ // Non-group field; it applies to the host as a whole, not to a specific user-agent
+ return returnStringVal(lSitemap)
+
+ case "crawl-delay", "crawldelay":
+ // From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions
+ // Several major crawlers support a Crawl-delay parameter, set to the
+ // number of seconds to wait between successive requests to the same server.
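+ // For example, "Crawl-delay: 1.5" parses to vf = 1.5, which parseAll
+ // converts to a 1.5s time.Duration; negative, infinite and NaN values
+ // are rejected below.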
+ p.popToken()
+ if cd, e := strconv.ParseFloat(t2, 64); e != nil {
+ return nil, e
+ } else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) {
+ return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2)
+ } else {
+ return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil
+ }
+ }
+
+ // Consume t2 token
+ p.popToken()
+ return &lineInfo{t: lUnknown, k: t1}, nil
+}
+
+func (p *parser) popToken() (tok string, ok bool) {
+ tok, ok = p.peekToken()
+ if !ok {
+ return
+ }
+ p.pos++
+ return tok, true
+}
+
+func (p *parser) peekToken() (tok string, ok bool) {
+ if p.pos >= len(p.tokens) {
+ return "", false
+ }
+ return p.tokens[p.pos], true
+}
+
+func isAsterisk(r rune) bool {
+ return r == '*'
+}