author | 2025-02-11 13:16:14 +0100
---|---
committer | 2025-02-11 13:16:14 +0100
commit | d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf (patch)
tree | a4eab190784a8d456226788404a71f263ecbdc49 /vendor/github.com/temoto/robotstxt/robotstxt.go
parent | [bugfix] Suggest lowercase username when creating via OIDC (#3780) (diff)
download | gotosocial-d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf.tar.xz
[bug] respect `X-Robots-Tag` and `robots.txt` on api/v1/instance and nodeinfo (#3756)
* feat: check X-Robots-Tag
  when accessing the /api/v1/instance or /nodeinfo endpoints, respect X-Robots-Tag
* chore: go fmt ./...
* Check robots.txt as well, add tests
---------
Co-authored-by: tobi <tobi.smethurst@protonmail.com>
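
This page shows only the vendored robots.txt parser; the GoToSocial-side wiring is in other files of the same commit. As a rough, hypothetical sketch of how a caller might honour a remote instance's robots.txt before fetching /api/v1/instance or /nodeinfo (the helper name `allowedByRobots` and the user-agent string are illustrative assumptions, not code from this commit), the vendored API could be used like this:

```go
package main

import (
	"fmt"
	"net/http"

	"github.com/temoto/robotstxt"
)

// allowedByRobots is a hypothetical helper: it fetches https://<host>/robots.txt
// and reports whether the given path may be crawled by the given user agent.
func allowedByRobots(host, path, userAgent string) (bool, error) {
	resp, err := http.Get("https://" + host + "/robots.txt")
	if err != nil {
		// Network failure: err on the side of not fetching.
		return false, err
	}
	defer resp.Body.Close()

	// FromResponse maps 2xx to a parsed file, 4xx to "allow all", 5xx to "disallow all".
	robots, err := robotstxt.FromResponse(resp)
	if err != nil {
		return false, err
	}

	return robots.TestAgent(path, userAgent), nil
}

func main() {
	ok, err := allowedByRobots("example.org", "/api/v1/instance", "gotosocial")
	if err != nil {
		fmt.Println("robots.txt check failed:", err)
		return
	}
	fmt.Println("may fetch /api/v1/instance:", ok)
}
```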
Diffstat (limited to 'vendor/github.com/temoto/robotstxt/robotstxt.go')
-rw-r--r-- | vendor/github.com/temoto/robotstxt/robotstxt.go | 227 |
1 file changed, 227 insertions, 0 deletions
diff --git a/vendor/github.com/temoto/robotstxt/robotstxt.go b/vendor/github.com/temoto/robotstxt/robotstxt.go
new file mode 100644
index 000000000..52d3637c6
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/robotstxt.go
@@ -0,0 +1,227 @@
+// Package robotstxt implements the robots.txt Exclusion Protocol
+// as specified in http://www.robotstxt.org/wc/robots.html
+// with various extensions.
+package robotstxt
+
+// Comments explaining the logic are taken from either the Google's spec:
+// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
+
+import (
+	"bytes"
+	"errors"
+	"io/ioutil"
+	"net/http"
+	"regexp"
+	"strconv"
+	"strings"
+	"time"
+)
+
+type RobotsData struct {
+	// private
+	groups      map[string]*Group
+	allowAll    bool
+	disallowAll bool
+	Host        string
+	Sitemaps    []string
+}
+
+type Group struct {
+	rules      []*rule
+	Agent      string
+	CrawlDelay time.Duration
+}
+
+type rule struct {
+	path    string
+	allow   bool
+	pattern *regexp.Regexp
+}
+
+type ParseError struct {
+	Errs []error
+}
+
+func newParseError(errs []error) *ParseError {
+	return &ParseError{errs}
+}
+
+func (e ParseError) Error() string {
+	var b bytes.Buffer
+
+	b.WriteString("Parse error(s): " + "\n")
+	for _, er := range e.Errs {
+		b.WriteString(er.Error() + "\n")
+	}
+	return b.String()
+}
+
+var allowAll = &RobotsData{allowAll: true}
+var disallowAll = &RobotsData{disallowAll: true}
+var emptyGroup = &Group{}
+
+func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) {
+	switch {
+	case statusCode >= 200 && statusCode < 300:
+		return FromBytes(body)
+
+	// From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt
+	//
+	// Google treats all 4xx errors in the same way and assumes that no valid
+	// robots.txt file exists. It is assumed that there are no restrictions.
+	// This is a "full allow" for crawling. Note: this includes 401
+	// "Unauthorized" and 403 "Forbidden" HTTP result codes.
+	case statusCode >= 400 && statusCode < 500:
+		return allowAll, nil
+
+	// From Google's spec:
+	// Server errors (5xx) are seen as temporary errors that result in a "full
+	// disallow" of crawling.
+	case statusCode >= 500 && statusCode < 600:
+		return disallowAll, nil
+	}
+
+	return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode))
+}
+
+func FromStatusAndString(statusCode int, body string) (*RobotsData, error) {
+	return FromStatusAndBytes(statusCode, []byte(body))
+}
+
+func FromResponse(res *http.Response) (*RobotsData, error) {
+	if res == nil {
+		// Edge case, if res is nil, return nil data
+		return nil, nil
+	}
+	buf, e := ioutil.ReadAll(res.Body)
+	if e != nil {
+		return nil, e
+	}
+	return FromStatusAndBytes(res.StatusCode, buf)
+}
+
+func FromBytes(body []byte) (r *RobotsData, err error) {
+	var errs []error
+
+	// special case (probably not worth optimization?)
+	trimmed := bytes.TrimSpace(body)
+	if len(trimmed) == 0 {
+		return allowAll, nil
+	}
+
+	sc := newByteScanner("bytes", true)
+	//sc.Quiet = !print_errors
+	sc.feed(body, true)
+	tokens := sc.scanAll()
+
+	// special case worth optimization
+	if len(tokens) == 0 {
+		return allowAll, nil
+	}
+
+	r = &RobotsData{}
+	parser := newParser(tokens)
+	r.groups, r.Host, r.Sitemaps, errs = parser.parseAll()
+	if len(errs) > 0 {
+		return nil, newParseError(errs)
+	}
+
+	return r, nil
+}
+
+func FromString(body string) (r *RobotsData, err error) {
+	return FromBytes([]byte(body))
+}
+
+func (r *RobotsData) TestAgent(path, agent string) bool {
+	if r.allowAll {
+		return true
+	}
+	if r.disallowAll {
+		return false
+	}
+
+	// Find a group of rules that applies to this agent
+	// From Google's spec:
+	// The user-agent is non-case-sensitive.
+	g := r.FindGroup(agent)
+	return g.Test(path)
+}
+
+// FindGroup searches block of declarations for specified user-agent.
+// From Google's spec:
+// Only one group of group-member records is valid for a particular crawler.
+// The crawler must determine the correct group of records by finding the group
+// with the most specific user-agent that still matches. All other groups of
+// records are ignored by the crawler. The user-agent is non-case-sensitive.
+// The order of the groups within the robots.txt file is irrelevant.
+func (r *RobotsData) FindGroup(agent string) (ret *Group) {
+	var prefixLen int
+
+	agent = strings.ToLower(agent)
+	if ret = r.groups["*"]; ret != nil {
+		// Weakest match possible
+		prefixLen = 1
+	}
+	for a, g := range r.groups {
+		if a != "*" && strings.HasPrefix(agent, a) {
+			if l := len(a); l > prefixLen {
+				prefixLen = l
+				ret = g
+			}
+		}
+	}
+
+	if ret == nil {
+		return emptyGroup
+	}
+	return
+}
+
+func (g *Group) Test(path string) bool {
+	if r := g.findRule(path); r != nil {
+		return r.allow
+	}
+
+	// From Google's spec:
+	// By default, there are no restrictions for crawling for the designated crawlers.
+	return true
+}
+
+// From Google's spec:
+// The path value is used as a basis to determine whether or not a rule applies
+// to a specific URL on a site. With the exception of wildcards, the path is
+// used to match the beginning of a URL (and any valid URLs that start with the
+// same path).
+//
+// At a group-member level, in particular for allow and disallow directives,
+// the most specific rule based on the length of the [path] entry will trump
+// the less specific (shorter) rule. The order of precedence for rules with
+// wildcards is undefined.
+func (g *Group) findRule(path string) (ret *rule) {
+	var prefixLen int
+
+	for _, r := range g.rules {
+		if r.pattern != nil {
+			if r.pattern.MatchString(path) {
+				// Consider this a match equal to the length of the pattern.
+				// From Google's spec:
+				// The order of precedence for rules with wildcards is undefined.
+				if l := len(r.pattern.String()); l > prefixLen {
+					prefixLen = l
+					ret = r
+				}
+			}
+		} else if r.path == "/" && prefixLen == 0 {
+			// Weakest match possible
+			prefixLen = 1
+			ret = r
+		} else if strings.HasPrefix(path, r.path) {
+			if l := len(r.path); l > prefixLen {
+				prefixLen = l
+				ret = r
+			}
+		}
+	}
+	return
+}
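
For reference, a small self-contained sketch of the matching behaviour implemented above (most specific user-agent group, then longest matching path rule). The sample robots.txt content is invented for illustration, and the sketch assumes the accompanying scanner/parser files vendored in the same commit:

```go
package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	// A made-up robots.txt: everything under /api/ is disallowed for all
	// agents, except /api/v1/instance, which is explicitly allowed.
	robots, err := robotstxt.FromString(
		"User-agent: *\nDisallow: /api/\nAllow: /api/v1/instance\n")
	if err != nil {
		panic(err)
	}

	// TestAgent combines FindGroup (most specific user-agent wins) with
	// Group.Test (longest matching path rule wins).
	fmt.Println(robots.TestAgent("/api/v1/instance", "gotosocial")) // true: longer Allow rule wins
	fmt.Println(robots.TestAgent("/api/v1/accounts", "gotosocial")) // false: Disallow /api/ applies
	fmt.Println(robots.TestAgent("/nodeinfo/2.0", "gotosocial"))    // true: no rule matches, default allow
}
```

Note that `TestAgent` falls back to allowing access when no rule matches, mirroring the "no restrictions by default" behaviour quoted from Google's spec in the comments above.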