diff options
author | 2025-02-11 13:16:14 +0100 | |
---|---|---|
committer | 2025-02-11 13:16:14 +0100 | |
commit | d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf (patch) | |
tree | a4eab190784a8d456226788404a71f263ecbdc49 /vendor/github.com/temoto/robotstxt/scanner.go | |
parent | [bugfix] Suggest lowercase username when creating via OIDC (#3780) (diff) | |
download | gotosocial-d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf.tar.xz |
[bug] respect `X-Robots-Tag` and `robots.txt` on api/v1/instance and nodeinfo (#3756)
* feat: check X-Robots-Tag
when accessing /api/v1/instance or /nodeinfo endpoints respect
X-Robots-Tag
* chore: go fmt ./...
* Check robots.txt as well, add tests
---------
Co-authored-by: tobi <tobi.smethurst@protonmail.com>
Diffstat (limited to 'vendor/github.com/temoto/robotstxt/scanner.go')
-rw-r--r-- | vendor/github.com/temoto/robotstxt/scanner.go | 185 |
1 files changed, 185 insertions, 0 deletions
diff --git a/vendor/github.com/temoto/robotstxt/scanner.go b/vendor/github.com/temoto/robotstxt/scanner.go new file mode 100644 index 000000000..6bd98c2ec --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/scanner.go @@ -0,0 +1,185 @@ +package robotstxt + +import ( + "bytes" + "fmt" + "go/token" + "os" + "sync" + "unicode/utf8" +) + +type byteScanner struct { + pos token.Position + buf []byte + ErrorCount int + ch rune + Quiet bool + keyTokenFound bool + lastChunk bool +} + +const tokEOL = "\n" + +var WhitespaceChars = []rune{' ', '\t', '\v'} +var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }} + +func newByteScanner(srcname string, quiet bool) *byteScanner { + return &byteScanner{ + Quiet: quiet, + ch: -1, + pos: token.Position{Filename: srcname}, + } +} + +func (s *byteScanner) feed(input []byte, end bool) { + s.buf = input + s.pos.Offset = 0 + s.pos.Line = 1 + s.pos.Column = 1 + s.lastChunk = end + + // Read first char into look-ahead buffer `s.ch`. + if !s.nextChar() { + return + } + + // Skip UTF-8 byte order mark + if s.ch == 65279 { + s.nextChar() + s.pos.Column = 1 + } +} + +func (s *byteScanner) GetPosition() token.Position { + return s.pos +} + +func (s *byteScanner) scan() string { + // Note Offset > len, not >=, so we can scan last character. + if s.lastChunk && s.pos.Offset > len(s.buf) { + return "" + } + + s.skipSpace() + + if s.ch == -1 { + return "" + } + + // EOL + if s.isEol() { + s.keyTokenFound = false + // skip subsequent newline chars + for s.ch != -1 && s.isEol() { + s.nextChar() + } + // emit newline as separate token + return tokEOL + } + + // skip comments + if s.ch == '#' { + s.keyTokenFound = false + s.skipUntilEol() + if s.ch == -1 { + return "" + } + // emit newline as separate token + return tokEOL + } + + // else we found something + tok := tokBuffers.Get().(*bytes.Buffer) + defer tokBuffers.Put(tok) + tok.Reset() + tok.WriteRune(s.ch) + s.nextChar() + for s.ch != -1 && !s.isSpace() && !s.isEol() { + // Do not consider ":" to be a token separator if a first key token + // has already been found on this line (avoid cutting an absolute URL + // after the "http:") + if s.ch == ':' && !s.keyTokenFound { + s.nextChar() + s.keyTokenFound = true + break + } + + tok.WriteRune(s.ch) + s.nextChar() + } + return tok.String() +} + +func (s *byteScanner) scanAll() []string { + results := make([]string, 0, 64) // random guess of average tokens length + for { + token := s.scan() + if token != "" { + results = append(results, token) + } else { + break + } + } + return results +} + +func (s *byteScanner) error(pos token.Position, msg string) { + s.ErrorCount++ + if !s.Quiet { + fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg) + } +} + +func (s *byteScanner) isEol() bool { + return s.ch == '\n' || s.ch == '\r' +} + +func (s *byteScanner) isSpace() bool { + for _, r := range WhitespaceChars { + if s.ch == r { + return true + } + } + return false +} + +func (s *byteScanner) skipSpace() { + for s.ch != -1 && s.isSpace() { + s.nextChar() + } +} + +func (s *byteScanner) skipUntilEol() { + for s.ch != -1 && !s.isEol() { + s.nextChar() + } + // skip subsequent newline chars + for s.ch != -1 && s.isEol() { + s.nextChar() + } +} + +// Reads next Unicode char. +func (s *byteScanner) nextChar() bool { + if s.pos.Offset >= len(s.buf) { + s.ch = -1 + return false + } + s.pos.Column++ + if s.ch == '\n' { + s.pos.Line++ + s.pos.Column = 1 + } + r, w := rune(s.buf[s.pos.Offset]), 1 + if r >= 0x80 { + r, w = utf8.DecodeRune(s.buf[s.pos.Offset:]) + if r == utf8.RuneError && w == 1 { + s.error(s.pos, "illegal UTF-8 encoding") + } + } + s.pos.Column++ + s.pos.Offset += w + s.ch = r + return true +} |