path: root/vendor/github.com/temoto/robotstxt/scanner.go
author     alemi.dev <notify@alemi.dev>  2025-02-11 13:16:14 +0100
committer  GitHub <noreply@github.com>   2025-02-11 13:16:14 +0100
commit     d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf (patch)
tree       a4eab190784a8d456226788404a71f263ecbdc49 /vendor/github.com/temoto/robotstxt/scanner.go
parent     [bugfix] Suggest lowercase username when creating via OIDC (#3780) (diff)
download   gotosocial-d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf.tar.xz
[bug] respect `X-Robots-Tag` and `robots.txt` on api/v1/instance and nodeinfo (#3756)
* feat: check X-Robots-Tag when accessing /api/v1/instance or /nodeinfo endpoints; respect X-Robots-Tag
* chore: go fmt ./...
* Check robots.txt as well, add tests

Co-authored-by: tobi <tobi.smethurst@protonmail.com>
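The gist of the change, sketched below in plain Go rather than GoToSocial's actual middleware: before scraping a remote instance's /api/v1/instance or /nodeinfo endpoint, consult that host's robots.txt via the vendored temoto/robotstxt package and honour any X-Robots-Tag response header. The helper names allowedByRobots and noIndex, the example.org host, and the "gotosocial" user-agent string are illustrative assumptions, not code from this commit.

package main

import (
	"fmt"
	"net/http"
	"strings"

	"github.com/temoto/robotstxt"
)

// allowedByRobots reports whether userAgent may fetch path on host,
// according to that host's robots.txt (hypothetical helper).
func allowedByRobots(host, path, userAgent string) (bool, error) {
	resp, err := http.Get("https://" + host + "/robots.txt")
	if err != nil {
		return false, err
	}
	defer resp.Body.Close()
	// FromResponse folds in the HTTP status code, e.g. a missing
	// robots.txt (404) is treated as "allow everything".
	robots, err := robotstxt.FromResponse(resp)
	if err != nil {
		return false, err
	}
	return robots.TestAgent(path, userAgent), nil
}

// noIndex reports whether an X-Robots-Tag response header asks us not to index.
func noIndex(h http.Header) bool {
	for _, v := range h.Values("X-Robots-Tag") {
		if strings.Contains(strings.ToLower(v), "noindex") {
			return true
		}
	}
	return false
}

func main() {
	const host, path, agent = "example.org", "/nodeinfo/2.0", "gotosocial"
	if ok, err := allowedByRobots(host, path, agent); err != nil || !ok {
		fmt.Println("skipping: disallowed by robots.txt or robots.txt unreachable")
		return
	}
	resp, err := http.Get("https://" + host + path)
	if err != nil {
		return
	}
	defer resp.Body.Close()
	if noIndex(resp.Header) {
		fmt.Println("skipping: X-Robots-Tag says noindex")
		return
	}
	// ...decode and use the response body here...
}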
Diffstat (limited to 'vendor/github.com/temoto/robotstxt/scanner.go')
-rw-r--r--  vendor/github.com/temoto/robotstxt/scanner.go  185
1 file changed, 185 insertions, 0 deletions
diff --git a/vendor/github.com/temoto/robotstxt/scanner.go b/vendor/github.com/temoto/robotstxt/scanner.go
new file mode 100644
index 000000000..6bd98c2ec
--- /dev/null
+++ b/vendor/github.com/temoto/robotstxt/scanner.go
@@ -0,0 +1,185 @@
+package robotstxt
+
+import (
+ "bytes"
+ "fmt"
+ "go/token"
+ "os"
+ "sync"
+ "unicode/utf8"
+)
+
+type byteScanner struct {
+ pos token.Position
+ buf []byte
+ ErrorCount int
+ ch rune
+ Quiet bool
+ keyTokenFound bool
+ lastChunk bool
+}
+
+const tokEOL = "\n"
+
+var WhitespaceChars = []rune{' ', '\t', '\v'}
+var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }}
+
+func newByteScanner(srcname string, quiet bool) *byteScanner {
+ return &byteScanner{
+ Quiet: quiet,
+ ch: -1,
+ pos: token.Position{Filename: srcname},
+ }
+}
+
+func (s *byteScanner) feed(input []byte, end bool) {
+ s.buf = input
+ s.pos.Offset = 0
+ s.pos.Line = 1
+ s.pos.Column = 1
+ s.lastChunk = end
+
+ // Read first char into look-ahead buffer `s.ch`.
+ if !s.nextChar() {
+ return
+ }
+
+ // Skip UTF-8 byte order mark
+ if s.ch == 65279 {
+ s.nextChar()
+ s.pos.Column = 1
+ }
+}
+
+func (s *byteScanner) GetPosition() token.Position {
+ return s.pos
+}
+
+func (s *byteScanner) scan() string {
+ // Note Offset > len, not >=, so we can scan last character.
+ if s.lastChunk && s.pos.Offset > len(s.buf) {
+ return ""
+ }
+
+ s.skipSpace()
+
+ if s.ch == -1 {
+ return ""
+ }
+
+ // EOL
+ if s.isEol() {
+ s.keyTokenFound = false
+ // skip subsequent newline chars
+ for s.ch != -1 && s.isEol() {
+ s.nextChar()
+ }
+ // emit newline as separate token
+ return tokEOL
+ }
+
+ // skip comments
+ if s.ch == '#' {
+ s.keyTokenFound = false
+ s.skipUntilEol()
+ if s.ch == -1 {
+ return ""
+ }
+ // emit newline as separate token
+ return tokEOL
+ }
+
+ // else we found something
+ tok := tokBuffers.Get().(*bytes.Buffer)
+ defer tokBuffers.Put(tok)
+ tok.Reset()
+ tok.WriteRune(s.ch)
+ s.nextChar()
+ for s.ch != -1 && !s.isSpace() && !s.isEol() {
+ // Do not consider ":" to be a token separator if a first key token
+ // has already been found on this line (avoid cutting an absolute URL
+ // after the "http:")
+ if s.ch == ':' && !s.keyTokenFound {
+ s.nextChar()
+ s.keyTokenFound = true
+ break
+ }
+
+ tok.WriteRune(s.ch)
+ s.nextChar()
+ }
+ return tok.String()
+}
+
+func (s *byteScanner) scanAll() []string {
+ results := make([]string, 0, 64) // rough guess at a typical token count
+ for {
+ token := s.scan()
+ if token != "" {
+ results = append(results, token)
+ } else {
+ break
+ }
+ }
+ return results
+}
+
+func (s *byteScanner) error(pos token.Position, msg string) {
+ s.ErrorCount++
+ if !s.Quiet {
+ fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
+ }
+}
+
+func (s *byteScanner) isEol() bool {
+ return s.ch == '\n' || s.ch == '\r'
+}
+
+func (s *byteScanner) isSpace() bool {
+ for _, r := range WhitespaceChars {
+ if s.ch == r {
+ return true
+ }
+ }
+ return false
+}
+
+func (s *byteScanner) skipSpace() {
+ for s.ch != -1 && s.isSpace() {
+ s.nextChar()
+ }
+}
+
+func (s *byteScanner) skipUntilEol() {
+ for s.ch != -1 && !s.isEol() {
+ s.nextChar()
+ }
+ // skip subsequent newline chars
+ for s.ch != -1 && s.isEol() {
+ s.nextChar()
+ }
+}
+
+// Reads next Unicode char.
+func (s *byteScanner) nextChar() bool {
+ if s.pos.Offset >= len(s.buf) {
+ s.ch = -1
+ return false
+ }
+ s.pos.Column++
+ if s.ch == '\n' {
+ s.pos.Line++
+ s.pos.Column = 1
+ }
+ r, w := rune(s.buf[s.pos.Offset]), 1
+ if r >= 0x80 {
+ r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
+ if r == utf8.RuneError && w == 1 {
+ s.error(s.pos, "illegal UTF-8 encoding")
+ }
+ }
+ s.pos.Column++
+ s.pos.Offset += w
+ s.ch = r
+ return true
+}
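
For reference, a minimal sketch of how the scanner above tokenises input. Because byteScanner is unexported, such a snippet would have to live inside package robotstxt (for example in a test file); demoScan is a hypothetical name, not part of the library.

package robotstxt

import "fmt"

// demoScan shows the token stream byteScanner produces for a small
// robots.txt fragment (hypothetical helper, not part of the library).
func demoScan() {
	s := newByteScanner("example", true)
	s.feed([]byte("User-agent: *\nDisallow: /private\n"), true)
	fmt.Printf("%q\n", s.scanAll())
	// Prints: ["User-agent" "*" "\n" "Disallow" "/private" "\n"]
	// The first ":" on each line is consumed as the key/value separator
	// (keyTokenFound), and each end-of-line is emitted as its own "\n" token.
}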