| author | 2025-02-11 13:16:14 +0100 |
|---|---|
| committer | 2025-02-11 13:16:14 +0100 |
| commit | d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf (patch) |
| tree | a4eab190784a8d456226788404a71f263ecbdc49 |
| parent | [bugfix] Suggest lowercase username when creating via OIDC (#3780) (diff) |
| download | gotosocial-d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf.tar.xz |
[bug] respect `X-Robots-Tag` and `robots.txt` on api/v1/instance and nodeinfo (#3756)
* feat: respect `X-Robots-Tag` when accessing the /api/v1/instance or /nodeinfo endpoints
* chore: go fmt ./...
* Check robots.txt as well, add tests
---------
Co-authored-by: tobi <tobi.smethurst@protonmail.com>
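
The header-side half of this change boils down to one small check: after fetching an endpoint, refuse to use the response if any `X-Robots-Tag` header carries a `noindex` directive. A minimal standalone sketch of that check follows; the helper name `usableByRobotsTag` and the example URL are illustrative only, not part of the codebase, which performs the same check inline in the transport.

```go
package main

import (
	"fmt"
	"net/http"
	"slices"
	"strings"
)

// usableByRobotsTag reports whether a fetched response may be used,
// i.e. no X-Robots-Tag header value contains a "noindex" directive.
func usableByRobotsTag(resp *http.Response) bool {
	tags := resp.Header.Values("X-Robots-Tag")
	return !slices.ContainsFunc(tags, func(tag string) bool {
		return strings.Contains(tag, "noindex")
	})
}

func main() {
	resp, err := http.Get("https://example.org/.well-known/nodeinfo")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	defer resp.Body.Close()

	if !usableByRobotsTag(resp) {
		fmt.Println("can't use fetched response: robots tags disallow it")
		return
	}
	fmt.Println("response may be used")
}
```

In the diff itself this check is repeated for /api/v1/instance, /.well-known/nodeinfo, and the discovered nodeinfo document; the robots.txt half of the change is handled separately before any of those requests are made (see the sketch after the file list below).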
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | README.md | 4 |
| -rw-r--r-- | go.mod | 1 |
| -rw-r--r-- | go.sum | 2 |
| -rw-r--r-- | internal/api/util/mime.go | 12 |
| -rw-r--r-- | internal/federation/dereferencing/instance.go | 6 |
| -rw-r--r-- | internal/federation/dereferencing/instance_test.go | 94 |
| -rw-r--r-- | internal/transport/derefinstance.go | 136 |
| -rw-r--r-- | internal/transport/derefrobots.go | 91 |
| -rw-r--r-- | testrig/transportcontroller.go | 162 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/.gitignore | 15 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/.golangci.yml | 20 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/.travis.yml | 30 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/LICENSE | 21 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/README.rst | 115 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/codecov.yml | 2 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/fuzz.go | 29 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/parser.go | 271 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/robotstxt.go | 227 |
| -rw-r--r-- | vendor/github.com/temoto/robotstxt/scanner.go | 185 |
| -rw-r--r-- | vendor/modules.txt | 5 |
20 files changed, 1404 insertions, 24 deletions
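
Before the per-endpoint header checks, the transport now fetches `/robots.txt` once (the new `DereferenceRobots` method) and consults it for each path it intends to dereference, using the newly vendored `temoto/robotstxt` library. A hedged sketch of how that library is queried, assuming a placeholder host and a placeholder user-agent string ("gotosocial"); the real code builds the request through its signing transport rather than plain `http.Get`:

```go
package main

import (
	"fmt"
	"net/http"

	"github.com/temoto/robotstxt"
)

func main() {
	// Fetch the remote instance's robots.txt.
	resp, err := http.Get("https://example.org/robots.txt")
	if err != nil {
		fmt.Println("fetch failed:", err)
		return
	}
	defer resp.Body.Close()

	// FromResponse parses the body according to the status code:
	// 2xx -> parse rules, 4xx -> allow all, 5xx -> disallow all.
	// It does not close the response body itself.
	robots, err := robotstxt.FromResponse(resp)
	if err != nil {
		fmt.Println("parse failed:", err)
		return
	}

	// Test each endpoint we intend to dereference against the
	// rules that apply to our user agent.
	for _, path := range []string{
		"/api/v1/instance",
		"/.well-known/nodeinfo",
		"/nodeinfo/2.1",
	} {
		fmt.Printf("%s allowed: %v\n", path, robots.TestAgent(path, "gotosocial"))
	}
}
```

A disallowed path is surfaced as a "not permitted" error and logged at debug level, and dereferencing falls back to the bare-minimum instance model, as exercised by the new tests in instance_test.go below.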
@@ -43,7 +43,8 @@ Here's a screenshot of the instance landing page! Check out the project's [offic  - [Known Issues](#known-issues)  - [Installing GoToSocial](#installing-gotosocial)    - [Supported Platforms](#supported-platforms) -    - [FreeBSD](#freebsd) +    - [64-bit](#64-bit) +    - [BSDs](#bsds)      - [32-bit](#32-bit)      - [OpenBSD](#openbsd)    - [Stable Releases](#stable-releases) @@ -434,6 +435,7 @@ The following open source libraries, frameworks, and tools are used by GoToSocia    - [superseriousbusiness/exif-terminator](https://codeberg.org/superseriousbusiness/exif-terminator); EXIF data removal. [GNU AGPL v3 LICENSE](https://spdx.org/licenses/AGPL-3.0-or-later.html).    - [superseriousbusiness/httpsig](https://github.com/superseriousbusiness/httpsig) forked from [go-fed/httpsig](https://github.com/go-fed/httpsig); secure HTTP signature library. [BSD-3-Clause License](https://spdx.org/licenses/BSD-3-Clause.html).    - [superseriousbusiness/oauth2](https://github.com/superseriousbusiness/oauth2) forked from [go-oauth2/oauth2](https://github.com/go-oauth2/oauth2); OAuth server framework and token handling. [MIT License](https://spdx.org/licenses/MIT.html). +- [temoto/robotstxt](https://github.com/temoto/robotstxt); robots.txt parsing. [MIT License](https://spdx.org/licenses/MIT.html).  - [tdewolff/minify](https://github.com/tdewolff/minify); HTML minification for Markdown-submitted posts. [MIT License](https://spdx.org/licenses/MIT.html).  - [uber-go/automaxprocs](https://github.com/uber-go/automaxprocs); GOMAXPROCS automation. [MIT License](https://spdx.org/licenses/MIT.html).  - [ulule/limiter](https://github.com/ulule/limiter); http rate limit middleware. [MIT License](https://spdx.org/licenses/MIT.html). @@ -60,6 +60,7 @@ require (  	github.com/superseriousbusiness/oauth2/v4 v4.3.2-SSB.0.20230227143000-f4900831d6c8  	github.com/tdewolff/minify/v2 v2.21.3  	github.com/technologize/otel-go-contrib v1.1.1 +	github.com/temoto/robotstxt v1.1.2  	github.com/tetratelabs/wazero v1.8.2  	github.com/tomnomnom/linkheader v0.0.0-20180905144013-02ca5825eb80  	github.com/ulule/limiter/v3 v3.11.2 @@ -540,6 +540,8 @@ github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739 h1:IkjBCtQOOjIn03  github.com/tdewolff/test v1.0.11-0.20240106005702-7de5f7df4739/go.mod h1:XPuWBzvdUzhCuxWO1ojpXsyzsA5bFoS3tO/Q3kFuTG8=  github.com/technologize/otel-go-contrib v1.1.1 h1:wZH9aSPNWZWIkEh3vfaKfMb15AJ80jJ1aVj/4GZdqIw=  github.com/technologize/otel-go-contrib v1.1.1/go.mod h1:dCN/wj2WyUO8aFZFdIN+6tfJHImjTML/8r2YVYAy3So= +github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg= +github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=  github.com/tetratelabs/wazero v1.8.2 h1:yIgLR/b2bN31bjxwXHD8a3d+BogigR952csSDdLYEv4=  github.com/tetratelabs/wazero v1.8.2/go.mod h1:yAI0XTsMBhREkM/YDAK/zNou3GoiAce1P6+rp/wQhjs=  github.com/tidwall/btree v0.0.0-20191029221954-400434d76274 h1:G6Z6HvJuPjG6XfNGi/feOATzeJrfgTNJY+rGrHbA04E= diff --git a/internal/api/util/mime.go b/internal/api/util/mime.go index 4d8946e5d..da96be786 100644 --- a/internal/api/util/mime.go +++ b/internal/api/util/mime.go @@ -36,6 +36,8 @@ const (  	TextHTML          = `text/html`  	TextCSS           = `text/css`  	TextCSV           = `text/csv` +	TextPlain         = `text/plain` +	UTF8              = `utf-8`  )  // JSONContentType returns whether is application/json(;charset=utf-8)? content-type. 
@@ -74,6 +76,14 @@ func XMLXRDContentType(ct string) bool {  		p[0] == AppXMLXRD  } +// TextPlainContentType returns whether is text/plain(;charset=utf-8)? content-type. +func TextPlainContentType(ct string) bool { +	p := splitContentType(ct) +	p, ok := isUTF8ContentType(p) +	return ok && len(p) == 1 && +		p[0] == TextPlain +} +  // ASContentType returns whether is valid ActivityStreams content-types:  // - application/activity+json  // - application/ld+json;profile=https://w3.org/ns/activitystreams @@ -118,7 +128,7 @@ func NodeInfo2ContentType(ct string) bool {  // type parts list, removes it and returns whether is utf-8.  func isUTF8ContentType(p []string) ([]string, bool) {  	const charset = "charset=" -	const charsetUTF8 = charset + "utf-8" +	const charsetUTF8 = charset + UTF8  	for i, part := range p {  		// Only handle charset slice parts. diff --git a/internal/federation/dereferencing/instance.go b/internal/federation/dereferencing/instance.go index 90ce074cd..66d0a21be 100644 --- a/internal/federation/dereferencing/instance.go +++ b/internal/federation/dereferencing/instance.go @@ -19,20 +19,20 @@ package dereferencing  import (  	"context" -	"fmt"  	"net/url" +	"github.com/superseriousbusiness/gotosocial/internal/gtserror"  	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"  )  func (d *Dereferencer) GetRemoteInstance(ctx context.Context, username string, remoteInstanceURI *url.URL) (*gtsmodel.Instance, error) {  	if blocked, err := d.state.DB.IsDomainBlocked(ctx, remoteInstanceURI.Host); blocked || err != nil { -		return nil, fmt.Errorf("GetRemoteInstance: domain %s is blocked", remoteInstanceURI.Host) +		return nil, gtserror.Newf("domain %s is blocked", remoteInstanceURI.Host)  	}  	transport, err := d.transportController.NewTransportForUsername(ctx, username)  	if err != nil { -		return nil, fmt.Errorf("transport err: %s", err) +		return nil, gtserror.Newf("transport err: %w", err)  	}  	return transport.DereferenceInstance(ctx, remoteInstanceURI) diff --git a/internal/federation/dereferencing/instance_test.go b/internal/federation/dereferencing/instance_test.go new file mode 100644 index 000000000..15f075479 --- /dev/null +++ b/internal/federation/dereferencing/instance_test.go @@ -0,0 +1,94 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program.  If not, see <http://www.gnu.org/licenses/>. 
+ +package dereferencing_test + +import ( +	"context" +	"net/url" +	"testing" + +	"github.com/stretchr/testify/suite" +	"github.com/superseriousbusiness/gotosocial/internal/gtscontext" +	"github.com/superseriousbusiness/gotosocial/testrig" +) + +type InstanceTestSuite struct { +	DereferencerStandardTestSuite +} + +func (suite *InstanceTestSuite) TestDerefInstance() { +	type testCase struct { +		instanceIRI      *url.URL +		expectedSoftware string +	} + +	for _, tc := range []testCase{ +		{ +			// Fossbros anonymous doesn't shield their nodeinfo or +			// well-known or anything so we should be able to fetch. +			instanceIRI:      testrig.URLMustParse("https://fossbros-anonymous.io"), +			expectedSoftware: "Hellsoft 6.6.6", +		}, +		{ +			// Furtive nerds forbids /nodeinfo using +			// robots.txt so we should get bare minimum only. +			// +			// Debug-level logs should show something like: +			// +			//   - "can't fetch /nodeinfo/2.1: robots.txt disallows it"  +			instanceIRI:      testrig.URLMustParse("https://furtive-nerds.example.org"), +			expectedSoftware: "", +		}, +		{ +			// Robotic furtive nerds forbids *everything* using +			// robots.txt so we should get bare minimum only. +			// +			// Debug-level logs should show something like: +			// +			//   - "can't fetch api/v1/instance: robots.txt disallows it"  +			//   - "can't fetch .well-known/nodeinfo: robots.txt disallows it" +			instanceIRI:      testrig.URLMustParse("https://robotic.furtive-nerds.example.org"), +			expectedSoftware: "", +		}, +		{ +			// Really furtive nerds forbids .well-known/nodeinfo using +			// X-Robots-Tagheaders, so we should get bare minimum only. +			// +			// Debug-level logs should show something like: +			// +			//   - "can't use fetched .well-known/nodeinfo: robots tags disallows it"  +			instanceIRI:      testrig.URLMustParse("https://really.furtive-nerds.example.org"), +			expectedSoftware: "", +		}, +	} { +		instance, err := suite.dereferencer.GetRemoteInstance( +			gtscontext.SetFastFail(context.Background()), +			suite.testAccounts["admin_account"].Username, +			tc.instanceIRI, +		) +		if err != nil { +			suite.FailNow(err.Error()) +		} + +		suite.Equal(tc.expectedSoftware, instance.Version) +	} +} + +func TestInstanceTestSuite(t *testing.T) { +	suite.Run(t, new(InstanceTestSuite)) +} diff --git a/internal/transport/derefinstance.go b/internal/transport/derefinstance.go index bbeb51000..e7971093d 100644 --- a/internal/transport/derefinstance.go +++ b/internal/transport/derefinstance.go @@ -25,6 +25,7 @@ import (  	"io"  	"net/http"  	"net/url" +	"slices"  	"strings"  	apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model" @@ -35,18 +36,29 @@ import (  	"github.com/superseriousbusiness/gotosocial/internal/log"  	"github.com/superseriousbusiness/gotosocial/internal/util"  	"github.com/superseriousbusiness/gotosocial/internal/validate" +	"github.com/temoto/robotstxt"  )  func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gtsmodel.Instance, error) { +	// Try to fetch robots.txt to check +	// if we're allowed to try endpoints: +	// +	//   - /api/v1/instance +	//   - /.well-known/nodeinfo +	//   - /nodeinfo/2.0|2.1 endpoints +	robotsTxt, err := t.DereferenceRobots(ctx, iri.Scheme, iri.Host) +	if err != nil { +		log.Debugf(ctx, "couldn't fetch robots.txt from %s: %v", iri.Host, err) +	} +  	var i *gtsmodel.Instance -	var err error  	// First try to dereference using /api/v1/instance.  	
// This will provide the most complete picture of an instance, and avoid unnecessary api calls.  	//  	// This will only work with Mastodon-api compatible instances: Mastodon, some Pleroma instances, GoToSocial.  	log.Debugf(ctx, "trying to dereference instance %s by /api/v1/instance", iri.Host) -	i, err = dereferenceByAPIV1Instance(ctx, t, iri) +	i, err = t.dereferenceByAPIV1Instance(ctx, iri, robotsTxt)  	if err == nil {  		log.Debugf(ctx, "successfully dereferenced instance using /api/v1/instance")  		return i, nil @@ -56,7 +68,7 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts  	// If that doesn't work, try to dereference using /.well-known/nodeinfo.  	// This will involve two API calls and return less info overall, but should be more widely compatible.  	log.Debugf(ctx, "trying to dereference instance %s by /.well-known/nodeinfo", iri.Host) -	i, err = dereferenceByNodeInfo(ctx, t, iri) +	i, err = t.dereferenceByNodeInfo(ctx, iri, robotsTxt)  	if err == nil {  		log.Debugf(ctx, "successfully dereferenced instance using /.well-known/nodeinfo")  		return i, nil @@ -77,11 +89,23 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts  	}, nil  } -func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) { +func (t *transport) dereferenceByAPIV1Instance( +	ctx context.Context, +	iri *url.URL, +	robotsTxt *robotstxt.RobotsData, +) (*gtsmodel.Instance, error) { +	const path = "api/v1/instance" + +	// Bail if we're not allowed to fetch this endpoint. +	if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) { +		err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path) +		return nil, gtserror.SetNotPermitted(err) +	} +  	cleanIRI := &url.URL{  		Scheme: iri.Scheme,  		Host:   iri.Host, -		Path:   "api/v1/instance", +		Path:   path,  	}  	// Build IRI just once @@ -105,6 +129,18 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)  		return nil, gtserror.NewFromResponse(resp)  	} +	// Ensure that we can use data returned from this endpoint. +	robots := resp.Header.Values("X-Robots-Tag") +	if slices.ContainsFunc( +		robots, +		func(key string) bool { +			return strings.Contains(key, "noindex") +		}, +	) { +		err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path) +		return nil, gtserror.SetNotPermitted(err) +	} +  	// Ensure that the incoming request content-type is expected.  	if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {  		err := gtserror.Newf("non json response type: %s", ct) @@ -118,7 +154,8 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)  		return nil, errors.New("response bytes was len 0")  	} -	// try to parse the returned bytes directly into an Instance model +	// Try to parse the returned bytes +	// directly into an Instance model.  	apiResp := &apimodel.InstanceV1{}  	if err := json.Unmarshal(b, apiResp); err != nil {  		return nil, err @@ -149,24 +186,32 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)  	return i, nil  } -func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) { -	niIRI, err := callNodeInfoWellKnown(c, t, iri) +func (t *transport) dereferenceByNodeInfo( +	ctx context.Context, +	iri *url.URL, +	robotsTxt *robotstxt.RobotsData, +) (*gtsmodel.Instance, error) { +	// Retrieve the nodeinfo IRI from .well-known/nodeinfo. 
+	niIRI, err := t.callNodeInfoWellKnown(ctx, iri, robotsTxt)  	if err != nil { -		return nil, fmt.Errorf("dereferenceByNodeInfo: error during initial call to well-known nodeinfo: %s", err) +		return nil, gtserror.Newf("error during initial call to .well-known: %w", err)  	} -	ni, err := callNodeInfo(c, t, niIRI) +	// Use the returned nodeinfo IRI to make a followup call. +	ni, err := t.callNodeInfo(ctx, niIRI, robotsTxt)  	if err != nil { -		return nil, fmt.Errorf("dereferenceByNodeInfo: error doing second call to nodeinfo uri %s: %s", niIRI.String(), err) +		return nil, gtserror.Newf("error during call to %s: %w", niIRI.String(), err)  	} -	// we got a response of some kind! take what we can from it... +	// We got a response of some kind! +	// +	// Start building out the bare minimum +	// instance model, we'll add to it if we can.  	id, err := id.NewRandomULID()  	if err != nil { -		return nil, fmt.Errorf("dereferenceByNodeInfo: error creating new id for instance %s: %s", iri.Host, err) +		return nil, gtserror.Newf("error creating new id for instance %s: %w", iri.Host, err)  	} -	// this is the bare minimum instance we'll return, and we'll add more stuff to it if we can  	i := >smodel.Instance{  		ID:     id,  		Domain: iri.Host, @@ -234,11 +279,23 @@ func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsm  	return i, nil  } -func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*url.URL, error) { +func (t *transport) callNodeInfoWellKnown( +	ctx context.Context, +	iri *url.URL, +	robotsTxt *robotstxt.RobotsData, +) (*url.URL, error) { +	const path = ".well-known/nodeinfo" + +	// Bail if we're not allowed to fetch this endpoint. +	if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) { +		err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path) +		return nil, gtserror.SetNotPermitted(err) +	} +  	cleanIRI := &url.URL{  		Scheme: iri.Scheme,  		Host:   iri.Host, -		Path:   ".well-known/nodeinfo", +		Path:   path,  	}  	// Build IRI just once @@ -261,7 +318,19 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur  		return nil, gtserror.NewFromResponse(resp)  	} -	// Ensure that the incoming request content-type is expected. +	// Ensure that we can use data returned from this endpoint. +	robots := resp.Header.Values("X-Robots-Tag") +	if slices.ContainsFunc( +		robots, +		func(key string) bool { +			return strings.Contains(key, "noindex") +		}, +	) { +		err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path) +		return nil, gtserror.SetNotPermitted(err) +	} + +	// Ensure that the returned content-type is expected.  	if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {  		err := gtserror.Newf("non json response type: %s", ct)  		return nil, gtserror.SetMalformed(err) @@ -279,7 +348,8 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur  		return nil, gtserror.Newf("could not unmarshal server response as WellKnownResponse: %w", err)  	} -	// look through the links for the first one that matches the nodeinfo schema, this is what we need +	// Look through the links for the first one that +	// matches nodeinfo schema, this is what we need.  	
var nodeinfoHref *url.URL  	for _, l := range wellKnownResp.Links {  		if l.Href == "" || !strings.HasPrefix(l.Rel, "http://nodeinfo.diaspora.software/ns/schema/2") { @@ -297,7 +367,23 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur  	return nodeinfoHref, nil  } -func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.Nodeinfo, error) { +func (t *transport) callNodeInfo( +	ctx context.Context, +	iri *url.URL, +	robotsTxt *robotstxt.RobotsData, +) (*apimodel.Nodeinfo, error) { +	// Normalize robots.txt test path. +	testPath := iri.Path +	if !strings.HasPrefix(testPath, "/") { +		testPath = "/" + testPath +	} + +	// Bail if we're not allowed to fetch this endpoint. +	if robotsTxt != nil && !robotsTxt.TestAgent(testPath, t.controller.userAgent) { +		err := gtserror.Newf("can't fetch %s: robots.txt disallows it", testPath) +		return nil, gtserror.SetNotPermitted(err) +	} +  	// Build IRI just once  	iriStr := iri.String() @@ -324,6 +410,18 @@ func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.No  		return nil, gtserror.SetMalformed(err)  	} +	// Ensure that we can use data returned from this endpoint. +	robots := resp.Header.Values("X-Robots-Tag") +	if slices.ContainsFunc( +		robots, +		func(key string) bool { +			return strings.Contains(key, "noindex") +		}, +	) { +		err := gtserror.Newf("can't use fetched %s: robots tags disallows it", iri.Path) +		return nil, gtserror.SetNotPermitted(err) +	} +  	b, err := io.ReadAll(resp.Body)  	if err != nil {  		return nil, err diff --git a/internal/transport/derefrobots.go b/internal/transport/derefrobots.go new file mode 100644 index 000000000..d6c4f3058 --- /dev/null +++ b/internal/transport/derefrobots.go @@ -0,0 +1,91 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program.  If not, see <http://www.gnu.org/licenses/>. + +package transport + +import ( +	"context" +	"net/http" +	"net/url" + +	"codeberg.org/gruf/go-bytesize" +	"codeberg.org/gruf/go-iotools" +	apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" +	"github.com/superseriousbusiness/gotosocial/internal/gtserror" +	"github.com/temoto/robotstxt" +) + +func (t *transport) DereferenceRobots(ctx context.Context, protocol string, host string) (*robotstxt.RobotsData, error) { +	robotsIRI := &url.URL{ +		Scheme: protocol, +		Host:   host, +		Path:   "robots.txt", +	} + +	// Build IRI just once +	iriStr := robotsIRI.String() + +	// Prepare new HTTP request to endpoint +	req, err := http.NewRequestWithContext(ctx, "GET", iriStr, nil) +	if err != nil { +		return nil, err +	} + +	// We want text/plain utf-8 encoding. 
+	// +	// https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method +	req.Header.Add("Accept", apiutil.TextPlain) +	req.Header.Add("Accept-Charset", apiutil.UTF8) + +	// Perform the HTTP request +	rsp, err := t.GET(req) +	if err != nil { +		return nil, err +	} + +	// Ensure a non-error status response. +	if rsp.StatusCode != http.StatusOK { +		err := gtserror.NewFromResponse(rsp) +		_ = rsp.Body.Close() // close early. +		return nil, err +	} + +	// Ensure that the incoming request content-type is expected. +	if ct := rsp.Header.Get("Content-Type"); !apiutil.TextPlainContentType(ct) { +		err := gtserror.Newf("non text/plain response: %s", ct) +		_ = rsp.Body.Close() // close early. +		return nil, gtserror.SetMalformed(err) +	} + +	// Limit the robots.txt size to 500KiB +	// +	// https://www.rfc-editor.org/rfc/rfc9309.html#name-limits +	const maxsz = int64(500 * bytesize.KiB) + +	// Check body claims to be within size limit. +	if rsp.ContentLength > maxsz { +		_ = rsp.Body.Close()       // close early. +		sz := bytesize.Size(maxsz) //nolint:gosec +		return nil, gtserror.Newf("robots.txt body exceeds max size %s", sz) +	} + +	// Update response body with maximum size. +	rsp.Body, _, _ = iotools.UpdateReadCloserLimit(rsp.Body, maxsz) +	defer rsp.Body.Close() + +	return robotstxt.FromResponse(rsp) +} diff --git a/testrig/transportcontroller.go b/testrig/transportcontroller.go index b886e5c40..00f8ad2a6 100644 --- a/testrig/transportcontroller.go +++ b/testrig/transportcontroller.go @@ -133,6 +133,12 @@ func NewMockHTTPClient(do func(req *http.Request) (*http.Response, error), relat  			responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = WebfingerResponse(req)  		} else if strings.Contains(reqURLString, ".well-known/host-meta") {  			responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = HostMetaResponse(req) +		} else if strings.Contains(reqURLString, ".well-known/nodeinfo") { +			responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = WellKnownNodeInfoResponse(req) +		} else if strings.Contains(reqURLString, "/robots.txt") { +			responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = RobotsTxtResponse(req) +		} else if strings.Contains(reqURLString, "/nodeinfo/2.1") { +			responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = NodeInfoResponse(req)  		} else if strings.Contains(reqURLString, "lists.example.org") {  			responseCode, responseBytes, responseContentType, responseContentLength, extraHeaders = DomainPermissionSubscriptionResponse(req)  		} else if note, ok := mockHTTPClient.TestRemoteStatuses[reqURLString]; ok { @@ -318,6 +324,162 @@ func HostMetaResponse(req *http.Request) (  	return  } +func WellKnownNodeInfoResponse(req *http.Request) ( +	responseCode int, +	responseBytes []byte, +	responseContentType string, +	responseContentLength int, +	extraHeaders map[string]string, +) { +	var wkr *apimodel.WellKnownResponse + +	switch req.URL.String() { +	case "https://fossbros-anonymous.io/.well-known/nodeinfo": +		wkr = &apimodel.WellKnownResponse{ +			Links: []apimodel.Link{ +				{ +					Rel:  "http://nodeinfo.diaspora.software/ns/schema/2.1", +					Href: "https://fossbros-anonymous.io/nodeinfo/2.1", +				}, +			}, +		} +	case "https://furtive-nerds.example.org/.well-known/nodeinfo": +		wkr = &apimodel.WellKnownResponse{ +			Links: []apimodel.Link{ +				{ +					Rel:  "http://nodeinfo.diaspora.software/ns/schema/2.1", +			
		Href: "https://furtive-nerds.example.org/nodeinfo/2.1", +				}, +			}, +		} +	case "https://really.furtive-nerds.example.org/.well-known/nodeinfo": +		wkr = &apimodel.WellKnownResponse{ +			Links: []apimodel.Link{ +				{ +					Rel:  "http://nodeinfo.diaspora.software/ns/schema/2.1", +					Href: "https://really.furtive-nerds.example.org/nodeinfo/2.1", +				}, +			}, +		} +		extraHeaders = map[string]string{"X-Robots-Tag": "noindex,nofollow"} +	default: +		log.Debugf(nil, "nodeinfo response not available for %s", req.URL) +		responseCode = http.StatusNotFound +		responseBytes = []byte(``) +		responseContentType = "application/json" +		responseContentLength = len(responseBytes) +		return +	} + +	niJSON, err := json.Marshal(wkr) +	if err != nil { +		panic(err) +	} +	responseCode = http.StatusOK +	responseBytes = niJSON +	responseContentType = "application/json" +	responseContentLength = len(niJSON) + +	return +} + +func NodeInfoResponse(req *http.Request) ( +	responseCode int, +	responseBytes []byte, +	responseContentType string, +	responseContentLength int, +	extraHeaders map[string]string, +) { +	var ni *apimodel.Nodeinfo + +	switch req.URL.String() { +	case "https://fossbros-anonymous.io/nodeinfo/2.1": +		ni = &apimodel.Nodeinfo{ +			Version: "2.1", +			Software: apimodel.NodeInfoSoftware{ +				Name:       "Hellsoft", +				Version:    "6.6.6", +				Repository: "https://forge.hellsoft.fossbros-anonymous.io", +				Homepage:   "https://hellsoft.fossbros-anonymous.io", +			}, +			Protocols: []string{"activitypub"}, +		} +	case "https://furtive-nerds.example.org/nodeinfo/2.1": +		ni = &apimodel.Nodeinfo{ +			Version: "2.1", +			Software: apimodel.NodeInfoSoftware{ +				Name:       "GoToSocial", +				Version:    "1.3.1.2", +				Repository: "https://github.com/superseriousbusiness/gotosocial", +				Homepage:   "https://docs.gotosocial.org", +			}, +			Protocols: []string{"activitypub"}, +		} +	case "https://really.furtive-nerds.example.org/nodeinfo/2.1": +		ni = &apimodel.Nodeinfo{ +			Version: "2.1", +			Software: apimodel.NodeInfoSoftware{ +				Name:       "GoToSocial", +				Version:    "1.3.1.2", +				Repository: "https://github.com/superseriousbusiness/gotosocial", +				Homepage:   "https://docs.gotosocial.org", +			}, +			Protocols: []string{"activitypub"}, +		} +	default: +		log.Debugf(nil, "nodeinfo response not available for %s", req.URL) +		responseCode = http.StatusNotFound +		responseBytes = []byte(``) +		responseContentType = "application/json" +		responseContentLength = len(responseBytes) +		return +	} + +	niJSON, err := json.Marshal(ni) +	if err != nil { +		panic(err) +	} +	responseCode = http.StatusOK +	responseBytes = niJSON +	responseContentType = "application/json" +	responseContentLength = len(niJSON) + +	return +} + +func RobotsTxtResponse(req *http.Request) ( +	responseCode int, +	responseBytes []byte, +	responseContentType string, +	responseContentLength int, +	extraHeaders map[string]string, +) { +	var robots string + +	switch req.URL.String() { +	case "https://furtive-nerds.example.org/robots.txt": +		// Disallow nodeinfo. +		robots = "User-agent: *\nDisallow: /nodeinfo" +	case "https://robotic.furtive-nerds.example.org/robots.txt": +		// Disallow everything. 
+		robots = "User-agent: *\nDisallow: /" +	default: +		log.Debugf(nil, "robots response not available for %s", req.URL) +		responseCode = http.StatusNotFound +		responseBytes = []byte(``) +		responseContentType = "text/plain" +		responseContentLength = len(responseBytes) +		return +	} + +	responseCode = http.StatusOK +	responseBytes = []byte(robots) +	responseContentType = "text/plain" +	responseContentLength = len(responseBytes) + +	return +} +  func WebfingerResponse(req *http.Request) (  	responseCode int,  	responseBytes []byte, diff --git a/vendor/github.com/temoto/robotstxt/.gitignore b/vendor/github.com/temoto/robotstxt/.gitignore new file mode 100644 index 000000000..6205f9eae --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/.gitignore @@ -0,0 +1,15 @@ +*.cgo?.* +*.o +*.so +*.sublime-* +*.zip +.DS_Store +.idea/ +.tags* +_cgo_* +_gofuzz/crashers/ +_gofuzz/suppressions/ +_obj +_test +coverage.txt +robots.txt-check/robots.txt-check diff --git a/vendor/github.com/temoto/robotstxt/.golangci.yml b/vendor/github.com/temoto/robotstxt/.golangci.yml new file mode 100644 index 000000000..24e5858fa --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/.golangci.yml @@ -0,0 +1,20 @@ +linters: +  enable: +    - goconst +    - gofmt +    - gosec +    - maligned +    - prealloc +    - staticcheck +  disable: +    - deadcode +    - structcheck +    - varcheck + +linters-settings: +  gofmt: +    simplify: true +  govet: +    check-shadowing: true +  maligned: +    suggest-new: true diff --git a/vendor/github.com/temoto/robotstxt/.travis.yml b/vendor/github.com/temoto/robotstxt/.travis.yml new file mode 100644 index 000000000..ad90dac37 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/.travis.yml @@ -0,0 +1,30 @@ +cache: +  go: true +  directories: +  - $HOME/.cache +  - $HOME/bin +  - $HOME/gopath/pkg/mod +language: go +go: +- 1.11 +- 1.12 +- 1.13 +- 1.14 +- 1.x +- master +install: true +script: GO111MODULE=on go test -race + +matrix: +  include: +  - go: 1.x +    env: task=coverage +    script: GO111MODULE=on go test -race -covermode=atomic -coverprofile=coverage.txt +    after_success: bash <(curl -s https://codecov.io/bash) +  - go: 1.x +    env: task=bench +    script: GO111MODULE=on ./script/bench +  - go: 1.x +    install: curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | bash -s -- -b $HOME/bin v1.19.1 +    env: task=clean +    script: GO111MODULE=on ./script/clean diff --git a/vendor/github.com/temoto/robotstxt/LICENSE b/vendor/github.com/temoto/robotstxt/LICENSE new file mode 100644 index 000000000..c125145b6 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/LICENSE @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2010 Sergey Shepelev <temotor@gmail.com> + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/github.com/temoto/robotstxt/README.rst b/vendor/github.com/temoto/robotstxt/README.rst new file mode 100644 index 000000000..92f1ae161 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/README.rst @@ -0,0 +1,115 @@ +What +==== + +This is a robots.txt exclusion protocol implementation for Go language (golang). + + +Build +===== + +To build and run tests run `go test` in source directory. + + +Contribute +========== + +Warm welcome. + +* If desired, add your name in README.rst, section Who. +* Run `script/test && script/clean && echo ok` +* You can ignore linter warnings, but everything else must pass. +* Send your change as pull request or just a regular patch to current maintainer (see section Who). + +Thank you. + + +Usage +===== + +As usual, no special installation is required, just + +    import "github.com/temoto/robotstxt" + +run `go get` and you're ready. + +1. Parse +^^^^^^^^ + +First of all, you need to parse robots.txt data. You can do it with +functions `FromBytes(body []byte) (*RobotsData, error)` or same for `string`:: + +    robots, err := robotstxt.FromBytes([]byte("User-agent: *\nDisallow:")) +    robots, err := robotstxt.FromString("User-agent: *\nDisallow:") + +As of 2012-10-03, `FromBytes` is the most efficient method, everything else +is a wrapper for this core function. + +There are few convenient constructors for various purposes: + +* `FromResponse(*http.Response) (*RobotsData, error)` to init robots data +from HTTP response. It *does not* call `response.Body.Close()`:: + +    robots, err := robotstxt.FromResponse(resp) +    resp.Body.Close() +    if err != nil { +        log.Println("Error parsing robots.txt:", err.Error()) +    } + +* `FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error)` or +`FromStatusAndString` if you prefer to read bytes (string) yourself. +Passing status code applies following logic in line with Google's interpretation +of robots.txt files: + +    * status 2xx  -> parse body with `FromBytes` and apply rules listed there. +    * status 4xx  -> allow all (even 401/403, as recommended by Google). +    * other (5xx) -> disallow all, consider this a temporary unavailability. + +2. Query +^^^^^^^^ + +Parsing robots.txt content builds a kind of logic database, which you can +query with `(r *RobotsData) TestAgent(url, agent string) (bool)`. + +Explicit passing of agent is useful if you want to query for different agents. For +single agent users there is an efficient option: `RobotsData.FindGroup(userAgent string)` +returns a structure with `.Test(path string)` method and `.CrawlDelay time.Duration`. + +Simple query with explicit user agent. Each call will scan all rules. + +:: + +    allow := robots.TestAgent("/", "FooBot") + +Or query several paths against same user agent for performance. + +:: + +    group := robots.FindGroup("BarBot") +    group.Test("/") +    group.Test("/download.mp3") +    group.Test("/news/article-2012-1") + + +Who +=== + +Honorable contributors (in undefined order): + +    * Ilya Grigorik (igrigorik) +    * Martin Angers (PuerkitoBio) +    * Micha Gorelick (mynameisfiber) + +Initial commit and other: Sergey Shepelev temotor@gmail.com + + +Flair +===== + +.. 
image:: https://travis-ci.org/temoto/robotstxt.svg?branch=master +    :target: https://travis-ci.org/temoto/robotstxt + +.. image:: https://codecov.io/gh/temoto/robotstxt/branch/master/graph/badge.svg +    :target: https://codecov.io/gh/temoto/robotstxt + +.. image:: https://goreportcard.com/badge/github.com/temoto/robotstxt +    :target: https://goreportcard.com/report/github.com/temoto/robotstxt diff --git a/vendor/github.com/temoto/robotstxt/codecov.yml b/vendor/github.com/temoto/robotstxt/codecov.yml new file mode 100644 index 000000000..b80be28f6 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/codecov.yml @@ -0,0 +1,2 @@ +codecov: +  token: 6bf9c7eb-69ff-4b74-8464-e2fb452d0f04 diff --git a/vendor/github.com/temoto/robotstxt/fuzz.go b/vendor/github.com/temoto/robotstxt/fuzz.go new file mode 100644 index 000000000..de4b0587a --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/fuzz.go @@ -0,0 +1,29 @@ +// +build gofuzz + +package robotstxt + +import "testing/quick" + +func Fuzz(data []byte) int { +	r, err := FromBytes(data) +	if err != nil { +		if r != nil { +			panic("r != nil on error") +		} +		return 0 +	} + +	// FindGroup must never return nil +	f1 := func(agent string) bool { return r.FindGroup(agent) != nil } +	if err := quick.Check(f1, nil); err != nil { +		panic(err) +	} + +	// just check TestAgent doesn't panic +	f2 := func(path, agent string) bool { r.TestAgent(path, agent); return true } +	if err := quick.Check(f2, nil); err != nil { +		panic(err) +	} + +	return 1 +} diff --git a/vendor/github.com/temoto/robotstxt/parser.go b/vendor/github.com/temoto/robotstxt/parser.go new file mode 100644 index 000000000..46eb6b184 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/parser.go @@ -0,0 +1,271 @@ +package robotstxt + +// Comments explaining the logic are taken from either the google's spec: +// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt +// +// or the Wikipedia's entry on robots.txt: +// http://en.wikipedia.org/wiki/Robots.txt + +import ( +	"fmt" +	"io" +	"math" +	"regexp" +	"strconv" +	"strings" +	"time" +) + +type lineType uint + +const ( +	lIgnore lineType = iota +	lUnknown +	lUserAgent +	lAllow +	lDisallow +	lCrawlDelay +	lSitemap +	lHost +) + +type parser struct { +	tokens []string +	pos    int +} + +type lineInfo struct { +	t  lineType       // Type of line key +	k  string         // String representation of the type of key +	vs string         // String value of the key +	vf float64        // Float value of the key +	vr *regexp.Regexp // Regexp value of the key +} + +func newParser(tokens []string) *parser { +	return &parser{tokens: tokens} +} + +func parseGroupMap(groups map[string]*Group, agents []string, fun func(*Group)) { +	var g *Group +	for _, a := range agents { +		if g = groups[a]; g == nil { +			g = new(Group) +			groups[a] = g +		} +		fun(g) +	} +} + +func (p *parser) parseAll() (groups map[string]*Group, host string, sitemaps []string, errs []error) { +	groups = make(map[string]*Group, 16) +	agents := make([]string, 0, 4) +	isEmptyGroup := true + +	// Reset internal fields, tokens are assigned at creation time, never change +	p.pos = 0 + +	for { +		if li, err := p.parseLine(); err != nil { +			if err == io.EOF { +				break +			} +			errs = append(errs, err) +		} else { +			switch li.t { +			case lUserAgent: +				// Two successive user-agent lines are part of the same group. 
+				if !isEmptyGroup { +					// End previous group +					agents = make([]string, 0, 4) +				} +				if len(agents) == 0 { +					isEmptyGroup = true +				} +				agents = append(agents, li.vs) + +			case lDisallow: +				// Error if no current group +				if len(agents) == 0 { +					errs = append(errs, fmt.Errorf("Disallow before User-agent at token #%d.", p.pos)) +				} else { +					isEmptyGroup = false +					var r *rule +					if li.vr != nil { +						r = &rule{"", false, li.vr} +					} else { +						r = &rule{li.vs, false, nil} +					} +					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) }) +				} + +			case lAllow: +				// Error if no current group +				if len(agents) == 0 { +					errs = append(errs, fmt.Errorf("Allow before User-agent at token #%d.", p.pos)) +				} else { +					isEmptyGroup = false +					var r *rule +					if li.vr != nil { +						r = &rule{"", true, li.vr} +					} else { +						r = &rule{li.vs, true, nil} +					} +					parseGroupMap(groups, agents, func(g *Group) { g.rules = append(g.rules, r) }) +				} + +			case lHost: +				host = li.vs + +			case lSitemap: +				sitemaps = append(sitemaps, li.vs) + +			case lCrawlDelay: +				if len(agents) == 0 { +					errs = append(errs, fmt.Errorf("Crawl-delay before User-agent at token #%d.", p.pos)) +				} else { +					isEmptyGroup = false +					delay := time.Duration(li.vf * float64(time.Second)) +					parseGroupMap(groups, agents, func(g *Group) { g.CrawlDelay = delay }) +				} +			} +		} +	} +	return +} + +func (p *parser) parseLine() (li *lineInfo, err error) { +	t1, ok1 := p.popToken() +	if !ok1 { +		// proper EOF +		return nil, io.EOF +	} + +	t2, ok2 := p.peekToken() +	if !ok2 { +		// EOF, no value associated with the token, so ignore token and return +		return nil, io.EOF +	} + +	// Helper closure for all string-based tokens, common behaviour: +	// - Consume t2 token +	// - If empty, return unknown line info +	// - Otherwise return the specified line info +	returnStringVal := func(t lineType) (*lineInfo, error) { +		p.popToken() +		if t2 != "" { +			return &lineInfo{t: t, k: t1, vs: t2}, nil +		} +		return &lineInfo{t: lIgnore}, nil +	} + +	// Helper closure for all path tokens (allow/disallow), common behaviour: +	// - Consume t2 token +	// - If empty, return unknown line info +	// - Otherwise, normalize the path (add leading "/" if missing, remove trailing "*") +	// - Detect if wildcards are present, if so, compile into a regexp +	// - Return the specified line info +	returnPathVal := func(t lineType) (*lineInfo, error) { +		p.popToken() +		if t2 != "" { +			if !strings.HasPrefix(t2, "*") && !strings.HasPrefix(t2, "/") { +				t2 = "/" + t2 +			} +			t2 = strings.TrimRightFunc(t2, isAsterisk) +			// From google's spec: +			// Google, Bing, Yahoo, and Ask support a limited form of +			// "wildcards" for path values. These are: +			//   * designates 0 or more instances of any valid character +			//   $ designates the end of the URL +			if strings.ContainsAny(t2, "*$") { +				// Must compile a regexp, this is a pattern. +				// Escape string before compile. 
+				t2 = regexp.QuoteMeta(t2) +				t2 = strings.Replace(t2, `\*`, `.*`, -1) +				t2 = strings.Replace(t2, `\$`, `$`, -1) +				if r, e := regexp.Compile(t2); e != nil { +					return nil, e +				} else { +					return &lineInfo{t: t, k: t1, vr: r}, nil +				} +			} else { +				// Simple string path +				return &lineInfo{t: t, k: t1, vs: t2}, nil +			} +		} +		return &lineInfo{t: lIgnore}, nil +	} + +	switch strings.ToLower(t1) { +	case tokEOL: +		// Don't consume t2 and continue parsing +		return &lineInfo{t: lIgnore}, nil + +	case "user-agent", "useragent": +		// From google's spec: +		// Handling of <field> elements with simple errors / typos (eg "useragent" +		// instead of "user-agent") is undefined and may be interpreted as correct +		// directives by some user-agents. +		// The user-agent is non-case-sensitive. +		t2 = strings.ToLower(t2) +		return returnStringVal(lUserAgent) + +	case "disallow": +		// From google's spec: +		// When no path is specified, the directive is ignored (so an empty Disallow +		// CAN be an allow, since allow is the default. The actual result depends +		// on the other rules in the group). +		return returnPathVal(lDisallow) + +	case "allow": +		// From google's spec: +		// When no path is specified, the directive is ignored. +		return returnPathVal(lAllow) + +	case "host": +		// Host directive to specify main site mirror +		// Read more: https://help.yandex.com/webmaster/controlling-robot/robots-txt.xml#host +		return returnStringVal(lHost) + +	case "sitemap": +		// Non-group field, applies to the host as a whole, not to a specific user-agent +		return returnStringVal(lSitemap) + +	case "crawl-delay", "crawldelay": +		// From http://en.wikipedia.org/wiki/Robots_exclusion_standard#Nonstandard_extensions +		// Several major crawlers support a Crawl-delay parameter, set to the +		// number of seconds to wait between successive requests to the same server. +		p.popToken() +		if cd, e := strconv.ParseFloat(t2, 64); e != nil { +			return nil, e +		} else if cd < 0 || math.IsInf(cd, 0) || math.IsNaN(cd) { +			return nil, fmt.Errorf("Crawl-delay invalid value '%s'", t2) +		} else { +			return &lineInfo{t: lCrawlDelay, k: t1, vf: cd}, nil +		} +	} + +	// Consume t2 token +	p.popToken() +	return &lineInfo{t: lUnknown, k: t1}, nil +} + +func (p *parser) popToken() (tok string, ok bool) { +	tok, ok = p.peekToken() +	if !ok { +		return +	} +	p.pos++ +	return tok, true +} + +func (p *parser) peekToken() (tok string, ok bool) { +	if p.pos >= len(p.tokens) { +		return "", false +	} +	return p.tokens[p.pos], true +} + +func isAsterisk(r rune) bool { +	return r == '*' +} diff --git a/vendor/github.com/temoto/robotstxt/robotstxt.go b/vendor/github.com/temoto/robotstxt/robotstxt.go new file mode 100644 index 000000000..52d3637c6 --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/robotstxt.go @@ -0,0 +1,227 @@ +// Package robotstxt implements the robots.txt Exclusion Protocol +// as specified in http://www.robotstxt.org/wc/robots.html +// with various extensions. 
+package robotstxt + +// Comments explaining the logic are taken from either the Google's spec: +// https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt + +import ( +	"bytes" +	"errors" +	"io/ioutil" +	"net/http" +	"regexp" +	"strconv" +	"strings" +	"time" +) + +type RobotsData struct { +	// private +	groups      map[string]*Group +	allowAll    bool +	disallowAll bool +	Host        string +	Sitemaps    []string +} + +type Group struct { +	rules      []*rule +	Agent      string +	CrawlDelay time.Duration +} + +type rule struct { +	path    string +	allow   bool +	pattern *regexp.Regexp +} + +type ParseError struct { +	Errs []error +} + +func newParseError(errs []error) *ParseError { +	return &ParseError{errs} +} + +func (e ParseError) Error() string { +	var b bytes.Buffer + +	b.WriteString("Parse error(s): " + "\n") +	for _, er := range e.Errs { +		b.WriteString(er.Error() + "\n") +	} +	return b.String() +} + +var allowAll = &RobotsData{allowAll: true} +var disallowAll = &RobotsData{disallowAll: true} +var emptyGroup = &Group{} + +func FromStatusAndBytes(statusCode int, body []byte) (*RobotsData, error) { +	switch { +	case statusCode >= 200 && statusCode < 300: +		return FromBytes(body) + +	// From https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt +	// +	// Google treats all 4xx errors in the same way and assumes that no valid +	// robots.txt file exists. It is assumed that there are no restrictions. +	// This is a "full allow" for crawling. Note: this includes 401 +	// "Unauthorized" and 403 "Forbidden" HTTP result codes. +	case statusCode >= 400 && statusCode < 500: +		return allowAll, nil + +	// From Google's spec: +	// Server errors (5xx) are seen as temporary errors that result in a "full +	// disallow" of crawling. +	case statusCode >= 500 && statusCode < 600: +		return disallowAll, nil +	} + +	return nil, errors.New("Unexpected status: " + strconv.Itoa(statusCode)) +} + +func FromStatusAndString(statusCode int, body string) (*RobotsData, error) { +	return FromStatusAndBytes(statusCode, []byte(body)) +} + +func FromResponse(res *http.Response) (*RobotsData, error) { +	if res == nil { +		// Edge case, if res is nil, return nil data +		return nil, nil +	} +	buf, e := ioutil.ReadAll(res.Body) +	if e != nil { +		return nil, e +	} +	return FromStatusAndBytes(res.StatusCode, buf) +} + +func FromBytes(body []byte) (r *RobotsData, err error) { +	var errs []error + +	// special case (probably not worth optimization?) +	trimmed := bytes.TrimSpace(body) +	if len(trimmed) == 0 { +		return allowAll, nil +	} + +	sc := newByteScanner("bytes", true) +	//sc.Quiet = !print_errors +	sc.feed(body, true) +	tokens := sc.scanAll() + +	// special case worth optimization +	if len(tokens) == 0 { +		return allowAll, nil +	} + +	r = &RobotsData{} +	parser := newParser(tokens) +	r.groups, r.Host, r.Sitemaps, errs = parser.parseAll() +	if len(errs) > 0 { +		return nil, newParseError(errs) +	} + +	return r, nil +} + +func FromString(body string) (r *RobotsData, err error) { +	return FromBytes([]byte(body)) +} + +func (r *RobotsData) TestAgent(path, agent string) bool { +	if r.allowAll { +		return true +	} +	if r.disallowAll { +		return false +	} + +	// Find a group of rules that applies to this agent +	// From Google's spec: +	// The user-agent is non-case-sensitive. +	g := r.FindGroup(agent) +	return g.Test(path) +} + +// FindGroup searches block of declarations for specified user-agent. 
+// From Google's spec: +// Only one group of group-member records is valid for a particular crawler. +// The crawler must determine the correct group of records by finding the group +// with the most specific user-agent that still matches. All other groups of +// records are ignored by the crawler. The user-agent is non-case-sensitive. +// The order of the groups within the robots.txt file is irrelevant. +func (r *RobotsData) FindGroup(agent string) (ret *Group) { +	var prefixLen int + +	agent = strings.ToLower(agent) +	if ret = r.groups["*"]; ret != nil { +		// Weakest match possible +		prefixLen = 1 +	} +	for a, g := range r.groups { +		if a != "*" && strings.HasPrefix(agent, a) { +			if l := len(a); l > prefixLen { +				prefixLen = l +				ret = g +			} +		} +	} + +	if ret == nil { +		return emptyGroup +	} +	return +} + +func (g *Group) Test(path string) bool { +	if r := g.findRule(path); r != nil { +		return r.allow +	} + +	// From Google's spec: +	// By default, there are no restrictions for crawling for the designated crawlers. +	return true +} + +// From Google's spec: +// The path value is used as a basis to determine whether or not a rule applies +// to a specific URL on a site. With the exception of wildcards, the path is +// used to match the beginning of a URL (and any valid URLs that start with the +// same path). +// +// At a group-member level, in particular for allow and disallow directives, +// the most specific rule based on the length of the [path] entry will trump +// the less specific (shorter) rule. The order of precedence for rules with +// wildcards is undefined. +func (g *Group) findRule(path string) (ret *rule) { +	var prefixLen int + +	for _, r := range g.rules { +		if r.pattern != nil { +			if r.pattern.MatchString(path) { +				// Consider this a match equal to the length of the pattern. +				// From Google's spec: +				// The order of precedence for rules with wildcards is undefined. +				if l := len(r.pattern.String()); l > prefixLen { +					prefixLen = l +					ret = r +				} +			} +		} else if r.path == "/" && prefixLen == 0 { +			// Weakest match possible +			prefixLen = 1 +			ret = r +		} else if strings.HasPrefix(path, r.path) { +			if l := len(r.path); l > prefixLen { +				prefixLen = l +				ret = r +			} +		} +	} +	return +} diff --git a/vendor/github.com/temoto/robotstxt/scanner.go b/vendor/github.com/temoto/robotstxt/scanner.go new file mode 100644 index 000000000..6bd98c2ec --- /dev/null +++ b/vendor/github.com/temoto/robotstxt/scanner.go @@ -0,0 +1,185 @@ +package robotstxt + +import ( +	"bytes" +	"fmt" +	"go/token" +	"os" +	"sync" +	"unicode/utf8" +) + +type byteScanner struct { +	pos           token.Position +	buf           []byte +	ErrorCount    int +	ch            rune +	Quiet         bool +	keyTokenFound bool +	lastChunk     bool +} + +const tokEOL = "\n" + +var WhitespaceChars = []rune{' ', '\t', '\v'} +var tokBuffers = sync.Pool{New: func() interface{} { return bytes.NewBuffer(make([]byte, 32)) }} + +func newByteScanner(srcname string, quiet bool) *byteScanner { +	return &byteScanner{ +		Quiet: quiet, +		ch:    -1, +		pos:   token.Position{Filename: srcname}, +	} +} + +func (s *byteScanner) feed(input []byte, end bool) { +	s.buf = input +	s.pos.Offset = 0 +	s.pos.Line = 1 +	s.pos.Column = 1 +	s.lastChunk = end + +	// Read first char into look-ahead buffer `s.ch`. 
+	if !s.nextChar() { +		return +	} + +	// Skip UTF-8 byte order mark +	if s.ch == 65279 { +		s.nextChar() +		s.pos.Column = 1 +	} +} + +func (s *byteScanner) GetPosition() token.Position { +	return s.pos +} + +func (s *byteScanner) scan() string { +	// Note Offset > len, not >=, so we can scan last character. +	if s.lastChunk && s.pos.Offset > len(s.buf) { +		return "" +	} + +	s.skipSpace() + +	if s.ch == -1 { +		return "" +	} + +	// EOL +	if s.isEol() { +		s.keyTokenFound = false +		// skip subsequent newline chars +		for s.ch != -1 && s.isEol() { +			s.nextChar() +		} +		// emit newline as separate token +		return tokEOL +	} + +	// skip comments +	if s.ch == '#' { +		s.keyTokenFound = false +		s.skipUntilEol() +		if s.ch == -1 { +			return "" +		} +		// emit newline as separate token +		return tokEOL +	} + +	// else we found something +	tok := tokBuffers.Get().(*bytes.Buffer) +	defer tokBuffers.Put(tok) +	tok.Reset() +	tok.WriteRune(s.ch) +	s.nextChar() +	for s.ch != -1 && !s.isSpace() && !s.isEol() { +		// Do not consider ":" to be a token separator if a first key token +		// has already been found on this line (avoid cutting an absolute URL +		// after the "http:") +		if s.ch == ':' && !s.keyTokenFound { +			s.nextChar() +			s.keyTokenFound = true +			break +		} + +		tok.WriteRune(s.ch) +		s.nextChar() +	} +	return tok.String() +} + +func (s *byteScanner) scanAll() []string { +	results := make([]string, 0, 64) // random guess of average tokens length +	for { +		token := s.scan() +		if token != "" { +			results = append(results, token) +		} else { +			break +		} +	} +	return results +} + +func (s *byteScanner) error(pos token.Position, msg string) { +	s.ErrorCount++ +	if !s.Quiet { +		fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg) +	} +} + +func (s *byteScanner) isEol() bool { +	return s.ch == '\n' || s.ch == '\r' +} + +func (s *byteScanner) isSpace() bool { +	for _, r := range WhitespaceChars { +		if s.ch == r { +			return true +		} +	} +	return false +} + +func (s *byteScanner) skipSpace() { +	for s.ch != -1 && s.isSpace() { +		s.nextChar() +	} +} + +func (s *byteScanner) skipUntilEol() { +	for s.ch != -1 && !s.isEol() { +		s.nextChar() +	} +	// skip subsequent newline chars +	for s.ch != -1 && s.isEol() { +		s.nextChar() +	} +} + +// Reads next Unicode char. 
+func (s *byteScanner) nextChar() bool { +	if s.pos.Offset >= len(s.buf) { +		s.ch = -1 +		return false +	} +	s.pos.Column++ +	if s.ch == '\n' { +		s.pos.Line++ +		s.pos.Column = 1 +	} +	r, w := rune(s.buf[s.pos.Offset]), 1 +	if r >= 0x80 { +		r, w = utf8.DecodeRune(s.buf[s.pos.Offset:]) +		if r == utf8.RuneError && w == 1 { +			s.error(s.pos, "illegal UTF-8 encoding") +		} +	} +	s.pos.Column++ +	s.pos.Offset += w +	s.ch = r +	return true +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 49ca611b2..04314f34f 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -412,6 +412,8 @@ github.com/jackc/puddle/v2/internal/genstack  # github.com/jessevdk/go-flags v1.5.0  ## explicit; go 1.15  github.com/jessevdk/go-flags +# github.com/jimsmart/grobotstxt v1.0.3 +## explicit; go 1.14  # github.com/jinzhu/inflection v1.0.0  ## explicit  github.com/jinzhu/inflection @@ -831,6 +833,9 @@ github.com/tdewolff/parse/v2/strconv  # github.com/technologize/otel-go-contrib v1.1.1  ## explicit; go 1.17  github.com/technologize/otel-go-contrib/otelginmetrics +# github.com/temoto/robotstxt v1.1.2 +## explicit; go 1.11 +github.com/temoto/robotstxt  # github.com/tetratelabs/wazero v1.8.2  ## explicit; go 1.21  github.com/tetratelabs/wazero  | 
