diff options
author | 2025-02-11 13:16:14 +0100 | |
---|---|---|
committer | 2025-02-11 13:16:14 +0100 | |
commit | d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf (patch) | |
tree | a4eab190784a8d456226788404a71f263ecbdc49 /internal | |
parent | [bugfix] Suggest lowercase username when creating via OIDC (#3780) (diff) | |
download | gotosocial-d0de3ad49260ad2f87d02ce1307b1f20e88a1fdf.tar.xz |
[bug] respect `X-Robots-Tag` and `robots.txt` on api/v1/instance and nodeinfo (#3756)
* feat: check X-Robots-Tag
when accessing /api/v1/instance or /nodeinfo endpoints respect
X-Robots-Tag
* chore: go fmt ./...
* Check robots.txt as well, add tests
---------
Co-authored-by: tobi <tobi.smethurst@protonmail.com>
Diffstat (limited to 'internal')
-rw-r--r-- | internal/api/util/mime.go | 12 | ||||
-rw-r--r-- | internal/federation/dereferencing/instance.go | 6 | ||||
-rw-r--r-- | internal/federation/dereferencing/instance_test.go | 94 | ||||
-rw-r--r-- | internal/transport/derefinstance.go | 136 | ||||
-rw-r--r-- | internal/transport/derefrobots.go | 91 |
5 files changed, 316 insertions, 23 deletions
diff --git a/internal/api/util/mime.go b/internal/api/util/mime.go index 4d8946e5d..da96be786 100644 --- a/internal/api/util/mime.go +++ b/internal/api/util/mime.go @@ -36,6 +36,8 @@ const ( TextHTML = `text/html` TextCSS = `text/css` TextCSV = `text/csv` + TextPlain = `text/plain` + UTF8 = `utf-8` ) // JSONContentType returns whether is application/json(;charset=utf-8)? content-type. @@ -74,6 +76,14 @@ func XMLXRDContentType(ct string) bool { p[0] == AppXMLXRD } +// TextPlainContentType returns whether is text/plain(;charset=utf-8)? content-type. +func TextPlainContentType(ct string) bool { + p := splitContentType(ct) + p, ok := isUTF8ContentType(p) + return ok && len(p) == 1 && + p[0] == TextPlain +} + // ASContentType returns whether is valid ActivityStreams content-types: // - application/activity+json // - application/ld+json;profile=https://w3.org/ns/activitystreams @@ -118,7 +128,7 @@ func NodeInfo2ContentType(ct string) bool { // type parts list, removes it and returns whether is utf-8. func isUTF8ContentType(p []string) ([]string, bool) { const charset = "charset=" - const charsetUTF8 = charset + "utf-8" + const charsetUTF8 = charset + UTF8 for i, part := range p { // Only handle charset slice parts. 
diff --git a/internal/federation/dereferencing/instance.go b/internal/federation/dereferencing/instance.go index 90ce074cd..66d0a21be 100644 --- a/internal/federation/dereferencing/instance.go +++ b/internal/federation/dereferencing/instance.go @@ -19,20 +19,20 @@ package dereferencing import ( "context" - "fmt" "net/url" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" ) func (d *Dereferencer) GetRemoteInstance(ctx context.Context, username string, remoteInstanceURI *url.URL) (*gtsmodel.Instance, error) { if blocked, err := d.state.DB.IsDomainBlocked(ctx, remoteInstanceURI.Host); blocked || err != nil { - return nil, fmt.Errorf("GetRemoteInstance: domain %s is blocked", remoteInstanceURI.Host) + return nil, gtserror.Newf("domain %s is blocked", remoteInstanceURI.Host) } transport, err := d.transportController.NewTransportForUsername(ctx, username) if err != nil { - return nil, fmt.Errorf("transport err: %s", err) + return nil, gtserror.Newf("transport err: %w", err) } return transport.DereferenceInstance(ctx, remoteInstanceURI) diff --git a/internal/federation/dereferencing/instance_test.go b/internal/federation/dereferencing/instance_test.go new file mode 100644 index 000000000..15f075479 --- /dev/null +++ b/internal/federation/dereferencing/instance_test.go @@ -0,0 +1,94 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +package dereferencing_test + +import ( + "context" + "net/url" + "testing" + + "github.com/stretchr/testify/suite" + "github.com/superseriousbusiness/gotosocial/internal/gtscontext" + "github.com/superseriousbusiness/gotosocial/testrig" +) + +type InstanceTestSuite struct { + DereferencerStandardTestSuite +} + +func (suite *InstanceTestSuite) TestDerefInstance() { + type testCase struct { + instanceIRI *url.URL + expectedSoftware string + } + + for _, tc := range []testCase{ + { + // Fossbros anonymous doesn't shield their nodeinfo or + // well-known or anything so we should be able to fetch. + instanceIRI: testrig.URLMustParse("https://fossbros-anonymous.io"), + expectedSoftware: "Hellsoft 6.6.6", + }, + { + // Furtive nerds forbids /nodeinfo using + // robots.txt so we should get bare minimum only. + // + // Debug-level logs should show something like: + // + // - "can't fetch /nodeinfo/2.1: robots.txt disallows it" + instanceIRI: testrig.URLMustParse("https://furtive-nerds.example.org"), + expectedSoftware: "", + }, + { + // Robotic furtive nerds forbids *everything* using + // robots.txt so we should get bare minimum only. + // + // Debug-level logs should show something like: + // + // - "can't fetch api/v1/instance: robots.txt disallows it" + // - "can't fetch .well-known/nodeinfo: robots.txt disallows it" + instanceIRI: testrig.URLMustParse("https://robotic.furtive-nerds.example.org"), + expectedSoftware: "", + }, + { + // Really furtive nerds forbids .well-known/nodeinfo using + // X-Robots-Tag headers, so we should get bare minimum only. 
+ // + // Debug-level logs should show something like: + // + // - "can't use fetched .well-known/nodeinfo: robots tags disallows it" + instanceIRI: testrig.URLMustParse("https://really.furtive-nerds.example.org"), + expectedSoftware: "", + }, + } { + instance, err := suite.dereferencer.GetRemoteInstance( + gtscontext.SetFastFail(context.Background()), + suite.testAccounts["admin_account"].Username, + tc.instanceIRI, + ) + if err != nil { + suite.FailNow(err.Error()) + } + + suite.Equal(tc.expectedSoftware, instance.Version) + } +} + +func TestInstanceTestSuite(t *testing.T) { + suite.Run(t, new(InstanceTestSuite)) +} diff --git a/internal/transport/derefinstance.go b/internal/transport/derefinstance.go index bbeb51000..e7971093d 100644 --- a/internal/transport/derefinstance.go +++ b/internal/transport/derefinstance.go @@ -25,6 +25,7 @@ import ( "io" "net/http" "net/url" + "slices" "strings" apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model" @@ -35,18 +36,29 @@ import ( "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/util" "github.com/superseriousbusiness/gotosocial/internal/validate" + "github.com/temoto/robotstxt" ) func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gtsmodel.Instance, error) { + // Try to fetch robots.txt to check + // if we're allowed to try endpoints: + // + // - /api/v1/instance + // - /.well-known/nodeinfo + // - /nodeinfo/2.0|2.1 endpoints + robotsTxt, err := t.DereferenceRobots(ctx, iri.Scheme, iri.Host) + if err != nil { + log.Debugf(ctx, "couldn't fetch robots.txt from %s: %v", iri.Host, err) + } + var i *gtsmodel.Instance - var err error // First try to dereference using /api/v1/instance. // This will provide the most complete picture of an instance, and avoid unnecessary api calls. // // This will only work with Mastodon-api compatible instances: Mastodon, some Pleroma instances, GoToSocial. 
log.Debugf(ctx, "trying to dereference instance %s by /api/v1/instance", iri.Host) - i, err = dereferenceByAPIV1Instance(ctx, t, iri) + i, err = t.dereferenceByAPIV1Instance(ctx, iri, robotsTxt) if err == nil { log.Debugf(ctx, "successfully dereferenced instance using /api/v1/instance") return i, nil @@ -56,7 +68,7 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts // If that doesn't work, try to dereference using /.well-known/nodeinfo. // This will involve two API calls and return less info overall, but should be more widely compatible. log.Debugf(ctx, "trying to dereference instance %s by /.well-known/nodeinfo", iri.Host) - i, err = dereferenceByNodeInfo(ctx, t, iri) + i, err = t.dereferenceByNodeInfo(ctx, iri, robotsTxt) if err == nil { log.Debugf(ctx, "successfully dereferenced instance using /.well-known/nodeinfo") return i, nil @@ -77,11 +89,23 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts }, nil } -func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) { +func (t *transport) dereferenceByAPIV1Instance( + ctx context.Context, + iri *url.URL, + robotsTxt *robotstxt.RobotsData, +) (*gtsmodel.Instance, error) { + const path = "api/v1/instance" + + // Bail if we're not allowed to fetch this endpoint. + if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) { + err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path) + return nil, gtserror.SetNotPermitted(err) + } + cleanIRI := &url.URL{ Scheme: iri.Scheme, Host: iri.Host, - Path: "api/v1/instance", + Path: path, } // Build IRI just once @@ -105,6 +129,18 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) return nil, gtserror.NewFromResponse(resp) } + // Ensure that we can use data returned from this endpoint. 
+ robots := resp.Header.Values("X-Robots-Tag") + if slices.ContainsFunc( + robots, + func(key string) bool { + return strings.Contains(key, "noindex") + }, + ) { + err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path) + return nil, gtserror.SetNotPermitted(err) + } + // Ensure that the incoming request content-type is expected. if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) { err := gtserror.Newf("non json response type: %s", ct) @@ -118,7 +154,8 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) return nil, errors.New("response bytes was len 0") } - // try to parse the returned bytes directly into an Instance model + // Try to parse the returned bytes + // directly into an Instance model. apiResp := &apimodel.InstanceV1{} if err := json.Unmarshal(b, apiResp); err != nil { return nil, err @@ -149,24 +186,32 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) return i, nil } -func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) { - niIRI, err := callNodeInfoWellKnown(c, t, iri) +func (t *transport) dereferenceByNodeInfo( + ctx context.Context, + iri *url.URL, + robotsTxt *robotstxt.RobotsData, +) (*gtsmodel.Instance, error) { + // Retrieve the nodeinfo IRI from .well-known/nodeinfo. + niIRI, err := t.callNodeInfoWellKnown(ctx, iri, robotsTxt) if err != nil { - return nil, fmt.Errorf("dereferenceByNodeInfo: error during initial call to well-known nodeinfo: %s", err) + return nil, gtserror.Newf("error during initial call to .well-known: %w", err) } - ni, err := callNodeInfo(c, t, niIRI) + // Use the returned nodeinfo IRI to make a followup call. 
+ ni, err := t.callNodeInfo(ctx, niIRI, robotsTxt) if err != nil { - return nil, fmt.Errorf("dereferenceByNodeInfo: error doing second call to nodeinfo uri %s: %s", niIRI.String(), err) + return nil, gtserror.Newf("error during call to %s: %w", niIRI.String(), err) } - // we got a response of some kind! take what we can from it... + // We got a response of some kind! + // + // Start building out the bare minimum + // instance model, we'll add to it if we can. id, err := id.NewRandomULID() if err != nil { - return nil, fmt.Errorf("dereferenceByNodeInfo: error creating new id for instance %s: %s", iri.Host, err) + return nil, gtserror.Newf("error creating new id for instance %s: %w", iri.Host, err) } - // this is the bare minimum instance we'll return, and we'll add more stuff to it if we can i := >smodel.Instance{ ID: id, Domain: iri.Host, @@ -234,11 +279,23 @@ func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsm return i, nil } -func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*url.URL, error) { +func (t *transport) callNodeInfoWellKnown( + ctx context.Context, + iri *url.URL, + robotsTxt *robotstxt.RobotsData, +) (*url.URL, error) { + const path = ".well-known/nodeinfo" + + // Bail if we're not allowed to fetch this endpoint. + if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) { + err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path) + return nil, gtserror.SetNotPermitted(err) + } + cleanIRI := &url.URL{ Scheme: iri.Scheme, Host: iri.Host, - Path: ".well-known/nodeinfo", + Path: path, } // Build IRI just once @@ -261,7 +318,19 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur return nil, gtserror.NewFromResponse(resp) } - // Ensure that the incoming request content-type is expected. + // Ensure that we can use data returned from this endpoint. 
+ robots := resp.Header.Values("X-Robots-Tag") + if slices.ContainsFunc( + robots, + func(key string) bool { + return strings.Contains(key, "noindex") + }, + ) { + err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path) + return nil, gtserror.SetNotPermitted(err) + } + + // Ensure that the returned content-type is expected. if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) { err := gtserror.Newf("non json response type: %s", ct) return nil, gtserror.SetMalformed(err) @@ -279,7 +348,8 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur return nil, gtserror.Newf("could not unmarshal server response as WellKnownResponse: %w", err) } - // look through the links for the first one that matches the nodeinfo schema, this is what we need + // Look through the links for the first one that + // matches nodeinfo schema, this is what we need. var nodeinfoHref *url.URL for _, l := range wellKnownResp.Links { if l.Href == "" || !strings.HasPrefix(l.Rel, "http://nodeinfo.diaspora.software/ns/schema/2") { @@ -297,7 +367,23 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur return nodeinfoHref, nil } -func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.Nodeinfo, error) { +func (t *transport) callNodeInfo( + ctx context.Context, + iri *url.URL, + robotsTxt *robotstxt.RobotsData, +) (*apimodel.Nodeinfo, error) { + // Normalize robots.txt test path. + testPath := iri.Path + if !strings.HasPrefix(testPath, "/") { + testPath = "/" + testPath + } + + // Bail if we're not allowed to fetch this endpoint. 
+ if robotsTxt != nil && !robotsTxt.TestAgent(testPath, t.controller.userAgent) { + err := gtserror.Newf("can't fetch %s: robots.txt disallows it", testPath) + return nil, gtserror.SetNotPermitted(err) + } + // Build IRI just once iriStr := iri.String() @@ -324,6 +410,18 @@ func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.No return nil, gtserror.SetMalformed(err) } + // Ensure that we can use data returned from this endpoint. + robots := resp.Header.Values("X-Robots-Tag") + if slices.ContainsFunc( + robots, + func(key string) bool { + return strings.Contains(key, "noindex") + }, + ) { + err := gtserror.Newf("can't use fetched %s: robots tags disallows it", iri.Path) + return nil, gtserror.SetNotPermitted(err) + } + b, err := io.ReadAll(resp.Body) if err != nil { return nil, err diff --git a/internal/transport/derefrobots.go b/internal/transport/derefrobots.go new file mode 100644 index 000000000..d6c4f3058 --- /dev/null +++ b/internal/transport/derefrobots.go @@ -0,0 +1,91 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +package transport + +import ( + "context" + "net/http" + "net/url" + + "codeberg.org/gruf/go-bytesize" + "codeberg.org/gruf/go-iotools" + apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" + "github.com/temoto/robotstxt" +) + +func (t *transport) DereferenceRobots(ctx context.Context, protocol string, host string) (*robotstxt.RobotsData, error) { + robotsIRI := &url.URL{ + Scheme: protocol, + Host: host, + Path: "robots.txt", + } + + // Build IRI just once + iriStr := robotsIRI.String() + + // Prepare new HTTP request to endpoint + req, err := http.NewRequestWithContext(ctx, "GET", iriStr, nil) + if err != nil { + return nil, err + } + + // We want text/plain utf-8 encoding. + // + // https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method + req.Header.Add("Accept", apiutil.TextPlain) + req.Header.Add("Accept-Charset", apiutil.UTF8) + + // Perform the HTTP request + rsp, err := t.GET(req) + if err != nil { + return nil, err + } + + // Ensure a non-error status response. + if rsp.StatusCode != http.StatusOK { + err := gtserror.NewFromResponse(rsp) + _ = rsp.Body.Close() // close early. + return nil, err + } + + // Ensure that the incoming request content-type is expected. + if ct := rsp.Header.Get("Content-Type"); !apiutil.TextPlainContentType(ct) { + err := gtserror.Newf("non text/plain response: %s", ct) + _ = rsp.Body.Close() // close early. + return nil, gtserror.SetMalformed(err) + } + + // Limit the robots.txt size to 500KiB + // + // https://www.rfc-editor.org/rfc/rfc9309.html#name-limits + const maxsz = int64(500 * bytesize.KiB) + + // Check body claims to be within size limit. + if rsp.ContentLength > maxsz { + _ = rsp.Body.Close() // close early. + sz := bytesize.Size(maxsz) //nolint:gosec + return nil, gtserror.Newf("robots.txt body exceeds max size %s", sz) + } + + // Update response body with maximum size. 
+ rsp.Body, _, _ = iotools.UpdateReadCloserLimit(rsp.Body, maxsz) + defer rsp.Body.Close() + + return robotstxt.FromResponse(rsp) +} |