Diffstat (limited to 'internal')
-rw-r--r--   internal/api/util/mime.go                           |  12
-rw-r--r--   internal/federation/dereferencing/instance.go       |   6
-rw-r--r--   internal/federation/dereferencing/instance_test.go  |  94
-rw-r--r--   internal/transport/derefinstance.go                 | 136
-rw-r--r--   internal/transport/derefrobots.go                   |  91
5 files changed, 316 insertions, 23 deletions
diff --git a/internal/api/util/mime.go b/internal/api/util/mime.go
index 4d8946e5d..da96be786 100644
--- a/internal/api/util/mime.go
+++ b/internal/api/util/mime.go
@@ -36,6 +36,8 @@ const (
 	TextHTML          = `text/html`
 	TextCSS           = `text/css`
 	TextCSV           = `text/csv`
+	TextPlain         = `text/plain`
+	UTF8              = `utf-8`
 )
 
 // JSONContentType returns whether is application/json(;charset=utf-8)? content-type.
@@ -74,6 +76,14 @@ func XMLXRDContentType(ct string) bool {
 		p[0] == AppXMLXRD
 }
 
+// TextPlainContentType returns whether is text/plain(;charset=utf-8)? content-type.
+func TextPlainContentType(ct string) bool {
+	p := splitContentType(ct)
+	p, ok := isUTF8ContentType(p)
+	return ok && len(p) == 1 &&
+		p[0] == TextPlain
+}
+
 // ASContentType returns whether is valid ActivityStreams content-types:
 // - application/activity+json
 // - application/ld+json;profile=https://w3.org/ns/activitystreams
@@ -118,7 +128,7 @@ func NodeInfo2ContentType(ct string) bool {
 // type parts list, removes it and returns whether is utf-8.
 func isUTF8ContentType(p []string) ([]string, bool) {
 	const charset = "charset="
-	const charsetUTF8 = charset + "utf-8"
+	const charsetUTF8 = charset + UTF8
 
 	for i, part := range p {
 		// Only handle charset slice parts.
diff --git a/internal/federation/dereferencing/instance.go b/internal/federation/dereferencing/instance.go
index 90ce074cd..66d0a21be 100644
--- a/internal/federation/dereferencing/instance.go
+++ b/internal/federation/dereferencing/instance.go
@@ -19,20 +19,20 @@ package dereferencing
 
 import (
 	"context"
-	"fmt"
 	"net/url"
 
+	"github.com/superseriousbusiness/gotosocial/internal/gtserror"
 	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
 )
 
 func (d *Dereferencer) GetRemoteInstance(ctx context.Context, username string, remoteInstanceURI *url.URL) (*gtsmodel.Instance, error) {
 	if blocked, err := d.state.DB.IsDomainBlocked(ctx, remoteInstanceURI.Host); blocked || err != nil {
-		return nil, fmt.Errorf("GetRemoteInstance: domain %s is blocked", remoteInstanceURI.Host)
+		return nil, gtserror.Newf("domain %s is blocked", remoteInstanceURI.Host)
 	}
 
 	transport, err := d.transportController.NewTransportForUsername(ctx, username)
 	if err != nil {
-		return nil, fmt.Errorf("transport err: %s", err)
+		return nil, gtserror.Newf("transport err: %w", err)
 	}
 
 	return transport.DereferenceInstance(ctx, remoteInstanceURI)
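Note: the new TextPlainContentType helper follows the same pattern as JSONContentType and friends, accepting an optional charset=utf-8 parameter. A quick table-driven sketch of the intended behaviour, written as a test that could sit next to mime.go inside the module (the package clause and the exact accepted forms are assumptions, not taken from this diff):

package apiutil_test // assumed test package name

import (
	"testing"

	apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
)

// Illustrative cases only; what is ultimately accepted is whatever
// splitContentType/isUTF8ContentType already allow for the other helpers.
func TestTextPlainContentType(t *testing.T) {
	for ct, want := range map[string]bool{
		"text/plain":               true,
		"text/plain;charset=utf-8": true,
		"text/html":                false,
	} {
		if got := apiutil.TextPlainContentType(ct); got != want {
			t.Errorf("TextPlainContentType(%q) = %v, want %v", ct, got, want)
		}
	}
}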
diff --git a/internal/federation/dereferencing/instance_test.go b/internal/federation/dereferencing/instance_test.go
new file mode 100644
index 000000000..15f075479
--- /dev/null
+++ b/internal/federation/dereferencing/instance_test.go
@@ -0,0 +1,94 @@
+// GoToSocial
+// Copyright (C) GoToSocial Authors admin@gotosocial.org
+// SPDX-License-Identifier: AGPL-3.0-or-later
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+package dereferencing_test
+
+import (
+	"context"
+	"net/url"
+	"testing"
+
+	"github.com/stretchr/testify/suite"
+	"github.com/superseriousbusiness/gotosocial/internal/gtscontext"
+	"github.com/superseriousbusiness/gotosocial/testrig"
+)
+
+type InstanceTestSuite struct {
+	DereferencerStandardTestSuite
+}
+
+func (suite *InstanceTestSuite) TestDerefInstance() {
+	type testCase struct {
+		instanceIRI      *url.URL
+		expectedSoftware string
+	}
+
+	for _, tc := range []testCase{
+		{
+			// Fossbros anonymous doesn't shield their nodeinfo or
+			// well-known or anything so we should be able to fetch.
+			instanceIRI:      testrig.URLMustParse("https://fossbros-anonymous.io"),
+			expectedSoftware: "Hellsoft 6.6.6",
+		},
+		{
+			// Furtive nerds forbids /nodeinfo using
+			// robots.txt so we should get bare minimum only.
+			//
+			// Debug-level logs should show something like:
+			//
+			//   - "can't fetch /nodeinfo/2.1: robots.txt disallows it"
+			instanceIRI:      testrig.URLMustParse("https://furtive-nerds.example.org"),
+			expectedSoftware: "",
+		},
+		{
+			// Robotic furtive nerds forbids *everything* using
+			// robots.txt so we should get bare minimum only.
+			//
+			// Debug-level logs should show something like:
+			//
+			//   - "can't fetch api/v1/instance: robots.txt disallows it"
+			//   - "can't fetch .well-known/nodeinfo: robots.txt disallows it"
+			instanceIRI:      testrig.URLMustParse("https://robotic.furtive-nerds.example.org"),
+			expectedSoftware: "",
+		},
+		{
+			// Really furtive nerds forbids .well-known/nodeinfo using
+			// X-Robots-Tag headers, so we should get bare minimum only.
+			//
+			// Debug-level logs should show something like:
+			//
+			//   - "can't use fetched .well-known/nodeinfo: robots tags disallows it"
+			instanceIRI:      testrig.URLMustParse("https://really.furtive-nerds.example.org"),
+			expectedSoftware: "",
+		},
+	} {
+		instance, err := suite.dereferencer.GetRemoteInstance(
+			gtscontext.SetFastFail(context.Background()),
+			suite.testAccounts["admin_account"].Username,
+			tc.instanceIRI,
+		)
+		if err != nil {
+			suite.FailNow(err.Error())
+		}
+
+		suite.Equal(tc.expectedSoftware, instance.Version)
+	}
+}
+
+func TestInstanceTestSuite(t *testing.T) {
+	suite.Run(t, new(InstanceTestSuite))
+}
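Note: the robots.txt fixtures backing the furtive-nerds cases are served by the test rig and aren't part of this diff. For a rough idea of what they would contain, here is a hypothetical file in the spirit of furtive-nerds.example.org, run through the same temoto/robotstxt parser the transport uses below (the user-agent string is illustrative only):

package main

import (
	"fmt"

	"github.com/temoto/robotstxt"
)

func main() {
	// Hypothetical robots.txt: the nodeinfo documents are
	// blocked, but the well-known discovery endpoint is not.
	robots, err := robotstxt.FromString("User-agent: *\nDisallow: /nodeinfo/\n")
	if err != nil {
		panic(err)
	}

	const ua = "gotosocial" // assumed user-agent, for illustration only

	fmt.Println(robots.TestAgent("/.well-known/nodeinfo", ua)) // true: allowed
	fmt.Println(robots.TestAgent("/nodeinfo/2.1", ua))         // false: disallowed
}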
diff --git a/internal/transport/derefinstance.go b/internal/transport/derefinstance.go
index bbeb51000..e7971093d 100644
--- a/internal/transport/derefinstance.go
+++ b/internal/transport/derefinstance.go
@@ -25,6 +25,7 @@ import (
 	"io"
 	"net/http"
 	"net/url"
+	"slices"
 	"strings"
 
 	apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"
@@ -35,18 +36,29 @@ import (
 	"github.com/superseriousbusiness/gotosocial/internal/log"
 	"github.com/superseriousbusiness/gotosocial/internal/util"
 	"github.com/superseriousbusiness/gotosocial/internal/validate"
+	"github.com/temoto/robotstxt"
 )
 
 func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gtsmodel.Instance, error) {
+	// Try to fetch robots.txt to check
+	// if we're allowed to try endpoints:
+	//
+	//   - /api/v1/instance
+	//   - /.well-known/nodeinfo
+	//   - /nodeinfo/2.0|2.1 endpoints
+	robotsTxt, err := t.DereferenceRobots(ctx, iri.Scheme, iri.Host)
+	if err != nil {
+		log.Debugf(ctx, "couldn't fetch robots.txt from %s: %v", iri.Host, err)
+	}
+
 	var i *gtsmodel.Instance
-	var err error
 
 	// First try to dereference using /api/v1/instance.
 	// This will provide the most complete picture of an instance, and avoid unnecessary api calls.
 	//
 	// This will only work with Mastodon-api compatible instances: Mastodon, some Pleroma instances, GoToSocial.
 	log.Debugf(ctx, "trying to dereference instance %s by /api/v1/instance", iri.Host)
-	i, err = dereferenceByAPIV1Instance(ctx, t, iri)
+	i, err = t.dereferenceByAPIV1Instance(ctx, iri, robotsTxt)
 	if err == nil {
 		log.Debugf(ctx, "successfully dereferenced instance using /api/v1/instance")
 		return i, nil
@@ -56,7 +68,7 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts
 	// If that doesn't work, try to dereference using /.well-known/nodeinfo.
 	// This will involve two API calls and return less info overall, but should be more widely compatible.
 	log.Debugf(ctx, "trying to dereference instance %s by /.well-known/nodeinfo", iri.Host)
-	i, err = dereferenceByNodeInfo(ctx, t, iri)
+	i, err = t.dereferenceByNodeInfo(ctx, iri, robotsTxt)
 	if err == nil {
 		log.Debugf(ctx, "successfully dereferenced instance using /.well-known/nodeinfo")
 		return i, nil
@@ -77,11 +89,23 @@ func (t *transport) DereferenceInstance(ctx context.Context, iri *url.URL) (*gts
 	}, nil
 }
 
-func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) {
+func (t *transport) dereferenceByAPIV1Instance(
+	ctx context.Context,
+	iri *url.URL,
+	robotsTxt *robotstxt.RobotsData,
+) (*gtsmodel.Instance, error) {
+	const path = "api/v1/instance"
+
+	// Bail if we're not allowed to fetch this endpoint.
+	if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) {
+		err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	cleanIRI := &url.URL{
 		Scheme: iri.Scheme,
 		Host:   iri.Host,
-		Path:   "api/v1/instance",
+		Path:   path,
 	}
 
 	// Build IRI just once
@@ -105,6 +129,18 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
 		return nil, gtserror.NewFromResponse(resp)
 	}
 
+	// Ensure that we can use data returned from this endpoint.
+	robots := resp.Header.Values("X-Robots-Tag")
+	if slices.ContainsFunc(
+		robots,
+		func(key string) bool {
+			return strings.Contains(key, "noindex")
+		},
+	) {
+		err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	// Ensure that the incoming request content-type is expected.
 	if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {
 		err := gtserror.Newf("non json response type: %s", ct)
@@ -118,7 +154,8 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
 		return nil, errors.New("response bytes was len 0")
 	}
 
-	// try to parse the returned bytes directly into an Instance model
+	// Try to parse the returned bytes
+	// directly into an Instance model.
 	apiResp := &apimodel.InstanceV1{}
 	if err := json.Unmarshal(b, apiResp); err != nil {
 		return nil, err
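Note: the same X-Robots-Tag guard is repeated for each endpoint in this file: any header value containing "noindex" makes the response unusable. Pulled out into a standalone sketch (this helper doesn't exist in the codebase, it just mirrors the inline checks):

package main

import (
	"fmt"
	"net/http"
	"slices"
	"strings"
)

// useForbidden reports whether any X-Robots-Tag header value
// contains "noindex", mirroring the checks added in this diff.
func useForbidden(h http.Header) bool {
	return slices.ContainsFunc(
		h.Values("X-Robots-Tag"),
		func(key string) bool {
			return strings.Contains(key, "noindex")
		},
	)
}

func main() {
	h := http.Header{}
	h.Add("X-Robots-Tag", "noindex, nofollow") // hypothetical response header
	fmt.Println(useForbidden(h))               // true
}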
@@ -149,24 +186,32 @@ func dereferenceByAPIV1Instance(ctx context.Context, t *transport, iri *url.URL)
 	return i, nil
 }
 
-func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsmodel.Instance, error) {
-	niIRI, err := callNodeInfoWellKnown(c, t, iri)
+func (t *transport) dereferenceByNodeInfo(
+	ctx context.Context,
+	iri *url.URL,
+	robotsTxt *robotstxt.RobotsData,
+) (*gtsmodel.Instance, error) {
+	// Retrieve the nodeinfo IRI from .well-known/nodeinfo.
+	niIRI, err := t.callNodeInfoWellKnown(ctx, iri, robotsTxt)
 	if err != nil {
-		return nil, fmt.Errorf("dereferenceByNodeInfo: error during initial call to well-known nodeinfo: %s", err)
+		return nil, gtserror.Newf("error during initial call to .well-known: %w", err)
 	}
 
-	ni, err := callNodeInfo(c, t, niIRI)
+	// Use the returned nodeinfo IRI to make a followup call.
+	ni, err := t.callNodeInfo(ctx, niIRI, robotsTxt)
 	if err != nil {
-		return nil, fmt.Errorf("dereferenceByNodeInfo: error doing second call to nodeinfo uri %s: %s", niIRI.String(), err)
+		return nil, gtserror.Newf("error during call to %s: %w", niIRI.String(), err)
 	}
 
-	// we got a response of some kind! take what we can from it...
+	// We got a response of some kind!
+	//
+	// Start building out the bare minimum
+	// instance model, we'll add to it if we can.
 	id, err := id.NewRandomULID()
 	if err != nil {
-		return nil, fmt.Errorf("dereferenceByNodeInfo: error creating new id for instance %s: %s", iri.Host, err)
+		return nil, gtserror.Newf("error creating new id for instance %s: %w", iri.Host, err)
 	}
 
-	// this is the bare minimum instance we'll return, and we'll add more stuff to it if we can
 	i := &gtsmodel.Instance{
 		ID:     id,
 		Domain: iri.Host,
@@ -234,11 +279,23 @@ func dereferenceByNodeInfo(c context.Context, t *transport, iri *url.URL) (*gtsm
 	return i, nil
 }
 
-func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*url.URL, error) {
+func (t *transport) callNodeInfoWellKnown(
+	ctx context.Context,
+	iri *url.URL,
+	robotsTxt *robotstxt.RobotsData,
+) (*url.URL, error) {
+	const path = ".well-known/nodeinfo"
+
+	// Bail if we're not allowed to fetch this endpoint.
+	if robotsTxt != nil && !robotsTxt.TestAgent("/"+path, t.controller.userAgent) {
+		err := gtserror.Newf("can't fetch %s: robots.txt disallows it", path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	cleanIRI := &url.URL{
 		Scheme: iri.Scheme,
 		Host:   iri.Host,
-		Path:   ".well-known/nodeinfo",
+		Path:   path,
 	}
 
 	// Build IRI just once
@@ -261,7 +318,19 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
 		return nil, gtserror.NewFromResponse(resp)
 	}
 
-	// Ensure that the incoming request content-type is expected.
+	// Ensure that we can use data returned from this endpoint.
+	robots := resp.Header.Values("X-Robots-Tag")
+	if slices.ContainsFunc(
+		robots,
+		func(key string) bool {
+			return strings.Contains(key, "noindex")
+		},
+	) {
+		err := gtserror.Newf("can't use fetched %s: robots tags disallows it", path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
+	// Ensure that the returned content-type is expected.
 	if ct := resp.Header.Get("Content-Type"); !apiutil.JSONContentType(ct) {
 		err := gtserror.Newf("non json response type: %s", ct)
 		return nil, gtserror.SetMalformed(err)
@@ -279,7 +348,8 @@ func callNodeInfoWellKnown(ctx context.Context, t *transport, iri *url.URL) (*ur
 		return nil, gtserror.Newf("could not unmarshal server response as WellKnownResponse: %w", err)
 	}
 
-	// look through the links for the first one that matches the nodeinfo schema, this is what we need
+	// Look through the links for the first one that
+	// matches nodeinfo schema, this is what we need.
 	var nodeinfoHref *url.URL
 	for _, l := range wellKnownResp.Links {
 		if l.Href == "" || !strings.HasPrefix(l.Rel, "http://nodeinfo.diaspora.software/ns/schema/2") {
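Note: callNodeInfoWellKnown is walking a standard .well-known/nodeinfo discovery document looking for a 2.x schema link. A minimal sketch with an example payload (the struct is a local stand-in for apimodel.WellKnownResponse, and the JSON body is made up):

package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// Minimal local stand-in for the well-known response shape;
// the real code unmarshals into apimodel.WellKnownResponse.
type wellKnown struct {
	Links []struct {
		Rel  string `json:"rel"`
		Href string `json:"href"`
	} `json:"links"`
}

func main() {
	// Example .well-known/nodeinfo body.
	body := `{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"https://example.org/nodeinfo/2.0"}]}`

	var wk wellKnown
	if err := json.Unmarshal([]byte(body), &wk); err != nil {
		panic(err)
	}

	// Same selection rule as the diff: first link whose rel
	// starts with the 2.x nodeinfo schema prefix.
	for _, l := range wk.Links {
		if l.Href != "" && strings.HasPrefix(l.Rel, "http://nodeinfo.diaspora.software/ns/schema/2") {
			fmt.Println("nodeinfo href:", l.Href)
			break
		}
	}
}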
@@ -297,7 +367,23 @@
 	return nodeinfoHref, nil
 }
 
-func callNodeInfo(ctx context.Context, t *transport, iri *url.URL) (*apimodel.Nodeinfo, error) {
+func (t *transport) callNodeInfo(
+	ctx context.Context,
+	iri *url.URL,
+	robotsTxt *robotstxt.RobotsData,
+) (*apimodel.Nodeinfo, error) {
+	// Normalize robots.txt test path.
+	testPath := iri.Path
+	if !strings.HasPrefix(testPath, "/") {
+		testPath = "/" + testPath
+	}
+
+	// Bail if we're not allowed to fetch this endpoint.
+	if robotsTxt != nil && !robotsTxt.TestAgent(testPath, t.controller.userAgent) {
+		err := gtserror.Newf("can't fetch %s: robots.txt disallows it", testPath)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	// Build IRI just once
 	iriStr := iri.String()
 
@@ -324,6 +410,18 @@
 		return nil, gtserror.SetMalformed(err)
 	}
 
+	// Ensure that we can use data returned from this endpoint.
+	robots := resp.Header.Values("X-Robots-Tag")
+	if slices.ContainsFunc(
+		robots,
+		func(key string) bool {
+			return strings.Contains(key, "noindex")
+		},
+	) {
+		err := gtserror.Newf("can't use fetched %s: robots tags disallows it", iri.Path)
+		return nil, gtserror.SetNotPermitted(err)
+	}
+
 	b, err := io.ReadAll(resp.Body)
 	if err != nil {
 		return nil, err
diff --git a/internal/transport/derefrobots.go b/internal/transport/derefrobots.go
new file mode 100644
index 000000000..d6c4f3058
--- /dev/null
+++ b/internal/transport/derefrobots.go
@@ -0,0 +1,91 @@
+// GoToSocial
+// Copyright (C) GoToSocial Authors admin@gotosocial.org
+// SPDX-License-Identifier: AGPL-3.0-or-later
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+package transport
+
+import (
+	"context"
+	"net/http"
+	"net/url"
+
+	"codeberg.org/gruf/go-bytesize"
+	"codeberg.org/gruf/go-iotools"
+	apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
+	"github.com/superseriousbusiness/gotosocial/internal/gtserror"
+	"github.com/temoto/robotstxt"
+)
+
+func (t *transport) DereferenceRobots(ctx context.Context, protocol string, host string) (*robotstxt.RobotsData, error) {
+	robotsIRI := &url.URL{
+		Scheme: protocol,
+		Host:   host,
+		Path:   "robots.txt",
+	}
+
+	// Build IRI just once
+	iriStr := robotsIRI.String()
+
+	// Prepare new HTTP request to endpoint
+	req, err := http.NewRequestWithContext(ctx, "GET", iriStr, nil)
+	if err != nil {
+		return nil, err
+	}
+
+	// We want text/plain utf-8 encoding.
+	//
+	// https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method
+	req.Header.Add("Accept", apiutil.TextPlain)
+	req.Header.Add("Accept-Charset", apiutil.UTF8)
+
+	// Perform the HTTP request
+	rsp, err := t.GET(req)
+	if err != nil {
+		return nil, err
+	}
+
+	// Ensure a non-error status response.
+	if rsp.StatusCode != http.StatusOK {
+		err := gtserror.NewFromResponse(rsp)
+		_ = rsp.Body.Close() // close early.
+		return nil, err
+	}
+
+	// Ensure that the incoming request content-type is expected.
+	if ct := rsp.Header.Get("Content-Type"); !apiutil.TextPlainContentType(ct) {
+		err := gtserror.Newf("non text/plain response: %s", ct)
+		_ = rsp.Body.Close() // close early.
+		return nil, gtserror.SetMalformed(err)
+	}
+
+	// Limit the robots.txt size to 500KiB
+	//
+	// https://www.rfc-editor.org/rfc/rfc9309.html#name-limits
+	const maxsz = int64(500 * bytesize.KiB)
+
+	// Check body claims to be within size limit.
+	if rsp.ContentLength > maxsz {
+		_ = rsp.Body.Close()       // close early.
+		sz := bytesize.Size(maxsz) //nolint:gosec
+		return nil, gtserror.Newf("robots.txt body exceeds max size %s", sz)
+	}
+
+	// Update response body with maximum size.
+	rsp.Body, _, _ = iotools.UpdateReadCloserLimit(rsp.Body, maxsz)
+	defer rsp.Body.Close()
+
+	return robotstxt.FromResponse(rsp)
+}
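Note: DereferenceRobots goes through the transport's signed GET and the go-iotools/go-bytesize helpers. The same RFC 9309 constraints (plain-text fetch, 500 KiB cap) can be sketched with just the standard library and the robotstxt parser; this is a simplified, unauthenticated stand-in, not how the transport actually issues requests:

package main

import (
	"fmt"
	"io"
	"net/http"

	"github.com/temoto/robotstxt"
)

func fetchRobots(url string) (*robotstxt.RobotsData, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Add("Accept", "text/plain")

	rsp, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer rsp.Body.Close()

	if rsp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("unexpected status: %s", rsp.Status)
	}

	// Cap the body at 500KiB, per the RFC 9309 limits section.
	const maxsz = 500 * 1024
	b, err := io.ReadAll(io.LimitReader(rsp.Body, maxsz))
	if err != nil {
		return nil, err
	}

	return robotstxt.FromBytes(b)
}

func main() {
	robots, err := fetchRobots("https://example.org/robots.txt") // example host
	if err != nil {
		panic(err)
	}
	fmt.Println(robots.TestAgent("/api/v1/instance", "gotosocial"))
}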
