diff options
Diffstat (limited to 'internal/transport/derefrobots.go')
-rw-r--r-- | internal/transport/derefrobots.go | 91 |
1 files changed, 91 insertions, 0 deletions
diff --git a/internal/transport/derefrobots.go b/internal/transport/derefrobots.go new file mode 100644 index 000000000..d6c4f3058 --- /dev/null +++ b/internal/transport/derefrobots.go @@ -0,0 +1,91 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +package transport + +import ( + "context" + "net/http" + "net/url" + + "codeberg.org/gruf/go-bytesize" + "codeberg.org/gruf/go-iotools" + apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" + "github.com/temoto/robotstxt" +) + +func (t *transport) DereferenceRobots(ctx context.Context, protocol string, host string) (*robotstxt.RobotsData, error) { + robotsIRI := &url.URL{ + Scheme: protocol, + Host: host, + Path: "robots.txt", + } + + // Build IRI just once + iriStr := robotsIRI.String() + + // Prepare new HTTP request to endpoint + req, err := http.NewRequestWithContext(ctx, "GET", iriStr, nil) + if err != nil { + return nil, err + } + + // We want text/plain utf-8 encoding. + // + // https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method + req.Header.Add("Accept", apiutil.TextPlain) + req.Header.Add("Accept-Charset", apiutil.UTF8) + + // Perform the HTTP request + rsp, err := t.GET(req) + if err != nil { + return nil, err + } + + // Ensure a non-error status response. + if rsp.StatusCode != http.StatusOK { + err := gtserror.NewFromResponse(rsp) + _ = rsp.Body.Close() // close early. + return nil, err + } + + // Ensure that the incoming request content-type is expected. + if ct := rsp.Header.Get("Content-Type"); !apiutil.TextPlainContentType(ct) { + err := gtserror.Newf("non text/plain response: %s", ct) + _ = rsp.Body.Close() // close early. + return nil, gtserror.SetMalformed(err) + } + + // Limit the robots.txt size to 500KiB + // + // https://www.rfc-editor.org/rfc/rfc9309.html#name-limits + const maxsz = int64(500 * bytesize.KiB) + + // Check body claims to be within size limit. + if rsp.ContentLength > maxsz { + _ = rsp.Body.Close() // close early. + sz := bytesize.Size(maxsz) //nolint:gosec + return nil, gtserror.Newf("robots.txt body exceeds max size %s", sz) + } + + // Update response body with maximum size. + rsp.Body, _, _ = iotools.UpdateReadCloserLimit(rsp.Body, maxsz) + defer rsp.Body.Close() + + return robotstxt.FromResponse(rsp) +} |