summaryrefslogtreecommitdiff
path: root/internal/transport/derefrobots.go
blob: d6c4f30581321878e8a599931f46c9b6c1e9c62c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

package transport

import (
	"context"
	"net/http"
	"net/url"

	"codeberg.org/gruf/go-bytesize"
	"codeberg.org/gruf/go-iotools"
	apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util"
	"github.com/superseriousbusiness/gotosocial/internal/gtserror"
	"github.com/temoto/robotstxt"
)

// DereferenceRobots requests the robots.txt document from the given host,
// using the given protocol as URL scheme, and passes the response on to
// robotstxt.FromResponse to produce the returned data.
//
// An error is returned when the request cannot be created or performed,
// when the response status is anything other than 200 OK, when the response
// Content-Type does not pass the text/plain check (this error is flagged
// as malformed), or when the advertised content length is above 500KiB.
func (t *transport) DereferenceRobots(ctx context.Context, protocol string, host string) (*robotstxt.RobotsData, error) {
	// Upper bound on accepted robots.txt size: 500KiB.
	//
	// https://www.rfc-editor.org/rfc/rfc9309.html#name-limits
	const limit = int64(500 * bytesize.KiB)

	// Assemble the robots.txt location for this host.
	target := url.URL{
		Scheme: protocol,
		Host:   host,
		Path:   "robots.txt",
	}

	// Create the outgoing GET request, bound to ctx.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, target.String(), nil)
	if err != nil {
		return nil, err
	}

	// Ask for a text/plain, utf-8 encoded document.
	//
	// https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method
	req.Header.Add("Accept", apiutil.TextPlain)
	req.Header.Add("Accept-Charset", apiutil.UTF8)

	// Send it through the transport.
	resp, err := t.GET(req)
	if err != nil {
		return nil, err
	}

	// Reject the response early on a bad status, an unexpected
	// response content-type, or an advertised length over the
	// limit. Cases are tried top to bottom; the first match wins.
	switch ctype := resp.Header.Get("Content-Type"); {
	case resp.StatusCode != http.StatusOK:
		// Build the error from the response
		// before the body gets closed.
		err := gtserror.NewFromResponse(resp)
		_ = resp.Body.Close()
		return nil, err

	case !apiutil.TextPlainContentType(ctype):
		err := gtserror.Newf("non text/plain response: %s", ctype)
		_ = resp.Body.Close()
		return nil, gtserror.SetMalformed(err)

	case resp.ContentLength > limit:
		_ = resp.Body.Close()
		sz := bytesize.Size(limit) //nolint:gosec
		return nil, gtserror.Newf("robots.txt body exceeds max size %s", sz)
	}

	// Swap the body for one with the size limit applied, and
	// make sure that replacement is closed once we return.
	limited, _, _ := iotools.UpdateReadCloserLimit(resp.Body, limit)
	resp.Body = limited
	defer limited.Close()

	return robotstxt.FromResponse(resp)
}