From baed591a1d19942ec553baed41a8048ab9dd18ca Mon Sep 17 00:00:00 2001 From: tobi <31960611+tsmethurst@users.noreply.github.com> Date: Wed, 5 Feb 2025 12:47:13 +0100 Subject: [feature] Use `X-Robots-Tag` headers to instruct scrapers/crawlers (#3737) * [feature] Use `X-Robots-Tag` headers to instruct scrapers/crawlers * use switch for RobotsHeaders --- internal/api/nodeinfo.go | 11 ++ internal/api/robots.go | 52 ++++++++ internal/api/robots/robots.go | 57 +++++++++ internal/api/util/robots.go | 133 +++++++++++++++++++++ internal/api/wellknown/hostmeta/hostmeta.go | 4 +- internal/api/wellknown/nodeinfo/nodeinfo.go | 58 ++++++++- internal/api/wellknown/nodeinfo/nodeinfoget.go | 66 ----------- internal/api/wellknown/webfinger/webfinger.go | 4 +- internal/middleware/extraheaders.go | 7 -- internal/middleware/robots.go | 67 +++++++++++ internal/web/profile.go | 2 +- internal/web/robots.go | 157 ------------------------- internal/web/web.go | 37 +++--- 13 files changed, 402 insertions(+), 253 deletions(-) create mode 100644 internal/api/robots.go create mode 100644 internal/api/robots/robots.go create mode 100644 internal/api/util/robots.go delete mode 100644 internal/api/wellknown/nodeinfo/nodeinfoget.go create mode 100644 internal/middleware/robots.go delete mode 100644 internal/web/robots.go (limited to 'internal') diff --git a/internal/api/nodeinfo.go b/internal/api/nodeinfo.go index 29942aba4..2f0c234fd 100644 --- a/internal/api/nodeinfo.go +++ b/internal/api/nodeinfo.go @@ -20,6 +20,7 @@ package api import ( "github.com/gin-gonic/gin" "github.com/superseriousbusiness/gotosocial/internal/api/nodeinfo" + "github.com/superseriousbusiness/gotosocial/internal/config" "github.com/superseriousbusiness/gotosocial/internal/middleware" "github.com/superseriousbusiness/gotosocial/internal/processing" "github.com/superseriousbusiness/gotosocial/internal/router" @@ -43,6 +44,16 @@ func (w *NodeInfo) Route(r *router.Router, m ...gin.HandlerFunc) { }), ) + // If instance is configured to serve instance stats + // faithfully at nodeinfo, we should allow robots to + // crawl nodeinfo endpoints in a limited capacity. + // In all other cases, disallow everything. + if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { + nodeInfoGroup.Use(middleware.RobotsHeaders("allowSome")) + } else { + nodeInfoGroup.Use(middleware.RobotsHeaders("")) + } + w.nodeInfo.Route(nodeInfoGroup.Handle) } diff --git a/internal/api/robots.go b/internal/api/robots.go new file mode 100644 index 000000000..3ed8282f5 --- /dev/null +++ b/internal/api/robots.go @@ -0,0 +1,52 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package api + +import ( + "github.com/gin-gonic/gin" + "github.com/superseriousbusiness/gotosocial/internal/api/robots" + "github.com/superseriousbusiness/gotosocial/internal/middleware" + "github.com/superseriousbusiness/gotosocial/internal/router" +) + +type Robots struct { + robots *robots.Module +} + +func (rb *Robots) Route(r *router.Router, m ...gin.HandlerFunc) { + // Create a group so we can attach middlewares. + robotsGroup := r.AttachGroup("robots.txt") + + // Use passed-in middlewares. + robotsGroup.Use(m...) + + // Allow caching for 24 hrs. + // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4 + robotsGroup.Use( + middleware.CacheControl(middleware.CacheControlConfig{ + Directives: []string{"public", "max-age=86400"}, + Vary: []string{"Accept-Encoding"}, + }), + ) + + rb.robots.Route(robotsGroup.Handle) +} + +func NewRobots() *Robots { + return &Robots{} +} diff --git a/internal/api/robots/robots.go b/internal/api/robots/robots.go new file mode 100644 index 000000000..98db4682d --- /dev/null +++ b/internal/api/robots/robots.go @@ -0,0 +1,57 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package robots + +import ( + "net/http" + + "github.com/gin-gonic/gin" + apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" + "github.com/superseriousbusiness/gotosocial/internal/config" +) + +type Module struct{} + +func New() *Module { + return &Module{} +} + +func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { + // Serve different robots.txt file depending on instance + // stats mode: Don't disallow scraping nodeinfo if admin + // has opted in to serving accurate stats there. In all + // other cases, disallow scraping nodeinfo. + var handler gin.HandlerFunc + if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { + handler = m.robotsGETHandler + } else { + handler = m.robotsGETHandlerDisallowNodeInfo + } + + // Attach handler at empty path as this + // is already grouped under /robots.txt. + attachHandler(http.MethodGet, "", handler) +} + +func (m *Module) robotsGETHandler(c *gin.Context) { + c.String(http.StatusOK, apiutil.RobotsTxt) +} + +func (m *Module) robotsGETHandlerDisallowNodeInfo(c *gin.Context) { + c.String(http.StatusOK, apiutil.RobotsTxtDisallowNodeInfo) +} diff --git a/internal/api/util/robots.go b/internal/api/util/robots.go new file mode 100644 index 000000000..49fb04561 --- /dev/null +++ b/internal/api/util/robots.go @@ -0,0 +1,133 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package util + +// See: +// +// - https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta +// - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag +// - https://www.rfc-editor.org/rfc/rfc9309.html +const ( + RobotsDirectivesDisallow = "noindex, nofollow" + RobotsDirectivesAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" + RobotsTxt = `# GoToSocial robots.txt -- to edit, see internal/api/util/robots.go +# More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro + +# AI scrapers and the like. +# https://github.com/ai-robots-txt/ai.robots.txt/ +User-agent: AI2Bot +User-agent: Ai2Bot-Dolma +User-agent: Amazonbot +User-agent: anthropic-ai +User-agent: Applebot +User-agent: Applebot-Extended +User-agent: Bytespider +User-agent: CCBot +User-agent: ChatGPT-User +User-agent: ClaudeBot +User-agent: Claude-Web +User-agent: cohere-ai +User-agent: cohere-training-data-crawler +User-agent: Diffbot +User-agent: DuckAssistBot +User-agent: FacebookBot +User-agent: FriendlyCrawler +User-agent: Google-Extended +User-agent: GoogleOther +User-agent: GoogleOther-Image +User-agent: GoogleOther-Video +User-agent: GPTBot +User-agent: iaskspider/2.0 +User-agent: ICC-Crawler +User-agent: ImagesiftBot +User-agent: img2dataset +User-agent: ISSCyberRiskCrawler +User-agent: Kangaroo Bot +User-agent: Meta-ExternalAgent +User-agent: Meta-ExternalFetcher +User-agent: OAI-SearchBot +User-agent: omgili +User-agent: omgilibot +User-agent: PanguBot +User-agent: PerplexityBot +User-agent: PetalBot +User-agent: Scrapy +User-agent: Sidetrade indexer bot +User-agent: Timpibot +User-agent: VelenPublicWebCrawler +User-agent: Webzio-Extended +User-agent: YouBot +Disallow: / + +# Marketing/SEO "intelligence" data scrapers +User-agent: AwarioRssBot +User-agent: AwarioSmartBot +User-agent: DataForSeoBot +User-agent: magpie-crawler +User-agent: Meltwater +User-agent: peer39_crawler +User-agent: peer39_crawler/1.0 +User-agent: PiplBot +User-agent: scoop.it +User-agent: Seekr +Disallow: / + +# Well-known.dev crawler. Indexes stuff under /.well-known. +# https://well-known.dev/about/ +User-agent: WellKnownBot +Disallow: / + +# Rules for everything else. +User-agent: * +Crawl-delay: 500 + +# API endpoints. +Disallow: /api/ + +# Auth/Sign in endpoints. +Disallow: /auth/ +Disallow: /oauth/ +Disallow: /check_your_email +Disallow: /wait_for_approval +Disallow: /account_disabled +Disallow: /signup + +# Fileserver/media. +Disallow: /fileserver/ + +# Fedi S2S API endpoints. +Disallow: /users/ +Disallow: /emoji/ + +# Settings panels. +Disallow: /admin +Disallow: /user +Disallow: /settings/ + +# Domain blocklist. +Disallow: /about/suspended + +# Webfinger endpoint. +Disallow: /.well-known/webfinger +` + RobotsTxtDisallowNodeInfo = RobotsTxt + ` +# Disallow nodeinfo +Disallow: /.well-known/nodeinfo +Disallow: /nodeinfo/ +` +) diff --git a/internal/api/wellknown/hostmeta/hostmeta.go b/internal/api/wellknown/hostmeta/hostmeta.go index cb439fcd3..43c6b161e 100644 --- a/internal/api/wellknown/hostmeta/hostmeta.go +++ b/internal/api/wellknown/hostmeta/hostmeta.go @@ -21,6 +21,7 @@ import ( "net/http" "github.com/gin-gonic/gin" + "github.com/superseriousbusiness/gotosocial/internal/middleware" "github.com/superseriousbusiness/gotosocial/internal/processing" ) @@ -40,5 +41,6 @@ func New(processor *processing.Processor) *Module { } func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { - attachHandler(http.MethodGet, HostMetaPath, m.HostMetaGETHandler) + // Attach handler, injecting robots http header middleware to disallow all. + attachHandler(http.MethodGet, HostMetaPath, middleware.RobotsHeaders(""), m.HostMetaGETHandler) } diff --git a/internal/api/wellknown/nodeinfo/nodeinfo.go b/internal/api/wellknown/nodeinfo/nodeinfo.go index 9012006f4..270dde2b1 100644 --- a/internal/api/wellknown/nodeinfo/nodeinfo.go +++ b/internal/api/wellknown/nodeinfo/nodeinfo.go @@ -21,6 +21,10 @@ import ( "net/http" "github.com/gin-gonic/gin" + apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" + "github.com/superseriousbusiness/gotosocial/internal/config" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" + "github.com/superseriousbusiness/gotosocial/internal/middleware" "github.com/superseriousbusiness/gotosocial/internal/processing" ) @@ -42,5 +46,57 @@ func New(processor *processing.Processor) *Module { } func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { - attachHandler(http.MethodGet, NodeInfoWellKnownPath, m.NodeInfoWellKnownGETHandler) + // If instance is configured to serve instance stats + // faithfully at nodeinfo, we should allow robots to + // crawl nodeinfo endpoints in a limited capacity. + // In all other cases, disallow everything. + var robots gin.HandlerFunc + if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { + robots = middleware.RobotsHeaders("allowSome") + } else { + robots = middleware.RobotsHeaders("") + } + + // Attach handler, injecting robots http header middleware. + attachHandler(http.MethodGet, NodeInfoWellKnownPath, robots, m.NodeInfoWellKnownGETHandler) +} + +// NodeInfoWellKnownGETHandler swagger:operation GET /.well-known/nodeinfo nodeInfoWellKnownGet +// +// Returns a well-known response which redirects callers to `/nodeinfo/2.0`. +// +// eg. `{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"http://example.org/nodeinfo/2.0"}]}` +// See: https://nodeinfo.diaspora.software/protocol.html +// +// --- +// tags: +// - .well-known +// +// produces: +// - application/json +// +// responses: +// '200': +// schema: +// "$ref": "#/definitions/wellKnownResponse" +func (m *Module) NodeInfoWellKnownGETHandler(c *gin.Context) { + if _, err := apiutil.NegotiateAccept(c, apiutil.JSONAcceptHeaders...); err != nil { + apiutil.ErrorHandler(c, gtserror.NewErrorNotAcceptable(err, err.Error()), m.processor.InstanceGetV1) + return + } + + resp, errWithCode := m.processor.Fedi().NodeInfoRelGet(c.Request.Context()) + if errWithCode != nil { + apiutil.ErrorHandler(c, errWithCode, m.processor.InstanceGetV1) + return + } + + // Encode JSON HTTP response. + apiutil.EncodeJSONResponse( + c.Writer, + c.Request, + http.StatusOK, + apiutil.AppJSON, + resp, + ) } diff --git a/internal/api/wellknown/nodeinfo/nodeinfoget.go b/internal/api/wellknown/nodeinfo/nodeinfoget.go deleted file mode 100644 index c458f131e..000000000 --- a/internal/api/wellknown/nodeinfo/nodeinfoget.go +++ /dev/null @@ -1,66 +0,0 @@ -// GoToSocial -// Copyright (C) GoToSocial Authors admin@gotosocial.org -// SPDX-License-Identifier: AGPL-3.0-or-later -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -package nodeinfo - -import ( - "net/http" - - "github.com/gin-gonic/gin" - apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" - "github.com/superseriousbusiness/gotosocial/internal/gtserror" -) - -// NodeInfoWellKnownGETHandler swagger:operation GET /.well-known/nodeinfo nodeInfoWellKnownGet -// -// Returns a well-known response which redirects callers to `/nodeinfo/2.0`. -// -// eg. `{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"http://example.org/nodeinfo/2.0"}]}` -// See: https://nodeinfo.diaspora.software/protocol.html -// -// --- -// tags: -// - .well-known -// -// produces: -// - application/json -// -// responses: -// '200': -// schema: -// "$ref": "#/definitions/wellKnownResponse" -func (m *Module) NodeInfoWellKnownGETHandler(c *gin.Context) { - if _, err := apiutil.NegotiateAccept(c, apiutil.JSONAcceptHeaders...); err != nil { - apiutil.ErrorHandler(c, gtserror.NewErrorNotAcceptable(err, err.Error()), m.processor.InstanceGetV1) - return - } - - resp, errWithCode := m.processor.Fedi().NodeInfoRelGet(c.Request.Context()) - if errWithCode != nil { - apiutil.ErrorHandler(c, errWithCode, m.processor.InstanceGetV1) - return - } - - // Encode JSON HTTP response. - apiutil.EncodeJSONResponse( - c.Writer, - c.Request, - http.StatusOK, - apiutil.AppJSON, - resp, - ) -} diff --git a/internal/api/wellknown/webfinger/webfinger.go b/internal/api/wellknown/webfinger/webfinger.go index a50013b32..c70afab9d 100644 --- a/internal/api/wellknown/webfinger/webfinger.go +++ b/internal/api/wellknown/webfinger/webfinger.go @@ -21,6 +21,7 @@ import ( "net/http" "github.com/gin-gonic/gin" + "github.com/superseriousbusiness/gotosocial/internal/middleware" "github.com/superseriousbusiness/gotosocial/internal/processing" ) @@ -41,5 +42,6 @@ func New(processor *processing.Processor) *Module { } func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { - attachHandler(http.MethodGet, WebfingerBasePath, m.WebfingerGETRequest) + // Attach handler, injecting robots http header middleware to disallow all. + attachHandler(http.MethodGet, WebfingerBasePath, middleware.RobotsHeaders(""), m.WebfingerGETRequest) } diff --git a/internal/middleware/extraheaders.go b/internal/middleware/extraheaders.go index fb91bcc93..c75b65551 100644 --- a/internal/middleware/extraheaders.go +++ b/internal/middleware/extraheaders.go @@ -44,12 +44,5 @@ func ExtraHeaders() gin.HandlerFunc { // // See: https://github.com/patcg-individual-drafts/topics c.Header("Permissions-Policy", "browsing-topics=()") - - // Some AI scrapers respect the following tags to opt-out - // of their crawling and datasets. - c.Header("X-Robots-Tag", "noimageai") - // c.Header calls .Set(), but we want to emit the header - // twice, not override it. - c.Writer.Header().Add("X-Robots-Tag", "noai") } } diff --git a/internal/middleware/robots.go b/internal/middleware/robots.go new file mode 100644 index 000000000..fefd93be0 --- /dev/null +++ b/internal/middleware/robots.go @@ -0,0 +1,67 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see . + +package middleware + +import ( + "github.com/gin-gonic/gin" + apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" +) + +// RobotsHeaders adds robots directives to the X-Robots-Tag HTTP header. +// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag +// +// If mode == "aiOnly" then only the noai and noimageai values will be set, +// and other headers will be left alone (for route groups / handlers to set). +// +// If mode == "allowSome" then noai, noimageai, and some indexing will be set. +// +// If mode == "" then noai, noimageai, noindex, and nofollow will be set +// (ie., as restrictive as possible). +func RobotsHeaders(mode string) gin.HandlerFunc { + const ( + key = "X-Robots-Tag" + // Some AI scrapers respect the following tags + // to opt-out of their crawling and datasets. + // We add them regardless of allowSome. + noai = "noai, noimageai" + ) + + switch mode { + + // Just set ai headers and + // leave the other headers be. + case "aiOnly": + return func(c *gin.Context) { + c.Writer.Header().Set(key, noai) + } + + // Allow some limited indexing. + case "allowSome": + return func(c *gin.Context) { + c.Writer.Header().Set(key, apiutil.RobotsDirectivesAllowSome) + c.Writer.Header().Add(key, noai) + } + + // Disallow indexing via noindex, nofollow. + default: + return func(c *gin.Context) { + c.Writer.Header().Set(key, apiutil.RobotsDirectivesDisallow) + c.Writer.Header().Add(key, noai) + } + } +} diff --git a/internal/web/profile.go b/internal/web/profile.go index a6d96a9ea..cf12ca33a 100644 --- a/internal/web/profile.go +++ b/internal/web/profile.go @@ -103,7 +103,7 @@ func (m *Module) profileGETHandler(c *gin.Context) { // index if account is discoverable. var robotsMeta string if targetAccount.Discoverable { - robotsMeta = robotsMetaAllowSome + robotsMeta = apiutil.RobotsDirectivesAllowSome } // We need to change our response slightly if the diff --git a/internal/web/robots.go b/internal/web/robots.go deleted file mode 100644 index 524550642..000000000 --- a/internal/web/robots.go +++ /dev/null @@ -1,157 +0,0 @@ -// GoToSocial -// Copyright (C) GoToSocial Authors admin@gotosocial.org -// SPDX-License-Identifier: AGPL-3.0-or-later -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program. If not, see . - -package web - -import ( - "net/http" - - "github.com/gin-gonic/gin" - "github.com/superseriousbusiness/gotosocial/internal/config" -) - -const ( - robotsPath = "/robots.txt" - robotsMetaAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" // https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta - robotsTxt = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go -# More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro - -# AI scrapers and the like. -# https://github.com/ai-robots-txt/ai.robots.txt/ -User-agent: AI2Bot -User-agent: Ai2Bot-Dolma -User-agent: Amazonbot -User-agent: anthropic-ai -User-agent: Applebot -User-agent: Applebot-Extended -User-agent: Bytespider -User-agent: CCBot -User-agent: ChatGPT-User -User-agent: ClaudeBot -User-agent: Claude-Web -User-agent: cohere-ai -User-agent: cohere-training-data-crawler -User-agent: Diffbot -User-agent: DuckAssistBot -User-agent: FacebookBot -User-agent: FriendlyCrawler -User-agent: Google-Extended -User-agent: GoogleOther -User-agent: GoogleOther-Image -User-agent: GoogleOther-Video -User-agent: GPTBot -User-agent: iaskspider/2.0 -User-agent: ICC-Crawler -User-agent: ImagesiftBot -User-agent: img2dataset -User-agent: ISSCyberRiskCrawler -User-agent: Kangaroo Bot -User-agent: Meta-ExternalAgent -User-agent: Meta-ExternalFetcher -User-agent: OAI-SearchBot -User-agent: omgili -User-agent: omgilibot -User-agent: PanguBot -User-agent: PerplexityBot -User-agent: PetalBot -User-agent: Scrapy -User-agent: Sidetrade indexer bot -User-agent: Timpibot -User-agent: VelenPublicWebCrawler -User-agent: Webzio-Extended -User-agent: YouBot -Disallow: / - -# Marketing/SEO "intelligence" data scrapers -User-agent: AwarioRssBot -User-agent: AwarioSmartBot -User-agent: DataForSeoBot -User-agent: magpie-crawler -User-agent: Meltwater -User-agent: peer39_crawler -User-agent: peer39_crawler/1.0 -User-agent: PiplBot -User-agent: scoop.it -User-agent: Seekr -Disallow: / - -# Well-known.dev crawler. Indexes stuff under /.well-known. -# https://well-known.dev/about/ -User-agent: WellKnownBot -Disallow: / - -# Rules for everything else. -User-agent: * -Crawl-delay: 500 - -# API endpoints. -Disallow: /api/ - -# Auth/Sign in endpoints. -Disallow: /auth/ -Disallow: /oauth/ -Disallow: /check_your_email -Disallow: /wait_for_approval -Disallow: /account_disabled -Disallow: /signup - -# Fileserver/media. -Disallow: /fileserver/ - -# Fedi S2S API endpoints. -Disallow: /users/ -Disallow: /emoji/ - -# Settings panels. -Disallow: /admin -Disallow: /user -Disallow: /settings/ - -# Domain blocklist. -Disallow: /about/suspended - -# Webfinger endpoint. -Disallow: /.well-known/webfinger -` - - robotsTxtNoNodeInfo = robotsTxt + ` -# Disallow nodeinfo -Disallow: /.well-known/nodeinfo -Disallow: /nodeinfo/ -` -) - -// robotsGETHandler returns a decent robots.txt that prevents crawling -// the api, auth pages, settings pages, etc. -// -// More granular robots meta tags are then applied for web pages -// depending on user preferences (see internal/web). -func (m *Module) robotsGETHandler(c *gin.Context) { - // Allow caching for 24 hrs. - // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4 - c.Header("Cache-Control", "public, max-age=86400") - - if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { - // Serve robots.txt as-is - // without forbidding nodeinfo. - c.String(http.StatusOK, robotsTxt) - return - } - - // Disallow scraping nodeinfo. - c.String(http.StatusOK, robotsTxtNoNodeInfo) -} diff --git a/internal/web/web.go b/internal/web/web.go index cfadc9283..e5d4db4c4 100644 --- a/internal/web/web.go +++ b/internal/web/web.go @@ -95,8 +95,6 @@ func (m *Module) Route(r *router.Router, mi ...gin.HandlerFunc) { // Route static assets. routeAssets(m, r, mi...) - // Route all other endpoints + handlers. - // // Handlers that serve profiles and statuses should use // the SignatureCheck middleware, so that requests with // content-type application/activity+json can be served @@ -108,24 +106,25 @@ func (m *Module) Route(r *router.Router, mi ...gin.HandlerFunc) { profileGroup.Handle(http.MethodGet, "", m.profileGETHandler) // use empty path here since it's the base of the group profileGroup.Handle(http.MethodGet, statusPath, m.threadGETHandler) - // Individual web handlers requiring no specific middlewares. - r.AttachHandler(http.MethodGet, "/", m.indexHandler) // front-page - r.AttachHandler(http.MethodGet, settingsPathPrefix, m.SettingsPanelHandler) - r.AttachHandler(http.MethodGet, settingsPanelGlob, m.SettingsPanelHandler) - r.AttachHandler(http.MethodGet, customCSSPath, m.customCSSGETHandler) - r.AttachHandler(http.MethodGet, instanceCustomCSSPath, m.instanceCustomCSSGETHandler) - r.AttachHandler(http.MethodGet, rssFeedPath, m.rssFeedGETHandler) - r.AttachHandler(http.MethodGet, confirmEmailPath, m.confirmEmailGETHandler) - r.AttachHandler(http.MethodPost, confirmEmailPath, m.confirmEmailPOSTHandler) - r.AttachHandler(http.MethodGet, robotsPath, m.robotsGETHandler) - r.AttachHandler(http.MethodGet, aboutPath, m.aboutGETHandler) - r.AttachHandler(http.MethodGet, loginPath, m.loginGETHandler) - r.AttachHandler(http.MethodGet, domainBlockListPath, m.domainBlockListGETHandler) - r.AttachHandler(http.MethodGet, tagsPath, m.tagGETHandler) - r.AttachHandler(http.MethodGet, signupPath, m.signupGETHandler) - r.AttachHandler(http.MethodPost, signupPath, m.signupPOSTHandler) + // Group for all other web handlers. + everythingElseGroup := r.AttachGroup("") + everythingElseGroup.Use(mi...) + everythingElseGroup.Handle(http.MethodGet, "/", m.indexHandler) // front-page + everythingElseGroup.Handle(http.MethodGet, settingsPathPrefix, m.SettingsPanelHandler) + everythingElseGroup.Handle(http.MethodGet, settingsPanelGlob, m.SettingsPanelHandler) + everythingElseGroup.Handle(http.MethodGet, customCSSPath, m.customCSSGETHandler) + everythingElseGroup.Handle(http.MethodGet, instanceCustomCSSPath, m.instanceCustomCSSGETHandler) + everythingElseGroup.Handle(http.MethodGet, rssFeedPath, m.rssFeedGETHandler) + everythingElseGroup.Handle(http.MethodGet, confirmEmailPath, m.confirmEmailGETHandler) + everythingElseGroup.Handle(http.MethodPost, confirmEmailPath, m.confirmEmailPOSTHandler) + everythingElseGroup.Handle(http.MethodGet, aboutPath, m.aboutGETHandler) + everythingElseGroup.Handle(http.MethodGet, loginPath, m.loginGETHandler) + everythingElseGroup.Handle(http.MethodGet, domainBlockListPath, m.domainBlockListGETHandler) + everythingElseGroup.Handle(http.MethodGet, tagsPath, m.tagGETHandler) + everythingElseGroup.Handle(http.MethodGet, signupPath, m.signupGETHandler) + everythingElseGroup.Handle(http.MethodPost, signupPath, m.signupPOSTHandler) - // Redirects from old endpoints to for back compat. + // Redirects from old endpoints for back compat. r.AttachHandler(http.MethodGet, "/auth/edit", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, userPanelPath) }) r.AttachHandler(http.MethodGet, "/user", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, userPanelPath) }) r.AttachHandler(http.MethodGet, "/admin", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, adminPanelPath) }) -- cgit v1.2.3