diff options
| -rw-r--r-- | cmd/gotosocial/action/server/server.go | 40 | ||||
| -rw-r--r-- | cmd/gotosocial/action/testrig/testrig.go | 2 | ||||
| -rw-r--r-- | internal/api/nodeinfo.go | 11 | ||||
| -rw-r--r-- | internal/api/robots.go | 52 | ||||
| -rw-r--r-- | internal/api/robots/robots.go | 57 | ||||
| -rw-r--r-- | internal/api/util/robots.go (renamed from internal/web/robots.go) | 44 | ||||
| -rw-r--r-- | internal/api/wellknown/hostmeta/hostmeta.go | 4 | ||||
| -rw-r--r-- | internal/api/wellknown/nodeinfo/nodeinfo.go | 58 | ||||
| -rw-r--r-- | internal/api/wellknown/nodeinfo/nodeinfoget.go | 66 | ||||
| -rw-r--r-- | internal/api/wellknown/webfinger/webfinger.go | 4 | ||||
| -rw-r--r-- | internal/middleware/extraheaders.go | 7 | ||||
| -rw-r--r-- | internal/middleware/robots.go | 67 | ||||
| -rw-r--r-- | internal/web/profile.go | 2 | ||||
| -rw-r--r-- | internal/web/web.go | 37 | ||||
| -rw-r--r-- | web/template/page.tmpl | 2 | 
15 files changed, 311 insertions, 142 deletions
diff --git a/cmd/gotosocial/action/server/server.go b/cmd/gotosocial/action/server/server.go index 6f76fb804..4060eeb7f 100644 --- a/cmd/gotosocial/action/server/server.go +++ b/cmd/gotosocial/action/server/server.go @@ -417,7 +417,8 @@ var Start action.GTSAction = func(ctx context.Context) error {  		return fmt.Errorf("error creating main router: %s", err)  	} -	// Start preparing middleware stack. +	// Start preparing global middleware +	// stack (used for every request).  	middlewares := make([]gin.HandlerFunc, 1)  	// RequestID middleware must run before tracing! @@ -499,13 +500,14 @@ var Start action.GTSAction = func(ctx context.Context) error {  		metricsModule     = api.NewMetrics()                                                 // Metrics endpoints  		healthModule      = api.NewHealth(dbService.Ready)                                   // Health check endpoints  		fileserverModule  = api.NewFileserver(process)                                       // fileserver endpoints +		robotsModule      = api.NewRobots()                                                  // robots.txt endpoint  		wellKnownModule   = api.NewWellKnown(process)                                        // .well-known endpoints  		nodeInfoModule    = api.NewNodeInfo(process)                                         // nodeinfo endpoint  		activityPubModule = api.NewActivityPub(dbService, process)                           // ActivityPub endpoints  		webModule         = web.New(dbService, process)                                      // web pages + user profiles + settings panels etc  	) -	// create required middleware +	// Create per-route / per-grouping middlewares.  	// rate limiting  	rlLimit := config.GetAdvancedRateLimitRequests()  	clLimit := middleware.RateLimit(rlLimit, config.GetAdvancedRateLimitExceptionsParsed())        // client api @@ -518,10 +520,25 @@ var Start action.GTSAction = func(ctx context.Context) error {  	retryAfter := config.GetAdvancedThrottlingRetryAfter()  	clThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // client api  	s2sThrottle := middleware.Throttle(cpuMultiplier, retryAfter) +  	// server-to-server (AP)  	fsThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // fileserver / web templates / emojis  	pkThrottle := middleware.Throttle(cpuMultiplier, retryAfter) // throttle public key endpoint separately +	// Robots http headers (x-robots-tag). +	// +	// robotsDisallowAll is used for client API + S2S endpoints +	// that definitely should never be indexed by crawlers. +	// +	// robotsDisallowAIOnly is used for utility endpoints, +	// fileserver, and for web endpoints that set their own +	// additional robots directives in HTML meta tags. +	// +	// Other endpoints like .well-known and nodeinfo handle +	// robots headers themselves based on configuration. +	robotsDisallowAll := middleware.RobotsHeaders("") +	robotsDisallowAIOnly := middleware.RobotsHeaders("aiOnly") +  	// Gzip middleware is applied to all endpoints except  	// fileserver (compression too expensive for those),  	// health (which really doesn't need compression), and @@ -531,17 +548,18 @@ var Start action.GTSAction = func(ctx context.Context) error {  	// these should be routed in order;  	// apply throttling *after* rate limiting -	authModule.Route(route, clLimit, clThrottle, gzip) -	clientModule.Route(route, clLimit, clThrottle, gzip) -	metricsModule.Route(route, clLimit, clThrottle) -	healthModule.Route(route, clLimit, clThrottle) -	fileserverModule.Route(route, fsMainLimit, fsThrottle) -	fileserverModule.RouteEmojis(route, instanceAccount.ID, fsEmojiLimit, fsThrottle) +	authModule.Route(route, clLimit, clThrottle, robotsDisallowAll, gzip) +	clientModule.Route(route, clLimit, clThrottle, robotsDisallowAll, gzip) +	metricsModule.Route(route, clLimit, clThrottle, robotsDisallowAIOnly) +	healthModule.Route(route, clLimit, clThrottle, robotsDisallowAIOnly) +	fileserverModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly) +	fileserverModule.RouteEmojis(route, instanceAccount.ID, fsEmojiLimit, fsThrottle, robotsDisallowAIOnly) +	robotsModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly, gzip)  	wellKnownModule.Route(route, gzip, s2sLimit, s2sThrottle)  	nodeInfoModule.Route(route, s2sLimit, s2sThrottle, gzip) -	activityPubModule.Route(route, s2sLimit, s2sThrottle, gzip) -	activityPubModule.RoutePublicKey(route, s2sLimit, pkThrottle, gzip) -	webModule.Route(route, fsMainLimit, fsThrottle, gzip) +	activityPubModule.Route(route, s2sLimit, s2sThrottle, robotsDisallowAll, gzip) +	activityPubModule.RoutePublicKey(route, s2sLimit, pkThrottle, robotsDisallowAll, gzip) +	webModule.Route(route, fsMainLimit, fsThrottle, robotsDisallowAIOnly, gzip)  	// Finally start the main http server!  	if err := route.Start(); err != nil { diff --git a/cmd/gotosocial/action/testrig/testrig.go b/cmd/gotosocial/action/testrig/testrig.go index d91758767..7de3f78a1 100644 --- a/cmd/gotosocial/action/testrig/testrig.go +++ b/cmd/gotosocial/action/testrig/testrig.go @@ -284,6 +284,7 @@ var Start action.GTSAction = func(ctx context.Context) error {  		metricsModule     = api.NewMetrics()                                                  // Metrics endpoints  		healthModule      = api.NewHealth(state.DB.Ready)                                     // Health check endpoints  		fileserverModule  = api.NewFileserver(processor)                                      // fileserver endpoints +		robotsModule      = api.NewRobots()                                                   // robots.txt endpoint  		wellKnownModule   = api.NewWellKnown(processor)                                       // .well-known endpoints  		nodeInfoModule    = api.NewNodeInfo(processor)                                        // nodeinfo endpoint  		activityPubModule = api.NewActivityPub(state.DB, processor)                           // ActivityPub endpoints @@ -297,6 +298,7 @@ var Start action.GTSAction = func(ctx context.Context) error {  	healthModule.Route(route)  	fileserverModule.Route(route)  	fileserverModule.RouteEmojis(route, instanceAccount.ID) +	robotsModule.Route(route)  	wellKnownModule.Route(route)  	nodeInfoModule.Route(route)  	activityPubModule.Route(route) diff --git a/internal/api/nodeinfo.go b/internal/api/nodeinfo.go index 29942aba4..2f0c234fd 100644 --- a/internal/api/nodeinfo.go +++ b/internal/api/nodeinfo.go @@ -20,6 +20,7 @@ package api  import (  	"github.com/gin-gonic/gin"  	"github.com/superseriousbusiness/gotosocial/internal/api/nodeinfo" +	"github.com/superseriousbusiness/gotosocial/internal/config"  	"github.com/superseriousbusiness/gotosocial/internal/middleware"  	"github.com/superseriousbusiness/gotosocial/internal/processing"  	"github.com/superseriousbusiness/gotosocial/internal/router" @@ -43,6 +44,16 @@ func (w *NodeInfo) Route(r *router.Router, m ...gin.HandlerFunc) {  		}),  	) +	// If instance is configured to serve instance stats +	// faithfully at nodeinfo, we should allow robots to +	// crawl nodeinfo endpoints in a limited capacity. +	// In all other cases, disallow everything. +	if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { +		nodeInfoGroup.Use(middleware.RobotsHeaders("allowSome")) +	} else { +		nodeInfoGroup.Use(middleware.RobotsHeaders("")) +	} +  	w.nodeInfo.Route(nodeInfoGroup.Handle)  } diff --git a/internal/api/robots.go b/internal/api/robots.go new file mode 100644 index 000000000..3ed8282f5 --- /dev/null +++ b/internal/api/robots.go @@ -0,0 +1,52 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program.  If not, see <http://www.gnu.org/licenses/>. + +package api + +import ( +	"github.com/gin-gonic/gin" +	"github.com/superseriousbusiness/gotosocial/internal/api/robots" +	"github.com/superseriousbusiness/gotosocial/internal/middleware" +	"github.com/superseriousbusiness/gotosocial/internal/router" +) + +type Robots struct { +	robots *robots.Module +} + +func (rb *Robots) Route(r *router.Router, m ...gin.HandlerFunc) { +	// Create a group so we can attach middlewares. +	robotsGroup := r.AttachGroup("robots.txt") + +	// Use passed-in middlewares. +	robotsGroup.Use(m...) + +	// Allow caching for 24 hrs. +	// https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4 +	robotsGroup.Use( +		middleware.CacheControl(middleware.CacheControlConfig{ +			Directives: []string{"public", "max-age=86400"}, +			Vary:       []string{"Accept-Encoding"}, +		}), +	) + +	rb.robots.Route(robotsGroup.Handle) +} + +func NewRobots() *Robots { +	return &Robots{} +} diff --git a/internal/api/robots/robots.go b/internal/api/robots/robots.go new file mode 100644 index 000000000..98db4682d --- /dev/null +++ b/internal/api/robots/robots.go @@ -0,0 +1,57 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program.  If not, see <http://www.gnu.org/licenses/>. + +package robots + +import ( +	"net/http" + +	"github.com/gin-gonic/gin" +	apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" +	"github.com/superseriousbusiness/gotosocial/internal/config" +) + +type Module struct{} + +func New() *Module { +	return &Module{} +} + +func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { +	// Serve different robots.txt file depending on instance +	// stats mode: Don't disallow scraping nodeinfo if admin +	// has opted in to serving accurate stats there. In all +	// other cases, disallow scraping nodeinfo. +	var handler gin.HandlerFunc +	if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { +		handler = m.robotsGETHandler +	} else { +		handler = m.robotsGETHandlerDisallowNodeInfo +	} + +	// Attach handler at empty path as this +	// is already grouped under /robots.txt. +	attachHandler(http.MethodGet, "", handler) +} + +func (m *Module) robotsGETHandler(c *gin.Context) { +	c.String(http.StatusOK, apiutil.RobotsTxt) +} + +func (m *Module) robotsGETHandlerDisallowNodeInfo(c *gin.Context) { +	c.String(http.StatusOK, apiutil.RobotsTxtDisallowNodeInfo) +} diff --git a/internal/web/robots.go b/internal/api/util/robots.go index 524550642..49fb04561 100644 --- a/internal/web/robots.go +++ b/internal/api/util/robots.go @@ -15,19 +15,17 @@  // You should have received a copy of the GNU Affero General Public License  // along with this program.  If not, see <http://www.gnu.org/licenses/>. -package web - -import ( -	"net/http" - -	"github.com/gin-gonic/gin" -	"github.com/superseriousbusiness/gotosocial/internal/config" -) +package util +// See: +// +//   - https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta +//   - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag +//   - https://www.rfc-editor.org/rfc/rfc9309.html  const ( -	robotsPath          = "/robots.txt" -	robotsMetaAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" // https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta -	robotsTxt           = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go +	RobotsDirectivesDisallow  = "noindex, nofollow" +	RobotsDirectivesAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" +	RobotsTxt                 = `# GoToSocial robots.txt -- to edit, see internal/api/util/robots.go  # More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro  # AI scrapers and the like. @@ -127,31 +125,9 @@ Disallow: /about/suspended  # Webfinger endpoint.  Disallow: /.well-known/webfinger  ` - -	robotsTxtNoNodeInfo = robotsTxt + ` +	RobotsTxtDisallowNodeInfo = RobotsTxt + `  # Disallow nodeinfo  Disallow: /.well-known/nodeinfo  Disallow: /nodeinfo/  `  ) - -// robotsGETHandler returns a decent robots.txt that prevents crawling -// the api, auth pages, settings pages, etc. -// -// More granular robots meta tags are then applied for web pages -// depending on user preferences (see internal/web). -func (m *Module) robotsGETHandler(c *gin.Context) { -	// Allow caching for 24 hrs. -	// https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4 -	c.Header("Cache-Control", "public, max-age=86400") - -	if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { -		// Serve robots.txt as-is -		// without forbidding nodeinfo. -		c.String(http.StatusOK, robotsTxt) -		return -	} - -	// Disallow scraping nodeinfo. -	c.String(http.StatusOK, robotsTxtNoNodeInfo) -} diff --git a/internal/api/wellknown/hostmeta/hostmeta.go b/internal/api/wellknown/hostmeta/hostmeta.go index cb439fcd3..43c6b161e 100644 --- a/internal/api/wellknown/hostmeta/hostmeta.go +++ b/internal/api/wellknown/hostmeta/hostmeta.go @@ -21,6 +21,7 @@ import (  	"net/http"  	"github.com/gin-gonic/gin" +	"github.com/superseriousbusiness/gotosocial/internal/middleware"  	"github.com/superseriousbusiness/gotosocial/internal/processing"  ) @@ -40,5 +41,6 @@ func New(processor *processing.Processor) *Module {  }  func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { -	attachHandler(http.MethodGet, HostMetaPath, m.HostMetaGETHandler) +	// Attach handler, injecting robots http header middleware to disallow all. +	attachHandler(http.MethodGet, HostMetaPath, middleware.RobotsHeaders(""), m.HostMetaGETHandler)  } diff --git a/internal/api/wellknown/nodeinfo/nodeinfo.go b/internal/api/wellknown/nodeinfo/nodeinfo.go index 9012006f4..270dde2b1 100644 --- a/internal/api/wellknown/nodeinfo/nodeinfo.go +++ b/internal/api/wellknown/nodeinfo/nodeinfo.go @@ -21,6 +21,10 @@ import (  	"net/http"  	"github.com/gin-gonic/gin" +	apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" +	"github.com/superseriousbusiness/gotosocial/internal/config" +	"github.com/superseriousbusiness/gotosocial/internal/gtserror" +	"github.com/superseriousbusiness/gotosocial/internal/middleware"  	"github.com/superseriousbusiness/gotosocial/internal/processing"  ) @@ -42,5 +46,57 @@ func New(processor *processing.Processor) *Module {  }  func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { -	attachHandler(http.MethodGet, NodeInfoWellKnownPath, m.NodeInfoWellKnownGETHandler) +	// If instance is configured to serve instance stats +	// faithfully at nodeinfo, we should allow robots to +	// crawl nodeinfo endpoints in a limited capacity. +	// In all other cases, disallow everything. +	var robots gin.HandlerFunc +	if config.GetInstanceStatsMode() == config.InstanceStatsModeServe { +		robots = middleware.RobotsHeaders("allowSome") +	} else { +		robots = middleware.RobotsHeaders("") +	} + +	// Attach handler, injecting robots http header middleware. +	attachHandler(http.MethodGet, NodeInfoWellKnownPath, robots, m.NodeInfoWellKnownGETHandler) +} + +// NodeInfoWellKnownGETHandler swagger:operation GET /.well-known/nodeinfo nodeInfoWellKnownGet +// +// Returns a well-known response which redirects callers to `/nodeinfo/2.0`. +// +// eg. `{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"http://example.org/nodeinfo/2.0"}]}` +// See: https://nodeinfo.diaspora.software/protocol.html +// +//	--- +//	tags: +//	- .well-known +// +//	produces: +//	- application/json +// +//	responses: +//		'200': +//			schema: +//				"$ref": "#/definitions/wellKnownResponse" +func (m *Module) NodeInfoWellKnownGETHandler(c *gin.Context) { +	if _, err := apiutil.NegotiateAccept(c, apiutil.JSONAcceptHeaders...); err != nil { +		apiutil.ErrorHandler(c, gtserror.NewErrorNotAcceptable(err, err.Error()), m.processor.InstanceGetV1) +		return +	} + +	resp, errWithCode := m.processor.Fedi().NodeInfoRelGet(c.Request.Context()) +	if errWithCode != nil { +		apiutil.ErrorHandler(c, errWithCode, m.processor.InstanceGetV1) +		return +	} + +	// Encode JSON HTTP response. +	apiutil.EncodeJSONResponse( +		c.Writer, +		c.Request, +		http.StatusOK, +		apiutil.AppJSON, +		resp, +	)  } diff --git a/internal/api/wellknown/nodeinfo/nodeinfoget.go b/internal/api/wellknown/nodeinfo/nodeinfoget.go deleted file mode 100644 index c458f131e..000000000 --- a/internal/api/wellknown/nodeinfo/nodeinfoget.go +++ /dev/null @@ -1,66 +0,0 @@ -// GoToSocial -// Copyright (C) GoToSocial Authors admin@gotosocial.org -// SPDX-License-Identifier: AGPL-3.0-or-later -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU Affero General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the -// GNU Affero General Public License for more details. -// -// You should have received a copy of the GNU Affero General Public License -// along with this program.  If not, see <http://www.gnu.org/licenses/>. - -package nodeinfo - -import ( -	"net/http" - -	"github.com/gin-gonic/gin" -	apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" -	"github.com/superseriousbusiness/gotosocial/internal/gtserror" -) - -// NodeInfoWellKnownGETHandler swagger:operation GET /.well-known/nodeinfo nodeInfoWellKnownGet -// -// Returns a well-known response which redirects callers to `/nodeinfo/2.0`. -// -// eg. `{"links":[{"rel":"http://nodeinfo.diaspora.software/ns/schema/2.0","href":"http://example.org/nodeinfo/2.0"}]}` -// See: https://nodeinfo.diaspora.software/protocol.html -// -//	--- -//	tags: -//	- .well-known -// -//	produces: -//	- application/json -// -//	responses: -//		'200': -//			schema: -//				"$ref": "#/definitions/wellKnownResponse" -func (m *Module) NodeInfoWellKnownGETHandler(c *gin.Context) { -	if _, err := apiutil.NegotiateAccept(c, apiutil.JSONAcceptHeaders...); err != nil { -		apiutil.ErrorHandler(c, gtserror.NewErrorNotAcceptable(err, err.Error()), m.processor.InstanceGetV1) -		return -	} - -	resp, errWithCode := m.processor.Fedi().NodeInfoRelGet(c.Request.Context()) -	if errWithCode != nil { -		apiutil.ErrorHandler(c, errWithCode, m.processor.InstanceGetV1) -		return -	} - -	// Encode JSON HTTP response. -	apiutil.EncodeJSONResponse( -		c.Writer, -		c.Request, -		http.StatusOK, -		apiutil.AppJSON, -		resp, -	) -} diff --git a/internal/api/wellknown/webfinger/webfinger.go b/internal/api/wellknown/webfinger/webfinger.go index a50013b32..c70afab9d 100644 --- a/internal/api/wellknown/webfinger/webfinger.go +++ b/internal/api/wellknown/webfinger/webfinger.go @@ -21,6 +21,7 @@ import (  	"net/http"  	"github.com/gin-gonic/gin" +	"github.com/superseriousbusiness/gotosocial/internal/middleware"  	"github.com/superseriousbusiness/gotosocial/internal/processing"  ) @@ -41,5 +42,6 @@ func New(processor *processing.Processor) *Module {  }  func (m *Module) Route(attachHandler func(method string, path string, f ...gin.HandlerFunc) gin.IRoutes) { -	attachHandler(http.MethodGet, WebfingerBasePath, m.WebfingerGETRequest) +	// Attach handler, injecting robots http header middleware to disallow all. +	attachHandler(http.MethodGet, WebfingerBasePath, middleware.RobotsHeaders(""), m.WebfingerGETRequest)  } diff --git a/internal/middleware/extraheaders.go b/internal/middleware/extraheaders.go index fb91bcc93..c75b65551 100644 --- a/internal/middleware/extraheaders.go +++ b/internal/middleware/extraheaders.go @@ -44,12 +44,5 @@ func ExtraHeaders() gin.HandlerFunc {  		//  		// See: https://github.com/patcg-individual-drafts/topics  		c.Header("Permissions-Policy", "browsing-topics=()") - -		// Some AI scrapers respect the following tags to opt-out -		// of their crawling and datasets. -		c.Header("X-Robots-Tag", "noimageai") -		// c.Header calls .Set(), but we want to emit the header -		// twice, not override it. -		c.Writer.Header().Add("X-Robots-Tag", "noai")  	}  } diff --git a/internal/middleware/robots.go b/internal/middleware/robots.go new file mode 100644 index 000000000..fefd93be0 --- /dev/null +++ b/internal/middleware/robots.go @@ -0,0 +1,67 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program.  If not, see <http://www.gnu.org/licenses/>. + +package middleware + +import ( +	"github.com/gin-gonic/gin" +	apiutil "github.com/superseriousbusiness/gotosocial/internal/api/util" +) + +// RobotsHeaders adds robots directives to the X-Robots-Tag HTTP header. +// https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/X-Robots-Tag +// +// If mode == "aiOnly" then only the noai and noimageai values will be set, +// and other headers will be left alone (for route groups / handlers to set). +// +// If mode == "allowSome" then noai, noimageai, and some indexing will be set. +// +// If mode == "" then noai, noimageai, noindex, and nofollow will be set +// (ie., as restrictive as possible). +func RobotsHeaders(mode string) gin.HandlerFunc { +	const ( +		key = "X-Robots-Tag" +		// Some AI scrapers respect the following tags +		// to opt-out of their crawling and datasets. +		// We add them regardless of allowSome. +		noai = "noai, noimageai" +	) + +	switch mode { + +	// Just set ai headers and +	// leave the other headers be. +	case "aiOnly": +		return func(c *gin.Context) { +			c.Writer.Header().Set(key, noai) +		} + +	// Allow some limited indexing. +	case "allowSome": +		return func(c *gin.Context) { +			c.Writer.Header().Set(key, apiutil.RobotsDirectivesAllowSome) +			c.Writer.Header().Add(key, noai) +		} + +	// Disallow indexing via noindex, nofollow. +	default: +		return func(c *gin.Context) { +			c.Writer.Header().Set(key, apiutil.RobotsDirectivesDisallow) +			c.Writer.Header().Add(key, noai) +		} +	} +} diff --git a/internal/web/profile.go b/internal/web/profile.go index a6d96a9ea..cf12ca33a 100644 --- a/internal/web/profile.go +++ b/internal/web/profile.go @@ -103,7 +103,7 @@ func (m *Module) profileGETHandler(c *gin.Context) {  	// index if account is discoverable.  	var robotsMeta string  	if targetAccount.Discoverable { -		robotsMeta = robotsMetaAllowSome +		robotsMeta = apiutil.RobotsDirectivesAllowSome  	}  	// We need to change our response slightly if the diff --git a/internal/web/web.go b/internal/web/web.go index cfadc9283..e5d4db4c4 100644 --- a/internal/web/web.go +++ b/internal/web/web.go @@ -95,8 +95,6 @@ func (m *Module) Route(r *router.Router, mi ...gin.HandlerFunc) {  	// Route static assets.  	routeAssets(m, r, mi...) -	// Route all other endpoints + handlers. -	//  	// Handlers that serve profiles and statuses should use  	// the SignatureCheck middleware, so that requests with  	// content-type application/activity+json can be served @@ -108,24 +106,25 @@ func (m *Module) Route(r *router.Router, mi ...gin.HandlerFunc) {  	profileGroup.Handle(http.MethodGet, "", m.profileGETHandler) // use empty path here since it's the base of the group  	profileGroup.Handle(http.MethodGet, statusPath, m.threadGETHandler) -	// Individual web handlers requiring no specific middlewares. -	r.AttachHandler(http.MethodGet, "/", m.indexHandler) // front-page -	r.AttachHandler(http.MethodGet, settingsPathPrefix, m.SettingsPanelHandler) -	r.AttachHandler(http.MethodGet, settingsPanelGlob, m.SettingsPanelHandler) -	r.AttachHandler(http.MethodGet, customCSSPath, m.customCSSGETHandler) -	r.AttachHandler(http.MethodGet, instanceCustomCSSPath, m.instanceCustomCSSGETHandler) -	r.AttachHandler(http.MethodGet, rssFeedPath, m.rssFeedGETHandler) -	r.AttachHandler(http.MethodGet, confirmEmailPath, m.confirmEmailGETHandler) -	r.AttachHandler(http.MethodPost, confirmEmailPath, m.confirmEmailPOSTHandler) -	r.AttachHandler(http.MethodGet, robotsPath, m.robotsGETHandler) -	r.AttachHandler(http.MethodGet, aboutPath, m.aboutGETHandler) -	r.AttachHandler(http.MethodGet, loginPath, m.loginGETHandler) -	r.AttachHandler(http.MethodGet, domainBlockListPath, m.domainBlockListGETHandler) -	r.AttachHandler(http.MethodGet, tagsPath, m.tagGETHandler) -	r.AttachHandler(http.MethodGet, signupPath, m.signupGETHandler) -	r.AttachHandler(http.MethodPost, signupPath, m.signupPOSTHandler) +	// Group for all other web handlers. +	everythingElseGroup := r.AttachGroup("") +	everythingElseGroup.Use(mi...) +	everythingElseGroup.Handle(http.MethodGet, "/", m.indexHandler) // front-page +	everythingElseGroup.Handle(http.MethodGet, settingsPathPrefix, m.SettingsPanelHandler) +	everythingElseGroup.Handle(http.MethodGet, settingsPanelGlob, m.SettingsPanelHandler) +	everythingElseGroup.Handle(http.MethodGet, customCSSPath, m.customCSSGETHandler) +	everythingElseGroup.Handle(http.MethodGet, instanceCustomCSSPath, m.instanceCustomCSSGETHandler) +	everythingElseGroup.Handle(http.MethodGet, rssFeedPath, m.rssFeedGETHandler) +	everythingElseGroup.Handle(http.MethodGet, confirmEmailPath, m.confirmEmailGETHandler) +	everythingElseGroup.Handle(http.MethodPost, confirmEmailPath, m.confirmEmailPOSTHandler) +	everythingElseGroup.Handle(http.MethodGet, aboutPath, m.aboutGETHandler) +	everythingElseGroup.Handle(http.MethodGet, loginPath, m.loginGETHandler) +	everythingElseGroup.Handle(http.MethodGet, domainBlockListPath, m.domainBlockListGETHandler) +	everythingElseGroup.Handle(http.MethodGet, tagsPath, m.tagGETHandler) +	everythingElseGroup.Handle(http.MethodGet, signupPath, m.signupGETHandler) +	everythingElseGroup.Handle(http.MethodPost, signupPath, m.signupPOSTHandler) -	// Redirects from old endpoints to for back compat. +	// Redirects from old endpoints for back compat.  	r.AttachHandler(http.MethodGet, "/auth/edit", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, userPanelPath) })  	r.AttachHandler(http.MethodGet, "/user", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, userPanelPath) })  	r.AttachHandler(http.MethodGet, "/admin", func(c *gin.Context) { c.Redirect(http.StatusMovedPermanently, adminPanelPath) }) diff --git a/web/template/page.tmpl b/web/template/page.tmpl index 52599a531..fad0fc3b9 100644 --- a/web/template/page.tmpl +++ b/web/template/page.tmpl @@ -47,7 +47,7 @@ image/webp          <meta charset="UTF-8">          <meta http-equiv="X-UA-Compatible" content="IE=edge">          <meta name="viewport" content="width=device-width, initial-scale=1.0"> -        <meta name="robots" content="{{- if .robotsMeta -}}{{- .robotsMeta -}}{{- else -}}noindex, nofollow{{- end -}}"> +        <meta name="robots" content="{{- if .robotsMeta -}}{{- .robotsMeta -}}{{- else -}}noindex, nofollow, noai, noimageai{{- end -}}">          {{- if .ogMeta }}          {{- include "page_ogmeta.tmpl" . | indent 2 }}          {{- else }}  | 
