author     tobi <31960611+tsmethurst@users.noreply.github.com>  2025-02-05 12:47:13 +0100
committer  GitHub <noreply@github.com>  2025-02-05 12:47:13 +0100
commit     baed591a1d19942ec553baed41a8048ab9dd18ca (patch)
tree       b8c91d4f193ab2a80e71f222fb1bda4bb775805b /internal/web/robots.go
parent     [bugfix] wrong nodeinfo version (tobi is a boob) (#3735) (diff)
download   gotosocial-baed591a1d19942ec553baed41a8048ab9dd18ca.tar.xz
[feature] Use `X-Robots-Tag` headers to instruct scrapers/crawlers (#3737)
* [feature] Use `X-Robots-Tag` headers to instruct scrapers/crawlers
* use switch for RobotsHeaders
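
This change drops the served robots.txt below in favour of setting `X-Robots-Tag` response headers, selected with a switch over the configured robots policy. The snippet that follows is only a minimal sketch of that idea, not the code added by this commit: the robotsPolicy type, its constants, and robotsHeadersMiddleware are hypothetical names used for illustration.

// Minimal sketch: choose an X-Robots-Tag header per response with a switch,
// in the spirit of "use switch for RobotsHeaders". Names are hypothetical.
package web

import "github.com/gin-gonic/gin"

type robotsPolicy int

const (
	robotsAllowSome robotsPolicy = iota // allow indexing with restrictions
	robotsDenyAll                       // forbid indexing entirely
)

// robotsHeadersMiddleware writes an X-Robots-Tag header on every response.
func robotsHeadersMiddleware(policy robotsPolicy) gin.HandlerFunc {
	return func(c *gin.Context) {
		switch policy {
		case robotsAllowSome:
			// Same directives the old robotsMetaAllowSome constant carried.
			c.Header("X-Robots-Tag", "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard")
		case robotsDenyAll:
			c.Header("X-Robots-Tag", "noindex, nofollow")
		}
		c.Next()
	}
}

Crawlers that honour robots meta tags treat `X-Robots-Tag` headers as equivalent, so the header can carry the same directives the old meta-tag constant used while also covering non-HTML responses.
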
Diffstat (limited to 'internal/web/robots.go')
-rw-r--r--  internal/web/robots.go  157
1 file changed, 0 insertions, 157 deletions
diff --git a/internal/web/robots.go b/internal/web/robots.go
deleted file mode 100644
index 524550642..000000000
--- a/internal/web/robots.go
+++ /dev/null
@@ -1,157 +0,0 @@
-// GoToSocial
-// Copyright (C) GoToSocial Authors admin@gotosocial.org
-// SPDX-License-Identifier: AGPL-3.0-or-later
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-package web
-
-import (
- "net/http"
-
- "github.com/gin-gonic/gin"
- "github.com/superseriousbusiness/gotosocial/internal/config"
-)
-
-const (
- robotsPath = "/robots.txt"
- robotsMetaAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" // https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta
- robotsTxt = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go
-# More info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro
-
-# AI scrapers and the like.
-# https://github.com/ai-robots-txt/ai.robots.txt/
-User-agent: AI2Bot
-User-agent: Ai2Bot-Dolma
-User-agent: Amazonbot
-User-agent: anthropic-ai
-User-agent: Applebot
-User-agent: Applebot-Extended
-User-agent: Bytespider
-User-agent: CCBot
-User-agent: ChatGPT-User
-User-agent: ClaudeBot
-User-agent: Claude-Web
-User-agent: cohere-ai
-User-agent: cohere-training-data-crawler
-User-agent: Diffbot
-User-agent: DuckAssistBot
-User-agent: FacebookBot
-User-agent: FriendlyCrawler
-User-agent: Google-Extended
-User-agent: GoogleOther
-User-agent: GoogleOther-Image
-User-agent: GoogleOther-Video
-User-agent: GPTBot
-User-agent: iaskspider/2.0
-User-agent: ICC-Crawler
-User-agent: ImagesiftBot
-User-agent: img2dataset
-User-agent: ISSCyberRiskCrawler
-User-agent: Kangaroo Bot
-User-agent: Meta-ExternalAgent
-User-agent: Meta-ExternalFetcher
-User-agent: OAI-SearchBot
-User-agent: omgili
-User-agent: omgilibot
-User-agent: PanguBot
-User-agent: PerplexityBot
-User-agent: PetalBot
-User-agent: Scrapy
-User-agent: Sidetrade indexer bot
-User-agent: Timpibot
-User-agent: VelenPublicWebCrawler
-User-agent: Webzio-Extended
-User-agent: YouBot
-Disallow: /
-
-# Marketing/SEO "intelligence" data scrapers
-User-agent: AwarioRssBot
-User-agent: AwarioSmartBot
-User-agent: DataForSeoBot
-User-agent: magpie-crawler
-User-agent: Meltwater
-User-agent: peer39_crawler
-User-agent: peer39_crawler/1.0
-User-agent: PiplBot
-User-agent: scoop.it
-User-agent: Seekr
-Disallow: /
-
-# Well-known.dev crawler. Indexes stuff under /.well-known.
-# https://well-known.dev/about/
-User-agent: WellKnownBot
-Disallow: /
-
-# Rules for everything else.
-User-agent: *
-Crawl-delay: 500
-
-# API endpoints.
-Disallow: /api/
-
-# Auth/Sign in endpoints.
-Disallow: /auth/
-Disallow: /oauth/
-Disallow: /check_your_email
-Disallow: /wait_for_approval
-Disallow: /account_disabled
-Disallow: /signup
-
-# Fileserver/media.
-Disallow: /fileserver/
-
-# Fedi S2S API endpoints.
-Disallow: /users/
-Disallow: /emoji/
-
-# Settings panels.
-Disallow: /admin
-Disallow: /user
-Disallow: /settings/
-
-# Domain blocklist.
-Disallow: /about/suspended
-
-# Webfinger endpoint.
-Disallow: /.well-known/webfinger
-`
-
- robotsTxtNoNodeInfo = robotsTxt + `
-# Disallow nodeinfo
-Disallow: /.well-known/nodeinfo
-Disallow: /nodeinfo/
-`
-)
-
-// robotsGETHandler returns a decent robots.txt that prevents crawling
-// the api, auth pages, settings pages, etc.
-//
-// More granular robots meta tags are then applied for web pages
-// depending on user preferences (see internal/web).
-func (m *Module) robotsGETHandler(c *gin.Context) {
- // Allow caching for 24 hrs.
- // https://www.rfc-editor.org/rfc/rfc9309.html#section-2.4
- c.Header("Cache-Control", "public, max-age=86400")
-
- if config.GetInstanceStatsMode() == config.InstanceStatsModeServe {
- // Serve robots.txt as-is
- // without forbidding nodeinfo.
- c.String(http.StatusOK, robotsTxt)
- return
- }
-
- // Disallow scraping nodeinfo.
- c.String(http.StatusOK, robotsTxtNoNodeInfo)
-}
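
For reference, the deleted handler would have been registered against robotsPath somewhere in the module's route setup. A minimal sketch of that wiring, assuming a plain gin engine, could look like the following; the actual registration in internal/web may have differed.

// Hypothetical wiring for the handler above; attachRobotsRoute is an
// illustrative name, not a function from the deleted file.
func (m *Module) attachRobotsRoute(r *gin.Engine) {
	// Serve the static robots.txt from robotsPath ("/robots.txt").
	r.GET(robotsPath, m.robotsGETHandler)
}
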