summaryrefslogtreecommitdiff
path: root/internal/web/robots.go
diff options
context:
space:
mode:
Diffstat (limited to 'internal/web/robots.go')
-rw-r--r-- internal/web/robots.go 30
1 files changed, 30 insertions, 0 deletions
diff --git a/internal/web/robots.go b/internal/web/robots.go
index a79c31034..aee4d1a55 100644
--- a/internal/web/robots.go
+++ b/internal/web/robots.go
@@ -34,6 +34,36 @@ const (
User-agent: GPTBot
Disallow: /
+# As of September 2023, GPTBot and ChatGPT-User are equivalent. But there's no telling
+# when OpenAI might decide to change that, so block this one too.
+User-agent: ChatGPT-User
+Disallow: /
+
+# And a giant fuck you to Google Bard and their other generative AI ventures too.
+# https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers
+User-agent: Google-Extended
+Disallow: /
+
+# Block CommonCrawl. Used in training LLMs and specifically GPT-3.
+# https://commoncrawl.org/faq
+User-agent: CCBot
+Disallow: /
+
+# Block Omgili/Webz.io, a "Big Web Data" engine.
+# https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/
+User-agent: Omgilibot
+Disallow: /
+
+# Block FacebookBot, because Meta.
+# https://developers.facebook.com/docs/sharing/bot
+User-agent: FacebookBot
+Disallow: /
+
+# Well-known.dev crawler. Indexes stuff under /.well-known.
+# https://well-known.dev/about/
+User-agent: WellKnownBot
+Disallow: /
+
# Rules for everything else.
User-agent: *
Crawl-delay: 500