From 9b50151f17b5921b68b3c413a26edf8ec6cdc6f8 Mon Sep 17 00:00:00 2001
From: Daenney
Date: Fri, 2 Aug 2024 18:22:39 +0200
Subject: [feature] Beef up our AI opt-outs (#3165)

* [chore] Synchronise our robots.txt with upstream

* [feature] Add headers to escape AI crawlers

This adds 2 headers that a number of AI crawlers respect to signal that content should not be included in their datasets.
---
 internal/web/robots.go | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'internal/web/robots.go')

diff --git a/internal/web/robots.go b/internal/web/robots.go
index 39708eb55..3309de97c 100644
--- a/internal/web/robots.go
+++ b/internal/web/robots.go
@@ -43,15 +43,24 @@ User-agent: Claude-Web
 User-agent: cohere-ai
 User-agent: Diffbot
 User-agent: FacebookBot
+User-agent: facebookexternalhit
 User-agent: FriendlyCrawler
 User-agent: Google-Extended
 User-agent: GoogleOther
+User-agent: GoogleOther-Image
+User-agent: GoogleOther-Video
 User-agent: GPTBot
 User-agent: ImagesiftBot
 User-agent: img2dataset
+User-agent: Meta-ExternalAgent
+User-agent: OAI-SearchBot
 User-agent: omgili
 User-agent: omgilibot
 User-agent: PerplexityBot
+User-agent: PetalBot
+User-agent: Scrapy
+User-agent: Timpibot
+User-agent: VelenPublicWebCrawler
 User-agent: YouBot
 Disallow: /

-- 
cgit v1.2.3