From 941893a774c83802afdc4cc76e1d30c59b6c5585 Mon Sep 17 00:00:00 2001
From: tobi <31960611+tsmethurst@users.noreply.github.com>
Date: Mon, 2 Jan 2023 13:10:50 +0100
Subject: [chore] The Big Middleware and API Refactor (tm) (#1250)

* interim commit: start refactoring middlewares into package under router

* another interim commit, this is becoming a big job

* another fucking massive interim commit

* refactor bookmarks to new style

* ambassador, wiz zeze commits you are spoiling uz

* she compiles, we're getting there

* we're just normal men; we're just innocent men

* apiutil

* whoopsie

* i'm glad noone reads commit msgs haha :blob_sweat:

* use that weirdo go-bytesize library for maxMultipartMemory

* fix media module paths
---
 internal/web/robots.go | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

(limited to 'internal/web/robots.go')

diff --git a/internal/web/robots.go b/internal/web/robots.go
index c3307d068..0babb31b7 100644
--- a/internal/web/robots.go
+++ b/internal/web/robots.go
@@ -18,7 +18,45 @@
 
 package web
 
-// https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta
+import (
+	"net/http"
+
+	"github.com/gin-gonic/gin"
+)
+
 const (
-	robotsAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard"
+	robotsPath          = "/robots.txt"
+	robotsMetaAllowSome = "nofollow, noarchive, nositelinkssearchbox, max-image-preview:standard" // https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag#robotsmeta
+	robotsTxt           = `# GoToSocial robots.txt -- to edit, see internal/web/robots.go
+# more info @ https://developers.google.com/search/docs/crawling-indexing/robots/intro
+User-agent: *
+Crawl-delay: 500
+# api stuff
+Disallow: /api/
+# auth/login stuff
+Disallow: /auth/
+Disallow: /oauth/
+Disallow: /check_your_email
+Disallow: /wait_for_approval
+Disallow: /account_disabled
+# well known stuff
+Disallow: /.well-known/
+# files
+Disallow: /fileserver/
+# s2s AP stuff
+Disallow: /users/
+Disallow: /emoji/
+# panels
+Disallow: /admin
+Disallow: /user
+Disallow: /settings/`
 )
+
+// robotsGETHandler returns a decent robots.txt that prevents crawling
+// the api, auth pages, settings pages, etc.
+//
+// More granular robots meta tags are then applied for web pages
+// depending on user preferences (see internal/web).
+func (m *Module) robotsGETHandler(c *gin.Context) {
+	c.String(http.StatusOK, robotsTxt)
+}
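
This path-limited view doesn't show where robotsGETHandler actually gets attached to a route; that wiring happens elsewhere in the commit. A minimal sketch of what the registration could look like with gin, reusing the Module type, robotsPath constant, and handler from the diff above (the Route method and its *gin.Engine parameter are illustrative assumptions, not taken from this patch):

package web

import "github.com/gin-gonic/gin"

// Route is a hypothetical sketch of route registration: it binds
// GET /robots.txt to the handler added in this commit. The real
// registration lives outside this file, so the names here are assumed.
func (m *Module) Route(r *gin.Engine) {
	r.GET(robotsPath, m.robotsGETHandler)
}

With something like that in place, a GET to /robots.txt returns the robotsTxt string with a 200 status, while the separate robotsMetaAllowSome constant feeds the per-page robots meta tags mentioned in the handler's doc comment rather than robots.txt itself.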