diff options
| author | 2025-04-28 20:12:27 +0000 | |
|---|---|---|
| committer | 2025-04-28 20:12:27 +0000 | |
| commit | d8c4d9fc5a62741f0c4c2b692a3a94874714bbcc (patch) | |
| tree | b64e5f1a635149db4b549fecd09437e9874572ad /internal | |
| parent | [chore/docs] add symmetry to the politics (#4081) (diff) | |
| download | gotosocial-d8c4d9fc5a62741f0c4c2b692a3a94874714bbcc.tar.xz | |
[feature] proof of work scraper deterrence (#4043)
This adds proof-of-work based scraper deterrence to GoToSocial's middleware stack for profile and status web pages. Heavily inspired by https://github.com/TecharoHQ/anubis, but massively stripped back for our own use case.
Todo:
- ~~add configuration option so this is disabled by default~~
- ~~fix whatever weirdness is preventing this working with CSP (even in debug)~~
- ~~use our standard templating mechanism going through apiutil helper func~~
- ~~probably some absurdly small performance improvements to be made in pooling re-used hex encode / hash encode buffers~~ — the web endpoints aren't as hot a path as API / ActivityPub, so this will be left as-is for now, as it is already very minimal and well optimized
- ~~verify the cryptographic assumptions re: using a portion of token as challenge data~~ — this isn't a serious application of cryptography; if it turns out to be a problem we'll fix it, but it should not be feasible to guess the rest of a SHA-256 hash from its first quarter, even if knowing that portion mathematically makes it slightly easier
- ~~theme / make look nice??~~
- ~~add a spinner~~
- ~~add entry in example configuration~~
- ~~add documentation~~
Verification page originally based on https://github.com/LucienV1/powtect
Co-authored-by: tobi <tobi.smethurst@protonmail.com>
Reviewed-on: https://codeberg.org/superseriousbusiness/gotosocial/pulls/4043
Reviewed-by: tobi <tsmethurst@noreply.codeberg.org>
Co-authored-by: kim <grufwub@gmail.com>
Co-committed-by: kim <grufwub@gmail.com>
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/config/config.go | 1 | ||||
| -rw-r--r-- | internal/config/defaults.go | 1 | ||||
| -rw-r--r-- | internal/config/helpers.gen.go | 25 | ||||
| -rw-r--r-- | internal/middleware/headerfilter.go | 15 | ||||
| -rw-r--r-- | internal/middleware/nollamas.go | 309 | ||||
| -rw-r--r-- | internal/middleware/nollamas_test.go | 178 | ||||
| -rw-r--r-- | internal/web/web.go | 6 |
7 files changed, 519 insertions, 16 deletions
diff --git a/internal/config/config.go b/internal/config/config.go index 07676627d..88083d491 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -175,6 +175,7 @@ type Configuration struct { AdvancedSenderMultiplier int `name:"advanced-sender-multiplier" usage:"Multiplier to use per cpu for batching outgoing fedi messages. 0 or less turns batching off (not recommended)."` AdvancedCSPExtraURIs []string `name:"advanced-csp-extra-uris" usage:"Additional URIs to allow when building content-security-policy for media + images."` AdvancedHeaderFilterMode string `name:"advanced-header-filter-mode" usage:"Set incoming request header filtering mode."` + AdvancedScraperDeterrence bool `name:"advanced-scraper-deterrence" usage:"Enable proof-of-work based scraper deterrence on profile / status pages"` // HTTPClient configuration vars. HTTPClient HTTPClientConfiguration `name:"http-client"` diff --git a/internal/config/defaults.go b/internal/config/defaults.go index f07aeb41b..4ef183aa6 100644 --- a/internal/config/defaults.go +++ b/internal/config/defaults.go @@ -142,6 +142,7 @@ var Defaults = Configuration{ AdvancedSenderMultiplier: 2, // 2 senders per CPU AdvancedCSPExtraURIs: []string{}, AdvancedHeaderFilterMode: RequestHeaderFilterModeDisabled, + AdvancedScraperDeterrence: false, Cache: CacheConfiguration{ // Rough memory target that the total diff --git a/internal/config/helpers.gen.go b/internal/config/helpers.gen.go index 56eb0e3e8..8fc4475b7 100644 --- a/internal/config/helpers.gen.go +++ b/internal/config/helpers.gen.go @@ -2906,6 +2906,31 @@ func GetAdvancedHeaderFilterMode() string { return global.GetAdvancedHeaderFilte // SetAdvancedHeaderFilterMode safely sets the value for global configuration 'AdvancedHeaderFilterMode' field func SetAdvancedHeaderFilterMode(v string) { global.SetAdvancedHeaderFilterMode(v) } +// GetAdvancedScraperDeterrence safely fetches the Configuration value for state's 'AdvancedScraperDeterrence' field +func (st 
*ConfigState) GetAdvancedScraperDeterrence() (v bool) { + st.mutex.RLock() + v = st.config.AdvancedScraperDeterrence + st.mutex.RUnlock() + return +} + +// SetAdvancedScraperDeterrence safely sets the Configuration value for state's 'AdvancedScraperDeterrence' field +func (st *ConfigState) SetAdvancedScraperDeterrence(v bool) { + st.mutex.Lock() + defer st.mutex.Unlock() + st.config.AdvancedScraperDeterrence = v + st.reloadToViper() +} + +// AdvancedScraperDeterrenceFlag returns the flag name for the 'AdvancedScraperDeterrence' field +func AdvancedScraperDeterrenceFlag() string { return "advanced-scraper-deterrence" } + +// GetAdvancedScraperDeterrence safely fetches the value for global configuration 'AdvancedScraperDeterrence' field +func GetAdvancedScraperDeterrence() bool { return global.GetAdvancedScraperDeterrence() } + +// SetAdvancedScraperDeterrence safely sets the value for global configuration 'AdvancedScraperDeterrence' field +func SetAdvancedScraperDeterrence(v bool) { global.SetAdvancedScraperDeterrence(v) } + // GetHTTPClientAllowIPs safely fetches the Configuration value for state's 'HTTPClient.AllowIPs' field func (st *ConfigState) GetHTTPClientAllowIPs() (v []string) { st.mutex.RLock() diff --git a/internal/middleware/headerfilter.go b/internal/middleware/headerfilter.go index 6fb9fc996..0fd2a8877 100644 --- a/internal/middleware/headerfilter.go +++ b/internal/middleware/headerfilter.go @@ -150,11 +150,6 @@ func isHeaderBlocked(state *state.State, c *gin.Context) (bool, error) { } if key != "" { - // if expr != "" { - // // TODO: replace expvar with build - // // taggable metrics types in State{}. - // } - // A header was matched against! // i.e. this request is blocked. return true, nil @@ -185,11 +180,6 @@ func isHeaderAllowed(state *state.State, c *gin.Context) (bool, error) { } if key != "" { - // if expr != "" { - // // TODO: replace expvar with build - // // taggable metrics types in State{}. - // } - // A header was matched against! // i.e. 
this request is allowed. return true, nil @@ -220,11 +210,6 @@ func isHeaderNotAllowed(state *state.State, c *gin.Context) (bool, error) { } if key != "" { - // if expr != "" { - // // TODO: replace expvar with build - // // taggable metrics types in State{}. - // } - // A header was matched against! // i.e. request is NOT allowed. return true, nil diff --git a/internal/middleware/nollamas.go b/internal/middleware/nollamas.go new file mode 100644 index 000000000..7f01c5afc --- /dev/null +++ b/internal/middleware/nollamas.go @@ -0,0 +1,309 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +package middleware + +import ( + "context" + "crypto/rand" + "crypto/sha256" + "crypto/subtle" + "encoding/hex" + "hash" + "io" + "net/http" + "time" + + apimodel "code.superseriousbusiness.org/gotosocial/internal/api/model" + apiutil "code.superseriousbusiness.org/gotosocial/internal/api/util" + "code.superseriousbusiness.org/gotosocial/internal/config" + "code.superseriousbusiness.org/gotosocial/internal/gtscontext" + "code.superseriousbusiness.org/gotosocial/internal/gtserror" + "code.superseriousbusiness.org/gotosocial/internal/log" + "code.superseriousbusiness.org/gotosocial/internal/oauth" + "codeberg.org/gruf/go-byteutil" + "github.com/gin-gonic/gin" +) + +// NoLLaMas returns a piece of HTTP middleware that provides a deterrence +// on routes it is applied to, against bots and scrapers. It generates a +// unique but deterministic challenge for each HTTP client within an hour +// TTL that requires a proof-of-work solution to pass onto the next handler. +// On successful solution, the client is provided a cookie that allows them +// to bypass this check within that hour TTL. The outcome of this is that it +// should make scraping of these endpoints economically unfeasible, when enabled, +// and with an absurdly minimal performance impact. The downside is that it +// requires javascript to be enabled on the client to pass the middleware check. +// +// Heavily inspired by: https://github.com/TecharoHQ/anubis +func NoLLaMas(getInstanceV1 func(context.Context) (*apimodel.InstanceV1, gtserror.WithCode)) gin.HandlerFunc { + + if !config.GetAdvancedScraperDeterrence() { + // NoLLaMas middleware disabled. + return func(*gin.Context) {} + } + + seed := make([]byte, 32) + + // Read random data for the token seed. + _, err := io.ReadFull(rand.Reader, seed) + if err != nil { + panic(err) + } + + // Configure nollamas. 
+ var nollamas nollamas + nollamas.seed = seed + nollamas.ttl = time.Hour + nollamas.diff = 4 + nollamas.getInstanceV1 = getInstanceV1 + return nollamas.Serve +} + +// hashWithBufs encompasses a hash along +// with the necessary buffers to generate +// a hashsum and then encode that sum. +type hashWithBufs struct { + hash hash.Hash + hbuf []byte + ebuf []byte +} + +type nollamas struct { + seed []byte // unique token seed + ttl time.Duration + diff uint8 + + // extra fields required for + // our template rendering. + getInstanceV1 func(ctx context.Context) (*apimodel.InstanceV1, gtserror.WithCode) +} + +func (m *nollamas) Serve(c *gin.Context) { + if c.Request.Method != http.MethodGet { + // Only interested in protecting + // crawlable 'GET' endpoints. + c.Next() + return + } + + // Extract request context. + ctx := c.Request.Context() + + if ctx.Value(oauth.SessionAuthorizedToken) != nil { + // Don't guard against requests + // providing valid OAuth tokens. + c.Next() + return + } + + if gtscontext.HTTPSignature(ctx) != "" { + // Don't guard against requests + // providing HTTP signatures. + c.Next() + return + } + + // i.e. outputted hash slice length. + const hashLen = sha256.Size + + // i.e. hex.EncodedLen(hashLen). + const encodedHashLen = 2 * hashLen + + // Prepare hash + buffers. + hash := hashWithBufs{ + hash: sha256.New(), + hbuf: make([]byte, 0, hashLen), + ebuf: make([]byte, encodedHashLen), + } + + // Extract client fingerprint data. + userAgent := c.GetHeader("User-Agent") + clientIP := c.ClientIP() + + // Generate a unique token for this request, + // only valid for a period of now +- m.ttl. + token := m.token(&hash, userAgent, clientIP) + + // For unique challenge string just use a + // single portion of their 'success' token. + // SHA256 is not yet cracked, this is not an + // application of a hash requiring serious + // cryptographic security and it rotates on + // a TTL basis, so it should be fine. 
+ challenge := token[:len(token)/4] + + // Check for a provided success token. + cookie, _ := c.Cookie("gts-nollamas") + + // Check whether passed cookie + // is the expected success token. + if subtle.ConstantTimeCompare( + byteutil.S2B(token), + byteutil.S2B(cookie), + ) == 1 { + + // They passed us a valid, expected + // token. They already passed checks. + c.Next() + return + } + + // Prepare new log entry. + l := log.WithContext(ctx). + WithField("userAgent", userAgent). + WithField("challenge", challenge) + + // Extract and parse query. + query := c.Request.URL.Query() + + // Check query to see if an in-progress + // challenge solution has been provided. + nonce := query.Get("nollamas_solution") + if nonce == "" || len(nonce) > 20 { + + // noting that here, 20 is + // max integer string len. + // + // An invalid solution string, just + // present them with new challenge. + l.Info("posing new challenge") + m.renderChallenge(c, challenge) + return + } + + // Reset the hash. + hash.hash.Reset() + + // Check challenge+nonce as possible solution. + if !m.checkChallenge(&hash, challenge, nonce) { + + // They failed challenge, + // re-present challenge page. + l.Info("invalid solution provided") + m.renderChallenge(c, challenge) + return + } + + l.Infof("challenge passed: %s", nonce) + + // Don't pass to further + // handlers, we'll redirect. + c.Abort() + + // Drop solution query and encode. + query.Del("nollamas_solution") + c.Request.URL.RawQuery = query.Encode() + + // They passed the challenge! Set success token + // cookie and allow them to continue to next handlers. + c.SetCookie("gts-nollamas", token, int(m.ttl/time.Second), "", "", false, false) + c.Redirect(http.StatusTemporaryRedirect, c.Request.URL.RequestURI()) +} + +func (m *nollamas) renderChallenge(c *gin.Context, challenge string) { + // Don't pass to further + // handlers, they only get + // our challenge page. + c.Abort() + + // Fetch current instance information for templating vars. 
+ instance, errWithCode := m.getInstanceV1(c.Request.Context()) + if errWithCode != nil { + apiutil.ErrorHandler(c, errWithCode, m.getInstanceV1) + return + } + + // Write templated challenge response to client. + apiutil.TemplateWebPage(c, apiutil.WebPage{ + Template: "nollamas.tmpl", + Instance: instance, + Stylesheets: []string{ + "/assets/dist/nollamas.css", + // Include fork-awesome stylesheet + // to get nice loading spinner. + "/assets/Fork-Awesome/css/fork-awesome.min.css", + }, + Extra: map[string]any{ + "challenge": challenge, + "difficulty": m.diff, + }, + Javascript: []apiutil.JavascriptEntry{ + { + Src: "/assets/dist/nollamas.js", + Defer: true, + }, + }, + }) +} + +func (m *nollamas) token(hash *hashWithBufs, userAgent, clientIP string) string { + // Use our unique seed to seed hash, + // to ensure we have cryptographically + // unique, yet deterministic, tokens + // generated for a given http client. + hash.hash.Write(m.seed) + + // Include difficulty level in + // hash input data so if config + // changes then token invalidates. + hash.hash.Write([]byte{m.diff}) + + // Also seed the generated input with + // current time rounded to TTL, so our + // single comparison handles expiries. + now := time.Now().Round(m.ttl).Unix() + hash.hash.Write([]byte{ + byte(now >> 56), + byte(now >> 48), + byte(now >> 40), + byte(now >> 32), + byte(now >> 24), + byte(now >> 16), + byte(now >> 8), + byte(now), + }) + + // Finally, append unique client request data. + hash.hash.Write(byteutil.S2B(userAgent)) + hash.hash.Write(byteutil.S2B(clientIP)) + + // Return hex encoded hash output. + hash.hbuf = hash.hash.Sum(hash.hbuf[:0]) + hex.Encode(hash.ebuf, hash.hbuf) + return string(hash.ebuf) +} + +func (m *nollamas) checkChallenge(hash *hashWithBufs, challenge, nonce string) bool { + // Hash and encode input challenge with + // proposed nonce as a possible solution. 
+ hash.hash.Write(byteutil.S2B(challenge)) + hash.hash.Write(byteutil.S2B(nonce)) + hash.hbuf = hash.hash.Sum(hash.hbuf[:0]) + hex.Encode(hash.ebuf, hash.hbuf) + solution := hash.ebuf + + // Check that the first 'diff' + // many chars are indeed zeroes. + for i := range m.diff { + if solution[i] != '0' { + return false + } + } + + return true +} diff --git a/internal/middleware/nollamas_test.go b/internal/middleware/nollamas_test.go new file mode 100644 index 000000000..92a044d32 --- /dev/null +++ b/internal/middleware/nollamas_test.go @@ -0,0 +1,178 @@ +// GoToSocial +// Copyright (C) GoToSocial Authors admin@gotosocial.org +// SPDX-License-Identifier: AGPL-3.0-or-later +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. +// +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +package middleware_test + +import ( + "context" + "crypto/sha256" + "encoding/hex" + "io" + "net/http" + "net/http/httptest" + "slices" + "strconv" + "strings" + "testing" + + "code.superseriousbusiness.org/gotosocial/internal/api/model" + "code.superseriousbusiness.org/gotosocial/internal/config" + "code.superseriousbusiness.org/gotosocial/internal/gtserror" + "code.superseriousbusiness.org/gotosocial/internal/middleware" + "code.superseriousbusiness.org/gotosocial/internal/router" + "codeberg.org/gruf/go-byteutil" + "github.com/gin-gonic/gin" + "github.com/stretchr/testify/assert" +) + +func TestNoLLaMasMiddleware(t *testing.T) { + // Gin test engine. + e := gin.New() + + // Setup necessary configuration variables. + config.SetAdvancedScraperDeterrence(true) + config.SetWebTemplateBaseDir("../../web/template") + + // Load templates into engine. + err := router.LoadTemplates(e) + assert.NoError(t, err) + + // Add middleware to the gin engine handler stack. + middleware := middleware.NoLLaMas(getInstanceV1) + e.Use(middleware) + + // Set test handler we can + // easily check if was used. + e.Handle("GET", "/", testHandler) + + // Test with differing user-agents. + for _, userAgent := range []string{ + "CURL", + "Mozilla FireSox", + "Google Gnome", + } { + testNoLLaMasMiddleware(t, e, userAgent) + } +} + +func testNoLLaMasMiddleware(t *testing.T, e *gin.Engine, userAgent string) { + // Prepare a test request for gin engine. + r := httptest.NewRequest("GET", "/", nil) + r.Header.Set("User-Agent", userAgent) + rw := httptest.NewRecorder() + + // Pass req through + // engine handler. + e.ServeHTTP(rw, r) + + // Get http result. + res := rw.Result() + + // It should have been stopped + // by middleware and NOT used + // the expected test handler. + ok := usedTestHandler(res) + assert.False(t, ok) + + // Read entire response body. 
+ b, err := io.ReadAll(res.Body) + if err != nil { + panic(err) + } + + var difficulty uint64 + var challenge string + + // Parse output body and find the challenge / difficulty. + for _, line := range strings.Split(string(b), "\n") { + line = strings.TrimSpace(line) + switch { + case strings.HasPrefix(line, "data-nollamas-challenge=\""): + line = line[25:] + line = line[:len(line)-1] + challenge = line + case strings.HasPrefix(line, "data-nollamas-difficulty=\""): + line = line[26:] + line = line[:len(line)-1] + var err error + difficulty, err = strconv.ParseUint(line, 10, 8) + assert.NoError(t, err) + } + } + + // Ensure valid posed challenge. + assert.NotZero(t, difficulty) + assert.NotEmpty(t, challenge) + + // Prepare a test request for gin engine. + r = httptest.NewRequest("GET", "/", nil) + r.Header.Set("User-Agent", userAgent) + rw = httptest.NewRecorder() + + // Now compute and set solution query paramater. + solution := computeSolution(challenge, difficulty) + r.URL.RawQuery = "nollamas_solution=" + solution + + // Pass req through + // engine handler. + e.ServeHTTP(rw, r) + + // Get http result. + res = rw.Result() + + // Should have received redirect. + uri, err := res.Location() + assert.NoError(t, err) + assert.Equal(t, uri.String(), "/") + + // Ensure our expected solution cookie (to bypass challenge) was set. + ok = slices.ContainsFunc(res.Cookies(), func(c *http.Cookie) bool { + return c.Name == "gts-nollamas" + }) + assert.True(t, ok) +} + +// computeSolution does the functional equivalent of our nollamas workerTask.js. +func computeSolution(challenge string, difficulty uint64) string { +outer: + for i := 0; ; i++ { + solution := strconv.Itoa(i) + combined := challenge + solution + hash := sha256.Sum256(byteutil.S2B(combined)) + encoded := hex.EncodeToString(hash[:]) + for i := range difficulty { + if encoded[i] != '0' { + continue outer + } + } + return solution + } +} + +// usedTestHandler returns whether testHandler() was used. 
+func usedTestHandler(res *http.Response) bool { + return res.Header.Get("test-handler") == "ok" +} + +func testHandler(c *gin.Context) { + c.Writer.Header().Set("test-handler", "ok") + c.Writer.WriteHeader(http.StatusOK) +} + +func getInstanceV1(context.Context) (*model.InstanceV1, gtserror.WithCode) { + return &model.InstanceV1{}, nil +} diff --git a/internal/web/web.go b/internal/web/web.go index ab440ab2f..e42dc16c3 100644 --- a/internal/web/web.go +++ b/internal/web/web.go @@ -99,12 +99,16 @@ func (m *Module) Route(r *router.Router, mi ...gin.HandlerFunc) { // Handlers that serve profiles and statuses should use // the SignatureCheck middleware, so that requests with - // content-type application/activity+json can be served + // content-type application/activity+json can be served, + // and (if enabled) the nollamas middleware, to protect + // against scraping by shitty LLM bullshit. profileGroup := r.AttachGroup(profileGroupPath) profileGroup.Use(mi...) profileGroup.Use(middleware.SignatureCheck(m.isURIBlocked), middleware.CacheControl(middleware.CacheControlConfig{ Directives: []string{"no-store"}, })) + nollamas := middleware.NoLLaMas(m.processor.InstanceGetV1) + profileGroup.Use(nollamas) profileGroup.Handle(http.MethodGet, "", m.profileGETHandler) // use empty path here since it's the base of the group profileGroup.Handle(http.MethodGet, statusPath, m.threadGETHandler) |
