diff options
| author | 2024-09-16 14:00:23 +0200 | |
|---|---|---|
| committer | 2024-09-16 14:00:23 +0200 | |
| commit | efd1a4f717afa83d3d3609f0d70e4da151a8dc9b (patch) | |
| tree | 246ae4c12f86f8866e5299ae39ba5c1feba0bce4 /internal | |
| parent | [bugfix/chore] Always set the status sensitive if media + content-warning pre... (diff) | |
| download | gotosocial-efd1a4f717afa83d3d3609f0d70e4da151a8dc9b.tar.xz | |
[bugfix] Use better plaintext representation of status for filtering (#3301)
* [bugfix] Use better plaintext representation of status for filtering
* add new deps to readme
* lint
* update tests
* update regexes
* address review comments
* remove now unused xxhash
* whoops, wrong logger
* Merge branch 'main' into status_filtering_bugfix
* put cache in caches struct
* pain
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/cache/cache.go | 20 | ||||
| -rw-r--r-- | internal/gtsmodel/filter.go | 19 | ||||
| -rw-r--r-- | internal/typeutils/internaltofrontend.go | 81 | ||||
| -rw-r--r-- | internal/typeutils/internaltofrontend_test.go | 24 | ||||
| -rw-r--r-- | internal/typeutils/util.go | 62 | ||||
| -rw-r--r-- | internal/typeutils/util_test.go | 60 | 
6 files changed, 205 insertions, 61 deletions
diff --git a/internal/cache/cache.go b/internal/cache/cache.go index 5554445b2..8291dec5a 100644 --- a/internal/cache/cache.go +++ b/internal/cache/cache.go @@ -47,6 +47,11 @@ type Caches struct {  	// Webfinger provides access to the webfinger URL cache.  	Webfinger *ttl.Cache[string, string] // TTL=24hr, sweep=5min +	// TTL cache of statuses -> filterable text fields. +	// To ensure up-to-date fields, cache is keyed as: +	// `[status.ID][status.UpdatedAt.Unix()]` +	StatusesFilterableFields *ttl.Cache[string, []string] +  	// prevent pass-by-value.  	_ nocopy  } @@ -109,6 +114,7 @@ func (c *Caches) Init() {  	c.initUserMuteIDs()  	c.initWebfinger()  	c.initVisibility() +	c.initStatusesFilterableFields()  }  // Start will start any caches that require a background @@ -119,6 +125,10 @@ func (c *Caches) Start() {  	tryUntil("starting webfinger cache", 5, func() bool {  		return c.Webfinger.Start(5 * time.Minute)  	}) + +	tryUntil("starting statusesFilterableFields cache", 5, func() bool { +		return c.StatusesFilterableFields.Start(5 * time.Minute) +	})  }  // Stop will stop any caches that require a background @@ -127,6 +137,7 @@ func (c *Caches) Stop() {  	log.Infof(nil, "stop: %p", c)  	tryUntil("stopping webfinger cache", 5, c.Webfinger.Stop) +	tryUntil("stopping statusesFilterableFields cache", 5, c.StatusesFilterableFields.Stop)  }  // Sweep will sweep all the available caches to ensure none @@ -204,3 +215,12 @@ func (c *Caches) initWebfinger() {  		24*time.Hour,  	)  } + +func (c *Caches) initStatusesFilterableFields() { +	c.StatusesFilterableFields = new(ttl.Cache[string, []string]) +	c.StatusesFilterableFields.Init( +		0, +		512, +		1*time.Hour, +	) +} diff --git a/internal/gtsmodel/filter.go b/internal/gtsmodel/filter.go index e670e6fc0..95047a44f 100644 --- a/internal/gtsmodel/filter.go +++ b/internal/gtsmodel/filter.go @@ -20,6 +20,8 @@ package gtsmodel  import (  	"regexp"  	"time" + +	"github.com/superseriousbusiness/gotosocial/internal/util"  )  // 
Filter stores a filter created by a local account. @@ -61,14 +63,23 @@ type FilterKeyword struct {  // Compile will compile this FilterKeyword as a prepared regular expression.  func (k *FilterKeyword) Compile() (err error) { -	var wordBreak string -	if k.WholeWord != nil && *k.WholeWord { -		wordBreak = `\b` +	var ( +		wordBreakStart string +		wordBreakEnd   string +	) + +	if util.PtrOrZero(k.WholeWord) { +		// Either word boundary or +		// whitespace or start of line. +		wordBreakStart = `(?:\b|\s|^)` +		// Either word boundary or +		// whitespace or end of line. +		wordBreakEnd = `(?:\b|\s|$)`  	}  	// Compile keyword filter regexp.  	quoted := regexp.QuoteMeta(k.Keyword) -	k.Regexp, err = regexp.Compile(`(?i)` + wordBreak + quoted + wordBreak) +	k.Regexp, err = regexp.Compile(`(?i)` + wordBreakStart + quoted + wordBreakEnd)  	return // caller is expected to wrap this error  } diff --git a/internal/typeutils/internaltofrontend.go b/internal/typeutils/internaltofrontend.go index 55af2c1f1..fe49766fa 100644 --- a/internal/typeutils/internaltofrontend.go +++ b/internal/typeutils/internaltofrontend.go @@ -21,6 +21,8 @@ import (  	"context"  	"errors"  	"fmt" +	"slices" +	"strconv"  	"strings"  	"time" @@ -35,7 +37,6 @@ import (  	"github.com/superseriousbusiness/gotosocial/internal/language"  	"github.com/superseriousbusiness/gotosocial/internal/log"  	"github.com/superseriousbusiness/gotosocial/internal/media" -	"github.com/superseriousbusiness/gotosocial/internal/text"  	"github.com/superseriousbusiness/gotosocial/internal/uris"  	"github.com/superseriousbusiness/gotosocial/internal/util"  ) @@ -939,32 +940,48 @@ func (c *Converter) statusToAPIFilterResults(  		return nil, nil  	} -	// Extract text fields from the status that we will match filters against. -	fields := filterableTextFields(s) +	// Key this status based on ID + last updated time, +	// to ensure we always filter on latest version. 
+	statusKey := s.ID + strconv.FormatInt(s.UpdatedAt.Unix(), 10) + +	// Check if we have filterable fields cached for this status. +	cache := c.state.Caches.StatusesFilterableFields +	fields, stored := cache.Get(statusKey) +	if !stored { +		// We don't have filterable fields +		// cached, calculate + cache now. +		fields = filterableFields(s) +		cache.Set(statusKey, fields) +	}  	// Record all matching warn filters and the reasons they matched.  	filterResults := make([]apimodel.FilterResult, 0, len(filters))  	for _, filter := range filters {  		if !filterAppliesInContext(filter, filterContext) { -			// Filter doesn't apply to this context. +			// Filter doesn't apply +			// to this context.  			continue  		} +  		if filter.Expired(now) { +			// Filter doesn't +			// apply anymore.  			continue  		} -		// List all matching keywords. +		// Assemble matching keywords (if any) from this filter.  		keywordMatches := make([]string, 0, len(filter.Keywords)) -		for _, filterKeyword := range filter.Keywords { -			var isMatch bool -			for _, field := range fields { -				if filterKeyword.Regexp.MatchString(field) { -					isMatch = true -					break -				} -			} -			if isMatch { -				keywordMatches = append(keywordMatches, filterKeyword.Keyword) +		for _, keyword := range filter.Keywords { +			// Check if at least one filterable field +			// in the status matches on this filter. +			if slices.ContainsFunc( +				fields, +				func(field string) bool { +					return keyword.Regexp.MatchString(field) +				}, +			) { +				// At least one field matched on this filter. 
+				keywordMatches = append(keywordMatches, keyword.Keyword)  			}  		} @@ -1001,40 +1018,6 @@ func (c *Converter) statusToAPIFilterResults(  	return filterResults, nil  } -// filterableTextFields returns all text from a status that we might want to filter on: -// - content -// - content warning -// - media descriptions -// - poll options -func filterableTextFields(s *gtsmodel.Status) []string { -	fieldCount := 2 + len(s.Attachments) -	if s.Poll != nil { -		fieldCount += len(s.Poll.Options) -	} -	fields := make([]string, 0, fieldCount) - -	if s.Content != "" { -		fields = append(fields, text.SanitizeToPlaintext(s.Content)) -	} -	if s.ContentWarning != "" { -		fields = append(fields, s.ContentWarning) -	} -	for _, attachment := range s.Attachments { -		if attachment.Description != "" { -			fields = append(fields, attachment.Description) -		} -	} -	if s.Poll != nil { -		for _, option := range s.Poll.Options { -			if option != "" { -				fields = append(fields, option) -			} -		} -	} - -	return fields -} -  // filterAppliesInContext returns whether a given filter applies in a given context.  func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool {  	switch filterContext { diff --git a/internal/typeutils/internaltofrontend_test.go b/internal/typeutils/internaltofrontend_test.go index 651ff867d..a44afe67e 100644 --- a/internal/typeutils/internaltofrontend_test.go +++ b/internal/typeutils/internaltofrontend_test.go @@ -1063,15 +1063,21 @@ func (suite *InternalToFrontendTestSuite) TestHideFilteredBoostToFrontend() {  // Test that a hashtag filter for a hashtag in Mastodon HTML content works the way most users would expect.  
func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wholeWord bool, boost bool) { -	testStatus := suite.testStatuses["admin_account_status_1"] +	testStatus := new(gtsmodel.Status) +	*testStatus = *suite.testStatuses["admin_account_status_1"]  	testStatus.Content = `<p>doggo doggin' it</p><p><a href="https://example.test/tags/dogsofmastodon" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>dogsofmastodon</span></a></p>`  	if boost { -		// Modify a fixture boost into a boost of the above status. -		boostStatus := suite.testStatuses["admin_account_status_4"] -		boostStatus.BoostOf = testStatus -		boostStatus.BoostOfID = testStatus.ID -		testStatus = boostStatus +		boost, err := suite.typeconverter.StatusToBoost( +			context.Background(), +			testStatus, +			suite.testAccounts["admin_account"], +			"", +		) +		if err != nil { +			suite.FailNow(err.Error()) +		} +		testStatus = boost  	}  	requestingAccount := suite.testAccounts["local_account_1"] @@ -1103,9 +1109,11 @@ func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wh  		[]*gtsmodel.Filter{filter},  		nil,  	) -	if suite.NoError(err) { -		suite.NotEmpty(apiStatus.Filtered) +	if err != nil { +		suite.FailNow(err.Error())  	} + +	suite.NotEmpty(apiStatus.Filtered)  }  func (suite *InternalToFrontendTestSuite) TestHashtagWholeWordFilteredStatusToFrontend() { diff --git a/internal/typeutils/util.go b/internal/typeutils/util.go index 3441e89a9..3a867ba35 100644 --- a/internal/typeutils/util.go +++ b/internal/typeutils/util.go @@ -27,6 +27,7 @@ import (  	"strconv"  	"strings" +	"github.com/k3a/html2text"  	apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model"  	"github.com/superseriousbusiness/gotosocial/internal/config"  	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel" @@ -284,3 +285,64 @@ func ContentToContentLanguage(  	return contentStr, langTagStr  } + +// filterableFields returns text fields 
from +// a status that we might want to filter on: +// +//   - content warning +//   - content (converted to plaintext from HTML) +//   - media descriptions +//   - poll options +// +// Each field should be filtered separately. +// This avoids scenarios where false-positive +// multiple-word matches can be made by matching +// the last word of one field + the first word +// of the next field together. +func filterableFields(s *gtsmodel.Status) []string { +	// Estimate length of fields. +	fieldCount := 2 + len(s.Attachments) +	if s.Poll != nil { +		fieldCount += len(s.Poll.Options) +	} +	fields := make([]string, 0, fieldCount) + +	// Content warning / title. +	if s.ContentWarning != "" { +		fields = append(fields, s.ContentWarning) +	} + +	// Status content. Though we have raw text +	// available for statuses created on our +	// instance, use the html2text version to +	// remove markdown-formatting characters +	// and ensure more consistent filtering. +	if s.Content != "" { +		text := html2text.HTML2TextWithOptions( +			s.Content, +			html2text.WithLinksInnerText(), +			html2text.WithUnixLineBreaks(), +		) +		if text != "" { +			fields = append(fields, text) +		} +	} + +	// Media descriptions. +	for _, attachment := range s.Attachments { +		if attachment.Description != "" { +			fields = append(fields, attachment.Description) +		} +	} + +	// Poll options. 
+	if s.Poll != nil { +		for _, opt := range s.Poll.Options { +			if opt != "" { +				fields = append(fields, opt) +			} +		} +	} + +	return fields +} diff --git a/internal/typeutils/util_test.go b/internal/typeutils/util_test.go index 0f852d399..ea6667519 100644 --- a/internal/typeutils/util_test.go +++ b/internal/typeutils/util_test.go @@ -21,6 +21,7 @@ import (  	"context"  	"testing" +	"github.com/stretchr/testify/assert"  	"github.com/superseriousbusiness/gotosocial/internal/config"  	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"  	"github.com/superseriousbusiness/gotosocial/internal/language" @@ -158,3 +159,62 @@ func TestContentToContentLanguage(t *testing.T) {  		}  	}  } + +func TestFilterableText(t *testing.T) { +	type testcase struct { +		status         *gtsmodel.Status +		expectedFields []string +	} + +	for _, testcase := range []testcase{ +		{ +			status: &gtsmodel.Status{ +				ContentWarning: "This is a test status", +				Content:        `<p>Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> instance.</p>`, +			}, +			expectedFields: []string{ +				"This is a test status", +				"Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> instance.", +			}, +		}, +		{ +			status: &gtsmodel.Status{ +				Content: `<p><span class="h-card"><a href="https://example.org/@zlatko" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>zlatko</span></a></span> currently we used modernc/sqlite3 for our sqlite driver, but we've been 
experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)</p><p><a href="https://github.com/superseriousbusiness/gotosocial/pull/2863" rel="nofollow noreferrer noopener" target="_blank">https://github.com/superseriousbusiness/gotosocial/pull/2863</a></p>`, +			}, +			expectedFields: []string{ +				"@zlatko <https://example.org/@zlatko> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)\n\nhttps://github.com/superseriousbusiness/gotosocial/pull/2863 <https://github.com/superseriousbusiness/gotosocial/pull/2863>", +			}, +		}, +		{ +			status: &gtsmodel.Status{ +				ContentWarning: "Nerd stuff", +				Content:        `<p>Latest graphs for <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> on <a href="https://github.com/ncruces/go-sqlite3" rel="nofollow noreferrer noopener" target="_blank">Wasm sqlite3</a> with <a href="https://codeberg.org/gruf/go-ffmpreg" rel="nofollow noreferrer noopener" target="_blank">embedded Wasm ffmpeg</a>, both running on <a href="https://wazero.io/" rel="nofollow noreferrer noopener" target="_blank">Wazero</a>, and configured with a <a href="https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266" rel="nofollow noreferrer noopener" target="_blank">50MiB db cache target</a>. 
This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.</p>`, +				Attachments: []*gtsmodel.MediaAttachment{ +					{ +						Description: `Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`, +					}, +					{ +						Description: `Another media attachment`, +					}, +				}, +				Poll: &gtsmodel.Poll{ +					Options: []string{ +						"Poll option 1", +						"Poll option 2", +					}, +				}, +			}, +			expectedFields: []string{ +				"Nerd stuff", +				"Latest graphs for #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> on Wasm sqlite3 <https://github.com/ncruces/go-sqlite3> with embedded Wasm ffmpeg <https://codeberg.org/gruf/go-ffmpreg>, both running on Wazero <https://wazero.io/>, and configured with a 50MiB db cache target <https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.", +				"Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.", +				"Another media attachment", +				"Poll option 1", +				"Poll option 2", +			}, +		}, +	} { +		fields := filterableFields(testcase.status) +		assert.Equal(t, testcase.expectedFields, fields) +	} +}  | 
