diff options
author | 2024-09-16 14:00:23 +0200 | |
---|---|---|
committer | 2024-09-16 14:00:23 +0200 | |
commit | efd1a4f717afa83d3d3609f0d70e4da151a8dc9b (patch) | |
tree | 246ae4c12f86f8866e5299ae39ba5c1feba0bce4 /internal/typeutils | |
parent | [bugfix/chore] Always set the status sensitive if media + content-warning pre... (diff) | |
download | gotosocial-efd1a4f717afa83d3d3609f0d70e4da151a8dc9b.tar.xz |
[bugfix] Use better plaintext representation of status for filtering (#3301)
* [bugfix] Use better plaintext representation of status for filtering
* add new deps to readme
* lint
* update tests
* update regexes
* address review comments
* remove now unused xxhash
* whoops, wrong logger
* Merge branch 'main' into status_filtering_bugfix
* put cache in caches struct
* pain
Diffstat (limited to 'internal/typeutils')
-rw-r--r-- | internal/typeutils/internaltofrontend.go | 81 | ||||
-rw-r--r-- | internal/typeutils/internaltofrontend_test.go | 24 | ||||
-rw-r--r-- | internal/typeutils/util.go | 62 | ||||
-rw-r--r-- | internal/typeutils/util_test.go | 60 |
4 files changed, 170 insertions, 57 deletions
diff --git a/internal/typeutils/internaltofrontend.go b/internal/typeutils/internaltofrontend.go index 55af2c1f1..fe49766fa 100644 --- a/internal/typeutils/internaltofrontend.go +++ b/internal/typeutils/internaltofrontend.go @@ -21,6 +21,8 @@ import ( "context" "errors" "fmt" + "slices" + "strconv" "strings" "time" @@ -35,7 +37,6 @@ import ( "github.com/superseriousbusiness/gotosocial/internal/language" "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/media" - "github.com/superseriousbusiness/gotosocial/internal/text" "github.com/superseriousbusiness/gotosocial/internal/uris" "github.com/superseriousbusiness/gotosocial/internal/util" ) @@ -939,32 +940,48 @@ func (c *Converter) statusToAPIFilterResults( return nil, nil } - // Extract text fields from the status that we will match filters against. - fields := filterableTextFields(s) + // Key this status based on ID + last updated time, + // to ensure we always filter on latest version. + statusKey := s.ID + strconv.FormatInt(s.UpdatedAt.Unix(), 10) + + // Check if we have filterable fields cached for this status. + cache := c.state.Caches.StatusesFilterableFields + fields, stored := cache.Get(statusKey) + if !stored { + // We don't have filterable fields + // cached, calculate + cache now. + fields = filterableFields(s) + cache.Set(statusKey, fields) + } // Record all matching warn filters and the reasons they matched. filterResults := make([]apimodel.FilterResult, 0, len(filters)) for _, filter := range filters { if !filterAppliesInContext(filter, filterContext) { - // Filter doesn't apply to this context. + // Filter doesn't apply + // to this context. continue } + if filter.Expired(now) { + // Filter doesn't + // apply anymore. continue } - // List all matching keywords. + // Assemble matching keywords (if any) from this filter. 
keywordMatches := make([]string, 0, len(filter.Keywords)) - for _, filterKeyword := range filter.Keywords { - var isMatch bool - for _, field := range fields { - if filterKeyword.Regexp.MatchString(field) { - isMatch = true - break - } - } - if isMatch { - keywordMatches = append(keywordMatches, filterKeyword.Keyword) + for _, keyword := range filter.Keywords { + // Check if at least one filterable field + // in the status matches on this filter. + if slices.ContainsFunc( + fields, + func(field string) bool { + return keyword.Regexp.MatchString(field) + }, + ) { + // At least one field matched on this filter. + keywordMatches = append(keywordMatches, keyword.Keyword) } } @@ -1001,40 +1018,6 @@ func (c *Converter) statusToAPIFilterResults( return filterResults, nil } -// filterableTextFields returns all text from a status that we might want to filter on: -// - content -// - content warning -// - media descriptions -// - poll options -func filterableTextFields(s *gtsmodel.Status) []string { - fieldCount := 2 + len(s.Attachments) - if s.Poll != nil { - fieldCount += len(s.Poll.Options) - } - fields := make([]string, 0, fieldCount) - - if s.Content != "" { - fields = append(fields, text.SanitizeToPlaintext(s.Content)) - } - if s.ContentWarning != "" { - fields = append(fields, s.ContentWarning) - } - for _, attachment := range s.Attachments { - if attachment.Description != "" { - fields = append(fields, attachment.Description) - } - } - if s.Poll != nil { - for _, option := range s.Poll.Options { - if option != "" { - fields = append(fields, option) - } - } - } - - return fields -} - // filterAppliesInContext returns whether a given filter applies in a given context. 
func filterAppliesInContext(filter *gtsmodel.Filter, filterContext statusfilter.FilterContext) bool { switch filterContext { diff --git a/internal/typeutils/internaltofrontend_test.go b/internal/typeutils/internaltofrontend_test.go index 651ff867d..a44afe67e 100644 --- a/internal/typeutils/internaltofrontend_test.go +++ b/internal/typeutils/internaltofrontend_test.go @@ -1063,15 +1063,21 @@ func (suite *InternalToFrontendTestSuite) TestHideFilteredBoostToFrontend() { // Test that a hashtag filter for a hashtag in Mastodon HTML content works the way most users would expect. func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wholeWord bool, boost bool) { - testStatus := suite.testStatuses["admin_account_status_1"] + testStatus := new(gtsmodel.Status) + *testStatus = *suite.testStatuses["admin_account_status_1"] testStatus.Content = `<p>doggo doggin' it</p><p><a href="https://example.test/tags/dogsofmastodon" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>dogsofmastodon</span></a></p>` if boost { - // Modify a fixture boost into a boost of the above status. 
- boostStatus := suite.testStatuses["admin_account_status_4"] - boostStatus.BoostOf = testStatus - boostStatus.BoostOfID = testStatus.ID - testStatus = boostStatus + boost, err := suite.typeconverter.StatusToBoost( + context.Background(), + testStatus, + suite.testAccounts["admin_account"], + "", + ) + if err != nil { + suite.FailNow(err.Error()) + } + testStatus = boost } requestingAccount := suite.testAccounts["local_account_1"] @@ -1103,9 +1109,11 @@ func (suite *InternalToFrontendTestSuite) testHashtagFilteredStatusToFrontend(wh []*gtsmodel.Filter{filter}, nil, ) - if suite.NoError(err) { - suite.NotEmpty(apiStatus.Filtered) + if err != nil { + suite.FailNow(err.Error()) } + + suite.NotEmpty(apiStatus.Filtered) } func (suite *InternalToFrontendTestSuite) TestHashtagWholeWordFilteredStatusToFrontend() { diff --git a/internal/typeutils/util.go b/internal/typeutils/util.go index 3441e89a9..3a867ba35 100644 --- a/internal/typeutils/util.go +++ b/internal/typeutils/util.go @@ -27,6 +27,7 @@ import ( "strconv" "strings" + "github.com/k3a/html2text" apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model" "github.com/superseriousbusiness/gotosocial/internal/config" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" @@ -284,3 +285,64 @@ func ContentToContentLanguage( return contentStr, langTagStr } + +// filterableFields returns text fields from +// a status that we might want to filter on: +// +// - content warning +// - content (converted to plaintext from HTML) +// - media descriptions +// - poll options +// +// Each field should be filtered separately. +// This avoids scenarios where false-positive +// multiple-word matches can be made by matching +// the last word of one field + the first word +// of the next field together. +func filterableFields(s *gtsmodel.Status) []string { + // Estimate length of fields. 
+ fieldCount := 2 + len(s.Attachments) + if s.Poll != nil { + fieldCount += len(s.Poll.Options) + } + fields := make([]string, 0, fieldCount) + + // Content warning / title. + if s.ContentWarning != "" { + fields = append(fields, s.ContentWarning) + } + + // Status content. Though we have raw text + // available for statuses created on our + // instance, use the html2text version to + // remove markdown-formatting characters + // and ensure more consistent filtering. + if s.Content != "" { + text := html2text.HTML2TextWithOptions( + s.Content, + html2text.WithLinksInnerText(), + html2text.WithUnixLineBreaks(), + ) + if text != "" { + fields = append(fields, text) + } + } + + // Media descriptions. + for _, attachment := range s.Attachments { + if attachment.Description != "" { + fields = append(fields, attachment.Description) + } + } + + // Poll options. + if s.Poll != nil { + for _, opt := range s.Poll.Options { + if opt != "" { + fields = append(fields, opt) + } + } + } + + return fields +} diff --git a/internal/typeutils/util_test.go b/internal/typeutils/util_test.go index 0f852d399..ea6667519 100644 --- a/internal/typeutils/util_test.go +++ b/internal/typeutils/util_test.go @@ -21,6 +21,7 @@ import ( "context" "testing" + "github.com/stretchr/testify/assert" "github.com/superseriousbusiness/gotosocial/internal/config" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" "github.com/superseriousbusiness/gotosocial/internal/language" @@ -158,3 +159,62 @@ func TestContentToContentLanguage(t *testing.T) { } } } + +func TestFilterableText(t *testing.T) { + type testcase struct { + status *gtsmodel.Status + expectedFields []string + } + + for _, testcase := range []testcase{ + { + status: &gtsmodel.Status{ + ContentWarning: "This is a test status", + Content: `<p>Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a <a 
href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> instance.</p>`, + }, + expectedFields: []string{ + "This is a test status", + "Import / export of account data via CSV files will be coming in 0.17.0 :) No more having to run scripts + CLI tools to import a list of accounts you follow, after doing a migration to a #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> instance.", + }, + }, + { + status: &gtsmodel.Status{ + Content: `<p><span class="h-card"><a href="https://example.org/@zlatko" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>zlatko</span></a></span> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)</p><p><a href="https://github.com/superseriousbusiness/gotosocial/pull/2863" rel="nofollow noreferrer noopener" target="_blank">https://github.com/superseriousbusiness/gotosocial/pull/2863</a></p>`, + }, + expectedFields: []string{ + "@zlatko <https://example.org/@zlatko> currently we used modernc/sqlite3 for our sqlite driver, but we've been experimenting with wasm sqlite, and will likely move to that permanently in future; in the meantime, both options are available (the latter with a build tag)\n\nhttps://github.com/superseriousbusiness/gotosocial/pull/2863 <https://github.com/superseriousbusiness/gotosocial/pull/2863>", + }, + }, + { + status: &gtsmodel.Status{ + ContentWarning: "Nerd stuff", + Content: `<p>Latest graphs for <a href="https://gts.superseriousbusiness.org/tags/gotosocial" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>GoToSocial</span></a> on <a href="https://github.com/ncruces/go-sqlite3" rel="nofollow noreferrer noopener" target="_blank">Wasm sqlite3</a> with <a 
href="https://codeberg.org/gruf/go-ffmpreg" rel="nofollow noreferrer noopener" target="_blank">embedded Wasm ffmpeg</a>, both running on <a href="https://wazero.io/" rel="nofollow noreferrer noopener" target="_blank">Wazero</a>, and configured with a <a href="https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266" rel="nofollow noreferrer noopener" target="_blank">50MiB db cache target</a>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.</p>`, + Attachments: []*gtsmodel.MediaAttachment{ + { + Description: `Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.`, + }, + { + Description: `Another media attachment`, + }, + }, + Poll: &gtsmodel.Poll{ + Options: []string{ + "Poll option 1", + "Poll option 2", + }, + }, + }, + expectedFields: []string{ + "Nerd stuff", + "Latest graphs for #GoToSocial <https://gts.superseriousbusiness.org/tags/gotosocial> on Wasm sqlite3 <https://github.com/ncruces/go-sqlite3> with embedded Wasm ffmpeg <https://codeberg.org/gruf/go-ffmpreg>, both running on Wazero <https://wazero.io/>, and configured with a 50MiB db cache target <https://github.com/superseriousbusiness/gotosocial/blob/20fe430ef9ff3012a7a4dc2d01b68020c20e13bb/example/config.yaml#L259-L266>. This is the version we'll be releasing soonish, now we're happy with how we've tamed everything.", + "Graph showing GtS using between 150-300 MiB of memory, steadily, over a few days.", + "Another media attachment", + "Poll option 1", + "Poll option 2", + }, + }, + } { + fields := filterableFields(testcase.status) + assert.Equal(t, testcase.expectedFields, fields) + } +} |