From efd1a4f717afa83d3d3609f0d70e4da151a8dc9b Mon Sep 17 00:00:00 2001 From: tobi <31960611+tsmethurst@users.noreply.github.com> Date: Mon, 16 Sep 2024 14:00:23 +0200 Subject: [bugfix] Use better plaintext representation of status for filtering (#3301) * [bugfix] Use better plaintext representation of status for filtering * add new deps to readme * lint * update tests * update regexes * address review comments * remove now unused xxhash * whoops, wrong logger * Merge branch 'main' into status_filtering_bugfix * put cache in caches struct * pain --- internal/typeutils/util.go | 62 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'internal/typeutils/util.go') diff --git a/internal/typeutils/util.go b/internal/typeutils/util.go index 3441e89a9..3a867ba35 100644 --- a/internal/typeutils/util.go +++ b/internal/typeutils/util.go @@ -27,6 +27,7 @@ import ( "strconv" "strings" + "github.com/k3a/html2text" apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model" "github.com/superseriousbusiness/gotosocial/internal/config" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" @@ -284,3 +285,64 @@ func ContentToContentLanguage( return contentStr, langTagStr } + +// filterableFields returns text fields from +// a status that we might want to filter on: +// +// - content warning +// - content (converted to plaintext from HTML) +// - media descriptions +// - poll options +// +// Each field should be filtered separately. +// This avoids scenarios where false-positive +// multiple-word matches can be made by matching +// the last word of one field + the first word +// of the next field together. +func filterableFields(s *gtsmodel.Status) []string { + // Estimate length of fields. + fieldCount := 2 + len(s.Attachments) + if s.Poll != nil { + fieldCount += len(s.Poll.Options) + } + fields := make([]string, 0, fieldCount) + + // Content warning / title. + if s.ContentWarning != "" { + fields = append(fields, s.ContentWarning) + } + + // Status content. Though we have raw text + // available for statuses created on our + // instance, use the html2text version to + // remove markdown-formatting characters + // and ensure more consistent filtering. + if s.Content != "" { + text := html2text.HTML2TextWithOptions( + s.Content, + html2text.WithLinksInnerText(), + html2text.WithUnixLineBreaks(), + ) + if text != "" { + fields = append(fields, text) + } + } + + // Media descriptions. + for _, attachment := range s.Attachments { + if attachment.Description != "" { + fields = append(fields, attachment.Description) + } + } + + // Poll options. + if s.Poll != nil { + for _, opt := range s.Poll.Options { + if opt != "" { + fields = append(fields, opt) + } + } + } + + return fields +} -- cgit v1.2.3