diff options
Diffstat (limited to 'internal/util')
-rw-r--r-- | internal/util/statustools.go | 90 | ||||
-rw-r--r-- | internal/util/statustools_test.go | 44 |
2 files changed, 116 insertions, 18 deletions
// maximumHashtagLength is the maximum number of characters (runes, not
// bytes) a hashtag may contain, not counting the leading '#'.
const maximumHashtagLength = 30

// Pair is a generic two-element tuple.
type Pair[A, B any] struct {
	First  A
	Second B
}

// Span delimits one hashtag as byte indices into the original string.
// First is the index of the leading '#'; Second is the index one past the
// final byte of the tag, so text[First+1:Second] is the tag without its '#'.
type Span = Pair[int, int]

// DeriveHashtagsFromText takes a plaintext (ie., not HTML-formatted) text
// and returns the hashtags used in it, without the leading '#'.
//
// Tags are returned in order of first appearance with exact duplicates
// removed. Case is preserved, so deduplication is case-sensitive. The
// returned slice is never nil.
func DeriveHashtagsFromText(text string) []string {
	spans := FindHashtagSpansInText(text)

	seen := make(map[string]struct{}, len(spans))
	tags := make([]string, 0, len(spans))

	for _, span := range spans {
		// Skip the leading '#'.
		tag := text[span.First+1 : span.Second]
		if _, dup := seen[tag]; dup {
			continue
		}
		seen[tag] = struct{}{}
		tags = append(tags, tag)
	}

	return tags
}

// FindHashtagSpansInText takes a plaintext (ie., not HTML-formatted) text
// and returns the byte spans, in order of appearance, at which hashtags are
// located in that text. Duplicate hashtags yield one span each. The
// returned slice is never nil.
//
// A hashtag starts at a '#' that is preceded by a boundary rune (or the start
// of the text), consists solely of letters and numbers, and ends at the next
// boundary rune or at the end of the text. A candidate that runs into a rune
// that is neither permitted nor a boundary (eg., '_') is discarded entirely.
func FindHashtagSpansInText(text string) []Span {
	tags := []Span{}
	start := 0
	// Keep one rune of lookbehind. Seeding it with a space makes the
	// start of the text count as a boundary.
	prev := ' '
	inTag := false

	for i, r := range text {
		switch {
		case r == '#' && isHashtagBoundary(prev):
			// Start of a hashtag (this also restarts on `##`).
			inTag = true
			start = i
		case !inTag:
			// Ordinary text outside of any hashtag; nothing to do.
		case isHashtagBoundary(r):
			// End of the hashtag.
			inTag = false
			appendTag(&tags, text, start, i)
		case !isPermittedInHashtag(r):
			// Illegal rune inside the candidate; drop the candidate.
			inTag = false
		}

		prev = r
	}

	// A hashtag still open when the text runs out ends at the end of text.
	if inTag {
		appendTag(&tags, text, start, len(text))
	}

	return tags
}

// appendTag appends the span [start, end) to tags if the hashtag it
// delimits is non-empty and at most maximumHashtagLength characters long.
// start is the byte index of the leading '#', which is not counted.
//
// The length is measured in runes rather than bytes so that hashtags in
// scripts with multi-byte UTF-8 encodings get the same limit as ASCII ones.
func appendTag(tags *[]Span, text string, start int, end int) {
	n := utf8.RuneCountInString(text[start+1 : end])
	if 0 < n && n <= maximumHashtagLength {
		*tags = append(*tags, Span{First: start, Second: end})
	}
}

// isPermittedInHashtag reports whether r may appear inside a hashtag:
// any Unicode letter or number.
func isPermittedInHashtag(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsNumber(r)
}

// isHashtagBoundary reports whether r is a rune before which a hashtag may
// start and at which a hashtag ends.
func isHashtagBoundary(r rune) bool {
	switch {
	case r == '#', // `###lol` should work.
		unicode.IsSpace(r),   // All kinds of Unicode whitespace.
		unicode.IsControl(r): // All kinds of control characters.
		return true
	case r == '/', r == '&':
		// `someurl/#fragment` should not match, and neither should
		// HTML entities like `&#35;`.
		return false
	default:
		// Most kinds of punctuation, except "Pc"
		// ("Punctuation, connector", like `_`).
		return unicode.IsPunct(r) && !unicode.Is(unicode.Pc, r)
	}
}
util.FindHashtagSpansInText(statusText) + assert.Equal(suite.T(), 0, spans[0].First) + assert.Equal(suite.T(), 2, spans[0].Second) + assert.Equal(suite.T(), 3, spans[1].First) + assert.Equal(suite.T(), 5, spans[1].Second) + assert.Equal(suite.T(), 8, spans[2].First) + assert.Equal(suite.T(), 12, spans[2].Second) } func (suite *StatusTestSuite) TestDeriveEmojiOK() { @@ -127,7 +151,7 @@ Here's some normal text with an :emoji: at the end func (suite *StatusTestSuite) TestDeriveMultiple() { statusText := `Another test @foss_satan@fossbros-anonymous.io - #Hashtag + #HashTag Text` @@ -139,7 +163,7 @@ func (suite *StatusTestSuite) TestDeriveMultiple() { assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0]) assert.Len(suite.T(), hs, 1) - assert.Equal(suite.T(), "Hashtag", hs[0]) + assert.Contains(suite.T(), hs, "HashTag") assert.Len(suite.T(), es, 0) } |