diff options
Diffstat (limited to 'internal/util/statustools.go')
-rw-r--r-- | internal/util/statustools.go | 90 |
1 files changed, 82 insertions, 8 deletions
// Hashtag helpers from internal/util/statustools.go (package util), written
// out as the post-change code of the diff. They need the standard-library
// imports "unicode" and "unicode/utf8". The neighbouring functions
// DeriveMentionNamesFromText and DeriveEmojisFromText are untouched by the
// change and only partially visible in the diff context, so they are not
// reproduced here.

const (
	// maximumHashtagLength is the largest number of characters (runes, not
	// bytes) a hashtag may contain, not counting the leading '#'.
	maximumHashtagLength = 30
)

// Pair is a generic two-element tuple.
type Pair[A, B any] struct {
	First  A
	Second B
}

// Span locates a hashtag as a half-open byte range into the original string.
// First is the index of the leading '#'; Second is the index just past the
// last byte of the tag.
type Span = Pair[int, int]

// DeriveHashtagsFromText takes a plaintext (ie., not HTML-formatted) text and
// returns the unique hashtags used in it, without the leading '#', in order
// of first appearance. Case is preserved, so "Tag" and "tag" are distinct.
// The result is always non-nil, even when no hashtag is found.
func DeriveHashtagsFromText(text string) []string {
	seen := make(map[string]struct{})
	tags := []string{}

	for _, span := range FindHashtagSpansInText(text) {
		// Skip the '#' that First points at.
		tag := text[span.First+1 : span.Second]
		if _, dup := seen[tag]; dup {
			continue
		}
		seen[tag] = struct{}{}
		tags = append(tags, tag)
	}

	return tags
}

// FindHashtagSpansInText takes a plaintext (ie., not HTML-formatted) text and
// returns the byte spans of every hashtag in it, in order of appearance.
// Duplicates are not removed. The result is always non-nil.
//
// A hashtag starts at a '#' that follows a boundary rune (or the start of the
// text), continues over letters and numbers, and ends at the next boundary
// rune or at the end of the text. A candidate that runs into any other rune
// (for example '_' or '/') is discarded entirely.
func FindHashtagSpansInText(text string) []Span {
	tags := []Span{}
	start := 0
	// Keep one rune of lookbehind; a virtual leading space lets a hashtag
	// begin at the very start of the text.
	prev := ' '
	inTag := false

	for i, r := range text {
		switch {
		case r == '#' && isHashtagBoundary(prev):
			// Start of a hashtag. Restarting on every '#' in a run makes
			// `###lol` yield the span beginning at the last '#'.
			inTag = true
			start = i
		case !inTag:
			// Ordinary text outside any hashtag; nothing to do.
		case isHashtagBoundary(r):
			// End of hashtag; the boundary rune is not part of the span.
			inTag = false
			appendTag(&tags, text, start, i)
		case !isPermittedInHashtag(r):
			// Neither a tag character nor a boundary: not a hashtag after all.
			inTag = false
		case i+utf8.RuneLen(r) == len(text):
			// A permitted rune that is also the last rune of the text.
			appendTag(&tags, text, start, len(text))
		}

		prev = r
	}

	return tags
}

// appendTag appends the span [start, end) to *tags if the hashtag it covers
// (excluding the leading '#' at start) is non-empty and no longer than
// maximumHashtagLength. The length is measured in runes so that hashtags in
// multi-byte scripts get the same limit as ASCII ones.
func appendTag(tags *[]Span, text string, start int, end int) {
	n := utf8.RuneCountInString(text[start+1 : end])
	if 0 < n && n <= maximumHashtagLength {
		*tags = append(*tags, Span{First: start, Second: end})
	}
}

// isPermittedInHashtag reports whether r may appear inside a hashtag:
// any Unicode letter or number.
func isPermittedInHashtag(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsNumber(r)
}

// isHashtagBoundary reports whether a hashtag may begin directly after r,
// or must end directly before it.
func isHashtagBoundary(r rune) bool {
	switch {
	case r == '#': // `###lol` should work.
		return true
	case unicode.IsSpace(r), unicode.IsControl(r): // Whitespace and control characters.
		return true
	case r == '/', r == '&':
		// `someurl/#fragment` should not match, and neither should HTML
		// entities like `&#35;`.
		return false
	case unicode.Is(unicode.Pc, r):
		// "Punctuation, connector" (like `_`) is not a boundary.
		return false
	default:
		// Every other kind of punctuation is a boundary.
		return unicode.IsPunct(r)
	}
}