diff options
author | 2023-02-03 10:58:58 +0000 | |
---|---|---|
committer | 2023-02-03 11:58:58 +0100 | |
commit | 49beb17a8fbdbf3517c103a477a5459a3bba404d (patch) | |
tree | 364c82d4089c75d3b95a5d78fd31b33d91b30b59 /internal/util/statustools.go | |
parent | [bugfix] Read Bookwyrm Articles more thoroughly (#1410) (diff) | |
download | gotosocial-49beb17a8fbdbf3517c103a477a5459a3bba404d.tar.xz |
[chore] Text formatting overhaul (#1406)
* Implement goldmark debug print for hashtags and mentions
* Minify HTML in FromPlain
* Convert plaintext status parser to goldmark
* Move mention/tag/emoji finding logic into formatter
* Combine mention and hashtag boundary characters
* Normalize unicode when rendering hashtags
Diffstat (limited to 'internal/util/statustools.go')
-rw-r--r-- | internal/util/statustools.go | 110 |
1 files changed, 7 insertions, 103 deletions
diff --git a/internal/util/statustools.go b/internal/util/statustools.go index 80a091623..a4bb15007 100644 --- a/internal/util/statustools.go +++ b/internal/util/statustools.go @@ -20,115 +20,19 @@ package util import ( "unicode" - "unicode/utf8" - - "github.com/superseriousbusiness/gotosocial/internal/regexes" -) - -const ( - maximumHashtagLength = 30 ) -// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text, -// and applies a regex to it to return a deduplicated list of account names -// mentioned in that text, in the format "@user@example.org" or "@username" for -// local users. -func DeriveMentionNamesFromText(text string) []string { - mentionedAccounts := []string{} - for _, m := range regexes.MentionFinder.FindAllStringSubmatch(text, -1) { - mentionedAccounts = append(mentionedAccounts, m[1]) - } - return UniqueStrings(mentionedAccounts) -} - -type Pair[A, B any] struct { - First A - Second B -} - -// Byte index in original string -// `First` includes `#`. -type Span = Pair[int, int] - -// Takes a plaintext (ie., not HTML-formatted) text, -// and returns a slice of unique hashtags. -func DeriveHashtagsFromText(text string) []string { - tagsMap := make(map[string]bool) - tags := []string{} - - for _, v := range FindHashtagSpansInText(text) { - t := text[v.First+1 : v.Second] - if _, value := tagsMap[t]; !value { - tagsMap[t] = true - tags = append(tags, t) - } - } - - return tags -} - -// Takes a plaintext (ie., not HTML-formatted) text, -// and returns a list of pairs of indices into the original string, where -// hashtags are located. -func FindHashtagSpansInText(text string) []Span { - tags := []Span{} - start := 0 - // Keep one rune of lookbehind. - prev := ' ' - inTag := false - - for i, r := range text { - if r == '#' && IsHashtagBoundary(prev) { - // Start of hashtag. - inTag = true - start = i - } else if inTag && !IsPermittedInHashtag(r) && !IsHashtagBoundary(r) { - // Inside the hashtag, but it was a phoney, gottem. 
- inTag = false - } else if inTag && IsHashtagBoundary(r) { - // End of hashtag. - inTag = false - appendTag(&tags, text, start, i) - } else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) { - // End of text. - appendTag(&tags, text, start, irl) - } - - prev = r - } - - return tags -} - -func appendTag(tags *[]Span, text string, start int, end int) { - l := end - start - 1 - // This check could be moved out into the parsing loop if necessary! - if 0 < l && l <= maximumHashtagLength { - *tags = append(*tags, Span{First: start, Second: end}) - } -} - -// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text, -// and applies a regex to it to return a deduplicated list of emojis -// used in that text, without the surrounding `::` -func DeriveEmojisFromText(text string) []string { - emojis := []string{} - for _, m := range regexes.EmojiFinder.FindAllStringSubmatch(text, -1) { - emojis = append(emojis, m[1]) - } - return UniqueStrings(emojis) +func IsPlausiblyInHashtag(r rune) bool { + // Marks are allowed during parsing, prior to normalization, but not after, + // since they may be combined into letters during normalization. + return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r) } func IsPermittedInHashtag(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) } -// Decides where to break before or after a hashtag. -func IsHashtagBoundary(r rune) bool { - return r == '#' || // `###lol` should work - unicode.IsSpace(r) || // All kinds of Unicode whitespace. - unicode.IsControl(r) || // All kinds of control characters, like tab. - // Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`). - // But `someurl/#fragment` should not match, neither should HTML entities like `&#35;`. 
- ('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r)) +// Decides where to break before or after a #hashtag or @mention +func IsMentionOrHashtagBoundary(r rune) bool { + return unicode.IsSpace(r) || unicode.IsPunct(r) } |