diff options
author | 2023-02-03 10:58:58 +0000 | |
---|---|---|
committer | 2023-02-03 11:58:58 +0100 | |
commit | 49beb17a8fbdbf3517c103a477a5459a3bba404d (patch) | |
tree | 364c82d4089c75d3b95a5d78fd31b33d91b30b59 /internal/util | |
parent | [bugfix] Read Bookwyrm Articles more thoroughly (#1410) (diff) | |
download | gotosocial-49beb17a8fbdbf3517c103a477a5459a3bba404d.tar.xz |
[chore] Text formatting overhaul (#1406)
* Implement goldmark debug print for hashtags and mentions
* Minify HTML in FromPlain
* Convert plaintext status parser to goldmark
* Move mention/tag/emoji finding logic into formatter
* Combine mention and hashtag boundary characters
* Normalize unicode when rendering hashtags
Diffstat (limited to 'internal/util')
-rw-r--r-- | internal/util/statustools.go | 110 | ||||
-rw-r--r-- | internal/util/statustools_test.go | 173 |
2 files changed, 7 insertions, 276 deletions
diff --git a/internal/util/statustools.go b/internal/util/statustools.go index 80a091623..a4bb15007 100644 --- a/internal/util/statustools.go +++ b/internal/util/statustools.go @@ -20,115 +20,19 @@ package util import ( "unicode" - "unicode/utf8" - - "github.com/superseriousbusiness/gotosocial/internal/regexes" -) - -const ( - maximumHashtagLength = 30 ) -// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text, -// and applies a regex to it to return a deduplicated list of account names -// mentioned in that text, in the format "@user@example.org" or "@username" for -// local users. -func DeriveMentionNamesFromText(text string) []string { - mentionedAccounts := []string{} - for _, m := range regexes.MentionFinder.FindAllStringSubmatch(text, -1) { - mentionedAccounts = append(mentionedAccounts, m[1]) - } - return UniqueStrings(mentionedAccounts) -} - -type Pair[A, B any] struct { - First A - Second B -} - -// Byte index in original string -// `First` includes `#`. -type Span = Pair[int, int] - -// Takes a plaintext (ie., not HTML-formatted) text, -// and returns a slice of unique hashtags. -func DeriveHashtagsFromText(text string) []string { - tagsMap := make(map[string]bool) - tags := []string{} - - for _, v := range FindHashtagSpansInText(text) { - t := text[v.First+1 : v.Second] - if _, value := tagsMap[t]; !value { - tagsMap[t] = true - tags = append(tags, t) - } - } - - return tags -} - -// Takes a plaintext (ie., not HTML-formatted) text, -// and returns a list of pairs of indices into the original string, where -// hashtags are located. -func FindHashtagSpansInText(text string) []Span { - tags := []Span{} - start := 0 - // Keep one rune of lookbehind. - prev := ' ' - inTag := false - - for i, r := range text { - if r == '#' && IsHashtagBoundary(prev) { - // Start of hashtag. - inTag = true - start = i - } else if inTag && !IsPermittedInHashtag(r) && !IsHashtagBoundary(r) { - // Inside the hashtag, but it was a phoney, gottem. 
- inTag = false - } else if inTag && IsHashtagBoundary(r) { - // End of hashtag. - inTag = false - appendTag(&tags, text, start, i) - } else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) { - // End of text. - appendTag(&tags, text, start, irl) - } - - prev = r - } - - return tags -} - -func appendTag(tags *[]Span, text string, start int, end int) { - l := end - start - 1 - // This check could be moved out into the parsing loop if necessary! - if 0 < l && l <= maximumHashtagLength { - *tags = append(*tags, Span{First: start, Second: end}) - } -} - -// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text, -// and applies a regex to it to return a deduplicated list of emojis -// used in that text, without the surrounding `::` -func DeriveEmojisFromText(text string) []string { - emojis := []string{} - for _, m := range regexes.EmojiFinder.FindAllStringSubmatch(text, -1) { - emojis = append(emojis, m[1]) - } - return UniqueStrings(emojis) +func IsPlausiblyInHashtag(r rune) bool { + // Marks are allowed during parsing, prior to normalization, but not after, + // since they may be combined into letters during normalization. + return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r) } func IsPermittedInHashtag(r rune) bool { return unicode.IsLetter(r) || unicode.IsNumber(r) } -// Decides where to break before or after a hashtag. -func IsHashtagBoundary(r rune) bool { - return r == '#' || // `###lol` should work - unicode.IsSpace(r) || // All kinds of Unicode whitespace. - unicode.IsControl(r) || // All kinds of control characters, like tab. - // Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`). - // But `someurl/#fragment` should not match, neither should HTML entities like `&#35;`. 
- ('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r)) +// Decides where to break before or after a #hashtag or @mention +func IsMentionOrHashtagBoundary(r rune) bool { + return unicode.IsSpace(r) || unicode.IsPunct(r) } diff --git a/internal/util/statustools_test.go b/internal/util/statustools_test.go deleted file mode 100644 index bb01557c5..000000000 --- a/internal/util/statustools_test.go +++ /dev/null @@ -1,173 +0,0 @@ -/* - GoToSocial - Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -package util_test - -import ( - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/suite" - "github.com/superseriousbusiness/gotosocial/internal/util" -) - -type StatusTestSuite struct { - suite.Suite -} - -func (suite *StatusTestSuite) TestLinkNoMention() { - statusText := `here's a link to a post by zork: - -https://localhost:8080/@the_mighty_zork/statuses/01FGVP55XMF2K6316MQRX6PFG1 - -that link shouldn't come out formatted as a mention!` - - menchies := util.DeriveMentionNamesFromText(statusText) - suite.Empty(menchies) -} - -func (suite *StatusTestSuite) TestDeriveMentionsOK() { - statusText := `@dumpsterqueer@example.org testing testing - - is this thing on? - - @someone_else@testing.best-horse.com can you confirm? 
@hello@test.lgbt - - @thisisalocaluser! - - here is a duplicate mention: @hello@test.lgbt @hello@test.lgbt - - @account1@whatever.com @account2@whatever.com - - ` - - menchies := util.DeriveMentionNamesFromText(statusText) - assert.Len(suite.T(), menchies, 6) - assert.Equal(suite.T(), "@dumpsterqueer@example.org", menchies[0]) - assert.Equal(suite.T(), "@someone_else@testing.best-horse.com", menchies[1]) - assert.Equal(suite.T(), "@hello@test.lgbt", menchies[2]) - assert.Equal(suite.T(), "@thisisalocaluser", menchies[3]) - assert.Equal(suite.T(), "@account1@whatever.com", menchies[4]) - assert.Equal(suite.T(), "@account2@whatever.com", menchies[5]) -} - -func (suite *StatusTestSuite) TestDeriveMentionsEmpty() { - statusText := `` - menchies := util.DeriveMentionNamesFromText(statusText) - assert.Len(suite.T(), menchies, 0) -} - -func (suite *StatusTestSuite) TestDeriveHashtagsOK() { - statusText := `weeeeeeee #testing123 #also testing - -# testing this one shouldn't work - - #thisshouldwork #dupe #dupe!! #dupe - - here's a link with a fragment: https://example.org/whatever#ahhh - here's another link with a fragment: https://example.org/whatever/#ahhh - -(#ThisShouldAlsoWork) #not_this_though - -#111111 thisalsoshouldn'twork#### ## - -#alimentación, #saúde, #lävistää, #ö, #네 -#ThisOneIsThirtyOneCharactersLon... 
...ng -#ThisOneIsThirteyCharactersLong -` - - tags := util.DeriveHashtagsFromText(statusText) - assert.Len(suite.T(), tags, 12) - assert.Equal(suite.T(), "testing123", tags[0]) - assert.Equal(suite.T(), "also", tags[1]) - assert.Equal(suite.T(), "thisshouldwork", tags[2]) - assert.Equal(suite.T(), "dupe", tags[3]) - assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4]) - assert.Equal(suite.T(), "111111", tags[5]) - assert.Equal(suite.T(), "alimentación", tags[6]) - assert.Equal(suite.T(), "saúde", tags[7]) - assert.Equal(suite.T(), "lävistää", tags[8]) - assert.Equal(suite.T(), "ö", tags[9]) - assert.Equal(suite.T(), "네", tags[10]) - assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[11]) - - statusText = `#올빼미 hej` - tags = util.DeriveHashtagsFromText(statusText) - assert.Equal(suite.T(), "올빼미", tags[0]) -} - -func (suite *StatusTestSuite) TestHashtagSpansOK() { - statusText := `#0 #3 #8aa` - - spans := util.FindHashtagSpansInText(statusText) - assert.Equal(suite.T(), 0, spans[0].First) - assert.Equal(suite.T(), 2, spans[0].Second) - assert.Equal(suite.T(), 3, spans[1].First) - assert.Equal(suite.T(), 5, spans[1].Second) - assert.Equal(suite.T(), 8, spans[2].First) - assert.Equal(suite.T(), 12, spans[2].Second) -} - -func (suite *StatusTestSuite) TestDeriveEmojiOK() { - statusText := `:test: :another: - -Here's some normal text with an :emoji: at the end - -:spaces shouldnt work: - -:emoji1::emoji2: - -:anotheremoji:emoji2: -:anotheremoji::anotheremoji::anotheremoji::anotheremoji: -:underscores_ok_too: -` - - tags := util.DeriveEmojisFromText(statusText) - assert.Len(suite.T(), tags, 7) - assert.Equal(suite.T(), "test", tags[0]) - assert.Equal(suite.T(), "another", tags[1]) - assert.Equal(suite.T(), "emoji", tags[2]) - assert.Equal(suite.T(), "emoji1", tags[3]) - assert.Equal(suite.T(), "emoji2", tags[4]) - assert.Equal(suite.T(), "anotheremoji", tags[5]) - assert.Equal(suite.T(), "underscores_ok_too", tags[6]) -} - -func (suite *StatusTestSuite) 
TestDeriveMultiple() { - statusText := `Another test @foss_satan@fossbros-anonymous.io - - #HashTag - - Text` - - ms := util.DeriveMentionNamesFromText(statusText) - hs := util.DeriveHashtagsFromText(statusText) - es := util.DeriveEmojisFromText(statusText) - - assert.Len(suite.T(), ms, 1) - assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0]) - - assert.Len(suite.T(), hs, 1) - assert.Contains(suite.T(), hs, "HashTag") - - assert.Len(suite.T(), es, 0) -} - -func TestStatusTestSuite(t *testing.T) { - suite.Run(t, new(StatusTestSuite)) -} |