summary | refs | log | tree | commit | diff
path: root/internal/util
diff options
context:
space:
mode:
author: Autumn! <86073772+autumnull@users.noreply.github.com> 2023-02-03 10:58:58 +0000
committer: GitHub <noreply@github.com> 2023-02-03 11:58:58 +0100
commit: 49beb17a8fbdbf3517c103a477a5459a3bba404d (patch)
tree: 364c82d4089c75d3b95a5d78fd31b33d91b30b59 /internal/util
parent: [bugfix] Read Bookwyrm Articles more thoroughly (#1410) (diff)
download: gotosocial-49beb17a8fbdbf3517c103a477a5459a3bba404d.tar.xz
[chore] Text formatting overhaul (#1406)
* Implement goldmark debug print for hashtags and mentions
* Minify HTML in FromPlain
* Convert plaintext status parser to goldmark
* Move mention/tag/emoji finding logic into formatter
* Combine mention and hashtag boundary characters
* Normalize unicode when rendering hashtags
Diffstat (limited to 'internal/util')
-rw-r--r-- internal/util/statustools.go | 110
-rw-r--r-- internal/util/statustools_test.go | 173
2 files changed, 7 insertions, 276 deletions
diff --git a/internal/util/statustools.go b/internal/util/statustools.go
index 80a091623..a4bb15007 100644
--- a/internal/util/statustools.go
+++ b/internal/util/statustools.go
@@ -20,115 +20,19 @@ package util
import (
"unicode"
- "unicode/utf8"
-
- "github.com/superseriousbusiness/gotosocial/internal/regexes"
-)
-
-const (
- maximumHashtagLength = 30
)
-// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text,
-// and applies a regex to it to return a deduplicated list of account names
-// mentioned in that text, in the format "@user@example.org" or "@username" for
-// local users.
-func DeriveMentionNamesFromText(text string) []string {
- mentionedAccounts := []string{}
- for _, m := range regexes.MentionFinder.FindAllStringSubmatch(text, -1) {
- mentionedAccounts = append(mentionedAccounts, m[1])
- }
- return UniqueStrings(mentionedAccounts)
-}
-
-type Pair[A, B any] struct {
- First A
- Second B
-}
-
-// Byte index in original string
-// `First` includes `#`.
-type Span = Pair[int, int]
-
-// Takes a plaintext (ie., not HTML-formatted) text,
-// and returns a slice of unique hashtags.
-func DeriveHashtagsFromText(text string) []string {
- tagsMap := make(map[string]bool)
- tags := []string{}
-
- for _, v := range FindHashtagSpansInText(text) {
- t := text[v.First+1 : v.Second]
- if _, value := tagsMap[t]; !value {
- tagsMap[t] = true
- tags = append(tags, t)
- }
- }
-
- return tags
-}
-
-// Takes a plaintext (ie., not HTML-formatted) text,
-// and returns a list of pairs of indices into the original string, where
-// hashtags are located.
-func FindHashtagSpansInText(text string) []Span {
- tags := []Span{}
- start := 0
- // Keep one rune of lookbehind.
- prev := ' '
- inTag := false
-
- for i, r := range text {
- if r == '#' && IsHashtagBoundary(prev) {
- // Start of hashtag.
- inTag = true
- start = i
- } else if inTag && !IsPermittedInHashtag(r) && !IsHashtagBoundary(r) {
- // Inside the hashtag, but it was a phoney, gottem.
- inTag = false
- } else if inTag && IsHashtagBoundary(r) {
- // End of hashtag.
- inTag = false
- appendTag(&tags, text, start, i)
- } else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) {
- // End of text.
- appendTag(&tags, text, start, irl)
- }
-
- prev = r
- }
-
- return tags
-}
-
-func appendTag(tags *[]Span, text string, start int, end int) {
- l := end - start - 1
- // This check could be moved out into the parsing loop if necessary!
- if 0 < l && l <= maximumHashtagLength {
- *tags = append(*tags, Span{First: start, Second: end})
- }
-}
-
-// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text,
-// and applies a regex to it to return a deduplicated list of emojis
-// used in that text, without the surrounding `::`
-func DeriveEmojisFromText(text string) []string {
- emojis := []string{}
- for _, m := range regexes.EmojiFinder.FindAllStringSubmatch(text, -1) {
- emojis = append(emojis, m[1])
- }
- return UniqueStrings(emojis)
+func IsPlausiblyInHashtag(r rune) bool {
+ // Marks are allowed during parsing, prior to normalization, but not after,
+ // since they may be combined into letters during normalization.
+ return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r)
}
func IsPermittedInHashtag(r rune) bool {
return unicode.IsLetter(r) || unicode.IsNumber(r)
}
-// Decides where to break before or after a hashtag.
-func IsHashtagBoundary(r rune) bool {
- return r == '#' || // `###lol` should work
- unicode.IsSpace(r) || // All kinds of Unicode whitespace.
- unicode.IsControl(r) || // All kinds of control characters, like tab.
- // Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`).
- // But `someurl/#fragment` should not match, neither should HTML entities like `&#35;`.
- ('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r))
+// Decides where to break before or after a #hashtag or @mention
+func IsMentionOrHashtagBoundary(r rune) bool {
+ return unicode.IsSpace(r) || unicode.IsPunct(r)
}
diff --git a/internal/util/statustools_test.go b/internal/util/statustools_test.go
deleted file mode 100644
index bb01557c5..000000000
--- a/internal/util/statustools_test.go
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- GoToSocial
- Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU Affero General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU Affero General Public License for more details.
-
- You should have received a copy of the GNU Affero General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-package util_test
-
-import (
- "testing"
-
- "github.com/stretchr/testify/assert"
- "github.com/stretchr/testify/suite"
- "github.com/superseriousbusiness/gotosocial/internal/util"
-)
-
-type StatusTestSuite struct {
- suite.Suite
-}
-
-func (suite *StatusTestSuite) TestLinkNoMention() {
- statusText := `here's a link to a post by zork:
-
-https://localhost:8080/@the_mighty_zork/statuses/01FGVP55XMF2K6316MQRX6PFG1
-
-that link shouldn't come out formatted as a mention!`
-
- menchies := util.DeriveMentionNamesFromText(statusText)
- suite.Empty(menchies)
-}
-
-func (suite *StatusTestSuite) TestDeriveMentionsOK() {
- statusText := `@dumpsterqueer@example.org testing testing
-
- is this thing on?
-
- @someone_else@testing.best-horse.com can you confirm? @hello@test.lgbt
-
- @thisisalocaluser!
-
- here is a duplicate mention: @hello@test.lgbt @hello@test.lgbt
-
- @account1@whatever.com @account2@whatever.com
-
- `
-
- menchies := util.DeriveMentionNamesFromText(statusText)
- assert.Len(suite.T(), menchies, 6)
- assert.Equal(suite.T(), "@dumpsterqueer@example.org", menchies[0])
- assert.Equal(suite.T(), "@someone_else@testing.best-horse.com", menchies[1])
- assert.Equal(suite.T(), "@hello@test.lgbt", menchies[2])
- assert.Equal(suite.T(), "@thisisalocaluser", menchies[3])
- assert.Equal(suite.T(), "@account1@whatever.com", menchies[4])
- assert.Equal(suite.T(), "@account2@whatever.com", menchies[5])
-}
-
-func (suite *StatusTestSuite) TestDeriveMentionsEmpty() {
- statusText := ``
- menchies := util.DeriveMentionNamesFromText(statusText)
- assert.Len(suite.T(), menchies, 0)
-}
-
-func (suite *StatusTestSuite) TestDeriveHashtagsOK() {
- statusText := `weeeeeeee #testing123 #also testing
-
-# testing this one shouldn't work
-
- #thisshouldwork #dupe #dupe!! #dupe
-
- here's a link with a fragment: https://example.org/whatever#ahhh
- here's another link with a fragment: https://example.org/whatever/#ahhh
-
-(#ThisShouldAlsoWork) #not_this_though
-
-#111111 thisalsoshouldn'twork#### ##
-
-#alimentación, #saúde, #lävistää, #ö, #네
-#ThisOneIsThirtyOneCharactersLon... ...ng
-#ThisOneIsThirteyCharactersLong
-`
-
- tags := util.DeriveHashtagsFromText(statusText)
- assert.Len(suite.T(), tags, 12)
- assert.Equal(suite.T(), "testing123", tags[0])
- assert.Equal(suite.T(), "also", tags[1])
- assert.Equal(suite.T(), "thisshouldwork", tags[2])
- assert.Equal(suite.T(), "dupe", tags[3])
- assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4])
- assert.Equal(suite.T(), "111111", tags[5])
- assert.Equal(suite.T(), "alimentación", tags[6])
- assert.Equal(suite.T(), "saúde", tags[7])
- assert.Equal(suite.T(), "lävistää", tags[8])
- assert.Equal(suite.T(), "ö", tags[9])
- assert.Equal(suite.T(), "네", tags[10])
- assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[11])
-
- statusText = `#올빼미 hej`
- tags = util.DeriveHashtagsFromText(statusText)
- assert.Equal(suite.T(), "올빼미", tags[0])
-}
-
-func (suite *StatusTestSuite) TestHashtagSpansOK() {
- statusText := `#0 #3 #8aa`
-
- spans := util.FindHashtagSpansInText(statusText)
- assert.Equal(suite.T(), 0, spans[0].First)
- assert.Equal(suite.T(), 2, spans[0].Second)
- assert.Equal(suite.T(), 3, spans[1].First)
- assert.Equal(suite.T(), 5, spans[1].Second)
- assert.Equal(suite.T(), 8, spans[2].First)
- assert.Equal(suite.T(), 12, spans[2].Second)
-}
-
-func (suite *StatusTestSuite) TestDeriveEmojiOK() {
- statusText := `:test: :another:
-
-Here's some normal text with an :emoji: at the end
-
-:spaces shouldnt work:
-
-:emoji1::emoji2:
-
-:anotheremoji:emoji2:
-:anotheremoji::anotheremoji::anotheremoji::anotheremoji:
-:underscores_ok_too:
-`
-
- tags := util.DeriveEmojisFromText(statusText)
- assert.Len(suite.T(), tags, 7)
- assert.Equal(suite.T(), "test", tags[0])
- assert.Equal(suite.T(), "another", tags[1])
- assert.Equal(suite.T(), "emoji", tags[2])
- assert.Equal(suite.T(), "emoji1", tags[3])
- assert.Equal(suite.T(), "emoji2", tags[4])
- assert.Equal(suite.T(), "anotheremoji", tags[5])
- assert.Equal(suite.T(), "underscores_ok_too", tags[6])
-}
-
-func (suite *StatusTestSuite) TestDeriveMultiple() {
- statusText := `Another test @foss_satan@fossbros-anonymous.io
-
- #HashTag
-
- Text`
-
- ms := util.DeriveMentionNamesFromText(statusText)
- hs := util.DeriveHashtagsFromText(statusText)
- es := util.DeriveEmojisFromText(statusText)
-
- assert.Len(suite.T(), ms, 1)
- assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0])
-
- assert.Len(suite.T(), hs, 1)
- assert.Contains(suite.T(), hs, "HashTag")
-
- assert.Len(suite.T(), es, 0)
-}
-
-func TestStatusTestSuite(t *testing.T) {
- suite.Run(t, new(StatusTestSuite))
-}