diff options
author | 2023-02-03 10:58:58 +0000 | |
---|---|---|
committer | 2023-02-03 11:58:58 +0100 | |
commit | 49beb17a8fbdbf3517c103a477a5459a3bba404d (patch) | |
tree | 364c82d4089c75d3b95a5d78fd31b33d91b30b59 /internal/text | |
parent | [bugfix] Read Bookwyrm Articles more thoroughly (#1410) (diff) | |
download | gotosocial-49beb17a8fbdbf3517c103a477a5459a3bba404d.tar.xz |
[chore] Text formatting overhaul (#1406)
* Implement goldmark debug print for hashtags and mentions
* Minify HTML in FromPlain
* Convert plaintext status parser to goldmark
* Move mention/tag/emoji finding logic into formatter
* Combine mention and hashtag boundary characters
* Normalize unicode when rendering hashtags
Diffstat (limited to 'internal/text')
-rw-r--r-- | internal/text/common.go | 112 | ||||
-rw-r--r-- | internal/text/common_test.go | 106 | ||||
-rw-r--r-- | internal/text/emojionly.go | 71 | ||||
-rw-r--r-- | internal/text/formatter.go | 24 | ||||
-rw-r--r-- | internal/text/formatter_test.go | 22 | ||||
-rw-r--r-- | internal/text/goldmark_extension.go (renamed from internal/text/markdownextension.go) | 159 | ||||
-rw-r--r-- | internal/text/goldmark_plaintext.go | 64 | ||||
-rw-r--r-- | internal/text/link.go | 86 | ||||
-rw-r--r-- | internal/text/link_test.go | 157 | ||||
-rw-r--r-- | internal/text/markdown.go | 54 | ||||
-rw-r--r-- | internal/text/markdown_test.go | 117 | ||||
-rw-r--r-- | internal/text/minify.go | 45 | ||||
-rw-r--r-- | internal/text/plain.go | 68 | ||||
-rw-r--r-- | internal/text/plain_test.go | 125 | ||||
-rw-r--r-- | internal/text/replace.go | 141 |
15 files changed, 705 insertions, 646 deletions
diff --git a/internal/text/common.go b/internal/text/common.go deleted file mode 100644 index 2293ca3fe..000000000 --- a/internal/text/common.go +++ /dev/null @@ -1,112 +0,0 @@ -/* - GoToSocial - Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -package text - -import ( - "bytes" - "context" - "strings" - "unicode" - - "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" - "github.com/superseriousbusiness/gotosocial/internal/log" - "github.com/superseriousbusiness/gotosocial/internal/regexes" - "github.com/superseriousbusiness/gotosocial/internal/util" -) - -func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string { - spans := util.FindHashtagSpansInText(in) - - if len(spans) == 0 { - return in - } - - var b strings.Builder - i := 0 - -spans: - for _, t := range spans { - b.WriteString(in[i:t.First]) - i = t.Second - tagAsEntered := in[t.First+1 : t.Second] - - for _, tag := range tags { - if strings.EqualFold(tagAsEntered, tag.Name) { - // replace the #tag with the formatted tag content - // `<a href="tag.URL" class="mention hashtag" rel="tag">#<span>tagAsEntered</span></a> - b.WriteString(`<a href="`) - b.WriteString(tag.URL) - b.WriteString(`" class="mention hashtag" rel="tag">#<span>`) - b.WriteString(tagAsEntered) - b.WriteString(`</span></a>`) - continue spans - } - } - - b.WriteString(in[t.First:t.Second]) - } - - // Get the last bits. - i = spans[len(spans)-1].Second - b.WriteString(in[i:]) - - return b.String() -} - -func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string { - return regexes.ReplaceAllStringFunc(regexes.MentionFinder, in, func(match string, buf *bytes.Buffer) string { - // we have a match, trim any spaces - matchTrimmed := strings.TrimSpace(match) - - // check through mentions to find what we're matching - for _, menchie := range mentions { - if strings.EqualFold(matchTrimmed, menchie.NameString) { - // make sure we have an account attached to this mention - if menchie.TargetAccount == nil { - a, err := f.db.GetAccountByID(ctx, menchie.TargetAccountID) - if err != nil { - log.Errorf("error getting account with id %s from the db: %s", menchie.TargetAccountID, err) - return match - } - menchie.TargetAccount = a - } - - // The mention's target is our target - targetAccount := menchie.TargetAccount - - // Add any dropped space from match - if unicode.IsSpace(rune(match[0])) { - buf.WriteByte(match[0]) - } - - // replace the mention with the formatted mention content - // <span class="h-card"><a href="targetAccount.URL" class="u-url mention">@<span>targetAccount.Username</span></a></span> - buf.WriteString(`<span class="h-card"><a href="`) - buf.WriteString(targetAccount.URL) - buf.WriteString(`" class="u-url mention">@<span>`) - buf.WriteString(targetAccount.Username) - buf.WriteString(`</span></a></span>`) - return buf.String() - } - } - - // the match wasn't in the list of mentions for whatever reason, so just return the match as we found it so nothing changes - return match - }) -} diff --git a/internal/text/common_test.go b/internal/text/common_test.go deleted file mode 100644 index 3949226ca..000000000 --- a/internal/text/common_test.go +++ /dev/null @@ -1,106 +0,0 @@ -/* - GoToSocial - Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -package text_test - -import ( - "context" - "testing" - "time" - - "github.com/stretchr/testify/suite" - "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" -) - -const ( - replaceMentionsString = "Another test @foss_satan@fossbros-anonymous.io\n\n#Hashtag\n\nText" - replaceMentionsExpected = "Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\">@<span>foss_satan</span></a></span>\n\n#Hashtag\n\nText" - replaceHashtagsExpected = "Another test @foss_satan@fossbros-anonymous.io\n\n<a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>Hashtag</span></a>\n\nText" - replaceHashtagsAfterMentionsExpected = "Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\">@<span>foss_satan</span></a></span>\n\n<a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag\">#<span>Hashtag</span></a>\n\nText" - replaceMentionsWithLinkString = "Another test @foss_satan@fossbros-anonymous.io\n\nhttp://fossbros-anonymous.io/@foss_satan/statuses/6675ee73-fccc-4562-a46a-3e8cd9798060" - replaceMentionsWithLinkStringExpected = "Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\">@<span>foss_satan</span></a></span>\n\nhttp://fossbros-anonymous.io/@foss_satan/statuses/6675ee73-fccc-4562-a46a-3e8cd9798060" - replaceMentionsWithLinkSelfString = "Mentioning myself: @the_mighty_zork\n\nand linking to my own status: https://localhost:8080/@the_mighty_zork/statuses/01FGXKJRX2PMERJQ9EQF8Y6HCR" - replaceMemtionsWithLinkSelfExpected = "Mentioning myself: <span class=\"h-card\"><a href=\"http://localhost:8080/@the_mighty_zork\" class=\"u-url mention\">@<span>the_mighty_zork</span></a></span>\n\nand linking to my own status: https://localhost:8080/@the_mighty_zork/statuses/01FGXKJRX2PMERJQ9EQF8Y6HCR" -) - -type CommonTestSuite struct { - TextStandardTestSuite -} - -func (suite *CommonTestSuite) TestReplaceMentions() { - foundMentions := []*gtsmodel.Mention{ - suite.testMentions["zork_mention_foss_satan"], - } - - f := suite.formatter.ReplaceMentions(context.Background(), replaceMentionsString, foundMentions) - suite.Equal(replaceMentionsExpected, f) -} - -func (suite *CommonTestSuite) TestReplaceHashtags() { - foundTags := []*gtsmodel.Tag{ - suite.testTags["Hashtag"], - } - - f := suite.formatter.ReplaceTags(context.Background(), replaceMentionsString, foundTags) - - suite.Equal(replaceHashtagsExpected, f) -} - -func (suite *CommonTestSuite) TestReplaceHashtagsAfterReplaceMentions() { - foundTags := []*gtsmodel.Tag{ - suite.testTags["Hashtag"], - } - - f := suite.formatter.ReplaceTags(context.Background(), replaceMentionsExpected, foundTags) - - suite.Equal(replaceHashtagsAfterMentionsExpected, f) -} - -func (suite *CommonTestSuite) TestReplaceMentionsWithLink() { - foundMentions := []*gtsmodel.Mention{ - suite.testMentions["zork_mention_foss_satan"], - } - - f := suite.formatter.ReplaceMentions(context.Background(), replaceMentionsWithLinkString, foundMentions) - suite.Equal(replaceMentionsWithLinkStringExpected, f) -} - -func (suite *CommonTestSuite) TestReplaceMentionsWithLinkSelf() { - mentioningAccount := suite.testAccounts["local_account_1"] - - foundMentions := []*gtsmodel.Mention{ - { - ID: "01FGXKN5F815DVFVD53PN9NYM6", - CreatedAt: time.Now(), - UpdatedAt: time.Now(), - StatusID: "01FGXKP0S5THQXFC1D9R141DDR", - OriginAccountID: mentioningAccount.ID, - TargetAccountID: mentioningAccount.ID, - NameString: "@the_mighty_zork", - TargetAccountURI: mentioningAccount.URI, - TargetAccountURL: mentioningAccount.URL, - }, - } - - f := suite.formatter.ReplaceMentions(context.Background(), replaceMentionsWithLinkSelfString, foundMentions) - suite.Equal(replaceMemtionsWithLinkSelfExpected, f) -} - -func TestCommonTestSuite(t *testing.T) { - suite.Run(t, new(CommonTestSuite)) -} diff --git a/internal/text/emojionly.go b/internal/text/emojionly.go new file mode 100644 index 000000000..1a3c0e968 --- /dev/null +++ b/internal/text/emojionly.go @@ -0,0 +1,71 @@ +/* + GoToSocial + Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package text + +import ( + "bytes" + "context" + + "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" + "github.com/superseriousbusiness/gotosocial/internal/log" + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer/html" + "github.com/yuin/goldmark/util" +) + +func (f *formatter) FromPlainEmojiOnly(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult { + result := &FormatResult{ + Mentions: []*gtsmodel.Mention{}, + Tags: []*gtsmodel.Tag{}, + Emojis: []*gtsmodel.Emoji{}, + } + // parse markdown text into html, using custom renderer to add hashtag/mention links + md := goldmark.New( + goldmark.WithRendererOptions( + html.WithXHTML(), + html.WithHardWraps(), + ), + goldmark.WithParser( + parser.NewParser( + parser.WithBlockParsers( + util.Prioritized(newPlaintextParser(), 500), + ), + ), + ), + goldmark.WithExtensions( + &customRenderer{f, ctx, pmf, authorID, statusID, true, result}, + ), + ) + + var htmlContentBytes bytes.Buffer + err := md.Convert([]byte(plain), &htmlContentBytes) + if err != nil { + log.Errorf("error formatting plaintext to HTML: %s", err) + } + result.HTML = htmlContentBytes.String() + + // clean anything dangerous out of the HTML + result.HTML = SanitizeHTML(result.HTML) + + // shrink ray + result.HTML = minifyHTML(result.HTML) + + return result +} diff --git a/internal/text/formatter.go b/internal/text/formatter.go index cb4de402b..bdad6c0f8 100644 --- a/internal/text/formatter.go +++ b/internal/text/formatter.go @@ -26,20 +26,19 @@ import ( ) // Formatter wraps some logic and functions for parsing statuses and other text input into nice html. +// Each of the member functions returns a struct containing the formatted HTML and any tags, mentions, and +// emoji that were found in the text. type Formatter interface { // FromPlain parses an HTML text from a plaintext. - FromPlain(ctx context.Context, plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string + FromPlain(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult // FromMarkdown parses an HTML text from a markdown-formatted text. - FromMarkdown(ctx context.Context, md string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag, emojis []*gtsmodel.Emoji) string - - // ReplaceTags takes a piece of text and a slice of tags, and returns the same text with the tags nicely formatted as hrefs. - ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string - // ReplaceMentions takes a piece of text and a slice of mentions, and returns the same text with the mentions nicely formatted as hrefs. - ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string - // ReplaceLinks takes a piece of text, finds all recognizable links in that text, and replaces them with hrefs. - ReplaceLinks(ctx context.Context, in string) string + FromMarkdown(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, md string) *FormatResult + // FromPlainEmojiOnly parses an HTML text from a plaintext, only parsing emojis and not mentions etc. + FromPlainEmojiOnly(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult } +type FormatFunc func(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, text string) *FormatResult + type formatter struct { db db.DB } @@ -50,3 +49,10 @@ func NewFormatter(db db.DB) Formatter { db: db, } } + +type FormatResult struct { + HTML string + Mentions []*gtsmodel.Mention + Tags []*gtsmodel.Tag + Emojis []*gtsmodel.Emoji +} diff --git a/internal/text/formatter_test.go b/internal/text/formatter_test.go index 438a69c78..32ae74488 100644 --- a/internal/text/formatter_test.go +++ b/internal/text/formatter_test.go @@ -19,9 +19,13 @@ package text_test import ( + "context" "github.com/stretchr/testify/suite" + "github.com/superseriousbusiness/gotosocial/internal/concurrency" "github.com/superseriousbusiness/gotosocial/internal/db" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" + "github.com/superseriousbusiness/gotosocial/internal/messages" + "github.com/superseriousbusiness/gotosocial/internal/processing" "github.com/superseriousbusiness/gotosocial/internal/text" "github.com/superseriousbusiness/gotosocial/testrig" ) @@ -29,7 +33,8 @@ import ( type TextStandardTestSuite struct { // standard suite interfaces suite.Suite - db db.DB + db db.DB + parseMention gtsmodel.ParseMentionFunc // standard suite models testTokens map[string]*gtsmodel.Token @@ -41,6 +46,7 @@ type TextStandardTestSuite struct { testStatuses map[string]*gtsmodel.Status testTags map[string]*gtsmodel.Tag testMentions map[string]*gtsmodel.Mention + testEmojis map[string]*gtsmodel.Emoji // module being tested formatter text.Formatter @@ -56,6 +62,7 @@ func (suite *TextStandardTestSuite) SetupSuite() { suite.testStatuses = testrig.NewTestStatuses() suite.testTags = testrig.NewTestTags() suite.testMentions = testrig.NewTestMentions() + suite.testEmojis = testrig.NewTestEmojis() } func (suite *TextStandardTestSuite) SetupTest() { @@ -63,6 +70,11 @@ func (suite *TextStandardTestSuite) SetupTest() { testrig.InitTestConfig() suite.db = testrig.NewTestDB() + + fedWorker := concurrency.NewWorkerPool[messages.FromFederator](-1, -1) + federator := testrig.NewTestFederator(suite.db, testrig.NewTestTransportController(testrig.NewMockHTTPClient(nil, "../../testrig/media"), suite.db, fedWorker), nil, nil, fedWorker) + suite.parseMention = processing.GetParseMentionFunc(suite.db, federator) + suite.formatter = text.NewFormatter(suite.db) testrig.StandardDBSetup(suite.db, nil) @@ -71,3 +83,11 @@ func (suite *TextStandardTestSuite) SetupTest() { func (suite *TextStandardTestSuite) TearDownTest() { testrig.StandardDBTeardown(suite.db) } + +func (suite *TextStandardTestSuite) FromMarkdown(text string) *text.FormatResult { + return suite.formatter.FromMarkdown(context.Background(), suite.parseMention, suite.testAccounts["local_account_1"].ID, "status_ID", text) +} + +func (suite *TextStandardTestSuite) FromPlain(text string) *text.FormatResult { + return suite.formatter.FromPlain(context.Background(), suite.parseMention, suite.testAccounts["local_account_1"].ID, "status_ID", text) +} diff --git a/internal/text/markdownextension.go b/internal/text/goldmark_extension.go index 2d8eae907..11e4fde28 100644 --- a/internal/text/markdownextension.go +++ b/internal/text/goldmark_extension.go @@ -17,8 +17,10 @@ package text import ( "context" - "unicode" + "fmt" + "strings" + "github.com/superseriousbusiness/gotosocial/internal/db" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/regexes" @@ -46,8 +48,14 @@ type hashtag struct { Segment text.Segment } +type emoji struct { + ast.BaseInline + Segment text.Segment +} + var kindMention = ast.NewNodeKind("Mention") var kindHashtag = ast.NewNodeKind("Hashtag") +var kindEmoji = ast.NewNodeKind("Emoji") func (n *mention) Kind() ast.NodeKind { return kindMention @@ -57,14 +65,21 @@ func (n *hashtag) Kind() ast.NodeKind { return kindHashtag } -// Dump is used by goldmark for debugging. It is implemented only minimally because -// it is not used in our code. +func (n *emoji) Kind() ast.NodeKind { + return kindEmoji +} + +// Dump can be used for debugging. func (n *mention) Dump(source []byte, level int) { - ast.DumpHelper(n, source, level, nil, nil) + fmt.Printf("%sMention: %s\n", strings.Repeat(" ", level), string(n.Segment.Value(source))) } func (n *hashtag) Dump(source []byte, level int) { - ast.DumpHelper(n, source, level, nil, nil) + fmt.Printf("%sHashtag: %s\n", strings.Repeat(" ", level), string(n.Segment.Value(source))) +} + +func (n *emoji) Dump(source []byte, level int) { + fmt.Printf("%sEmoji: %s\n", strings.Repeat(" ", level), string(n.Segment.Value(source))) } // newMention and newHashtag create a goldmark ast.Node from a goldmark text.Segment. @@ -83,6 +98,13 @@ func newHashtag(s text.Segment) *hashtag { } } +func newEmoji(s text.Segment) *emoji { + return &emoji{ + BaseInline: ast.BaseInline{}, + Segment: s, + } +} + // mentionParser and hashtagParser fulfil the goldmark parser.InlineParser interface. type mentionParser struct { } @@ -90,6 +112,9 @@ type mentionParser struct { type hashtagParser struct { } +type emojiParser struct { +} + func (p *mentionParser) Trigger() []byte { return []byte{'@'} } @@ -98,11 +123,15 @@ func (p *hashtagParser) Trigger() []byte { return []byte{'#'} } +func (p *emojiParser) Trigger() []byte { + return []byte{':'} +} + func (p *mentionParser) Parse(parent ast.Node, block text.Reader, pc parser.Context) ast.Node { before := block.PrecendingCharacter() line, segment := block.PeekLine() - if !unicode.IsSpace(before) { + if !util.IsMentionOrHashtagBoundary(before) { return nil } @@ -124,59 +153,88 @@ func (p *hashtagParser) Parse(parent ast.Node, block text.Reader, pc parser.Cont line, segment := block.PeekLine() s := string(line) - if !util.IsHashtagBoundary(before) { + if !util.IsMentionOrHashtagBoundary(before) || len(s) == 1 { return nil } for i, r := range s { switch { case r == '#' && i == 0: + // ignore initial # continue - case !util.IsPermittedInHashtag(r) && !util.IsHashtagBoundary(r): + case !util.IsPlausiblyInHashtag(r) && !util.IsMentionOrHashtagBoundary(r): // Fake hashtag, don't trust it return nil - case util.IsHashtagBoundary(r): + case util.IsMentionOrHashtagBoundary(r): + if i <= 1 { + // empty + return nil + } // End of hashtag block.Advance(i) return newHashtag(segment.WithStop(segment.Start + i)) } } - // If we don't find invalid characters before the end of the line then it's good - block.Advance(len(s)) + // If we don't find invalid characters before the end of the line then it's all hashtag, babey + block.Advance(segment.Len()) return newHashtag(segment) } +func (p *emojiParser) Parse(parent ast.Node, block text.Reader, pc parser.Context) ast.Node { + line, segment := block.PeekLine() + + // unideal for performance but makes use of existing regex + loc := regexes.EmojiFinder.FindIndex(line) + switch { + case loc == nil: + fallthrough + case loc[0] != 0: // fail if not found at start + return nil + default: + block.Advance(loc[1]) + return newEmoji(segment.WithStop(segment.Start + loc[1])) + } +} + // customRenderer fulfils both the renderer.NodeRenderer and goldmark.Extender interfaces. -// It is created in FromMarkdown to be used a goldmark extension, and the fields are used -// when rendering mentions and tags. +// It is created in FromMarkdown and FromPlain to be used as a goldmark extension, and the +// fields are used to report tags and mentions to the caller for use as metadata. type customRenderer struct { - f *formatter - ctx context.Context - mentions []*gtsmodel.Mention - tags []*gtsmodel.Tag + f *formatter + ctx context.Context + parseMention gtsmodel.ParseMentionFunc + accountID string + statusID string + emojiOnly bool + result *FormatResult } func (r *customRenderer) RegisterFuncs(reg renderer.NodeRendererFuncRegisterer) { reg.Register(kindMention, r.renderMention) reg.Register(kindHashtag, r.renderHashtag) + reg.Register(kindEmoji, r.renderEmoji) } func (r *customRenderer) Extend(m goldmark.Markdown) { + // 1000 is set as the lowest priority, but it's arbitrary m.Parser().AddOptions(parser.WithInlineParsers( - // 500 is pretty arbitrary here, it was copied from example goldmark extension code. - // https://github.com/yuin/goldmark/blob/75d8cce5b78c7e1d5d9c4ca32c1164f0a1e57b53/extension/strikethrough.go#L111 - mdutil.Prioritized(&mentionParser{}, 500), - mdutil.Prioritized(&hashtagParser{}, 500), + mdutil.Prioritized(&emojiParser{}, 1000), )) + if !r.emojiOnly { + m.Parser().AddOptions(parser.WithInlineParsers( + mdutil.Prioritized(&mentionParser{}, 1000), + mdutil.Prioritized(&hashtagParser{}, 1000), + )) + } m.Renderer().AddOptions(renderer.WithNodeRenderers( - mdutil.Prioritized(r, 500), + mdutil.Prioritized(r, 1000), )) } // renderMention and renderHashtag take a mention or a hashtag ast.Node and render it as HTML. func (r *customRenderer) renderMention(w mdutil.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { if !entering { - return ast.WalkContinue, nil + return ast.WalkSkipChildren, nil } n, ok := node.(*mention) // this function is only registered for kindMention @@ -185,18 +243,18 @@ func (r *customRenderer) renderMention(w mdutil.BufWriter, source []byte, node a } text := string(n.Segment.Value(source)) - html := r.f.ReplaceMentions(r.ctx, text, r.mentions) + html := r.replaceMention(text) // we don't have much recourse if this fails if _, err := w.WriteString(html); err != nil { - log.Errorf("error outputting markdown text: %s", err) + log.Errorf("error writing HTML: %s", err) } - return ast.WalkContinue, nil + return ast.WalkSkipChildren, nil } func (r *customRenderer) renderHashtag(w mdutil.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { if !entering { - return ast.WalkContinue, nil + return ast.WalkSkipChildren, nil } n, ok := node.(*hashtag) // this function is only registered for kindHashtag @@ -205,11 +263,50 @@ func (r *customRenderer) renderHashtag(w mdutil.BufWriter, source []byte, node a } text := string(n.Segment.Value(source)) - html := r.f.ReplaceTags(r.ctx, text, r.tags) + html := r.replaceHashtag(text) + _, err := w.WriteString(html) // we don't have much recourse if this fails - if _, err := w.WriteString(html); err != nil { - log.Errorf("error outputting markdown text: %s", err) + if err != nil { + log.Errorf("error writing HTML: %s", err) + } + return ast.WalkSkipChildren, nil +} + +// renderEmoji doesn't turn an emoji into HTML, but adds it to the metadata. +func (r *customRenderer) renderEmoji(w mdutil.BufWriter, source []byte, node ast.Node, entering bool) (ast.WalkStatus, error) { + if !entering { + return ast.WalkSkipChildren, nil + } + + n, ok := node.(*emoji) // this function is only registered for kindEmoji + if !ok { + log.Errorf("type assertion failed") + } + text := string(n.Segment.Value(source)) + shortcode := text[1 : len(text)-1] + + emoji, err := r.f.db.GetEmojiByShortcodeDomain(r.ctx, shortcode, "") + if err != nil { + if err != db.ErrNoEntries { + log.Errorf("error getting local emoji with shortcode %s: %s", shortcode, err) + } + } else if *emoji.VisibleInPicker && !*emoji.Disabled { + listed := false + for _, e := range r.result.Emojis { + if e.Shortcode == emoji.Shortcode { + listed = true + break + } + } + if !listed { + r.result.Emojis = append(r.result.Emojis, emoji) + } + } + + // we don't have much recourse if this fails + if _, err := w.WriteString(text); err != nil { + log.Errorf("error writing HTML: %s", err) } - return ast.WalkContinue, nil + return ast.WalkSkipChildren, nil } diff --git a/internal/text/goldmark_plaintext.go b/internal/text/goldmark_plaintext.go new file mode 100644 index 000000000..84916b1d1 --- /dev/null +++ b/internal/text/goldmark_plaintext.go @@ -0,0 +1,64 @@ +/* + GoToSocial + Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package text + +import ( + "github.com/yuin/goldmark/ast" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/text" +) + +// plaintextParser implements goldmark.parser.BlockParser +type plaintextParser struct { +} + +var defaultPlaintextParser = &plaintextParser{} + +func newPlaintextParser() parser.BlockParser { + return defaultPlaintextParser +} + +func (b *plaintextParser) Trigger() []byte { + return nil +} + +func (b *plaintextParser) Open(parent ast.Node, reader text.Reader, pc parser.Context) (ast.Node, parser.State) { + _, segment := reader.PeekLine() + node := ast.NewParagraph() + node.Lines().Append(segment) + reader.Advance(segment.Len() - 1) + return node, parser.NoChildren +} + +func (b *plaintextParser) Continue(node ast.Node, reader text.Reader, pc parser.Context) parser.State { + _, segment := reader.PeekLine() + node.Lines().Append(segment) + reader.Advance(segment.Len() - 1) + return parser.Continue | parser.NoChildren +} + +func (b *plaintextParser) Close(node ast.Node, reader text.Reader, pc parser.Context) {} + +func (b *plaintextParser) CanInterruptParagraph() bool { + return false +} + +func (b *plaintextParser) CanAcceptIndentedLine() bool { + return true +} diff --git a/internal/text/link.go b/internal/text/link.go deleted file mode 100644 index 2b2b45e73..000000000 --- a/internal/text/link.go +++ /dev/null @@ -1,86 +0,0 @@ -/* - GoToSocial - Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -package text - -import ( - "bytes" - "context" - "net/url" - "strings" - - "github.com/superseriousbusiness/gotosocial/internal/regexes" -) - -// FindLinks parses the given string looking for recognizable URLs (including scheme). -// It returns a list of those URLs, without changing the string, or an error if something goes wrong. -// If no URLs are found within the given string, an empty slice and nil will be returned. -func FindLinks(in string) []*url.URL { - var urls []*url.URL - - // bail already if we don't find anything - found := regexes.LinkScheme.FindAllString(in, -1) - if len(found) == 0 { - return nil - } - - urlmap := map[string]struct{}{} - - // for each string we find, we want to parse it into a URL if we can - // if we fail to parse it, just ignore this match and continue - for _, f := range found { - u, err := url.Parse(f) - if err != nil { - continue - } - - // Calculate string - ustr := u.String() - - if _, ok := urlmap[ustr]; !ok { - // Has not been encountered yet - urls = append(urls, u) - urlmap[ustr] = struct{}{} - } - } - - return urls -} - -// ReplaceLinks replaces all detected links in a piece of text with their HTML (href) equivalents. -// Note: because Go doesn't allow negative lookbehinds in regex, it's possible that an already-formatted -// href will end up double-formatted, if the text you pass here contains one or more hrefs already. -// To avoid this, you should sanitize any HTML out of text before you pass it into this function. -func (f *formatter) ReplaceLinks(ctx context.Context, in string) string { - return regexes.ReplaceAllStringFunc(regexes.LinkScheme, in, func(urlString string, buf *bytes.Buffer) string { - thisURL, err := url.Parse(urlString) - if err != nil { - return urlString // we can't parse it as a URL so don't replace it - } - // <a href="thisURL.String()" rel="noopener">urlString</a> - urlString = thisURL.String() - buf.WriteString(`<a href="`) - buf.WriteString(thisURL.String()) - buf.WriteString(`" rel="noopener">`) - urlString = strings.TrimPrefix(urlString, thisURL.Scheme) - urlString = strings.TrimPrefix(urlString, "://") - buf.WriteString(urlString) - buf.WriteString(`</a>`) - return buf.String() - }) -} diff --git a/internal/text/link_test.go b/internal/text/link_test.go deleted file mode 100644 index dfb4656b8..000000000 --- a/internal/text/link_test.go +++ /dev/null @@ -1,157 +0,0 @@ -/* - GoToSocial - Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. -*/ - -package text_test - -import ( - "context" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/suite" - "github.com/superseriousbusiness/gotosocial/internal/text" -) - -const text1 = ` -This is a text with some links in it. Here's link number one: https://example.org/link/to/something#fragment - -Here's link number two: http://test.example.org?q=bahhhhhhhhhhhh - -https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it - -really.cool.website <-- this one shouldn't be parsed as a link because it doesn't contain the scheme - -https://example.orghttps://google.com <-- this shouldn't work either, but it does?! OK -` - -const text2 = ` -this is one link: https://example.org - -this is the same link again: https://example.org - -these should be deduplicated -` - -const text3 = ` -here's a mailto link: mailto:whatever@test.org -` - -const text4 = ` -two similar links: - -https://example.org - -https://example.org/test -` - -const text5 = ` -what happens when we already have a link within an href? - -<a href="https://example.org">https://example.org</a> -` - -type LinkTestSuite struct { - TextStandardTestSuite -} - -func (suite *LinkTestSuite) TestParseSimple() { - f := suite.formatter.FromPlain(context.Background(), simple, nil, nil) - suite.Equal(simpleExpected, f) -} - -func (suite *LinkTestSuite) TestParseURLsFromText1() { - urls := text.FindLinks(text1) - - suite.Equal("https://example.org/link/to/something#fragment", urls[0].String()) - suite.Equal("http://test.example.org?q=bahhhhhhhhhhhh", urls[1].String()) - suite.Equal("https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it", urls[2].String()) - suite.Equal("https://example.orghttps://google.com", urls[3].String()) -} - -func (suite *LinkTestSuite) TestParseURLsFromText2() { - urls := text.FindLinks(text2) - - // assert length 1 because the found links will be deduplicated - assert.Len(suite.T(), urls, 1) -} - -func (suite *LinkTestSuite) TestParseURLsFromText3() { - urls := text.FindLinks(text3) - - // assert length 0 because `mailto:` isn't accepted - assert.Len(suite.T(), urls, 0) -} - -func (suite *LinkTestSuite) TestReplaceLinksFromText1() { - replaced := suite.formatter.ReplaceLinks(context.Background(), text1) - suite.Equal(` -This is a text with some links in it. Here's link number one: <a href="https://example.org/link/to/something#fragment" rel="noopener">example.org/link/to/something#fragment</a> - -Here's link number two: <a href="http://test.example.org?q=bahhhhhhhhhhhh" rel="noopener">test.example.org?q=bahhhhhhhhhhhh</a> - -<a href="https://another.link.example.org/with/a/pretty/long/path/at/the/end/of/it" rel="noopener">another.link.example.org/with/a/pretty/long/path/at/the/end/of/it</a> - -really.cool.website <-- this one shouldn't be parsed as a link because it doesn't contain the scheme - -<a href="https://example.orghttps://google.com" rel="noopener">example.orghttps://google.com</a> <-- this shouldn't work either, but it does?! OK -`, replaced) -} - -func (suite *LinkTestSuite) TestReplaceLinksFromText2() { - replaced := suite.formatter.ReplaceLinks(context.Background(), text2) - suite.Equal(` -this is one link: <a href="https://example.org" rel="noopener">example.org</a> - -this is the same link again: <a href="https://example.org" rel="noopener">example.org</a> - -these should be deduplicated -`, replaced) -} - -func (suite *LinkTestSuite) TestReplaceLinksFromText3() { - // we know mailto links won't be replaced with hrefs -- we only accept https and http - replaced := suite.formatter.ReplaceLinks(context.Background(), text3) - suite.Equal(` -here's a mailto link: mailto:whatever@test.org -`, replaced) -} - -func (suite *LinkTestSuite) TestReplaceLinksFromText4() { - replaced := suite.formatter.ReplaceLinks(context.Background(), text4) - suite.Equal(` -two similar links: - -<a href="https://example.org" rel="noopener">example.org</a> - -<a href="https://example.org/test" rel="noopener">example.org/test</a> -`, replaced) -} - -func (suite *LinkTestSuite) TestReplaceLinksFromText5() { - // we know this one doesn't work properly, which is why html should always be sanitized before being passed into the ReplaceLinks function - replaced := suite.formatter.ReplaceLinks(context.Background(), text5) - suite.Equal(` -what happens when we already have a link within an href? - -<a href="<a href="https://example.org" rel="noopener">example.org</a>"><a href="https://example.org" rel="noopener">example.org</a></a> -`, replaced) -} - -func TestLinkTestSuite(t *testing.T) { - suite.Run(t, new(LinkTestSuite)) -} diff --git a/internal/text/markdown.go b/internal/text/markdown.go index dbe86d110..232f0f723 100644 --- a/internal/text/markdown.go +++ b/internal/text/markdown.go @@ -21,32 +21,19 @@ package text import ( "bytes" "context" - "strings" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" "github.com/superseriousbusiness/gotosocial/internal/log" - "github.com/tdewolff/minify/v2" - minifyHtml "github.com/tdewolff/minify/v2/html" "github.com/yuin/goldmark" "github.com/yuin/goldmark/extension" "github.com/yuin/goldmark/renderer/html" ) -var ( - m *minify.M -) - -func (f *formatter) FromMarkdown(ctx context.Context, markdownText string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag, emojis []*gtsmodel.Emoji) string { - - // Temporarily replace all found emoji shortcodes in the markdown text with - // their ID so that they're not parsed as anything by the markdown parser - - // this fixes cases where emojis with some underscores in them are parsed as - // words with emphasis, eg `:_some_emoji:` becomes `:<em>some</em>emoji:` - // - // Since the IDs of the emojis are just uppercase letters + numbers they should - // be safe to pass through the markdown parser without unexpected effects. - for _, e := range emojis { - markdownText = strings.ReplaceAll(markdownText, ":"+e.Shortcode+":", ":"+e.ID+":") +func (f *formatter) FromMarkdown(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, markdownText string) *FormatResult { + result := &FormatResult{ + Mentions: []*gtsmodel.Mention{}, + Tags: []*gtsmodel.Tag{}, + Emojis: []*gtsmodel.Emoji{}, } // parse markdown text into html, using custom renderer to add hashtag/mention links @@ -57,7 +44,7 @@ func (f *formatter) FromMarkdown(ctx context.Context, markdownText string, menti html.WithUnsafe(), // allows raw HTML ), goldmark.WithExtensions( - &customRenderer{f, ctx, mentions, tags}, + &customRenderer{f, ctx, pmf, authorID, statusID, false, result}, extension.Linkify, // turns URLs into links extension.Strikethrough, ), @@ -66,30 +53,15 @@ func (f *formatter) FromMarkdown(ctx context.Context, markdownText string, menti var htmlContentBytes bytes.Buffer err := md.Convert([]byte(markdownText), &htmlContentBytes) if err != nil { - log.Errorf("error rendering markdown to HTML: %s", err) - } - htmlContent := htmlContentBytes.String() - - // Replace emoji IDs in the parsed html content with their shortcodes again - for _, e := range emojis { - htmlContent = strings.ReplaceAll(htmlContent, ":"+e.ID+":", ":"+e.Shortcode+":") + log.Errorf("error formatting markdown to HTML: %s", err) } + result.HTML = htmlContentBytes.String() - // clean anything dangerous out of the html - htmlContent = SanitizeHTML(htmlContent) + // clean anything dangerous out of the HTML + result.HTML = SanitizeHTML(result.HTML) - if m == nil { - m = minify.New() - m.Add("text/html", &minifyHtml.Minifier{ - KeepEndTags: true, - KeepQuotes: true, - }) - } - - minified, err := m.String("text/html", htmlContent) - if err != nil { - log.Errorf("error minifying markdown text: %s", err) - } + // shrink ray + result.HTML = minifyHTML(result.HTML) - return minified + return result } diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go index 384f4389c..80547f8b3 100644 --- a/internal/text/markdown_test.go +++ b/internal/text/markdown_test.go @@ -19,11 +19,9 @@ package text_test import ( - "context" "testing" "github.com/stretchr/testify/suite" - "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" ) var withCodeBlock = `# Title @@ -77,6 +75,16 @@ const ( mdWithStrikethroughExpected = "<p>I have <del>mdae</del> made an error</p>" mdWithLink = "Check out this code, i heard it was written by a sloth https://github.com/superseriousbusiness/gotosocial" mdWithLinkExpected = "<p>Check out this code, i heard it was written by a sloth <a href=\"https://github.com/superseriousbusiness/gotosocial\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">https://github.com/superseriousbusiness/gotosocial</a></p>" + mdObjectInCodeBlock = "@foss_satan@fossbros-anonymous.io this is how to mention a user\n```\n@the_mighty_zork hey bud! nice #ObjectOrientedProgramming software you've been writing lately! :rainbow:\n```\nhope that helps" + mdObjectInCodeBlockExpected = "<p><span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span> this is how to mention a user</p><pre><code>@the_mighty_zork hey bud! nice #ObjectOrientedProgramming software you've been writing lately! :rainbow:\n</code></pre><p>hope that helps</p>" + mdItalicHashtag = "_#hashtag_" + mdItalicHashtagExpected = "<p><em><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a></em></p>" + mdItalicHashtags = "_#hashtag #hashtag #hashtag_" + mdItalicHashtagsExpected = "<p><em><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a> <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a> <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hashtag</span></a></em></p>" + // BEWARE: sneaky unicode business going on. + // the first ö is one rune, the second ö is an o with a combining diacritic. + mdUnnormalizedHashtag = "#hellöthere #hellöthere" + mdUnnormalizedHashtagExpected = "<p><a href=\"http://localhost:8080/tags/hell%C3%B6there\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hellöthere</span></a> <a href=\"http://localhost:8080/tags/hell%C3%B6there\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>hellöthere</span></a></p>" ) type MarkdownTestSuite struct { @@ -84,101 +92,112 @@ type MarkdownTestSuite struct { } func (suite *MarkdownTestSuite) TestParseSimple() { - s := suite.formatter.FromMarkdown(context.Background(), simpleMarkdown, nil, nil, nil) - suite.Equal(simpleMarkdownExpected, s) + formatted := suite.FromMarkdown(simpleMarkdown) + suite.Equal(simpleMarkdownExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseWithCodeBlock() { - s := suite.formatter.FromMarkdown(context.Background(), withCodeBlock, nil, nil, nil) - suite.Equal(withCodeBlockExpected, s) + formatted := suite.FromMarkdown(withCodeBlock) + suite.Equal(withCodeBlockExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseWithInlineCode() { - s := suite.formatter.FromMarkdown(context.Background(), withInlineCode, nil, nil, nil) - suite.Equal(withInlineCodeExpected, s) + formatted := suite.FromMarkdown(withInlineCode) + suite.Equal(withInlineCodeExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseWithInlineCode2() { - s := suite.formatter.FromMarkdown(context.Background(), withInlineCode2, nil, nil, nil) - suite.Equal(withInlineCode2Expected, s) + formatted := suite.FromMarkdown(withInlineCode2) + suite.Equal(withInlineCode2Expected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseWithHashtag() { - foundTags := []*gtsmodel.Tag{ - suite.testTags["Hashtag"], - } - - s := suite.formatter.FromMarkdown(context.Background(), withHashtag, nil, foundTags, nil) - suite.Equal(withHashtagExpected, s) + formatted := suite.FromMarkdown(withHashtag) + suite.Equal(withHashtagExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseWithHTML() { - s := suite.formatter.FromMarkdown(context.Background(), mdWithHTML, nil, nil, nil) - suite.Equal(mdWithHTMLExpected, s) + formatted := suite.FromMarkdown(mdWithHTML) + suite.Equal(mdWithHTMLExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseWithCheekyHTML() { - s := suite.formatter.FromMarkdown(context.Background(), mdWithCheekyHTML, nil, nil, nil) - suite.Equal(mdWithCheekyHTMLExpected, s) + formatted := suite.FromMarkdown(mdWithCheekyHTML) + suite.Equal(mdWithCheekyHTMLExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseWithHashtagInitial() { - s := suite.formatter.FromMarkdown(context.Background(), mdWithHashtagInitial, nil, []*gtsmodel.Tag{ - suite.testTags["Hashtag"], - suite.testTags["welcome"], - }, nil) - suite.Equal(mdWithHashtagInitialExpected, s) + formatted := suite.FromMarkdown(mdWithHashtagInitial) + suite.Equal(mdWithHashtagInitialExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseCodeBlockWithNewlines() { - s := suite.formatter.FromMarkdown(context.Background(), mdCodeBlockWithNewlines, nil, nil, nil) - suite.Equal(mdCodeBlockWithNewlinesExpected, s) + formatted := suite.FromMarkdown(mdCodeBlockWithNewlines) + suite.Equal(mdCodeBlockWithNewlinesExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseWithFootnote() { - s := suite.formatter.FromMarkdown(context.Background(), mdWithFootnote, nil, nil, nil) - suite.Equal(mdWithFootnoteExpected, s) + formatted := suite.FromMarkdown(mdWithFootnote) + suite.Equal(mdWithFootnoteExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseWithBlockquote() { - s := suite.formatter.FromMarkdown(context.Background(), mdWithBlockQuote, nil, nil, nil) - suite.Equal(mdWithBlockQuoteExpected, s) + formatted := suite.FromMarkdown(mdWithBlockQuote) + suite.Equal(mdWithBlockQuoteExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseHashtagWithCodeBlock() { - s := suite.formatter.FromMarkdown(context.Background(), mdHashtagAndCodeBlock, nil, []*gtsmodel.Tag{ - suite.testTags["Hashtag"], - }, nil) - suite.Equal(mdHashtagAndCodeBlockExpected, s) + formatted := suite.FromMarkdown(mdHashtagAndCodeBlock) + suite.Equal(mdHashtagAndCodeBlockExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseMentionWithCodeBlock() { - s := suite.formatter.FromMarkdown(context.Background(), mdMentionAndCodeBlock, []*gtsmodel.Mention{ - suite.testMentions["local_user_2_mention_zork"], - }, nil, nil) - suite.Equal(mdMentionAndCodeBlockExpected, s) + formatted := suite.FromMarkdown(mdMentionAndCodeBlock) + suite.Equal(mdMentionAndCodeBlockExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseSmartypants() { - s := suite.formatter.FromMarkdown(context.Background(), mdWithSmartypants, []*gtsmodel.Mention{ - suite.testMentions["local_user_2_mention_zork"], - }, nil, nil) - suite.Equal(mdWithSmartypantsExpected, s) + formatted := suite.FromMarkdown(mdWithSmartypants) + suite.Equal(mdWithSmartypantsExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseAsciiHeart() { - s := suite.formatter.FromMarkdown(context.Background(), mdWithAsciiHeart, nil, nil, nil) - suite.Equal(mdWithAsciiHeartExpected, s) + formatted := suite.FromMarkdown(mdWithAsciiHeart) + suite.Equal(mdWithAsciiHeartExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseStrikethrough() { - s := suite.formatter.FromMarkdown(context.Background(), mdWithStrikethrough, nil, nil, nil) - suite.Equal(mdWithStrikethroughExpected, s) + formatted := suite.FromMarkdown(mdWithStrikethrough) + suite.Equal(mdWithStrikethroughExpected, formatted.HTML) } func (suite *MarkdownTestSuite) TestParseLink() { - s := suite.formatter.FromMarkdown(context.Background(), mdWithLink, nil, nil, nil) - suite.Equal(mdWithLinkExpected, s) + formatted := suite.FromMarkdown(mdWithLink) + suite.Equal(mdWithLinkExpected, formatted.HTML) +} + +func (suite *MarkdownTestSuite) TestParseObjectInCodeBlock() { + formatted := suite.FromMarkdown(mdObjectInCodeBlock) + suite.Equal(mdObjectInCodeBlockExpected, formatted.HTML) + suite.Len(formatted.Mentions, 1) + suite.Equal("@foss_satan@fossbros-anonymous.io", formatted.Mentions[0].NameString) + suite.Empty(formatted.Tags) + suite.Empty(formatted.Emojis) +} + +func (suite *MarkdownTestSuite) TestParseItalicHashtag() { + formatted := suite.FromMarkdown(mdItalicHashtag) + suite.Equal(mdItalicHashtagExpected, formatted.HTML) +} + +func (suite *MarkdownTestSuite) TestParseItalicHashtags() { + formatted := suite.FromMarkdown(mdItalicHashtags) + suite.Equal(mdItalicHashtagsExpected, formatted.HTML) +} + +func (suite *MarkdownTestSuite) TestParseUnnormalizedHashtag() { + formatted := suite.FromMarkdown(mdUnnormalizedHashtag) + suite.Equal(mdUnnormalizedHashtagExpected, formatted.HTML) } func TestMarkdownTestSuite(t *testing.T) { diff --git a/internal/text/minify.go b/internal/text/minify.go new file mode 100644 index 000000000..62562c7ca --- /dev/null +++ b/internal/text/minify.go @@ -0,0 +1,45 @@ +/* + GoToSocial + Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package text + +import ( + "github.com/superseriousbusiness/gotosocial/internal/log" + "github.com/tdewolff/minify/v2" + "github.com/tdewolff/minify/v2/html" +) + +var ( + m *minify.M +) + +func minifyHTML(content string) string { + if m == nil { + m = minify.New() + m.Add("text/html", &html.Minifier{ + KeepEndTags: true, + KeepQuotes: true, + }) + } + + minified, err := m.String("text/html", content) + if err != nil { + log.Errorf("error minifying HTML: %s", err) + } + return minified +} diff --git a/internal/text/plain.go b/internal/text/plain.go index a64a14f06..3549200c6 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -19,40 +19,56 @@ package text import ( + "bytes" "context" - "html" - "strings" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" + "github.com/superseriousbusiness/gotosocial/internal/log" + "github.com/yuin/goldmark" + "github.com/yuin/goldmark/extension" + "github.com/yuin/goldmark/parser" + "github.com/yuin/goldmark/renderer/html" + "github.com/yuin/goldmark/util" ) -// breakReplacer replaces new-lines with HTML breaks. -var breakReplacer = strings.NewReplacer( - "\r\n", "<br/>", - "\n", "<br/>", -) - -func (f *formatter) FromPlain(ctx context.Context, plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string { - // trim any crap - content := strings.TrimSpace(plain) - - // clean 'er up - content = html.EscapeString(content) - - // format links nicely - content = f.ReplaceLinks(ctx, content) +func (f *formatter) FromPlain(ctx context.Context, pmf gtsmodel.ParseMentionFunc, authorID string, statusID string, plain string) *FormatResult { + result := &FormatResult{ + Mentions: []*gtsmodel.Mention{}, + Tags: []*gtsmodel.Tag{}, + Emojis: []*gtsmodel.Emoji{}, + } - // format tags nicely - content = f.ReplaceTags(ctx, content, tags) + // parse markdown text into html, using custom renderer to add hashtag/mention links + md := goldmark.New( + goldmark.WithRendererOptions( + html.WithXHTML(), + html.WithHardWraps(), + ), + goldmark.WithParser( + parser.NewParser( + parser.WithBlockParsers( + util.Prioritized(newPlaintextParser(), 500), + ), + ), + ), + goldmark.WithExtensions( + &customRenderer{f, ctx, pmf, authorID, statusID, false, result}, + extension.Linkify, // turns URLs into links + ), + ) - // format mentions nicely - content = f.ReplaceMentions(ctx, content, mentions) + var htmlContentBytes bytes.Buffer + err := md.Convert([]byte(plain), &htmlContentBytes) + if err != nil { + log.Errorf("error formatting plaintext to HTML: %s", err) + } + result.HTML = htmlContentBytes.String() - // replace newlines with breaks - content = breakReplacer.Replace(content) + // clean anything dangerous out of the HTML + result.HTML = SanitizeHTML(result.HTML) - // wrap the whole thing in a pee - content = `<p>` + content + `</p>` + // shrink ray + result.HTML = minifyHTML(result.HTML) - return SanitizeHTML(content) + return result } diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go index 6b850cb45..3693ada9a 100644 --- a/internal/text/plain_test.go +++ b/internal/text/plain_test.go @@ -19,22 +19,21 @@ package text_test import ( - "context" "testing" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/suite" - "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" ) const ( - simple = "this is a plain and simple status" - simpleExpected = "<p>this is a plain and simple status</p>" - withTag = "here's a simple status that uses hashtag #welcome!" - withTagExpected = "<p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a>!</p>" - withHTML = "<div>blah this should just be html escaped blah</div>" - withHTMLExpected = "<p><div>blah this should just be html escaped blah</div></p>" - moreComplex = "Another test @foss_satan@fossbros-anonymous.io\n\n#Hashtag\n\nText" - moreComplexFull = "<p>Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span><br/><br/><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a><br/><br/>Text</p>" + simple = "this is a plain and simple status" + simpleExpected = "<p>this is a plain and simple status</p>" + withTag = "here's a simple status that uses hashtag #welcome!" + withTagExpected = "<p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a>!</p>" + withHTML = "<div>blah this should just be html escaped blah</div>" + withHTMLExpected = "<p><div>blah this should just be html escaped blah</div></p>" + moreComplex = "Another test @foss_satan@fossbros-anonymous.io\n\n#Hashtag\n\nText\n\n:rainbow:" + moreComplexExpected = "<p>Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span><br><br><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a><br><br>Text<br><br>:rainbow:</p>" ) type PlainTestSuite struct { @@ -42,35 +41,105 @@ type PlainTestSuite struct { } func (suite *PlainTestSuite) TestParseSimple() { - f := suite.formatter.FromPlain(context.Background(), simple, nil, nil) - suite.Equal(simpleExpected, f) + formatted := suite.FromPlain(simple) + suite.Equal(simpleExpected, formatted.HTML) } func (suite *PlainTestSuite) TestParseWithTag() { - foundTags := []*gtsmodel.Tag{ - suite.testTags["welcome"], - } - - f := suite.formatter.FromPlain(context.Background(), withTag, nil, foundTags) - suite.Equal(withTagExpected, f) + formatted := suite.FromPlain(withTag) + suite.Equal(withTagExpected, formatted.HTML) } func (suite *PlainTestSuite) TestParseWithHTML() { - f := suite.formatter.FromPlain(context.Background(), withHTML, nil, nil) - suite.Equal(withHTMLExpected, f) + formatted := suite.FromPlain(withHTML) + suite.Equal(withHTMLExpected, formatted.HTML) } func (suite *PlainTestSuite) TestParseMoreComplex() { - foundTags := []*gtsmodel.Tag{ - suite.testTags["Hashtag"], - } + formatted := suite.FromPlain(moreComplex) + suite.Equal(moreComplexExpected, formatted.HTML) +} + +func (suite *PlainTestSuite) TestLinkNoMention() { + statusText := `here's a link to a post by zork + +https://example.com/@the_mighty_zork/statuses/01FGVP55XMF2K6316MQRX6PFG1 + +that link shouldn't come out formatted as a mention!` + + menchies := suite.FromPlain(statusText).Mentions + suite.Empty(menchies) +} + +func (suite *PlainTestSuite) TestDeriveMentionsEmpty() { + statusText := `` + menchies := suite.FromPlain(statusText).Mentions + assert.Len(suite.T(), menchies, 0) +} + +func (suite *PlainTestSuite) TestDeriveHashtagsOK() { + statusText := `weeeeeeee #testing123 #also testing + +# testing this one shouldn't work + + #thisshouldwork #dupe #dupe!! #dupe + + here's a link with a fragment: https://example.org/whatever#ahhh + here's another link with a fragment: https://example.org/whatever/#ahhh - foundMentions := []*gtsmodel.Mention{ - suite.testMentions["zork_mention_foss_satan"], - } +(#ThisShouldAlsoWork) #this_should_be_split + +#111111 thisalsoshouldn'twork#### ## + +#alimentación, #saúde, #lävistää, #ö, #네 +#ThisOneIsThirtyOneCharactersLon... ...ng +#ThisOneIsThirteyCharactersLong +` + + tags := suite.FromPlain(statusText).Tags + assert.Len(suite.T(), tags, 13) + assert.Equal(suite.T(), "testing123", tags[0].Name) + assert.Equal(suite.T(), "also", tags[1].Name) + assert.Equal(suite.T(), "thisshouldwork", tags[2].Name) + assert.Equal(suite.T(), "dupe", tags[3].Name) + assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4].Name) + assert.Equal(suite.T(), "this", tags[5].Name) + assert.Equal(suite.T(), "111111", tags[6].Name) + assert.Equal(suite.T(), "alimentación", tags[7].Name) + assert.Equal(suite.T(), "saúde", tags[8].Name) + assert.Equal(suite.T(), "lävistää", tags[9].Name) + assert.Equal(suite.T(), "ö", tags[10].Name) + assert.Equal(suite.T(), "네", tags[11].Name) + assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[12].Name) + + statusText = `#올빼미 hej` + tags = suite.FromPlain(statusText).Tags + assert.Equal(suite.T(), "올빼미", tags[0].Name) +} + +func (suite *PlainTestSuite) TestDeriveMultiple() { + statusText := `Another test @foss_satan@fossbros-anonymous.io + + #Hashtag + + Text` + + f := suite.FromPlain(statusText) + + assert.Len(suite.T(), f.Mentions, 1) + assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", f.Mentions[0].NameString) + + assert.Len(suite.T(), f.Tags, 1) + assert.Equal(suite.T(), "Hashtag", f.Tags[0].Name) + + assert.Len(suite.T(), f.Emojis, 0) +} - f := suite.formatter.FromPlain(context.Background(), moreComplex, foundMentions, foundTags) - suite.Equal(moreComplexFull, f) +func (suite *PlainTestSuite) TestZalgoHashtag() { + statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?` + f := suite.FromPlain(statusText) + assert.Len(suite.T(), f.Tags, 1) + assert.Equal(suite.T(), "praying", f.Tags[0].Name) } func TestPlainTestSuite(t *testing.T) { diff --git a/internal/text/replace.go b/internal/text/replace.go new file mode 100644 index 000000000..5deab5d4d --- /dev/null +++ b/internal/text/replace.go @@ -0,0 +1,141 @@ +/* + GoToSocial + Copyright (C) 2021-2023 GoToSocial Authors admin@gotosocial.org + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +package text + +import ( + "errors" + "github.com/superseriousbusiness/gotosocial/internal/db" + "github.com/superseriousbusiness/gotosocial/internal/log" + "github.com/superseriousbusiness/gotosocial/internal/util" + "golang.org/x/text/unicode/norm" + "strings" +) + +const ( + maximumHashtagLength = 30 +) + +// given a mention or a hashtag string, the methods in this file will attempt to parse it, +// add it to the database, and render it as HTML. If any of these steps fails, the method +// will just return the original string and log an error. + +// replaceMention takes a string in the form @username@domain.com or @localusername +func (r *customRenderer) replaceMention(text string) string { + menchie, err := r.parseMention(r.ctx, text, r.accountID, r.statusID) + if err != nil { + log.Errorf("error parsing mention %s from status: %s", text, err) + return text + } + + if r.statusID != "" { + if err := r.f.db.Put(r.ctx, menchie); err != nil { + log.Errorf("error putting mention in db: %s", err) + return text + } + } + + // only append if it's not been listed yet + listed := false + for _, m := range r.result.Mentions { + if menchie.ID == m.ID { + listed = true + break + } + } + if !listed { + r.result.Mentions = append(r.result.Mentions, menchie) + } + + // make sure we have an account attached to this mention + if menchie.TargetAccount == nil { + a, err := r.f.db.GetAccountByID(r.ctx, menchie.TargetAccountID) + if err != nil { + log.Errorf("error getting account with id %s from the db: %s", menchie.TargetAccountID, err) + return text + } + menchie.TargetAccount = a + } + + // The mention's target is our target + targetAccount := menchie.TargetAccount + + var b strings.Builder + + // replace the mention with the formatted mention content + // <span class="h-card"><a href="targetAccount.URL" class="u-url mention">@<span>targetAccount.Username</span></a></span> + b.WriteString(`<span class="h-card"><a href="`) + b.WriteString(targetAccount.URL) + b.WriteString(`" class="u-url mention">@<span>`) + b.WriteString(targetAccount.Username) + b.WriteString(`</span></a></span>`) + return b.String() +} + +// replaceMention takes a string in the form #HashedTag, and will normalize it before +// adding it to the db and turning it into HTML. +func (r *customRenderer) replaceHashtag(text string) string { + // this normalization is specifically to avoid cases where visually-identical + // hashtags are stored with different unicode representations (e.g. with combining + // diacritics). It allows a tasteful number of combining diacritics to be used, + // as long as they can be combined with parent characters to form regular letter + // symbols. + normalized := norm.NFC.String(text[1:]) + + for i, r := range normalized { + if i >= maximumHashtagLength || !util.IsPermittedInHashtag(r) { + return text + } + } + + tag, err := r.f.db.TagStringToTag(r.ctx, normalized, r.accountID) + if err != nil { + log.Errorf("error generating hashtags from status: %s", err) + return text + } + + // only append if it's not been listed yet + listed := false + for _, t := range r.result.Tags { + if tag.ID == t.ID { + listed = true + break + } + } + if !listed { + err = r.f.db.Put(r.ctx, tag) + if err != nil { + if !errors.Is(err, db.ErrAlreadyExists) { + log.Errorf("error putting tags in db: %s", err) + return text + } + } + r.result.Tags = append(r.result.Tags, tag) + } + + var b strings.Builder + // replace the #tag with the formatted tag content + // `<a href="tag.URL" class="mention hashtag" rel="tag">#<span>tagAsEntered</span></a> + b.WriteString(`<a href="`) + b.WriteString(tag.URL) + b.WriteString(`" class="mention hashtag" rel="tag">#<span>`) + b.WriteString(normalized) + b.WriteString(`</span></a>`) + + return b.String() +} |