path: root/internal/text
author     kim <89579420+NyaaaWhatsUpDoc@users.noreply.github.com>  2022-05-07 16:55:27 +0100
committer  GitHub <noreply@github.com>                              2022-05-07 17:55:27 +0200
commit     26b74aefaf5d2a3cd26bd57652fe96a6a20ed034 (patch)
tree       db316febba8e0ada7a9360b059011dcc7ea138a3 /internal/text
parent     [performance] improved logrus output switching performance (#544) (diff)
download   gotosocial-26b74aefaf5d2a3cd26bd57652fe96a6a20ed034.tar.xz
[bugfix] Fix existing bio text showing as HTML (#531)
* fix existing bio text showing as HTML
  - updated replaced mentions to include instance
  - strips HTML from account source note in Verify handler
  - update text formatter to use buffers for string writes
* go away linter
* change buf reset location, change html mention tags
* reduce FindLinks code complexity
* fix HTML to text conversion
* Update internal/regexes/regexes.go
* use improved html2text lib with more options
* fix to produce actual plaintext from html
* fix span tags instead written as space
* performance improvements to regex replacements, fix link replace logic for un-html-ing in the future
* fix tag/mention replacements to use input string, fix link replace to not include scheme
* use matched input string for link replace href text
* remove unused code (to appease linter :sobs:)
* improve hashtagFinger regex to be more compliant
* update breakReplacer to include both unix and windows line endings
* add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts
* drop unnecessary code
* update text package tests to fix logic changes
* add raw note content testing to account update and account verify
* remove unused modules
* fix emoji regex
* fix replacement of hashtags
* update code comment

Signed-off-by: kim <grufwub@gmail.com>
Co-authored-by: Mina Galić <mina.galic@puppet.com>
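The hunks below call a regexes.ReplaceAllStringFunc helper from internal/regexes (touched by this PR but outside the 'internal/text' diffstat shown here), which hands each replacement callback a reusable bytes.Buffer instead of letting it build strings with fmt.Sprintf. A minimal sketch of what such a helper might look like, assuming a sync.Pool-backed buffer; the pooling detail is an assumption, not something this diff shows:

package regexes

import (
	"bytes"
	"regexp"
	"sync"
)

// bufPool recycles buffers between calls so replacements don't allocate a
// fresh buffer per match. (Illustrative assumption, not taken from this diff.)
var bufPool = sync.Pool{
	New: func() interface{} { return &bytes.Buffer{} },
}

// ReplaceAllStringFunc wraps regexp.Regexp.ReplaceAllStringFunc, passing the
// callback a pooled buffer it can build its replacement string in.
func ReplaceAllStringFunc(rgx *regexp.Regexp, src string, replace func(match string, buf *bytes.Buffer) string) string {
	buf := bufPool.Get().(*bytes.Buffer)
	defer bufPool.Put(buf)

	return rgx.ReplaceAllStringFunc(src, func(match string) string {
		buf.Reset() // start clean for every match ("change buf reset location" above)
		return replace(match, buf)
	})
}

Each callback in common.go and link.go below then writes its HTML fragment into buf and returns buf.String().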
Diffstat (limited to 'internal/text')
-rw-r--r--  internal/text/common.go      61
-rw-r--r--  internal/text/link.go        84
-rw-r--r--  internal/text/link_test.go   12
-rw-r--r--  internal/text/plain.go       11
-rw-r--r--  internal/text/plain_test.go   2
5 files changed, 73 insertions, 97 deletions
diff --git a/internal/text/common.go b/internal/text/common.go
index 4148ece15..12c0f1dfa 100644
--- a/internal/text/common.go
+++ b/internal/text/common.go
@@ -19,10 +19,11 @@
package text
import (
+ "bytes"
"context"
- "fmt"
"html"
"strings"
+ "unicode"
"github.com/sirupsen/logrus"
@@ -63,38 +64,40 @@ func postformat(in string) string {
}
func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string {
- return regexes.HashtagFinder.ReplaceAllStringFunc(in, func(match string) string {
+ return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string {
// we have a match
matchTrimmed := strings.TrimSpace(match)
- tagAsEntered := strings.Split(matchTrimmed, "#")[1]
+ tagAsEntered := matchTrimmed[1:]
// check through the tags to find what we're matching
for _, tag := range tags {
-
- if strings.EqualFold(matchTrimmed, fmt.Sprintf("#%s", tag.Name)) {
- // replace the #tag with the formatted tag content
- tagContent := fmt.Sprintf(`<a href="%s" class="mention hashtag" rel="tag">#<span>%s</span></a>`, tag.URL, tagAsEntered)
-
- // in case the match picked up any previous space or newlines (thanks to the regex), include them as well
- if strings.HasPrefix(match, " ") {
- tagContent = " " + tagContent
- } else if strings.HasPrefix(match, "\n") {
- tagContent = "\n" + tagContent
+ if strings.EqualFold(tagAsEntered, tag.Name) {
+ // Add any dropped space from match
+ if unicode.IsSpace(rune(match[0])) {
+ buf.WriteByte(match[0])
}
- // done
- return tagContent
+ // replace the #tag with the formatted tag content
+ // `<a href="tag.URL" class="mention hashtag" rel="tag">#<span>tagAsEntered</span></a>
+ buf.WriteString(`<a href="`)
+ buf.WriteString(tag.URL)
+ buf.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
+ buf.WriteString(tagAsEntered)
+ buf.WriteString(`</span></a>`)
+ return buf.String()
}
}
+
// the match wasn't in the list of tags for whatever reason, so just return the match as we found it so nothing changes
return match
})
}
func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string {
- return regexes.MentionFinder.ReplaceAllStringFunc(in, func(match string) string {
- // we have a match
+ return regexes.ReplaceAllStringFunc(regexes.MentionFinder, in, func(match string, buf *bytes.Buffer) string {
+ // we have a match, trim any spaces
matchTrimmed := strings.TrimSpace(match)
+
// check through mentions to find what we're matching
for _, menchie := range mentions {
if strings.EqualFold(matchTrimmed, menchie.NameString) {
@@ -107,22 +110,26 @@ func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*
}
menchie.TargetAccount = a
}
- targetAccount := menchie.TargetAccount
- // replace the mention with the formatted mention content
- mentionContent := fmt.Sprintf(`<span class="h-card"><a href="%s" class="u-url mention">@<span>%s</span></a></span>`, targetAccount.URL, targetAccount.Username)
+ // The mention's target is our target
+ targetAccount := menchie.TargetAccount
- // in case the match picked up any previous space or newlines (thanks to the regex), include them as well
- if strings.HasPrefix(match, " ") {
- mentionContent = " " + mentionContent
- } else if strings.HasPrefix(match, "\n") {
- mentionContent = "\n" + mentionContent
+ // Add any dropped space from match
+ if unicode.IsSpace(rune(match[0])) {
+ buf.WriteByte(match[0])
}
- // done
- return mentionContent
+ // replace the mention with the formatted mention content
+ // <span class="h-card"><a href="targetAccount.URL" class="u-url mention">@<span>targetAccount.Username</span></a></span>
+ buf.WriteString(`<span class="h-card"><a href="`)
+ buf.WriteString(targetAccount.URL)
+ buf.WriteString(`" class="u-url mention">@<span>`)
+ buf.WriteString(targetAccount.Username)
+ buf.WriteString(`</span></a></span>`)
+ return buf.String()
}
}
+
// the match wasn't in the list of mentions for whatever reason, so just return the match as we found it so nothing changes
return match
})
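As a standalone illustration of the buffer writes ReplaceTags now performs: the helper name and example tag URL below are made up for the sketch, only the WriteString sequence comes from the hunk above.

package main

import (
	"bytes"
	"fmt"
)

// buildTagAnchor repeats the exact write sequence used in ReplaceTags above.
func buildTagAnchor(buf *bytes.Buffer, tagURL, tagAsEntered string) string {
	buf.Reset()
	buf.WriteString(`<a href="`)
	buf.WriteString(tagURL)
	buf.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
	buf.WriteString(tagAsEntered)
	buf.WriteString(`</span></a>`)
	return buf.String()
}

func main() {
	var buf bytes.Buffer
	fmt.Println(buildTagAnchor(&buf, "https://example.org/tags/welcome", "welcome"))
	// <a href="https://example.org/tags/welcome" class="mention hashtag" rel="tag">#<span>welcome</span></a>
}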
diff --git a/internal/text/link.go b/internal/text/link.go
index d8d83df6d..f72c451f2 100644
--- a/internal/text/link.go
+++ b/internal/text/link.go
@@ -19,34 +19,28 @@
package text
import (
+ "bytes"
"context"
- "fmt"
"net/url"
+ "strings"
- "mvdan.cc/xurls/v2"
+ "github.com/superseriousbusiness/gotosocial/internal/regexes"
)
-// schemes is the regex for schemes we accept when looking for links.
-// Basically, we accept https or http.
-var schemes = `(((http|https))://)`
-
// FindLinks parses the given string looking for recognizable URLs (including scheme).
// It returns a list of those URLs, without changing the string, or an error if something goes wrong.
// If no URLs are found within the given string, an empty slice and nil will be returned.
-func FindLinks(in string) ([]*url.URL, error) {
- rxStrict, err := xurls.StrictMatchingScheme(schemes)
- if err != nil {
- return nil, err
- }
-
- urls := []*url.URL{}
+func FindLinks(in string) []*url.URL {
+ var urls []*url.URL
// bail already if we don't find anything
- found := rxStrict.FindAllString(in, -1)
+ found := regexes.LinkScheme.FindAllString(in, -1)
if len(found) == 0 {
- return urls, nil
+ return nil
}
+ urlmap := map[string]struct{}{}
+
// for each string we find, we want to parse it into a URL if we can
// if we fail to parse it, just ignore this match and continue
for _, f := range found {
@@ -54,29 +48,18 @@ func FindLinks(in string) ([]*url.URL, error) {
if err != nil {
continue
}
- urls = append(urls, u)
- }
- // deduplicate the URLs
- urlsDeduped := []*url.URL{}
+ // Calculate string
+ ustr := u.String()
- for _, u := range urls {
- if !contains(urlsDeduped, u) {
- urlsDeduped = append(urlsDeduped, u)
+ if _, ok := urlmap[ustr]; !ok {
+ // Has not been encountered yet
+ urls = append(urls, u)
+ urlmap[ustr] = struct{}{}
}
}
- return urlsDeduped, nil
-}
-
-// contains checks if the given url is already within a slice of URLs
-func contains(urls []*url.URL, url *url.URL) bool {
- for _, u := range urls {
- if u.String() == url.String() {
- return true
- }
- }
- return false
+ return urls
}
@@ -84,33 +67,20 @@ func contains(urls []*url.URL, url *url.URL) bool {
// ReplaceLinks replaces all detected links in a piece of text with their HTML (href) equivalents.
// href will end up double-formatted, if the text you pass here contains one or more hrefs already.
// To avoid this, you should sanitize any HTML out of text before you pass it into this function.
func (f *formatter) ReplaceLinks(ctx context.Context, in string) string {
- rxStrict, err := xurls.StrictMatchingScheme(schemes)
- if err != nil {
- panic(err)
- }
-
- replaced := rxStrict.ReplaceAllStringFunc(in, func(urlString string) string {
+ return regexes.ReplaceAllStringFunc(regexes.LinkScheme, in, func(urlString string, buf *bytes.Buffer) string {
thisURL, err := url.Parse(urlString)
if err != nil {
return urlString // we can't parse it as a URL so don't replace it
}
-
- shortString := thisURL.Hostname()
-
- if thisURL.Path != "" {
- shortString += thisURL.Path
- }
-
- if thisURL.Fragment != "" {
- shortString = shortString + "#" + thisURL.Fragment
- }
-
- if thisURL.RawQuery != "" {
- shortString = shortString + "?" + thisURL.RawQuery
- }
-
- replacement := fmt.Sprintf(`<a href="%s" rel="noopener">%s</a>`, urlString, shortString)
- return replacement
+ // <a href="thisURL.String()" rel="noopener">urlString</a>
+ urlString = thisURL.String()
+ buf.WriteString(`<a href="`)
+ buf.WriteString(thisURL.String())
+ buf.WriteString(`" rel="noopener">`)
+ urlString = strings.TrimPrefix(urlString, thisURL.Scheme)
+ urlString = strings.TrimPrefix(urlString, "://")
+ buf.WriteString(urlString)
+ buf.WriteString(`</a>`)
+ return buf.String()
})
- return replaced
}
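FindLinks now deduplicates by the parsed URL's string form while keeping first-seen order, and it no longer returns an error. A hedged usage sketch; the input text is invented, the signature matches the new FindLinks above:

package main

import (
	"fmt"

	"github.com/superseriousbusiness/gotosocial/internal/text"
)

func main() {
	// The same link appears twice; only one *url.URL should come back.
	urls := text.FindLinks("see https://example.org/post and https://example.org/post again")
	for _, u := range urls {
		fmt.Println(u.String()) // https://example.org/post, printed once
	}
}

ReplaceLinks likewise now uses the matched string minus its scheme as the visible link text, which is what the updated expectation in link_test.go below checks.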
diff --git a/internal/text/link_test.go b/internal/text/link_test.go
index e524315e7..24484e02d 100644
--- a/internal/text/link_test.go
+++ b/internal/text/link_test.go
@@ -75,9 +75,7 @@ func (suite *LinkTestSuite) TestParseSimple() {
}
func (suite *LinkTestSuite) TestParseURLsFromText1() {
- urls, err := text.FindLinks(text1)
-
- assert.NoError(suite.T(), err)
+ urls := text.FindLinks(text1)
assert.Equal(suite.T(), "https://example.org/link/to/something#fragment", urls[0].String())
assert.Equal(suite.T(), "http://test.example.org?q=bahhhhhhhhhhhh", urls[1].String())
@@ -86,16 +84,14 @@ func (suite *LinkTestSuite) TestParseURLsFromText1() {
}
func (suite *LinkTestSuite) TestParseURLsFromText2() {
- urls, err := text.FindLinks(text2)
- assert.NoError(suite.T(), err)
+ urls := text.FindLinks(text2)
// assert length 1 because the found links will be deduplicated
assert.Len(suite.T(), urls, 1)
}
func (suite *LinkTestSuite) TestParseURLsFromText3() {
- urls, err := text.FindLinks(text3)
- assert.NoError(suite.T(), err)
+ urls := text.FindLinks(text3)
// assert length 0 because `mailto:` isn't accepted
assert.Len(suite.T(), urls, 0)
@@ -112,7 +108,7 @@ Here's link number two: <a href="http://test.example.org?q=bahhhhhhhhhhhh" rel="
really.cool.website <-- this one shouldn't be parsed as a link because it doesn't contain the scheme
-<a href="https://example.orghttps://google.com" rel="noopener">example.orghttps//google.com</a> <-- this shouldn't work either, but it does?! OK
+<a href="https://example.orghttps://google.com" rel="noopener">example.orghttps://google.com</a> <-- this shouldn't work either, but it does?! OK
`, replaced)
}
diff --git a/internal/text/plain.go b/internal/text/plain.go
index 453f4dd31..4ef3b3715 100644
--- a/internal/text/plain.go
+++ b/internal/text/plain.go
@@ -20,12 +20,17 @@ package text
import (
"context"
- "fmt"
"strings"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
)
+// breakReplacer replaces new-lines with HTML breaks.
+var breakReplacer = strings.NewReplacer(
+ "\r\n", "<br/>",
+ "\n", "<br/>",
+)
+
func (f *formatter) FromPlain(ctx context.Context, plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string {
content := preformat(plain)
@@ -42,10 +47,10 @@ func (f *formatter) FromPlain(ctx context.Context, plain string, mentions []*gts
content = f.ReplaceMentions(ctx, content, mentions)
// replace newlines with breaks
- content = strings.ReplaceAll(content, "\n", "<br />")
+ content = breakReplacer.Replace(content)
// wrap the whole thing in a pee
- content = fmt.Sprintf(`<p>%s</p>`, content)
+ content = `<p>` + content + `</p>`
return postformat(content)
}
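The new breakReplacer converts both Windows (\r\n) and Unix (\n) line endings to <br/> in a single pass, so CRLF input no longer leaves stray carriage returns behind. A small self-contained illustration using the same replacer pairs as the plain.go hunk above:

package main

import (
	"fmt"
	"strings"
)

// Same pairs as the breakReplacer added in plain.go above.
var breakReplacer = strings.NewReplacer(
	"\r\n", "<br/>",
	"\n", "<br/>",
)

func main() {
	fmt.Println(breakReplacer.Replace("line one\r\nline two\nline three"))
	// Output: line one<br/>line two<br/>line three
}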
diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go
index d8b7c17da..2b7b50d5e 100644
--- a/internal/text/plain_test.go
+++ b/internal/text/plain_test.go
@@ -53,7 +53,6 @@ func (suite *PlainTestSuite) TestParseSimple() {
}
func (suite *PlainTestSuite) TestParseWithTag() {
-
foundTags := []*gtsmodel.Tag{
suite.testTags["welcome"],
}
@@ -63,7 +62,6 @@ func (suite *PlainTestSuite) TestParseWithTag() {
}
func (suite *PlainTestSuite) TestParseMoreComplex() {
-
foundTags := []*gtsmodel.Tag{
suite.testTags["Hashtag"],
}