From 26b74aefaf5d2a3cd26bd57652fe96a6a20ed034 Mon Sep 17 00:00:00 2001 From: kim <89579420+NyaaaWhatsUpDoc@users.noreply.github.com> Date: Sat, 7 May 2022 16:55:27 +0100 Subject: [bugfix] Fix existing bio text showing as HTML (#531) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix existing bio text showing as HTML - updated replaced mentions to include instance - strips HTML from account source note in Verify handler - update text formatter to use buffers for string writes Signed-off-by: kim * go away linter Signed-off-by: kim * change buf reset location, change html mention tags Signed-off-by: kim * reduce FindLinks code complexity Signed-off-by: kim * fix HTML to text conversion Signed-off-by: kim * Update internal/regexes/regexes.go Co-authored-by: Mina Galić * use improved html2text lib with more options Signed-off-by: kim * fix to produce actual plaintext from html Signed-off-by: kim * fix span tags instead written as space Signed-off-by: kim * performance improvements to regex replacements, fix link replace logic for un-html-ing in the future Signed-off-by: kim * fix tag/mention replacements to use input string, fix link replace to not include scheme Signed-off-by: kim * use matched input string for link replace href text Signed-off-by: kim * remove unused code (to appease linter :sobs:) Signed-off-by: kim * improve hashtagFinger regex to be more compliant Signed-off-by: kim * update breakReplacer to include both unix and windows line endings Signed-off-by: kim * add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts Signed-off-by: kim * drop unnecessary code Signed-off-by: kim * update text package tests to fix logic changes Signed-off-by: kim * add raw note content testing to account update and account verify Signed-off-by: kim * remove unused modules Signed-off-by: kim * fix emoji regex Signed-off-by: kim * fix replacement of hashtags Signed-off-by: kim * update code comment Signed-off-by: kim Co-authored-by: Mina Galić --- internal/text/common.go | 61 +++++++++++++++++--------------- internal/text/link.go | 84 +++++++++++++++------------------------------ internal/text/link_test.go | 12 +++---- internal/text/plain.go | 11 ++++-- internal/text/plain_test.go | 2 -- 5 files changed, 73 insertions(+), 97 deletions(-) (limited to 'internal/text') diff --git a/internal/text/common.go b/internal/text/common.go index 4148ece15..12c0f1dfa 100644 --- a/internal/text/common.go +++ b/internal/text/common.go @@ -19,10 +19,11 @@ package text import ( + "bytes" "context" - "fmt" "html" "strings" + "unicode" "github.com/sirupsen/logrus" @@ -63,38 +64,40 @@ func postformat(in string) string { } func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string { - return regexes.HashtagFinder.ReplaceAllStringFunc(in, func(match string) string { + return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string { // we have a match matchTrimmed := strings.TrimSpace(match) - tagAsEntered := strings.Split(matchTrimmed, "#")[1] + tagAsEntered := matchTrimmed[1:] // check through the tags to find what we're matching for _, tag := range tags { - - if strings.EqualFold(matchTrimmed, fmt.Sprintf("#%s", tag.Name)) { - // replace the #tag with the formatted tag content - tagContent := fmt.Sprintf(``, tag.URL, tagAsEntered) - - // in case the match picked up any previous space or newlines (thanks to the regex), include them as well - if strings.HasPrefix(match, " ") { - tagContent = " " + tagContent - } else if strings.HasPrefix(match, "\n") { - tagContent = "\n" + tagContent + if strings.EqualFold(tagAsEntered, tag.Name) { + // Add any dropped space from match + if unicode.IsSpace(rune(match[0])) { + buf.WriteByte(match[0]) } - // done - return tagContent + // replace the #tag with the formatted tag content + // ` + buf.WriteString(``) + return buf.String() } } + // the match wasn't in the list of tags for whatever reason, so just return the match as we found it so nothing changes return match }) } func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string { - return regexes.MentionFinder.ReplaceAllStringFunc(in, func(match string) string { - // we have a match + return regexes.ReplaceAllStringFunc(regexes.MentionFinder, in, func(match string, buf *bytes.Buffer) string { + // we have a match, trim any spaces matchTrimmed := strings.TrimSpace(match) + // check through mentions to find what we're matching for _, menchie := range mentions { if strings.EqualFold(matchTrimmed, menchie.NameString) { @@ -107,22 +110,26 @@ func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []* } menchie.TargetAccount = a } - targetAccount := menchie.TargetAccount - // replace the mention with the formatted mention content - mentionContent := fmt.Sprintf(`@%s`, targetAccount.URL, targetAccount.Username) + // The mention's target is our target + targetAccount := menchie.TargetAccount - // in case the match picked up any previous space or newlines (thanks to the regex), include them as well - if strings.HasPrefix(match, " ") { - mentionContent = " " + mentionContent - } else if strings.HasPrefix(match, "\n") { - mentionContent = "\n" + mentionContent + // Add any dropped space from match + if unicode.IsSpace(rune(match[0])) { + buf.WriteByte(match[0]) } - // done - return mentionContent + // replace the mention with the formatted mention content + // @targetAccount.Username + buf.WriteString(`@`) + buf.WriteString(targetAccount.Username) + buf.WriteString(``) + return buf.String() } } + // the match wasn't in the list of mentions for whatever reason, so just return the match as we found it so nothing changes return match }) diff --git a/internal/text/link.go b/internal/text/link.go index d8d83df6d..f72c451f2 100644 --- a/internal/text/link.go +++ b/internal/text/link.go @@ -19,34 +19,28 @@ package text import ( + "bytes" "context" - "fmt" "net/url" + "strings" - "mvdan.cc/xurls/v2" + "github.com/superseriousbusiness/gotosocial/internal/regexes" ) -// schemes is the regex for schemes we accept when looking for links. -// Basically, we accept https or http. -var schemes = `(((http|https))://)` - // FindLinks parses the given string looking for recognizable URLs (including scheme). // It returns a list of those URLs, without changing the string, or an error if something goes wrong. // If no URLs are found within the given string, an empty slice and nil will be returned. -func FindLinks(in string) ([]*url.URL, error) { - rxStrict, err := xurls.StrictMatchingScheme(schemes) - if err != nil { - return nil, err - } - - urls := []*url.URL{} +func FindLinks(in string) []*url.URL { + var urls []*url.URL // bail already if we don't find anything - found := rxStrict.FindAllString(in, -1) + found := regexes.LinkScheme.FindAllString(in, -1) if len(found) == 0 { - return urls, nil + return nil } + urlmap := map[string]struct{}{} + // for each string we find, we want to parse it into a URL if we can // if we fail to parse it, just ignore this match and continue for _, f := range found { @@ -54,29 +48,18 @@ func FindLinks(in string) ([]*url.URL, error) { if err != nil { continue } - urls = append(urls, u) - } - // deduplicate the URLs - urlsDeduped := []*url.URL{} + // Calculate string + ustr := u.String() - for _, u := range urls { - if !contains(urlsDeduped, u) { - urlsDeduped = append(urlsDeduped, u) + if _, ok := urlmap[ustr]; !ok { + // Has not been encountered yet + urls = append(urls, u) + urlmap[ustr] = struct{}{} } } - return urlsDeduped, nil -} - -// contains checks if the given url is already within a slice of URLs -func contains(urls []*url.URL, url *url.URL) bool { - for _, u := range urls { - if u.String() == url.String() { - return true - } - } - return false + return urls } // ReplaceLinks replaces all detected links in a piece of text with their HTML (href) equivalents. @@ -84,33 +67,20 @@ func contains(urls []*url.URL, url *url.URL) bool { // href will end up double-formatted, if the text you pass here contains one or more hrefs already. // To avoid this, you should sanitize any HTML out of text before you pass it into this function. func (f *formatter) ReplaceLinks(ctx context.Context, in string) string { - rxStrict, err := xurls.StrictMatchingScheme(schemes) - if err != nil { - panic(err) - } - - replaced := rxStrict.ReplaceAllStringFunc(in, func(urlString string) string { + return regexes.ReplaceAllStringFunc(regexes.LinkScheme, in, func(urlString string, buf *bytes.Buffer) string { thisURL, err := url.Parse(urlString) if err != nil { return urlString // we can't parse it as a URL so don't replace it } - - shortString := thisURL.Hostname() - - if thisURL.Path != "" { - shortString += thisURL.Path - } - - if thisURL.Fragment != "" { - shortString = shortString + "#" + thisURL.Fragment - } - - if thisURL.RawQuery != "" { - shortString = shortString + "?" + thisURL.RawQuery - } - - replacement := fmt.Sprintf(`%s`, urlString, shortString) - return replacement + // urlString + urlString = thisURL.String() + buf.WriteString(``) + urlString = strings.TrimPrefix(urlString, thisURL.Scheme) + urlString = strings.TrimPrefix(urlString, "://") + buf.WriteString(urlString) + buf.WriteString(``) + return buf.String() }) - return replaced } diff --git a/internal/text/link_test.go b/internal/text/link_test.go index e524315e7..24484e02d 100644 --- a/internal/text/link_test.go +++ b/internal/text/link_test.go @@ -75,9 +75,7 @@ func (suite *LinkTestSuite) TestParseSimple() { } func (suite *LinkTestSuite) TestParseURLsFromText1() { - urls, err := text.FindLinks(text1) - - assert.NoError(suite.T(), err) + urls := text.FindLinks(text1) assert.Equal(suite.T(), "https://example.org/link/to/something#fragment", urls[0].String()) assert.Equal(suite.T(), "http://test.example.org?q=bahhhhhhhhhhhh", urls[1].String()) @@ -86,16 +84,14 @@ func (suite *LinkTestSuite) TestParseURLsFromText1() { } func (suite *LinkTestSuite) TestParseURLsFromText2() { - urls, err := text.FindLinks(text2) - assert.NoError(suite.T(), err) + urls := text.FindLinks(text2) // assert length 1 because the found links will be deduplicated assert.Len(suite.T(), urls, 1) } func (suite *LinkTestSuite) TestParseURLsFromText3() { - urls, err := text.FindLinks(text3) - assert.NoError(suite.T(), err) + urls := text.FindLinks(text3) // assert length 0 because `mailto:` isn't accepted assert.Len(suite.T(), urls, 0) @@ -112,7 +108,7 @@ Here's link number two: example.orghttps//google.com <-- this shouldn't work either, but it does?! OK +example.orghttps://google.com <-- this shouldn't work either, but it does?! OK `, replaced) } diff --git a/internal/text/plain.go b/internal/text/plain.go index 453f4dd31..4ef3b3715 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -20,12 +20,17 @@ package text import ( "context" - "fmt" "strings" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" ) +// breakReplacer replaces new-lines with HTML breaks. +var breakReplacer = strings.NewReplacer( + "\r\n", "
", + "\n", "
", +) + func (f *formatter) FromPlain(ctx context.Context, plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string { content := preformat(plain) @@ -42,10 +47,10 @@ func (f *formatter) FromPlain(ctx context.Context, plain string, mentions []*gts content = f.ReplaceMentions(ctx, content, mentions) // replace newlines with breaks - content = strings.ReplaceAll(content, "\n", "
") + content = breakReplacer.Replace(content) // wrap the whole thing in a pee - content = fmt.Sprintf(`

%s

`, content) + content = `

` + content + `

` return postformat(content) } diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go index d8b7c17da..2b7b50d5e 100644 --- a/internal/text/plain_test.go +++ b/internal/text/plain_test.go @@ -53,7 +53,6 @@ func (suite *PlainTestSuite) TestParseSimple() { } func (suite *PlainTestSuite) TestParseWithTag() { - foundTags := []*gtsmodel.Tag{ suite.testTags["welcome"], } @@ -63,7 +62,6 @@ func (suite *PlainTestSuite) TestParseWithTag() { } func (suite *PlainTestSuite) TestParseMoreComplex() { - foundTags := []*gtsmodel.Tag{ suite.testTags["Hashtag"], } -- cgit v1.2.3