From 26b74aefaf5d2a3cd26bd57652fe96a6a20ed034 Mon Sep 17 00:00:00 2001 From: kim <89579420+NyaaaWhatsUpDoc@users.noreply.github.com> Date: Sat, 7 May 2022 16:55:27 +0100 Subject: [bugfix] Fix existing bio text showing as HTML (#531) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix existing bio text showing as HTML - updated replaced mentions to include instance - strips HTML from account source note in Verify handler - update text formatter to use buffers for string writes Signed-off-by: kim * go away linter Signed-off-by: kim * change buf reset location, change html mention tags Signed-off-by: kim * reduce FindLinks code complexity Signed-off-by: kim * fix HTML to text conversion Signed-off-by: kim * Update internal/regexes/regexes.go Co-authored-by: Mina Galić * use improved html2text lib with more options Signed-off-by: kim * fix to produce actual plaintext from html Signed-off-by: kim * fix span tags instead written as space Signed-off-by: kim * performance improvements to regex replacements, fix link replace logic for un-html-ing in the future Signed-off-by: kim * fix tag/mention replacements to use input string, fix link replace to not include scheme Signed-off-by: kim * use matched input string for link replace href text Signed-off-by: kim * remove unused code (to appease linter :sobs:) Signed-off-by: kim * improve hashtagFinder regex to be more compliant Signed-off-by: kim * update breakReplacer to include both unix and windows line endings Signed-off-by: kim * add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts Signed-off-by: kim * drop unnecessary code Signed-off-by: kim * update text package tests to fix logic changes Signed-off-by: kim * add raw note content testing to account update and account verify Signed-off-by: kim * remove unused modules Signed-off-by: kim * fix emoji regex Signed-off-by: kim * fix replacement of hashtags Signed-off-by: kim 
* update code comment Signed-off-by: kim Co-authored-by: Mina Galić --- internal/regexes/regexes.go | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'internal/regexes') diff --git a/internal/regexes/regexes.go b/internal/regexes/regexes.go index 9302b544d..f05f9b391 100644 --- a/internal/regexes/regexes.go +++ b/internal/regexes/regexes.go @@ -19,8 +19,12 @@ package regexes import ( + "bytes" "fmt" "regexp" + "sync" + + "mvdan.cc/xurls/v2" ) const ( @@ -47,6 +51,16 @@ const ( ) var ( + schemes = `(http|https)://` + // LinkScheme captures http/https schemes in URLs. + LinkScheme = func() *regexp.Regexp { + rgx, err := xurls.StrictMatchingScheme(schemes) + if err != nil { + panic(err) + } + return rgx + }() + mentionName = `^@(\w+)(?:@([a-zA-Z0-9_\-\.:]+))?$` // MentionName captures the username and domain part from a mention string // such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols) @@ -58,7 +72,7 @@ var ( MentionFinder = regexp.MustCompile(mentionFinder) // hashtag regex can be played with here: https://regex101.com/r/bPxeca/1 - hashtagFinder = fmt.Sprintf(`(?:^|\n|\s)(#[a-zA-Z0-9]{1,%d})(?:\b)`, maximumHashtagLength) + hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[a-zA-Z0-9]{1,%d})(?:#|\b)`, maximumHashtagLength) // HashtagFinder finds possible hashtags in a string. // It returns just the string part of the hashtag, not the # symbol. HashtagFinder = regexp.MustCompile(hashtagFinder) @@ -68,7 +82,7 @@ var ( EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode)) // emoji regex can be played with here: https://regex101.com/r/478XGM/1 - emojiFinderString = fmt.Sprintf(`(?:\B)?:(%s):(?:\B)?`, emojiShortcode) + emojiFinderString = fmt.Sprintf(`(?:\b)?:(%s):(?:\b)?`, emojiShortcode) // EmojiFinder extracts emoji strings from a piece of text. 
EmojiFinder = regexp.MustCompile(emojiFinderString) @@ -134,3 +148,21 @@ var ( // from eg /users/example_username/blocks/01F7XT5JZW1WMVSW1KADS8PVDH BlockPath = regexp.MustCompile(blockPath) ) + +// bufpool is a memory pool of byte buffers for use in our regex utility functions. +var bufpool = sync.Pool{ + New: func() any { + buf := bytes.NewBuffer(make([]byte, 0, 512)) + return buf + }, +} + +// ReplaceAllStringFunc will call through to .ReplaceAllStringFunc in the provided regex, but provides you with a clean byte buffer for optimized string writes. +func ReplaceAllStringFunc(rgx *regexp.Regexp, src string, repl func(match string, buf *bytes.Buffer) string) string { + buf := bufpool.Get().(*bytes.Buffer) //nolint + defer bufpool.Put(buf) + return rgx.ReplaceAllStringFunc(src, func(match string) string { + buf.Reset() // reset use + return repl(match, buf) + }) +} -- cgit v1.2.3