diff options
author | 2022-05-07 16:55:27 +0100 | |
---|---|---|
committer | 2022-05-07 17:55:27 +0200 | |
commit | 26b74aefaf5d2a3cd26bd57652fe96a6a20ed034 (patch) | |
tree | db316febba8e0ada7a9360b059011dcc7ea138a3 /internal/regexes | |
parent | [performance] improved logrus output switching performance (#544) (diff) | |
download | gotosocial-26b74aefaf5d2a3cd26bd57652fe96a6a20ed034.tar.xz |
[bugfix] Fix existing bio text showing as HTML (#531)
* fix existing bio text showing as HTML
- updated replaced mentions to include instance
- strips HTML from account source note in Verify handler
- update text formatter to use buffers for string writes
Signed-off-by: kim <grufwub@gmail.com>
* go away linter
Signed-off-by: kim <grufwub@gmail.com>
* change buf reset location, change html mention tags
Signed-off-by: kim <grufwub@gmail.com>
* reduce FindLinks code complexity
Signed-off-by: kim <grufwub@gmail.com>
* fix HTML to text conversion
Signed-off-by: kim <grufwub@gmail.com>
* Update internal/regexes/regexes.go
Co-authored-by: Mina Galić <mina.galic@puppet.com>
* use improved html2text lib with more options
Signed-off-by: kim <grufwub@gmail.com>
* fix to produce actual plaintext from html
Signed-off-by: kim <grufwub@gmail.com>
* fix span tags instead written as space
Signed-off-by: kim <grufwub@gmail.com>
* performance improvements to regex replacements, fix link replace logic for un-html-ing in the future
Signed-off-by: kim <grufwub@gmail.com>
* fix tag/mention replacements to use input string, fix link replace to not include scheme
Signed-off-by: kim <grufwub@gmail.com>
* use matched input string for link replace href text
Signed-off-by: kim <grufwub@gmail.com>
* remove unused code (to appease linter :sobs:)
Signed-off-by: kim <grufwub@gmail.com>
* improve hashtagFinder regex to be more compliant
Signed-off-by: kim <grufwub@gmail.com>
* update breakReplacer to include both unix and windows line endings
Signed-off-by: kim <grufwub@gmail.com>
* add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts
Signed-off-by: kim <grufwub@gmail.com>
* drop unnecessary code
Signed-off-by: kim <grufwub@gmail.com>
* update text package tests to fix logic changes
Signed-off-by: kim <grufwub@gmail.com>
* add raw note content testing to account update and account verify
Signed-off-by: kim <grufwub@gmail.com>
* remove unused modules
Signed-off-by: kim <grufwub@gmail.com>
* fix emoji regex
Signed-off-by: kim <grufwub@gmail.com>
* fix replacement of hashtags
Signed-off-by: kim <grufwub@gmail.com>
* update code comment
Signed-off-by: kim <grufwub@gmail.com>
Co-authored-by: Mina Galić <mina.galic@puppet.com>
Diffstat (limited to 'internal/regexes')
-rw-r--r-- | internal/regexes/regexes.go | 36 |
1 files changed, 34 insertions, 2 deletions
diff --git a/internal/regexes/regexes.go b/internal/regexes/regexes.go index 9302b544d..f05f9b391 100644 --- a/internal/regexes/regexes.go +++ b/internal/regexes/regexes.go @@ -19,8 +19,12 @@ package regexes import ( + "bytes" "fmt" "regexp" + "sync" + + "mvdan.cc/xurls/v2" ) const ( @@ -47,6 +51,16 @@ const ( ) var ( + schemes = `(http|https)://` + // LinkScheme captures http/https schemes in URLs. + LinkScheme = func() *regexp.Regexp { + rgx, err := xurls.StrictMatchingScheme(schemes) + if err != nil { + panic(err) + } + return rgx + }() + mentionName = `^@(\w+)(?:@([a-zA-Z0-9_\-\.:]+))?$` // MentionName captures the username and domain part from a mention string // such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols) @@ -58,7 +72,7 @@ var ( MentionFinder = regexp.MustCompile(mentionFinder) // hashtag regex can be played with here: https://regex101.com/r/bPxeca/1 - hashtagFinder = fmt.Sprintf(`(?:^|\n|\s)(#[a-zA-Z0-9]{1,%d})(?:\b)`, maximumHashtagLength) + hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[a-zA-Z0-9]{1,%d})(?:#|\b)`, maximumHashtagLength) // HashtagFinder finds possible hashtags in a string. // It returns just the string part of the hashtag, not the # symbol. HashtagFinder = regexp.MustCompile(hashtagFinder) @@ -68,7 +82,7 @@ var ( EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode)) // emoji regex can be played with here: https://regex101.com/r/478XGM/1 - emojiFinderString = fmt.Sprintf(`(?:\B)?:(%s):(?:\B)?`, emojiShortcode) + emojiFinderString = fmt.Sprintf(`(?:\b)?:(%s):(?:\b)?`, emojiShortcode) // EmojiFinder extracts emoji strings from a piece of text. EmojiFinder = regexp.MustCompile(emojiFinderString) @@ -134,3 +148,21 @@ var ( // from eg /users/example_username/blocks/01F7XT5JZW1WMVSW1KADS8PVDH BlockPath = regexp.MustCompile(blockPath) ) + +// bufpool is a memory pool of byte buffers for use in our regex utility functions. 
+var bufpool = sync.Pool{ + New: func() any { + buf := bytes.NewBuffer(make([]byte, 0, 512)) + return buf + }, +} + +// ReplaceAllStringFunc will call through to .ReplaceAllStringFunc in the provided regex, but provide you a clean byte buffer for optimized string writes. +func ReplaceAllStringFunc(rgx *regexp.Regexp, src string, repl func(match string, buf *bytes.Buffer) string) string { + buf := bufpool.Get().(*bytes.Buffer) //nolint + defer bufpool.Put(buf) + return rgx.ReplaceAllStringFunc(src, func(match string) string { + buf.Reset() // reset use + return repl(match, buf) + }) +} |