summaryrefslogtreecommitdiff
path: root/internal/regexes
diff options
context:
space:
mode:
authorLibravatar kim <89579420+NyaaaWhatsUpDoc@users.noreply.github.com>2022-05-07 16:55:27 +0100
committerLibravatar GitHub <noreply@github.com>2022-05-07 17:55:27 +0200
commit26b74aefaf5d2a3cd26bd57652fe96a6a20ed034 (patch)
treedb316febba8e0ada7a9360b059011dcc7ea138a3 /internal/regexes
parent[performance] improved logrus output switching performance (#544) (diff)
downloadgotosocial-26b74aefaf5d2a3cd26bd57652fe96a6a20ed034.tar.xz
[bugfix] Fix existing bio text showing as HTML (#531)
* fix existing bio text showing as HTML - updated replaced mentions to include instance - strips HTML from account source note in Verify handler - update text formatter to use buffers for string writes Signed-off-by: kim <grufwub@gmail.com> * go away linter Signed-off-by: kim <grufwub@gmail.com> * change buf reset location, change html mention tags Signed-off-by: kim <grufwub@gmail.com> * reduce FindLinks code complexity Signed-off-by: kim <grufwub@gmail.com> * fix HTML to text conversion Signed-off-by: kim <grufwub@gmail.com> * Update internal/regexes/regexes.go Co-authored-by: Mina Galić <mina.galic@puppet.com> * use improved html2text lib with more options Signed-off-by: kim <grufwub@gmail.com> * fix to produce actual plaintext from html Signed-off-by: kim <grufwub@gmail.com> * fix span tags instead written as space Signed-off-by: kim <grufwub@gmail.com> * performance improvements to regex replacements, fix link replace logic for un-html-ing in the future Signed-off-by: kim <grufwub@gmail.com> * fix tag/mention replacements to use input string, fix link replace to not include scheme Signed-off-by: kim <grufwub@gmail.com> * use matched input string for link replace href text Signed-off-by: kim <grufwub@gmail.com> * remove unused code (to appease linter :sobs:) Signed-off-by: kim <grufwub@gmail.com> * improve hashtagFinger regex to be more compliant Signed-off-by: kim <grufwub@gmail.com> * update breakReplacer to include both unix and windows line endings Signed-off-by: kim <grufwub@gmail.com> * add NoteRaw field to Account to store plaintext account bio, add migration for this, set for sensitive accounts Signed-off-by: kim <grufwub@gmail.com> * drop unnecessary code Signed-off-by: kim <grufwub@gmail.com> * update text package tests to fix logic changes Signed-off-by: kim <grufwub@gmail.com> * add raw note content testing to account update and account verify Signed-off-by: kim <grufwub@gmail.com> * remove unused modules Signed-off-by: kim <grufwub@gmail.com> * fix 
emoji regex Signed-off-by: kim <grufwub@gmail.com> * fix replacement of hashtags Signed-off-by: kim <grufwub@gmail.com> * update code comment Signed-off-by: kim <grufwub@gmail.com> Co-authored-by: Mina Galić <mina.galic@puppet.com>
Diffstat (limited to 'internal/regexes')
-rw-r--r--internal/regexes/regexes.go36
1 files changed, 34 insertions, 2 deletions
diff --git a/internal/regexes/regexes.go b/internal/regexes/regexes.go
index 9302b544d..f05f9b391 100644
--- a/internal/regexes/regexes.go
+++ b/internal/regexes/regexes.go
@@ -19,8 +19,12 @@
package regexes
import (
+ "bytes"
"fmt"
"regexp"
+ "sync"
+
+ "mvdan.cc/xurls/v2"
)
const (
@@ -47,6 +51,16 @@ const (
)
var (
+ schemes = `(http|https)://`
+ // LinkScheme captures http/https schemes in URLs.
+ LinkScheme = func() *regexp.Regexp {
+ rgx, err := xurls.StrictMatchingScheme(schemes)
+ if err != nil {
+ panic(err)
+ }
+ return rgx
+ }()
+
mentionName = `^@(\w+)(?:@([a-zA-Z0-9_\-\.:]+))?$`
// MentionName captures the username and domain part from a mention string
// such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols)
@@ -58,7 +72,7 @@ var (
MentionFinder = regexp.MustCompile(mentionFinder)
// hashtag regex can be played with here: https://regex101.com/r/bPxeca/1
- hashtagFinder = fmt.Sprintf(`(?:^|\n|\s)(#[a-zA-Z0-9]{1,%d})(?:\b)`, maximumHashtagLength)
+ hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[a-zA-Z0-9]{1,%d})(?:#|\b)`, maximumHashtagLength)
// HashtagFinder finds possible hashtags in a string.
// Capture group 1 holds the matched hashtag including its leading # symbol; any extra leading or trailing # characters are left outside the group.
HashtagFinder = regexp.MustCompile(hashtagFinder)
@@ -68,7 +82,7 @@ var (
EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode))
// emoji regex can be played with here: https://regex101.com/r/478XGM/1
- emojiFinderString = fmt.Sprintf(`(?:\B)?:(%s):(?:\B)?`, emojiShortcode)
+ emojiFinderString = fmt.Sprintf(`(?:\b)?:(%s):(?:\b)?`, emojiShortcode)
// EmojiFinder extracts emoji strings from a piece of text.
EmojiFinder = regexp.MustCompile(emojiFinderString)
@@ -134,3 +148,21 @@ var (
// from eg /users/example_username/blocks/01F7XT5JZW1WMVSW1KADS8PVDH
BlockPath = regexp.MustCompile(blockPath)
)
+
+// bufpool is a memory pool of *bytes.Buffer values for use in our regex utility functions; ReplaceAllStringFunc takes one buffer from it per call and puts it back when done.
+var bufpool = sync.Pool{
+ New: func() any { // called by the pool when it has no buffer available to hand out
+ buf := bytes.NewBuffer(make([]byte, 0, 512)) // empty buffer (length 0) with 512 bytes of capacity pre-allocated
+ return buf
+ },
+}
+
+// ReplaceAllStringFunc will call through to .ReplaceAllStringFunc in the provided regex, but provide you a clean byte buffer for optimized string writes. For each match in src, repl receives the matched text plus a freshly reset buffer, and the string it returns is used as the replacement. The same pooled buffer is reused for every match and handed back to bufpool on return, so repl should not retain buf after it returns.
+func ReplaceAllStringFunc(rgx *regexp.Regexp, src string, repl func(match string, buf *bytes.Buffer) string) string {
+ buf := bufpool.Get().(*bytes.Buffer) //nolint
+ defer bufpool.Put(buf) // hand the buffer back to the pool once all replacements are done
+ return rgx.ReplaceAllStringFunc(src, func(match string) string {
+ buf.Reset() // discard whatever the previous match wrote before handing the buffer to repl
+ return repl(match, buf)
+ })
+}