From 52109776f63ac59b2fef5cd7417becd9f0007acb Mon Sep 17 00:00:00 2001 From: ugla Date: Tue, 15 Nov 2022 16:05:34 +0100 Subject: [bugfix] Fix unicode-unaware word boundary check in hashtags (#1049) * [bugfix] Fix unicode-unaware word boundary check in hashtag regex Go `\b` does not care for Unicode, and without lookahead, the workarounds got very ugly. So I replaced the regex with a parser. The parser runs in O(n) time and performance should not be affected. * [bugfix] Add back hashtag max length and add tests for it --- internal/text/common.go | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) (limited to 'internal/text/common.go') diff --git a/internal/text/common.go b/internal/text/common.go index 005f9dfe1..ca4b97465 100644 --- a/internal/text/common.go +++ b/internal/text/common.go @@ -27,36 +27,46 @@ import ( "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/regexes" + "github.com/superseriousbusiness/gotosocial/internal/util" ) func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string { - return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string { - // we have a match - matchTrimmed := strings.TrimSpace(match) - tagAsEntered := matchTrimmed[1:] + spans := util.FindHashtagSpansInText(in) + + if len(spans) == 0 { + return in + } + + var b strings.Builder + i := 0 + +spans: + for _, t := range spans { + b.WriteString(in[i:t.First]) + i = t.Second + tagAsEntered := in[t.First+1 : t.Second] - // check through the tags to find what we're matching for _, tag := range tags { if strings.EqualFold(tagAsEntered, tag.Name) { - // Add any dropped space from match - if unicode.IsSpace(rune(match[0])) { - buf.WriteByte(match[0]) - } - // replace the #tag with the formatted tag content // ` - buf.WriteString(``) - return buf.String() + b.WriteString(``) + continue spans } } - // the match wasn't in the list of tags for whatever reason, so just return the match as we found it so nothing changes - return match - }) + b.WriteString(in[t.First:t.Second]) + } + + // Get the last bits. + i = spans[len(spans)-1].Second + b.WriteString(in[i:]) + + return b.String() } func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string { -- cgit v1.2.3