From b9e0689359f347edc47487a8043c9004ead0770a Mon Sep 17 00:00:00 2001 From: Vyr Cossont Date: Fri, 31 Jan 2025 02:42:55 -0800 Subject: [bugfix] Extend parser to handle more non-Latin hashtags (#3700) * Allow marks after NFC normalization Includes regression test for the Tamil example from #3618 * Disallow just numbers + marks + underscore as hashtag --- internal/text/normalize.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'internal/text/normalize.go') diff --git a/internal/text/normalize.go b/internal/text/normalize.go index d2e633d1e..ea266fb33 100644 --- a/internal/text/normalize.go +++ b/internal/text/normalize.go @@ -50,17 +50,16 @@ func NormalizeHashtag(text string) (string, bool) { // Validate normalized result. var ( - notJustUnderscores = false - onlyPermittedChars = true - lengthOK = true + atLeastOneRequiredChar = false + onlyPermittedChars = true + lengthOK = true ) for i, r := range normalized { - if r != '_' { - // This isn't an underscore, - // so the whole hashtag isn't - // just underscores. - notJustUnderscores = true + if !isPermittedIfNotEntireHashtag(r) { + // This isn't an underscore, mark, etc, + // so the hashtag contains at least one required character. + atLeastOneRequiredChar = true } if i >= maximumHashtagLength { @@ -74,5 +73,5 @@ func NormalizeHashtag(text string) (string, bool) { } } - return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores) + return normalized, lengthOK && onlyPermittedChars && atLeastOneRequiredChar } -- cgit v1.2.3