From b9e0689359f347edc47487a8043c9004ead0770a Mon Sep 17 00:00:00 2001
From: Vyr Cossont <VyrCossont@users.noreply.github.com>
Date: Fri, 31 Jan 2025 02:42:55 -0800
Subject: [bugfix] Extend parser to handle more non-Latin hashtags (#3700)

* Allow marks after NFC normalization

Includes regression test for the Tamil example from #3618

* Disallow hashtags made up only of numbers, marks, and underscores
---
 internal/text/normalize.go | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'internal/text/normalize.go')

diff --git a/internal/text/normalize.go b/internal/text/normalize.go
index d2e633d1e..ea266fb33 100644
--- a/internal/text/normalize.go
+++ b/internal/text/normalize.go
@@ -50,17 +50,16 @@ func NormalizeHashtag(text string) (string, bool) {
 
 	// Validate normalized result.
 	var (
-		notJustUnderscores = false
-		onlyPermittedChars = true
-		lengthOK           = true
+		atLeastOneRequiredChar = false
+		onlyPermittedChars     = true
+		lengthOK               = true
 	)
 
 	for i, r := range normalized {
-		if r != '_' {
-			// This isn't an underscore,
-			// so the whole hashtag isn't
-			// just underscores.
-			notJustUnderscores = true
+		if !isPermittedIfNotEntireHashtag(r) {
+			// This rune isn't an underscore, mark, etc., so the
+			// hashtag contains at least one required character.
+			atLeastOneRequiredChar = true
 		}
 
 		if i >= maximumHashtagLength {
@@ -74,5 +73,5 @@ func NormalizeHashtag(text string) (string, bool) {
 		}
 	}
 
-	return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores)
+	return normalized, lengthOK && onlyPermittedChars && atLeastOneRequiredChar
 }
-- 
cgit v1.2.3