From b9e0689359f347edc47487a8043c9004ead0770a Mon Sep 17 00:00:00 2001
From: Vyr Cossont <VyrCossont@users.noreply.github.com>
Date: Fri, 31 Jan 2025 02:42:55 -0800
Subject: [bugfix] Extend parser to handle more non-Latin hashtags (#3700)

* Allow marks after NFC normalization

Includes regression test for the Tamil example from #3618

* Disallow hashtags made up only of numbers, marks, and underscores
---
 internal/text/util.go | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'internal/text/util.go')

diff --git a/internal/text/util.go b/internal/text/util.go
index af45cfaf0..47b2416dd 100644
--- a/internal/text/util.go
+++ b/internal/text/util.go
@@ -19,19 +19,14 @@ package text
 
 import "unicode"
 
-func isPlausiblyInHashtag(r rune) bool {
-	// Marks are allowed during parsing
-	// prior to normalization, but not after,
-	// since they may be combined into letters
-	// during normalization.
-	return unicode.IsMark(r) ||
-		isPermittedInHashtag(r)
+func isPermittedInHashtag(r rune) bool {
+	return unicode.IsLetter(r) || isPermittedIfNotEntireHashtag(r)
 }
 
-func isPermittedInHashtag(r rune) bool {
-	return unicode.IsLetter(r) ||
-		unicode.IsNumber(r) ||
-		r == '_'
+// isPermittedIfNotEntireHashtag reports whether r is a number, mark, or underscore:
+// characters that may appear in a hashtag but may not make up the entire hashtag.
+func isPermittedIfNotEntireHashtag(r rune) bool {
+	return unicode.IsNumber(r) || unicode.IsMark(r) || r == '_'
 }
 
 // isHashtagBoundary returns true if rune r
-- 
cgit v1.2.3