From b9e0689359f347edc47487a8043c9004ead0770a Mon Sep 17 00:00:00 2001
From: Vyr Cossont
Date: Fri, 31 Jan 2025 02:42:55 -0800
Subject: [bugfix] Extend parser to handle more non-Latin hashtags (#3700)

* Allow marks after NFC normalization

Includes regression test for the Tamil example from #3618

* Disallow just numbers + marks + underscore as hashtag
---
 internal/text/util.go | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'internal/text/util.go')

diff --git a/internal/text/util.go b/internal/text/util.go
index af45cfaf0..47b2416dd 100644
--- a/internal/text/util.go
+++ b/internal/text/util.go
@@ -19,19 +19,14 @@ package text
 
 import "unicode"
 
-func isPlausiblyInHashtag(r rune) bool {
-	// Marks are allowed during parsing
-	// prior to normalization, but not after,
-	// since they may be combined into letters
-	// during normalization.
-	return unicode.IsMark(r) ||
-		isPermittedInHashtag(r)
+func isPermittedInHashtag(r rune) bool {
+	return unicode.IsLetter(r) || isPermittedIfNotEntireHashtag(r)
 }
 
-func isPermittedInHashtag(r rune) bool {
-	return unicode.IsLetter(r) ||
-		unicode.IsNumber(r) ||
-		r == '_'
+// isPermittedIfNotEntireHashtag is true for characters that may be in a hashtag
+// but are not allowed to be the only characters making up the hashtag.
+func isPermittedIfNotEntireHashtag(r rune) bool {
+	return unicode.IsNumber(r) || unicode.IsMark(r) || r == '_'
 }
 
 // isHashtagBoundary returns true if rune r
-- 
cgit v1.2.3