summaryrefslogtreecommitdiff
path: root/internal
diff options
context:
space:
mode:
authorLibravatar Vyr Cossont <VyrCossont@users.noreply.github.com>2025-01-31 02:42:55 -0800
committerLibravatar GitHub <noreply@github.com>2025-01-31 11:42:55 +0100
commitb9e0689359f347edc47487a8043c9004ead0770a (patch)
tree514077f83214533ec359a79e0033dcd9015d4ff2 /internal
parent[feature] Add system message wrappers for pending replies and placeholder att... (diff)
downloadgotosocial-b9e0689359f347edc47487a8043c9004ead0770a.tar.xz
[bugfix] Extend parser to handle more non-Latin hashtags (#3700)
* Allow marks after NFC normalization Includes regression test for the Tamil example from #3618 * Disallow just numbers + marks + underscore as hashtag
Diffstat (limited to 'internal')
-rw-r--r--internal/text/goldmark_parsers.go2
-rw-r--r--internal/text/markdown_test.go8
-rw-r--r--internal/text/normalize.go17
-rw-r--r--internal/text/plain_test.go41
-rw-r--r--internal/text/util.go17
5 files changed, 48 insertions, 37 deletions
diff --git a/internal/text/goldmark_parsers.go b/internal/text/goldmark_parsers.go
index b7cf4f9e9..e2c87e057 100644
--- a/internal/text/goldmark_parsers.go
+++ b/internal/text/goldmark_parsers.go
@@ -177,7 +177,7 @@ func (p *hashtagParser) Parse(
// Ignore initial '#'.
continue
- case !isPlausiblyInHashtag(r) &&
+ case !isPermittedInHashtag(r) &&
!isHashtagBoundary(r):
// Weird non-boundary character
// in the hashtag. Don't trust it.
diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go
index 98ed3a96b..153673415 100644
--- a/internal/text/markdown_test.go
+++ b/internal/text/markdown_test.go
@@ -50,6 +50,8 @@ const (
withInlineCode2Expected = "<p><code>Nobody tells you about the &lt;/code>&lt;del>SECRET CODE&lt;/del>&lt;code>, do they?</code></p>"
withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
withHashtagExpected = "<h1>Title</h1><p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>"
+ withTamilHashtag = "here's a simple status that uses a hashtag in Tamil #தமிழ்"
+ withTamilHashtagExpected = "<p>here's a simple status that uses a hashtag in Tamil <a href=\"http://localhost:8080/tags/%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>தமிழ்</span></a></p>"
mdWithHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">"
mdWithHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image:</p>"
mdWithCheekyHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>"
@@ -121,6 +123,12 @@ func (suite *MarkdownTestSuite) TestParseWithHashtag() {
suite.Equal(withHashtagExpected, formatted.HTML)
}
+// Regression test for https://github.com/superseriousbusiness/gotosocial/issues/3618
+func (suite *MarkdownTestSuite) TestParseWithTamilHashtag() {
+ formatted := suite.FromMarkdown(withTamilHashtag)
+ suite.Equal(withTamilHashtagExpected, formatted.HTML)
+}
+
func (suite *MarkdownTestSuite) TestParseWithHTML() {
formatted := suite.FromMarkdown(mdWithHTML)
suite.Equal(mdWithHTMLExpected, formatted.HTML)
diff --git a/internal/text/normalize.go b/internal/text/normalize.go
index d2e633d1e..ea266fb33 100644
--- a/internal/text/normalize.go
+++ b/internal/text/normalize.go
@@ -50,17 +50,16 @@ func NormalizeHashtag(text string) (string, bool) {
// Validate normalized result.
var (
- notJustUnderscores = false
- onlyPermittedChars = true
- lengthOK = true
+ atLeastOneRequiredChar = false
+ onlyPermittedChars = true
+ lengthOK = true
)
for i, r := range normalized {
- if r != '_' {
- // This isn't an underscore,
- // so the whole hashtag isn't
- // just underscores.
- notJustUnderscores = true
+ if !isPermittedIfNotEntireHashtag(r) {
+ // This isn't an underscore, mark, etc,
+ // so the hashtag contains at least one required character.
+ atLeastOneRequiredChar = true
}
if i >= maximumHashtagLength {
@@ -74,5 +73,5 @@ func NormalizeHashtag(text string) (string, bool) {
}
}
- return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores)
+ return normalized, lengthOK && onlyPermittedChars && atLeastOneRequiredChar
}
diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go
index fac54a38e..ffa64ce44 100644
--- a/internal/text/plain_test.go
+++ b/internal/text/plain_test.go
@@ -118,20 +118,20 @@ func (suite *PlainTestSuite) TestDeriveHashtagsOK() {
`
tags := suite.FromPlain(statusText).Tags
- suite.Len(tags, 13)
- suite.Equal("testing123", tags[0].Name)
- suite.Equal("also", tags[1].Name)
- suite.Equal("thisshouldwork", tags[2].Name)
- suite.Equal("dupe", tags[3].Name)
- suite.Equal("ThisShouldAlsoWork", tags[4].Name)
- suite.Equal("this_should_not_be_split", tags[5].Name)
- suite.Equal("111111", tags[6].Name)
- suite.Equal("alimentación", tags[7].Name)
- suite.Equal("saúde", tags[8].Name)
- suite.Equal("lävistää", tags[9].Name)
- suite.Equal("ö", tags[10].Name)
- suite.Equal("네", tags[11].Name)
- suite.Equal("ThisOneIsThirteyCharactersLong", tags[12].Name)
+ if suite.Len(tags, 12) {
+ suite.Equal("testing123", tags[0].Name)
+ suite.Equal("also", tags[1].Name)
+ suite.Equal("thisshouldwork", tags[2].Name)
+ suite.Equal("dupe", tags[3].Name)
+ suite.Equal("ThisShouldAlsoWork", tags[4].Name)
+ suite.Equal("this_should_not_be_split", tags[5].Name)
+ suite.Equal("alimentación", tags[6].Name)
+ suite.Equal("saúde", tags[7].Name)
+ suite.Equal("lävistää", tags[8].Name)
+ suite.Equal("ö", tags[9].Name)
+ suite.Equal("네", tags[10].Name)
+ suite.Equal("ThisOneIsThirteyCharactersLong", tags[11].Name)
+ }
statusText = `#올빼미 hej`
tags = suite.FromPlain(statusText).Tags
@@ -170,8 +170,17 @@ func (suite *PlainTestSuite) TestDeriveMultiple() {
func (suite *PlainTestSuite) TestZalgoHashtag() {
statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?`
f := suite.FromPlain(statusText)
- suite.Len(f.Tags, 1)
- suite.Equal("praying", f.Tags[0].Name)
+ if suite.Len(f.Tags, 2) {
+ suite.Equal("praying", f.Tags[0].Name)
+ // NFC doesn't do much for Zalgo text, but it's difficult to strip marks without affecting non-Latin text.
+ suite.Equal("z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪", f.Tags[1].Name)
+ }
+}
+
+func (suite *PlainTestSuite) TestNumbersAreNotHashtags() {
+ statusText := `yo who else thinks #19_98 is #1?`
+ f := suite.FromPlain(statusText)
+ suite.Len(f.Tags, 0)
}
func TestPlainTestSuite(t *testing.T) {
diff --git a/internal/text/util.go b/internal/text/util.go
index af45cfaf0..47b2416dd 100644
--- a/internal/text/util.go
+++ b/internal/text/util.go
@@ -19,19 +19,14 @@ package text
import "unicode"
-func isPlausiblyInHashtag(r rune) bool {
- // Marks are allowed during parsing
- // prior to normalization, but not after,
- // since they may be combined into letters
- // during normalization.
- return unicode.IsMark(r) ||
- isPermittedInHashtag(r)
+func isPermittedInHashtag(r rune) bool {
+ return unicode.IsLetter(r) || isPermittedIfNotEntireHashtag(r)
}
-func isPermittedInHashtag(r rune) bool {
- return unicode.IsLetter(r) ||
- unicode.IsNumber(r) ||
- r == '_'
+// isPermittedIfNotEntireHashtag is true for characters that may be in a hashtag
+// but are not allowed to be the only characters making up the hashtag.
+func isPermittedIfNotEntireHashtag(r rune) bool {
+ return unicode.IsNumber(r) || unicode.IsMark(r) || r == '_'
}
// isHashtagBoundary returns true if rune r