diff options
| author | 2025-01-31 02:42:55 -0800 | |
|---|---|---|
| committer | 2025-01-31 11:42:55 +0100 | |
| commit | b9e0689359f347edc47487a8043c9004ead0770a (patch) | |
| tree | 514077f83214533ec359a79e0033dcd9015d4ff2 /internal/text | |
| parent | [feature] Add system message wrappers for pending replies and placeholder att... (diff) | |
| download | gotosocial-b9e0689359f347edc47487a8043c9004ead0770a.tar.xz | |
[bugfix] Extend parser to handle more non-Latin hashtags (#3700)
* Allow marks after NFC normalization
Includes regression test for the Tamil example from #3618
* Disallow hashtags that consist only of numbers, marks, and underscores
Diffstat (limited to 'internal/text')
| -rw-r--r-- | internal/text/goldmark_parsers.go | 2 | ||||
| -rw-r--r-- | internal/text/markdown_test.go | 8 | ||||
| -rw-r--r-- | internal/text/normalize.go | 17 | ||||
| -rw-r--r-- | internal/text/plain_test.go | 41 | ||||
| -rw-r--r-- | internal/text/util.go | 17 | 
5 files changed, 48 insertions, 37 deletions
| diff --git a/internal/text/goldmark_parsers.go b/internal/text/goldmark_parsers.go index b7cf4f9e9..e2c87e057 100644 --- a/internal/text/goldmark_parsers.go +++ b/internal/text/goldmark_parsers.go @@ -177,7 +177,7 @@ func (p *hashtagParser) Parse(  			// Ignore initial '#'.  			continue -		case !isPlausiblyInHashtag(r) && +		case !isPermittedInHashtag(r) &&  			!isHashtagBoundary(r):  			// Weird non-boundary character  			// in the hashtag. Don't trust it. diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go index 98ed3a96b..153673415 100644 --- a/internal/text/markdown_test.go +++ b/internal/text/markdown_test.go @@ -50,6 +50,8 @@ const (  	withInlineCode2Expected         = "<p><code>Nobody tells you about the </code><del>SECRET CODE</del><code>, do they?</code></p>"  	withHashtag                     = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"  	withHashtagExpected             = "<h1>Title</h1><p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>" +	withTamilHashtag                = "here's a simple status that uses a hashtag in Tamil #தமிழ்" +	withTamilHashtagExpected        = "<p>here's a simple status that uses a hashtag in Tamil <a href=\"http://localhost:8080/tags/%E0%AE%A4%E0%AE%AE%E0%AE%BF%E0%AE%B4%E0%AF%8D\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>தமிழ்</span></a></p>"  	mdWithHTML                      = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">"  	mdWithHTMLExpected              = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow 
noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image:</p>"  	mdWithCheekyHTML                = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>" @@ -121,6 +123,12 @@ func (suite *MarkdownTestSuite) TestParseWithHashtag() {  	suite.Equal(withHashtagExpected, formatted.HTML)  } +// Regression test for https://github.com/superseriousbusiness/gotosocial/issues/3618 +func (suite *MarkdownTestSuite) TestParseWithTamilHashtag() { +	formatted := suite.FromMarkdown(withTamilHashtag) +	suite.Equal(withTamilHashtagExpected, formatted.HTML) +} +  func (suite *MarkdownTestSuite) TestParseWithHTML() {  	formatted := suite.FromMarkdown(mdWithHTML)  	suite.Equal(mdWithHTMLExpected, formatted.HTML) diff --git a/internal/text/normalize.go b/internal/text/normalize.go index d2e633d1e..ea266fb33 100644 --- a/internal/text/normalize.go +++ b/internal/text/normalize.go @@ -50,17 +50,16 @@ func NormalizeHashtag(text string) (string, bool) {  	// Validate normalized result.  	var ( -		notJustUnderscores = false -		onlyPermittedChars = true -		lengthOK           = true +		atLeastOneRequiredChar = false +		onlyPermittedChars     = true +		lengthOK               = true  	)  	for i, r := range normalized { -		if r != '_' { -			// This isn't an underscore, -			// so the whole hashtag isn't -			// just underscores. 
-			notJustUnderscores = true +		if !isPermittedIfNotEntireHashtag(r) { +			// This isn't an underscore, mark, etc, +			// so the hashtag contains at least one +			atLeastOneRequiredChar = true  		}  		if i >= maximumHashtagLength { @@ -74,5 +73,5 @@ func NormalizeHashtag(text string) (string, bool) {  		}  	} -	return normalized, (lengthOK && onlyPermittedChars && notJustUnderscores) +	return normalized, lengthOK && onlyPermittedChars && atLeastOneRequiredChar  } diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go index fac54a38e..ffa64ce44 100644 --- a/internal/text/plain_test.go +++ b/internal/text/plain_test.go @@ -118,20 +118,20 @@ func (suite *PlainTestSuite) TestDeriveHashtagsOK() {  `  	tags := suite.FromPlain(statusText).Tags -	suite.Len(tags, 13) -	suite.Equal("testing123", tags[0].Name) -	suite.Equal("also", tags[1].Name) -	suite.Equal("thisshouldwork", tags[2].Name) -	suite.Equal("dupe", tags[3].Name) -	suite.Equal("ThisShouldAlsoWork", tags[4].Name) -	suite.Equal("this_should_not_be_split", tags[5].Name) -	suite.Equal("111111", tags[6].Name) -	suite.Equal("alimentación", tags[7].Name) -	suite.Equal("saúde", tags[8].Name) -	suite.Equal("lävistää", tags[9].Name) -	suite.Equal("ö", tags[10].Name) -	suite.Equal("네", tags[11].Name) -	suite.Equal("ThisOneIsThirteyCharactersLong", tags[12].Name) +	if suite.Len(tags, 12) { +		suite.Equal("testing123", tags[0].Name) +		suite.Equal("also", tags[1].Name) +		suite.Equal("thisshouldwork", tags[2].Name) +		suite.Equal("dupe", tags[3].Name) +		suite.Equal("ThisShouldAlsoWork", tags[4].Name) +		suite.Equal("this_should_not_be_split", tags[5].Name) +		suite.Equal("alimentación", tags[6].Name) +		suite.Equal("saúde", tags[7].Name) +		suite.Equal("lävistää", tags[8].Name) +		suite.Equal("ö", tags[9].Name) +		suite.Equal("네", tags[10].Name) +		suite.Equal("ThisOneIsThirteyCharactersLong", tags[11].Name) +	}  	statusText = `#올빼미 hej`  	tags = suite.FromPlain(statusText).Tags @@ -170,8 +170,17 @@ func 
(suite *PlainTestSuite) TestDeriveMultiple() {  func (suite *PlainTestSuite) TestZalgoHashtag() {  	statusText := `yo who else loves #praying to #z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪?`  	f := suite.FromPlain(statusText) -	suite.Len(f.Tags, 1) -	suite.Equal("praying", f.Tags[0].Name) +	if suite.Len(f.Tags, 2) { +		suite.Equal("praying", f.Tags[0].Name) +		// NFC doesn't do much for Zalgo text, but it's difficult to strip marks without affecting non-Latin text. +		suite.Equal("z̸͉̅a̸͚͋l̵͈̊g̸̫͌ỏ̷̪", f.Tags[1].Name) +	} +} + +func (suite *PlainTestSuite) TestNumbersAreNotHashtags() { +	statusText := `yo who else thinks #19_98 is #1?` +	f := suite.FromPlain(statusText) +	suite.Len(f.Tags, 0)  }  func TestPlainTestSuite(t *testing.T) { diff --git a/internal/text/util.go b/internal/text/util.go index af45cfaf0..47b2416dd 100644 --- a/internal/text/util.go +++ b/internal/text/util.go @@ -19,19 +19,14 @@ package text  import "unicode" -func isPlausiblyInHashtag(r rune) bool { -	// Marks are allowed during parsing -	// prior to normalization, but not after, -	// since they may be combined into letters -	// during normalization. -	return unicode.IsMark(r) || -		isPermittedInHashtag(r) +func isPermittedInHashtag(r rune) bool { +	return unicode.IsLetter(r) || isPermittedIfNotEntireHashtag(r)  } -func isPermittedInHashtag(r rune) bool { -	return unicode.IsLetter(r) || -		unicode.IsNumber(r) || -		r == '_' +// isPermittedIfNotEntireHashtag is true for characters that may be in a hashtag +// but are not allowed to be the only characters making up the hashtag. +func isPermittedIfNotEntireHashtag(r rune) bool { +	return unicode.IsNumber(r) || unicode.IsMark(r) || r == '_'  }  // isHashtagBoundary returns true if rune r | 
