summaryrefslogtreecommitdiff
path: root/internal/text
diff options
context:
space:
mode:
authorLibravatar Tobi Smethurst <31960611+tsmethurst@users.noreply.github.com>2021-08-16 19:17:56 +0200
committerLibravatar GitHub <noreply@github.com>2021-08-16 19:17:56 +0200
commitce190d867ca126001a1c0417b00810fc03c0b3ba (patch)
tree364b00118a405239bc6bcac0bfb7891c83655c23 /internal/text
parentTimeline loop fix (#140) (diff)
downloadgotosocial-ce190d867ca126001a1c0417b00810fc03c0b3ba.tar.xz
Text/status parsing fixes (#141)
* aaaaaa * vendor minify * update + test markdown parsing
Diffstat (limited to 'internal/text')
-rw-r--r--internal/text/common.go33
-rw-r--r--internal/text/markdown.go11
-rw-r--r--internal/text/markdown_test.go116
-rw-r--r--internal/text/minify.go39
-rw-r--r--internal/text/plain.go7
-rw-r--r--internal/text/plain_test.go8
-rw-r--r--internal/text/sanitize.go17
-rw-r--r--internal/text/sanitize_test.go75
8 files changed, 271 insertions, 35 deletions
diff --git a/internal/text/common.go b/internal/text/common.go
index 4f0bad9dc..f6a5ca5f5 100644
--- a/internal/text/common.go
+++ b/internal/text/common.go
@@ -20,6 +20,7 @@ package text
import (
"fmt"
+ "html"
"strings"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@@ -29,23 +30,33 @@ import (
// preformat contains some common logic for making a string ready for formatting, which should be used for all user-input text.
func preformat(in string) string {
// do some preformatting of the text
- // 1. Trim all the whitespace
- s := strings.TrimSpace(in)
+
+ // 1. unescape everything that might be html escaped
+ s := html.UnescapeString(in)
+
+ // 2. trim leading or trailing whitespace
+ s = strings.TrimSpace(s)
return s
}
// postformat contains some common logic for html sanitization of text, wrapping elements, and trimming newlines and whitespace
func postformat(in string) string {
// do some postformatting of the text
- // 1. sanitize html to remove any dodgy scripts or other disallowed elements
- s := SanitizeOutgoing(in)
- // 2. wrap the whole thing in a paragraph
- s = fmt.Sprintf(`<p>%s</p>`, s)
- // 3. remove any cheeky newlines
- s = strings.ReplaceAll(s, "\n", "")
- // 4. remove any whitespace added as a result of the formatting
- s = strings.TrimSpace(s)
- return s
+
+ // 1. sanitize html to remove potentially dangerous elements
+ s := SanitizeHTML(in)
+
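+ // 2. the sanitize step tends to escape characters inside codeblocks, which is behavior we don't want, so unescape everything again. NOTE(review): this unescapes the whole string, not only codeblocks, so entity-escaped markup anywhere in the text becomes raw HTML after sanitization; confirm this cannot reintroduce unsafe markup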
+ s = html.UnescapeString(s)
+
+ // 3. minify html to remove any trailing newlines, spaces, unnecessary elements, etc etc
+ mini, err := minifyHTML(s)
+ if err != nil {
+ // if the minify failed, just return what we have
+ return s
+ }
+ // return minified version of the html
+ return mini
}
func (f *formatter) ReplaceTags(in string, tags []*gtsmodel.Tag) string {
diff --git a/internal/text/markdown.go b/internal/text/markdown.go
index f9d12209a..5a7603615 100644
--- a/internal/text/markdown.go
+++ b/internal/text/markdown.go
@@ -23,21 +23,14 @@ import (
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
)
-var bfExtensions = blackfriday.NoIntraEmphasis |
- blackfriday.FencedCode |
- blackfriday.Autolink |
- blackfriday.Strikethrough |
- blackfriday.SpaceHeadings |
- blackfriday.BackslashLineBreak
-
func (f *formatter) FromMarkdown(md string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string {
content := preformat(md)
// do the markdown parsing *first*
- content = string(blackfriday.Run([]byte(content), blackfriday.WithExtensions(bfExtensions)))
+ contentBytes := blackfriday.Run([]byte(content))
// format tags nicely
- content = f.ReplaceTags(content, tags)
+ content = f.ReplaceTags(string(contentBytes), tags)
// format mentions nicely
content = f.ReplaceMentions(content, mentions)
diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go
new file mode 100644
index 000000000..432e9a4ec
--- /dev/null
+++ b/internal/text/markdown_test.go
@@ -0,0 +1,116 @@
+/*
+ GoToSocial
+ Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package text_test
+
+import (
+ "fmt"
+ "testing"
+
+ "github.com/stretchr/testify/suite"
+ "github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
+ "github.com/superseriousbusiness/gotosocial/internal/text"
+ "github.com/superseriousbusiness/gotosocial/testrig"
+)
+
+const (
+ simpleMarkdown = `# Title
+
+Here's a simple text in markdown.
+
+Here's a [link](https://example.org).`
+
+ simpleMarkdownExpected = "<h1>Title</h1><p>Here’s a simple text in markdown.</p><p>Here’s a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p>"
+
+ withCodeBlockExpected = "<h1>Title</h1><p>Below is some JSON.</p><pre><code class=\"language-json\">{\n \"key\": \"value\",\n \"another_key\": [\n \"value1\",\n \"value2\"\n ]\n}\n</code></pre><p>that was some JSON :)</p>"
+
+ withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
+ withHashtagExpected = "<h1>Title</h1><p>here’s a simple status that uses hashtag <a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>"
+)
+
+var (
+ withCodeBlock = `# Title
+
+Below is some JSON.
+
+` + "```" + `json
+{
+ "key": "value",
+ "another_key": [
+ "value1",
+ "value2"
+ ]
+}
+` + "```" + `
+
+that was some JSON :)
+`
+)
+
+type MarkdownTestSuite struct {
+ TextStandardTestSuite
+}
+
+func (suite *MarkdownTestSuite) SetupSuite() {
+ suite.testTokens = testrig.NewTestTokens()
+ suite.testClients = testrig.NewTestClients()
+ suite.testApplications = testrig.NewTestApplications()
+ suite.testUsers = testrig.NewTestUsers()
+ suite.testAccounts = testrig.NewTestAccounts()
+ suite.testAttachments = testrig.NewTestAttachments()
+ suite.testStatuses = testrig.NewTestStatuses()
+ suite.testTags = testrig.NewTestTags()
+ suite.testMentions = testrig.NewTestMentions()
+}
+
+func (suite *MarkdownTestSuite) SetupTest() {
+ suite.config = testrig.NewTestConfig()
+ suite.db = testrig.NewTestDB()
+ suite.log = testrig.NewTestLog()
+ suite.formatter = text.NewFormatter(suite.config, suite.db, suite.log)
+
+ testrig.StandardDBSetup(suite.db, suite.testAccounts)
+}
+
+func (suite *MarkdownTestSuite) TearDownTest() {
+ testrig.StandardDBTeardown(suite.db)
+}
+
+func (suite *MarkdownTestSuite) TestParseSimple() {
+ s := suite.formatter.FromMarkdown(simpleMarkdown, nil, nil)
+ suite.Equal(simpleMarkdownExpected, s)
+}
+
+func (suite *MarkdownTestSuite) TestParseWithCodeBlock() {
+ fmt.Println(withCodeBlock)
+ s := suite.formatter.FromMarkdown(withCodeBlock, nil, nil)
+ suite.Equal(withCodeBlockExpected, s)
+}
+
+func (suite *MarkdownTestSuite) TestParseWithHashtag() {
+ foundTags := []*gtsmodel.Tag{
+ suite.testTags["Hashtag"],
+ }
+
+ s := suite.formatter.FromMarkdown(withHashtag, nil, foundTags)
+ suite.Equal(withHashtagExpected, s)
+}
+
+func TestMarkdownTestSuite(t *testing.T) {
+ suite.Run(t, new(MarkdownTestSuite))
+}
diff --git a/internal/text/minify.go b/internal/text/minify.go
new file mode 100644
index 000000000..c6d7b9bc1
--- /dev/null
+++ b/internal/text/minify.go
@@ -0,0 +1,39 @@
+/*
+ GoToSocial
+ Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package text
+
+import (
+ "github.com/tdewolff/minify/v2"
+ "github.com/tdewolff/minify/v2/html"
+)
+
+var m *minify.M
+
+// minifyHTML runs html through a minifier, reducing it in size.
+func minifyHTML(in string) (string, error) {
+ if m == nil {
+ m = minify.New()
+ m.Add("text/html", &html.Minifier{
+ KeepQuotes: true,
+ KeepEndTags: true,
+ KeepDocumentTags: true,
+ })
+ }
+ return m.String("text/html", in)
+}
diff --git a/internal/text/plain.go b/internal/text/plain.go
index 40fb6412f..a44e02c80 100644
--- a/internal/text/plain.go
+++ b/internal/text/plain.go
@@ -19,6 +19,7 @@
package text
import (
+ "fmt"
"strings"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
@@ -27,6 +28,9 @@ import (
func (f *formatter) FromPlain(plain string, mentions []*gtsmodel.Mention, tags []*gtsmodel.Tag) string {
content := preformat(plain)
+ // strip out any html elements from the plain text, keeping only their text content
+ content = RemoveHTML(content)
+
// format links nicely
content = f.ReplaceLinks(content)
@@ -39,5 +43,8 @@ func (f *formatter) FromPlain(plain string, mentions []*gtsmodel.Mention, tags [
// replace newlines with breaks
content = strings.ReplaceAll(content, "\n", "<br />")
+ // wrap the whole thing in a paragraph element
+ content = fmt.Sprintf(`<p>%s</p>`, content)
+
return postformat(content)
}
diff --git a/internal/text/plain_test.go b/internal/text/plain_test.go
index 2f9eb3a29..33c95234c 100644
--- a/internal/text/plain_test.go
+++ b/internal/text/plain_test.go
@@ -33,15 +33,15 @@ const (
simple = "this is a plain and simple status"
simpleExpected = "<p>this is a plain and simple status</p>"
- withTag = "this is a simple status that uses hashtag #welcome!"
- withTagExpected = "<p>this is a simple status that uses hashtag <a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a>!</p>"
+ withTag = "here's a simple status that uses hashtag #welcome!"
+ withTagExpected = "<p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/welcome\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>welcome</span></a>!</p>"
moreComplex = `Another test @foss_satan@fossbros-anonymous.io
#Hashtag
Text`
- moreComplexExpected = `<p>Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>foss_satan</span></a></span><br/><br/><a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>Hashtag</span></a><br/><br/>Text</p>`
+ moreComplexFull = "<p>Another test <span class=\"h-card\"><a href=\"http://fossbros-anonymous.io/@foss_satan\" class=\"u-url mention\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">@<span>foss_satan</span></a></span><br><br><a href=\"http://localhost:8080/tags/Hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a><br><br>Text</p>"
)
type PlainTestSuite struct {
@@ -102,7 +102,7 @@ func (suite *PlainTestSuite) TestParseMoreComplex() {
fmt.Println(f)
- assert.Equal(suite.T(), moreComplexExpected, f)
+ assert.Equal(suite.T(), moreComplexFull, f)
}
func TestPlainTestSuite(t *testing.T) {
diff --git a/internal/text/sanitize.go b/internal/text/sanitize.go
index 365875d46..e1bc73559 100644
--- a/internal/text/sanitize.go
+++ b/internal/text/sanitize.go
@@ -19,6 +19,8 @@
package text
import (
+ "regexp"
+
"github.com/microcosm-cc/bluemonday"
)
@@ -31,12 +33,11 @@ var regular *bluemonday.Policy = bluemonday.UGCPolicy().
RequireNoReferrerOnLinks(true).
RequireNoFollowOnLinks(true).
RequireCrossOriginAnonymous(true).
- AddTargetBlankToFullyQualifiedLinks(true)
-
-// outgoing policy should be used on statuses we've already parsed and added our own elements etc to. It is less strict than regular.
-var outgoing *bluemonday.Policy = regular.
+ AddTargetBlankToFullyQualifiedLinks(true).
AllowAttrs("class", "href", "rel").OnElements("a").
- AllowAttrs("class").OnElements("span")
+ AllowAttrs("class").OnElements("span").
+ AllowAttrs("class").Matching(regexp.MustCompile("^language-[a-zA-Z0-9]+$")).OnElements("code").
+ SkipElementsContent("code", "pre")
// '[C]an be thought of as equivalent to stripping all HTML elements and their attributes as it has nothing on its allowlist.
// An example usage scenario would be blog post titles where HTML tags are not expected at all
@@ -54,9 +55,3 @@ func SanitizeHTML(in string) string {
func RemoveHTML(in string) string {
return strict.Sanitize(in)
}
-
-// SanitizeOutgoing cleans up HTML in the given string, allowing through only safe elements and elements that were added during the parsing process.
-// This should be used on text that we've already converted into HTML, just to catch any weirdness.
-func SanitizeOutgoing(in string) string {
- return outgoing.Sanitize(in)
-}
diff --git a/internal/text/sanitize_test.go b/internal/text/sanitize_test.go
new file mode 100644
index 000000000..19a5f6a06
--- /dev/null
+++ b/internal/text/sanitize_test.go
@@ -0,0 +1,75 @@
+/*
+ GoToSocial
+ Copyright (C) 2021 GoToSocial Authors admin@gotosocial.org
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+package text_test
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/suite"
+ "github.com/superseriousbusiness/gotosocial/internal/text"
+)
+
+const (
+ removeHTML = `<p>Another test <span class="h-card"><a href="http://fossbros-anonymous.io/@foss_satan" class="u-url mention" rel="nofollow noreferrer noopener" target="_blank">@<span>foss_satan</span></a></span><br/><br/><a href="http://localhost:8080/tags/Hashtag" class="mention hashtag" rel="tag nofollow noreferrer noopener" target="_blank">#<span>Hashtag</span></a><br/><br/>Text</p>`
+ removedHTML = `Another test @foss_satan#HashtagText`
+
+ sanitizeHTML = `here's some naughty html: <script>alert(ahhhh)</script> !!!`
+ sanitizedHTML = `here&#39;s some naughty html: !!!`
+
+ withEscapedLiteral = `it\u0026amp;#39;s its it is`
+ withEscapedLiteralExpected = `it\u0026amp;#39;s its it is`
+ withEscaped = "it\u0026amp;#39;s its it is"
+ withEscapedExpected = "it&amp;#39;s its it is"
+
+ sanitizeOutgoing = `<p>gotta test some fucking &#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39; marks</p>`
+ sanitizedOutgoing = `<p>gotta test some fucking &#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39; marks</p>`
+)
+
+type SanitizeTestSuite struct {
+ suite.Suite
+}
+
+func (suite *SanitizeTestSuite) TestRemoveHTML() {
+ s := text.RemoveHTML(removeHTML)
+ suite.Equal(removedHTML, s)
+}
+
+func (suite *SanitizeTestSuite) TestSanitizeOutgoing() {
+ s := text.SanitizeHTML(sanitizeOutgoing)
+ suite.Equal(sanitizedOutgoing, s)
+}
+
+func (suite *SanitizeTestSuite) TestSanitizeHTML() {
+ s := text.SanitizeHTML(sanitizeHTML)
+ suite.Equal(sanitizedHTML, s)
+}
+
+func (suite *SanitizeTestSuite) TestSanitizeWithEscapedLiteral() {
+ s := text.RemoveHTML(withEscapedLiteral)
+ suite.Equal(withEscapedLiteralExpected, s)
+}
+
+func (suite *SanitizeTestSuite) TestSanitizeWithEscaped() {
+ s := text.RemoveHTML(withEscaped)
+ suite.Equal(withEscapedExpected, s)
+}
+
+func TestSanitizeTestSuite(t *testing.T) {
+ suite.Run(t, new(SanitizeTestSuite))
+}