summaryrefslogtreecommitdiff
path: root/internal/text
diff options
context:
space:
mode:
authorLibravatar tobi <31960611+tsmethurst@users.noreply.github.com>2023-08-11 14:40:11 +0200
committerLibravatar GitHub <noreply@github.com>2023-08-11 14:40:11 +0200
commitdc96562b4084e058846aea9102ef0257461717d6 (patch)
treea0b4bdbaa266386c7fdbbc02ca3e62bae559bf17 /internal/text
parent[feature] Set Content-Security-Policy header (#2095) (diff)
downloadgotosocial-dc96562b4084e058846aea9102ef0257461717d6.tar.xz
[bugfix] Use custom bluemonday policy to disallow inline img tags (#2100)
Diffstat (limited to 'internal/text')
-rw-r--r--internal/text/emojionly.go7
-rw-r--r--internal/text/markdown.go7
-rw-r--r--internal/text/markdown_test.go2
-rw-r--r--internal/text/minify.go21
-rw-r--r--internal/text/plain.go7
-rw-r--r--internal/text/sanitize.go173
-rw-r--r--internal/text/sanitize_test.go28
7 files changed, 193 insertions, 52 deletions
diff --git a/internal/text/emojionly.go b/internal/text/emojionly.go
index ba7555716..f4f200b21 100644
--- a/internal/text/emojionly.go
+++ b/internal/text/emojionly.go
@@ -61,13 +61,10 @@ func (f *formatter) FromPlainEmojiOnly(ctx context.Context, pmf gtsmodel.ParseMe
result.HTML = htmlContentBytes.String()
// clean anything dangerous out of the HTML
- result.HTML = SanitizeHTML(result.HTML)
+ result.HTML = SanitizeToHTML(result.HTML)
// shrink ray
- result.HTML, err = m.String("text/html", result.HTML)
- if err != nil {
- log.Errorf(ctx, "error minifying HTML: %s", err)
- }
+ result.HTML = MinifyHTML(result.HTML)
return result
}
diff --git a/internal/text/markdown.go b/internal/text/markdown.go
index c7d4958f4..ecc49673b 100644
--- a/internal/text/markdown.go
+++ b/internal/text/markdown.go
@@ -57,13 +57,10 @@ func (f *formatter) FromMarkdown(ctx context.Context, pmf gtsmodel.ParseMentionF
result.HTML = htmlContentBytes.String()
// clean anything dangerous out of the HTML
- result.HTML = SanitizeHTML(result.HTML)
+ result.HTML = SanitizeToHTML(result.HTML)
// shrink ray
- result.HTML, err = m.String("text/html", result.HTML)
- if err != nil {
- log.Errorf(ctx, "error minifying HTML: %s", err)
- }
+ result.HTML = MinifyHTML(result.HTML)
return result
}
diff --git a/internal/text/markdown_test.go b/internal/text/markdown_test.go
index 2602506ca..cc466df6c 100644
--- a/internal/text/markdown_test.go
+++ b/internal/text/markdown_test.go
@@ -51,7 +51,7 @@ const (
withHashtag = "# Title\n\nhere's a simple status that uses hashtag #Hashtag!"
withHashtagExpected = "<h1>Title</h1><p>here's a simple status that uses hashtag <a href=\"http://localhost:8080/tags/hashtag\" class=\"mention hashtag\" rel=\"tag nofollow noreferrer noopener\" target=\"_blank\">#<span>Hashtag</span></a>!</p>"
mdWithHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a <a href=\"https://example.org\">link</a>.\n\nHere's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\">"
- mdWithHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image: <img src=\"https://gts.superseriousbusiness.org/assets/logo.png\" alt=\"The GoToSocial sloth logo.\" width=\"500\" height=\"600\" crossorigin=\"anonymous\"></p>"
+ mdWithHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a <a href=\"https://example.org\" rel=\"nofollow noreferrer noopener\" target=\"_blank\">link</a>.</p><p>Here's an image:</p>"
mdWithCheekyHTML = "# Title\n\nHere's a simple text in markdown.\n\nHere's a cheeky little script: <script>alert(ahhhh)</script>"
mdWithCheekyHTMLExpected = "<h1>Title</h1><p>Here's a simple text in markdown.</p><p>Here's a cheeky little script:</p>"
mdWithHashtagInitial = "#welcome #Hashtag"
diff --git a/internal/text/minify.go b/internal/text/minify.go
index 83780d5c1..da61bdcf9 100644
--- a/internal/text/minify.go
+++ b/internal/text/minify.go
@@ -18,6 +18,7 @@
package text
import (
+ "github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/tdewolff/minify/v2"
"github.com/tdewolff/minify/v2/html"
)
@@ -31,3 +32,23 @@ var m = func() *minify.M {
})
return m
}()
+
+// MinifyHTML minifies the given string
+// under the assumption that it's HTML.
+//
+// If input is not HTML encoded, this
+// function will try to do minification
+// anyway, but this may produce unexpected
+// results.
+//
+// If an error occurs during minification,
+// it will be logged and the original string
+// returned unmodified.
+func MinifyHTML(in string) string {
+ out, err := m.String("text/html", in)
+ if err != nil {
+ log.Errorf(nil, "error minifying HTML: %s", err)
+ return in // Honour the documented contract explicitly.
+ }
+ return out
+}
diff --git a/internal/text/plain.go b/internal/text/plain.go
index b1c2a2c33..330ebfb15 100644
--- a/internal/text/plain.go
+++ b/internal/text/plain.go
@@ -65,13 +65,10 @@ func (f *formatter) fromPlain(
result.HTML = htmlContentBytes.String()
// Clean anything dangerous out of resulting HTML.
- result.HTML = SanitizeHTML(result.HTML)
+ result.HTML = SanitizeToHTML(result.HTML)
// Shrink ray!
- var err error
- if result.HTML, err = m.String("text/html", result.HTML); err != nil {
- log.Errorf(ctx, "error minifying HTML: %s", err)
- }
+ result.HTML = MinifyHTML(result.HTML)
return result
}
diff --git a/internal/text/sanitize.go b/internal/text/sanitize.go
index 7e857b533..81c436264 100644
--- a/internal/text/sanitize.go
+++ b/internal/text/sanitize.go
@@ -25,44 +25,167 @@ import (
"github.com/microcosm-cc/bluemonday"
)
-// '[A]llows a broad selection of HTML elements and attributes that are safe for user generated content.
-// Note that this policy does not allow iframes, object, embed, styles, script, etc.
-// An example usage scenario would be blog post bodies where a variety of formatting is expected along with the potential for TABLEs and IMGs.'
-//
-// Source: https://github.com/microcosm-cc/bluemonday#usage
-var regular *bluemonday.Policy = bluemonday.UGCPolicy().
- RequireNoReferrerOnLinks(true).
- RequireNoFollowOnLinks(false). // remove the global default which adds rel="nofollow" to all links including local relative
- RequireNoFollowOnFullyQualifiedLinks(true). // add rel="nofollow" on all external links
- RequireCrossOriginAnonymous(true).
- AddTargetBlankToFullyQualifiedLinks(true).
- AllowAttrs("class", "href", "rel").OnElements("a").
- AllowAttrs("class").OnElements("span").
- AllowAttrs("class").Matching(regexp.MustCompile("^language-[a-zA-Z0-9]+$")).OnElements("code").
- SkipElementsContent("code", "pre")
-
-// '[C]an be thought of as equivalent to stripping all HTML elements and their attributes as it has nothing on its allowlist.
-// An example usage scenario would be blog post titles where HTML tags are not expected at all
-// and if they are then the elements and the content of the elements should be stripped. This is a very strict policy.'
+// Regular HTML policy is an adapted version of the default
+// bluemonday UGC policy, with some tweaks of our own.
+// See: https://github.com/microcosm-cc/bluemonday#usage
+var regular *bluemonday.Policy = func() *bluemonday.Policy {
+ p := bluemonday.NewPolicy()
+
+ // AllowStandardAttributes will enable "id", "title" and
+ // the language specific attributes "dir" and "lang" on
+ // all elements that are allowed
+ p.AllowStandardAttributes()
+
+ /*
+ LAYOUT AND FORMATTING
+ */
+
+ // "article" and "aside" are permitted and take no attributes.
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/aside
+ p.AllowElements("article", "aside")
+
+ // "details" is permitted, including the "open" attribute
+ // which can either be blank or the value "open".
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/details
+ p.AllowAttrs("open").Matching(regexp.MustCompile(`(?i)^(|open)$`)).OnElements("details")
+
+ // "section" is permitted and takes no attributes.
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/section
+ p.AllowElements("section")
+
+ // "summary" is permitted and takes no attributes.
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/summary
+ p.AllowElements("summary")
+
+ // "h1" through "h6" are permitted and take no attributes.
+ p.AllowElements("h1", "h2", "h3", "h4", "h5", "h6")
+
+ // "hgroup" is permitted and takes no attributes.
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/hgroup
+ p.AllowElements("hgroup")
+
+ // "blockquote" is permitted, including the "cite"
+ // attribute which must be a standard URL.
+ p.AllowAttrs("cite").OnElements("blockquote")
+
+ // "br" "div" "hr" "p" "span" "wbr" are permitted; of these, only "span" is given an attribute ("class", further down).
+ p.AllowElements("br", "div", "hr", "p", "span", "wbr")
+
+ // The following are all inline phrasing elements:
+ p.AllowElements("abbr", "acronym", "cite", "code", "dfn", "em",
+ "figcaption", "mark", "s", "samp", "strong", "sub", "sup", "var")
+
+ // "q" is permitted and "cite" is a URL and handled by URL policies
+ p.AllowAttrs("cite").OnElements("q")
+
+ // "time" is permitted
+ p.AllowAttrs("datetime").Matching(bluemonday.ISO8601).OnElements("time")
+
+ // Block and inline elements that impart no
+ // semantic meaning but style the document.
+ // Underlines, italics, bold, strikethrough etc.
+ p.AllowElements("b", "i", "pre", "small", "strike", "tt", "u")
+
+ // "del" "ins" are permitted
+ p.AllowAttrs("cite").Matching(bluemonday.Paragraph).OnElements("del", "ins")
+ p.AllowAttrs("datetime").Matching(bluemonday.ISO8601).OnElements("del", "ins")
+
+ // Enable ordered, unordered, and definition lists.
+ p.AllowLists()
+
+ // Class needed on span for mentions, which look like this when assembled:
+ // `<span class="h-card"><a href="https://example.org/users/targetAccount" class="u-url mention">@<span>someusername</span></a></span>`
+ p.AllowAttrs("class").OnElements("span")
+
+ /*
+ LANGUAGE FORMATTING
+ */
+
+ // "bdi" and "bdo" are permitted; their "dir" attribute must match bluemonday.Direction.
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Global_attributes/dir
+ p.AllowAttrs("dir").Matching(bluemonday.Direction).OnElements("bdi", "bdo")
+
+ // "rp" "rt" "ruby" are permitted. See:
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/rp
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/rt
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/ruby
+ p.AllowElements("rp", "rt", "ruby")
+
+ /*
+ CODE BLOCKS
+ */
+
+ // Permit language tags for code elements.
+ p.AllowAttrs("class").Matching(regexp.MustCompile("^language-[a-zA-Z0-9]+$")).OnElements("code")
+
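+ // Intent: leave code block content alone. NOTE(review): bluemonday documents SkipElementsContent as dropping the content of listed elements when they are stripped; confirm this call does what is intended.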
+ p.SkipElementsContent("code", "pre")
+
+ /*
+ LINKS AND LINK SAFETY.
+ */
+
+ // Permit hyperlinks.
+ p.AllowAttrs("class", "href", "rel").OnElements("a")
+
+ // URLs must be parseable by net/url.Parse().
+ p.RequireParseableURLs(true)
+
+ // Most common URL schemes only.
+ p.AllowURLSchemes("mailto", "http", "https")
+
+ // Force rel="noreferrer".
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel/noreferrer
+ p.RequireNoReferrerOnLinks(true)
+
+ // Add rel="nofollow" on all fully qualified (not relative) links.
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/rel#nofollow
+ p.RequireNoFollowOnFullyQualifiedLinks(true)
+
+ // Force crossorigin="anonymous"
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes/crossorigin#anonymous
+ p.RequireCrossOriginAnonymous(true)
+
+ // Force target="_blank" on fully qualified (not relative) links.
+ // See: https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a#target
+ p.AddTargetBlankToFullyQualifiedLinks(true)
+
+ return p
+}()
+
+// '[C]an be thought of as equivalent to stripping all HTML
+// elements and their attributes as it has nothing on its allowlist.
+// An example usage scenario would be blog post titles where HTML
+// tags are not expected at all and if they are then the elements
+// and the content of the elements should be stripped. This is a
+// very strict policy.'
//
// Source: https://github.com/microcosm-cc/bluemonday#usage
var strict *bluemonday.Policy = bluemonday.StrictPolicy()
-// removeHTML strictly removes *all* recognized HTML elements from the given string.
+// removeHTML strictly removes *all* recognized
+// HTML elements from the given string.
func removeHTML(in string) string {
return strict.Sanitize(in)
}
-// SanitizeHTML sanitizes risky html elements from the given string, allowing only safe ones through.
-func SanitizeHTML(in string) string {
+// SanitizeToHTML strips risky HTML elements from the
+// given string, allowing only safe ones through.
+func SanitizeToHTML(in string) string {
return regular.Sanitize(in)
}
-// SanitizePlaintext runs text through basic sanitization. This removes
-// any html elements that were in the string, and returns clean plaintext.
-func SanitizePlaintext(in string) string {
+// SanitizeToPlaintext runs text through basic sanitization.
+// This removes any html elements that were in the string,
+// and returns clean plaintext.
+func SanitizeToPlaintext(in string) string {
+ // Unescape first to catch any tricky critters.
content := html.UnescapeString(in)
+
+ // Remove all detected HTML.
content = removeHTML(content)
+
+ // Unescape again to return plaintext.
content = html.UnescapeString(content)
return strings.TrimSpace(content)
}
diff --git a/internal/text/sanitize_test.go b/internal/text/sanitize_test.go
index f299c2923..ae49c942c 100644
--- a/internal/text/sanitize_test.go
+++ b/internal/text/sanitize_test.go
@@ -36,30 +36,30 @@ type SanitizeTestSuite struct {
}
func (suite *SanitizeTestSuite) TestSanitizeOutgoing() {
- s := text.SanitizeHTML(sanitizeOutgoing)
+ s := text.SanitizeToHTML(sanitizeOutgoing)
suite.Equal(sanitizedOutgoing, s)
}
func (suite *SanitizeTestSuite) TestSanitizeHTML() {
- s := text.SanitizeHTML(sanitizeHTML)
+ s := text.SanitizeToHTML(sanitizeHTML)
suite.Equal(sanitizedHTML, s)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption1() {
dodgyCaption := "<script>console.log('haha!')</script>this is just a normal caption ;)"
- sanitized := text.SanitizePlaintext(dodgyCaption)
+ sanitized := text.SanitizeToPlaintext(dodgyCaption)
suite.Equal("this is just a normal caption ;)", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption2() {
dodgyCaption := "<em>here's a LOUD caption</em>"
- sanitized := text.SanitizePlaintext(dodgyCaption)
+ sanitized := text.SanitizeToPlaintext(dodgyCaption)
suite.Equal("here's a LOUD caption", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption3() {
dodgyCaption := ""
- sanitized := text.SanitizePlaintext(dodgyCaption)
+ sanitized := text.SanitizeToPlaintext(dodgyCaption)
suite.Equal("", sanitized)
}
@@ -75,21 +75,21 @@ with some newlines
`
- sanitized := text.SanitizePlaintext(dodgyCaption)
+ sanitized := text.SanitizeToPlaintext(dodgyCaption)
suite.Equal("here is\na multi line\ncaption\nwith some newlines", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption5() {
// html-escaped: "<script>console.log('aha!')</script> hello world"
dodgyCaption := `&lt;script&gt;console.log(&apos;aha!&apos;)&lt;/script&gt; hello world`
- sanitized := text.SanitizePlaintext(dodgyCaption)
+ sanitized := text.SanitizeToPlaintext(dodgyCaption)
suite.Equal("hello world", sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeCaption6() {
// html-encoded: "<script>console.log('aha!')</script> hello world"
dodgyCaption := `&lt;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#99;&#111;&#110;&#115;&#111;&#108;&#101;&period;&#108;&#111;&#103;&lpar;&apos;&#97;&#104;&#97;&excl;&apos;&rpar;&lt;&sol;&#115;&#99;&#114;&#105;&#112;&#116;&gt;&#32;&#104;&#101;&#108;&#108;&#111;&#32;&#119;&#111;&#114;&#108;&#100;`
- sanitized := text.SanitizePlaintext(dodgyCaption)
+ sanitized := text.SanitizeToPlaintext(dodgyCaption)
suite.Equal("hello world", sanitized)
}
@@ -104,24 +104,30 @@ func (suite *SanitizeTestSuite) TestSanitizeCustomCSS() {
overflow: hidden;
text-overflow: ellipsis;
}`
- sanitized := text.SanitizePlaintext(customCSS)
+ sanitized := text.SanitizeToPlaintext(customCSS)
suite.Equal(customCSS, sanitized) // should be the same as it was before
}
func (suite *SanitizeTestSuite) TestSanitizeNaughtyCustomCSS1() {
// try to break out of <style> into <head> and change the document title
customCSS := "</style><title>pee pee poo poo</title><style>"
- sanitized := text.SanitizePlaintext(customCSS)
+ sanitized := text.SanitizeToPlaintext(customCSS)
suite.Empty(sanitized)
}
func (suite *SanitizeTestSuite) TestSanitizeNaughtyCustomCSS2() {
// try to break out of <style> into <head> and change the document title
customCSS := "pee pee poo poo</style><title></title><style>"
- sanitized := text.SanitizePlaintext(customCSS)
+ sanitized := text.SanitizeToPlaintext(customCSS)
suite.Equal("pee pee poo poo", sanitized)
}
+func (suite *SanitizeTestSuite) TestSanitizeInlineImg() {
+ withInlineImg := "<p>Here's an inline image: <img class=\"fixed-size-img svelte-uci8eb\" aria-hidden=\"false\" alt=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" title=\"A black-and-white photo of an Oblique Strategy card. The card reads: 'Define an area as 'safe' and use it as an anchor'.\" width=\"0\" height=\"0\" src=\"https://example.org/fileserver/01H7J83147QMCE17C0RS9P10Y9/attachment/small/01H7J8365XXRTCP6CAMGEM49ZE.jpg\" style=\"object-position: 50% 50%;\"></p>"
+ sanitized := text.SanitizeToHTML(withInlineImg)
+ suite.Equal(`<p>Here&#39;s an inline image: </p>`, sanitized)
+}
+
func TestSanitizeTestSuite(t *testing.T) {
suite.Run(t, new(SanitizeTestSuite))
}