From d8113c11e4d84a6d04d56b58d337c235154a535b Mon Sep 17 00:00:00 2001 From: tobi <31960611+tsmethurst@users.noreply.github.com> Date: Fri, 7 Mar 2025 15:04:34 +0100 Subject: [feature] Parse content warning to HTML, serialize via client API as plaintext (#3876) * [feature] Parse content warning as HTML, serialize via API to plaintext * tidy up some cruft * whoops * oops * i'm da joker baybee * clemency muy lorde * rename some of the text functions for clarity * jiggle the opts * fiddle de deee * hopefully the last test fix i ever have to do in my beautiful life --- internal/text/plain.go | 66 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 58 insertions(+), 8 deletions(-) (limited to 'internal/text/plain.go') diff --git a/internal/text/plain.go b/internal/text/plain.go index 362941773..ee4947bf7 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -20,8 +20,11 @@ package text import ( "bytes" "context" + gohtml "html" + "strings" "codeberg.org/gruf/go-byteutil" + "github.com/k3a/html2text" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/regexes" @@ -52,7 +55,7 @@ func (f *Formatter) FromPlain( return f.fromPlain( ctx, plainTextParser, - false, // emojiOnly = false + false, // basic = false parseMention, authorID, statusID, @@ -85,7 +88,7 @@ func (f *Formatter) FromPlainNoParagraph( return f.fromPlain( ctx, plainTextParser, - false, // emojiOnly = false + false, // basic = false parseMention, authorID, statusID, @@ -93,12 +96,14 @@ func (f *Formatter) FromPlainNoParagraph( ) } -// FromPlainEmojiOnly fulfils FormatFunc by parsing +// FromPlainBasic fulfils FormatFunc by parsing // the given plaintext input into a FormatResult. // // Unlike FromPlain, it will only parse emojis with // the custom renderer, leaving aside mentions and tags. 
-func (f *Formatter) FromPlainEmojiOnly(
+//
+// Resulting HTML will also NOT be wrapped in <p> tags.
+func (f *Formatter) FromPlainBasic(
ctx context.Context,
parseMention gtsmodel.ParseMentionFunc,
authorID string,
@@ -116,7 +121,7 @@ func (f *Formatter) FromPlainEmojiOnly(
return f.fromPlain(
ctx,
plainTextParser,
- true, // emojiOnly = true
+ true, // basic = true
parseMention,
authorID,
statusID,
@@ -130,7 +135,7 @@ func (f *Formatter) FromPlainEmojiOnly(
func (f *Formatter) fromPlain(
ctx context.Context,
plainTextParser parser.Parser,
- emojiOnly bool,
+ basic bool,
parseMention gtsmodel.ParseMentionFunc,
authorID string,
statusID string,
@@ -156,7 +161,9 @@ func (f *Formatter) fromPlain(
parseMention,
authorID,
statusID,
- emojiOnly,
+ // If basic, pass
+ // emojiOnly = true.
+ basic,
result,
},
// Turns URLs into links.
@@ -181,8 +188,51 @@ func (f *Formatter) fromPlain(
// Clean and shrink HTML.
result.HTML = byteutil.B2S(htmlBytes.Bytes())
- result.HTML = SanitizeToHTML(result.HTML)
+ result.HTML = SanitizeHTML(result.HTML)
result.HTML = MinifyHTML(result.HTML)
return result
}
+
+// ParseHTMLToPlain parses the given HTML string, then
+// outputs it to equivalent plaintext while trying to
+// keep as much of the semantic intent of the input
+// HTML as possible, e.g., titles are placed on separate
+// lines, `<br>`s are converted to newlines, text inside
+// `<strong>` and `<em>` tags is retained, but without
+// emphasis, `<a>` links are unnested and the URL they
+// link to is placed in angle brackets next to them,
+// link to is placed in angle brackets next to them,
+// lists are replaced with newline-separated indented
+// items, etc.
+//
+// This function is useful when you need to filter on
+// HTML and want to avoid catching tags in the filter,
+// or when you want to serve something in a plaintext
+// format that may contain HTML tags (eg., CWs).
+func ParseHTMLToPlain(html string) string {
+ plain := html2text.HTML2TextWithOptions(
+ html,
+ html2text.WithLinksInnerText(),
+ html2text.WithUnixLineBreaks(),
+ html2text.WithListSupport(),
+ )
+ return strings.TrimSpace(plain)
+}
+
+// StripHTMLFromText runs text through strict sanitization
+// to completely remove any HTML from the input without
+// trying to preserve the semantic intent of any HTML tags.
+//
+// This is useful in cases where the input was not allowed
+// to contain HTML at all, and the output isn't either.
+func StripHTMLFromText(text string) string {
+ // Unescape first to catch any tricky critters.
+ content := gohtml.UnescapeString(text)
+
+ // Remove all detected HTML.
+ content = strict.Sanitize(content)
+
+ // Unescape again to return plaintext.
+ content = gohtml.UnescapeString(content)
+ return strings.TrimSpace(content)
+}
--
cgit v1.2.3