From d8113c11e4d84a6d04d56b58d337c235154a535b Mon Sep 17 00:00:00 2001
From: tobi <31960611+tsmethurst@users.noreply.github.com>
Date: Fri, 7 Mar 2025 15:04:34 +0100
Subject: [feature] Parse content warning to HTML, serialize via client API as
 plaintext (#3876)

* [feature] Parse content warning as HTML, serialize via API to plaintext

* tidy up some cruft

* whoops

* oops

* i'm da joker baybee

* clemency muy lorde

* rename some of the text functions for clarity

* jiggle the opts

* fiddle de deee

* hopefully the last test fix i ever have to do in my beautiful life
---
 internal/text/plain.go | 66 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 8 deletions(-)

(limited to 'internal/text/plain.go')
diff --git a/internal/text/plain.go b/internal/text/plain.go
index 362941773..ee4947bf7 100644
--- a/internal/text/plain.go
+++ b/internal/text/plain.go
@@ -20,8 +20,11 @@ package text
 import (
 	"bytes"
 	"context"
+	gohtml "html"
+	"strings"
 
 	"codeberg.org/gruf/go-byteutil"
+	"github.com/k3a/html2text"
 	"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
 	"github.com/superseriousbusiness/gotosocial/internal/log"
 	"github.com/superseriousbusiness/gotosocial/internal/regexes"
@@ -52,7 +55,7 @@ func (f *Formatter) FromPlain(
 	return f.fromPlain(
 		ctx,
 		plainTextParser,
-		false, // emojiOnly = false
+		false, // basic = false
 		parseMention,
 		authorID,
 		statusID,
@@ -85,7 +88,7 @@ func (f *Formatter) FromPlainNoParagraph(
 	return f.fromPlain(
 		ctx,
 		plainTextParser,
-		false, // emojiOnly = false
+		false, // basic = false
 		parseMention,
 		authorID,
 		statusID,
@@ -93,12 +96,14 @@ func (f *Formatter) FromPlainNoParagraph(
 	)
 }
 
-// FromPlainEmojiOnly fulfils FormatFunc by parsing
+// FromPlainBasic fulfils FormatFunc by parsing
 // the given plaintext input into a FormatResult.
 //
 // Unlike FromPlain, it will only parse emojis with
 // the custom renderer, leaving aside mentions and tags.
-func (f *Formatter) FromPlainEmojiOnly(
+//
+// Resulting HTML will also NOT be wrapped in <p> tags.
+func (f *Formatter) FromPlainBasic(
 	ctx context.Context,
 	parseMention gtsmodel.ParseMentionFunc,
 	authorID string,
@@ -116,7 +121,7 @@ func (f *Formatter) FromPlainEmojiOnly(
 	return f.fromPlain(
 		ctx,
 		plainTextParser,
-		true, // emojiOnly = true
+		true, // basic = true
 		parseMention,
 		authorID,
 		statusID,
@@ -130,7 +135,7 @@ func (f *Formatter) FromPlainEmojiOnly(
 func (f *Formatter) fromPlain(
 	ctx context.Context,
 	plainTextParser parser.Parser,
-	emojiOnly bool,
+	basic bool,
 	parseMention gtsmodel.ParseMentionFunc,
 	authorID string,
 	statusID string,
@@ -156,7 +161,9 @@ func (f *Formatter) fromPlain(
 				parseMention,
 				authorID,
 				statusID,
-				emojiOnly,
+				// If basic, pass
+				// emojiOnly = true.
+				basic,
 				result,
 			},
 			// Turns URLs into links.
@@ -181,8 +188,51 @@ func (f *Formatter) fromPlain(
 
 	// Clean and shrink HTML.
 	result.HTML = byteutil.B2S(htmlBytes.Bytes())
-	result.HTML = SanitizeToHTML(result.HTML)
+	result.HTML = SanitizeHTML(result.HTML)
 	result.HTML = MinifyHTML(result.HTML)
 
 	return result
 }
+
+// ParseHTMLToPlain parses the given HTML string, then
+// outputs it to equivalent plaintext while trying to
+// keep as much of the smenantic intent of the input
+// HTML as possible, ie., titles are placed on separate
+// lines, `<br>`s are converted to newlines, text inside
+// `<strong>` and `<em>` tags is retained, but without
+// emphasis, `<a>` links are unnested and the URL they
+// link to is placed in angle brackets next to them,
+// lists are replaced with newline-separated indented
+// items, etc.
+//
+// This function is useful when you need to filter on
+// HTML and want to avoid catching tags in the filter,
+// or when you want to serve something in a plaintext
+// format that may contain HTML tags (eg., CWs).
+func ParseHTMLToPlain(html string) string {
+	plain := html2text.HTML2TextWithOptions(
+		html,
+		html2text.WithLinksInnerText(),
+		html2text.WithUnixLineBreaks(),
+		html2text.WithListSupport(),
+	)
+	return strings.TrimSpace(plain)
+}
+
+// StripHTMLFromText runs text through strict sanitization
+// to completely remove any HTML from the input without
+// trying to preserve the semantic intent of any HTML tags.
+//
+// This is useful in cases where the input was not allowed
+// to contain HTML at all, and the output isn't either.
+func StripHTMLFromText(text string) string {
+	// Unescape first to catch any tricky critters.
+	content := gohtml.UnescapeString(text)
+
+	// Remove all detected HTML.
+	content = strict.Sanitize(content)
+
+	// Unescape again to return plaintext.
+	content = gohtml.UnescapeString(content)
+	return strings.TrimSpace(content)
+}
-- 
cgit v1.2.3