summaryrefslogtreecommitdiff
path: root/internal/text/plain.go
diff options
context:
space:
mode:
authorLibravatar tobi <31960611+tsmethurst@users.noreply.github.com>2025-03-07 15:04:34 +0100
committerLibravatar GitHub <noreply@github.com>2025-03-07 14:04:34 +0000
commitd8113c11e4d84a6d04d56b58d337c235154a535b (patch)
tree3ed983cbb8f95c9ef51a02a51a50ab89c42abd14 /internal/text/plain.go
parent[bugfix] Store and expose status content type (#3870) (diff)
downloadgotosocial-d8113c11e4d84a6d04d56b58d337c235154a535b.tar.xz
[feature] Parse content warning to HTML, serialize via client API as plaintext (#3876)
* [feature] Parse content warning as HTML, serialize via API to plaintext * tidy up some cruft * whoops * oops * i'm da joker baybee * clemency muy lorde * rename some of the text functions for clarity * jiggle the opts * fiddle de deee * hopefully the last test fix i ever have to do in my beautiful life
Diffstat (limited to 'internal/text/plain.go')
-rw-r--r--internal/text/plain.go66
1 files changed, 58 insertions, 8 deletions
diff --git a/internal/text/plain.go b/internal/text/plain.go
index 362941773..ee4947bf7 100644
--- a/internal/text/plain.go
+++ b/internal/text/plain.go
@@ -20,8 +20,11 @@ package text
import (
"bytes"
"context"
+ gohtml "html"
+ "strings"
"codeberg.org/gruf/go-byteutil"
+ "github.com/k3a/html2text"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/regexes"
@@ -52,7 +55,7 @@ func (f *Formatter) FromPlain(
return f.fromPlain(
ctx,
plainTextParser,
- false, // emojiOnly = false
+ false, // basic = false
parseMention,
authorID,
statusID,
@@ -85,7 +88,7 @@ func (f *Formatter) FromPlainNoParagraph(
return f.fromPlain(
ctx,
plainTextParser,
- false, // emojiOnly = false
+ false, // basic = false
parseMention,
authorID,
statusID,
@@ -93,12 +96,14 @@ func (f *Formatter) FromPlainNoParagraph(
)
}
-// FromPlainEmojiOnly fulfils FormatFunc by parsing
+// FromPlainBasic fulfils FormatFunc by parsing
// the given plaintext input into a FormatResult.
//
// Unlike FromPlain, it will only parse emojis with
// the custom renderer, leaving aside mentions and tags.
-func (f *Formatter) FromPlainEmojiOnly(
+//
+// Resulting HTML will also NOT be wrapped in <p> tags.
+func (f *Formatter) FromPlainBasic(
ctx context.Context,
parseMention gtsmodel.ParseMentionFunc,
authorID string,
@@ -116,7 +121,7 @@ func (f *Formatter) FromPlainEmojiOnly(
return f.fromPlain(
ctx,
plainTextParser,
- true, // emojiOnly = true
+ true, // basic = true
parseMention,
authorID,
statusID,
@@ -130,7 +135,7 @@ func (f *Formatter) FromPlainEmojiOnly(
func (f *Formatter) fromPlain(
ctx context.Context,
plainTextParser parser.Parser,
- emojiOnly bool,
+ basic bool,
parseMention gtsmodel.ParseMentionFunc,
authorID string,
statusID string,
@@ -156,7 +161,9 @@ func (f *Formatter) fromPlain(
parseMention,
authorID,
statusID,
- emojiOnly,
+ // If basic, pass
+ // emojiOnly = true.
+ basic,
result,
},
// Turns URLs into links.
@@ -181,8 +188,51 @@ func (f *Formatter) fromPlain(
// Clean and shrink HTML.
result.HTML = byteutil.B2S(htmlBytes.Bytes())
- result.HTML = SanitizeToHTML(result.HTML)
+ result.HTML = SanitizeHTML(result.HTML)
result.HTML = MinifyHTML(result.HTML)
return result
}
+
+// ParseHTMLToPlain parses the given HTML string, then
+// outputs it to equivalent plaintext while trying to
+// keep as much of the semantic intent of the input
+// HTML as possible, i.e., titles are placed on separate
+// lines, `<br>`s are converted to newlines, text inside
+// `<strong>` and `<em>` tags is retained, but without
+// emphasis, `<a>` links are unnested and the URL they
+// link to is placed in angle brackets next to them,
+// lists are replaced with newline-separated indented
+// items, etc.
+//
+// This function is useful when you need to filter on
+// HTML and want to avoid catching tags in the filter,
+// or when you want to serve something in a plaintext
+// format that may contain HTML tags (e.g., CWs).
+func ParseHTMLToPlain(html string) string {
+ plain := html2text.HTML2TextWithOptions(
+ html,
+ html2text.WithLinksInnerText(),
+ html2text.WithUnixLineBreaks(),
+ html2text.WithListSupport(),
+ )
+ return strings.TrimSpace(plain)
+}
+
+// StripHTMLFromText runs text through strict sanitization
+// to completely remove any HTML from the input without
+// trying to preserve the semantic intent of any HTML tags.
+//
+// This is useful in cases where the input was not allowed
+// to contain HTML at all, and the output isn't either.
+func StripHTMLFromText(text string) string {
+ // Unescape first to catch any tricky critters.
+ content := gohtml.UnescapeString(text)
+
+ // Remove all detected HTML.
+ content = strict.Sanitize(content)
+
+ // Unescape again to return plaintext.
+ content = gohtml.UnescapeString(content)
+ return strings.TrimSpace(content)
+}