diff options
| author | 2025-03-07 15:04:34 +0100 | |
|---|---|---|
| committer | 2025-03-07 14:04:34 +0000 | |
| commit | d8113c11e4d84a6d04d56b58d337c235154a535b (patch) | |
| tree | 3ed983cbb8f95c9ef51a02a51a50ab89c42abd14 /internal/text/plain.go | |
| parent | [bugfix] Store and expose status content type (#3870) (diff) | |
| download | gotosocial-d8113c11e4d84a6d04d56b58d337c235154a535b.tar.xz | |
[feature] Parse content warning to HTML, serialize via client API as plaintext (#3876)
* [feature] Parse content warning as HTML, serialize via API to plaintext
* tidy up some cruft
* whoops
* oops
* i'm da joker baybee
* clemency muy lorde
* rename some of the text functions for clarity
* jiggle the opts
* fiddle de deee
* hopefully the last test fix i ever have to do in my beautiful life
Diffstat (limited to 'internal/text/plain.go')
| -rw-r--r-- | internal/text/plain.go | 66 |
1 files changed, 58 insertions, 8 deletions
diff --git a/internal/text/plain.go b/internal/text/plain.go index 362941773..ee4947bf7 100644 --- a/internal/text/plain.go +++ b/internal/text/plain.go @@ -20,8 +20,11 @@ package text import ( "bytes" "context" + gohtml "html" + "strings" "codeberg.org/gruf/go-byteutil" + "github.com/k3a/html2text" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/regexes" @@ -52,7 +55,7 @@ func (f *Formatter) FromPlain( return f.fromPlain( ctx, plainTextParser, - false, // emojiOnly = false + false, // basic = false parseMention, authorID, statusID, @@ -85,7 +88,7 @@ func (f *Formatter) FromPlainNoParagraph( return f.fromPlain( ctx, plainTextParser, - false, // emojiOnly = false + false, // basic = false parseMention, authorID, statusID, @@ -93,12 +96,14 @@ func (f *Formatter) FromPlainNoParagraph( ) } -// FromPlainEmojiOnly fulfils FormatFunc by parsing +// FromPlainBasic fulfils FormatFunc by parsing // the given plaintext input into a FormatResult. // // Unlike FromPlain, it will only parse emojis with // the custom renderer, leaving aside mentions and tags. -func (f *Formatter) FromPlainEmojiOnly( +// +// Resulting HTML will also NOT be wrapped in <p> tags. +func (f *Formatter) FromPlainBasic( ctx context.Context, parseMention gtsmodel.ParseMentionFunc, authorID string, @@ -116,7 +121,7 @@ func (f *Formatter) FromPlainEmojiOnly( return f.fromPlain( ctx, plainTextParser, - true, // emojiOnly = true + true, // basic = true parseMention, authorID, statusID, @@ -130,7 +135,7 @@ func (f *Formatter) FromPlainEmojiOnly( func (f *Formatter) fromPlain( ctx context.Context, plainTextParser parser.Parser, - emojiOnly bool, + basic bool, parseMention gtsmodel.ParseMentionFunc, authorID string, statusID string, @@ -156,7 +161,9 @@ func (f *Formatter) fromPlain( parseMention, authorID, statusID, - emojiOnly, + // If basic, pass + // emojiOnly = true. + basic, result, }, // Turns URLs into links. @@ -181,8 +188,51 @@ func (f *Formatter) fromPlain( // Clean and shrink HTML. result.HTML = byteutil.B2S(htmlBytes.Bytes()) - result.HTML = SanitizeToHTML(result.HTML) + result.HTML = SanitizeHTML(result.HTML) result.HTML = MinifyHTML(result.HTML) return result } + +// ParseHTMLToPlain parses the given HTML string, then +// outputs it to equivalent plaintext while trying to +// keep as much of the smenantic intent of the input +// HTML as possible, ie., titles are placed on separate +// lines, `<br>`s are converted to newlines, text inside +// `<strong>` and `<em>` tags is retained, but without +// emphasis, `<a>` links are unnested and the URL they +// link to is placed in angle brackets next to them, +// lists are replaced with newline-separated indented +// items, etc. +// +// This function is useful when you need to filter on +// HTML and want to avoid catching tags in the filter, +// or when you want to serve something in a plaintext +// format that may contain HTML tags (eg., CWs). +func ParseHTMLToPlain(html string) string { + plain := html2text.HTML2TextWithOptions( + html, + html2text.WithLinksInnerText(), + html2text.WithUnixLineBreaks(), + html2text.WithListSupport(), + ) + return strings.TrimSpace(plain) +} + +// StripHTMLFromText runs text through strict sanitization +// to completely remove any HTML from the input without +// trying to preserve the semantic intent of any HTML tags. +// +// This is useful in cases where the input was not allowed +// to contain HTML at all, and the output isn't either. +func StripHTMLFromText(text string) string { + // Unescape first to catch any tricky critters. + content := gohtml.UnescapeString(text) + + // Remove all detected HTML. + content = strict.Sanitize(content) + + // Unescape again to return plaintext. + content = gohtml.UnescapeString(content) + return strings.TrimSpace(content) +} |
