diff options
author | 2022-07-19 15:21:17 +0200 | |
---|---|---|
committer | 2022-07-19 15:21:17 +0200 | |
commit | c84384e6608368a13a774d6d33a8cc32da7cf209 (patch) | |
tree | a18aa9c1ced1299d2682c1993e1ba38f46448dba /vendor/github.com/tdewolff/parse/v2/util.go | |
parent | [chore] use our own logging implementation (#716) (diff) | |
download | gotosocial-c84384e6608368a13a774d6d33a8cc32da7cf209.tar.xz |
[bugfix] html escape special characters in text instead of totally removing them (#719)
* remove minify dependency
* tidy up some tests
* remove pre + postformat funcs
* rework sanitization + formatting
* update tests
* add some more markdown tests
Diffstat (limited to 'vendor/github.com/tdewolff/parse/v2/util.go')
-rw-r--r-- | vendor/github.com/tdewolff/parse/v2/util.go | 486 |
1 files changed, 0 insertions, 486 deletions
// Source of vendor/github.com/tdewolff/parse/v2/util.go (package parse), the
// 486-line file deleted by this commit. The file imports "bytes", "fmt",
// "strconv" and "unicode".

// Copy returns a copy of the given byte slice.
// The result is always a freshly allocated slice of len(src) bytes.
func Copy(src []byte) (dst []byte) {
	dst = make([]byte, len(src))
	copy(dst, src)
	return dst
}

// ToLower converts all characters in the byte slice from A-Z to a-z.
// The conversion happens in place; the returned slice is src itself.
func ToLower(src []byte) []byte {
	for i, c := range src {
		if c >= 'A' && c <= 'Z' {
			src[i] = c + ('a' - 'A')
		}
	}
	return src
}

// EqualFold returns true when s matches case-insensitively the targetLower (which must be lowercase).
// Only the ASCII letters A-Z in s are folded.
func EqualFold(s, targetLower []byte) bool {
	if len(s) != len(targetLower) {
		return false
	}
	for i, c := range targetLower {
		d := s[i]
		if d != c && (d < 'A' || d > 'Z' || d+('a'-'A') != c) {
			return false
		}
	}
	return true
}

// Printable returns a printable string for the given rune: the rune itself
// when it is graphic, 0xNN for other runes below 128, and U+NNNN otherwise.
func Printable(r rune) string {
	if unicode.IsGraphic(r) {
		return string(r)
	}
	if r < 128 {
		return fmt.Sprintf("0x%02X", r)
	}
	return fmt.Sprintf("%U", r)
}

// whitespaceTable is true for tab, new line, form feed, carriage return and space.
var whitespaceTable = [256]bool{
	'\t': true,
	'\n': true,
	'\f': true,
	'\r': true,
	' ':  true,
}

// IsWhitespace returns true for space, \n, \r, \t, \f.
func IsWhitespace(c byte) bool {
	return whitespaceTable[c]
}

// newlineTable is true for new line and carriage return.
var newlineTable = [256]bool{
	'\n': true,
	'\r': true,
}

// IsNewline returns true for \n, \r.
func IsNewline(c byte) bool {
	return newlineTable[c]
}

// IsAllWhitespace returns true when the entire byte slice consists of space, \n, \r, \t, \f.
// An empty slice yields true.
func IsAllWhitespace(b []byte) bool {
	for _, c := range b {
		if !IsWhitespace(c) {
			return false
		}
	}
	return true
}

// TrimWhitespace removes any leading and trailing whitespace characters.
// The result is a sub-slice of b; nothing is copied.
func TrimWhitespace(b []byte) []byte {
	n := len(b)
	start := n
	for i := 0; i < n; i++ {
		if !IsWhitespace(b[i]) {
			start = i
			break
		}
	}
	end := n
	for i := n - 1; i >= start; i-- {
		if !IsWhitespace(b[i]) {
			end = i + 1
			break
		}
	}
	return b[start:end]
}

// ReplaceMultipleWhitespace replaces character series of space, \n, \t, \f, \r into a single space or newline (when the series contained a \n or \r).
// A lone whitespace character is normalised the same way. The slice is
// rewritten in place and the result shares its backing array with b.
func ReplaceMultipleWhitespace(b []byte) []byte {
	j, k := 0, 0 // j is write position, k is start of next text section
	for i := 0; i < len(b); i++ {
		if IsWhitespace(b[i]) {
			start := i
			newline := IsNewline(b[i])
			i++
			for ; i < len(b) && IsWhitespace(b[i]); i++ {
				if IsNewline(b[i]) {
					newline = true
				}
			}
			if newline {
				b[start] = '\n'
			} else {
				b[start] = ' '
			}
			if 1 < i-start { // more than one whitespace
				if j == 0 {
					j = start + 1
				} else {
					j += copy(b[j:], b[k:start+1])
				}
				k = i
			}
		}
	}
	if j == 0 {
		return b
	}
	if j == 1 { // only if starts with whitespace
		b[k-1] = b[0]
		return b[k-1:]
	}
	if k < len(b) {
		j += copy(b[j:], b[k:])
	}
	return b[:j]
}

// replaceEntities will replace in b at index i, assuming that b[i] == '&' and that i+3<len(b). The returned int will be the last character of the entity, so that the next iteration can safely do i++ to continue and not miss any entities.
//
// Numeric references below 128 become the byte itself; hexadecimal references
// from 128 up to (but excluding) 10000 are rewritten as decimal references.
// Named references are looked up in entitiesMap. When the replacement is a
// single byte that has an entry in revEntitiesMap, that entry is written
// instead (or the input is left alone when it already equals that entry).
func replaceEntities(b []byte, i int, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) ([]byte, int) {
	const MaxEntityLength = 31 // longest HTML entity: CounterClockwiseContourIntegral
	const maxHexValue = 10000  // hexadecimal references at or above this value are left untouched
	var r []byte
	j := i + 1
	if b[j] == '#' {
		j++
		if b[j] == 'x' {
			j++
			c := 0
			for ; j < len(b) && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
				if maxHexValue <= c {
					// Already out of range: keep consuming digits but stop
					// accumulating, so that c cannot wrap around and pass
					// the range check below.
					continue
				}
				if b[j] <= '9' {
					c = c<<4 + int(b[j]-'0')
				} else if b[j] <= 'F' {
					c = c<<4 + int(b[j]-'A') + 10
				} else if b[j] <= 'f' {
					c = c<<4 + int(b[j]-'a') + 10
				}
			}
			if j <= i+3 || maxHexValue <= c {
				return b, j - 1
			}
			if c < 128 {
				r = []byte{byte(c)}
			} else {
				r = append(r, '&', '#')
				r = strconv.AppendInt(r, int64(c), 10)
				r = append(r, ';')
			}
		} else {
			c := 0
			for ; j < len(b) && c < 128 && b[j] >= '0' && b[j] <= '9'; j++ {
				c = c*10 + int(b[j]-'0')
			}
			if j <= i+2 || 128 <= c {
				return b, j - 1
			}
			r = []byte{byte(c)}
		}
	} else {
		for ; j < len(b) && j-i-1 <= MaxEntityLength && b[j] != ';'; j++ {
		}
		if j <= i+1 || len(b) <= j {
			return b, j - 1
		}

		var ok bool
		r, ok = entitiesMap[string(b[i+1:j])]
		if !ok {
			return b, j
		}
	}

	// j is at semicolon
	n := j + 1 - i
	if j < len(b) && b[j] == ';' && 2 < n {
		if len(r) == 1 {
			if q, ok := revEntitiesMap[r[0]]; ok {
				if bytes.Equal(q, b[i:j+1]) {
					return b, j
				}
				r = q
			} else if r[0] == '&' {
				// check if for example &amp; is followed by something that could potentially be an entity
				k := j + 1
				if k < len(b) && b[k] == '#' {
					k++
				}
				for ; k < len(b) && k-j <= MaxEntityLength && (b[k] >= '0' && b[k] <= '9' || b[k] >= 'a' && b[k] <= 'z' || b[k] >= 'A' && b[k] <= 'Z'); k++ {
				}
				if k < len(b) && b[k] == ';' {
					return b, k
				}
			}
		}

		// Overwrite the entity with its replacement and close the gap.
		copy(b[i:], r)
		copy(b[i+len(r):], b[j+1:])
		b = b[:len(b)-n+len(r)]
		return b, i + len(r) - 1
	}
	return b, i
}

// ReplaceEntities replaces all occurrences of entities (such as &quot;) to their respective unencoded bytes.
// The slice is rewritten in place. An '&' within the last three bytes of b is
// never examined.
func ReplaceEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
	for i := 0; i < len(b); i++ {
		if b[i] == '&' && i+3 < len(b) {
			b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
		}
	}
	return b
}

// ReplaceMultipleWhitespaceAndEntities is a combination of ReplaceMultipleWhitespace and ReplaceEntities. It is faster than executing both sequentially.
// The slice is rewritten in place.
func ReplaceMultipleWhitespaceAndEntities(b []byte, entitiesMap map[string][]byte, revEntitiesMap map[byte][]byte) []byte {
	j, k := 0, 0 // j is write position, k is start of next text section
	for i := 0; i < len(b); i++ {
		if IsWhitespace(b[i]) {
			start := i
			newline := IsNewline(b[i])
			i++
			for ; i < len(b) && IsWhitespace(b[i]); i++ {
				if IsNewline(b[i]) {
					newline = true
				}
			}
			if newline {
				b[start] = '\n'
			} else {
				b[start] = ' '
			}
			if 1 < i-start { // more than one whitespace
				if j == 0 {
					j = start + 1
				} else {
					j += copy(b[j:], b[k:start+1])
				}
				k = i
			}
		}
		// After a whitespace run i sits on the first byte following it, so an
		// entity that directly follows whitespace is handled here as well.
		if i+3 < len(b) && b[i] == '&' {
			b, i = replaceEntities(b, i, entitiesMap, revEntitiesMap)
		}
	}
	if j == 0 {
		return b
	}
	if j == 1 { // only if starts with whitespace
		b[k-1] = b[0]
		return b[k-1:]
	}
	if k < len(b) {
		j += copy(b[j:], b[k:])
	}
	return b[:j]
}

// URLEncodingTable is a charmap for which characters need escaping in the URL encoding scheme
var URLEncodingTable = [256]bool{
	// ASCII
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,

	true, false, true, true, true, true, true, false, // space, ", #, $, %, &
	false, false, false, true, true, false, false, true, // +, comma, /
	false, false, false, false, false, false, false, false,
	false, false, true, true, true, true, true, true, // :, ;, <, =, >, ?

	true, false, false, false, false, false, false, false, // @
	false, false, false, false, false, false, false, false,
	false, false, false, false, false, false, false, false,
	false, false, false, true, true, true, true, false, // [, \, ], ^

	true, false, false, false, false, false, false, false, // `
	false, false, false, false, false, false, false, false,
	false, false, false, false, false, false, false, false,
	false, false, false, true, true, true, false, true, // {, |, }, DEL

	// non-ASCII
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,

	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,

	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,

	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
}

// DataURIEncodingTable is a charmap for which characters need escaping in the Data URI encoding scheme
// Escape only non-printable characters, unicode and %, #, &.
// IE11 additionally requires encoding of \, [, ], ", <, >, `, {, }, |, ^ which is not required by Chrome, Firefox, Opera, Edge, Safari, Yandex
// To pass the HTML validator, restricted URL characters must be escaped: non-printable characters, space, <, >, #, %, "
var DataURIEncodingTable = [256]bool{
	// ASCII
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,

	true, false, true, true, false, true, true, false, // space, ", #, %, &
	false, false, false, false, false, false, false, false,
	false, false, false, false, false, false, false, false,
	false, false, false, false, true, false, true, false, // <, >

	false, false, false, false, false, false, false, false,
	false, false, false, false, false, false, false, false,
	false, false, false, false, false, false, false, false,
	false, false, false, true, true, true, true, false, // [, \, ], ^

	true, false, false, false, false, false, false, false, // `
	false, false, false, false, false, false, false, false,
	false, false, false, false, false, false, false, false,
	false, false, false, true, true, true, false, true, // {, |, }, DEL

	// non-ASCII
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,

	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,

	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,

	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
	true, true, true, true, true, true, true, true,
}

// EncodeURL encodes bytes using the URL encoding scheme
// Every byte c with table[c] set is replaced by %XX (uppercase hexadecimal).
// The slice is extended with append, so the result may share b's backing array.
func EncodeURL(b []byte, table [256]bool) []byte {
	const hexDigits = "0123456789ABCDEF"

	// First pass: count the escapes so that the slice is grown only once.
	escapes := 0
	for _, c := range b {
		if table[c] {
			escapes++
		}
	}
	if escapes == 0 {
		return b
	}

	// Second pass: fill from the back. The write position never falls behind
	// the read position, so no unread byte is overwritten, and the emitted
	// hex digits are never looked up in table again.
	src := len(b) - 1
	b = append(b, make([]byte, 2*escapes)...)
	for dst := len(b) - 1; 0 <= src; src-- {
		c := b[src]
		if table[c] {
			b[dst-2] = '%'
			b[dst-1] = hexDigits[c>>4]
			b[dst] = hexDigits[c&15]
			dst -= 3
		} else {
			b[dst] = c
			dst--
		}
	}
	return b
}

// DecodeURL decodes an URL encoded using the URL encoding scheme
// %XX sequences are decoded only when XX is below 0x80; every '+' becomes a
// space. The slice is rewritten in place.
func DecodeURL(b []byte) []byte {
	for i := 0; i < len(b); i++ {
		if b[i] == '%' && i+2 < len(b) {
			j := i + 1
			c := 0
			for ; j < i+3 && (b[j] >= '0' && b[j] <= '9' || b[j] >= 'a' && b[j] <= 'f' || b[j] >= 'A' && b[j] <= 'F'); j++ {
				if b[j] <= '9' {
					c = c<<4 + int(b[j]-'0')
				} else if b[j] <= 'F' {
					c = c<<4 + int(b[j]-'A') + 10
				} else if b[j] <= 'f' {
					c = c<<4 + int(b[j]-'a') + 10
				}
			}
			// Only replace when both hex digits were present and the value is ASCII.
			if j == i+3 && c < 128 {
				b[i] = byte(c)
				b = append(b[:i+1], b[i+3:]...)
			}
		} else if b[i] == '+' {
			b[i] = ' '
		}
	}
	return b
}