| author | 2021-08-12 21:03:24 +0200 |
|---|---|
| committer | 2021-08-12 21:03:24 +0200 |
| commit | 98263a7de64269898a2f81207e38943b5c8e8653 (patch) |
| tree | 743c90f109a6c5d27832d1dcef2388d939f0f77a /vendor/github.com/microcosm-cc/bluemonday/sanitize.go |
| parent | Text duplication fix (#137) (diff) |
| download | gotosocial-98263a7de64269898a2f81207e38943b5c8e8653.tar.xz |
Grand test fixup (#138)
* start fixing up tests
* fix up tests + automate with drone
* fiddle with linting
* messing about with drone.yml
* some more fiddling
* hmmm
* add cache
* add vendor directory
* verbose
* ci updates
* update some little things
* update sig
Diffstat (limited to 'vendor/github.com/microcosm-cc/bluemonday/sanitize.go')
| -rw-r--r-- | vendor/github.com/microcosm-cc/bluemonday/sanitize.go | 1061 |
1 file changed, 1061 insertions, 0 deletions
diff --git a/vendor/github.com/microcosm-cc/bluemonday/sanitize.go b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go
new file mode 100644
index 000000000..5f4b60d71
--- /dev/null
+++ b/vendor/github.com/microcosm-cc/bluemonday/sanitize.go
@@ -0,0 +1,1061 @@
+// Copyright (c) 2014, David Kitchen <david@buro9.com>
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+//   list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// * Neither the name of the organisation (Microcosm) nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+package bluemonday
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"net/url"
+	"regexp"
+	"strconv"
+	"strings"
+
+	"golang.org/x/net/html"
+
+	"github.com/aymerick/douceur/parser"
+)
+
+var (
+	dataAttribute             = regexp.MustCompile("^data-.+")
+	dataAttributeXMLPrefix    = regexp.MustCompile("^xml.+")
+	dataAttributeInvalidChars = regexp.MustCompile("[A-Z;]+")
+	cssUnicodeChar            = regexp.MustCompile(`\\[0-9a-f]{1,6} ?`)
+	dataURIbase64Prefix       = regexp.MustCompile(`^data:[^,]*;base64,`)
+)
+
+// Sanitize takes a string that contains a HTML fragment or document and applies
+// the given policy allowlist.
+//
+// It returns a HTML string that has been sanitized by the policy or an empty
+// string if an error has occurred (most likely as a consequence of extremely
+// malformed input)
+func (p *Policy) Sanitize(s string) string {
+	if strings.TrimSpace(s) == "" {
+		return s
+	}
+
+	return p.sanitizeWithBuff(strings.NewReader(s)).String()
+}
+
+// SanitizeBytes takes a []byte that contains a HTML fragment or document and applies
+// the given policy allowlist.
+//
+// It returns a []byte containing the HTML that has been sanitized by the policy
+// or an empty []byte if an error has occurred (most likely as a consequence of
+// extremely malformed input)
+func (p *Policy) SanitizeBytes(b []byte) []byte {
+	if len(bytes.TrimSpace(b)) == 0 {
+		return b
+	}
+
+	return p.sanitizeWithBuff(bytes.NewReader(b)).Bytes()
+}
+
+// SanitizeReader takes an io.Reader that contains a HTML fragment or document
+// and applies the given policy allowlist.
+//
+// It returns a bytes.Buffer containing the HTML that has been sanitized by the
+// policy. Errors during sanitization will merely return an empty result.
+func (p *Policy) SanitizeReader(r io.Reader) *bytes.Buffer {
+	return p.sanitizeWithBuff(r)
+}
+
+// SanitizeReaderToWriter takes an io.Reader that contains a HTML fragment or document
+// and applies the given policy allowlist and writes to the provided writer returning
+// an error if there is one.
+func (p *Policy) SanitizeReaderToWriter(r io.Reader, w io.Writer) error {
+	return p.sanitize(r, w)
+}
+
+const escapedURLChars = "'<>\"\r"
+
+func escapeUrlComponent(w stringWriterWriter, val string) error {
+	i := strings.IndexAny(val, escapedURLChars)
+	for i != -1 {
+		if _, err := w.WriteString(val[:i]); err != nil {
+			return err
+		}
+		var esc string
+		switch val[i] {
+		case '\'':
+			// "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
+			esc = "&#39;"
+		case '<':
+			esc = "&lt;"
+		case '>':
+			esc = "&gt;"
+		case '"':
+			// "&#34;" is shorter than "&quot;".
+			esc = "&#34;"
+		case '\r':
+			esc = "&#13;"
+		default:
+			panic("unrecognized escape character")
+		}
+		val = val[i+1:]
+		if _, err := w.WriteString(esc); err != nil {
+			return err
+		}
+		i = strings.IndexAny(val, escapedURLChars)
+	}
+	_, err := w.WriteString(val)
+	return err
+}
+
+// Query represents a single part of the query string, a query param
+type Query struct {
+	Key      string
+	Value    string
+	HasValue bool
+}
+
+func parseQuery(query string) (values []Query, err error) {
+	// This is essentially a copy of parseQuery from
+	// https://golang.org/src/net/url/url.go but adjusted to build our values
+	// based on our type, which we need to preserve the ordering of the query
+	// string
+	for query != "" {
+		key := query
+		if i := strings.IndexAny(key, "&;"); i >= 0 {
+			key, query = key[:i], key[i+1:]
+		} else {
+			query = ""
+		}
+		if key == "" {
+			continue
+		}
+		value := ""
+		hasValue := false
+		if i := strings.Index(key, "="); i >= 0 {
+			key, value = key[:i], key[i+1:]
+			hasValue = true
+		}
+		key, err1 := url.QueryUnescape(key)
+		if err1 != nil {
+			if err == nil {
+				err = err1
+			}
+			continue
+		}
+		value, err1 = url.QueryUnescape(value)
+		if err1 != nil {
+			if err == nil {
+				err = err1
+			}
+			continue
+		}
+		values = append(values, Query{
+			Key:      key,
+			Value:    value,
+			HasValue: hasValue,
+		})
+	}
+	return values, err
+}
+
+func encodeQueries(queries []Query) string {
+	var buff bytes.Buffer
+	for i, query := range queries {
+		buff.WriteString(url.QueryEscape(query.Key))
+		if query.HasValue {
+			buff.WriteString("=")
+			buff.WriteString(url.QueryEscape(query.Value))
+		}
+		if i < len(queries)-1 {
+			buff.WriteString("&")
+		}
+	}
+	return buff.String()
+}
+
+func sanitizedURL(val string) (string, error) {
+	u, err := url.Parse(val)
+	if err != nil {
+		return "", err
+	}
+
+	// we use parseQuery but not u.Query to keep the order not change because
+	// url.Values is a map which has a random order.
+	queryValues, err := parseQuery(u.RawQuery)
+	if err != nil {
+		return "", err
+	}
+	// sanitize the url query params
+	for i, query := range queryValues {
+		queryValues[i].Key = html.EscapeString(query.Key)
+	}
+	u.RawQuery = encodeQueries(queryValues)
+	// u.String() will also sanitize host/scheme/user/pass
+	return u.String(), nil
+}
+
+// Performs the actual sanitization process.
+func (p *Policy) sanitizeWithBuff(r io.Reader) *bytes.Buffer {
+	var buff bytes.Buffer
+	if err := p.sanitize(r, &buff); err != nil {
+		return &bytes.Buffer{}
+	}
+	return &buff
+}
+
+type asStringWriter struct {
+	io.Writer
+}
+
+func (a *asStringWriter) WriteString(s string) (int, error) {
+	return a.Write([]byte(s))
+}
+
+func (p *Policy) sanitize(r io.Reader, w io.Writer) error {
+	// It is possible that the developer has created the policy via:
+	//   p := bluemonday.Policy{}
+	// rather than:
+	//   p := bluemonday.NewPolicy()
+	// If this is the case, and if they haven't yet triggered an action that
+	// would initialize the maps, then we need to do that.
+	p.init()
+
+	buff, ok := w.(stringWriterWriter)
+	if !ok {
+		buff = &asStringWriter{w}
+	}
+
+	var (
+		skipElementContent       bool
+		skippingElementsCount    int64
+		skipClosingTag           bool
+		closingTagToSkipStack    []string
+		mostRecentlyStartedToken string
+	)
+
+	tokenizer := html.NewTokenizer(r)
+	for {
+		if tokenizer.Next() == html.ErrorToken {
+			err := tokenizer.Err()
+			if err == io.EOF {
+				// End of input means end of processing
+				return nil
+			}
+
+			// Raw tokenizer error
+			return err
+		}
+
+		token := tokenizer.Token()
+		switch token.Type {
+		case html.DoctypeToken:
+
+			// DocType is not handled as there is no safe parsing mechanism
+			// provided by golang.org/x/net/html for the content, and this can
+			// be misused to insert HTML tags that are not then sanitized
+			//
+			// One might wish to recursively sanitize here using the same policy
+			// but I will need to do some further testing before considering
+			// this.
+
+		case html.CommentToken:
+
+			// Comments are ignored by default
+			if p.allowComments {
+				// But if allowed then write the comment out as-is
+				buff.WriteString(token.String())
+			}
+
+		case html.StartTagToken:
+
+			mostRecentlyStartedToken = normaliseElementName(token.Data)
+
+			aps, ok := p.elsAndAttrs[token.Data]
+			if !ok {
+				aa, matched := p.matchRegex(token.Data)
+				if !matched {
+					if _, ok := p.setOfElementsToSkipContent[token.Data]; ok {
+						skipElementContent = true
+						skippingElementsCount++
+					}
+					if p.addSpaces {
+						if _, err := buff.WriteString(" "); err != nil {
+							return err
+						}
+					}
+					break
+				}
+				aps = aa
+			}
+			if len(token.Attr) != 0 {
+				token.Attr = escapeAttributes(
+					p.sanitizeAttrs(token.Data, token.Attr, aps),
+				)
+			}
+
+			if len(token.Attr) == 0 {
+				if !p.allowNoAttrs(token.Data) {
+					skipClosingTag = true
+					closingTagToSkipStack = append(closingTagToSkipStack, token.Data)
+					if p.addSpaces {
+						if _, err := buff.WriteString(" "); err != nil {
+							return err
+						}
+					}
+					break
+				}
+			}
+
+			if !skipElementContent {
+				if _, err := buff.WriteString(token.String()); err != nil {
+					return err
+				}
+			}
+
+		case html.EndTagToken:
+
+			if mostRecentlyStartedToken == normaliseElementName(token.Data) {
+				mostRecentlyStartedToken = ""
+			}
+
+			if skipClosingTag && closingTagToSkipStack[len(closingTagToSkipStack)-1] == token.Data {
+				closingTagToSkipStack = closingTagToSkipStack[:len(closingTagToSkipStack)-1]
+				if len(closingTagToSkipStack) == 0 {
+					skipClosingTag = false
+				}
+				if p.addSpaces {
+					if _, err := buff.WriteString(" "); err != nil {
+						return err
+					}
+				}
+				break
+			}
+			if _, ok := p.elsAndAttrs[token.Data]; !ok {
+				match := false
+				for regex := range p.elsMatchingAndAttrs {
+					if regex.MatchString(token.Data) {
+						skipElementContent = false
+						match = true
+						break
+					}
+				}
+				if _, ok := p.setOfElementsToSkipContent[token.Data]; ok && !match {
+					skippingElementsCount--
+					if skippingElementsCount == 0 {
+						skipElementContent = false
+					}
+				}
+				if !match {
+					if p.addSpaces {
+						if _, err := buff.WriteString(" "); err != nil {
+							return err
+						}
+					}
+					break
+				}
+			}
+
+			if !skipElementContent {
+				if _, err := buff.WriteString(token.String()); err != nil {
+					return err
+				}
+			}
+
+		case html.SelfClosingTagToken:
+
+			aps, ok := p.elsAndAttrs[token.Data]
+			if !ok {
+				aa, matched := p.matchRegex(token.Data)
+				if !matched {
+					if p.addSpaces && !matched {
+						if _, err := buff.WriteString(" "); err != nil {
+							return err
+						}
+					}
+					break
+				}
+				aps = aa
+			}
+
+			if len(token.Attr) != 0 {
+				token.Attr = escapeAttributes(p.sanitizeAttrs(token.Data, token.Attr, aps))
+			}
+
+			if len(token.Attr) == 0 && !p.allowNoAttrs(token.Data) {
+				if p.addSpaces {
+					if _, err := buff.WriteString(" "); err != nil {
+						return err
+					}
+					break
+				}
+			}
+			if !skipElementContent {
+				if _, err := buff.WriteString(token.String()); err != nil {
+					return err
+				}
+			}
+
+		case html.TextToken:
+
+			if !skipElementContent {
+				switch mostRecentlyStartedToken {
+				case `script`:
+					// not encouraged, but if a policy allows JavaScript we
+					// should not HTML escape it as that would break the output
+					if _, err := buff.WriteString(token.Data); err != nil {
+						return err
+					}
+				case "style":
+					// not encouraged, but if a policy allows CSS styles we
+					// should not HTML escape it as that would break the output
+					if _, err := buff.WriteString(token.Data); err != nil {
+						return err
+					}
+				default:
+					// HTML escape the text
+					if _, err := buff.WriteString(token.String()); err != nil {
+						return err
+					}
+				}
+			}
+
+		default:
+			// A token that didn't exist in the html package when we wrote this
+			return fmt.Errorf("unknown token: %v", token)
+		}
+	}
+}
+
+// sanitizeAttrs takes a set of element attribute policies and the global
+// attribute policies and applies them to the []html.Attribute returning a set
+// of html.Attributes that match the policies
+func (p *Policy) sanitizeAttrs(
+	elementName string,
+	attrs []html.Attribute,
+	aps map[string][]attrPolicy,
+) []html.Attribute {
+
+	if len(attrs) == 0 {
+		return attrs
+	}
+
+	hasStylePolicies := false
+	sps, elementHasStylePolicies := p.elsAndStyles[elementName]
+	if len(p.globalStyles) > 0 || (elementHasStylePolicies && len(sps) > 0) {
+		hasStylePolicies = true
+	}
+	// no specific element policy found, look for a pattern match
+	if !hasStylePolicies {
+		for k, v := range p.elsMatchingAndStyles {
+			if k.MatchString(elementName) {
+				if len(v) > 0 {
+					hasStylePolicies = true
+					break
+				}
+			}
+		}
+	}
+
+	// Builds a new attribute slice based on the whether the attribute has been
+	// allowed explicitly or globally.
+	cleanAttrs := []html.Attribute{}
+attrsLoop:
+	for _, htmlAttr := range attrs {
+		if p.allowDataAttributes {
+			// If we see a data attribute, let it through.
+			if isDataAttribute(htmlAttr.Key) {
+				cleanAttrs = append(cleanAttrs, htmlAttr)
+				continue
+			}
+		}
+		// Is this a "style" attribute, and if so, do we need to sanitize it?
+		if htmlAttr.Key == "style" && hasStylePolicies {
+			htmlAttr = p.sanitizeStyles(htmlAttr, elementName)
+			if htmlAttr.Val == "" {
+				// We've sanitized away any and all styles; don't bother to
+				// output the style attribute (even if it's allowed)
+				continue
+			} else {
+				cleanAttrs = append(cleanAttrs, htmlAttr)
+				continue
+			}
+		}
+
+		// Is there an element specific attribute policy that applies?
+		if apl, ok := aps[htmlAttr.Key]; ok {
+			for _, ap := range apl {
+				if ap.regexp != nil {
+					if ap.regexp.MatchString(htmlAttr.Val) {
+						cleanAttrs = append(cleanAttrs, htmlAttr)
+						continue attrsLoop
+					}
+				} else {
+					cleanAttrs = append(cleanAttrs, htmlAttr)
+					continue attrsLoop
+				}
+			}
+		}
+
+		// Is there a global attribute policy that applies?
+		if apl, ok := p.globalAttrs[htmlAttr.Key]; ok {
+			for _, ap := range apl {
+				if ap.regexp != nil {
+					if ap.regexp.MatchString(htmlAttr.Val) {
+						htmlAttr.Val = escapeAttribute(htmlAttr.Val)
+						cleanAttrs = append(cleanAttrs, htmlAttr)
+					}
+				} else {
+					htmlAttr.Val = escapeAttribute(htmlAttr.Val)
+					cleanAttrs = append(cleanAttrs, htmlAttr)
+				}
+			}
+		}
+	}
+
+	if len(cleanAttrs) == 0 {
+		// If nothing was allowed, let's get out of here
+		return cleanAttrs
+	}
+	// cleanAttrs now contains the attributes that are permitted
+
+	if linkable(elementName) {
+		if p.requireParseableURLs {
+			// Ensure URLs are parseable:
+			// - a.href
+			// - area.href
+			// - link.href
+			// - blockquote.cite
+			// - q.cite
+			// - img.src
+			// - script.src
+			tmpAttrs := []html.Attribute{}
+			for _, htmlAttr := range cleanAttrs {
+				switch elementName {
+				case "a", "area", "base", "link":
+					if htmlAttr.Key == "href" {
+						if u, ok := p.validURL(htmlAttr.Val); ok {
+							htmlAttr.Val = u
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+						}
+						break
+					}
+					tmpAttrs = append(tmpAttrs, htmlAttr)
+				case "blockquote", "del", "ins", "q":
+					if htmlAttr.Key == "cite" {
+						if u, ok := p.validURL(htmlAttr.Val); ok {
+							htmlAttr.Val = u
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+						}
+						break
+					}
+					tmpAttrs = append(tmpAttrs, htmlAttr)
+				case "audio", "embed", "iframe", "img", "script", "source", "track", "video":
+					if htmlAttr.Key == "src" {
+						if u, ok := p.validURL(htmlAttr.Val); ok {
+							htmlAttr.Val = u
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+						}
+						break
+					}
+					tmpAttrs = append(tmpAttrs, htmlAttr)
+				default:
+					tmpAttrs = append(tmpAttrs, htmlAttr)
+				}
+			}
+			cleanAttrs = tmpAttrs
+		}
+
+		if (p.requireNoFollow ||
+			p.requireNoFollowFullyQualifiedLinks ||
+			p.requireNoReferrer ||
+			p.requireNoReferrerFullyQualifiedLinks ||
+			p.addTargetBlankToFullyQualifiedLinks) &&
+			len(cleanAttrs) > 0 {
+
+			// Add rel="nofollow" if a "href" exists
+			switch elementName {
+			case "a", "area", "base", "link":
+				var hrefFound bool
+				var externalLink bool
+				for _, htmlAttr := range cleanAttrs {
+					if htmlAttr.Key == "href" {
+						hrefFound = true
+
+						u, err := url.Parse(htmlAttr.Val)
+						if err != nil {
+							continue
+						}
+						if u.Host != "" {
+							externalLink = true
+						}
+
+						continue
+					}
+				}
+
+				if hrefFound {
+					var (
+						noFollowFound    bool
+						noReferrerFound  bool
+						targetBlankFound bool
+					)
+
+					addNoFollow := (p.requireNoFollow ||
+						externalLink && p.requireNoFollowFullyQualifiedLinks)
+
+					addNoReferrer := (p.requireNoReferrer ||
+						externalLink && p.requireNoReferrerFullyQualifiedLinks)
+
+					addTargetBlank := (externalLink &&
+						p.addTargetBlankToFullyQualifiedLinks)
+
+					tmpAttrs := []html.Attribute{}
+					for _, htmlAttr := range cleanAttrs {
+
+						var appended bool
+						if htmlAttr.Key == "rel" && (addNoFollow || addNoReferrer) {
+
+							if addNoFollow && !strings.Contains(htmlAttr.Val, "nofollow") {
+								htmlAttr.Val += " nofollow"
+							}
+							if addNoReferrer && !strings.Contains(htmlAttr.Val, "noreferrer") {
+								htmlAttr.Val += " noreferrer"
+							}
+							noFollowFound = addNoFollow
+							noReferrerFound = addNoReferrer
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+							appended = true
+						}
+
+						if elementName == "a" && htmlAttr.Key == "target" {
+							if htmlAttr.Val == "_blank" {
+								targetBlankFound = true
+							}
+							if addTargetBlank && !targetBlankFound {
+								htmlAttr.Val = "_blank"
+								targetBlankFound = true
+								tmpAttrs = append(tmpAttrs, htmlAttr)
+								appended = true
+							}
+						}
+
+						if !appended {
+							tmpAttrs = append(tmpAttrs, htmlAttr)
+						}
+					}
+					if noFollowFound || noReferrerFound || targetBlankFound {
+						cleanAttrs = tmpAttrs
+					}
+
+					if (addNoFollow && !noFollowFound) || (addNoReferrer && !noReferrerFound) {
+						rel := html.Attribute{}
+						rel.Key = "rel"
+						if addNoFollow {
+							rel.Val = "nofollow"
+						}
+						if addNoReferrer {
+							if rel.Val != "" {
+								rel.Val += " "
+							}
+							rel.Val += "noreferrer"
+						}
+						cleanAttrs = append(cleanAttrs, rel)
+					}
+
+					if elementName == "a" && addTargetBlank && !targetBlankFound {
+						rel := html.Attribute{}
+						rel.Key = "target"
+						rel.Val = "_blank"
+						targetBlankFound = true
+						cleanAttrs = append(cleanAttrs, rel)
+					}
+
+					if targetBlankFound {
+						// target="_blank" has a security risk that allows the
+						// opened window/tab to issue JavaScript calls against
+						// window.opener, which in effect allow the destination
+						// of the link to control the source:
+						// https://dev.to/ben/the-targetblank-vulnerability-by-example
+						//
+						// To mitigate this risk, we need to add a specific rel
+						// attribute if it is not already present.
+						// rel="noopener"
+						//
+						// Unfortunately this is processing the rel twice (we
+						// already looked at it earlier ^^) as we cannot be sure
+						// of the ordering of the href and rel, and whether we
+						// have fully satisfied that we need to do this. This
+						// double processing only happens *if* target="_blank"
+						// is true.
+						var noOpenerAdded bool
+						tmpAttrs := []html.Attribute{}
+						for _, htmlAttr := range cleanAttrs {
+							var appended bool
+							if htmlAttr.Key == "rel" {
+								if strings.Contains(htmlAttr.Val, "noopener") {
+									noOpenerAdded = true
+									tmpAttrs = append(tmpAttrs, htmlAttr)
+								} else {
+									htmlAttr.Val += " noopener"
+									noOpenerAdded = true
+									tmpAttrs = append(tmpAttrs, htmlAttr)
+								}
+
+								appended = true
+							}
+							if !appended {
+								tmpAttrs = append(tmpAttrs, htmlAttr)
+							}
+						}
+						if noOpenerAdded {
+							cleanAttrs = tmpAttrs
+						} else {
+							// rel attr was not found, or else noopener would
+							// have been added already
+							rel := html.Attribute{}
+							rel.Key = "rel"
+							rel.Val = "noopener"
+							cleanAttrs = append(cleanAttrs, rel)
+						}
+
+					}
+				}
+			default:
+			}
+		}
+	}
+
+	if p.requireCrossOriginAnonymous && len(cleanAttrs) > 0 {
+		switch elementName {
+		case "audio", "img", "link", "script", "video":
+			var crossOriginFound bool
+			for _, htmlAttr := range cleanAttrs {
+				if htmlAttr.Key == "crossorigin" {
+					crossOriginFound = true
+					htmlAttr.Val = "anonymous"
+				}
+			}
+
+			if !crossOriginFound {
+				crossOrigin := html.Attribute{}
+				crossOrigin.Key = "crossorigin"
+				crossOrigin.Val = "anonymous"
+				cleanAttrs = append(cleanAttrs, crossOrigin)
+			}
+		}
+	}
+
+	return cleanAttrs
+}
+
+func (p *Policy) sanitizeStyles(attr html.Attribute, elementName string) html.Attribute {
+	sps := p.elsAndStyles[elementName]
+	if len(sps) == 0 {
+		sps = map[string][]stylePolicy{}
+		// check for any matching elements, if we don't already have a policy found
+		// if multiple matches are found they will be overwritten, it's best
+		// to not have overlapping matchers
+		for regex, policies := range p.elsMatchingAndStyles {
+			if regex.MatchString(elementName) {
+				for k, v := range policies {
+					sps[k] = append(sps[k], v...)
+				}
+			}
+		}
+	}
+
+	//Add semi-colon to end to fix parsing issue
+	if len(attr.Val) > 0 && attr.Val[len(attr.Val)-1] != ';' {
+		attr.Val = attr.Val + ";"
+	}
+	decs, err := parser.ParseDeclarations(attr.Val)
+	if err != nil {
+		attr.Val = ""
+		return attr
+	}
+	clean := []string{}
+	prefixes := []string{"-webkit-", "-moz-", "-ms-", "-o-", "mso-", "-xv-", "-atsc-", "-wap-", "-khtml-", "prince-", "-ah-", "-hp-", "-ro-", "-rim-", "-tc-"}
+
+decLoop:
+	for _, dec := range decs {
+		tempProperty := strings.ToLower(dec.Property)
+		tempValue := removeUnicode(strings.ToLower(dec.Value))
+		for _, i := range prefixes {
+			tempProperty = strings.TrimPrefix(tempProperty, i)
+		}
+		if spl, ok := sps[tempProperty]; ok {
+			for _, sp := range spl {
+				if sp.handler != nil {
+					if sp.handler(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if len(sp.enum) > 0 {
+					if stringInSlice(tempValue, sp.enum) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if sp.regexp != nil {
+					if sp.regexp.MatchString(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				}
+			}
+		}
+		if spl, ok := p.globalStyles[tempProperty]; ok {
+			for _, sp := range spl {
+				if sp.handler != nil {
+					if sp.handler(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if len(sp.enum) > 0 {
+					if stringInSlice(tempValue, sp.enum) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				} else if sp.regexp != nil {
+					if sp.regexp.MatchString(tempValue) {
+						clean = append(clean, dec.Property+": "+dec.Value)
+						continue decLoop
+					}
+				}
+			}
+		}
+	}
+	if len(clean) > 0 {
+		attr.Val = strings.Join(clean, "; ")
+	} else {
+		attr.Val = ""
+	}
+	return attr
+}
+
+func (p *Policy) allowNoAttrs(elementName string) bool {
+	_, ok := p.setOfElementsAllowedWithoutAttrs[elementName]
+	if !ok {
+		for _, r := range p.setOfElementsMatchingAllowedWithoutAttrs {
+			if r.MatchString(elementName) {
+				ok = true
+				break
+			}
+		}
+	}
+	return ok
+}
+
+func (p *Policy) validURL(rawurl string) (string, bool) {
+	if p.requireParseableURLs {
+		// URLs are valid if when space is trimmed the URL is valid
+		rawurl = strings.TrimSpace(rawurl)
+
+		// URLs cannot contain whitespace, unless it is a data-uri
+		if strings.Contains(rawurl, " ") ||
+			strings.Contains(rawurl, "\t") ||
+			strings.Contains(rawurl, "\n") {
+			if !strings.HasPrefix(rawurl, `data:`) {
+				return "", false
+			}
+
+			// Remove \r and \n from base64 encoded data to pass url.Parse.
+			matched := dataURIbase64Prefix.FindString(rawurl)
+			if matched != "" {
+				rawurl = matched + strings.Replace(
+					strings.Replace(
+						rawurl[len(matched):],
+						"\r",
+						"",
+						-1,
+					),
+					"\n",
+					"",
+					-1,
+				)
+			}
+		}
+
+		// URLs are valid if they parse
+		u, err := url.Parse(rawurl)
+		if err != nil {
+			return "", false
+		}
+
+		if u.Scheme != "" {
+
+			urlPolicies, ok := p.allowURLSchemes[u.Scheme]
+			if !ok {
+				return "", false
+			}
+
+			if len(urlPolicies) == 0 {
+				return u.String(), true
+			}
+
+			for _, urlPolicy := range urlPolicies {
+				if urlPolicy(u) == true {
+					return u.String(), true
+				}
+			}
+
+			return "", false
+		}
+
+		if p.allowRelativeURLs {
+			if u.String() != "" {
+				return u.String(), true
+			}
+		}
+
+		return "", false
+	}
+
+	return rawurl, true
+}
+
+func linkable(elementName string) bool {
+	switch elementName {
+	case "a", "area", "base", "link":
+		// elements that allow .href
+		return true
+	case "blockquote", "del", "ins", "q":
+		// elements that allow .cite
+		return true
+	case "audio", "embed", "iframe", "img", "input", "script", "track", "video":
+		// elements that allow .src
+		return true
+	default:
+		return false
+	}
+}
+
+// stringInSlice returns true if needle exists in haystack
+func stringInSlice(needle string, haystack []string) bool {
+	for _, straw := range haystack {
+		if strings.ToLower(straw) == strings.ToLower(needle) {
+			return true
+		}
+	}
+	return false
+}
+
+func isDataAttribute(val string) bool {
+	if !dataAttribute.MatchString(val) {
+		return false
+	}
+	rest := strings.Split(val, "data-")
+	if len(rest) == 1 {
+		return false
+	}
+	// data-xml* is invalid.
+	if dataAttributeXMLPrefix.MatchString(rest[1]) {
+		return false
+	}
+	// no uppercase or semi-colons allowed.
+	if dataAttributeInvalidChars.MatchString(rest[1]) {
+		return false
+	}
+	return true
+}
+
+func removeUnicode(value string) string {
+	substitutedValue := value
+	currentLoc := cssUnicodeChar.FindStringIndex(substitutedValue)
+	for currentLoc != nil {
+
+		character := substitutedValue[currentLoc[0]+1 : currentLoc[1]]
+		character = strings.TrimSpace(character)
+		if len(character) < 4 {
+			character = strings.Repeat("0", 4-len(character)) + character
+		} else {
+			for len(character) > 4 {
+				if character[0] != '0' {
+					character = ""
+					break
+				} else {
+					character = character[1:]
+				}
+			}
+		}
+		character = "\\u" + character
+		translatedChar, err := strconv.Unquote(`"` + character + `"`)
+		translatedChar = strings.TrimSpace(translatedChar)
+		if err != nil {
+			return ""
+		}
+		substitutedValue = substitutedValue[0:currentLoc[0]] + translatedChar + substitutedValue[currentLoc[1]:]
+		currentLoc = cssUnicodeChar.FindStringIndex(substitutedValue)
+	}
+	return substitutedValue
+}
+
+func (p *Policy) matchRegex(elementName string) (map[string][]attrPolicy, bool) {
+	aps := make(map[string][]attrPolicy, 0)
+	matched := false
+	for regex, attrs := range p.elsMatchingAndAttrs {
+		if regex.MatchString(elementName) {
+			matched = true
+			for k, v := range attrs {
+				aps[k] = append(aps[k], v...)
+			}
+		}
+	}
+	return aps, matched
+}
+
+// normaliseElementName takes a HTML element like <script> which is user input
+// and returns a lower case version of it that is immune to UTF-8 to ASCII
+// conversion tricks (like the use of upper case cyrillic i scrİpt which a
+// strings.ToLower would convert to script). Instead this func will preserve
+// all non-ASCII as their escaped equivalent, i.e. \u0130 which reveals the
+// characters when lower cased
+func normaliseElementName(str string) string {
+	// that useful QuoteToASCII put quote marks at the start and end
+	// so those are trimmed off
+	return strings.TrimSuffix(
+		strings.TrimPrefix(
+			strings.ToLower(
+				strconv.QuoteToASCII(str),
+			),
+			`"`),
+		`"`,
+	)
+}
+
+func escapeAttributes(attrs []html.Attribute) []html.Attribute {
+	escapedAttrs := []html.Attribute{}
+	for _, attr := range attrs {
+		attr.Val = escapeAttribute(attr.Val)
+		escapedAttrs = append(escapedAttrs, attr)
+	}
+	return escapedAttrs
+}
+
+func escapeAttribute(val string) string {
+	val = strings.Replace(val, string([]rune{'\u00A0'}), `&nbsp;`, -1)
+	val = strings.Replace(val, `"`, `&#34;`, -1)
+	return val
+}
\ No newline at end of file
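
For orientation, the vendored package added above is driven from application code roughly as follows. This is a minimal sketch based on bluemonday's own documented example (`UGCPolicy` and the shown output come from the library's README, not from this commit):

```go
package main

import (
	"fmt"

	"github.com/microcosm-cc/bluemonday"
)

func main() {
	// UGCPolicy is bluemonday's stock allowlist for user-generated content.
	p := bluemonday.UGCPolicy()

	// Sanitize (defined in sanitize.go above) tokenizes the fragment,
	// drops the disallowed onblur attribute, and appends rel="nofollow".
	out := p.Sanitize(
		`<a onblur="alert(secret)" href="http://www.google.com">Google</a>`,
	)

	// Prints: <a href="http://www.google.com" rel="nofollow">Google</a>
	fmt.Println(out)
}
```

Note that `Sanitize`, `SanitizeBytes`, and `SanitizeReader` all funnel into the same internal `sanitize` routine and swallow errors (returning an empty result); only `SanitizeReaderToWriter` surfaces the error to the caller.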
