diff options
Diffstat (limited to 'vendor/github.com/k3a/html2text/html2text.go')
-rw-r--r-- | vendor/github.com/k3a/html2text/html2text.go | 333 |
1 files changed, 333 insertions, 0 deletions
diff --git a/vendor/github.com/k3a/html2text/html2text.go b/vendor/github.com/k3a/html2text/html2text.go new file mode 100644 index 000000000..f79fbe395 --- /dev/null +++ b/vendor/github.com/k3a/html2text/html2text.go @@ -0,0 +1,333 @@ +package html2text + +import ( + "bytes" + "regexp" + "strconv" + "strings" +) + +// Line break constants +// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak()) +const ( + WIN_LBR = "\r\n" + UNIX_LBR = "\n" +) + +var legacyLBR = WIN_LBR +var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`) +var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`) +var badLinkHrefRE = regexp.MustCompile(`javascript:`) +var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`) +var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`) + +type options struct { + lbr string + linksInnerText bool + listPrefix string +} + +func newOptions() *options { + // apply defaults + return &options{ + lbr: WIN_LBR, + } +} + +// Option is a functional option +type Option func(*options) + +// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default) +func WithUnixLineBreaks() Option { + return func(o *options) { + o.lbr = UNIX_LBR + } +} + +// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text +// Example: click news <http://bit.ly/2n4wXRs> +func WithLinksInnerText() Option { + return func(o *options) { + o.linksInnerText = true + } +} + +// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix +func WithListSupportPrefix(prefix string) Option { + return func(o *options) { + o.listPrefix = prefix + } +} + +// WithListSupport formats <ul> and <li> lists with " - " prefix +func WithListSupport() Option { + return WithListSupportPrefix(" - ") +} + +func parseHTMLEntity(entName string) (string, bool) { + if r, ok := entity[entName]; ok { + return string(r), true + } + + if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 { + var ( + err error + n int64 + digits = match[1] + ) + + if digits != "" && (digits[0] == 'x' || digits[0] == 'X') { + n, err = strconv.ParseInt(digits[1:], 16, 64) + } else { + n, err = strconv.ParseInt(digits, 10, 64) + } + + if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) { + return string(rune(n)), true + } + } + + return "", false +} + +// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n") +// with argument false sets Windows-style line-breaks in output ("\r\n", the default) +// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak()) +func SetUnixLbr(b bool) { + if b { + legacyLBR = UNIX_LBR + } else { + legacyLBR = WIN_LBR + } +} + +// HTMLEntitiesToText decodes HTML entities inside a provided +// string and returns decoded text +func HTMLEntitiesToText(htmlEntsText string) string { + outBuf := bytes.NewBufferString("") + inEnt := false + + for i, r := range htmlEntsText { + switch { + case r == ';' && inEnt: + inEnt = false + continue + + case r == '&': //possible html entity + entName := "" + isEnt := false + + // parse the entity name - max 10 chars + chars := 0 + for _, er := range htmlEntsText[i+1:] { + if er == ';' { + isEnt = true + break + } else { + entName += string(er) + } + + chars++ + if chars == 10 { + break + } + } + + if isEnt { + if ent, isEnt := parseHTMLEntity(entName); isEnt { + outBuf.WriteString(ent) + inEnt = true + continue + } + } + } + + if !inEnt { + outBuf.WriteRune(r) + } + } + + return outBuf.String() +} + +func writeSpace(outBuf *bytes.Buffer) { + bts := outBuf.Bytes() + if len(bts) > 0 && bts[len(bts)-1] != ' ' { + outBuf.WriteString(" ") + } +} + +// HTML2Text converts html into a text form +func HTML2Text(html string) string { + var opts []Option + if legacyLBR == UNIX_LBR { + opts = append(opts, WithUnixLineBreaks()) + } + return HTML2TextWithOptions(html, opts...) +} + +// HTML2TextWithOptions converts html into a text form with additional options +func HTML2TextWithOptions(html string, reqOpts ...Option) string { + opts := newOptions() + for _, opt := range reqOpts { + opt(opts) + } + + inLen := len(html) + tagStart := 0 + inEnt := false + badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head> + shouldOutput := true + // maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only) + hrefs := []string{} + // new line cannot be printed at the beginning or + // for <p> after a new line created by previous <p></p> + canPrintNewline := false + + outBuf := bytes.NewBufferString("") + + for i, r := range html { + if inLen > 0 && i == inLen-1 { + // prevent new line at the end of the document + canPrintNewline = false + } + + switch { + // skip new lines and spaces adding a single space if not there yet + case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines + r == ' ', r >= 0x2008 && r <= 0x200B: // spaces + if shouldOutput && badTagStackDepth == 0 && !inEnt { + //outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i])) + writeSpace(outBuf) + } + continue + + case r == ';' && inEnt: // end of html entity + inEnt = false + continue + + case r == '&' && shouldOutput: // possible html entity + entName := "" + isEnt := false + + // parse the entity name - max 10 chars + chars := 0 + for _, er := range html[i+1:] { + if er == ';' { + isEnt = true + break + } else { + entName += string(er) + } + + chars++ + if chars == 10 { + break + } + } + + if isEnt { + if ent, isEnt := parseHTMLEntity(entName); isEnt { + outBuf.WriteString(ent) + inEnt = true + continue + } + } + + case r == '<': // start of a tag + tagStart = i + 1 + shouldOutput = false + continue + + case r == '>': // end of a tag + shouldOutput = true + tag := html[tagStart:i] + tagNameLowercase := strings.ToLower(tag) + + if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" { + outBuf.WriteString(opts.lbr) + } else if tagNameLowercase == "li" || tagNameLowercase == "li/" { + if opts.listPrefix != "" { + outBuf.WriteString(opts.lbr + opts.listPrefix) + } else { + outBuf.WriteString(opts.lbr) + } + } else if headersRE.MatchString(tagNameLowercase) { + if canPrintNewline { + outBuf.WriteString(opts.lbr + opts.lbr) + } + canPrintNewline = false + } else if tagNameLowercase == "br" || tagNameLowercase == "br/" { + // new line + outBuf.WriteString(opts.lbr) + } else if tagNameLowercase == "p" || tagNameLowercase == "/p" { + if canPrintNewline { + outBuf.WriteString(opts.lbr + opts.lbr) + } + canPrintNewline = false + } else if opts.linksInnerText && tagNameLowercase == "/a" { + // end of link + // links can be empty can happen if the link matches the badLinkHrefRE + if len(hrefs) > 0 { + outBuf.WriteString(" <") + outBuf.WriteString(HTMLEntitiesToText(hrefs[0])) + outBuf.WriteString(">") + hrefs = hrefs[1:] + } + } else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) { + // parse link href + // add special handling for a tags + m := linkTagRE.FindStringSubmatch(tag) + if len(m) == 5 { + link := m[2] + if len(link) == 0 { + link = m[3] + if len(link) == 0 { + link = m[4] + } + } + + if opts.linksInnerText && !badLinkHrefRE.MatchString(link) { + hrefs = append(hrefs, link) + } + } + } else if badTagnamesRE.MatchString(tagNameLowercase) { + // unwanted block + badTagStackDepth++ + + // if link inner text preservation is not enabled + // and the current tag is a link tag, parse its href and output that + if !opts.linksInnerText { + // parse link href + m := linkTagRE.FindStringSubmatch(tag) + if len(m) == 5 { + link := m[2] + if len(link) == 0 { + link = m[3] + if len(link) == 0 { + link = m[4] + } + } + + if !badLinkHrefRE.MatchString(link) { + outBuf.WriteString(HTMLEntitiesToText(link)) + } + } + } + } else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' && + badTagnamesRE.MatchString(tagNameLowercase[1:]) { + // end of unwanted block + badTagStackDepth-- + } + continue + + } // switch end + + if shouldOutput && badTagStackDepth == 0 && !inEnt { + canPrintNewline = true + outBuf.WriteRune(r) + } + } + + return outBuf.String() +} |