summary refs log tree commit diff
path: root/vendor/github.com/k3a/html2text/html2text.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/k3a/html2text/html2text.go')
-rw-r--r-- vendor/github.com/k3a/html2text/html2text.go 333
1 files changed, 333 insertions, 0 deletions
diff --git a/vendor/github.com/k3a/html2text/html2text.go b/vendor/github.com/k3a/html2text/html2text.go
new file mode 100644
index 000000000..f79fbe395
--- /dev/null
+++ b/vendor/github.com/k3a/html2text/html2text.go
@@ -0,0 +1,333 @@
+package html2text
+
+import (
+ "bytes"
+ "regexp"
+ "strconv"
+ "strings"
+)
+
// Line break constants.
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks()).
const (
	WIN_LBR  = "\r\n"
	UNIX_LBR = "\n"
)

// legacyLBR is the line break used by the legacy HTML2Text entry point.
// It is switched by SetUnixLbr.
var legacyLBR = WIN_LBR

// badTagnamesRE matches the (lowercased) text of an opening head, script,
// style or a tag; the content of such elements is not written to the output.
var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

// linkTagRE matches the text of an <a> tag carrying an href attribute.
// Submatch 2 holds a single-quoted value, submatch 3 a double-quoted value
// and submatch 4 an unquoted value.
var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)

// badLinkHrefRE matches hrefs that must never be written to the output.
// URI schemes are case-insensitive and hrefs are taken from the tag in its
// original case, so the match has to ignore case as well.
var badLinkHrefRE = regexp.MustCompile(`(?i)javascript:`)

// headersRE matches the (lowercased) text of an opening or closing h1-h6 tag.
var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

// numericEntityRE matches the name of a numeric character reference, such as
// "#169" or "#xA9"; submatch 1 holds everything after the '#'.
var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
+
// options collects the conversion settings that Option values adjust.
type options struct {
	// lbr is the line break sequence written to the output.
	lbr string
	// linksInnerText keeps the inner text of links and appends the href in
	// angle brackets after it.
	linksInnerText bool
	// listPrefix is written after the line break that starts each <li> item;
	// an empty value writes the line break only.
	listPrefix string
}
+
+func newOptions() *options {
+ // apply defaults
+ return &options{
+ lbr: WIN_LBR,
+ }
+}
+
+// Option is a functional option
+type Option func(*options)
+
+// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
+func WithUnixLineBreaks() Option {
+ return func(o *options) {
+ o.lbr = UNIX_LBR
+ }
+}
+
+// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
+// Example: click news <http://bit.ly/2n4wXRs>
+func WithLinksInnerText() Option {
+ return func(o *options) {
+ o.linksInnerText = true
+ }
+}
+
+// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
+func WithListSupportPrefix(prefix string) Option {
+ return func(o *options) {
+ o.listPrefix = prefix
+ }
+}
+
+// WithListSupport formats <ul> and <li> lists with " - " prefix
+func WithListSupport() Option {
+ return WithListSupportPrefix(" - ")
+}
+
+func parseHTMLEntity(entName string) (string, bool) {
+ if r, ok := entity[entName]; ok {
+ return string(r), true
+ }
+
+ if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
+ var (
+ err error
+ n int64
+ digits = match[1]
+ )
+
+ if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
+ n, err = strconv.ParseInt(digits[1:], 16, 64)
+ } else {
+ n, err = strconv.ParseInt(digits, 10, 64)
+ }
+
+ if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
+ return string(rune(n)), true
+ }
+ }
+
+ return "", false
+}
+
+// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
+// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
+// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
+func SetUnixLbr(b bool) {
+ if b {
+ legacyLBR = UNIX_LBR
+ } else {
+ legacyLBR = WIN_LBR
+ }
+}
+
+// HTMLEntitiesToText decodes HTML entities inside a provided
+// string and returns decoded text
+func HTMLEntitiesToText(htmlEntsText string) string {
+ outBuf := bytes.NewBufferString("")
+ inEnt := false
+
+ for i, r := range htmlEntsText {
+ switch {
+ case r == ';' && inEnt:
+ inEnt = false
+ continue
+
+ case r == '&': //possible html entity
+ entName := ""
+ isEnt := false
+
+ // parse the entity name - max 10 chars
+ chars := 0
+ for _, er := range htmlEntsText[i+1:] {
+ if er == ';' {
+ isEnt = true
+ break
+ } else {
+ entName += string(er)
+ }
+
+ chars++
+ if chars == 10 {
+ break
+ }
+ }
+
+ if isEnt {
+ if ent, isEnt := parseHTMLEntity(entName); isEnt {
+ outBuf.WriteString(ent)
+ inEnt = true
+ continue
+ }
+ }
+ }
+
+ if !inEnt {
+ outBuf.WriteRune(r)
+ }
+ }
+
+ return outBuf.String()
+}
+
// writeSpace appends a single space to outBuf unless the buffer is empty or
// already ends with a space.
func writeSpace(outBuf *bytes.Buffer) {
	if outBuf.Len() == 0 {
		return
	}
	if bytes.HasSuffix(outBuf.Bytes(), []byte{' '}) {
		return
	}
	outBuf.WriteByte(' ')
}
+
+// HTML2Text converts html into a text form
+func HTML2Text(html string) string {
+ var opts []Option
+ if legacyLBR == UNIX_LBR {
+ opts = append(opts, WithUnixLineBreaks())
+ }
+ return HTML2TextWithOptions(html, opts...)
+}
+
+// HTML2TextWithOptions converts html into a text form with additional options
+func HTML2TextWithOptions(html string, reqOpts ...Option) string {
+ opts := newOptions()
+ for _, opt := range reqOpts {
+ opt(opts)
+ }
+
+ inLen := len(html)
+ tagStart := 0
+ inEnt := false
+ badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
+ shouldOutput := true
+ // maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
+ hrefs := []string{}
+ // new line cannot be printed at the beginning or
+ // for <p> after a new line created by previous <p></p>
+ canPrintNewline := false
+
+ outBuf := bytes.NewBufferString("")
+
+ for i, r := range html {
+ if inLen > 0 && i == inLen-1 {
+ // prevent new line at the end of the document
+ canPrintNewline = false
+ }
+
+ switch {
+ // skip new lines and spaces adding a single space if not there yet
+ case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
+ r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
+ if shouldOutput && badTagStackDepth == 0 && !inEnt {
+ //outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
+ writeSpace(outBuf)
+ }
+ continue
+
+ case r == ';' && inEnt: // end of html entity
+ inEnt = false
+ continue
+
+ case r == '&' && shouldOutput: // possible html entity
+ entName := ""
+ isEnt := false
+
+ // parse the entity name - max 10 chars
+ chars := 0
+ for _, er := range html[i+1:] {
+ if er == ';' {
+ isEnt = true
+ break
+ } else {
+ entName += string(er)
+ }
+
+ chars++
+ if chars == 10 {
+ break
+ }
+ }
+
+ if isEnt {
+ if ent, isEnt := parseHTMLEntity(entName); isEnt {
+ outBuf.WriteString(ent)
+ inEnt = true
+ continue
+ }
+ }
+
+ case r == '<': // start of a tag
+ tagStart = i + 1
+ shouldOutput = false
+ continue
+
+ case r == '>': // end of a tag
+ shouldOutput = true
+ tag := html[tagStart:i]
+ tagNameLowercase := strings.ToLower(tag)
+
+ if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
+ outBuf.WriteString(opts.lbr)
+ } else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
+ if opts.listPrefix != "" {
+ outBuf.WriteString(opts.lbr + opts.listPrefix)
+ } else {
+ outBuf.WriteString(opts.lbr)
+ }
+ } else if headersRE.MatchString(tagNameLowercase) {
+ if canPrintNewline {
+ outBuf.WriteString(opts.lbr + opts.lbr)
+ }
+ canPrintNewline = false
+ } else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
+ // new line
+ outBuf.WriteString(opts.lbr)
+ } else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
+ if canPrintNewline {
+ outBuf.WriteString(opts.lbr + opts.lbr)
+ }
+ canPrintNewline = false
+ } else if opts.linksInnerText && tagNameLowercase == "/a" {
+ // end of link
+ // links can be empty can happen if the link matches the badLinkHrefRE
+ if len(hrefs) > 0 {
+ outBuf.WriteString(" <")
+ outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
+ outBuf.WriteString(">")
+ hrefs = hrefs[1:]
+ }
+ } else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
+ // parse link href
+ // add special handling for a tags
+ m := linkTagRE.FindStringSubmatch(tag)
+ if len(m) == 5 {
+ link := m[2]
+ if len(link) == 0 {
+ link = m[3]
+ if len(link) == 0 {
+ link = m[4]
+ }
+ }
+
+ if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
+ hrefs = append(hrefs, link)
+ }
+ }
+ } else if badTagnamesRE.MatchString(tagNameLowercase) {
+ // unwanted block
+ badTagStackDepth++
+
+ // if link inner text preservation is not enabled
+ // and the current tag is a link tag, parse its href and output that
+ if !opts.linksInnerText {
+ // parse link href
+ m := linkTagRE.FindStringSubmatch(tag)
+ if len(m) == 5 {
+ link := m[2]
+ if len(link) == 0 {
+ link = m[3]
+ if len(link) == 0 {
+ link = m[4]
+ }
+ }
+
+ if !badLinkHrefRE.MatchString(link) {
+ outBuf.WriteString(HTMLEntitiesToText(link))
+ }
+ }
+ }
+ } else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
+ badTagnamesRE.MatchString(tagNameLowercase[1:]) {
+ // end of unwanted block
+ badTagStackDepth--
+ }
+ continue
+
+ } // switch end
+
+ if shouldOutput && badTagStackDepth == 0 && !inEnt {
+ canPrintNewline = true
+ outBuf.WriteRune(r)
+ }
+ }
+
+ return outBuf.String()
+}