summary refs log tree commit diff
path: root/vendor/github.com/k3a/html2text/html2text.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/k3a/html2text/html2text.go')
-rw-r--r-- vendor/github.com/k3a/html2text/html2text.go 333
1 files changed, 0 insertions, 333 deletions
diff --git a/vendor/github.com/k3a/html2text/html2text.go b/vendor/github.com/k3a/html2text/html2text.go
deleted file mode 100644
index f79fbe395..000000000
--- a/vendor/github.com/k3a/html2text/html2text.go
+++ /dev/null
@@ -1,333 +0,0 @@
-package html2text
-
-import (
- "bytes"
- "regexp"
- "strconv"
- "strings"
-)
-
// Line break constants.
//
// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreaks())
const (
	WIN_LBR  = "\r\n"
	UNIX_LBR = "\n"
)

// legacyLBR is the line break HTML2Text uses; SetUnixLbr switches it.
var legacyLBR = WIN_LBR

var (
	// badTagnamesRE matches the lower-cased contents of an opening tag whose
	// inner text is not copied to the output.
	badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`)

	// linkTagRE matches the contents of an <a ...> tag that has an href.
	// Submatches 2, 3 and 4 hold the single-quoted, double-quoted and unquoted
	// value respectively; at most one of them is non-empty.
	linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`)

	// badLinkHrefRE matches hrefs that must never be written to the output.
	// The scheme is matched case-insensitively because it is tested against
	// the href exactly as written in the document ("JavaScript:" included).
	badLinkHrefRE = regexp.MustCompile(`(?i)javascript:`)

	// headersRE matches lower-cased tag contents that start with h1..h6,
	// optionally preceded by the closing slash.
	headersRE = regexp.MustCompile(`^(\/)?h[1-6]`)

	// numericEntityRE matches a numeric entity name such as "#65" or "#x41";
	// submatch 1 is the digits including the optional leading x.
	numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`)
)
-
-type options struct {
- lbr string
- linksInnerText bool
- listPrefix string
-}
-
-func newOptions() *options {
- // apply defaults
- return &options{
- lbr: WIN_LBR,
- }
-}
-
-// Option is a functional option
-type Option func(*options)
-
-// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default)
-func WithUnixLineBreaks() Option {
- return func(o *options) {
- o.lbr = UNIX_LBR
- }
-}
-
-// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text
-// Example: click news <http://bit.ly/2n4wXRs>
-func WithLinksInnerText() Option {
- return func(o *options) {
- o.linksInnerText = true
- }
-}
-
-// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix
-func WithListSupportPrefix(prefix string) Option {
- return func(o *options) {
- o.listPrefix = prefix
- }
-}
-
-// WithListSupport formats <ul> and <li> lists with " - " prefix
-func WithListSupport() Option {
- return WithListSupportPrefix(" - ")
-}
-
-func parseHTMLEntity(entName string) (string, bool) {
- if r, ok := entity[entName]; ok {
- return string(r), true
- }
-
- if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 {
- var (
- err error
- n int64
- digits = match[1]
- )
-
- if digits != "" && (digits[0] == 'x' || digits[0] == 'X') {
- n, err = strconv.ParseInt(digits[1:], 16, 64)
- } else {
- n, err = strconv.ParseInt(digits, 10, 64)
- }
-
- if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) {
- return string(rune(n)), true
- }
- }
-
- return "", false
-}
-
-// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n")
-// with argument false sets Windows-style line-breaks in output ("\r\n", the default)
-// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak())
-func SetUnixLbr(b bool) {
- if b {
- legacyLBR = UNIX_LBR
- } else {
- legacyLBR = WIN_LBR
- }
-}
-
-// HTMLEntitiesToText decodes HTML entities inside a provided
-// string and returns decoded text
-func HTMLEntitiesToText(htmlEntsText string) string {
- outBuf := bytes.NewBufferString("")
- inEnt := false
-
- for i, r := range htmlEntsText {
- switch {
- case r == ';' && inEnt:
- inEnt = false
- continue
-
- case r == '&': //possible html entity
- entName := ""
- isEnt := false
-
- // parse the entity name - max 10 chars
- chars := 0
- for _, er := range htmlEntsText[i+1:] {
- if er == ';' {
- isEnt = true
- break
- } else {
- entName += string(er)
- }
-
- chars++
- if chars == 10 {
- break
- }
- }
-
- if isEnt {
- if ent, isEnt := parseHTMLEntity(entName); isEnt {
- outBuf.WriteString(ent)
- inEnt = true
- continue
- }
- }
- }
-
- if !inEnt {
- outBuf.WriteRune(r)
- }
- }
-
- return outBuf.String()
-}
-
// writeSpace appends a single space to outBuf unless the buffer is empty or
// already ends with a space.
func writeSpace(outBuf *bytes.Buffer) {
	written := outBuf.Len()
	if written == 0 {
		return
	}
	if outBuf.Bytes()[written-1] == ' ' {
		return
	}
	outBuf.WriteString(" ")
}
-
-// HTML2Text converts html into a text form
-func HTML2Text(html string) string {
- var opts []Option
- if legacyLBR == UNIX_LBR {
- opts = append(opts, WithUnixLineBreaks())
- }
- return HTML2TextWithOptions(html, opts...)
-}
-
-// HTML2TextWithOptions converts html into a text form with additional options
-func HTML2TextWithOptions(html string, reqOpts ...Option) string {
- opts := newOptions()
- for _, opt := range reqOpts {
- opt(opts)
- }
-
- inLen := len(html)
- tagStart := 0
- inEnt := false
- badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head>
- shouldOutput := true
- // maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only)
- hrefs := []string{}
- // new line cannot be printed at the beginning or
- // for <p> after a new line created by previous <p></p>
- canPrintNewline := false
-
- outBuf := bytes.NewBufferString("")
-
- for i, r := range html {
- if inLen > 0 && i == inLen-1 {
- // prevent new line at the end of the document
- canPrintNewline = false
- }
-
- switch {
- // skip new lines and spaces adding a single space if not there yet
- case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines
- r == ' ', r >= 0x2008 && r <= 0x200B: // spaces
- if shouldOutput && badTagStackDepth == 0 && !inEnt {
- //outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i]))
- writeSpace(outBuf)
- }
- continue
-
- case r == ';' && inEnt: // end of html entity
- inEnt = false
- continue
-
- case r == '&' && shouldOutput: // possible html entity
- entName := ""
- isEnt := false
-
- // parse the entity name - max 10 chars
- chars := 0
- for _, er := range html[i+1:] {
- if er == ';' {
- isEnt = true
- break
- } else {
- entName += string(er)
- }
-
- chars++
- if chars == 10 {
- break
- }
- }
-
- if isEnt {
- if ent, isEnt := parseHTMLEntity(entName); isEnt {
- outBuf.WriteString(ent)
- inEnt = true
- continue
- }
- }
-
- case r == '<': // start of a tag
- tagStart = i + 1
- shouldOutput = false
- continue
-
- case r == '>': // end of a tag
- shouldOutput = true
- tag := html[tagStart:i]
- tagNameLowercase := strings.ToLower(tag)
-
- if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" {
- outBuf.WriteString(opts.lbr)
- } else if tagNameLowercase == "li" || tagNameLowercase == "li/" {
- if opts.listPrefix != "" {
- outBuf.WriteString(opts.lbr + opts.listPrefix)
- } else {
- outBuf.WriteString(opts.lbr)
- }
- } else if headersRE.MatchString(tagNameLowercase) {
- if canPrintNewline {
- outBuf.WriteString(opts.lbr + opts.lbr)
- }
- canPrintNewline = false
- } else if tagNameLowercase == "br" || tagNameLowercase == "br/" {
- // new line
- outBuf.WriteString(opts.lbr)
- } else if tagNameLowercase == "p" || tagNameLowercase == "/p" {
- if canPrintNewline {
- outBuf.WriteString(opts.lbr + opts.lbr)
- }
- canPrintNewline = false
- } else if opts.linksInnerText && tagNameLowercase == "/a" {
- // end of link
- // links can be empty can happen if the link matches the badLinkHrefRE
- if len(hrefs) > 0 {
- outBuf.WriteString(" <")
- outBuf.WriteString(HTMLEntitiesToText(hrefs[0]))
- outBuf.WriteString(">")
- hrefs = hrefs[1:]
- }
- } else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) {
- // parse link href
- // add special handling for a tags
- m := linkTagRE.FindStringSubmatch(tag)
- if len(m) == 5 {
- link := m[2]
- if len(link) == 0 {
- link = m[3]
- if len(link) == 0 {
- link = m[4]
- }
- }
-
- if opts.linksInnerText && !badLinkHrefRE.MatchString(link) {
- hrefs = append(hrefs, link)
- }
- }
- } else if badTagnamesRE.MatchString(tagNameLowercase) {
- // unwanted block
- badTagStackDepth++
-
- // if link inner text preservation is not enabled
- // and the current tag is a link tag, parse its href and output that
- if !opts.linksInnerText {
- // parse link href
- m := linkTagRE.FindStringSubmatch(tag)
- if len(m) == 5 {
- link := m[2]
- if len(link) == 0 {
- link = m[3]
- if len(link) == 0 {
- link = m[4]
- }
- }
-
- if !badLinkHrefRE.MatchString(link) {
- outBuf.WriteString(HTMLEntitiesToText(link))
- }
- }
- }
- } else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' &&
- badTagnamesRE.MatchString(tagNameLowercase[1:]) {
- // end of unwanted block
- badTagStackDepth--
- }
- continue
-
- } // switch end
-
- if shouldOutput && badTagStackDepth == 0 && !inEnt {
- canPrintNewline = true
- outBuf.WriteRune(r)
- }
- }
-
- return outBuf.String()
-}