diff options
Diffstat (limited to 'vendor/github.com/k3a/html2text/html2text.go')
-rw-r--r-- | vendor/github.com/k3a/html2text/html2text.go | 333 |
1 files changed, 0 insertions, 333 deletions
diff --git a/vendor/github.com/k3a/html2text/html2text.go b/vendor/github.com/k3a/html2text/html2text.go deleted file mode 100644 index f79fbe395..000000000 --- a/vendor/github.com/k3a/html2text/html2text.go +++ /dev/null @@ -1,333 +0,0 @@ -package html2text - -import ( - "bytes" - "regexp" - "strconv" - "strings" -) - -// Line break constants -// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak()) -const ( - WIN_LBR = "\r\n" - UNIX_LBR = "\n" -) - -var legacyLBR = WIN_LBR -var badTagnamesRE = regexp.MustCompile(`^(head|script|style|a)($|\s+)`) -var linkTagRE = regexp.MustCompile(`^(?i:a)(?:$|\s).*(?i:href)\s*=\s*('([^']*?)'|"([^"]*?)"|([^\s"'` + "`" + `=<>]+))`) -var badLinkHrefRE = regexp.MustCompile(`javascript:`) -var headersRE = regexp.MustCompile(`^(\/)?h[1-6]`) -var numericEntityRE = regexp.MustCompile(`(?i)^#(x?[a-f0-9]+)$`) - -type options struct { - lbr string - linksInnerText bool - listPrefix string -} - -func newOptions() *options { - // apply defaults - return &options{ - lbr: WIN_LBR, - } -} - -// Option is a functional option -type Option func(*options) - -// WithUnixLineBreaks instructs the converter to use unix line breaks ("\n" instead of "\r\n" default) -func WithUnixLineBreaks() Option { - return func(o *options) { - o.lbr = UNIX_LBR - } -} - -// WithLinksInnerText instructs the converter to retain link tag inner text and append href URLs in angle brackets after the text -// Example: click news <http://bit.ly/2n4wXRs> -func WithLinksInnerText() Option { - return func(o *options) { - o.linksInnerText = true - } -} - -// WithListSupportPrefix formats <ul> and <li> lists with the specified prefix -func WithListSupportPrefix(prefix string) Option { - return func(o *options) { - o.listPrefix = prefix - } -} - -// WithListSupport formats <ul> and <li> lists with " - " prefix -func WithListSupport() Option { - return WithListSupportPrefix(" - ") -} - -func parseHTMLEntity(entName string) (string, bool) { - if r, ok := entity[entName]; ok { - return string(r), true - } - - if match := numericEntityRE.FindStringSubmatch(entName); len(match) == 2 { - var ( - err error - n int64 - digits = match[1] - ) - - if digits != "" && (digits[0] == 'x' || digits[0] == 'X') { - n, err = strconv.ParseInt(digits[1:], 16, 64) - } else { - n, err = strconv.ParseInt(digits, 10, 64) - } - - if err == nil && (n == 9 || n == 10 || n == 13 || n > 31) { - return string(rune(n)), true - } - } - - return "", false -} - -// SetUnixLbr with argument true sets Unix-style line-breaks in output ("\n") -// with argument false sets Windows-style line-breaks in output ("\r\n", the default) -// Deprecated: Please use HTML2TextWithOptions(text, WithUnixLineBreak()) -func SetUnixLbr(b bool) { - if b { - legacyLBR = UNIX_LBR - } else { - legacyLBR = WIN_LBR - } -} - -// HTMLEntitiesToText decodes HTML entities inside a provided -// string and returns decoded text -func HTMLEntitiesToText(htmlEntsText string) string { - outBuf := bytes.NewBufferString("") - inEnt := false - - for i, r := range htmlEntsText { - switch { - case r == ';' && inEnt: - inEnt = false - continue - - case r == '&': //possible html entity - entName := "" - isEnt := false - - // parse the entity name - max 10 chars - chars := 0 - for _, er := range htmlEntsText[i+1:] { - if er == ';' { - isEnt = true - break - } else { - entName += string(er) - } - - chars++ - if chars == 10 { - break - } - } - - if isEnt { - if ent, isEnt := parseHTMLEntity(entName); isEnt { - outBuf.WriteString(ent) - inEnt = true - continue - } - } - } - - if !inEnt { - outBuf.WriteRune(r) - } - } - - return outBuf.String() -} - -func writeSpace(outBuf *bytes.Buffer) { - bts := outBuf.Bytes() - if len(bts) > 0 && bts[len(bts)-1] != ' ' { - outBuf.WriteString(" ") - } -} - -// HTML2Text converts html into a text form -func HTML2Text(html string) string { - var opts []Option - if legacyLBR == UNIX_LBR { - opts = append(opts, WithUnixLineBreaks()) - } - return HTML2TextWithOptions(html, opts...) -} - -// HTML2TextWithOptions converts html into a text form with additional options -func HTML2TextWithOptions(html string, reqOpts ...Option) string { - opts := newOptions() - for _, opt := range reqOpts { - opt(opts) - } - - inLen := len(html) - tagStart := 0 - inEnt := false - badTagStackDepth := 0 // if == 1 it means we are inside <head>...</head> - shouldOutput := true - // maintain a stack of <a> tag href links and output it after the tag's inner text (for opts.linksInnerText only) - hrefs := []string{} - // new line cannot be printed at the beginning or - // for <p> after a new line created by previous <p></p> - canPrintNewline := false - - outBuf := bytes.NewBufferString("") - - for i, r := range html { - if inLen > 0 && i == inLen-1 { - // prevent new line at the end of the document - canPrintNewline = false - } - - switch { - // skip new lines and spaces adding a single space if not there yet - case r <= 0xD, r == 0x85, r == 0x2028, r == 0x2029, // new lines - r == ' ', r >= 0x2008 && r <= 0x200B: // spaces - if shouldOutput && badTagStackDepth == 0 && !inEnt { - //outBuf.WriteString(fmt.Sprintf("{DBG r:%c, inEnt:%t, tag:%s}", r, inEnt, html[tagStart:i])) - writeSpace(outBuf) - } - continue - - case r == ';' && inEnt: // end of html entity - inEnt = false - continue - - case r == '&' && shouldOutput: // possible html entity - entName := "" - isEnt := false - - // parse the entity name - max 10 chars - chars := 0 - for _, er := range html[i+1:] { - if er == ';' { - isEnt = true - break - } else { - entName += string(er) - } - - chars++ - if chars == 10 { - break - } - } - - if isEnt { - if ent, isEnt := parseHTMLEntity(entName); isEnt { - outBuf.WriteString(ent) - inEnt = true - continue - } - } - - case r == '<': // start of a tag - tagStart = i + 1 - shouldOutput = false - continue - - case r == '>': // end of a tag - shouldOutput = true - tag := html[tagStart:i] - tagNameLowercase := strings.ToLower(tag) - - if tagNameLowercase == "/ul" || tagNameLowercase == "/ol" { - outBuf.WriteString(opts.lbr) - } else if tagNameLowercase == "li" || tagNameLowercase == "li/" { - if opts.listPrefix != "" { - outBuf.WriteString(opts.lbr + opts.listPrefix) - } else { - outBuf.WriteString(opts.lbr) - } - } else if headersRE.MatchString(tagNameLowercase) { - if canPrintNewline { - outBuf.WriteString(opts.lbr + opts.lbr) - } - canPrintNewline = false - } else if tagNameLowercase == "br" || tagNameLowercase == "br/" { - // new line - outBuf.WriteString(opts.lbr) - } else if tagNameLowercase == "p" || tagNameLowercase == "/p" { - if canPrintNewline { - outBuf.WriteString(opts.lbr + opts.lbr) - } - canPrintNewline = false - } else if opts.linksInnerText && tagNameLowercase == "/a" { - // end of link - // links can be empty can happen if the link matches the badLinkHrefRE - if len(hrefs) > 0 { - outBuf.WriteString(" <") - outBuf.WriteString(HTMLEntitiesToText(hrefs[0])) - outBuf.WriteString(">") - hrefs = hrefs[1:] - } - } else if opts.linksInnerText && linkTagRE.MatchString(tagNameLowercase) { - // parse link href - // add special handling for a tags - m := linkTagRE.FindStringSubmatch(tag) - if len(m) == 5 { - link := m[2] - if len(link) == 0 { - link = m[3] - if len(link) == 0 { - link = m[4] - } - } - - if opts.linksInnerText && !badLinkHrefRE.MatchString(link) { - hrefs = append(hrefs, link) - } - } - } else if badTagnamesRE.MatchString(tagNameLowercase) { - // unwanted block - badTagStackDepth++ - - // if link inner text preservation is not enabled - // and the current tag is a link tag, parse its href and output that - if !opts.linksInnerText { - // parse link href - m := linkTagRE.FindStringSubmatch(tag) - if len(m) == 5 { - link := m[2] - if len(link) == 0 { - link = m[3] - if len(link) == 0 { - link = m[4] - } - } - - if !badLinkHrefRE.MatchString(link) { - outBuf.WriteString(HTMLEntitiesToText(link)) - } - } - } - } else if len(tagNameLowercase) > 0 && tagNameLowercase[0] == '/' && - badTagnamesRE.MatchString(tagNameLowercase[1:]) { - // end of unwanted block - badTagStackDepth-- - } - continue - - } // switch end - - if shouldOutput && badTagStackDepth == 0 && !inEnt { - canPrintNewline = true - outBuf.WriteRune(r) - } - } - - return outBuf.String() -} |