summary refs log tree commit diff
path: root/vendor/github.com/rivo/uniseg/wordrules.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/rivo/uniseg/wordrules.go')
-rw-r--r-- vendor/github.com/rivo/uniseg/wordrules.go 282
1 files changed, 282 insertions, 0 deletions
diff --git a/vendor/github.com/rivo/uniseg/wordrules.go b/vendor/github.com/rivo/uniseg/wordrules.go
new file mode 100644
index 000000000..57a8c6831
--- /dev/null
+++ b/vendor/github.com/rivo/uniseg/wordrules.go
@@ -0,0 +1,282 @@
+package uniseg
+
+import "unicode/utf8"
+
+// The states of the word break parser. The named states are numbered 0 (wbAny) through 14 (wbEvenRI) by iota, so they never collide with the flag wbZWJBit (16) that is OR-ed onto a state.
+const (
+ wbAny = iota // Default state; also used as the wildcard state when the transition table is consulted.
+ wbCR
+ wbLF
+ wbNewline
+ wbWSegSpace
+ wbHebrewLetter
+ wbALetter
+ wbWB7 // Entered by transitionWordBreakState() via WB6; the table's WB7 entries start from here.
+ wbWB7c // Entered by transitionWordBreakState() via WB7b; the table's WB7c entry starts from here.
+ wbNumeric
+ wbWB11 // Entered by transitionWordBreakState() via WB12; the table's WB11 entry starts from here.
+ wbKatakana
+ wbExtendNumLet
+ wbOddRI // An odd number of consecutive Regional Indicators has been seen (see WB15 and WB16).
+ wbEvenRI // An even number of consecutive Regional Indicators has been seen (see WB15 and WB16).
+ wbZWJBit = 16 // This bit is set for any states followed by at least one zero-width joiner (see WB4 and WB3c).
+)
+
+// wbTransitions implements the word break parser's state transitions: given a state and the property of the next code point, it returns the new state, whether there is a word boundary between the previous and that code point, and the number of the deciding rule (the WB rule with its letter folded into the last digit, e.g. 30 = WB3, 31 = WB3a, 131 = WB13a, 9990 = WB999).
+// It's analogous to [grTransitions], see comments there for details.
+//
+// Unicode version 15.0.0.
+func wbTransitions(state, prop int) (newState int, wordBreak bool, rule int) {
+ switch uint64(state) | uint64(prop)<<32 { // Pack the state (low 32 bits) and the property (high 32 bits) into one switch key.
+ // WB3b.
+ case wbAny | prNewline<<32:
+ return wbNewline, true, 32
+ case wbAny | prCR<<32:
+ return wbCR, true, 32
+ case wbAny | prLF<<32:
+ return wbLF, true, 32
+
+ // WB3a.
+ case wbNewline | prAny<<32:
+ return wbAny, true, 31
+ case wbCR | prAny<<32:
+ return wbAny, true, 31
+ case wbLF | prAny<<32:
+ return wbAny, true, 31
+
+ // WB3.
+ case wbCR | prLF<<32:
+ return wbLF, false, 30
+
+ // WB3d.
+ case wbAny | prWSegSpace<<32:
+ return wbWSegSpace, true, 9990 // Entering the state from wbAny is a break by WB999; only the next entry is WB3d proper.
+ case wbWSegSpace | prWSegSpace<<32:
+ return wbWSegSpace, false, 34
+
+ // WB5.
+ case wbAny | prALetter<<32:
+ return wbALetter, true, 9990 // Entering from wbAny: break by WB999.
+ case wbAny | prHebrewLetter<<32:
+ return wbHebrewLetter, true, 9990 // Entering from wbAny: break by WB999.
+ case wbALetter | prALetter<<32:
+ return wbALetter, false, 50
+ case wbALetter | prHebrewLetter<<32:
+ return wbHebrewLetter, false, 50
+ case wbHebrewLetter | prALetter<<32:
+ return wbALetter, false, 50
+ case wbHebrewLetter | prHebrewLetter<<32:
+ return wbHebrewLetter, false, 50
+
+ // WB7. Transitions to wbWB7 handled by transitionWordBreakState().
+ case wbWB7 | prALetter<<32:
+ return wbALetter, false, 70
+ case wbWB7 | prHebrewLetter<<32:
+ return wbHebrewLetter, false, 70
+
+ // WB7a.
+ case wbHebrewLetter | prSingleQuote<<32:
+ return wbAny, false, 71
+
+ // WB7c. Transitions to wbWB7c handled by transitionWordBreakState().
+ case wbWB7c | prHebrewLetter<<32:
+ return wbHebrewLetter, false, 73
+
+ // WB8.
+ case wbAny | prNumeric<<32:
+ return wbNumeric, true, 9990 // Entering from wbAny: break by WB999.
+ case wbNumeric | prNumeric<<32:
+ return wbNumeric, false, 80
+
+ // WB9.
+ case wbALetter | prNumeric<<32:
+ return wbNumeric, false, 90
+ case wbHebrewLetter | prNumeric<<32:
+ return wbNumeric, false, 90
+
+ // WB10.
+ case wbNumeric | prALetter<<32:
+ return wbALetter, false, 100
+ case wbNumeric | prHebrewLetter<<32:
+ return wbHebrewLetter, false, 100
+
+ // WB11. Transitions to wbWB11 handled by transitionWordBreakState().
+ case wbWB11 | prNumeric<<32:
+ return wbNumeric, false, 110
+
+ // WB13.
+ case wbAny | prKatakana<<32:
+ return wbKatakana, true, 9990 // Entering from wbAny: break by WB999.
+ case wbKatakana | prKatakana<<32:
+ return wbKatakana, false, 130
+
+ // WB13a.
+ case wbAny | prExtendNumLet<<32:
+ return wbExtendNumLet, true, 9990 // Entering from wbAny: break by WB999.
+ case wbALetter | prExtendNumLet<<32:
+ return wbExtendNumLet, false, 131
+ case wbHebrewLetter | prExtendNumLet<<32:
+ return wbExtendNumLet, false, 131
+ case wbNumeric | prExtendNumLet<<32:
+ return wbExtendNumLet, false, 131
+ case wbKatakana | prExtendNumLet<<32:
+ return wbExtendNumLet, false, 131
+ case wbExtendNumLet | prExtendNumLet<<32:
+ return wbExtendNumLet, false, 131
+
+ // WB13b.
+ case wbExtendNumLet | prALetter<<32:
+ return wbALetter, false, 132
+ case wbExtendNumLet | prHebrewLetter<<32:
+ return wbHebrewLetter, false, 132
+ case wbExtendNumLet | prNumeric<<32:
+ return wbNumeric, false, 132
+ case wbExtendNumLet | prKatakana<<32:
+ return wbKatakana, false, 132
+
+ default: // No entry for this exact state/property pair.
+ return -1, false, -1 // Negative state and rule signal "not found" to the caller.
+ }
+}
+
+// transitionWordBreakState determines the new state of the word break parser
+// given the current state and the next code point. It also returns whether a
+// word boundary was detected. If more than one code point is needed to
+// determine the new state, the byte slice or the string starting after rune "r"
+// can be used for further lookups: "b" is read whenever it is non-nil, otherwise "str". A negative state (e.g. -1) is accepted and handled like an unknown state.
+func transitionWordBreakState(state int, r rune, b []byte, str string) (newState int, wordBreak bool) {
+ // Determine the property of the next character.
+ nextProperty := property(workBreakCodePoints, r) // "workBreakCodePoints" (sic) is declared outside this function.
+
+ // "Replacing Ignore Rules".
+ if nextProperty == prZWJ {
+ // WB4 (for zero-width joiners).
+ if state == wbNewline || state == wbCR || state == wbLF {
+ return wbAny | wbZWJBit, true // Make sure we don't apply WB4 to WB3a.
+ }
+ if state < 0 { // No valid previous state: start from wbAny.
+ return wbAny | wbZWJBit, false
+ }
+ return state | wbZWJBit, false // Keep the state and remember the ZWJ for WB3c.
+ } else if nextProperty == prExtend || nextProperty == prFormat {
+ // WB4 (for Extend and Format).
+ if state == wbNewline || state == wbCR || state == wbLF {
+ return wbAny, true // Make sure we don't apply WB4 to WB3a.
+ }
+ if state == wbWSegSpace || state == wbAny|wbZWJBit {
+ return wbAny, false // We don't break but this is also not WB3d or WB3c.
+ }
+ if state < 0 { // No valid previous state: start from wbAny.
+ return wbAny, false
+ }
+ return state, false // Extend/Format leave the state (including any ZWJ flag) unchanged and never break.
+ } else if nextProperty == prExtendedPictographic && state >= 0 && state&wbZWJBit != 0 {
+ // WB3c.
+ return wbAny, false
+ }
+ if state >= 0 { // The ZWJ flag only matters for the checks above; clear it before the table lookup.
+ state = state &^ wbZWJBit
+ }
+
+ // Find the applicable transition in the table.
+ var rule int
+ newState, wordBreak, rule = wbTransitions(state, nextProperty)
+ if newState < 0 {
+ // No specific transition found. Try the less specific ones.
+ anyPropState, anyPropWordBreak, anyPropRule := wbTransitions(state, prAny)
+ anyStateState, anyStateWordBreak, anyStateRule := wbTransitions(wbAny, nextProperty)
+ if anyPropState >= 0 && anyStateState >= 0 {
+ // Both apply. We'll use a mix (see comments for grTransitions).
+ newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
+ if anyPropRule < anyStateRule { // The lower-numbered rule decides the break; the new state still comes from the property lookup.
+ wordBreak, rule = anyPropWordBreak, anyPropRule
+ }
+ } else if anyPropState >= 0 {
+ // We only have a specific state.
+ newState, wordBreak, rule = anyPropState, anyPropWordBreak, anyPropRule
+ // NOTE(review): the upstream comment here said this branch is probably
+ // never reached because "okAnyState" is always true; no such name exists in
+ // this function (presumably anyStateState >= 0 was meant). Given the table, it looks
+ // reachable, e.g. for wbCR followed by prMidLetter (no wbAny entry). Kept either way.
+ } else if anyStateState >= 0 {
+ // We only have a specific property.
+ newState, wordBreak, rule = anyStateState, anyStateWordBreak, anyStateRule
+ } else {
+ // No known transition. WB999: Any ÷ Any.
+ newState, wordBreak, rule = wbAny, true, 9990
+ }
+ }
+
+ // For those rules that need to look up runes further in the string, we
+ // determine the property after nextProperty, skipping over Format, Extend,
+ // and ZWJ (according to WB4). It's -1 if not needed or if such a rune cannot
+ // be determined (because the text ends or the rune is faulty).
+ farProperty := -1
+ if rule > 60 && // Only when no rule numbered 60 or lower has already decided.
+ (state == wbALetter || state == wbHebrewLetter || state == wbNumeric) &&
+ (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote || // WB6.
+ nextProperty == prDoubleQuote || // WB7b.
+ nextProperty == prMidNum) { // WB12.
+ for {
+ var (
+ r rune
+ length int
+ )
+ if b != nil { // Byte slice version.
+ r, length = utf8.DecodeRune(b)
+ b = b[length:]
+ } else { // String version.
+ r, length = utf8.DecodeRuneInString(str)
+ str = str[length:]
+ }
+ if r == utf8.RuneError { // End of input or invalid UTF-8; a literal U+FFFD in the text also stops the lookahead.
+ break
+ }
+ prop := property(workBreakCodePoints, r)
+ if prop == prExtend || prop == prFormat || prop == prZWJ {
+ continue // WB4: ignore this rune and look at the next one.
+ }
+ farProperty = prop
+ break
+ }
+ }
+
+ // WB6.
+ if rule > 60 &&
+ (state == wbALetter || state == wbHebrewLetter) &&
+ (nextProperty == prMidLetter || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
+ (farProperty == prALetter || farProperty == prHebrewLetter) {
+ return wbWB7, false // The table's WB7 entries continue from wbWB7.
+ }
+
+ // WB7b.
+ if rule > 72 &&
+ state == wbHebrewLetter &&
+ nextProperty == prDoubleQuote &&
+ farProperty == prHebrewLetter {
+ return wbWB7c, false // The table's WB7c entry continues from wbWB7c.
+ }
+
+ // WB12.
+ if rule > 120 &&
+ state == wbNumeric &&
+ (nextProperty == prMidNum || nextProperty == prMidNumLet || nextProperty == prSingleQuote) &&
+ farProperty == prNumeric {
+ return wbWB11, false // The table's WB11 entry continues from wbWB11.
+ }
+
+ // WB15 and WB16.
+ if newState == wbAny && nextProperty == prRegionalIndicator {
+ if state != wbOddRI && state != wbEvenRI { // Includes state == -1.
+ // Transition into the first RI.
+ return wbOddRI, true
+ }
+ if state == wbOddRI {
+ // Don't break pairs of Regional Indicators.
+ return wbEvenRI, false
+ }
+ return wbOddRI, true // We can break after a pair.
+ }
+
+ return // Returns the named results newState and wordBreak as set by the table lookup above.
+}