diff options
Diffstat (limited to 'vendor/github.com/rivo/uniseg/sentencerules.go')
-rw-r--r-- | vendor/github.com/rivo/uniseg/sentencerules.go | 276 |
1 files changed, 276 insertions, 0 deletions
package uniseg

import "unicode/utf8"

// The states of the sentence break parser. Each state records which part of a
// sentence break rule has been matched so far (as driven by sbTransitions).
const (
	sbAny       = iota // Default state; no partial rule matched.
	sbCR               // After CR (entered from sbAny).
	sbParaSep          // After Sep or LF (or CR LF); the next code point breaks (SB4).
	sbATerm            // After ATerm.
	sbUpper            // After Upper (entered from sbAny or sbSB7).
	sbLower            // After Lower.
	sbSB7              // After (Upper | Lower) ATerm.
	sbSB8Close         // After ATerm Close*.
	sbSB8Sp            // After ATerm Close* Sp*.
	sbSTerm            // After STerm.
	sbSB8aClose        // After STerm Close*.
	sbSB8aSp           // After STerm Close* Sp*.
)

// sbTransitions implements the sentence break parser's state transitions. It's
// analogous to [grTransitions], see comments there for details.
//
// The lookup key packs the current state into the lower 32 bits and the
// property of the next code point into the upper 32 bits. For a matching
// (state, property) pair, the function returns the new parser state, whether a
// sentence boundary precedes the code point, and the number of the rule that
// produced the transition (the SBx rule number scaled by ten, e.g. 30 for SB3,
// 81 for SB8a, 9990 for the catch-all). If no case matches, newState and rule
// are both -1.
//
// Unicode version 15.0.0.
func sbTransitions(state, prop int) (newState int, sentenceBreak bool, rule int) {
	switch uint64(state) | uint64(prop)<<32 {
	// SB3.
	case sbAny | prCR<<32:
		return sbCR, false, 9990
	case sbCR | prLF<<32:
		return sbParaSep, false, 30

	// SB4.
	case sbAny | prSep<<32:
		return sbParaSep, false, 9990
	case sbAny | prLF<<32:
		return sbParaSep, false, 9990
	case sbParaSep | prAny<<32:
		return sbAny, true, 40
	case sbCR | prAny<<32:
		return sbAny, true, 40

	// SB6.
	case sbAny | prATerm<<32:
		return sbATerm, false, 9990
	case sbATerm | prNumeric<<32:
		return sbAny, false, 60
	case sbSB7 | prNumeric<<32:
		return sbAny, false, 60 // Because ATerm also appears in SB7.

	// SB7.
	case sbAny | prUpper<<32:
		return sbUpper, false, 9990
	case sbAny | prLower<<32:
		return sbLower, false, 9990
	case sbUpper | prATerm<<32:
		return sbSB7, false, 70
	case sbLower | prATerm<<32:
		return sbSB7, false, 70
	case sbSB7 | prUpper<<32:
		return sbUpper, false, 70

	// SB8a. Every "after a terminator" state gets the same three entries:
	// SContinue, ATerm, and STerm never start a new sentence here.
	case sbAny | prSTerm<<32:
		return sbSTerm, false, 9990
	case sbATerm | prSContinue<<32:
		return sbAny, false, 81
	case sbATerm | prATerm<<32:
		return sbATerm, false, 81
	case sbATerm | prSTerm<<32:
		return sbSTerm, false, 81
	case sbSB7 | prSContinue<<32:
		return sbAny, false, 81
	case sbSB7 | prATerm<<32:
		return sbATerm, false, 81
	case sbSB7 | prSTerm<<32:
		return sbSTerm, false, 81
	case sbSB8Close | prSContinue<<32:
		return sbAny, false, 81
	case sbSB8Close | prATerm<<32:
		return sbATerm, false, 81
	case sbSB8Close | prSTerm<<32:
		return sbSTerm, false, 81
	case sbSB8Sp | prSContinue<<32:
		return sbAny, false, 81
	case sbSB8Sp | prATerm<<32:
		return sbATerm, false, 81
	case sbSB8Sp | prSTerm<<32:
		return sbSTerm, false, 81
	case sbSTerm | prSContinue<<32:
		return sbAny, false, 81
	case sbSTerm | prATerm<<32:
		return sbATerm, false, 81
	case sbSTerm | prSTerm<<32:
		return sbSTerm, false, 81
	case sbSB8aClose | prSContinue<<32:
		return sbAny, false, 81
	case sbSB8aClose | prATerm<<32:
		return sbATerm, false, 81
	case sbSB8aClose | prSTerm<<32:
		return sbSTerm, false, 81
	case sbSB8aSp | prSContinue<<32:
		return sbAny, false, 81
	case sbSB8aSp | prATerm<<32:
		return sbATerm, false, 81
	case sbSB8aSp | prSTerm<<32:
		return sbSTerm, false, 81

	// SB9.
	//
	// NOTE(review): the prCR entries below (and in SB10) move to sbParaSep
	// rather than sbCR. A following LF then has no sbParaSep-specific case and
	// appears to be resolved by the fallback in transitionSentenceBreakState
	// via the SB4 "sbParaSep | prAny" entry, which reports a break, i.e. a
	// boundary between CR and LF. TODO confirm whether this conflicts with
	// SB3 (assumes prAny is distinct from prLF; the constants are declared
	// elsewhere).
	case sbATerm | prClose<<32:
		return sbSB8Close, false, 90
	case sbSB7 | prClose<<32:
		return sbSB8Close, false, 90
	case sbSB8Close | prClose<<32:
		return sbSB8Close, false, 90
	case sbATerm | prSp<<32:
		return sbSB8Sp, false, 90
	case sbSB7 | prSp<<32:
		return sbSB8Sp, false, 90
	case sbSB8Close | prSp<<32:
		return sbSB8Sp, false, 90
	case sbSTerm | prClose<<32:
		return sbSB8aClose, false, 90
	case sbSB8aClose | prClose<<32:
		return sbSB8aClose, false, 90
	case sbSTerm | prSp<<32:
		return sbSB8aSp, false, 90
	case sbSB8aClose | prSp<<32:
		return sbSB8aSp, false, 90
	case sbATerm | prSep<<32:
		return sbParaSep, false, 90
	case sbATerm | prCR<<32:
		return sbParaSep, false, 90
	case sbATerm | prLF<<32:
		return sbParaSep, false, 90
	case sbSB7 | prSep<<32:
		return sbParaSep, false, 90
	case sbSB7 | prCR<<32:
		return sbParaSep, false, 90
	case sbSB7 | prLF<<32:
		return sbParaSep, false, 90
	case sbSB8Close | prSep<<32:
		return sbParaSep, false, 90
	case sbSB8Close | prCR<<32:
		return sbParaSep, false, 90
	case sbSB8Close | prLF<<32:
		return sbParaSep, false, 90
	case sbSTerm | prSep<<32:
		return sbParaSep, false, 90
	case sbSTerm | prCR<<32:
		return sbParaSep, false, 90
	case sbSTerm | prLF<<32:
		return sbParaSep, false, 90
	case sbSB8aClose | prSep<<32:
		return sbParaSep, false, 90
	case sbSB8aClose | prCR<<32:
		return sbParaSep, false, 90
	case sbSB8aClose | prLF<<32:
		return sbParaSep, false, 90

	// SB10.
	case sbSB8Sp | prSp<<32:
		return sbSB8Sp, false, 100
	case sbSB8aSp | prSp<<32:
		return sbSB8aSp, false, 100
	case sbSB8Sp | prSep<<32:
		return sbParaSep, false, 100
	case sbSB8Sp | prCR<<32:
		return sbParaSep, false, 100
	case sbSB8Sp | prLF<<32:
		return sbParaSep, false, 100

	// SB11.
	case sbATerm | prAny<<32:
		return sbAny, true, 110
	case sbSB7 | prAny<<32:
		return sbAny, true, 110
	case sbSB8Close | prAny<<32:
		return sbAny, true, 110
	case sbSB8Sp | prAny<<32:
		return sbAny, true, 110
	case sbSTerm | prAny<<32:
		return sbAny, true, 110
	case sbSB8aClose | prAny<<32:
		return sbAny, true, 110
	case sbSB8aSp | prAny<<32:
		return sbAny, true, 110
	// We'll always break after ParaSep due to SB4.

	default:
		// No transition is defined for this (state, property) pair.
		return -1, false, -1
	}
}

// transitionSentenceBreakState determines the new state of the sentence break
// parser given the current state and the next code point. It also returns
// whether a sentence boundary was detected. If more than one code point is
// needed to determine the new state, the byte slice or the string starting
// after rune "r" can be used (whichever is not nil or empty) for further
// lookups.
//
// A negative state is treated as "start of text" (see the SB1 handling below).
func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
	// Determine the property of the next character.
	nextProperty := property(sentenceBreakCodePoints, r)

	// SB5 (Replacing Ignore Rules). Extend and Format code points leave the
	// state untouched so that the surrounding rules see through them.
	if nextProperty == prExtend || nextProperty == prFormat {
		if state == sbParaSep || state == sbCR {
			return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
		}
		if state < 0 {
			return sbAny, true // SB1.
		}
		return state, false
	}

	// Find the applicable transition in the table.
	var rule int
	newState, sentenceBreak, rule = sbTransitions(state, nextProperty)
	if newState < 0 {
		// No specific transition found. Try the less specific ones: the
		// current state with any property, and any state with this property.
		anyPropState, anyPropProp, anyPropRule := sbTransitions(state, prAny)
		anyStateState, anyStateProp, anyStateRule := sbTransitions(sbAny, nextProperty)
		if anyPropState >= 0 && anyStateState >= 0 {
			// Both apply. We'll use a mix (see comments for grTransitions):
			// the new state comes from the property-specific transition, the
			// break decision from whichever rule has the lower number.
			newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
			if anyPropRule < anyStateRule {
				sentenceBreak, rule = anyPropProp, anyPropRule
			}
		} else if anyPropState >= 0 {
			// We only have a specific state.
			newState, sentenceBreak, rule = anyPropState, anyPropProp, anyPropRule
			// NOTE(review): an earlier version of this comment said this
			// branch would probably never be reached because a transition
			// from sbAny always exists. sbTransitions has no sbAny case for
			// several properties (e.g. prNumeric, prClose, prSp), so this
			// branch looks reachable, e.g. for sbSTerm followed by a Numeric
			// code point. Confirm before relying on either assumption.
		} else if anyStateState >= 0 {
			// We only have a specific property.
			newState, sentenceBreak, rule = anyStateState, anyStateProp, anyStateRule
		} else {
			// No known transition. SB999: Any × Any.
			newState, sentenceBreak, rule = sbAny, false, 9990
		}
	}

	// SB8. Only relevant if no rule up to and including SB8 itself has been
	// applied (rule > 80) and we are somewhere after an ATerm.
	if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
		// Check the right side of the rule: look ahead, skipping every code
		// point that is none of OLetter, Upper, Lower, Sep, CR, LF, ATerm, or
		// STerm.
		var length int
		for nextProperty != prOLetter &&
			nextProperty != prUpper &&
			nextProperty != prLower &&
			nextProperty != prSep &&
			nextProperty != prCR &&
			nextProperty != prLF &&
			nextProperty != prATerm &&
			nextProperty != prSTerm {
			// Move on to the next rune.
			if b != nil { // Byte slice version.
				r, length = utf8.DecodeRune(b)
				b = b[length:]
			} else { // String version.
				r, length = utf8.DecodeRuneInString(str)
				str = str[length:]
			}
			if r == utf8.RuneError {
				// End of input or invalid encoding (a literal U+FFFD in the
				// input stops the lookahead as well).
				break
			}
			nextProperty = property(sentenceBreakCodePoints, r)
		}
		if nextProperty == prLower {
			// The lookahead ended on a Lower code point: no break.
			return sbLower, false
		}
	}

	// Return the named results determined by the table lookup above.
	return
}