diff options
Diffstat (limited to 'vendor/github.com/rivo/uniseg/grapheme.go')
-rw-r--r-- | vendor/github.com/rivo/uniseg/grapheme.go | 331 |
1 files changed, 331 insertions, 0 deletions
diff --git a/vendor/github.com/rivo/uniseg/grapheme.go b/vendor/github.com/rivo/uniseg/grapheme.go new file mode 100644 index 000000000..b12403d43 --- /dev/null +++ b/vendor/github.com/rivo/uniseg/grapheme.go @@ -0,0 +1,331 @@ +package uniseg + +import "unicode/utf8" + +// Graphemes implements an iterator over Unicode grapheme clusters, or +// user-perceived characters. While iterating, it also provides information +// about word boundaries, sentence boundaries, line breaks, and monospace +// character widths. +// +// After constructing the class via [NewGraphemes] for a given string "str", +// [Graphemes.Next] is called for every grapheme cluster in a loop until it +// returns false. Inside the loop, information about the grapheme cluster as +// well as boundary information and character width is available via the various +// methods (see examples below). +// +// This class basically wraps the [StepString] parser and provides a convenient +// interface to it. If you are only interested in some parts of this package's +// functionality, using the specialized functions starting with "First" is +// almost always faster. +type Graphemes struct { + // The original string. + original string + + // The remaining string to be parsed. + remaining string + + // The current grapheme cluster. + cluster string + + // The byte offset of the current grapheme cluster relative to the original + // string. + offset int + + // The current boundary information of the [Step] parser. + boundaries int + + // The current state of the [Step] parser. + state int +} + +// NewGraphemes returns a new grapheme cluster iterator. +func NewGraphemes(str string) *Graphemes { + return &Graphemes{ + original: str, + remaining: str, + state: -1, + } +} + +// Next advances the iterator by one grapheme cluster and returns false if no +// clusters are left. This function must be called before the first cluster is +// accessed. +func (g *Graphemes) Next() bool { + if len(g.remaining) == 0 { + // We're already past the end. + g.state = -2 + g.cluster = "" + return false + } + g.offset += len(g.cluster) + g.cluster, g.remaining, g.boundaries, g.state = StepString(g.remaining, g.state) + return true +} + +// Runes returns a slice of runes (code points) which corresponds to the current +// grapheme cluster. If the iterator is already past the end or [Graphemes.Next] +// has not yet been called, nil is returned. +func (g *Graphemes) Runes() []rune { + if g.state < 0 { + return nil + } + return []rune(g.cluster) +} + +// Str returns a substring of the original string which corresponds to the +// current grapheme cluster. If the iterator is already past the end or +// [Graphemes.Next] has not yet been called, an empty string is returned. +func (g *Graphemes) Str() string { + return g.cluster +} + +// Bytes returns a byte slice which corresponds to the current grapheme cluster. +// If the iterator is already past the end or [Graphemes.Next] has not yet been +// called, nil is returned. +func (g *Graphemes) Bytes() []byte { + if g.state < 0 { + return nil + } + return []byte(g.cluster) +} + +// Positions returns the interval of the current grapheme cluster as byte +// positions into the original string. The first returned value "from" indexes +// the first byte and the second returned value "to" indexes the first byte that +// is not included anymore, i.e. str[from:to] is the current grapheme cluster of +// the original string "str". If [Graphemes.Next] has not yet been called, both +// values are 0. If the iterator is already past the end, both values are 1. +func (g *Graphemes) Positions() (int, int) { + if g.state == -1 { + return 0, 0 + } else if g.state == -2 { + return 1, 1 + } + return g.offset, g.offset + len(g.cluster) +} + +// IsWordBoundary returns true if a word ends after the current grapheme +// cluster. +func (g *Graphemes) IsWordBoundary() bool { + if g.state < 0 { + return true + } + return g.boundaries&MaskWord != 0 +} + +// IsSentenceBoundary returns true if a sentence ends after the current +// grapheme cluster. +func (g *Graphemes) IsSentenceBoundary() bool { + if g.state < 0 { + return true + } + return g.boundaries&MaskSentence != 0 +} + +// LineBreak returns whether the line can be broken after the current grapheme +// cluster. A value of [LineDontBreak] means the line may not be broken, a value +// of [LineMustBreak] means the line must be broken, and a value of +// [LineCanBreak] means the line may or may not be broken. +func (g *Graphemes) LineBreak() int { + if g.state == -1 { + return LineDontBreak + } + if g.state == -2 { + return LineMustBreak + } + return g.boundaries & MaskLine +} + +// Width returns the monospace width of the current grapheme cluster. +func (g *Graphemes) Width() int { + if g.state < 0 { + return 0 + } + return g.boundaries >> ShiftWidth +} + +// Reset puts the iterator into its initial state such that the next call to +// [Graphemes.Next] sets it to the first grapheme cluster again. +func (g *Graphemes) Reset() { + g.state = -1 + g.offset = 0 + g.cluster = "" + g.remaining = g.original +} + +// GraphemeClusterCount returns the number of user-perceived characters +// (grapheme clusters) for the given string. +func GraphemeClusterCount(s string) (n int) { + state := -1 + for len(s) > 0 { + _, s, _, state = FirstGraphemeClusterInString(s, state) + n++ + } + return +} + +// ReverseString reverses the given string while observing grapheme cluster +// boundaries. +func ReverseString(s string) string { + str := []byte(s) + reversed := make([]byte, len(str)) + state := -1 + index := len(str) + for len(str) > 0 { + var cluster []byte + cluster, str, _, state = FirstGraphemeCluster(str, state) + index -= len(cluster) + copy(reversed[index:], cluster) + if index <= len(str)/2 { + break + } + } + return string(reversed) +} + +// The number of bits the grapheme property must be shifted to make place for +// grapheme states. +const shiftGraphemePropState = 4 + +// FirstGraphemeCluster returns the first grapheme cluster found in the given +// byte slice according to the rules of [Unicode Standard Annex #29, Grapheme +// Cluster Boundaries]. This function can be called continuously to extract all +// grapheme clusters from a byte slice, as illustrated in the example below. +// +// If you don't know the current state, for example when calling the function +// for the first time, you must pass -1. For consecutive calls, pass the state +// and rest slice returned by the previous call. +// +// The "rest" slice is the sub-slice of the original byte slice "b" starting +// after the last byte of the identified grapheme cluster. If the length of the +// "rest" slice is 0, the entire byte slice "b" has been processed. The +// "cluster" byte slice is the sub-slice of the input slice containing the +// identified grapheme cluster. +// +// The returned width is the width of the grapheme cluster for most monospace +// fonts where a value of 1 represents one character cell. +// +// Given an empty byte slice "b", the function returns nil values. +// +// While slightly less convenient than using the Graphemes class, this function +// has much better performance and makes no allocations. It lends itself well to +// large byte slices. +// +// [Unicode Standard Annex #29, Grapheme Cluster Boundaries]: http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries +func FirstGraphemeCluster(b []byte, state int) (cluster, rest []byte, width, newState int) { + // An empty byte slice returns nothing. + if len(b) == 0 { + return + } + + // Extract the first rune. + r, length := utf8.DecodeRune(b) + if len(b) <= length { // If we're already past the end, there is nothing else to parse. + var prop int + if state < 0 { + prop = propertyGraphemes(r) + } else { + prop = state >> shiftGraphemePropState + } + return b, nil, runeWidth(r, prop), grAny | (prop << shiftGraphemePropState) + } + + // If we don't know the state, determine it now. + var firstProp int + if state < 0 { + state, firstProp, _ = transitionGraphemeState(state, r) + } else { + firstProp = state >> shiftGraphemePropState + } + width += runeWidth(r, firstProp) + + // Transition until we find a boundary. + for { + var ( + prop int + boundary bool + ) + + r, l := utf8.DecodeRune(b[length:]) + state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r) + + if boundary { + return b[:length], b[length:], width, state | (prop << shiftGraphemePropState) + } + + if firstProp == prExtendedPictographic { + if r == vs15 { + width = 1 + } else if r == vs16 { + width = 2 + } + } else if firstProp != prRegionalIndicator && firstProp != prL { + width += runeWidth(r, prop) + } + + length += l + if len(b) <= length { + return b, nil, width, grAny | (prop << shiftGraphemePropState) + } + } +} + +// FirstGraphemeClusterInString is like [FirstGraphemeCluster] but its input and +// outputs are strings. +func FirstGraphemeClusterInString(str string, state int) (cluster, rest string, width, newState int) { + // An empty string returns nothing. + if len(str) == 0 { + return + } + + // Extract the first rune. + r, length := utf8.DecodeRuneInString(str) + if len(str) <= length { // If we're already past the end, there is nothing else to parse. + var prop int + if state < 0 { + prop = propertyGraphemes(r) + } else { + prop = state >> shiftGraphemePropState + } + return str, "", runeWidth(r, prop), grAny | (prop << shiftGraphemePropState) + } + + // If we don't know the state, determine it now. + var firstProp int + if state < 0 { + state, firstProp, _ = transitionGraphemeState(state, r) + } else { + firstProp = state >> shiftGraphemePropState + } + width += runeWidth(r, firstProp) + + // Transition until we find a boundary. + for { + var ( + prop int + boundary bool + ) + + r, l := utf8.DecodeRuneInString(str[length:]) + state, prop, boundary = transitionGraphemeState(state&maskGraphemeState, r) + + if boundary { + return str[:length], str[length:], width, state | (prop << shiftGraphemePropState) + } + + if firstProp == prExtendedPictographic { + if r == vs15 { + width = 1 + } else if r == vs16 { + width = 2 + } + } else if firstProp != prRegionalIndicator && firstProp != prL { + width += runeWidth(r, prop) + } + + length += l + if len(str) <= length { + return str, "", width, grAny | (prop << shiftGraphemePropState) + } + } +} |