summaryrefslogtreecommitdiff
path: root/vendor/github.com/rivo/uniseg/graphemerules.go
blob: 5d399d29c8fb2428e29e6ee641a5908f2c8a73bc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
package uniseg

// The states of the grapheme cluster parser.
const (
	grAny = iota
	grCR
	grControlLF
	grL
	grLVV
	grLVTT
	grPrepend
	grExtendedPictographic
	grExtendedPictographicZWJ
	grRIOdd
	grRIEven
)

// The grapheme cluster parser's breaking instructions.
const (
	grNoBoundary = iota
	grBoundary
)

// grTransitions implements the grapheme cluster parser's state transitions.
// Maps state and property to a new state, a breaking instruction, and rule
// number. The breaking instruction always refers to the boundary between the
// last and next code point. Returns negative values if no transition is found.
//
// This function is used as follows:
//
//  1. Find specific state + specific property. Stop if found.
//  2. Find specific state + any property.
//  3. Find any state + specific property.
//  4. If only (2) or (3) (but not both) was found, stop.
//  5. If both (2) and (3) were found, use state from (3) and breaking instruction
//     from the transition with the lower rule number, prefer (3) if rule numbers
//     are equal. Stop.
//  6. Assume grAny and grBoundary.
//
// Unicode version 15.0.0.
func grTransitions(state, prop int) (newState int, newProp int, boundary int) {
	// It turns out that using a big switch statement is much faster than using
	// a map.

	switch uint64(state) | uint64(prop)<<32 {
	// GB5
	case grAny | prCR<<32:
		return grCR, grBoundary, 50
	case grAny | prLF<<32:
		return grControlLF, grBoundary, 50
	case grAny | prControl<<32:
		return grControlLF, grBoundary, 50

	// GB4
	case grCR | prAny<<32:
		return grAny, grBoundary, 40
	case grControlLF | prAny<<32:
		return grAny, grBoundary, 40

	// GB3
	case grCR | prLF<<32:
		return grControlLF, grNoBoundary, 30

	// GB6
	case grAny | prL<<32:
		return grL, grBoundary, 9990
	case grL | prL<<32:
		return grL, grNoBoundary, 60
	case grL | prV<<32:
		return grLVV, grNoBoundary, 60
	case grL | prLV<<32:
		return grLVV, grNoBoundary, 60
	case grL | prLVT<<32:
		return grLVTT, grNoBoundary, 60

	// GB7
	case grAny | prLV<<32:
		return grLVV, grBoundary, 9990
	case grAny | prV<<32:
		return grLVV, grBoundary, 9990
	case grLVV | prV<<32:
		return grLVV, grNoBoundary, 70
	case grLVV | prT<<32:
		return grLVTT, grNoBoundary, 70

	// GB8
	case grAny | prLVT<<32:
		return grLVTT, grBoundary, 9990
	case grAny | prT<<32:
		return grLVTT, grBoundary, 9990
	case grLVTT | prT<<32:
		return grLVTT, grNoBoundary, 80

	// GB9
	case grAny | prExtend<<32:
		return grAny, grNoBoundary, 90
	case grAny | prZWJ<<32:
		return grAny, grNoBoundary, 90

	// GB9a
	case grAny | prSpacingMark<<32:
		return grAny, grNoBoundary, 91

	// GB9b
	case grAny | prPrepend<<32:
		return grPrepend, grBoundary, 9990
	case grPrepend | prAny<<32:
		return grAny, grNoBoundary, 92

	// GB11
	case grAny | prExtendedPictographic<<32:
		return grExtendedPictographic, grBoundary, 9990
	case grExtendedPictographic | prExtend<<32:
		return grExtendedPictographic, grNoBoundary, 110
	case grExtendedPictographic | prZWJ<<32:
		return grExtendedPictographicZWJ, grNoBoundary, 110
	case grExtendedPictographicZWJ | prExtendedPictographic<<32:
		return grExtendedPictographic, grNoBoundary, 110

	// GB12 / GB13
	case grAny | prRegionalIndicator<<32:
		return grRIOdd, grBoundary, 9990
	case grRIOdd | prRegionalIndicator<<32:
		return grRIEven, grNoBoundary, 120
	case grRIEven | prRegionalIndicator<<32:
		return grRIOdd, grBoundary, 120
	default:
		return -1, -1, -1
	}
}

// transitionGraphemeState determines the new state of the grapheme cluster
// parser given the current state and the next code point. It also returns the
// code point's grapheme property (the value mapped by the [graphemeCodePoints]
// table) and whether a cluster boundary was detected.
func transitionGraphemeState(state int, r rune) (newState, prop int, boundary bool) {
	// Determine the property of the next character.
	prop = propertyGraphemes(r)

	// Find the applicable transition.
	nextState, nextProp, _ := grTransitions(state, prop)
	if nextState >= 0 {
		// We have a specific transition. We'll use it.
		return nextState, prop, nextProp == grBoundary
	}

	// No specific transition found. Try the less specific ones.
	anyPropState, anyPropProp, anyPropRule := grTransitions(state, prAny)
	anyStateState, anyStateProp, anyStateRule := grTransitions(grAny, prop)
	if anyPropState >= 0 && anyStateState >= 0 {
		// Both apply. We'll use a mix (see comments for grTransitions).
		newState = anyStateState
		boundary = anyStateProp == grBoundary
		if anyPropRule < anyStateRule {
			boundary = anyPropProp == grBoundary
		}
		return
	}

	if anyPropState >= 0 {
		// We only have a specific state.
		return anyPropState, prop, anyPropProp == grBoundary
		// This branch will probably never be reached because okAnyState will
		// always be true given the current transition map. But we keep it here
		// for future modifications to the transition map where this may not be
		// true anymore.
	}

	if anyStateState >= 0 {
		// We only have a specific property.
		return anyStateState, prop, anyStateProp == grBoundary
	}

	// No known transition. GB999: Any ÷ Any.
	return grAny, prop, true
}