1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
11 // Level identifies the collation comparison level.
12 // The primary level corresponds to the basic sorting of text.
13 // The secondary level corresponds to accents and related linguistic elements.
14 // The tertiary level corresponds to casing and related concepts.
15 // The quaternary level is derived from the other levels by the
16 // various algorithms for handling variable elements.
28 defaultSecondary
= 0x20
31 MaxQuaternary
= 0x1FFFFF // 21 bits.
34 // Elem is a representation of a collation element. This API provides ways to encode
35 // and decode Elems. Implementations of collation tables may use values greater
36 // than or equal to PrivateUse for their own purposes. However, these should never be
37 // returned by AppendNext.
41 maxCE Elem
= 0xAFFFFFFF
42 PrivateUse
= minContract
43 minContract
= 0xC0000000
44 maxContract
= 0xDFFFFFFF
45 minExpand
= 0xE0000000
46 maxExpand
= 0xEFFFFFFF
47 minDecomp
= 0xF0000000
53 ceNormal ceType
= iota // ceNormal includes implicits (ce == 0)
54 ceContractionIndex
// rune can be a start of a contraction
55 ceExpansionIndex
// rune expands into a sequence of collation elements
56 ceDecompose
// rune expands using NFKC decomposition
59 func (ce Elem
) ctype() ceType
{
63 if ce
<= maxContract
{
64 return ceContractionIndex
67 return ceExpansionIndex
71 panic("should not reach here")
75 // For normal collation elements, we assume that a collation element either has
76 // a primary or non-default secondary value, not both.
77 // Collation elements with a primary value are of the form
78 // 01pppppp pppppppp ppppppp0 ssssssss
79 // - p* is primary collation value
80 // - s* is the secondary collation value
81 // 00pppppp pppppppp ppppppps sssttttt, where
82 // - p* is primary collation value
83 // - s* offset of secondary from default value.
84 // - t* is the tertiary collation value
85 // 100ttttt cccccccc pppppppp pppppppp
86 // - t* is the tertiary collation value
87 // - c* is the canonical combining class
88 // - p* is the primary collation value
89 // Collation elements with a secondary value are of the form
90 // 1010cccc ccccssss ssssssss tttttttt, where
91 // - c* is the canonical combining class
92 // - s* is the secondary collation value
93 // - t* is the tertiary collation value
94 // 11qqqqqq qqqqqqqq qqqqqqq0 00000000
95 // - q* quaternary value
97 ceTypeMask
= 0xC0000000
98 ceTypeMaskExt
= 0xE0000000
101 ceType3or4
= 0x80000000
105 firstNonPrimary
= 0x80000000
106 lastSpecialPrimary
= 0xA0000000
107 secondaryMask
= 0x80000000
108 hasTertiaryMask
= 0x40000000
109 primaryValueMask
= 0x3FFFFE00
111 compactPrimaryBits
= 16
112 compactSecondaryShift
= 5
113 minCompactSecondary
= defaultSecondary
- 4
116 func makeImplicitCE(primary
int) Elem
{
117 return ceType1 |
Elem(primary
<<primaryShift
) | defaultSecondary
120 // MakeElem returns an Elem for the given values. It will return an error
121 // if the given combination of values is invalid.
122 func MakeElem(primary
, secondary
, tertiary
int, ccc
uint8) (Elem
, error
) {
127 // MakeQuaternary returns an Elem with the given quaternary value.
128 func MakeQuaternary(v
int) Elem
{
129 return ceTypeQ |
Elem(v
<<primaryShift
)
132 // Mask sets weights for any level smaller than l to 0.
133 // The resulting Elem can be used to test for equality with
134 // other Elems to which the same mask has been applied.
135 func (ce Elem
) Mask(l Level
) uint32 {
139 // CCC returns the canonical combining class associated with the underlying character,
140 // if applicable, or 0 otherwise.
141 func (ce Elem
) CCC() uint8 {
142 if ce
&ceType3or4
!= 0 {
143 if ce
&ceType4
== ceType3or4
{
144 return uint8(ce
>> 16)
146 return uint8(ce
>> 20)
151 // Primary returns the primary collation weight for ce.
152 func (ce Elem
) Primary() int {
153 if ce
>= firstNonPrimary
{
154 if ce
> lastSpecialPrimary
{
157 return int(uint16(ce
))
159 return int(ce
&primaryValueMask
) >> primaryShift
162 // Secondary returns the secondary collation weight for ce.
163 func (ce Elem
) Secondary() int {
164 switch ce
& ceTypeMask
{
166 return int(uint8(ce
))
168 return minCompactSecondary
+ int((ce
>>compactSecondaryShift
)&0xF)
171 return defaultSecondary
173 return int(ce
>>8) & 0xFFF
177 panic("should not reach here")
180 // Tertiary returns the tertiary collation weight for ce.
181 func (ce Elem
) Tertiary() uint8 {
182 if ce
&hasTertiaryMask
== 0 {
183 if ce
&ceType3or4
== 0 {
184 return uint8(ce
& 0x1F)
186 if ce
&ceType4
== ceType4
{
189 return uint8(ce
>>24) & 0x1F // type 2
190 } else if ce
&ceTypeMask
== ceType1
{
191 return defaultTertiary
193 // ce is a quaternary value.
197 func (ce Elem
) updateTertiary(t
uint8) Elem
{
198 if ce
&ceTypeMask
== ceType1
{
200 nce
:= ce
& primaryValueMask
201 nce |
= Elem(uint8(ce
)-minCompactSecondary
) << compactSecondaryShift
203 } else if ce
&ceTypeMaskExt
== ceType3or4
{
204 ce
&= ^Elem(maxTertiary
<< 24)
205 return ce |
(Elem(t
) << 24)
208 ce
&= ^Elem(maxTertiary
)
213 // Quaternary returns the quaternary value if explicitly specified,
214 // 0 if ce == ceIgnore, or MaxQuaternary otherwise.
215 // Quaternary values are used only for shifted variants.
216 func (ce Elem
) Quaternary() int {
217 if ce
&ceTypeMask
== ceTypeQ
{
218 return int(ce
&primaryValueMask
) >> primaryShift
219 } else if ce
== ceIgnore
{
225 // Weight returns the collation weight for the given level.
226 func (ce Elem
) Weight(l Level
) int {
231 return ce
.Secondary()
233 return int(ce
.Tertiary())
235 return ce
.Quaternary()
237 return 0 // return 0 (ignore) for undefined levels.
240 // For contractions, collation elements are of the form
241 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
242 // - n* is the size of the first node in the contraction trie.
243 // - i* is the index of the first node in the contraction trie.
244 // - b* is the offset into the contraction collation element table.
245 // See contract.go for details on the contraction trie.
248 maxTrieIndexBits
= 12
249 maxContractOffsetBits
= 13
252 func splitContractIndex(ce Elem
) (index
, n
, offset
int) {
253 n
= int(ce
& (1<<maxNBits
- 1))
255 index
= int(ce
& (1<<maxTrieIndexBits
- 1))
256 ce
>>= maxTrieIndexBits
257 offset
= int(ce
& (1<<maxContractOffsetBits
- 1))
261 // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
262 // where b* is the index into the expansion sequence table.
263 const maxExpandIndexBits
= 16
265 func splitExpandIndex(ce Elem
) (index
int) {
266 return int(uint16(ce
))
269 // Some runes can be expanded using NFKD decomposition. Instead of storing the full
270 // sequence of collation elements, we decompose the rune and lookup the collation
271 // elements for each rune in the decomposition and modify the tertiary weights.
272 // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
273 // - v* is the replacement tertiary weight for the first rune,
274 // - w* is the replacement tertiary weight for the second rune,
275 // Tertiary weights of subsequent runes should be replaced with maxTertiary.
276 // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
277 func splitDecompose(ce Elem
) (t1
, t2
uint8) {
278 return uint8(ce
), uint8(ce
>> 8)
282 // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
283 minUnified rune
= 0x4E00
285 minCompatibility
= 0xF900
286 maxCompatibility
= 0xFAFF
291 commonUnifiedOffset
= 0x10000
292 rareUnifiedOffset
= 0x20000 // largest rune in common is U+FAFF
293 otherOffset
= 0x50000 // largest rune in rare is U+2FA1D
294 illegalOffset
= otherOffset
+ int(unicode
.MaxRune
)
295 maxPrimary
= illegalOffset
+ 1
298 // implicitPrimary returns the primary weight for a rune
299 // for which there is no entry in the collation table.
300 // We take a different approach from the one specified in
301 // http://unicode.org/reports/tr10/#Implicit_Weights,
302 // but preserve the resulting relative ordering of the runes.
303 func implicitPrimary(r rune
) int {
304 if unicode
.Is(unicode
.Ideographic
, r
) {
305 if r
>= minUnified
&& r
<= maxUnified
{
306 // The most common case for CJK.
307 return int(r
) + commonUnifiedOffset
309 if r
>= minCompatibility
&& r
<= maxCompatibility
{
310 // This will typically not hit. The DUCET explicitly specifies mappings
311 // for all characters that do not decompose.
312 return int(r
) + commonUnifiedOffset
314 return int(r
) + rareUnifiedOffset
316 return int(r
) + otherOffset