1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 defaultSecondary
= 0x20
24 func makeRawCE(w
[]int, ccc
uint8) rawCE
{
25 ce
:= rawCE
{w
: make([]int, 4), ccc
: ccc
}
30 // A collation element is represented as an uint32.
31 // In the typical case, a rune maps to a single collation element. If a rune
32 // can be the start of a contraction or expands into multiple collation elements,
33 // then the collation element that is associated with a rune will have a special
34 // form to represent such m to n mappings. Such special collation elements
35 // have a value >= 0x80000000.
37 // For normal collation elements, we assume that a collation element either has
38 // a primary or non-default secondary value, not both.
39 // Collation elements with a primary value are of the form
40 // 01pppppp pppppppp ppppppp0 ssssssss
41 // - p* is primary collation value
42 // - s* is the secondary collation value
43 // 00pppppp pppppppp ppppppps sssttttt, where
44 // - p* is primary collation value
45 // - s* offset of secondary from default value.
46 // - t* is the tertiary collation value
47 // 100ttttt cccccccc pppppppp pppppppp
48 // - t* is the tertiar collation value
49 // - c* is the cannonical combining class
50 // - p* is the primary collation value
51 // Collation elements with a secondary value are of the form
52 // 1010cccc ccccssss ssssssss tttttttt, where
53 // - c* is the canonical combining class
54 // - s* is the secondary collation value
55 // - t* is the tertiary collation value
58 maxPrimaryCompactBits
= 16
60 maxSecondaryCompactBits
= 8
62 maxSecondaryDiffBits
= 4
64 maxTertiaryCompactBits
= 5
66 isPrimary
= 0x40000000
67 isPrimaryCCC
= 0x80000000
68 isSecondary
= 0xA0000000
71 func makeCE(rce rawCE
) (uint32, error
) {
73 if w
:= weights
[0]; w
>= 1<<maxPrimaryBits || w
< 0 {
74 return 0, fmt
.Errorf("makeCE: primary weight out of bounds: %x >= %x", w
, 1<<maxPrimaryBits
)
76 if w
:= weights
[1]; w
>= 1<<maxSecondaryBits || w
< 0 {
77 return 0, fmt
.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w
, 1<<maxSecondaryBits
)
79 if w
:= weights
[2]; w
>= 1<<maxTertiaryBits || w
< 0 {
80 return 0, fmt
.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w
, 1<<maxTertiaryBits
)
85 if weights
[0] >= 1<<maxPrimaryCompactBits
{
86 return 0, fmt
.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", weights
[0], 1<<maxPrimaryCompactBits
)
88 if weights
[1] != defaultSecondary
{
89 return 0, fmt
.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", weights
[1], rce
.ccc
)
91 ce
= uint32(weights
[2] << (maxPrimaryCompactBits
+ maxCCCBits
))
92 ce |
= uint32(rce
.ccc
) << maxPrimaryCompactBits
93 ce |
= uint32(weights
[0])
95 } else if weights
[2] == defaultTertiary
{
96 if weights
[1] >= 1<<maxSecondaryCompactBits
{
97 return 0, fmt
.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights
[1], 1<<maxSecondaryCompactBits
)
99 ce
= uint32(weights
[0]<<(maxSecondaryCompactBits
+1) + weights
[1])
102 d
:= weights
[1] - defaultSecondary
+ maxSecondaryDiffBits
103 if d
>= 1<<maxSecondaryDiffBits || d
< 0 {
104 return 0, fmt
.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d
, d
, 1<<maxSecondaryDiffBits
)
106 if weights
[2] >= 1<<maxTertiaryCompactBits
{
107 return 0, fmt
.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x (%X)", weights
[2], 1<<maxTertiaryCompactBits
, weights
)
109 ce
= uint32(weights
[0]<<maxSecondaryDiffBits
+ d
)
110 ce
= ce
<<maxTertiaryCompactBits
+ uint32(weights
[2])
113 ce
= uint32(weights
[1]<<maxTertiaryBits
+ weights
[2])
114 ce
+= uint32(rce
.ccc
) << (maxSecondaryBits
+ maxTertiaryBits
)
120 // For contractions, collation elements are of the form
121 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
122 // - n* is the size of the first node in the contraction trie.
123 // - i* is the index of the first node in the contraction trie.
124 // - b* is the offset into the contraction collation element table.
125 // See contract.go for details on the contraction trie.
127 contractID
= 0xC0000000
129 maxTrieIndexBits
= 12
130 maxContractOffsetBits
= 13
133 func makeContractIndex(h ctHandle
, offset
int) (uint32, error
) {
134 if h
.n
>= 1<<maxNBits
{
135 return 0, fmt
.Errorf("size of contraction trie node too large: %d >= %d", h
.n
, 1<<maxNBits
)
137 if h
.index
>= 1<<maxTrieIndexBits
{
138 return 0, fmt
.Errorf("size of contraction trie offset too large: %d >= %d", h
.index
, 1<<maxTrieIndexBits
)
140 if offset
>= 1<<maxContractOffsetBits
{
141 return 0, fmt
.Errorf("contraction offset out of bounds: %x >= %x", offset
, 1<<maxContractOffsetBits
)
143 ce
:= uint32(contractID
)
144 ce
+= uint32(offset
<< (maxNBits
+ maxTrieIndexBits
))
145 ce
+= uint32(h
.index
<< maxNBits
)
150 // For expansions, collation elements are of the form
151 // 11100000 00000000 bbbbbbbb bbbbbbbb,
152 // where b* is the index into the expansion sequence table.
154 expandID
= 0xE0000000
155 maxExpandIndexBits
= 16
158 func makeExpandIndex(index
int) (uint32, error
) {
159 if index
>= 1<<maxExpandIndexBits
{
160 return 0, fmt
.Errorf("expansion index out of bounds: %x >= %x", index
, 1<<maxExpandIndexBits
)
162 return expandID
+ uint32(index
), nil
165 // Each list of collation elements corresponding to an expansion starts with
166 // a header indicating the length of the sequence.
167 func makeExpansionHeader(n
int) (uint32, error
) {
168 return uint32(n
), nil
171 // Some runes can be expanded using NFKD decomposition. Instead of storing the full
172 // sequence of collation elements, we decompose the rune and lookup the collation
173 // elements for each rune in the decomposition and modify the tertiary weights.
174 // The collation element, in this case, is of the form
175 // 11110000 00000000 wwwwwwww vvvvvvvv, where
176 // - v* is the replacement tertiary weight for the first rune,
177 // - w* is the replacement tertiary weight for the second rune,
178 // Tertiary weights of subsequent runes should be replaced with maxTertiary.
179 // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
181 decompID
= 0xF0000000
184 func makeDecompose(t1
, t2
int) (uint32, error
) {
185 if t1
>= 256 || t1
< 0 {
186 return 0, fmt
.Errorf("first tertiary weight out of bounds: %d >= 256", t1
)
188 if t2
>= 256 || t2
< 0 {
189 return 0, fmt
.Errorf("second tertiary weight out of bounds: %d >= 256", t2
)
191 return uint32(t2
<<8+t1
) + decompID
, nil
195 // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
196 minUnified rune
= 0x4E00
198 minCompatibility
= 0xF900
199 maxCompatibility
= 0xFAFF
204 commonUnifiedOffset
= 0x10000
205 rareUnifiedOffset
= 0x20000 // largest rune in common is U+FAFF
206 otherOffset
= 0x50000 // largest rune in rare is U+2FA1D
207 illegalOffset
= otherOffset
+ int(unicode
.MaxRune
)
208 maxPrimary
= illegalOffset
+ 1
211 // implicitPrimary returns the primary weight for the a rune
212 // for which there is no entry for the rune in the collation table.
213 // We take a different approach from the one specified in
214 // http://unicode.org/reports/tr10/#Implicit_Weights,
215 // but preserve the resulting relative ordering of the runes.
216 func implicitPrimary(r rune
) int {
217 if unicode
.Is(unicode
.Ideographic
, r
) {
218 if r
>= minUnified
&& r
<= maxUnified
{
219 // The most common case for CJK.
220 return int(r
) + commonUnifiedOffset
222 if r
>= minCompatibility
&& r
<= maxCompatibility
{
223 // This will typically not hit. The DUCET explicitly specifies mappings
224 // for all characters that do not decompose.
225 return int(r
) + commonUnifiedOffset
227 return int(r
) + rareUnifiedOffset
229 return int(r
) + otherOffset
232 // convertLargeWeights converts collation elements with large
233 // primaries (either double primaries or for illegal runes)
234 // to our own representation.
235 // A CJK character C is represented in the DUCET as
236 // [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
237 // We will rewrite these characters to a single CE.
238 // We assume the CJK values start at 0x8000.
239 // See http://unicode.org/reports/tr10/#Implicit_Weights
240 func convertLargeWeights(elems
[]rawCE
) (res
[]rawCE
, err error
) {
242 cjkPrimaryStart
= 0xFB40
243 rarePrimaryStart
= 0xFB80
244 otherPrimaryStart
= 0xFBC0
245 illegalPrimary
= 0xFFFE
251 for i
:= 0; i
< len(elems
); i
++ {
254 if p
< cjkPrimaryStart
{
258 return elems
, fmt
.Errorf("found primary weight %X; should be <= 0xFFFF", p
)
260 if p
>= illegalPrimary
{
261 ce
[0] = illegalOffset
+ p
- illegalPrimary
263 if i
+1 >= len(elems
) {
264 return elems
, fmt
.Errorf("second part of double primary weight missing: %v", elems
)
266 if elems
[i
+1].w
[0]&lowBitsFlag
== 0 {
267 return elems
, fmt
.Errorf("malformed second part of double primary weight: %v", elems
)
269 np
:= ((p
& highBitsMask
) << shiftBits
) + elems
[i
+1].w
[0]&lowBitsMask
271 case p
< rarePrimaryStart
:
272 np
+= commonUnifiedOffset
273 case p
< otherPrimaryStart
:
274 np
+= rareUnifiedOffset
279 for j
:= i
+ 1; j
+1 < len(elems
); j
++ {
280 elems
[j
] = elems
[j
+1]
282 elems
= elems
[:len(elems
)-1]
288 // nextWeight computes the first possible collation weights following elems
289 // for the given level.
290 func nextWeight(level collate
.Level
, elems
[]rawCE
) []rawCE
{
291 if level
== collate
.Identity
{
292 next
:= make([]rawCE
, len(elems
))
296 next
:= []rawCE
{makeRawCE(elems
[0].w
, elems
[0].ccc
)}
298 if level
< collate
.Secondary
{
299 next
[0].w
[collate
.Secondary
] = defaultSecondary
301 if level
< collate
.Tertiary
{
302 next
[0].w
[collate
.Tertiary
] = defaultTertiary
304 // Filter entries that cannot influence ordering.
305 for _
, ce
:= range elems
[1:] {
307 for i
:= collate
.Primary
; i
< level
; i
++ {
308 skip
= skip
&& ce
.w
[i
] == 0
311 next
= append(next
, ce
)
317 func nextVal(elems
[]rawCE
, i
int, level collate
.Level
) (index
, value
int) {
318 for ; i
< len(elems
) && elems
[i
].w
[level
] == 0; i
++ {
321 return i
, elems
[i
].w
[level
]
326 // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
327 // It also returns the collation level at which the difference is found.
328 func compareWeights(a
, b
[]rawCE
) (result
int, level collate
.Level
) {
329 for level
:= collate
.Primary
; level
< collate
.Identity
; level
++ {
331 for ia
, ib
:= 0, 0; ia
< len(a
) || ib
< len(b
); ia
, ib
= ia
+1, ib
+1 {
332 ia
, va
= nextVal(a
, ia
, level
)
333 ib
, vb
= nextVal(b
, ib
, level
)
343 return 0, collate
.Identity
346 func equalCE(a
, b rawCE
) bool {
347 for i
:= 0; i
< 3; i
++ {
348 if b
.w
[i
] != a
.w
[i
] {
355 func equalCEArrays(a
, b
[]rawCE
) bool {
356 if len(a
) != len(b
) {
360 if !equalCE(a
[i
], b
[i
]) {