Merged revisions 195034,195219,195245,195357,195374,195428,195599,195673,195809 via...
[official-gcc.git] / main / libgo / go / exp / locale / collate / colelem.go
blob7633ddc0e7f6256474850efaf3bb9abffb82bacc
1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package collate
7 import (
8 "unicode"
11 // Level identifies the collation comparison level.
12 // The primary level corresponds to the basic sorting of text.
13 // The secondary level corresponds to accents and related linguistic elements.
14 // The tertiary level corresponds to casing and related concepts.
15 // The quaternary level is derived from the other levels by the
16 // various algorithms for handling variable elements.
17 type Level int
19 const (
20 Primary Level = iota
21 Secondary
22 Tertiary
23 Quaternary
24 Identity
27 const (
28 defaultSecondary = 0x20
29 defaultTertiary = 0x2
30 maxTertiary = 0x1F
31 MaxQuaternary = 0x1FFFFF // 21 bits.
34 // Elem is a representation of a collation element. This API provides ways to encode
35 // and decode Elems. Implementations of collation tables may use values greater
36 // or equal to PrivateUse for their own purposes. However, these should never be
37 // returned by AppendNext.
38 type Elem uint32
40 const (
41 maxCE Elem = 0xAFFFFFFF
42 PrivateUse = minContract
43 minContract = 0xC0000000
44 maxContract = 0xDFFFFFFF
45 minExpand = 0xE0000000
46 maxExpand = 0xEFFFFFFF
47 minDecomp = 0xF0000000
50 type ceType int
52 const (
53 ceNormal ceType = iota // ceNormal includes implicits (ce == 0)
54 ceContractionIndex // rune can be a start of a contraction
55 ceExpansionIndex // rune expands into a sequence of collation elements
56 ceDecompose // rune expands using NFKC decomposition
59 func (ce Elem) ctype() ceType {
60 if ce <= maxCE {
61 return ceNormal
63 if ce <= maxContract {
64 return ceContractionIndex
65 } else {
66 if ce <= maxExpand {
67 return ceExpansionIndex
69 return ceDecompose
71 panic("should not reach here")
72 return ceType(-1)
75 // For normal collation elements, we assume that a collation element either has
76 // a primary or non-default secondary value, not both.
77 // Collation elements with a primary value are of the form
78 // 01pppppp pppppppp ppppppp0 ssssssss
79 // - p* is primary collation value
80 // - s* is the secondary collation value
81 // 00pppppp pppppppp ppppppps sssttttt, where
82 // - p* is primary collation value
83 // - s* offset of secondary from default value.
84 // - t* is the tertiary collation value
85 // 100ttttt cccccccc pppppppp pppppppp
86 // - t* is the tertiar collation value
87 // - c* is the cannonical combining class
88 // - p* is the primary collation value
89 // Collation elements with a secondary value are of the form
90 // 1010cccc ccccssss ssssssss tttttttt, where
91 // - c* is the canonical combining class
92 // - s* is the secondary collation value
93 // - t* is the tertiary collation value
94 // 11qqqqqq qqqqqqqq qqqqqqq0 00000000
95 // - q* quaternary value
96 const (
97 ceTypeMask = 0xC0000000
98 ceTypeMaskExt = 0xE0000000
99 ceType1 = 0x40000000
100 ceType2 = 0x00000000
101 ceType3or4 = 0x80000000
102 ceType4 = 0xA0000000
103 ceTypeQ = 0xC0000000
104 ceIgnore = ceType4
105 firstNonPrimary = 0x80000000
106 lastSpecialPrimary = 0xA0000000
107 secondaryMask = 0x80000000
108 hasTertiaryMask = 0x40000000
109 primaryValueMask = 0x3FFFFE00
110 primaryShift = 9
111 compactPrimaryBits = 16
112 compactSecondaryShift = 5
113 minCompactSecondary = defaultSecondary - 4
116 func makeImplicitCE(primary int) Elem {
117 return ceType1 | Elem(primary<<primaryShift) | defaultSecondary
120 // MakeElem returns an Elem for the given values. It will return an error
121 // if the given combination of values is invalid.
122 func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) {
123 // TODO: implement
124 return 0, nil
127 // MakeQuaternary returns an Elem with the given quaternary value.
128 func MakeQuaternary(v int) Elem {
129 return ceTypeQ | Elem(v<<primaryShift)
132 // Mask sets weights for any level smaller than l to 0.
133 // The resulting Elem can be used to test for equality with
134 // other Elems to which the same mask has been applied.
135 func (ce Elem) Mask(l Level) uint32 {
136 return 0
139 // CCC returns the canoncial combining class associated with the underlying character,
140 // if applicable, or 0 otherwise.
141 func (ce Elem) CCC() uint8 {
142 if ce&ceType3or4 != 0 {
143 if ce&ceType4 == ceType3or4 {
144 return uint8(ce >> 16)
146 return uint8(ce >> 20)
148 return 0
151 // Primary returns the primary collation weight for ce.
152 func (ce Elem) Primary() int {
153 if ce >= firstNonPrimary {
154 if ce > lastSpecialPrimary {
155 return 0
157 return int(uint16(ce))
159 return int(ce&primaryValueMask) >> primaryShift
162 // Secondary returns the secondary collation weight for ce.
163 func (ce Elem) Secondary() int {
164 switch ce & ceTypeMask {
165 case ceType1:
166 return int(uint8(ce))
167 case ceType2:
168 return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
169 case ceType3or4:
170 if ce < ceType4 {
171 return defaultSecondary
173 return int(ce>>8) & 0xFFF
174 case ceTypeQ:
175 return 0
177 panic("should not reach here")
180 // Tertiary returns the tertiary collation weight for ce.
181 func (ce Elem) Tertiary() uint8 {
182 if ce&hasTertiaryMask == 0 {
183 if ce&ceType3or4 == 0 {
184 return uint8(ce & 0x1F)
186 if ce&ceType4 == ceType4 {
187 return uint8(ce)
189 return uint8(ce>>24) & 0x1F // type 2
190 } else if ce&ceTypeMask == ceType1 {
191 return defaultTertiary
193 // ce is a quaternary value.
194 return 0
197 func (ce Elem) updateTertiary(t uint8) Elem {
198 if ce&ceTypeMask == ceType1 {
199 // convert to type 4
200 nce := ce & primaryValueMask
201 nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
202 ce = nce
203 } else if ce&ceTypeMaskExt == ceType3or4 {
204 ce &= ^Elem(maxTertiary << 24)
205 return ce | (Elem(t) << 24)
206 } else {
207 // type 2 or 4
208 ce &= ^Elem(maxTertiary)
210 return ce | Elem(t)
213 // Quaternary returns the quaternary value if explicitly specified,
214 // 0 if ce == ceIgnore, or MaxQuaternary otherwise.
215 // Quaternary values are used only for shifted variants.
216 func (ce Elem) Quaternary() int {
217 if ce&ceTypeMask == ceTypeQ {
218 return int(ce&primaryValueMask) >> primaryShift
219 } else if ce == ceIgnore {
220 return 0
222 return MaxQuaternary
225 // Weight returns the collation weight for the given level.
226 func (ce Elem) Weight(l Level) int {
227 switch l {
228 case Primary:
229 return ce.Primary()
230 case Secondary:
231 return ce.Secondary()
232 case Tertiary:
233 return int(ce.Tertiary())
234 case Quaternary:
235 return ce.Quaternary()
237 return 0 // return 0 (ignore) for undefined levels.
240 // For contractions, collation elements are of the form
241 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
242 // - n* is the size of the first node in the contraction trie.
243 // - i* is the index of the first node in the contraction trie.
244 // - b* is the offset into the contraction collation element table.
245 // See contract.go for details on the contraction trie.
246 const (
247 maxNBits = 4
248 maxTrieIndexBits = 12
249 maxContractOffsetBits = 13
252 func splitContractIndex(ce Elem) (index, n, offset int) {
253 n = int(ce & (1<<maxNBits - 1))
254 ce >>= maxNBits
255 index = int(ce & (1<<maxTrieIndexBits - 1))
256 ce >>= maxTrieIndexBits
257 offset = int(ce & (1<<maxContractOffsetBits - 1))
258 return
261 // For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
262 // where b* is the index into the expansion sequence table.
263 const maxExpandIndexBits = 16
265 func splitExpandIndex(ce Elem) (index int) {
266 return int(uint16(ce))
269 // Some runes can be expanded using NFKD decomposition. Instead of storing the full
270 // sequence of collation elements, we decompose the rune and lookup the collation
271 // elements for each rune in the decomposition and modify the tertiary weights.
272 // The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
273 // - v* is the replacement tertiary weight for the first rune,
274 // - w* is the replacement tertiary weight for the second rune,
275 // Tertiary weights of subsequent runes should be replaced with maxTertiary.
276 // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
277 func splitDecompose(ce Elem) (t1, t2 uint8) {
278 return uint8(ce), uint8(ce >> 8)
281 const (
282 // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
283 minUnified rune = 0x4E00
284 maxUnified = 0x9FFF
285 minCompatibility = 0xF900
286 maxCompatibility = 0xFAFF
287 minRare = 0x3400
288 maxRare = 0x4DBF
290 const (
291 commonUnifiedOffset = 0x10000
292 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
293 otherOffset = 0x50000 // largest rune in rare is U+2FA1D
294 illegalOffset = otherOffset + int(unicode.MaxRune)
295 maxPrimary = illegalOffset + 1
298 // implicitPrimary returns the primary weight for the a rune
299 // for which there is no entry for the rune in the collation table.
300 // We take a different approach from the one specified in
301 // http://unicode.org/reports/tr10/#Implicit_Weights,
302 // but preserve the resulting relative ordering of the runes.
303 func implicitPrimary(r rune) int {
304 if unicode.Is(unicode.Ideographic, r) {
305 if r >= minUnified && r <= maxUnified {
306 // The most common case for CJK.
307 return int(r) + commonUnifiedOffset
309 if r >= minCompatibility && r <= maxCompatibility {
310 // This will typically not hit. The DUCET explicitly specifies mappings
311 // for all characters that do not decompose.
312 return int(r) + commonUnifiedOffset
314 return int(r) + rareUnifiedOffset
316 return int(r) + otherOffset