Daily bump.
[official-gcc.git] / libgo / go / exp / locale / collate / build / colelem.go
blob1a8356d72bcd38eb439bf4e51f20c92653b2de68
1 // Copyright 2012 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package build
7 import (
8 "exp/locale/collate"
9 "fmt"
10 "unicode"
13 const (
14 defaultSecondary = 0x20
15 defaultTertiary = 0x2
16 maxTertiary = 0x1F
19 type rawCE struct {
20 w []int
21 ccc uint8
24 func makeRawCE(w []int, ccc uint8) rawCE {
25 ce := rawCE{w: make([]int, 4), ccc: ccc}
26 copy(ce.w, w)
27 return ce
30 // A collation element is represented as an uint32.
31 // In the typical case, a rune maps to a single collation element. If a rune
32 // can be the start of a contraction or expands into multiple collation elements,
33 // then the collation element that is associated with a rune will have a special
34 // form to represent such m to n mappings. Such special collation elements
35 // have a value >= 0x80000000.
37 // For normal collation elements, we assume that a collation element either has
38 // a primary or non-default secondary value, not both.
39 // Collation elements with a primary value are of the form
40 // 01pppppp pppppppp ppppppp0 ssssssss
41 // - p* is primary collation value
42 // - s* is the secondary collation value
43 // 00pppppp pppppppp ppppppps sssttttt, where
44 // - p* is primary collation value
45 // - s* offset of secondary from default value.
46 // - t* is the tertiary collation value
47 // 100ttttt cccccccc pppppppp pppppppp
48 // - t* is the tertiar collation value
49 // - c* is the cannonical combining class
50 // - p* is the primary collation value
51 // Collation elements with a secondary value are of the form
52 // 1010cccc ccccssss ssssssss tttttttt, where
53 // - c* is the canonical combining class
54 // - s* is the secondary collation value
55 // - t* is the tertiary collation value
56 const (
57 maxPrimaryBits = 21
58 maxPrimaryCompactBits = 16
59 maxSecondaryBits = 12
60 maxSecondaryCompactBits = 8
61 maxCCCBits = 8
62 maxSecondaryDiffBits = 4
63 maxTertiaryBits = 8
64 maxTertiaryCompactBits = 5
66 isPrimary = 0x40000000
67 isPrimaryCCC = 0x80000000
68 isSecondary = 0xA0000000
71 func makeCE(rce rawCE) (uint32, error) {
72 weights := rce.w
73 if w := weights[0]; w >= 1<<maxPrimaryBits || w < 0 {
74 return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
76 if w := weights[1]; w >= 1<<maxSecondaryBits || w < 0 {
77 return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
79 if w := weights[2]; w >= 1<<maxTertiaryBits || w < 0 {
80 return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits)
82 ce := uint32(0)
83 if weights[0] != 0 {
84 if rce.ccc != 0 {
85 if weights[0] >= 1<<maxPrimaryCompactBits {
86 return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", weights[0], 1<<maxPrimaryCompactBits)
88 if weights[1] != defaultSecondary {
89 return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", weights[1], rce.ccc)
91 ce = uint32(weights[2] << (maxPrimaryCompactBits + maxCCCBits))
92 ce |= uint32(rce.ccc) << maxPrimaryCompactBits
93 ce |= uint32(weights[0])
94 ce |= isPrimaryCCC
95 } else if weights[2] == defaultTertiary {
96 if weights[1] >= 1<<maxSecondaryCompactBits {
97 return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", weights[1], 1<<maxSecondaryCompactBits)
99 ce = uint32(weights[0]<<(maxSecondaryCompactBits+1) + weights[1])
100 ce |= isPrimary
101 } else {
102 d := weights[1] - defaultSecondary + maxSecondaryDiffBits
103 if d >= 1<<maxSecondaryDiffBits || d < 0 {
104 return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
106 if weights[2] >= 1<<maxTertiaryCompactBits {
107 return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x (%X)", weights[2], 1<<maxTertiaryCompactBits, weights)
109 ce = uint32(weights[0]<<maxSecondaryDiffBits + d)
110 ce = ce<<maxTertiaryCompactBits + uint32(weights[2])
112 } else {
113 ce = uint32(weights[1]<<maxTertiaryBits + weights[2])
114 ce += uint32(rce.ccc) << (maxSecondaryBits + maxTertiaryBits)
115 ce |= isSecondary
117 return ce, nil
120 // For contractions, collation elements are of the form
121 // 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
122 // - n* is the size of the first node in the contraction trie.
123 // - i* is the index of the first node in the contraction trie.
124 // - b* is the offset into the contraction collation element table.
125 // See contract.go for details on the contraction trie.
126 const (
127 contractID = 0xC0000000
128 maxNBits = 4
129 maxTrieIndexBits = 12
130 maxContractOffsetBits = 13
133 func makeContractIndex(h ctHandle, offset int) (uint32, error) {
134 if h.n >= 1<<maxNBits {
135 return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
137 if h.index >= 1<<maxTrieIndexBits {
138 return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
140 if offset >= 1<<maxContractOffsetBits {
141 return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
143 ce := uint32(contractID)
144 ce += uint32(offset << (maxNBits + maxTrieIndexBits))
145 ce += uint32(h.index << maxNBits)
146 ce += uint32(h.n)
147 return ce, nil
150 // For expansions, collation elements are of the form
151 // 11100000 00000000 bbbbbbbb bbbbbbbb,
152 // where b* is the index into the expansion sequence table.
153 const (
154 expandID = 0xE0000000
155 maxExpandIndexBits = 16
158 func makeExpandIndex(index int) (uint32, error) {
159 if index >= 1<<maxExpandIndexBits {
160 return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
162 return expandID + uint32(index), nil
165 // Each list of collation elements corresponding to an expansion starts with
166 // a header indicating the length of the sequence.
167 func makeExpansionHeader(n int) (uint32, error) {
168 return uint32(n), nil
171 // Some runes can be expanded using NFKD decomposition. Instead of storing the full
172 // sequence of collation elements, we decompose the rune and lookup the collation
173 // elements for each rune in the decomposition and modify the tertiary weights.
174 // The collation element, in this case, is of the form
175 // 11110000 00000000 wwwwwwww vvvvvvvv, where
176 // - v* is the replacement tertiary weight for the first rune,
177 // - w* is the replacement tertiary weight for the second rune,
178 // Tertiary weights of subsequent runes should be replaced with maxTertiary.
179 // See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
180 const (
181 decompID = 0xF0000000
184 func makeDecompose(t1, t2 int) (uint32, error) {
185 if t1 >= 256 || t1 < 0 {
186 return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
188 if t2 >= 256 || t2 < 0 {
189 return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
191 return uint32(t2<<8+t1) + decompID, nil
194 const (
195 // These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
196 minUnified rune = 0x4E00
197 maxUnified = 0x9FFF
198 minCompatibility = 0xF900
199 maxCompatibility = 0xFAFF
200 minRare = 0x3400
201 maxRare = 0x4DBF
203 const (
204 commonUnifiedOffset = 0x10000
205 rareUnifiedOffset = 0x20000 // largest rune in common is U+FAFF
206 otherOffset = 0x50000 // largest rune in rare is U+2FA1D
207 illegalOffset = otherOffset + int(unicode.MaxRune)
208 maxPrimary = illegalOffset + 1
211 // implicitPrimary returns the primary weight for the a rune
212 // for which there is no entry for the rune in the collation table.
213 // We take a different approach from the one specified in
214 // http://unicode.org/reports/tr10/#Implicit_Weights,
215 // but preserve the resulting relative ordering of the runes.
216 func implicitPrimary(r rune) int {
217 if unicode.Is(unicode.Ideographic, r) {
218 if r >= minUnified && r <= maxUnified {
219 // The most common case for CJK.
220 return int(r) + commonUnifiedOffset
222 if r >= minCompatibility && r <= maxCompatibility {
223 // This will typically not hit. The DUCET explicitly specifies mappings
224 // for all characters that do not decompose.
225 return int(r) + commonUnifiedOffset
227 return int(r) + rareUnifiedOffset
229 return int(r) + otherOffset
232 // convertLargeWeights converts collation elements with large
233 // primaries (either double primaries or for illegal runes)
234 // to our own representation.
235 // A CJK character C is represented in the DUCET as
236 // [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
237 // We will rewrite these characters to a single CE.
238 // We assume the CJK values start at 0x8000.
239 // See http://unicode.org/reports/tr10/#Implicit_Weights
240 func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
241 const (
242 cjkPrimaryStart = 0xFB40
243 rarePrimaryStart = 0xFB80
244 otherPrimaryStart = 0xFBC0
245 illegalPrimary = 0xFFFE
246 highBitsMask = 0x3F
247 lowBitsMask = 0x7FFF
248 lowBitsFlag = 0x8000
249 shiftBits = 15
251 for i := 0; i < len(elems); i++ {
252 ce := elems[i].w
253 p := ce[0]
254 if p < cjkPrimaryStart {
255 continue
257 if p > 0xFFFF {
258 return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
260 if p >= illegalPrimary {
261 ce[0] = illegalOffset + p - illegalPrimary
262 } else {
263 if i+1 >= len(elems) {
264 return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
266 if elems[i+1].w[0]&lowBitsFlag == 0 {
267 return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
269 np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
270 switch {
271 case p < rarePrimaryStart:
272 np += commonUnifiedOffset
273 case p < otherPrimaryStart:
274 np += rareUnifiedOffset
275 default:
276 p += otherOffset
278 ce[0] = np
279 for j := i + 1; j+1 < len(elems); j++ {
280 elems[j] = elems[j+1]
282 elems = elems[:len(elems)-1]
285 return elems, nil
288 // nextWeight computes the first possible collation weights following elems
289 // for the given level.
290 func nextWeight(level collate.Level, elems []rawCE) []rawCE {
291 if level == collate.Identity {
292 next := make([]rawCE, len(elems))
293 copy(next, elems)
294 return next
296 next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
297 next[0].w[level]++
298 if level < collate.Secondary {
299 next[0].w[collate.Secondary] = defaultSecondary
301 if level < collate.Tertiary {
302 next[0].w[collate.Tertiary] = defaultTertiary
304 // Filter entries that cannot influence ordering.
305 for _, ce := range elems[1:] {
306 skip := true
307 for i := collate.Primary; i < level; i++ {
308 skip = skip && ce.w[i] == 0
310 if !skip {
311 next = append(next, ce)
314 return next
317 func nextVal(elems []rawCE, i int, level collate.Level) (index, value int) {
318 for ; i < len(elems) && elems[i].w[level] == 0; i++ {
320 if i < len(elems) {
321 return i, elems[i].w[level]
323 return i, 0
326 // compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
327 // It also returns the collation level at which the difference is found.
328 func compareWeights(a, b []rawCE) (result int, level collate.Level) {
329 for level := collate.Primary; level < collate.Identity; level++ {
330 var va, vb int
331 for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
332 ia, va = nextVal(a, ia, level)
333 ib, vb = nextVal(b, ib, level)
334 if va != vb {
335 if va < vb {
336 return -1, level
337 } else {
338 return 1, level
343 return 0, collate.Identity
346 func equalCE(a, b rawCE) bool {
347 for i := 0; i < 3; i++ {
348 if b.w[i] != a.w[i] {
349 return false
352 return true
355 func equalCEArrays(a, b []rawCE) bool {
356 if len(a) != len(b) {
357 return false
359 for i := range a {
360 if !equalCE(a[i], b[i]) {
361 return false
364 return true