Merged revisions 195034,195219,195245,195357,195374,195428,195599,195673,195809 via...
[official-gcc.git] / main / libgo / go / exp / norm / iter.go
bloba9546247c3b4954a8c422119f1da05ef2c559a49
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package norm
7 import (
8 "fmt"
9 "unicode/utf8"
// MaxSegmentSize is the size in bytes of the scratch buffer an Iter uses to
// assemble a segment; it is an alias for the package-internal
// maxByteBufferSize.
const MaxSegmentSize = maxByteBufferSize
// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
//
// An Iter must be set up with Init or InitString before Next is called.
type Iter struct {
	rb     reorderBuffer           // holds the input (rb.src), its length (rb.nsrc) and the form (rb.f); also used to reorder/compose runes
	buf    [maxByteBufferSize]byte // scratch output buffer; slices returned by Next may alias it
	info   Properties              // first character saved from previous iteration
	next   iterFunc                // implementation of next depends on form
	asciiF iterFunc                // ASCII step function: nextASCIIBytes or nextASCIIString, chosen by Init/InitString

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}
// iterFunc is the signature of the step functions stored in Iter.next and
// Iter.asciiF. A step function advances the iterator and returns the next
// segment; nextDone returns nil to signal that nothing is left.
type iterFunc func(*Iter) []byte
29 // Init initializes i to iterate over src after normalizing it to Form f.
30 func (i *Iter) Init(f Form, src []byte) {
31 i.p = 0
32 if len(src) == 0 {
33 i.setDone()
34 i.rb.nsrc = 0
35 return
37 i.multiSeg = nil
38 i.rb.init(f, src)
39 i.next = i.rb.f.nextMain
40 i.asciiF = nextASCIIBytes
41 i.info = i.rb.f.info(i.rb.src, i.p)
44 // InitString initializes i to iterate over src after normalizing it to Form f.
45 func (i *Iter) InitString(f Form, src string) {
46 i.p = 0
47 if len(src) == 0 {
48 i.setDone()
49 i.rb.nsrc = 0
50 return
52 i.multiSeg = nil
53 i.rb.initString(f, src)
54 i.next = i.rb.f.nextMain
55 i.asciiF = nextASCIIString
56 i.info = i.rb.f.info(i.rb.src, i.p)
59 // Seek sets the segment to be returned by the next call to Next to start
60 // at position p. It is the responsibility of the caller to set p to the
61 // start of a UTF8 rune.
62 func (i *Iter) Seek(offset int64, whence int) (int64, error) {
63 var abs int64
64 switch whence {
65 case 0:
66 abs = offset
67 case 1:
68 abs = int64(i.p) + offset
69 case 2:
70 abs = int64(i.rb.nsrc) + offset
71 default:
72 return 0, fmt.Errorf("norm: invalid whence")
74 if abs < 0 {
75 return 0, fmt.Errorf("norm: negative position")
77 if int(abs) >= i.rb.nsrc {
78 i.setDone()
79 return int64(i.p), nil
81 i.p = int(abs)
82 i.multiSeg = nil
83 i.next = i.rb.f.nextMain
84 i.info = i.rb.f.info(i.rb.src, i.p)
85 return abs, nil
88 // returnSlice returns a slice of the underlying input type as a byte slice.
89 // If the underlying is of type []byte, it will simply return a slice.
90 // If the underlying is of type string, it will copy the slice to the buffer
91 // and return that.
92 func (i *Iter) returnSlice(a, b int) []byte {
93 if i.rb.src.bytes == nil {
94 return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
96 return i.rb.src.bytes[a:b]
99 // Pos returns the byte position at which the next call to Next will commence processing.
100 func (i *Iter) Pos() int {
101 return i.p
104 func (i *Iter) setDone() {
105 i.next = nextDone
106 i.p = i.rb.nsrc
109 // Done returns true if there is no more input to process.
110 func (i *Iter) Done() bool {
111 return i.p >= i.rb.nsrc
114 // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
115 // For any input a and b for which f(a) == f(b), subsequent calls
116 // to Next will return the same segments.
117 // Modifying runes are grouped together with the preceding starter, if such a starter exists.
118 // Although not guaranteed, n will typically be the smallest possible n.
119 func (i *Iter) Next() []byte {
120 return i.next(i)
123 func nextASCIIBytes(i *Iter) []byte {
124 p := i.p + 1
125 if p >= i.rb.nsrc {
126 i.setDone()
127 return i.rb.src.bytes[i.p:p]
129 if i.rb.src.bytes[p] < utf8.RuneSelf {
130 p0 := i.p
131 i.p = p
132 return i.rb.src.bytes[p0:p]
134 i.info = i.rb.f.info(i.rb.src, i.p)
135 i.next = i.rb.f.nextMain
136 return i.next(i)
139 func nextASCIIString(i *Iter) []byte {
140 p := i.p + 1
141 if p >= i.rb.nsrc {
142 i.buf[0] = i.rb.src.str[i.p]
143 i.setDone()
144 return i.buf[:1]
146 if i.rb.src.str[p] < utf8.RuneSelf {
147 i.buf[0] = i.rb.src.str[i.p]
148 i.p = p
149 return i.buf[:1]
151 i.info = i.rb.f.info(i.rb.src, i.p)
152 i.next = i.rb.f.nextMain
153 return i.next(i)
156 func nextHangul(i *Iter) []byte {
157 if r := i.rb.src.hangul(i.p); r != 0 {
158 i.p += hangulUTF8Size
159 if i.p >= i.rb.nsrc {
160 i.setDone()
162 return i.buf[:decomposeHangul(i.buf[:], r)]
164 i.info = i.rb.f.info(i.rb.src, i.p)
165 i.next = i.rb.f.nextMain
166 return i.next(i)
169 func nextDone(i *Iter) []byte {
170 return nil
173 // nextMulti is used for iterating over multi-segment decompositions
174 // for decomposing normal forms.
175 func nextMulti(i *Iter) []byte {
176 j := 0
177 d := i.multiSeg
178 // skip first rune
179 for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
181 for j < len(d) {
182 info := i.rb.f.info(input{bytes: d}, j)
183 if info.ccc == 0 {
184 i.multiSeg = d[j:]
185 return d[:j]
187 j += int(info.size)
189 // treat last segment as normal decomposition
190 i.next = i.rb.f.nextMain
191 return i.next(i)
194 // nextMultiNorm is used for iterating over multi-segment decompositions
195 // for composing normal forms.
196 func nextMultiNorm(i *Iter) []byte {
197 j := 0
198 d := i.multiSeg
199 // skip first rune
200 for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
202 for j < len(d) {
203 info := i.rb.f.info(input{bytes: d}, j)
204 if info.ccc == 0 {
205 i.multiSeg = d[j:]
206 return d[:j]
208 j += int(info.size)
210 i.multiSeg = nil
211 i.next = nextComposed
212 i.p++ // restore old valud of i.p. See nextComposed.
213 if i.p >= i.rb.nsrc {
214 i.setDone()
216 return d
// nextDecomposed is the implementation of Next for forms NFD and NFKD.
//
// It is entered with i.info describing the rune at i.p. The main loop walks
// the input one rune at a time and returns as soon as a segment is complete.
// Depending on the path taken the result is a slice of the input (via
// returnSlice), a decomposition taken from the rune properties, or a slice of
// i.buf. When a rune's ccc is lower than the tccc of the rune before it, the
// runes are out of canonical order and control jumps to doNorm, which sorts
// them through the reorder buffer.
func nextDecomposed(i *Iter) (next []byte) {
	// startp is the input position at which this segment began; doNorm
	// rewinds to it if the partial output cannot be inserted.
	// outp counts the output bytes accounted for so far.
	startp, outp := i.p, 0
	// Input bytes [inCopyStart, i.p) are passed through unchanged but have
	// not yet been copied; they belong at i.buf[outCopyStart:].
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			p := i.p
			i.p++ // ASCII or illegal byte. Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				// Next byte is ASCII too: switch to the ASCII step function.
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy. In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				// Bring the pending pass-through bytes into i.buf first.
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				if p > len(i.buf) {
					// d does not fit: return what has been gathered so far.
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment. Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			if i.info.BoundaryBefore() {
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				// Nothing precedes d in this segment: return it directly.
				return d
			}
			// Segment continues: append d and restart the pass-through run
			// at the new position.
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			// Hangul syllable: decompose it into i.buf and let nextHangul
			// handle what follows.
			i.next = nextHangul
			i.p += hangulUTF8Size
			if i.p >= i.rb.nsrc {
				i.setDone()
			}
			return i.buf[:decomposeHangul(i.buf[:], r)]
		} else {
			// Rune has no decomposition: it is passed through as is.
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if i.info.BoundaryBefore() {
			break
		} else if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		// Nothing was written to i.buf: the segment is a plain input range.
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	if !i.rb.insertDecomposed(i.buf[0:outp]) {
		// Start over to prevent decompositions from crossing segment boundaries.
		// This is a rare occurrence.
		i.p = startp
		i.info = i.rb.f.info(i.rb.src, i.p)
	}
	// Feed runes into the reorder buffer until it refuses one, the input
	// ends, or a rune with ccc == 0 is reached.
	for {
		if !i.rb.insert(i.rb.src, i.p, i.info) {
			break
		}
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if i.info.ccc == 0 {
			break
		}
	}
	// new segment or too many combining characters: exit normalization
	return i.buf[:i.rb.flushCopy(i.buf[:])]
}
// nextComposed is the implementation of Next for forms NFC and NFKC.
//
// It is entered with i.info describing the rune at i.p. As long as every rune
// passes isYesC and the combining classes do not decrease, the loop only
// advances and the segment is returned as a range of the input through
// returnSlice. Any other rune sends control to doNorm, which rewinds to the
// start of the segment, runs the runes through the reorder buffer, composes
// them and returns the result from i.buf.
func nextComposed(i *Iter) []byte {
	// outp counts the bytes accepted so far; startp is where the segment began.
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		if cc := i.info.ccc; cc == 0 && outp > 0 {
			// A rune with ccc == 0 after at least one accepted rune ends
			// the segment.
			break
		} else if cc < prevCC {
			// Combining classes out of order: needs reordering.
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			// Segment would exceed the buffer size: cut it here.
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			// Next byte is ASCII: switch to the ASCII step function.
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// multi records whether any inserted rune reported multiSegment.
	multi := false
	// Start over from the beginning of the segment.
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	// Feed runes into the reorder buffer until it refuses one, the input
	// ends, or the next rune reports a boundary before it.
	for {
		if !i.rb.insert(i.rb.src, i.p, i.info) {
			break
		}
		multi = multi || i.info.multiSegment()
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if i.info.BoundaryBefore() {
			break
		}
	}
	i.rb.compose()
	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
	if multi {
		// The flushed output holds more than one segment; nextMultiNorm
		// returns them one at a time and increments i.p again when it
		// reaches the last one.
		i.p-- // fake not being done yet
		i.multiSeg = seg
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	return seg
}