1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
// MaxSegmentSize exports the value of maxByteBufferSize, the same constant
// that sizes the Iter.buf scratch array.
12 const MaxSegmentSize
= maxByteBufferSize
14 // An Iter iterates over a string or byte slice, while normalizing it
// to a given Form. The Form and the input are supplied through Init or
// InitString.
18 buf
[maxByteBufferSize
]byte
// scratch buffer for segments that are copied or assembled rather than sliced from the input
19 info Properties
// first character saved from previous iteration
20 next iterFunc
// implementation of next depends on form
23 p
int // current position in input source
24 multiSeg
[]byte // remainder of multi-segment decomposition
// iterFunc is the type of the step function stored in Iter.next. It is
// handed the iterator and returns a byte slice.
27 type iterFunc
func(*Iter
) []byte
29 // Init initializes i to iterate over src after normalizing it to Form f.
// It installs the main step function, selects the ASCII step function for
// byte-slice input, and loads the properties found at the current position.
30 func (i
*Iter
) Init(f Form
, src
[]byte) {
// Begin with the nextMain function of i.rb.f.
39 i
.next
= i
.rb
.f
.nextMain
// ASCII step function that reads from i.rb.src.bytes.
40 i
.asciiF
= nextASCIIBytes
// Look up the properties at position i.p of the source.
41 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
44 // InitString initializes i to iterate over src after normalizing it to Form f.
// It is the string counterpart of Init: the source is set up through
// i.rb.initString and the ASCII step function is the one that reads from
// i.rb.src.str.
45 func (i
*Iter
) InitString(f Form
, src
string) {
// Point the reorder buffer at the string input for form f.
53 i
.rb
.initString(f
, src
)
// Begin with the nextMain function of i.rb.f.
54 i
.next
= i
.rb
.f
.nextMain
// ASCII step function that reads from i.rb.src.str.
55 i
.asciiF
= nextASCIIString
// Look up the properties at position i.p of the source.
56 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
59 // Seek sets the segment to be returned by the next call to Next to start
60 // at the position computed from offset and whence. It is the responsibility
61 // of the caller to make that position the start of a UTF-8 rune.
// Seek returns an error for an invalid whence and for a negative position.
// NOTE(review): the case labels selecting on whence are not visible here;
// presumably they follow the io.Seeker convention — confirm.
62 func (i
*Iter
) Seek(offset
int64, whence
int) (int64, error
) {
// Offset taken relative to the current position i.p.
68 abs
= int64(i
.p
) + offset
// Offset taken relative to i.rb.nsrc, the bound Done compares i.p against.
70 abs
= int64(i
.rb
.nsrc
) + offset
72 return 0, fmt
.Errorf("norm: invalid whence")
75 return 0, fmt
.Errorf("norm: negative position")
// Target is at or past i.rb.nsrc: handled separately from a regular seek.
77 if int(abs
) >= i
.rb
.nsrc
{
79 return int64(i
.p
), nil
// Re-arm the iterator: main step function and the properties at i.p.
83 i
.next
= i
.rb
.f
.nextMain
84 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
88 // returnSlice returns the range [a, b) of the underlying input as a byte slice.
89 // If the underlying input is of type []byte, it simply returns a sub-slice of it.
90 // If the underlying input is of type string, it copies the range into i.buf
// and returns the copied portion of i.buf.
92 func (i
*Iter
) returnSlice(a
, b
int) []byte {
// A nil bytes field means the input is held in src.str.
93 if i
.rb
.src
.bytes
== nil {
94 return i
.buf
[:copy(i
.buf
[:], i
.rb
.src
.str
[a
:b
])]
96 return i
.rb
.src
.bytes
[a
:b
]
99 // Pos returns the byte position at which the next call to Next will commence processing.
// NOTE(review): the body is outside this view; presumably it reports i.p,
// the field documented as the current position in the input — confirm.
100 func (i
*Iter
) Pos() int {
// setDone puts the iterator into its finished state.
// NOTE(review): the body is outside this view; presumably it installs
// nextDone and moves i.p to i.rb.nsrc so that Done reports true — confirm.
104 func (i
*Iter
) setDone() {
109 // Done returns true if there is no more input to process, that is, when
// the current position i.p has reached or passed i.rb.nsrc.
110 func (i
*Iter
) Done() bool {
111 return i
.p
>= i
.rb
.nsrc
114 // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input
// and f is the Form the iterator was initialized with.
115 // For any input a and b for which f(a) == f(b), subsequent calls
116 // to Next will return the same segments.
117 // Modifying runes are grouped together with the preceding starter, if such a starter exists.
118 // Although not guaranteed, n will typically be the smallest possible n.
119 func (i
*Iter
) Next() []byte {
// nextASCIIBytes is the step function Init stores in i.asciiF for []byte
// input. Its visible return paths hand back sub-slices of i.rb.src.bytes
// without copying. On the remaining path it reloads i.info and switches
// i.next to i.rb.f.nextMain.
// NOTE(review): the declarations of p and p0 are not visible here; confirm
// how they relate to i.p.
123 func nextASCIIBytes(i
*Iter
) []byte {
127 return i
.rb
.src
.bytes
[i
.p
:p
]
// The byte at p is below utf8.RuneSelf, i.e. a single-byte (ASCII) rune.
129 if i
.rb
.src
.bytes
[p
] < utf8
.RuneSelf
{
132 return i
.rb
.src
.bytes
[p0
:p
]
// Refresh the saved properties and hand control back to the main step function.
134 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
135 i
.next
= i
.rb
.f
.nextMain
// nextASCIIString is the step function InitString stores in i.asciiF for
// string input. It stages the byte at i.p in i.buf[0]. On the remaining path
// it reloads i.info and switches i.next to i.rb.f.nextMain.
// NOTE(review): the declaration of p and the return statements are not
// visible here — confirm what slice of i.buf is returned.
139 func nextASCIIString(i
*Iter
) []byte {
// Copy the current byte of the string input into the scratch buffer.
142 i
.buf
[0] = i
.rb
.src
.str
[i
.p
]
// The byte at p is below utf8.RuneSelf, i.e. a single-byte (ASCII) rune.
146 if i
.rb
.src
.str
[p
] < utf8
.RuneSelf
{
147 i
.buf
[0] = i
.rb
.src
.str
[i
.p
]
// Refresh the saved properties and hand control back to the main step function.
151 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
152 i
.next
= i
.rb
.f
.nextMain
// nextHangul is a step function for Hangul input. When i.rb.src.hangul
// reports a non-zero rune at i.p, it advances i.p by hangulUTF8Size and
// returns the part of i.buf that decomposeHangul filled for that rune.
// On the remaining path it reloads i.info and switches i.next to
// i.rb.f.nextMain.
156 func nextHangul(i
*Iter
) []byte {
157 if r
:= i
.rb
.src
.hangul(i
.p
); r
!= 0 {
158 i
.p
+= hangulUTF8Size
// The position has reached or passed i.rb.nsrc, the bound Done tests against.
159 if i
.p
>= i
.rb
.nsrc
{
// decomposeHangul writes into i.buf and returns the length used to slice it.
162 return i
.buf
[:decomposeHangul(i
.buf
[:], r
)]
// Refresh the saved properties and hand control back to the main step function.
164 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
165 i
.next
= i
.rb
.f
.nextMain
// nextDone is a step function with the iterFunc signature.
// NOTE(review): the body is outside this view; presumably it is the step
// installed once iteration has finished and returns no data — confirm.
169 func nextDone(i
*Iter
) []byte {
173 // nextMulti is used for iterating over multi-segment decompositions
174 // for decomposing normal forms.
// NOTE(review): d is declared outside this view; presumably it is
// i.multiSeg, the remainder of the multi-segment decomposition — confirm.
175 func nextMulti(i
*Iter
) []byte {
// Advance j from 1 to the next byte of d that starts a rune (or to len(d)).
179 for j
= 1; j
< len(d
) && !utf8
.RuneStart(d
[j
]); j
++ {
// Properties at offset j of d, looked up by wrapping d as a byte input.
182 info
:= i
.rb
.f
.info(input
{bytes
: d
}, j
)
189 // treat last segment as normal decomposition
190 i
.next
= i
.rb
.f
.nextMain
194 // nextMultiNorm is used for iterating over multi-segment decompositions
195 // for composing normal forms.
// NOTE(review): d is declared outside this view; presumably it is
// i.multiSeg, the remainder of the multi-segment decomposition — confirm.
196 func nextMultiNorm(i
*Iter
) []byte {
// Advance j from 1 to the next byte of d that starts a rune (or to len(d)).
200 for j
= 1; j
< len(d
) && !utf8
.RuneStart(d
[j
]); j
++ {
// Properties at offset j of d, looked up by wrapping d as a byte input.
203 info
:= i
.rb
.f
.info(input
{bytes
: d
}, j
)
// Resume regular composing iteration on subsequent calls.
211 i
.next
= nextComposed
212 i
.p
++ // restore old value of i.p. See nextComposed.
213 if i
.p
>= i
.rb
.nsrc
{
219 // nextDecomposed is the implementation of Next for forms NFD and NFKD.
// It works from i.info and either returns a range of the input through
// returnSlice or a segment built up in i.buf.
220 func nextDecomposed(i
*Iter
) (next
[]byte) {
// startp records the input position on entry; outp indexes into i.buf.
221 startp
, outp
:= i
.p
, 0
// inCopyStart marks the input position from which bytes are copied to
// i.buf; outCopyStart is the offset in i.buf where they are placed.
222 inCopyStart
, outCopyStart
:= i
.p
, 0
224 if sz
:= int(i
.info
.size
); sz
<= 1 {
226 i
.p
++ // ASCII or illegal byte. Either way, advance by 1.
227 if i
.p
>= i
.rb
.nsrc
{
229 return i
.returnSlice(p
, i
.p
)
230 } else if i
.rb
.src
._byte(i
.p
) < utf8
.RuneSelf
{
232 return i
.returnSlice(p
, i
.p
)
235 } else if d
:= i
.info
.Decomposition(); d
!= nil {
236 // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
237 // Case 1: there is a leftover to copy. In this case the decomposition
238 // must begin with a modifier and should always be appended.
239 // Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
242 i
.rb
.src
.copySlice(i
.buf
[outCopyStart
:], inCopyStart
, i
.p
)
246 } else if i
.info
.multiSegment() {
247 // outp must be 0 as multi-segment decompositions always
248 // start a new segment.
249 if i
.multiSeg
== nil {
254 // We are in the last segment. Treat as normal decomposition.
// Keep tccc of the current rune for the ordering check against the ccc
// of the rune that follows.
259 prevCC
:= i
.info
.tccc
260 if i
.p
+= sz
; i
.p
>= i
.rb
.nsrc
{
262 i
.info
= Properties
{} // Force BoundaryBefore to succeed.
264 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
266 if i
.info
.BoundaryBefore() {
268 copy(i
.buf
[outp
:], d
)
273 copy(i
.buf
[outp
:], d
)
// Pending input now starts at i.p and belongs at offset outp of i.buf.
275 inCopyStart
, outCopyStart
= i
.p
, outp
276 if i
.info
.ccc
< prevCC
{
280 } else if r
:= i
.rb
.src
.hangul(i
.p
); r
!= 0 {
282 i
.p
+= hangulUTF8Size
283 if i
.p
>= i
.rb
.nsrc
{
// decomposeHangul writes into i.buf and returns the length used to slice it.
286 return i
.buf
[:decomposeHangul(i
.buf
[:], r
)]
295 if i
.p
>= i
.rb
.nsrc
{
299 prevCC
:= i
.info
.tccc
300 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
301 if i
.info
.BoundaryBefore() {
303 } else if i
.info
.ccc
< prevCC
{
// outCopyStart is still 0: return the range directly from the input.
307 if outCopyStart
== 0 {
308 return i
.returnSlice(inCopyStart
, i
.p
)
309 } else if inCopyStart
< i
.p
{
310 i
.rb
.src
.copySlice(i
.buf
[outCopyStart
:], inCopyStart
, i
.p
)
314 // Insert what we have decomposed so far in the reorderBuffer.
315 // As we will only reorder, there will always be enough room.
316 i
.rb
.src
.copySlice(i
.buf
[outCopyStart
:], inCopyStart
, i
.p
)
317 if !i
.rb
.insertDecomposed(i
.buf
[0:outp
]) {
318 // Start over to prevent decompositions from crossing segment boundaries.
319 // This is a rare occurrence.
321 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
324 if !i
.rb
.insert(i
.rb
.src
, i
.p
, i
.info
) {
// Step past the rune just inserted, then check for the end of the input.
327 if i
.p
+= int(i
.info
.size
); i
.p
>= i
.rb
.nsrc
{
331 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
336 // new segment or too many combining characters: exit normalization
337 return i
.buf
[:i
.rb
.flushCopy(i
.buf
[:])]
340 // nextComposed is the implementation of Next for forms NFC and NFKC.
// It either returns the scanned input range through returnSlice or flushes
// the reorder buffer into i.buf. When a multi-segment rune was seen, it
// hands over to nextMultiNorm.
341 func nextComposed(i
*Iter
) []byte {
// outp starts at 0; startp is the input position on entry and the start of
// the range handed to returnSlice.
342 outp
, startp
:= 0, i
.p
345 if !i
.info
.isYesC() {
348 if cc
:= i
.info
.ccc
; cc
== 0 && outp
> 0 {
350 } else if cc
< prevCC
{
354 sz
:= int(i
.info
.size
)
356 sz
= 1 // illegal rune: copy byte-by-byte
364 if i
.p
>= i
.rb
.nsrc
{
367 } else if i
.rb
.src
._byte(i
.p
) < utf8
.RuneSelf
{
371 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
// Return the input range [startp, i.p) as is.
373 return i
.returnSlice(startp
, i
.p
)
377 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
379 if !i
.rb
.insert(i
.rb
.src
, i
.p
, i
.info
) {
// Remember whether any rune handled so far has a multi-segment decomposition.
382 multi
= multi || i
.info
.multiSegment()
// Step past the current rune, then check for the end of the input.
383 if i
.p
+= int(i
.info
.size
); i
.p
>= i
.rb
.nsrc
{
387 i
.info
= i
.rb
.f
.info(i
.rb
.src
, i
.p
)
388 if i
.info
.BoundaryBefore() {
// Flush the reorder buffer into i.buf; seg is the flushed portion.
393 seg
:= i
.buf
[:i
.rb
.flushCopy(i
.buf
[:])]
395 i
.p
-- // fake not being done yet
// Hand over to nextMultiNorm for this call and the following ones.
397 i
.next
= nextMultiNorm
398 return nextMultiNorm(i
)