main/libgo/go/exp/norm/iter.go

   1 // Copyright 2011 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package norm
   6
   7 import (
   8         "fmt"
   9         "unicode/utf8"
  10 )
  11
     // MaxSegmentSize is the exported name for maxByteBufferSize, which is also
     // the size of the internal buffer (Iter.buf) from which Next may return
     // segments.
  12 const MaxSegmentSize = maxByteBufferSize
  13
  14 // An Iter iterates over a string or byte slice, while normalizing it
  15 // to a given Form.
     // Use Init or InitString to attach an input, then call Next until Done
     // reports true.
  16 type Iter struct {
  17         rb     reorderBuffer // holds the input (src, nsrc) and form (f); also used to reorder runes
  18         buf    [maxByteBufferSize]byte // output buffer for segments that are not a direct slice of the input
  19         info   Properties // first character saved from previous iteration
  20         next   iterFunc   // implementation of next depends on form
  21         asciiF iterFunc // ASCII fast path: nextASCIIBytes or nextASCIIString, chosen by Init/InitString
  22
  23         p        int    // current position in input source
  24         multiSeg []byte // remainder of multi-segment decomposition
  25 }
  26
     // iterFunc is the signature shared by all implementations of Next.  Iter
     // stores the currently active one in its next field and Next simply calls it.
  27 type iterFunc func(*Iter) []byte
  28
  29 // Init initializes i to iterate over src after normalizing it to Form f.
     // If src is empty, i is left in the done state with Pos() == 0.
  30 func (i *Iter) Init(f Form, src []byte) {
  31         i.p = 0
  32         if len(src) == 0 {
  33                 i.rb.nsrc = 0 // must precede setDone, which copies nsrc into i.p
  34                 i.setDone()
  35                 return
  36         }
  37         i.multiSeg = nil
  38         i.rb.init(f, src)
  39         i.next = i.rb.f.nextMain
  40         i.asciiF = nextASCIIBytes
  41         i.info = i.rb.f.info(i.rb.src, i.p)
  42 }
  43
  44 // InitString initializes i to iterate over src after normalizing it to Form f.
     // If src is empty, i is left in the done state with Pos() == 0.
  45 func (i *Iter) InitString(f Form, src string) {
  46         i.p = 0
  47         if len(src) == 0 {
  48                 i.rb.nsrc = 0 // must precede setDone, which copies nsrc into i.p
  49                 i.setDone()
  50                 return
  51         }
  52         i.multiSeg = nil
  53         i.rb.initString(f, src)
  54         i.next = i.rb.f.nextMain
  55         i.asciiF = nextASCIIString
  56         i.info = i.rb.f.info(i.rb.src, i.p)
  57 }
  58
  59 // Seek sets the position at which the next call to Next starts processing
  60 // and returns the resulting absolute position.  Offset is interpreted
  61 // according to whence: 0 means relative to the start of the input, 1 means
     // relative to the current position, and 2 means relative to the end.
     // It is the responsibility of the caller to make the resulting position
     // the start of a UTF8 rune.  Seeking to or past the end of the input puts
     // the iterator in the done state and returns the end position.  An error
     // is returned for an unknown whence or a negative resulting position.
  62 func (i *Iter) Seek(offset int64, whence int) (int64, error) {
  63         var abs int64
  64         switch whence {
  65         case 0:
  66                 abs = offset
  67         case 1:
  68                 abs = int64(i.p) + offset
  69         case 2:
  70                 abs = int64(i.rb.nsrc) + offset
  71         default:
  72                 return 0, fmt.Errorf("norm: invalid whence")
  73         }
  74         if abs < 0 {
  75                 return 0, fmt.Errorf("norm: negative position")
  76         }
             // Compare in int64: converting abs to int first could truncate on
             // platforms where int is 32 bits and wrap a huge offset to a small one.
  77         if abs >= int64(i.rb.nsrc) {
  78                 i.setDone()
  79                 return int64(i.p), nil
  80         }
  81         i.p = int(abs)
  82         i.multiSeg = nil
  83         i.next = i.rb.f.nextMain
  84         i.info = i.rb.f.info(i.rb.src, i.p)
  85         return abs, nil
  86 }
  87
  88 // returnSlice yields the input bytes in [a, b) as a []byte.
  89 // For []byte input the result is a sub-slice of the input itself.
  90 // For string input the bytes are first copied into i.buf and the
  91 // filled part of i.buf is returned.
  92 func (i *Iter) returnSlice(a, b int) []byte {
  93         if src := i.rb.src.bytes; src != nil {
  94                 return src[a:b]
  95         }
  96         return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
  97 }
  98
  99 // Pos returns the byte position at which the next call to Next will commence processing.
     // The value is an offset into the input source given to Init or InitString.
 100 func (i *Iter) Pos() int {
 101         return i.p
 102 }
 103
     // setDone puts i in the finished state: the position is moved to the end
     // of the input and Next is redirected to nextDone.
 104 func (i *Iter) setDone() {
 105         i.p = i.rb.nsrc
 106         i.next = nextDone
 107 }
 108
 109 // Done reports whether the whole input has been consumed.
 110 func (i *Iter) Done() bool {
 111         return i.rb.nsrc <= i.p
 112 }
 113
 114 // Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
 115 // For any input a and b for which f(a) == f(b), subsequent calls
 116 // to Next will return the same segments.
 117 // Modifying runes are grouped together with the preceding starter, if such a starter exists.
 118 // Although not guaranteed, n will typically be the smallest possible n.
     // The returned slice may alias either the input or an internal buffer of i,
     // so it should be treated as valid only until the next call.  Once the
     // iterator is done, Next returns nil.
 119 func (i *Iter) Next() []byte {
 120         return i.next(i)
 121 }
 122
     // nextASCIIBytes is the fast-path iterFunc for []byte input, installed via
     // Iter.asciiF when the byte at i.p is ASCII.  If that byte is the last one
     // or is followed by another ASCII byte, it is returned as a one-byte slice
     // of the input.  Otherwise the form's main iterFunc is reinstated and
     // invoked from the current position.
 123 func nextASCIIBytes(i *Iter) []byte {
 124         p0, p := i.p, i.p+1
 125         if p >= i.rb.nsrc {
 126                 i.setDone() // overwrites i.p, hence the saved p0
 127                 return i.rb.src.bytes[p0:p]
 128         }
 129         if i.rb.src.bytes[p] < utf8.RuneSelf {
 130                 // Next byte is ASCII too: stay on the fast path.
 131                 i.p = p
 132                 return i.rb.src.bytes[p0:p]
 133         }
 134         i.info = i.rb.f.info(i.rb.src, i.p)
 135         i.next = i.rb.f.nextMain
 136         return i.next(i)
 137 }
 138
     // nextASCIIString is the string-input counterpart of nextASCIIBytes.  The
     // ASCII byte at i.p is copied into i.buf and returned as i.buf[:1], unless
     // it is followed by a non-ASCII byte, in which case the form's main
     // iterFunc is reinstated and invoked from the current position.
 139 func nextASCIIString(i *Iter) []byte {
 140         end := i.p + 1
 141         if end < i.rb.nsrc && i.rb.src.str[end] >= utf8.RuneSelf {
 142                 // A non-ASCII byte follows: hand over to the form's iterFunc.
 143                 i.info = i.rb.f.info(i.rb.src, i.p)
 144                 i.next = i.rb.f.nextMain
 145                 return i.next(i)
 146         }
 147         i.buf[0] = i.rb.src.str[i.p]
 148         if end >= i.rb.nsrc {
 149                 i.setDone()
 150         } else {
 151                 i.p = end
 152         }
 153         return i.buf[:1]
 154 }
 155
     // nextHangul is the iterFunc installed by nextDecomposed after it has
     // emitted a decomposed Hangul syllable.  While the input at i.p is again
     // reported as Hangul, it advances by hangulUTF8Size and returns the
     // decomposition written to i.buf; otherwise it reinstates the form's main
     // iterFunc and delegates to it.
 156 func nextHangul(i *Iter) []byte {
 157         r := i.rb.src.hangul(i.p)
 158         if r == 0 {
 159                 i.info = i.rb.f.info(i.rb.src, i.p)
 160                 i.next = i.rb.f.nextMain
 161                 return i.next(i)
 162         }
 163         if i.p += hangulUTF8Size; i.p >= i.rb.nsrc {
 164                 i.setDone()
 165         }
 166         return i.buf[:decomposeHangul(i.buf[:], r)]
 167 }
 169 func nextDone(i *Iter) []byte {
 170         return nil
 171 }
 172
 173 // nextMulti hands out the multi-segment decomposition held in i.multiSeg
 174 // one segment per call; it serves the decomposing normal forms.
 175 func nextMulti(i *Iter) []byte {
 176         k, seg := 1, i.multiSeg
 177         // skip first rune
 178         for k < len(seg) && !utf8.RuneStart(seg[k]) {
 179                 k++
 180         }
 181         for k < len(seg) {
 182                 props := i.rb.f.info(input{bytes: seg}, k)
 183                 if props.ccc == 0 {
 184                         i.multiSeg = seg[k:]
 185                         return seg[:k]
 186                 }
 187                 k += int(props.size)
 188         }
 189         // treat last segment as normal decomposition
 190         i.next = i.rb.f.nextMain
 191         return i.next(i)
 192 }
 193
 194 // nextMultiNorm hands out the multi-segment result held in i.multiSeg
 195 // one segment per call; it serves the composing normal forms.
 196 func nextMultiNorm(i *Iter) []byte {
 197         k, seg := 1, i.multiSeg
 198         // skip first rune
 199         for k < len(seg) && !utf8.RuneStart(seg[k]) {
 200                 k++
 201         }
 202         for k < len(seg) {
 203                 props := i.rb.f.info(input{bytes: seg}, k)
 204                 if props.ccc == 0 {
 205                         i.multiSeg = seg[k:]
 206                         return seg[:k]
 207                 }
 208                 k += int(props.size)
 209         }
 210         // No later rune with ccc == 0: seg is the final piece, resume composing.
 211         i.multiSeg = nil
 212         i.next = nextComposed
 213         if i.p++; i.p >= i.rb.nsrc { // restore old value of i.p. See nextComposed.
 214                 i.setDone()
 215         }
 216         return seg
 217 }
 218
 219 // nextDecomposed is the implementation of Next for forms NFD and NFKD.
     // It scans forward from i.p one rune at a time (i.info describes the rune
     // at i.p) until the following rune reports BoundaryBefore, the input ends,
     // or the output would no longer fit in i.buf.  If nothing had to be
     // rewritten the result is a slice of the input (see returnSlice);
     // otherwise it is assembled in i.buf.  When a rune's ccc is lower than the
     // preceding tccc, control moves to doNorm, which runs the segment through
     // the reorderBuffer instead.
 220 func nextDecomposed(i *Iter) (next []byte) {
 221         startp, outp := i.p, 0
 222         inCopyStart, outCopyStart := i.p, 0
             // Input bytes in [inCopyStart, i.p) have been accepted unchanged but
             // not yet copied; if needed they go to i.buf[outCopyStart:].
 223         for {
 224                 if sz := int(i.info.size); sz <= 1 {
 225                         p := i.p
 226                         i.p++ // ASCII or illegal byte.  Either way, advance by 1.
 227                         if i.p >= i.rb.nsrc {
 228                                 i.setDone()
 229                                 return i.returnSlice(p, i.p)
 230                         } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
 231                                 i.next = i.asciiF
 232                                 return i.returnSlice(p, i.p)
 233                         }
 234                         outp++
 235                 } else if d := i.info.Decomposition(); d != nil {
 236                         // Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
 237                         // Case 1: there is a leftover to copy.  In this case the decomposition
 238                         // must begin with a modifier and should always be appended.
 239                         // Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
 240                         p := outp + len(d)
 241                         if outp > 0 {
 242                                 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
 243                                 if p > len(i.buf) {
 244                                         return i.buf[:outp]
 245                                 }
 246                         } else if i.info.multiSegment() {
 247                                 // outp must be 0 as multi-segment decompositions always
 248                                 // start a new segment.
 249                                 if i.multiSeg == nil {
 250                                         i.multiSeg = d
 251                                         i.next = nextMulti
 252                                         return nextMulti(i)
 253                                 }
 254                                 // We are in the last segment.  Treat as normal decomposition.
 255                                 d = i.multiSeg
 256                                 i.multiSeg = nil
 257                                 p = len(d)
 258                         }
 259                         prevCC := i.info.tccc
 260                         if i.p += sz; i.p >= i.rb.nsrc {
 261                                 i.setDone()
 262                                 i.info = Properties{} // Force BoundaryBefore to succeed.
 263                         } else {
 264                                 i.info = i.rb.f.info(i.rb.src, i.p)
 265                         }
 266                         if i.info.BoundaryBefore() {
 267                                 if outp > 0 {
 268                                         copy(i.buf[outp:], d)
 269                                         return i.buf[:p]
 270                                 }
 271                                 return d
 272                         }
 273                         copy(i.buf[outp:], d)
 274                         outp = p
 275                         inCopyStart, outCopyStart = i.p, outp
 276                         if i.info.ccc < prevCC {
 277                                 goto doNorm
 278                         }
 279                         continue
 280                 } else if r := i.rb.src.hangul(i.p); r != 0 {
 281                         i.next = nextHangul
 282                         i.p += hangulUTF8Size
 283                         if i.p >= i.rb.nsrc {
 284                                 i.setDone()
 285                         }
 286                         return i.buf[:decomposeHangul(i.buf[:], r)]
 287                 } else {
 288                         p := outp + sz
 289                         if p > len(i.buf) {
 290                                 break
 291                         }
 292                         outp = p
 293                         i.p += sz
 294                 }
 295                 if i.p >= i.rb.nsrc {
 296                         i.setDone()
 297                         break
 298                 }
 299                 prevCC := i.info.tccc
 300                 i.info = i.rb.f.info(i.rb.src, i.p)
 301                 if i.info.BoundaryBefore() {
 302                         break
 303                 } else if i.info.ccc < prevCC {
 304                         goto doNorm
 305                 }
 306         }
             // outCopyStart == 0 means nothing was written to i.buf, so the
             // segment can be returned straight from the input.
 307         if outCopyStart == 0 {
 308                 return i.returnSlice(inCopyStart, i.p)
 309         } else if inCopyStart < i.p {
 310                 i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
 311         }
 312         return i.buf[:outp]
 313 doNorm:
 314         // Insert what we have decomposed so far in the reorderBuffer.
 315         // As we will only reorder, there will always be enough room.
 316         i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
 317         if !i.rb.insertDecomposed(i.buf[0:outp]) {
 318                 // Start over to prevent decompositions from crossing segment boundaries.
 319                 // This is a rare occurrence.
 320                 i.p = startp
 321                 i.info = i.rb.f.info(i.rb.src, i.p)
 322         }
             // Keep feeding runes to the reorderBuffer until an insert is refused,
             // the input ends, or a rune with ccc == 0 is reached.
 323         for {
 324                 if !i.rb.insert(i.rb.src, i.p, i.info) {
 325                         break
 326                 }
 327                 if i.p += int(i.info.size); i.p >= i.rb.nsrc {
 328                         i.setDone()
 329                         break
 330                 }
 331                 i.info = i.rb.f.info(i.rb.src, i.p)
 332                 if i.info.ccc == 0 {
 333                         break
 334                 }
 335         }
 336         // new segment or too many combining characters: exit normalization
 337         return i.buf[:i.rb.flushCopy(i.buf[:])]
 338 }
 339
 340 // nextComposed is the implementation of Next for forms NFC and NFKC.
     // The first loop is a fast path: as long as each rune satisfies isYesC and
     // the combining classes do not decrease, runes are skipped over and the
     // result is returned as a slice of the input via returnSlice.  Otherwise
     // control moves to doNorm, which rewinds to the segment start, feeds the
     // runes through the reorderBuffer, composes them, and flushes the result
     // into i.buf.
 341 func nextComposed(i *Iter) []byte {
 342         outp, startp := 0, i.p
 343         var prevCC uint8
 344         for {
 345                 if !i.info.isYesC() {
 346                         goto doNorm
 347                 }
 348                 if cc := i.info.ccc; cc == 0 && outp > 0 {
 349                         break
 350                 } else if cc < prevCC {
 351                         goto doNorm
 352                 }
 353                 prevCC = i.info.tccc
 354                 sz := int(i.info.size)
 355                 if sz == 0 {
 356                         sz = 1 // illegal rune: copy byte-by-byte
 357                 }
 358                 p := outp + sz
 359                 if p > len(i.buf) {
 360                         break
 361                 }
 362                 outp = p
 363                 i.p += sz
 364                 if i.p >= i.rb.nsrc {
 365                         i.setDone()
 366                         break
 367                 } else if i.rb.src._byte(i.p) < utf8.RuneSelf {
 368                         i.next = i.asciiF
 369                         break
 370                 }
 371                 i.info = i.rb.f.info(i.rb.src, i.p)
 372         }
 373         return i.returnSlice(startp, i.p)
 374 doNorm:
             // multi records whether any inserted rune reports multiSegment; such
             // output is handed out piecewise by nextMultiNorm.
 375         multi := false
 376         i.p = startp
 377         i.info = i.rb.f.info(i.rb.src, i.p)
 378         for {
 379                 if !i.rb.insert(i.rb.src, i.p, i.info) {
 380                         break
 381                 }
 382                 multi = multi || i.info.multiSegment()
 383                 if i.p += int(i.info.size); i.p >= i.rb.nsrc {
 384                         i.setDone()
 385                         break
 386                 }
 387                 i.info = i.rb.f.info(i.rb.src, i.p)
 388                 if i.info.BoundaryBefore() {
 389                         break
 390                 }
 391         }
 392         i.rb.compose()
 393         seg := i.buf[:i.rb.flushCopy(i.buf[:])]
 394         if multi {
 395                 i.p-- // fake not being done yet
 396                 i.multiSeg = seg
 397                 i.next = nextMultiNorm
 398                 return nextMultiNorm(i)
 399         }
 400         return seg
 401 }