libgo/go/bytes/bytes.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package bytes implements functions for the manipulation of byte slices.
   6 // It is analogous to the facilities of the strings package.
   7 package bytes
   8
   9 import (
  10         "internal/bytealg"
  11         "unicode"
  12         "unicode/utf8"
  13 )
  14
  15 // Equal reports whether a and b
  16 // are the same length and contain the same bytes.
  17 // A nil argument is equivalent to an empty slice.
  18 func Equal(a, b []byte) bool {
  19         // Neither cmd/compile nor gccgo allocates for these string conversions.
  20         return string(a) == string(b)
  21 }
  22
  23 // Compare returns an integer comparing two byte slices lexicographically.
  24 // The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
  25 // A nil argument is equivalent to an empty slice.
  26 func Compare(a, b []byte) int {
  27         return bytealg.Compare(a, b)
  28 }
  29
  30 // explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
  31 // up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
  32 func explode(s []byte, n int) [][]byte {
  33         if n <= 0 {
  34                 n = len(s)
  35         }
  36         a := make([][]byte, n)
  37         var size int
  38         na := 0
  39         for len(s) > 0 {
  40                 if na+1 >= n {
  41                         a[na] = s
  42                         na++
  43                         break
  44                 }
  45                 _, size = utf8.DecodeRune(s)
  46                 a[na] = s[0:size:size]
  47                 s = s[size:]
  48                 na++
  49         }
  50         return a[0:na]
  51 }
  52
  53 // Count counts the number of non-overlapping instances of sep in s.
  54 // If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
  55 func Count(s, sep []byte) int {
  56         // special case
  57         if len(sep) == 0 {
  58                 return utf8.RuneCount(s) + 1
  59         }
  60         if len(sep) == 1 {
  61                 return bytealg.Count(s, sep[0])
  62         }
  63         n := 0
  64         for {
  65                 i := Index(s, sep)
  66                 if i == -1 {
  67                         return n
  68                 }
  69                 n++
  70                 s = s[i+len(sep):]
  71         }
  72 }
  73
  74 // Contains reports whether subslice is within b.
  75 func Contains(b, subslice []byte) bool {
  76         return Index(b, subslice) != -1
  77 }
  78
  79 // ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
  80 func ContainsAny(b []byte, chars string) bool {
  81         return IndexAny(b, chars) >= 0
  82 }
  83
  84 // ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
  85 func ContainsRune(b []byte, r rune) bool {
  86         return IndexRune(b, r) >= 0
  87 }
  88
  89 // IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b.
  90 func IndexByte(b []byte, c byte) int {
  91         return bytealg.IndexByte(b, c)
  92 }
  93
  94 func indexBytePortable(s []byte, c byte) int {
  95         for i, b := range s {
  96                 if b == c {
  97                         return i
  98                 }
  99         }
 100         return -1
 101 }
 102
 103 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
 104 func LastIndex(s, sep []byte) int {
 105         n := len(sep)
 106         switch {
 107         case n == 0:
 108                 return len(s)
 109         case n == 1:
 110                 return LastIndexByte(s, sep[0])
 111         case n == len(s):
 112                 if Equal(s, sep) {
 113                         return 0
 114                 }
 115                 return -1
 116         case n > len(s):
 117                 return -1
 118         }
 119         // Rabin-Karp search from the end of the string
 120         hashss, pow := bytealg.HashStrRevBytes(sep)
 121         last := len(s) - n
 122         var h uint32
 123         for i := len(s) - 1; i >= last; i-- {
 124                 h = h*bytealg.PrimeRK + uint32(s[i])
 125         }
 126         if h == hashss && Equal(s[last:], sep) {
 127                 return last
 128         }
 129         for i := last - 1; i >= 0; i-- {
 130                 h *= bytealg.PrimeRK
 131                 h += uint32(s[i])
 132                 h -= pow * uint32(s[i+n])
 133                 if h == hashss && Equal(s[i:i+n], sep) {
 134                         return i
 135                 }
 136         }
 137         return -1
 138 }
 139
 140 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
 141 func LastIndexByte(s []byte, c byte) int {
 142         for i := len(s) - 1; i >= 0; i-- {
 143                 if s[i] == c {
 144                         return i
 145                 }
 146         }
 147         return -1
 148 }
 149
 150 // IndexRune interprets s as a sequence of UTF-8-encoded code points.
 151 // It returns the byte index of the first occurrence in s of the given rune.
 152 // It returns -1 if rune is not present in s.
 153 // If r is utf8.RuneError, it returns the first instance of any
 154 // invalid UTF-8 byte sequence.
 155 func IndexRune(s []byte, r rune) int {
 156         switch {
 157         case 0 <= r && r < utf8.RuneSelf:
 158                 return IndexByte(s, byte(r))
 159         case r == utf8.RuneError:
 160                 for i := 0; i < len(s); {
 161                         r1, n := utf8.DecodeRune(s[i:])
 162                         if r1 == utf8.RuneError {
 163                                 return i
 164                         }
 165                         i += n
 166                 }
 167                 return -1
 168         case !utf8.ValidRune(r):
 169                 return -1
 170         default:
 171                 var b [utf8.UTFMax]byte
 172                 n := utf8.EncodeRune(b[:], r)
 173                 return Index(s, b[:n])
 174         }
 175 }
 176
 177 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
 178 // It returns the byte index of the first occurrence in s of any of the Unicode
 179 // code points in chars. It returns -1 if chars is empty or if there is no code
 180 // point in common.
 181 func IndexAny(s []byte, chars string) int {
 182         if chars == "" {
 183                 // Avoid scanning all of s.
 184                 return -1
 185         }
 186         if len(s) == 1 {
 187                 r := rune(s[0])
 188                 if r >= utf8.RuneSelf {
 189                         // search utf8.RuneError.
 190                         for _, r = range chars {
 191                                 if r == utf8.RuneError {
 192                                         return 0
 193                                 }
 194                         }
 195                         return -1
 196                 }
 197                 if bytealg.IndexByteString(chars, s[0]) >= 0 {
 198                         return 0
 199                 }
 200                 return -1
 201         }
 202         if len(chars) == 1 {
 203                 r := rune(chars[0])
 204                 if r >= utf8.RuneSelf {
 205                         r = utf8.RuneError
 206                 }
 207                 return IndexRune(s, r)
 208         }
 209         if len(s) > 8 {
 210                 if as, isASCII := makeASCIISet(chars); isASCII {
 211                         for i, c := range s {
 212                                 if as.contains(c) {
 213                                         return i
 214                                 }
 215                         }
 216                         return -1
 217                 }
 218         }
 219         var width int
 220         for i := 0; i < len(s); i += width {
 221                 r := rune(s[i])
 222                 if r < utf8.RuneSelf {
 223                         if bytealg.IndexByteString(chars, s[i]) >= 0 {
 224                                 return i
 225                         }
 226                         width = 1
 227                         continue
 228                 }
 229                 r, width = utf8.DecodeRune(s[i:])
 230                 if r == utf8.RuneError {
 231                         for _, r = range chars {
 232                                 if r == utf8.RuneError {
 233                                         return i
 234                                 }
 235                         }
 236                         continue
 237                 }
 238                 // r is 2 to 4 bytes. Using strings.Index is more reasonable, but as the bytes
 239                 // package should not import the strings package, use bytealg.IndexString
 240                 // instead. And this does not seem to lose much performance.
 241                 if chars == string(r) || bytealg.IndexString(chars, string(r)) >= 0 {
 242                         return i
 243                 }
 244         }
 245         return -1
 246 }
 247
 248 // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
 249 // points. It returns the byte index of the last occurrence in s of any of
 250 // the Unicode code points in chars. It returns -1 if chars is empty or if
 251 // there is no code point in common.
 252 func LastIndexAny(s []byte, chars string) int {
 253         if chars == "" {
 254                 // Avoid scanning all of s.
 255                 return -1
 256         }
 257         if len(s) > 8 {
 258                 if as, isASCII := makeASCIISet(chars); isASCII {
 259                         for i := len(s) - 1; i >= 0; i-- {
 260                                 if as.contains(s[i]) {
 261                                         return i
 262                                 }
 263                         }
 264                         return -1
 265                 }
 266         }
 267         if len(s) == 1 {
 268                 r := rune(s[0])
 269                 if r >= utf8.RuneSelf {
 270                         for _, r = range chars {
 271                                 if r == utf8.RuneError {
 272                                         return 0
 273                                 }
 274                         }
 275                         return -1
 276                 }
 277                 if bytealg.IndexByteString(chars, s[0]) >= 0 {
 278                         return 0
 279                 }
 280                 return -1
 281         }
 282         if len(chars) == 1 {
 283                 cr := rune(chars[0])
 284                 if cr >= utf8.RuneSelf {
 285                         cr = utf8.RuneError
 286                 }
 287                 for i := len(s); i > 0; {
 288                         r, size := utf8.DecodeLastRune(s[:i])
 289                         i -= size
 290                         if r == cr {
 291                                 return i
 292                         }
 293                 }
 294                 return -1
 295         }
 296         for i := len(s); i > 0; {
 297                 r := rune(s[i-1])
 298                 if r < utf8.RuneSelf {
 299                         if bytealg.IndexByteString(chars, s[i-1]) >= 0 {
 300                                 return i - 1
 301                         }
 302                         i--
 303                         continue
 304                 }
 305                 r, size := utf8.DecodeLastRune(s[:i])
 306                 i -= size
 307                 if r == utf8.RuneError {
 308                         for _, r = range chars {
 309                                 if r == utf8.RuneError {
 310                                         return i
 311                                 }
 312                         }
 313                         continue
 314                 }
 315                 // r is 2 to 4 bytes. Using strings.Index is more reasonable, but as the bytes
 316                 // package should not import the strings package, use bytealg.IndexString
 317                 // instead. And this does not seem to lose much performance.
 318                 if chars == string(r) || bytealg.IndexString(chars, string(r)) >= 0 {
 319                         return i
 320                 }
 321         }
 322         return -1
 323 }
 324
 325 // Generic split: splits after each instance of sep,
 326 // including sepSave bytes of sep in the subslices.
 327 func genSplit(s, sep []byte, sepSave, n int) [][]byte {
 328         if n == 0 {
 329                 return nil
 330         }
 331         if len(sep) == 0 {
 332                 return explode(s, n)
 333         }
 334         if n < 0 {
 335                 n = Count(s, sep) + 1
 336         }
 337
 338         a := make([][]byte, n)
 339         n--
 340         i := 0
 341         for i < n {
 342                 m := Index(s, sep)
 343                 if m < 0 {
 344                         break
 345                 }
 346                 a[i] = s[: m+sepSave : m+sepSave]
 347                 s = s[m+len(sep):]
 348                 i++
 349         }
 350         a[i] = s
 351         return a[:i+1]
 352 }
 353
 354 // SplitN slices s into subslices separated by sep and returns a slice of
 355 // the subslices between those separators.
 356 // If sep is empty, SplitN splits after each UTF-8 sequence.
 357 // The count determines the number of subslices to return:
 358 //   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
 359 //   n == 0: the result is nil (zero subslices)
 360 //   n < 0: all subslices
 361 func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
 362
 363 // SplitAfterN slices s into subslices after each instance of sep and
 364 // returns a slice of those subslices.
 365 // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
 366 // The count determines the number of subslices to return:
 367 //   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
 368 //   n == 0: the result is nil (zero subslices)
 369 //   n < 0: all subslices
 370 func SplitAfterN(s, sep []byte, n int) [][]byte {
 371         return genSplit(s, sep, len(sep), n)
 372 }
 373
 374 // Split slices s into all subslices separated by sep and returns a slice of
 375 // the subslices between those separators.
 376 // If sep is empty, Split splits after each UTF-8 sequence.
 377 // It is equivalent to SplitN with a count of -1.
 378 func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
 379
 380 // SplitAfter slices s into all subslices after each instance of sep and
 381 // returns a slice of those subslices.
 382 // If sep is empty, SplitAfter splits after each UTF-8 sequence.
 383 // It is equivalent to SplitAfterN with a count of -1.
 384 func SplitAfter(s, sep []byte) [][]byte {
 385         return genSplit(s, sep, len(sep), -1)
 386 }
 387
 388 var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
 389
 390 // Fields interprets s as a sequence of UTF-8-encoded code points.
 391 // It splits the slice s around each instance of one or more consecutive white space
 392 // characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
 393 // empty slice if s contains only white space.
 394 func Fields(s []byte) [][]byte {
 395         // First count the fields.
 396         // This is an exact count if s is ASCII, otherwise it is an approximation.
 397         n := 0
 398         wasSpace := 1
 399         // setBits is used to track which bits are set in the bytes of s.
 400         setBits := uint8(0)
 401         for i := 0; i < len(s); i++ {
 402                 r := s[i]
 403                 setBits |= r
 404                 isSpace := int(asciiSpace[r])
 405                 n += wasSpace & ^isSpace
 406                 wasSpace = isSpace
 407         }
 408
 409         if setBits >= utf8.RuneSelf {
 410                 // Some runes in the input slice are not ASCII.
 411                 return FieldsFunc(s, unicode.IsSpace)
 412         }
 413
 414         // ASCII fast path
 415         a := make([][]byte, n)
 416         na := 0
 417         fieldStart := 0
 418         i := 0
 419         // Skip spaces in the front of the input.
 420         for i < len(s) && asciiSpace[s[i]] != 0 {
 421                 i++
 422         }
 423         fieldStart = i
 424         for i < len(s) {
 425                 if asciiSpace[s[i]] == 0 {
 426                         i++
 427                         continue
 428                 }
 429                 a[na] = s[fieldStart:i:i]
 430                 na++
 431                 i++
 432                 // Skip spaces in between fields.
 433                 for i < len(s) && asciiSpace[s[i]] != 0 {
 434                         i++
 435                 }
 436                 fieldStart = i
 437         }
 438         if fieldStart < len(s) { // Last field might end at EOF.
 439                 a[na] = s[fieldStart:len(s):len(s)]
 440         }
 441         return a
 442 }
 443
 444 // FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
 445 // It splits the slice s at each run of code points c satisfying f(c) and
 446 // returns a slice of subslices of s. If all code points in s satisfy f(c), or
 447 // len(s) == 0, an empty slice is returned.
 448 //
 449 // FieldsFunc makes no guarantees about the order in which it calls f(c)
 450 // and assumes that f always returns the same value for a given c.
 451 func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
 452         // A span is used to record a slice of s of the form s[start:end].
 453         // The start index is inclusive and the end index is exclusive.
 454         type span struct {
 455                 start int
 456                 end   int
 457         }
 458         spans := make([]span, 0, 32)
 459
 460         // Find the field start and end indices.
 461         // Doing this in a separate pass (rather than slicing the string s
 462         // and collecting the result substrings right away) is significantly
 463         // more efficient, possibly due to cache effects.
 464         start := -1 // valid span start if >= 0
 465         for i := 0; i < len(s); {
 466                 size := 1
 467                 r := rune(s[i])
 468                 if r >= utf8.RuneSelf {
 469                         r, size = utf8.DecodeRune(s[i:])
 470                 }
 471                 if f(r) {
 472                         if start >= 0 {
 473                                 spans = append(spans, span{start, i})
 474                                 start = -1
 475                         }
 476                 } else {
 477                         if start < 0 {
 478                                 start = i
 479                         }
 480                 }
 481                 i += size
 482         }
 483
 484         // Last field might end at EOF.
 485         if start >= 0 {
 486                 spans = append(spans, span{start, len(s)})
 487         }
 488
 489         // Create subslices from recorded field indices.
 490         a := make([][]byte, len(spans))
 491         for i, span := range spans {
 492                 a[i] = s[span.start:span.end:span.end]
 493         }
 494
 495         return a
 496 }
 497
 498 // Join concatenates the elements of s to create a new byte slice. The separator
 499 // sep is placed between elements in the resulting slice.
 500 func Join(s [][]byte, sep []byte) []byte {
 501         if len(s) == 0 {
 502                 return []byte{}
 503         }
 504         if len(s) == 1 {
 505                 // Just return a copy.
 506                 return append([]byte(nil), s[0]...)
 507         }
 508         n := len(sep) * (len(s) - 1)
 509         for _, v := range s {
 510                 n += len(v)
 511         }
 512
 513         b := make([]byte, n)
 514         bp := copy(b, s[0])
 515         for _, v := range s[1:] {
 516                 bp += copy(b[bp:], sep)
 517                 bp += copy(b[bp:], v)
 518         }
 519         return b
 520 }
 521
 522 // HasPrefix tests whether the byte slice s begins with prefix.
 523 func HasPrefix(s, prefix []byte) bool {
 524         return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
 525 }
 526
 527 // HasSuffix tests whether the byte slice s ends with suffix.
 528 func HasSuffix(s, suffix []byte) bool {
 529         return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
 530 }
 531
 532 // Map returns a copy of the byte slice s with all its characters modified
 533 // according to the mapping function. If mapping returns a negative value, the character is
 534 // dropped from the byte slice with no replacement. The characters in s and the
 535 // output are interpreted as UTF-8-encoded code points.
 536 func Map(mapping func(r rune) rune, s []byte) []byte {
 537         // In the worst case, the slice can grow when mapped, making
 538         // things unpleasant. But it's so rare we barge in assuming it's
 539         // fine. It could also shrink but that falls out naturally.
 540         maxbytes := len(s) // length of b
 541         nbytes := 0        // number of bytes encoded in b
 542         b := make([]byte, maxbytes)
 543         for i := 0; i < len(s); {
 544                 wid := 1
 545                 r := rune(s[i])
 546                 if r >= utf8.RuneSelf {
 547                         r, wid = utf8.DecodeRune(s[i:])
 548                 }
 549                 r = mapping(r)
 550                 if r >= 0 {
 551                         rl := utf8.RuneLen(r)
 552                         if rl < 0 {
 553                                 rl = len(string(utf8.RuneError))
 554                         }
 555                         if nbytes+rl > maxbytes {
 556                                 // Grow the buffer.
 557                                 maxbytes = maxbytes*2 + utf8.UTFMax
 558                                 nb := make([]byte, maxbytes)
 559                                 copy(nb, b[0:nbytes])
 560                                 b = nb
 561                         }
 562                         nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
 563                 }
 564                 i += wid
 565         }
 566         return b[0:nbytes]
 567 }
 568
 569 // Repeat returns a new byte slice consisting of count copies of b.
 570 //
 571 // It panics if count is negative or if
 572 // the result of (len(b) * count) overflows.
 573 func Repeat(b []byte, count int) []byte {
 574         if count == 0 {
 575                 return []byte{}
 576         }
 577         // Since we cannot return an error on overflow,
 578         // we should panic if the repeat will generate
 579         // an overflow.
 580         // See Issue golang.org/issue/16237.
 581         if count < 0 {
 582                 panic("bytes: negative Repeat count")
 583         } else if len(b)*count/count != len(b) {
 584                 panic("bytes: Repeat count causes overflow")
 585         }
 586
 587         nb := make([]byte, len(b)*count)
 588         bp := copy(nb, b)
 589         for bp < len(nb) {
 590                 copy(nb[bp:], nb[:bp])
 591                 bp *= 2
 592         }
 593         return nb
 594 }
 595
 596 // ToUpper returns a copy of the byte slice s with all Unicode letters mapped to
 597 // their upper case.
 598 func ToUpper(s []byte) []byte {
 599         isASCII, hasLower := true, false
 600         for i := 0; i < len(s); i++ {
 601                 c := s[i]
 602                 if c >= utf8.RuneSelf {
 603                         isASCII = false
 604                         break
 605                 }
 606                 hasLower = hasLower || ('a' <= c && c <= 'z')
 607         }
 608
 609         if isASCII { // optimize for ASCII-only byte slices.
 610                 if !hasLower {
 611                         // Just return a copy.
 612                         return append([]byte(""), s...)
 613                 }
 614                 b := make([]byte, len(s))
 615                 for i := 0; i < len(s); i++ {
 616                         c := s[i]
 617                         if 'a' <= c && c <= 'z' {
 618                                 c -= 'a' - 'A'
 619                         }
 620                         b[i] = c
 621                 }
 622                 return b
 623         }
 624         return Map(unicode.ToUpper, s)
 625 }
 626
 627 // ToLower returns a copy of the byte slice s with all Unicode letters mapped to
 628 // their lower case.
 629 func ToLower(s []byte) []byte {
 630         isASCII, hasUpper := true, false
 631         for i := 0; i < len(s); i++ {
 632                 c := s[i]
 633                 if c >= utf8.RuneSelf {
 634                         isASCII = false
 635                         break
 636                 }
 637                 hasUpper = hasUpper || ('A' <= c && c <= 'Z')
 638         }
 639
 640         if isASCII { // optimize for ASCII-only byte slices.
 641                 if !hasUpper {
 642                         return append([]byte(""), s...)
 643                 }
 644                 b := make([]byte, len(s))
 645                 for i := 0; i < len(s); i++ {
 646                         c := s[i]
 647                         if 'A' <= c && c <= 'Z' {
 648                                 c += 'a' - 'A'
 649                         }
 650                         b[i] = c
 651                 }
 652                 return b
 653         }
 654         return Map(unicode.ToLower, s)
 655 }
 656
 657 // ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
 658 func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
 659
 660 // ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
 661 // upper case, giving priority to the special casing rules.
 662 func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
 663         return Map(c.ToUpper, s)
 664 }
 665
 666 // ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
 667 // lower case, giving priority to the special casing rules.
 668 func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
 669         return Map(c.ToLower, s)
 670 }
 671
 672 // ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
 673 // title case, giving priority to the special casing rules.
 674 func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
 675         return Map(c.ToTitle, s)
 676 }
 677
 678 // ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes
 679 // representing invalid UTF-8 replaced with the bytes in replacement, which may be empty.
 680 func ToValidUTF8(s, replacement []byte) []byte {
 681         b := make([]byte, 0, len(s)+len(replacement))
 682         invalid := false // previous byte was from an invalid UTF-8 sequence
 683         for i := 0; i < len(s); {
 684                 c := s[i]
 685                 if c < utf8.RuneSelf {
 686                         i++
 687                         invalid = false
 688                         b = append(b, byte(c))
 689                         continue
 690                 }
 691                 _, wid := utf8.DecodeRune(s[i:])
 692                 if wid == 1 {
 693                         i++
 694                         if !invalid {
 695                                 invalid = true
 696                                 b = append(b, replacement...)
 697                         }
 698                         continue
 699                 }
 700                 invalid = false
 701                 b = append(b, s[i:i+wid]...)
 702                 i += wid
 703         }
 704         return b
 705 }
 706
 707 // isSeparator reports whether the rune could mark a word boundary.
 708 // TODO: update when package unicode captures more of the properties.
 709 func isSeparator(r rune) bool {
 710         // ASCII alphanumerics and underscore are not separators
 711         if r <= 0x7F {
 712                 switch {
 713                 case '0' <= r && r <= '9':
 714                         return false
 715                 case 'a' <= r && r <= 'z':
 716                         return false
 717                 case 'A' <= r && r <= 'Z':
 718                         return false
 719                 case r == '_':
 720                         return false
 721                 }
 722                 return true
 723         }
 724         // Letters and digits are not separators
 725         if unicode.IsLetter(r) || unicode.IsDigit(r) {
 726                 return false
 727         }
 728         // Otherwise, all we can do for now is treat spaces as separators.
 729         return unicode.IsSpace(r)
 730 }
 731
 732 // Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
 733 // words mapped to their title case.
 734 //
 735 // BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
 736 func Title(s []byte) []byte {
 737         // Use a closure here to remember state.
 738         // Hackish but effective. Depends on Map scanning in order and calling
 739         // the closure once per rune.
 740         prev := ' '
 741         return Map(
 742                 func(r rune) rune {
 743                         if isSeparator(prev) {
 744                                 prev = r
 745                                 return unicode.ToTitle(r)
 746                         }
 747                         prev = r
 748                         return r
 749                 },
 750                 s)
 751 }
 752
 753 // TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
 754 // all leading UTF-8-encoded code points c that satisfy f(c).
 755 func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
 756         i := indexFunc(s, f, false)
 757         if i == -1 {
 758                 return nil
 759         }
 760         return s[i:]
 761 }
 762
 763 // TrimRightFunc returns a subslice of s by slicing off all trailing
 764 // UTF-8-encoded code points c that satisfy f(c).
 765 func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
 766         i := lastIndexFunc(s, f, false)
 767         if i >= 0 && s[i] >= utf8.RuneSelf {
 768                 _, wid := utf8.DecodeRune(s[i:])
 769                 i += wid
 770         } else {
 771                 i++
 772         }
 773         return s[0:i]
 774 }
 775
 776 // TrimFunc returns a subslice of s by slicing off all leading and trailing
 777 // UTF-8-encoded code points c that satisfy f(c).
 778 func TrimFunc(s []byte, f func(r rune) bool) []byte {
 779         return TrimRightFunc(TrimLeftFunc(s, f), f)
 780 }
 781
 782 // TrimPrefix returns s without the provided leading prefix string.
 783 // If s doesn't start with prefix, s is returned unchanged.
 784 func TrimPrefix(s, prefix []byte) []byte {
 785         if HasPrefix(s, prefix) {
 786                 return s[len(prefix):]
 787         }
 788         return s
 789 }
 790
 791 // TrimSuffix returns s without the provided trailing suffix string.
 792 // If s doesn't end with suffix, s is returned unchanged.
 793 func TrimSuffix(s, suffix []byte) []byte {
 794         if HasSuffix(s, suffix) {
 795                 return s[:len(s)-len(suffix)]
 796         }
 797         return s
 798 }
 799
 800 // IndexFunc interprets s as a sequence of UTF-8-encoded code points.
 801 // It returns the byte index in s of the first Unicode
 802 // code point satisfying f(c), or -1 if none do.
 803 func IndexFunc(s []byte, f func(r rune) bool) int {
 804         return indexFunc(s, f, true)
 805 }
 806
 807 // LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
 808 // It returns the byte index in s of the last Unicode
 809 // code point satisfying f(c), or -1 if none do.
 810 func LastIndexFunc(s []byte, f func(r rune) bool) int {
 811         return lastIndexFunc(s, f, true)
 812 }
 813
 814 // indexFunc is the same as IndexFunc except that if
 815 // truth==false, the sense of the predicate function is
 816 // inverted.
 817 func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
 818         start := 0
 819         for start < len(s) {
 820                 wid := 1
 821                 r := rune(s[start])
 822                 if r >= utf8.RuneSelf {
 823                         r, wid = utf8.DecodeRune(s[start:])
 824                 }
 825                 if f(r) == truth {
 826                         return start
 827                 }
 828                 start += wid
 829         }
 830         return -1
 831 }
 832
 833 // lastIndexFunc is the same as LastIndexFunc except that if
 834 // truth==false, the sense of the predicate function is
 835 // inverted.
 836 func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
 837         for i := len(s); i > 0; {
 838                 r, size := rune(s[i-1]), 1
 839                 if r >= utf8.RuneSelf {
 840                         r, size = utf8.DecodeLastRune(s[0:i])
 841                 }
 842                 i -= size
 843                 if f(r) == truth {
 844                         return i
 845                 }
 846         }
 847         return -1
 848 }
 849
 850 // asciiSet is a 32-byte value, where each bit represents the presence of a
 851 // given ASCII character in the set. The 128-bits of the lower 16 bytes,
 852 // starting with the least-significant bit of the lowest word to the
 853 // most-significant bit of the highest word, map to the full range of all
 854 // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
 855 // ensuring that any non-ASCII character will be reported as not in the set.
 856 type asciiSet [8]uint32
 857
 858 // makeASCIISet creates a set of ASCII characters and reports whether all
 859 // characters in chars are ASCII.
 860 func makeASCIISet(chars string) (as asciiSet, ok bool) {
 861         for i := 0; i < len(chars); i++ {
 862                 c := chars[i]
 863                 if c >= utf8.RuneSelf {
 864                         return as, false
 865                 }
 866                 as[c>>5] |= 1 << uint(c&31)
 867         }
 868         return as, true
 869 }
 870
 871 // contains reports whether c is inside the set.
 872 func (as *asciiSet) contains(c byte) bool {
 873         return (as[c>>5] & (1 << uint(c&31))) != 0
 874 }
 875
 876 func makeCutsetFunc(cutset string) func(r rune) bool {
 877         if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
 878                 return func(r rune) bool {
 879                         return r == rune(cutset[0])
 880                 }
 881         }
 882         if as, isASCII := makeASCIISet(cutset); isASCII {
 883                 return func(r rune) bool {
 884                         return r < utf8.RuneSelf && as.contains(byte(r))
 885                 }
 886         }
 887         return func(r rune) bool {
 888                 for _, c := range cutset {
 889                         if c == r {
 890                                 return true
 891                         }
 892                 }
 893                 return false
 894         }
 895 }
 896
 897 // Trim returns a subslice of s by slicing off all leading and
 898 // trailing UTF-8-encoded code points contained in cutset.
 899 func Trim(s []byte, cutset string) []byte {
 900         return TrimFunc(s, makeCutsetFunc(cutset))
 901 }
 902
 903 // TrimLeft returns a subslice of s by slicing off all leading
 904 // UTF-8-encoded code points contained in cutset.
 905 func TrimLeft(s []byte, cutset string) []byte {
 906         return TrimLeftFunc(s, makeCutsetFunc(cutset))
 907 }
 908
 909 // TrimRight returns a subslice of s by slicing off all trailing
 910 // UTF-8-encoded code points that are contained in cutset.
 911 func TrimRight(s []byte, cutset string) []byte {
 912         return TrimRightFunc(s, makeCutsetFunc(cutset))
 913 }
 914
 915 // TrimSpace returns a subslice of s by slicing off all leading and
 916 // trailing white space, as defined by Unicode.
 917 func TrimSpace(s []byte) []byte {
 918         // Fast path for ASCII: look for the first ASCII non-space byte
 919         start := 0
 920         for ; start < len(s); start++ {
 921                 c := s[start]
 922                 if c >= utf8.RuneSelf {
 923                         // If we run into a non-ASCII byte, fall back to the
 924                         // slower unicode-aware method on the remaining bytes
 925                         return TrimFunc(s[start:], unicode.IsSpace)
 926                 }
 927                 if asciiSpace[c] == 0 {
 928                         break
 929                 }
 930         }
 931
 932         // Now look for the first ASCII non-space byte from the end
 933         stop := len(s)
 934         for ; stop > start; stop-- {
 935                 c := s[stop-1]
 936                 if c >= utf8.RuneSelf {
 937                         return TrimFunc(s[start:stop], unicode.IsSpace)
 938                 }
 939                 if asciiSpace[c] == 0 {
 940                         break
 941                 }
 942         }
 943
 944         // At this point s[start:stop] starts and ends with an ASCII
 945         // non-space bytes, so we're done. Non-ASCII cases have already
 946         // been handled above.
 947         if start == stop {
 948                 // Special case to preserve previous TrimLeftFunc behavior,
 949                 // returning nil instead of empty slice if all spaces.
 950                 return nil
 951         }
 952         return s[start:stop]
 953 }
 954
 955 // Runes interprets s as a sequence of UTF-8-encoded code points.
 956 // It returns a slice of runes (Unicode code points) equivalent to s.
 957 func Runes(s []byte) []rune {
 958         t := make([]rune, utf8.RuneCount(s))
 959         i := 0
 960         for len(s) > 0 {
 961                 r, l := utf8.DecodeRune(s)
 962                 t[i] = r
 963                 i++
 964                 s = s[l:]
 965         }
 966         return t
 967 }
 968
 969 // Replace returns a copy of the slice s with the first n
 970 // non-overlapping instances of old replaced by new.
 971 // If old is empty, it matches at the beginning of the slice
 972 // and after each UTF-8 sequence, yielding up to k+1 replacements
 973 // for a k-rune slice.
 974 // If n < 0, there is no limit on the number of replacements.
 975 func Replace(s, old, new []byte, n int) []byte {
 976         m := 0
 977         if n != 0 {
 978                 // Compute number of replacements.
 979                 m = Count(s, old)
 980         }
 981         if m == 0 {
 982                 // Just return a copy.
 983                 return append([]byte(nil), s...)
 984         }
 985         if n < 0 || m < n {
 986                 n = m
 987         }
 988
 989         // Apply replacements to buffer.
 990         t := make([]byte, len(s)+n*(len(new)-len(old)))
 991         w := 0
 992         start := 0
 993         for i := 0; i < n; i++ {
 994                 j := start
 995                 if len(old) == 0 {
 996                         if i > 0 {
 997                                 _, wid := utf8.DecodeRune(s[start:])
 998                                 j += wid
 999                         }
1000                 } else {
1001                         j += Index(s[start:], old)
1002                 }
1003                 w += copy(t[w:], s[start:j])
1004                 w += copy(t[w:], new)
1005                 start = j + len(old)
1006         }
1007         w += copy(t[w:], s[start:])
1008         return t[0:w]
1009 }
1010
1011 // ReplaceAll returns a copy of the slice s with all
1012 // non-overlapping instances of old replaced by new.
1013 // If old is empty, it matches at the beginning of the slice
1014 // and after each UTF-8 sequence, yielding up to k+1 replacements
1015 // for a k-rune slice.
1016 func ReplaceAll(s, old, new []byte) []byte {
1017         return Replace(s, old, new, -1)
1018 }
1019
1020 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
1021 // are equal under Unicode case-folding, which is a more general
1022 // form of case-insensitivity.
1023 func EqualFold(s, t []byte) bool {
1024         for len(s) != 0 && len(t) != 0 {
1025                 // Extract first rune from each.
1026                 var sr, tr rune
1027                 if s[0] < utf8.RuneSelf {
1028                         sr, s = rune(s[0]), s[1:]
1029                 } else {
1030                         r, size := utf8.DecodeRune(s)
1031                         sr, s = r, s[size:]
1032                 }
1033                 if t[0] < utf8.RuneSelf {
1034                         tr, t = rune(t[0]), t[1:]
1035                 } else {
1036                         r, size := utf8.DecodeRune(t)
1037                         tr, t = r, t[size:]
1038                 }
1039
1040                 // If they match, keep going; if not, return false.
1041
1042                 // Easy case.
1043                 if tr == sr {
1044                         continue
1045                 }
1046
1047                 // Make sr < tr to simplify what follows.
1048                 if tr < sr {
1049                         tr, sr = sr, tr
1050                 }
1051                 // Fast check for ASCII.
1052                 if tr < utf8.RuneSelf {
1053                         // ASCII only, sr/tr must be upper/lower case
1054                         if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
1055                                 continue
1056                         }
1057                         return false
1058                 }
1059
1060                 // General case. SimpleFold(x) returns the next equivalent rune > x
1061                 // or wraps around to smaller values.
1062                 r := unicode.SimpleFold(sr)
1063                 for r != sr && r < tr {
1064                         r = unicode.SimpleFold(r)
1065                 }
1066                 if r == tr {
1067                         continue
1068                 }
1069                 return false
1070         }
1071
1072         // One string is empty. Are both?
1073         return len(s) == len(t)
1074 }
1075
1076 // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
1077 func Index(s, sep []byte) int {
1078         n := len(sep)
1079         switch {
1080         case n == 0:
1081                 return 0
1082         case n == 1:
1083                 return IndexByte(s, sep[0])
1084         case n == len(s):
1085                 if Equal(sep, s) {
1086                         return 0
1087                 }
1088                 return -1
1089         case n > len(s):
1090                 return -1
1091         case n <= bytealg.MaxLen:
1092                 // Use brute force when s and sep both are small
1093                 if len(s) <= bytealg.MaxBruteForce {
1094                         return bytealg.Index(s, sep)
1095                 }
1096                 c0 := sep[0]
1097                 c1 := sep[1]
1098                 i := 0
1099                 t := len(s) - n + 1
1100                 fails := 0
1101                 for i < t {
1102                         if s[i] != c0 {
1103                                 // IndexByte is faster than bytealg.Index, so use it as long as
1104                                 // we're not getting lots of false positives.
1105                                 o := IndexByte(s[i+1:t], c0)
1106                                 if o < 0 {
1107                                         return -1
1108                                 }
1109                                 i += o + 1
1110                         }
1111                         if s[i+1] == c1 && Equal(s[i:i+n], sep) {
1112                                 return i
1113                         }
1114                         fails++
1115                         i++
1116                         // Switch to bytealg.Index when IndexByte produces too many false positives.
1117                         if fails > bytealg.Cutover(i) {
1118                                 r := bytealg.Index(s[i:], sep)
1119                                 if r >= 0 {
1120                                         return r + i
1121                                 }
1122                                 return -1
1123                         }
1124                 }
1125                 return -1
1126         }
1127         c0 := sep[0]
1128         c1 := sep[1]
1129         i := 0
1130         fails := 0
1131         t := len(s) - n + 1
1132         for i < t {
1133                 if s[i] != c0 {
1134                         o := IndexByte(s[i+1:t], c0)
1135                         if o < 0 {
1136                                 break
1137                         }
1138                         i += o + 1
1139                 }
1140                 if s[i+1] == c1 && Equal(s[i:i+n], sep) {
1141                         return i
1142                 }
1143                 i++
1144                 fails++
1145                 if fails >= 4+i>>4 && i < t {
1146                         // Give up on IndexByte, it isn't skipping ahead
1147                         // far enough to be better than Rabin-Karp.
1148                         // Experiments (using IndexPeriodic) suggest
1149                         // the cutover is about 16 byte skips.
1150                         // TODO: if large prefixes of sep are matching
1151                         // we should cutover at even larger average skips,
1152                         // because Equal becomes that much more expensive.
1153                         // This code does not take that effect into account.
1154                         j := bytealg.IndexRabinKarpBytes(s[i:], sep)
1155                         if j < 0 {
1156                                 return -1
1157                         }
1158                         return i + j
1159                 }
1160         }
1161         return -1
1162 }