libgo/go/bytes/bytes.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package bytes implements functions for the manipulation of byte slices.
   6 // It is analogous to the facilities of the strings package.
   7 package bytes
   8
   9 import (
  10         "unicode"
  11         "unicode/utf8"
  12 )
  13
  14 func equalPortable(a, b []byte) bool {
  15         if len(a) != len(b) {
  16                 return false
  17         }
  18         for i, c := range a {
  19                 if c != b[i] {
  20                         return false
  21                 }
  22         }
  23         return true
  24 }
  25
  26 // explode splits s into a slice of UTF-8 sequences, one per Unicode character (still slices of bytes),
  27 // up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
  28 func explode(s []byte, n int) [][]byte {
  29         if n <= 0 {
  30                 n = len(s)
  31         }
  32         a := make([][]byte, n)
  33         var size int
  34         na := 0
  35         for len(s) > 0 {
  36                 if na+1 >= n {
  37                         a[na] = s
  38                         na++
  39                         break
  40                 }
  41                 _, size = utf8.DecodeRune(s)
  42                 a[na] = s[0:size]
  43                 s = s[size:]
  44                 na++
  45         }
  46         return a[0:na]
  47 }
  48
  49 // Count counts the number of non-overlapping instances of sep in s.
  50 func Count(s, sep []byte) int {
  51         n := len(sep)
  52         if n == 0 {
  53                 return utf8.RuneCount(s) + 1
  54         }
  55         if n > len(s) {
  56                 return 0
  57         }
  58         count := 0
  59         c := sep[0]
  60         i := 0
  61         t := s[:len(s)-n+1]
  62         for i < len(t) {
  63                 if t[i] != c {
  64                         o := IndexByte(t[i:], c)
  65                         if o < 0 {
  66                                 break
  67                         }
  68                         i += o
  69                 }
  70                 if n == 1 || Equal(s[i:i+n], sep) {
  71                         count++
  72                         i += n
  73                         continue
  74                 }
  75                 i++
  76         }
  77         return count
  78 }
  79
  80 // Contains reports whether subslice is within b.
  81 func Contains(b, subslice []byte) bool {
  82         return Index(b, subslice) != -1
  83 }
  84
  85 // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
  86 func Index(s, sep []byte) int {
  87         n := len(sep)
  88         if n == 0 {
  89                 return 0
  90         }
  91         if n > len(s) {
  92                 return -1
  93         }
  94         c := sep[0]
  95         if n == 1 {
  96                 return IndexByte(s, c)
  97         }
  98         i := 0
  99         t := s[:len(s)-n+1]
 100         for i < len(t) {
 101                 if t[i] != c {
 102                         o := IndexByte(t[i:], c)
 103                         if o < 0 {
 104                                 break
 105                         }
 106                         i += o
 107                 }
 108                 if Equal(s[i:i+n], sep) {
 109                         return i
 110                 }
 111                 i++
 112         }
 113         return -1
 114 }
 115
 116 func indexBytePortable(s []byte, c byte) int {
 117         for i, b := range s {
 118                 if b == c {
 119                         return i
 120                 }
 121         }
 122         return -1
 123 }
 124
 125 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
 126 func LastIndex(s, sep []byte) int {
 127         n := len(sep)
 128         if n == 0 {
 129                 return len(s)
 130         }
 131         c := sep[0]
 132         for i := len(s) - n; i >= 0; i-- {
 133                 if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
 134                         return i
 135                 }
 136         }
 137         return -1
 138 }
 139
 140 // IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points.
 141 // It returns the byte index of the first occurrence in s of the given rune.
 142 // It returns -1 if rune is not present in s.
 143 func IndexRune(s []byte, r rune) int {
 144         for i := 0; i < len(s); {
 145                 r1, size := utf8.DecodeRune(s[i:])
 146                 if r == r1 {
 147                         return i
 148                 }
 149                 i += size
 150         }
 151         return -1
 152 }
 153
 154 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
 155 // It returns the byte index of the first occurrence in s of any of the Unicode
 156 // code points in chars.  It returns -1 if chars is empty or if there is no code
 157 // point in common.
 158 func IndexAny(s []byte, chars string) int {
 159         if len(chars) > 0 {
 160                 var r rune
 161                 var width int
 162                 for i := 0; i < len(s); i += width {
 163                         r = rune(s[i])
 164                         if r < utf8.RuneSelf {
 165                                 width = 1
 166                         } else {
 167                                 r, width = utf8.DecodeRune(s[i:])
 168                         }
 169                         for _, ch := range chars {
 170                                 if r == ch {
 171                                         return i
 172                                 }
 173                         }
 174                 }
 175         }
 176         return -1
 177 }
 178
 179 // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
 180 // points.  It returns the byte index of the last occurrence in s of any of
 181 // the Unicode code points in chars.  It returns -1 if chars is empty or if
 182 // there is no code point in common.
 183 func LastIndexAny(s []byte, chars string) int {
 184         if len(chars) > 0 {
 185                 for i := len(s); i > 0; {
 186                         r, size := utf8.DecodeLastRune(s[0:i])
 187                         i -= size
 188                         for _, ch := range chars {
 189                                 if r == ch {
 190                                         return i
 191                                 }
 192                         }
 193                 }
 194         }
 195         return -1
 196 }
 197
 198 // Generic split: splits after each instance of sep,
 199 // including sepSave bytes of sep in the subslices.
 200 func genSplit(s, sep []byte, sepSave, n int) [][]byte {
 201         if n == 0 {
 202                 return nil
 203         }
 204         if len(sep) == 0 {
 205                 return explode(s, n)
 206         }
 207         if n < 0 {
 208                 n = Count(s, sep) + 1
 209         }
 210         c := sep[0]
 211         start := 0
 212         a := make([][]byte, n)
 213         na := 0
 214         for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
 215                 if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) {
 216                         a[na] = s[start : i+sepSave]
 217                         na++
 218                         start = i + len(sep)
 219                         i += len(sep) - 1
 220                 }
 221         }
 222         a[na] = s[start:]
 223         return a[0 : na+1]
 224 }
 225
 226 // SplitN slices s into subslices separated by sep and returns a slice of
 227 // the subslices between those separators.
 228 // If sep is empty, SplitN splits after each UTF-8 sequence.
 229 // The count determines the number of subslices to return:
 230 //   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
 231 //   n == 0: the result is nil (zero subslices)
 232 //   n < 0: all subslices
 233 func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
 234
 235 // SplitAfterN slices s into subslices after each instance of sep and
 236 // returns a slice of those subslices.
 237 // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
 238 // The count determines the number of subslices to return:
 239 //   n > 0: at most n subslices; the last subslice will be the unsplit remainder.
 240 //   n == 0: the result is nil (zero subslices)
 241 //   n < 0: all subslices
 242 func SplitAfterN(s, sep []byte, n int) [][]byte {
 243         return genSplit(s, sep, len(sep), n)
 244 }
 245
 246 // Split slices s into all subslices separated by sep and returns a slice of
 247 // the subslices between those separators.
 248 // If sep is empty, Split splits after each UTF-8 sequence.
 249 // It is equivalent to SplitN with a count of -1.
 250 func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
 251
 252 // SplitAfter slices s into all subslices after each instance of sep and
 253 // returns a slice of those subslices.
 254 // If sep is empty, SplitAfter splits after each UTF-8 sequence.
 255 // It is equivalent to SplitAfterN with a count of -1.
 256 func SplitAfter(s, sep []byte) [][]byte {
 257         return genSplit(s, sep, len(sep), -1)
 258 }
 259
 260 // Fields splits the slice s around each instance of one or more consecutive white space
 261 // characters, returning a slice of subslices of s or an empty list if s contains only white space.
 262 func Fields(s []byte) [][]byte {
 263         return FieldsFunc(s, unicode.IsSpace)
 264 }
 265
 266 // FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
 267 // It splits the slice s at each run of code points c satisfying f(c) and
 268 // returns a slice of subslices of s.  If all code points in s satisfy f(c), or
 269 // len(s) == 0, an empty slice is returned.
 270 func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
 271         n := 0
 272         inField := false
 273         for i := 0; i < len(s); {
 274                 r, size := utf8.DecodeRune(s[i:])
 275                 wasInField := inField
 276                 inField = !f(r)
 277                 if inField && !wasInField {
 278                         n++
 279                 }
 280                 i += size
 281         }
 282
 283         a := make([][]byte, n)
 284         na := 0
 285         fieldStart := -1
 286         for i := 0; i <= len(s) && na < n; {
 287                 r, size := utf8.DecodeRune(s[i:])
 288                 if fieldStart < 0 && size > 0 && !f(r) {
 289                         fieldStart = i
 290                         i += size
 291                         continue
 292                 }
 293                 if fieldStart >= 0 && (size == 0 || f(r)) {
 294                         a[na] = s[fieldStart:i]
 295                         na++
 296                         fieldStart = -1
 297                 }
 298                 if size == 0 {
 299                         break
 300                 }
 301                 i += size
 302         }
 303         return a[0:na]
 304 }
 305
 306 // Join concatenates the elements of s to create a new byte slice. The separator
 307 // sep is placed between elements in the resulting slice.
 308 func Join(s [][]byte, sep []byte) []byte {
 309         if len(s) == 0 {
 310                 return []byte{}
 311         }
 312         if len(s) == 1 {
 313                 // Just return a copy.
 314                 return append([]byte(nil), s[0]...)
 315         }
 316         n := len(sep) * (len(s) - 1)
 317         for _, v := range s {
 318                 n += len(v)
 319         }
 320
 321         b := make([]byte, n)
 322         bp := copy(b, s[0])
 323         for _, v := range s[1:] {
 324                 bp += copy(b[bp:], sep)
 325                 bp += copy(b[bp:], v)
 326         }
 327         return b
 328 }
 329
 330 // HasPrefix tests whether the byte slice s begins with prefix.
 331 func HasPrefix(s, prefix []byte) bool {
 332         return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
 333 }
 334
 335 // HasSuffix tests whether the byte slice s ends with suffix.
 336 func HasSuffix(s, suffix []byte) bool {
 337         return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
 338 }
 339
 340 // Map returns a copy of the byte slice s with all its characters modified
 341 // according to the mapping function. If mapping returns a negative value, the character is
 342 // dropped from the string with no replacement.  The characters in s and the
 343 // output are interpreted as UTF-8-encoded Unicode code points.
 344 func Map(mapping func(r rune) rune, s []byte) []byte {
 345         // In the worst case, the slice can grow when mapped, making
 346         // things unpleasant.  But it's so rare we barge in assuming it's
 347         // fine.  It could also shrink but that falls out naturally.
 348         maxbytes := len(s) // length of b
 349         nbytes := 0        // number of bytes encoded in b
 350         b := make([]byte, maxbytes)
 351         for i := 0; i < len(s); {
 352                 wid := 1
 353                 r := rune(s[i])
 354                 if r >= utf8.RuneSelf {
 355                         r, wid = utf8.DecodeRune(s[i:])
 356                 }
 357                 r = mapping(r)
 358                 if r >= 0 {
 359                         if nbytes+utf8.RuneLen(r) > maxbytes {
 360                                 // Grow the buffer.
 361                                 maxbytes = maxbytes*2 + utf8.UTFMax
 362                                 nb := make([]byte, maxbytes)
 363                                 copy(nb, b[0:nbytes])
 364                                 b = nb
 365                         }
 366                         nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
 367                 }
 368                 i += wid
 369         }
 370         return b[0:nbytes]
 371 }
 372
 373 // Repeat returns a new byte slice consisting of count copies of b.
 374 func Repeat(b []byte, count int) []byte {
 375         nb := make([]byte, len(b)*count)
 376         bp := 0
 377         for i := 0; i < count; i++ {
 378                 bp += copy(nb[bp:], b)
 379         }
 380         return nb
 381 }
 382
 383 // ToUpper returns a copy of the byte slice s with all Unicode letters mapped to their upper case.
 384 func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) }
 385
 386 // ToLower returns a copy of the byte slice s with all Unicode letters mapped to their lower case.
 387 func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) }
 388
 389 // ToTitle returns a copy of the byte slice s with all Unicode letters mapped to their title case.
 390 func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
 391
 392 // ToUpperSpecial returns a copy of the byte slice s with all Unicode letters mapped to their
 393 // upper case, giving priority to the special casing rules.
 394 func ToUpperSpecial(_case unicode.SpecialCase, s []byte) []byte {
 395         return Map(func(r rune) rune { return _case.ToUpper(r) }, s)
 396 }
 397
 398 // ToLowerSpecial returns a copy of the byte slice s with all Unicode letters mapped to their
 399 // lower case, giving priority to the special casing rules.
 400 func ToLowerSpecial(_case unicode.SpecialCase, s []byte) []byte {
 401         return Map(func(r rune) rune { return _case.ToLower(r) }, s)
 402 }
 403
 404 // ToTitleSpecial returns a copy of the byte slice s with all Unicode letters mapped to their
 405 // title case, giving priority to the special casing rules.
 406 func ToTitleSpecial(_case unicode.SpecialCase, s []byte) []byte {
 407         return Map(func(r rune) rune { return _case.ToTitle(r) }, s)
 408 }
 409
 410 // isSeparator reports whether the rune could mark a word boundary.
 411 // TODO: update when package unicode captures more of the properties.
 412 func isSeparator(r rune) bool {
 413         // ASCII alphanumerics and underscore are not separators
 414         if r <= 0x7F {
 415                 switch {
 416                 case '0' <= r && r <= '9':
 417                         return false
 418                 case 'a' <= r && r <= 'z':
 419                         return false
 420                 case 'A' <= r && r <= 'Z':
 421                         return false
 422                 case r == '_':
 423                         return false
 424                 }
 425                 return true
 426         }
 427         // Letters and digits are not separators
 428         if unicode.IsLetter(r) || unicode.IsDigit(r) {
 429                 return false
 430         }
 431         // Otherwise, all we can do for now is treat spaces as separators.
 432         return unicode.IsSpace(r)
 433 }
 434
 435 // Title returns a copy of s with all Unicode letters that begin words
 436 // mapped to their title case.
 437 //
 438 // BUG: The rule Title uses for word boundaries does not handle Unicode punctuation properly.
 439 func Title(s []byte) []byte {
 440         // Use a closure here to remember state.
 441         // Hackish but effective. Depends on Map scanning in order and calling
 442         // the closure once per rune.
 443         prev := ' '
 444         return Map(
 445                 func(r rune) rune {
 446                         if isSeparator(prev) {
 447                                 prev = r
 448                                 return unicode.ToTitle(r)
 449                         }
 450                         prev = r
 451                         return r
 452                 },
 453                 s)
 454 }
 455
 456 // TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8-encoded
 457 // Unicode code points c that satisfy f(c).
 458 func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
 459         i := indexFunc(s, f, false)
 460         if i == -1 {
 461                 return nil
 462         }
 463         return s[i:]
 464 }
 465
 466 // TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8
 467 // encoded Unicode code points c that satisfy f(c).
 468 func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
 469         i := lastIndexFunc(s, f, false)
 470         if i >= 0 && s[i] >= utf8.RuneSelf {
 471                 _, wid := utf8.DecodeRune(s[i:])
 472                 i += wid
 473         } else {
 474                 i++
 475         }
 476         return s[0:i]
 477 }
 478
 479 // TrimFunc returns a subslice of s by slicing off all leading and trailing
 480 // UTF-8-encoded Unicode code points c that satisfy f(c).
 481 func TrimFunc(s []byte, f func(r rune) bool) []byte {
 482         return TrimRightFunc(TrimLeftFunc(s, f), f)
 483 }
 484
 485 // TrimPrefix returns s without the provided leading prefix string.
 486 // If s doesn't start with prefix, s is returned unchanged.
 487 func TrimPrefix(s, prefix []byte) []byte {
 488         if HasPrefix(s, prefix) {
 489                 return s[len(prefix):]
 490         }
 491         return s
 492 }
 493
 494 // TrimSuffix returns s without the provided trailing suffix string.
 495 // If s doesn't end with suffix, s is returned unchanged.
 496 func TrimSuffix(s, suffix []byte) []byte {
 497         if HasSuffix(s, suffix) {
 498                 return s[:len(s)-len(suffix)]
 499         }
 500         return s
 501 }
 502
 503 // IndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
 504 // It returns the byte index in s of the first Unicode
 505 // code point satisfying f(c), or -1 if none do.
 506 func IndexFunc(s []byte, f func(r rune) bool) int {
 507         return indexFunc(s, f, true)
 508 }
 509
 510 // LastIndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
 511 // It returns the byte index in s of the last Unicode
 512 // code point satisfying f(c), or -1 if none do.
 513 func LastIndexFunc(s []byte, f func(r rune) bool) int {
 514         return lastIndexFunc(s, f, true)
 515 }
 516
 517 // indexFunc is the same as IndexFunc except that if
 518 // truth==false, the sense of the predicate function is
 519 // inverted.
 520 func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
 521         start := 0
 522         for start < len(s) {
 523                 wid := 1
 524                 r := rune(s[start])
 525                 if r >= utf8.RuneSelf {
 526                         r, wid = utf8.DecodeRune(s[start:])
 527                 }
 528                 if f(r) == truth {
 529                         return start
 530                 }
 531                 start += wid
 532         }
 533         return -1
 534 }
 535
 536 // lastIndexFunc is the same as LastIndexFunc except that if
 537 // truth==false, the sense of the predicate function is
 538 // inverted.
 539 func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
 540         for i := len(s); i > 0; {
 541                 r, size := rune(s[i-1]), 1
 542                 if r >= utf8.RuneSelf {
 543                         r, size = utf8.DecodeLastRune(s[0:i])
 544                 }
 545                 i -= size
 546                 if f(r) == truth {
 547                         return i
 548                 }
 549         }
 550         return -1
 551 }
 552
 553 func makeCutsetFunc(cutset string) func(r rune) bool {
 554         return func(r rune) bool {
 555                 for _, c := range cutset {
 556                         if c == r {
 557                                 return true
 558                         }
 559                 }
 560                 return false
 561         }
 562 }
 563
 564 // Trim returns a subslice of s by slicing off all leading and
 565 // trailing UTF-8-encoded Unicode code points contained in cutset.
 566 func Trim(s []byte, cutset string) []byte {
 567         return TrimFunc(s, makeCutsetFunc(cutset))
 568 }
 569
 570 // TrimLeft returns a subslice of s by slicing off all leading
 571 // UTF-8-encoded Unicode code points contained in cutset.
 572 func TrimLeft(s []byte, cutset string) []byte {
 573         return TrimLeftFunc(s, makeCutsetFunc(cutset))
 574 }
 575
 576 // TrimRight returns a subslice of s by slicing off all trailing
 577 // UTF-8-encoded Unicode code points that are contained in cutset.
 578 func TrimRight(s []byte, cutset string) []byte {
 579         return TrimRightFunc(s, makeCutsetFunc(cutset))
 580 }
 581
 582 // TrimSpace returns a subslice of s by slicing off all leading and
 583 // trailing white space, as defined by Unicode.
 584 func TrimSpace(s []byte) []byte {
 585         return TrimFunc(s, unicode.IsSpace)
 586 }
 587
 588 // Runes returns a slice of runes (Unicode code points) equivalent to s.
 589 func Runes(s []byte) []rune {
 590         t := make([]rune, utf8.RuneCount(s))
 591         i := 0
 592         for len(s) > 0 {
 593                 r, l := utf8.DecodeRune(s)
 594                 t[i] = r
 595                 i++
 596                 s = s[l:]
 597         }
 598         return t
 599 }
 600
 601 // Replace returns a copy of the slice s with the first n
 602 // non-overlapping instances of old replaced by new.
 603 // If n < 0, there is no limit on the number of replacements.
 604 func Replace(s, old, new []byte, n int) []byte {
 605         m := 0
 606         if n != 0 {
 607                 // Compute number of replacements.
 608                 m = Count(s, old)
 609         }
 610         if m == 0 {
 611                 // Just return a copy.
 612                 return append([]byte(nil), s...)
 613         }
 614         if n < 0 || m < n {
 615                 n = m
 616         }
 617
 618         // Apply replacements to buffer.
 619         t := make([]byte, len(s)+n*(len(new)-len(old)))
 620         w := 0
 621         start := 0
 622         for i := 0; i < n; i++ {
 623                 j := start
 624                 if len(old) == 0 {
 625                         if i > 0 {
 626                                 _, wid := utf8.DecodeRune(s[start:])
 627                                 j += wid
 628                         }
 629                 } else {
 630                         j += Index(s[start:], old)
 631                 }
 632                 w += copy(t[w:], s[start:j])
 633                 w += copy(t[w:], new)
 634                 start = j + len(old)
 635         }
 636         w += copy(t[w:], s[start:])
 637         return t[0:w]
 638 }
 639
 640 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
 641 // are equal under Unicode case-folding.
 642 func EqualFold(s, t []byte) bool {
 643         for len(s) != 0 && len(t) != 0 {
 644                 // Extract first rune from each.
 645                 var sr, tr rune
 646                 if s[0] < utf8.RuneSelf {
 647                         sr, s = rune(s[0]), s[1:]
 648                 } else {
 649                         r, size := utf8.DecodeRune(s)
 650                         sr, s = r, s[size:]
 651                 }
 652                 if t[0] < utf8.RuneSelf {
 653                         tr, t = rune(t[0]), t[1:]
 654                 } else {
 655                         r, size := utf8.DecodeRune(t)
 656                         tr, t = r, t[size:]
 657                 }
 658
 659                 // If they match, keep going; if not, return false.
 660
 661                 // Easy case.
 662                 if tr == sr {
 663                         continue
 664                 }
 665
 666                 // Make sr < tr to simplify what follows.
 667                 if tr < sr {
 668                         tr, sr = sr, tr
 669                 }
 670                 // Fast check for ASCII.
 671                 if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
 672                         // ASCII, and sr is upper case.  tr must be lower case.
 673                         if tr == sr+'a'-'A' {
 674                                 continue
 675                         }
 676                         return false
 677                 }
 678
 679                 // General case.  SimpleFold(x) returns the next equivalent rune > x
 680                 // or wraps around to smaller values.
 681                 r := unicode.SimpleFold(sr)
 682                 for r != sr && r < tr {
 683                         r = unicode.SimpleFold(r)
 684                 }
 685                 if r == tr {
 686                         continue
 687                 }
 688                 return false
 689         }
 690
 691         // One string is empty.  Are both?
 692         return len(s) == len(t)
 693 }