2017-07-18 François Dumont <fdumont@gcc.gnu.org>
[official-gcc.git] / libgo / go / bytes / bytes.go
blob406a38257a74c420266dc013782fd9c91256686f
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package bytes implements functions for the manipulation of byte slices.
6 // It is analogous to the facilities of the strings package.
7 package bytes
9 import (
10 "unicode"
11 "unicode/utf8"
14 func equalPortable(a, b []byte) bool {
15 if len(a) != len(b) {
16 return false
18 for i, c := range a {
19 if c != b[i] {
20 return false
23 return true
26 // explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
27 // up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
28 func explode(s []byte, n int) [][]byte {
29 if n <= 0 {
30 n = len(s)
32 a := make([][]byte, n)
33 var size int
34 na := 0
35 for len(s) > 0 {
36 if na+1 >= n {
37 a[na] = s
38 na++
39 break
41 _, size = utf8.DecodeRune(s)
42 a[na] = s[0:size]
43 s = s[size:]
44 na++
46 return a[0:na]
49 // Count counts the number of non-overlapping instances of sep in s.
50 // If sep is an empty slice, Count returns 1 + the number of Unicode code points in s.
51 func Count(s, sep []byte) int {
52 n := len(sep)
53 if n == 0 {
54 return utf8.RuneCount(s) + 1
56 if n > len(s) {
57 return 0
59 count := 0
60 c := sep[0]
61 i := 0
62 t := s[:len(s)-n+1]
63 for i < len(t) {
64 if t[i] != c {
65 o := IndexByte(t[i:], c)
66 if o < 0 {
67 break
69 i += o
71 if n == 1 || Equal(s[i:i+n], sep) {
72 count++
73 i += n
74 continue
76 i++
78 return count
81 // Contains reports whether subslice is within b.
82 func Contains(b, subslice []byte) bool {
83 return Index(b, subslice) != -1
86 // ContainsAny reports whether any of the UTF-8-encoded Unicode code points in chars are within b.
87 func ContainsAny(b []byte, chars string) bool {
88 return IndexAny(b, chars) >= 0
91 // ContainsRune reports whether the Unicode code point r is within b.
92 func ContainsRune(b []byte, r rune) bool {
93 return IndexRune(b, r) >= 0
96 func indexBytePortable(s []byte, c byte) int {
97 for i, b := range s {
98 if b == c {
99 return i
102 return -1
105 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
106 func LastIndex(s, sep []byte) int {
107 n := len(sep)
108 if n == 0 {
109 return len(s)
111 c := sep[0]
112 for i := len(s) - n; i >= 0; i-- {
113 if s[i] == c && (n == 1 || Equal(s[i:i+n], sep)) {
114 return i
117 return -1
120 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
121 func LastIndexByte(s []byte, c byte) int {
122 for i := len(s) - 1; i >= 0; i-- {
123 if s[i] == c {
124 return i
127 return -1
130 // IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points.
131 // It returns the byte index of the first occurrence in s of the given rune.
132 // It returns -1 if rune is not present in s.
133 // If r is utf8.RuneError, it returns the first instance of any
134 // invalid UTF-8 byte sequence.
135 func IndexRune(s []byte, r rune) int {
136 switch {
137 case 0 <= r && r < utf8.RuneSelf:
138 return IndexByte(s, byte(r))
139 case r == utf8.RuneError:
140 for i := 0; i < len(s); {
141 r1, n := utf8.DecodeRune(s[i:])
142 if r1 == utf8.RuneError {
143 return i
145 i += n
147 return -1
148 case !utf8.ValidRune(r):
149 return -1
150 default:
151 var b [utf8.UTFMax]byte
152 n := utf8.EncodeRune(b[:], r)
153 return Index(s, b[:n])
157 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
158 // It returns the byte index of the first occurrence in s of any of the Unicode
159 // code points in chars. It returns -1 if chars is empty or if there is no code
160 // point in common.
161 func IndexAny(s []byte, chars string) int {
162 if len(chars) > 0 {
163 if len(s) > 8 {
164 if as, isASCII := makeASCIISet(chars); isASCII {
165 for i, c := range s {
166 if as.contains(c) {
167 return i
170 return -1
173 var width int
174 for i := 0; i < len(s); i += width {
175 r := rune(s[i])
176 if r < utf8.RuneSelf {
177 width = 1
178 } else {
179 r, width = utf8.DecodeRune(s[i:])
181 for _, ch := range chars {
182 if r == ch {
183 return i
188 return -1
191 // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
192 // points. It returns the byte index of the last occurrence in s of any of
193 // the Unicode code points in chars. It returns -1 if chars is empty or if
194 // there is no code point in common.
195 func LastIndexAny(s []byte, chars string) int {
196 if len(chars) > 0 {
197 if len(s) > 8 {
198 if as, isASCII := makeASCIISet(chars); isASCII {
199 for i := len(s) - 1; i >= 0; i-- {
200 if as.contains(s[i]) {
201 return i
204 return -1
207 for i := len(s); i > 0; {
208 r, size := utf8.DecodeLastRune(s[:i])
209 i -= size
210 for _, c := range chars {
211 if r == c {
212 return i
217 return -1
220 // Generic split: splits after each instance of sep,
221 // including sepSave bytes of sep in the subslices.
222 func genSplit(s, sep []byte, sepSave, n int) [][]byte {
223 if n == 0 {
224 return nil
226 if len(sep) == 0 {
227 return explode(s, n)
229 if n < 0 {
230 n = Count(s, sep) + 1
232 c := sep[0]
233 start := 0
234 a := make([][]byte, n)
235 na := 0
236 for i := 0; i+len(sep) <= len(s) && na+1 < n; i++ {
237 if s[i] == c && (len(sep) == 1 || Equal(s[i:i+len(sep)], sep)) {
238 a[na] = s[start : i+sepSave]
239 na++
240 start = i + len(sep)
241 i += len(sep) - 1
244 a[na] = s[start:]
245 return a[0 : na+1]
248 // SplitN slices s into subslices separated by sep and returns a slice of
249 // the subslices between those separators.
250 // If sep is empty, SplitN splits after each UTF-8 sequence.
251 // The count determines the number of subslices to return:
252 // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
253 // n == 0: the result is nil (zero subslices)
254 // n < 0: all subslices
255 func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
257 // SplitAfterN slices s into subslices after each instance of sep and
258 // returns a slice of those subslices.
259 // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
260 // The count determines the number of subslices to return:
261 // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
262 // n == 0: the result is nil (zero subslices)
263 // n < 0: all subslices
264 func SplitAfterN(s, sep []byte, n int) [][]byte {
265 return genSplit(s, sep, len(sep), n)
268 // Split slices s into all subslices separated by sep and returns a slice of
269 // the subslices between those separators.
270 // If sep is empty, Split splits after each UTF-8 sequence.
271 // It is equivalent to SplitN with a count of -1.
272 func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
274 // SplitAfter slices s into all subslices after each instance of sep and
275 // returns a slice of those subslices.
276 // If sep is empty, SplitAfter splits after each UTF-8 sequence.
277 // It is equivalent to SplitAfterN with a count of -1.
278 func SplitAfter(s, sep []byte) [][]byte {
279 return genSplit(s, sep, len(sep), -1)
282 // Fields splits the slice s around each instance of one or more consecutive white space
283 // characters, returning a slice of subslices of s or an empty list if s contains only white space.
284 func Fields(s []byte) [][]byte {
285 return FieldsFunc(s, unicode.IsSpace)
288 // FieldsFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
289 // It splits the slice s at each run of code points c satisfying f(c) and
290 // returns a slice of subslices of s. If all code points in s satisfy f(c), or
291 // len(s) == 0, an empty slice is returned.
292 // FieldsFunc makes no guarantees about the order in which it calls f(c).
293 // If f does not return consistent results for a given c, FieldsFunc may crash.
294 func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
295 n := 0
296 inField := false
297 for i := 0; i < len(s); {
298 r, size := utf8.DecodeRune(s[i:])
299 wasInField := inField
300 inField = !f(r)
301 if inField && !wasInField {
304 i += size
307 a := make([][]byte, n)
308 na := 0
309 fieldStart := -1
310 for i := 0; i <= len(s) && na < n; {
311 r, size := utf8.DecodeRune(s[i:])
312 if fieldStart < 0 && size > 0 && !f(r) {
313 fieldStart = i
314 i += size
315 continue
317 if fieldStart >= 0 && (size == 0 || f(r)) {
318 a[na] = s[fieldStart:i]
319 na++
320 fieldStart = -1
322 if size == 0 {
323 break
325 i += size
327 return a[0:na]
330 // Join concatenates the elements of s to create a new byte slice. The separator
331 // sep is placed between elements in the resulting slice.
332 func Join(s [][]byte, sep []byte) []byte {
333 if len(s) == 0 {
334 return []byte{}
336 if len(s) == 1 {
337 // Just return a copy.
338 return append([]byte(nil), s[0]...)
340 n := len(sep) * (len(s) - 1)
341 for _, v := range s {
342 n += len(v)
345 b := make([]byte, n)
346 bp := copy(b, s[0])
347 for _, v := range s[1:] {
348 bp += copy(b[bp:], sep)
349 bp += copy(b[bp:], v)
351 return b
354 // HasPrefix tests whether the byte slice s begins with prefix.
355 func HasPrefix(s, prefix []byte) bool {
356 return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
359 // HasSuffix tests whether the byte slice s ends with suffix.
360 func HasSuffix(s, suffix []byte) bool {
361 return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
364 // Map returns a copy of the byte slice s with all its characters modified
365 // according to the mapping function. If mapping returns a negative value, the character is
366 // dropped from the string with no replacement. The characters in s and the
367 // output are interpreted as UTF-8-encoded Unicode code points.
368 func Map(mapping func(r rune) rune, s []byte) []byte {
369 // In the worst case, the slice can grow when mapped, making
370 // things unpleasant. But it's so rare we barge in assuming it's
371 // fine. It could also shrink but that falls out naturally.
372 maxbytes := len(s) // length of b
373 nbytes := 0 // number of bytes encoded in b
374 b := make([]byte, maxbytes)
375 for i := 0; i < len(s); {
376 wid := 1
377 r := rune(s[i])
378 if r >= utf8.RuneSelf {
379 r, wid = utf8.DecodeRune(s[i:])
381 r = mapping(r)
382 if r >= 0 {
383 rl := utf8.RuneLen(r)
384 if rl < 0 {
385 rl = len(string(utf8.RuneError))
387 if nbytes+rl > maxbytes {
388 // Grow the buffer.
389 maxbytes = maxbytes*2 + utf8.UTFMax
390 nb := make([]byte, maxbytes)
391 copy(nb, b[0:nbytes])
392 b = nb
394 nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
396 i += wid
398 return b[0:nbytes]
401 // Repeat returns a new byte slice consisting of count copies of b.
403 // It panics if count is negative or if
404 // the result of (len(b) * count) overflows.
405 func Repeat(b []byte, count int) []byte {
406 // Since we cannot return an error on overflow,
407 // we should panic if the repeat will generate
408 // an overflow.
409 // See Issue golang.org/issue/16237.
410 if count < 0 {
411 panic("bytes: negative Repeat count")
412 } else if count > 0 && len(b)*count/count != len(b) {
413 panic("bytes: Repeat count causes overflow")
416 nb := make([]byte, len(b)*count)
417 bp := copy(nb, b)
418 for bp < len(nb) {
419 copy(nb[bp:], nb[:bp])
420 bp *= 2
422 return nb
425 // ToUpper returns a copy of the byte slice s with all Unicode letters mapped to their upper case.
426 func ToUpper(s []byte) []byte { return Map(unicode.ToUpper, s) }
428 // ToLower returns a copy of the byte slice s with all Unicode letters mapped to their lower case.
429 func ToLower(s []byte) []byte { return Map(unicode.ToLower, s) }
431 // ToTitle returns a copy of the byte slice s with all Unicode letters mapped to their title case.
432 func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
434 // ToUpperSpecial returns a copy of the byte slice s with all Unicode letters mapped to their
435 // upper case, giving priority to the special casing rules.
436 func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
437 return Map(func(r rune) rune { return c.ToUpper(r) }, s)
440 // ToLowerSpecial returns a copy of the byte slice s with all Unicode letters mapped to their
441 // lower case, giving priority to the special casing rules.
442 func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
443 return Map(func(r rune) rune { return c.ToLower(r) }, s)
446 // ToTitleSpecial returns a copy of the byte slice s with all Unicode letters mapped to their
447 // title case, giving priority to the special casing rules.
448 func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
449 return Map(func(r rune) rune { return c.ToTitle(r) }, s)
452 // isSeparator reports whether the rune could mark a word boundary.
453 // TODO: update when package unicode captures more of the properties.
454 func isSeparator(r rune) bool {
455 // ASCII alphanumerics and underscore are not separators
456 if r <= 0x7F {
457 switch {
458 case '0' <= r && r <= '9':
459 return false
460 case 'a' <= r && r <= 'z':
461 return false
462 case 'A' <= r && r <= 'Z':
463 return false
464 case r == '_':
465 return false
467 return true
469 // Letters and digits are not separators
470 if unicode.IsLetter(r) || unicode.IsDigit(r) {
471 return false
473 // Otherwise, all we can do for now is treat spaces as separators.
474 return unicode.IsSpace(r)
477 // Title returns a copy of s with all Unicode letters that begin words
478 // mapped to their title case.
480 // BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
481 func Title(s []byte) []byte {
482 // Use a closure here to remember state.
483 // Hackish but effective. Depends on Map scanning in order and calling
484 // the closure once per rune.
485 prev := ' '
486 return Map(
487 func(r rune) rune {
488 if isSeparator(prev) {
489 prev = r
490 return unicode.ToTitle(r)
492 prev = r
493 return r
498 // TrimLeftFunc returns a subslice of s by slicing off all leading UTF-8-encoded
499 // Unicode code points c that satisfy f(c).
500 func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
501 i := indexFunc(s, f, false)
502 if i == -1 {
503 return nil
505 return s[i:]
508 // TrimRightFunc returns a subslice of s by slicing off all trailing UTF-8
509 // encoded Unicode code points c that satisfy f(c).
510 func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
511 i := lastIndexFunc(s, f, false)
512 if i >= 0 && s[i] >= utf8.RuneSelf {
513 _, wid := utf8.DecodeRune(s[i:])
514 i += wid
515 } else {
518 return s[0:i]
521 // TrimFunc returns a subslice of s by slicing off all leading and trailing
522 // UTF-8-encoded Unicode code points c that satisfy f(c).
523 func TrimFunc(s []byte, f func(r rune) bool) []byte {
524 return TrimRightFunc(TrimLeftFunc(s, f), f)
527 // TrimPrefix returns s without the provided leading prefix string.
528 // If s doesn't start with prefix, s is returned unchanged.
529 func TrimPrefix(s, prefix []byte) []byte {
530 if HasPrefix(s, prefix) {
531 return s[len(prefix):]
533 return s
536 // TrimSuffix returns s without the provided trailing suffix string.
537 // If s doesn't end with suffix, s is returned unchanged.
538 func TrimSuffix(s, suffix []byte) []byte {
539 if HasSuffix(s, suffix) {
540 return s[:len(s)-len(suffix)]
542 return s
545 // IndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
546 // It returns the byte index in s of the first Unicode
547 // code point satisfying f(c), or -1 if none do.
548 func IndexFunc(s []byte, f func(r rune) bool) int {
549 return indexFunc(s, f, true)
552 // LastIndexFunc interprets s as a sequence of UTF-8-encoded Unicode code points.
553 // It returns the byte index in s of the last Unicode
554 // code point satisfying f(c), or -1 if none do.
555 func LastIndexFunc(s []byte, f func(r rune) bool) int {
556 return lastIndexFunc(s, f, true)
559 // indexFunc is the same as IndexFunc except that if
560 // truth==false, the sense of the predicate function is
561 // inverted.
562 func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
563 start := 0
564 for start < len(s) {
565 wid := 1
566 r := rune(s[start])
567 if r >= utf8.RuneSelf {
568 r, wid = utf8.DecodeRune(s[start:])
570 if f(r) == truth {
571 return start
573 start += wid
575 return -1
578 // lastIndexFunc is the same as LastIndexFunc except that if
579 // truth==false, the sense of the predicate function is
580 // inverted.
581 func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
582 for i := len(s); i > 0; {
583 r, size := rune(s[i-1]), 1
584 if r >= utf8.RuneSelf {
585 r, size = utf8.DecodeLastRune(s[0:i])
587 i -= size
588 if f(r) == truth {
589 return i
592 return -1
595 // asciiSet is a 32-byte value, where each bit represents the presence of a
596 // given ASCII character in the set. The 128-bits of the lower 16 bytes,
597 // starting with the least-significant bit of the lowest word to the
598 // most-significant bit of the highest word, map to the full range of all
599 // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
600 // ensuring that any non-ASCII character will be reported as not in the set.
601 type asciiSet [8]uint32
603 // makeASCIISet creates a set of ASCII characters and reports whether all
604 // characters in chars are ASCII.
605 func makeASCIISet(chars string) (as asciiSet, ok bool) {
606 for i := 0; i < len(chars); i++ {
607 c := chars[i]
608 if c >= utf8.RuneSelf {
609 return as, false
611 as[c>>5] |= 1 << uint(c&31)
613 return as, true
616 // contains reports whether c is inside the set.
617 func (as *asciiSet) contains(c byte) bool {
618 return (as[c>>5] & (1 << uint(c&31))) != 0
621 func makeCutsetFunc(cutset string) func(r rune) bool {
622 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
623 return func(r rune) bool {
624 return r == rune(cutset[0])
627 if as, isASCII := makeASCIISet(cutset); isASCII {
628 return func(r rune) bool {
629 return r < utf8.RuneSelf && as.contains(byte(r))
632 return func(r rune) bool {
633 for _, c := range cutset {
634 if c == r {
635 return true
638 return false
642 // Trim returns a subslice of s by slicing off all leading and
643 // trailing UTF-8-encoded Unicode code points contained in cutset.
644 func Trim(s []byte, cutset string) []byte {
645 return TrimFunc(s, makeCutsetFunc(cutset))
648 // TrimLeft returns a subslice of s by slicing off all leading
649 // UTF-8-encoded Unicode code points contained in cutset.
650 func TrimLeft(s []byte, cutset string) []byte {
651 return TrimLeftFunc(s, makeCutsetFunc(cutset))
654 // TrimRight returns a subslice of s by slicing off all trailing
655 // UTF-8-encoded Unicode code points that are contained in cutset.
656 func TrimRight(s []byte, cutset string) []byte {
657 return TrimRightFunc(s, makeCutsetFunc(cutset))
660 // TrimSpace returns a subslice of s by slicing off all leading and
661 // trailing white space, as defined by Unicode.
662 func TrimSpace(s []byte) []byte {
663 return TrimFunc(s, unicode.IsSpace)
666 // Runes returns a slice of runes (Unicode code points) equivalent to s.
667 func Runes(s []byte) []rune {
668 t := make([]rune, utf8.RuneCount(s))
669 i := 0
670 for len(s) > 0 {
671 r, l := utf8.DecodeRune(s)
672 t[i] = r
674 s = s[l:]
676 return t
679 // Replace returns a copy of the slice s with the first n
680 // non-overlapping instances of old replaced by new.
681 // If old is empty, it matches at the beginning of the slice
682 // and after each UTF-8 sequence, yielding up to k+1 replacements
683 // for a k-rune slice.
684 // If n < 0, there is no limit on the number of replacements.
685 func Replace(s, old, new []byte, n int) []byte {
686 m := 0
687 if n != 0 {
688 // Compute number of replacements.
689 m = Count(s, old)
691 if m == 0 {
692 // Just return a copy.
693 return append([]byte(nil), s...)
695 if n < 0 || m < n {
696 n = m
699 // Apply replacements to buffer.
700 t := make([]byte, len(s)+n*(len(new)-len(old)))
701 w := 0
702 start := 0
703 for i := 0; i < n; i++ {
704 j := start
705 if len(old) == 0 {
706 if i > 0 {
707 _, wid := utf8.DecodeRune(s[start:])
708 j += wid
710 } else {
711 j += Index(s[start:], old)
713 w += copy(t[w:], s[start:j])
714 w += copy(t[w:], new)
715 start = j + len(old)
717 w += copy(t[w:], s[start:])
718 return t[0:w]
721 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
722 // are equal under Unicode case-folding.
723 func EqualFold(s, t []byte) bool {
724 for len(s) != 0 && len(t) != 0 {
725 // Extract first rune from each.
726 var sr, tr rune
727 if s[0] < utf8.RuneSelf {
728 sr, s = rune(s[0]), s[1:]
729 } else {
730 r, size := utf8.DecodeRune(s)
731 sr, s = r, s[size:]
733 if t[0] < utf8.RuneSelf {
734 tr, t = rune(t[0]), t[1:]
735 } else {
736 r, size := utf8.DecodeRune(t)
737 tr, t = r, t[size:]
740 // If they match, keep going; if not, return false.
742 // Easy case.
743 if tr == sr {
744 continue
747 // Make sr < tr to simplify what follows.
748 if tr < sr {
749 tr, sr = sr, tr
751 // Fast check for ASCII.
752 if tr < utf8.RuneSelf && 'A' <= sr && sr <= 'Z' {
753 // ASCII, and sr is upper case. tr must be lower case.
754 if tr == sr+'a'-'A' {
755 continue
757 return false
760 // General case. SimpleFold(x) returns the next equivalent rune > x
761 // or wraps around to smaller values.
762 r := unicode.SimpleFold(sr)
763 for r != sr && r < tr {
764 r = unicode.SimpleFold(r)
766 if r == tr {
767 continue
769 return false
772 // One string is empty. Are both?
773 return len(s) == len(t)