1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package bytes implements functions for the manipulation of byte slices.
6 // It is analogous to the facilities of the strings package.
14 func equalPortable(a
, b
[]byte) bool {
26 // explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
27 // up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
28 func explode(s
[]byte, n
int) [][]byte {
32 a
:= make([][]byte, n
)
41 _
, size
= utf8
.DecodeRune(s
)
42 a
[na
] = s
[0:size
:size
]
49 // countGeneric actually implements Count
50 func countGeneric(s
, sep
[]byte) int {
53 return utf8
.RuneCount(s
) + 1
66 // Contains reports whether subslice is within b.
67 func Contains(b
, subslice
[]byte) bool {
68 return Index(b
, subslice
) != -1
71 // ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
72 func ContainsAny(b
[]byte, chars
string) bool {
73 return IndexAny(b
, chars
) >= 0
76 // ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
77 func ContainsRune(b
[]byte, r rune
) bool {
78 return IndexRune(b
, r
) >= 0
81 func indexBytePortable(s
[]byte, c
byte) int {
90 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
91 func LastIndex(s
, sep
[]byte) int {
97 for i
:= len(s
) - n
; i
>= 0; i
-- {
98 if s
[i
] == c
&& (n
== 1 ||
Equal(s
[i
:i
+n
], sep
)) {
105 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
106 func LastIndexByte(s
[]byte, c
byte) int {
107 for i
:= len(s
) - 1; i
>= 0; i
-- {
115 // IndexRune interprets s as a sequence of UTF-8-encoded code points.
116 // It returns the byte index of the first occurrence in s of the given rune.
117 // It returns -1 if rune is not present in s.
118 // If r is utf8.RuneError, it returns the first instance of any
119 // invalid UTF-8 byte sequence.
120 func IndexRune(s
[]byte, r rune
) int {
122 case 0 <= r
&& r
< utf8
.RuneSelf
:
123 return IndexByte(s
, byte(r
))
124 case r
== utf8
.RuneError
:
125 for i
:= 0; i
< len(s
); {
126 r1
, n
:= utf8
.DecodeRune(s
[i
:])
127 if r1
== utf8
.RuneError
{
133 case !utf8
.ValidRune(r
):
136 var b
[utf8
.UTFMax
]byte
137 n
:= utf8
.EncodeRune(b
[:], r
)
138 return Index(s
, b
[:n
])
142 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
143 // It returns the byte index of the first occurrence in s of any of the Unicode
144 // code points in chars. It returns -1 if chars is empty or if there is no code
146 func IndexAny(s
[]byte, chars
string) int {
148 // Avoid scanning all of s.
152 if as
, isASCII
:= makeASCIISet(chars
); isASCII
{
153 for i
, c
:= range s
{
162 for i
:= 0; i
< len(s
); i
+= width
{
164 if r
< utf8
.RuneSelf
{
167 r
, width
= utf8
.DecodeRune(s
[i
:])
169 for _
, ch
:= range chars
{
178 // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
179 // points. It returns the byte index of the last occurrence in s of any of
180 // the Unicode code points in chars. It returns -1 if chars is empty or if
181 // there is no code point in common.
182 func LastIndexAny(s
[]byte, chars
string) int {
184 // Avoid scanning all of s.
188 if as
, isASCII
:= makeASCIISet(chars
); isASCII
{
189 for i
:= len(s
) - 1; i
>= 0; i
-- {
190 if as
.contains(s
[i
]) {
197 for i
:= len(s
); i
> 0; {
198 r
, size
:= utf8
.DecodeLastRune(s
[:i
])
200 for _
, c
:= range chars
{
209 // Generic split: splits after each instance of sep,
210 // including sepSave bytes of sep in the subslices.
211 func genSplit(s
, sep
[]byte, sepSave
, n
int) [][]byte {
219 n
= Count(s
, sep
) + 1
222 a
:= make([][]byte, n
)
230 a
[i
] = s
[: m
+sepSave
: m
+sepSave
]
238 // SplitN slices s into subslices separated by sep and returns a slice of
239 // the subslices between those separators.
240 // If sep is empty, SplitN splits after each UTF-8 sequence.
241 // The count determines the number of subslices to return:
242 // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
243 // n == 0: the result is nil (zero subslices)
244 // n < 0: all subslices
245 func SplitN(s
, sep
[]byte, n
int) [][]byte { return genSplit(s
, sep
, 0, n
) }
247 // SplitAfterN slices s into subslices after each instance of sep and
248 // returns a slice of those subslices.
249 // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
250 // The count determines the number of subslices to return:
251 // n > 0: at most n subslices; the last subslice will be the unsplit remainder.
252 // n == 0: the result is nil (zero subslices)
253 // n < 0: all subslices
254 func SplitAfterN(s
, sep
[]byte, n
int) [][]byte {
255 return genSplit(s
, sep
, len(sep
), n
)
258 // Split slices s into all subslices separated by sep and returns a slice of
259 // the subslices between those separators.
260 // If sep is empty, Split splits after each UTF-8 sequence.
261 // It is equivalent to SplitN with a count of -1.
262 func Split(s
, sep
[]byte) [][]byte { return genSplit(s
, sep
, 0, -1) }
264 // SplitAfter slices s into all subslices after each instance of sep and
265 // returns a slice of those subslices.
266 // If sep is empty, SplitAfter splits after each UTF-8 sequence.
267 // It is equivalent to SplitAfterN with a count of -1.
268 func SplitAfter(s
, sep
[]byte) [][]byte {
269 return genSplit(s
, sep
, len(sep
), -1)
272 var asciiSpace
= [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
274 // Fields interprets s as a sequence of UTF-8-encoded code points.
275 // It splits the slice s around each instance of one or more consecutive white space
276 // characters, as defined by unicode.IsSpace, returning a slice of subslices of s or an
277 // empty slice if s contains only white space.
278 func Fields(s
[]byte) [][]byte {
279 // First count the fields.
280 // This is an exact count if s is ASCII, otherwise it is an approximation.
283 // setBits is used to track which bits are set in the bytes of s.
285 for i
:= 0; i
< len(s
); i
++ {
288 isSpace
:= int(asciiSpace
[r
])
289 n
+= wasSpace
& ^isSpace
293 if setBits
>= utf8
.RuneSelf
{
294 // Some runes in the input slice are not ASCII.
295 return FieldsFunc(s
, unicode
.IsSpace
)
299 a
:= make([][]byte, n
)
303 // Skip spaces in the front of the input.
304 for i
< len(s
) && asciiSpace
[s
[i
]] != 0 {
309 if asciiSpace
[s
[i
]] == 0 {
313 a
[na
] = s
[fieldStart
:i
:i
]
316 // Skip spaces in between fields.
317 for i
< len(s
) && asciiSpace
[s
[i
]] != 0 {
322 if fieldStart
< len(s
) { // Last field might end at EOF.
323 a
[na
] = s
[fieldStart
:len(s
):len(s
)]
328 // FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
329 // It splits the slice s at each run of code points c satisfying f(c) and
330 // returns a slice of subslices of s. If all code points in s satisfy f(c), or
331 // len(s) == 0, an empty slice is returned.
332 // FieldsFunc makes no guarantees about the order in which it calls f(c).
333 // If f does not return consistent results for a given c, FieldsFunc may crash.
334 func FieldsFunc(s
[]byte, f
func(rune
) bool) [][]byte {
335 // A span is used to record a slice of s of the form s[start:end].
336 // The start index is inclusive and the end index is exclusive.
341 spans
:= make([]span
, 0, 32)
343 // Find the field start and end indices.
346 for i
:= 0; i
< len(s
); {
349 if r
>= utf8
.RuneSelf
{
350 r
, size
= utf8
.DecodeRune(s
[i
:])
354 spans
= append(spans
, span
{start
: fromIndex
, end
: i
})
366 // Last field might end at EOF.
368 spans
= append(spans
, span
{fromIndex
, len(s
)})
371 // Create subslices from recorded field indices.
372 a
:= make([][]byte, len(spans
))
373 for i
, span
:= range spans
{
374 a
[i
] = s
[span
.start
:span
.end
:span
.end
]
380 // Join concatenates the elements of s to create a new byte slice. The separator
381 // sep is placed between elements in the resulting slice.
382 func Join(s
[][]byte, sep
[]byte) []byte {
387 // Just return a copy.
388 return append([]byte(nil), s
[0]...)
390 n
:= len(sep
) * (len(s
) - 1)
391 for _
, v
:= range s
{
397 for _
, v
:= range s
[1:] {
398 bp
+= copy(b
[bp
:], sep
)
399 bp
+= copy(b
[bp
:], v
)
404 // HasPrefix tests whether the byte slice s begins with prefix.
405 func HasPrefix(s
, prefix
[]byte) bool {
406 return len(s
) >= len(prefix
) && Equal(s
[0:len(prefix
)], prefix
)
409 // HasSuffix tests whether the byte slice s ends with suffix.
410 func HasSuffix(s
, suffix
[]byte) bool {
411 return len(s
) >= len(suffix
) && Equal(s
[len(s
)-len(suffix
):], suffix
)
414 // Map returns a copy of the byte slice s with all its characters modified
415 // according to the mapping function. If mapping returns a negative value, the character is
416 // dropped from the byte slice with no replacement. The characters in s and the
417 // output are interpreted as UTF-8-encoded code points.
418 func Map(mapping
func(r rune
) rune
, s
[]byte) []byte {
419 // In the worst case, the slice can grow when mapped, making
420 // things unpleasant. But it's so rare we barge in assuming it's
421 // fine. It could also shrink but that falls out naturally.
422 maxbytes
:= len(s
) // length of b
423 nbytes
:= 0 // number of bytes encoded in b
424 b
:= make([]byte, maxbytes
)
425 for i
:= 0; i
< len(s
); {
428 if r
>= utf8
.RuneSelf
{
429 r
, wid
= utf8
.DecodeRune(s
[i
:])
433 rl
:= utf8
.RuneLen(r
)
435 rl
= len(string(utf8
.RuneError
))
437 if nbytes
+rl
> maxbytes
{
439 maxbytes
= maxbytes
*2 + utf8
.UTFMax
440 nb
:= make([]byte, maxbytes
)
441 copy(nb
, b
[0:nbytes
])
444 nbytes
+= utf8
.EncodeRune(b
[nbytes
:maxbytes
], r
)
451 // Repeat returns a new byte slice consisting of count copies of b.
453 // It panics if count is negative or if
454 // the result of (len(b) * count) overflows.
455 func Repeat(b
[]byte, count
int) []byte {
456 // Since we cannot return an error on overflow,
457 // we should panic if the repeat will generate
459 // See Issue golang.org/issue/16237.
461 panic("bytes: negative Repeat count")
462 } else if count
> 0 && len(b
)*count
/count
!= len(b
) {
463 panic("bytes: Repeat count causes overflow")
466 nb
:= make([]byte, len(b
)*count
)
469 copy(nb
[bp
:], nb
[:bp
])
475 // ToUpper treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters within it mapped to their upper case.
476 func ToUpper(s
[]byte) []byte { return Map(unicode
.ToUpper
, s
) }
478 // ToLower treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their lower case.
479 func ToLower(s
[]byte) []byte { return Map(unicode
.ToLower
, s
) }
481 // ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
482 func ToTitle(s
[]byte) []byte { return Map(unicode
.ToTitle
, s
) }
484 // ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
485 // upper case, giving priority to the special casing rules.
486 func ToUpperSpecial(c unicode
.SpecialCase
, s
[]byte) []byte {
487 return Map(func(r rune
) rune
{ return c
.ToUpper(r
) }, s
)
490 // ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
491 // lower case, giving priority to the special casing rules.
492 func ToLowerSpecial(c unicode
.SpecialCase
, s
[]byte) []byte {
493 return Map(func(r rune
) rune
{ return c
.ToLower(r
) }, s
)
496 // ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
497 // title case, giving priority to the special casing rules.
498 func ToTitleSpecial(c unicode
.SpecialCase
, s
[]byte) []byte {
499 return Map(func(r rune
) rune
{ return c
.ToTitle(r
) }, s
)
502 // isSeparator reports whether the rune could mark a word boundary.
503 // TODO: update when package unicode captures more of the properties.
504 func isSeparator(r rune
) bool {
505 // ASCII alphanumerics and underscore are not separators
508 case '0' <= r
&& r
<= '9':
510 case 'a' <= r
&& r
<= 'z':
512 case 'A' <= r
&& r
<= 'Z':
519 // Letters and digits are not separators
520 if unicode
.IsLetter(r
) || unicode
.IsDigit(r
) {
523 // Otherwise, all we can do for now is treat spaces as separators.
524 return unicode
.IsSpace(r
)
527 // Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
528 // words mapped to their title case.
530 // BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
531 func Title(s
[]byte) []byte {
532 // Use a closure here to remember state.
533 // Hackish but effective. Depends on Map scanning in order and calling
534 // the closure once per rune.
538 if isSeparator(prev
) {
540 return unicode
.ToTitle(r
)
548 // TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
549 // all leading UTF-8-encoded code points c that satisfy f(c).
550 func TrimLeftFunc(s
[]byte, f
func(r rune
) bool) []byte {
551 i
:= indexFunc(s
, f
, false)
558 // TrimRightFunc returns a subslice of s by slicing off all trailing
559 // UTF-8-encoded code points c that satisfy f(c).
560 func TrimRightFunc(s
[]byte, f
func(r rune
) bool) []byte {
561 i
:= lastIndexFunc(s
, f
, false)
562 if i
>= 0 && s
[i
] >= utf8
.RuneSelf
{
563 _
, wid
:= utf8
.DecodeRune(s
[i
:])
571 // TrimFunc returns a subslice of s by slicing off all leading and trailing
572 // UTF-8-encoded code points c that satisfy f(c).
573 func TrimFunc(s
[]byte, f
func(r rune
) bool) []byte {
574 return TrimRightFunc(TrimLeftFunc(s
, f
), f
)
577 // TrimPrefix returns s without the provided leading prefix string.
578 // If s doesn't start with prefix, s is returned unchanged.
579 func TrimPrefix(s
, prefix
[]byte) []byte {
580 if HasPrefix(s
, prefix
) {
581 return s
[len(prefix
):]
586 // TrimSuffix returns s without the provided trailing suffix string.
587 // If s doesn't end with suffix, s is returned unchanged.
588 func TrimSuffix(s
, suffix
[]byte) []byte {
589 if HasSuffix(s
, suffix
) {
590 return s
[:len(s
)-len(suffix
)]
595 // IndexFunc interprets s as a sequence of UTF-8-encoded code points.
596 // It returns the byte index in s of the first Unicode
597 // code point satisfying f(c), or -1 if none do.
598 func IndexFunc(s
[]byte, f
func(r rune
) bool) int {
599 return indexFunc(s
, f
, true)
602 // LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
603 // It returns the byte index in s of the last Unicode
604 // code point satisfying f(c), or -1 if none do.
605 func LastIndexFunc(s
[]byte, f
func(r rune
) bool) int {
606 return lastIndexFunc(s
, f
, true)
609 // indexFunc is the same as IndexFunc except that if
610 // truth==false, the sense of the predicate function is
612 func indexFunc(s
[]byte, f
func(r rune
) bool, truth
bool) int {
617 if r
>= utf8
.RuneSelf
{
618 r
, wid
= utf8
.DecodeRune(s
[start
:])
628 // lastIndexFunc is the same as LastIndexFunc except that if
629 // truth==false, the sense of the predicate function is
631 func lastIndexFunc(s
[]byte, f
func(r rune
) bool, truth
bool) int {
632 for i
:= len(s
); i
> 0; {
633 r
, size
:= rune(s
[i
-1]), 1
634 if r
>= utf8
.RuneSelf
{
635 r
, size
= utf8
.DecodeLastRune(s
[0:i
])
645 // asciiSet is a 32-byte value, where each bit represents the presence of a
646 // given ASCII character in the set. The 128-bits of the lower 16 bytes,
647 // starting with the least-significant bit of the lowest word to the
648 // most-significant bit of the highest word, map to the full range of all
649 // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
650 // ensuring that any non-ASCII character will be reported as not in the set.
651 type asciiSet
[8]uint32
653 // makeASCIISet creates a set of ASCII characters and reports whether all
654 // characters in chars are ASCII.
655 func makeASCIISet(chars
string) (as asciiSet
, ok
bool) {
656 for i
:= 0; i
< len(chars
); i
++ {
658 if c
>= utf8
.RuneSelf
{
661 as
[c
>>5] |
= 1 << uint(c
&31)
666 // contains reports whether c is inside the set.
667 func (as
*asciiSet
) contains(c
byte) bool {
668 return (as
[c
>>5] & (1 << uint(c
&31))) != 0
671 func makeCutsetFunc(cutset
string) func(r rune
) bool {
672 if len(cutset
) == 1 && cutset
[0] < utf8
.RuneSelf
{
673 return func(r rune
) bool {
674 return r
== rune(cutset
[0])
677 if as
, isASCII
:= makeASCIISet(cutset
); isASCII
{
678 return func(r rune
) bool {
679 return r
< utf8
.RuneSelf
&& as
.contains(byte(r
))
682 return func(r rune
) bool {
683 for _
, c
:= range cutset
{
692 // Trim returns a subslice of s by slicing off all leading and
693 // trailing UTF-8-encoded code points contained in cutset.
694 func Trim(s
[]byte, cutset
string) []byte {
695 return TrimFunc(s
, makeCutsetFunc(cutset
))
698 // TrimLeft returns a subslice of s by slicing off all leading
699 // UTF-8-encoded code points contained in cutset.
700 func TrimLeft(s
[]byte, cutset
string) []byte {
701 return TrimLeftFunc(s
, makeCutsetFunc(cutset
))
704 // TrimRight returns a subslice of s by slicing off all trailing
705 // UTF-8-encoded code points that are contained in cutset.
706 func TrimRight(s
[]byte, cutset
string) []byte {
707 return TrimRightFunc(s
, makeCutsetFunc(cutset
))
710 // TrimSpace returns a subslice of s by slicing off all leading and
711 // trailing white space, as defined by Unicode.
712 func TrimSpace(s
[]byte) []byte {
713 return TrimFunc(s
, unicode
.IsSpace
)
716 // Runes interprets s as a sequence of UTF-8-encoded code points.
717 // It returns a slice of runes (Unicode code points) equivalent to s.
718 func Runes(s
[]byte) []rune
{
719 t
:= make([]rune
, utf8
.RuneCount(s
))
722 r
, l
:= utf8
.DecodeRune(s
)
730 // Replace returns a copy of the slice s with the first n
731 // non-overlapping instances of old replaced by new.
732 // If old is empty, it matches at the beginning of the slice
733 // and after each UTF-8 sequence, yielding up to k+1 replacements
734 // for a k-rune slice.
735 // If n < 0, there is no limit on the number of replacements.
736 func Replace(s
, old
, new []byte, n
int) []byte {
739 // Compute number of replacements.
743 // Just return a copy.
744 return append([]byte(nil), s
...)
750 // Apply replacements to buffer.
751 t
:= make([]byte, len(s
)+n
*(len(new)-len(old
)))
754 for i
:= 0; i
< n
; i
++ {
758 _
, wid
:= utf8
.DecodeRune(s
[start
:])
762 j
+= Index(s
[start
:], old
)
764 w
+= copy(t
[w
:], s
[start
:j
])
765 w
+= copy(t
[w
:], new)
768 w
+= copy(t
[w
:], s
[start
:])
772 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
773 // are equal under Unicode case-folding.
774 func EqualFold(s
, t
[]byte) bool {
775 for len(s
) != 0 && len(t
) != 0 {
776 // Extract first rune from each.
778 if s
[0] < utf8
.RuneSelf
{
779 sr
, s
= rune(s
[0]), s
[1:]
781 r
, size
:= utf8
.DecodeRune(s
)
784 if t
[0] < utf8
.RuneSelf
{
785 tr
, t
= rune(t
[0]), t
[1:]
787 r
, size
:= utf8
.DecodeRune(t
)
791 // If they match, keep going; if not, return false.
798 // Make sr < tr to simplify what follows.
802 // Fast check for ASCII.
803 if tr
< utf8
.RuneSelf
&& 'A' <= sr
&& sr
<= 'Z' {
804 // ASCII, and sr is upper case. tr must be lower case.
805 if tr
== sr
+'a'-'A' {
811 // General case. SimpleFold(x) returns the next equivalent rune > x
812 // or wraps around to smaller values.
813 r
:= unicode
.SimpleFold(sr
)
814 for r
!= sr
&& r
< tr
{
815 r
= unicode
.SimpleFold(r
)
823 // One string is empty. Are both?
824 return len(s
) == len(t
)
827 func indexRabinKarp(s
, sep
[]byte) int {
829 hashsep
, pow
:= hashStr(sep
)
832 for i
:= 0; i
< n
; i
++ {
833 h
= h
*primeRK
+ uint32(s
[i
])
835 if h
== hashsep
&& Equal(s
[:n
], sep
) {
838 for i
:= n
; i
< len(s
); {
841 h
-= pow
* uint32(s
[i
-n
])
843 if h
== hashsep
&& Equal(s
[i
-n
:i
], sep
) {
850 // primeRK is the prime base used in Rabin-Karp algorithm.
851 const primeRK
= 16777619
853 // hashStr returns the hash and the appropriate multiplicative
854 // factor for use in Rabin-Karp algorithm.
855 func hashStr(sep
[]byte) (uint32, uint32) {
857 for i
:= 0; i
< len(sep
); i
++ {
858 hash
= hash
*primeRK
+ uint32(sep
[i
])
860 var pow
, sq
uint32 = 1, primeRK
861 for i
:= len(sep
); i
> 0; i
>>= 1 {