1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package strings implements simple functions to manipulate UTF-8 encoded strings.
7 // For information about UTF-8 strings in Go, see https://blog.golang.org/strings.
15 // explode splits s into a slice of UTF-8 strings,
16 // one string per Unicode character up to a maximum of n (n < 0 means no limit).
17 // Invalid UTF-8 sequences become correct encodings of U+FFFD.
18 func explode(s
string, n
int) []string {
19 l
:= utf8
.RuneCountInString(s
)
23 a
:= make([]string, n
)
24 for i
:= 0; i
< n
-1; i
++ {
25 ch
, size
:= utf8
.DecodeRuneInString(s
)
28 if ch
== utf8
.RuneError
{
29 a
[i
] = string(utf8
.RuneError
)
// primeRK is the prime base used in the Rabin-Karp algorithm.
// hashStr and hashStrRev multiply the running hash by primeRK before
// adding each byte of the separator.
const primeRK = 16777619
41 // hashStr returns the hash and the appropriate multiplicative
42 // factor for use in Rabin-Karp algorithm.
43 func hashStr(sep
string) (uint32, uint32) {
45 for i
:= 0; i
< len(sep
); i
++ {
46 hash
= hash
*primeRK
+ uint32(sep
[i
])
48 var pow
, sq
uint32 = 1, primeRK
49 for i
:= len(sep
); i
> 0; i
>>= 1 {
58 // hashStrRev returns the hash of the reverse of sep and the
59 // appropriate multiplicative factor for use in Rabin-Karp algorithm.
60 func hashStrRev(sep
string) (uint32, uint32) {
62 for i
:= len(sep
) - 1; i
>= 0; i
-- {
63 hash
= hash
*primeRK
+ uint32(sep
[i
])
65 var pow
, sq
uint32 = 1, primeRK
66 for i
:= len(sep
); i
> 0; i
>>= 1 {
75 // countGeneric implements Count.
76 func countGeneric(s
, substr
string) int {
79 return utf8
.RuneCountInString(s
) + 1
92 // Contains reports whether substr is within s.
93 func Contains(s
, substr
string) bool {
94 return Index(s
, substr
) >= 0
97 // ContainsAny reports whether any Unicode code points in chars are within s.
98 func ContainsAny(s
, chars
string) bool {
99 return IndexAny(s
, chars
) >= 0
102 // ContainsRune reports whether the Unicode code point r is within s.
103 func ContainsRune(s
string, r rune
) bool {
104 return IndexRune(s
, r
) >= 0
107 // LastIndex returns the index of the last instance of substr in s, or -1 if substr is not present in s.
108 func LastIndex(s
, substr
string) int {
114 return LastIndexByte(s
, substr
[0])
123 // Rabin-Karp search from the end of the string
124 hashss
, pow
:= hashStrRev(substr
)
127 for i
:= len(s
) - 1; i
>= last
; i
-- {
128 h
= h
*primeRK
+ uint32(s
[i
])
130 if h
== hashss
&& s
[last
:] == substr
{
133 for i
:= last
- 1; i
>= 0; i
-- {
136 h
-= pow
* uint32(s
[i
+n
])
137 if h
== hashss
&& s
[i
:i
+n
] == substr
{
144 // IndexRune returns the index of the first instance of the Unicode code point
145 // r, or -1 if rune is not present in s.
146 // If r is utf8.RuneError, it returns the first instance of any
147 // invalid UTF-8 byte sequence.
148 func IndexRune(s
string, r rune
) int {
150 case 0 <= r
&& r
< utf8
.RuneSelf
:
151 return IndexByte(s
, byte(r
))
152 case r
== utf8
.RuneError
:
153 for i
, r
:= range s
{
154 if r
== utf8
.RuneError
{
159 case !utf8
.ValidRune(r
):
162 return Index(s
, string(r
))
166 // IndexAny returns the index of the first instance of any Unicode code point
167 // from chars in s, or -1 if no Unicode code point from chars is present in s.
168 func IndexAny(s
, chars
string) int {
170 // Avoid scanning all of s.
174 if as
, isASCII
:= makeASCIISet(chars
); isASCII
{
175 for i
:= 0; i
< len(s
); i
++ {
176 if as
.contains(s
[i
]) {
183 for i
, c
:= range s
{
184 for _
, m
:= range chars
{
// LastIndexAny returns the index of the last instance of any Unicode code
// point from chars in s, or -1 if no Unicode code point from chars is
// present in s.
196 func LastIndexAny(s
, chars
string) int {
198 // Avoid scanning all of s.
202 if as
, isASCII
:= makeASCIISet(chars
); isASCII
{
203 for i
:= len(s
) - 1; i
>= 0; i
-- {
204 if as
.contains(s
[i
]) {
211 for i
:= len(s
); i
> 0; {
212 r
, size
:= utf8
.DecodeLastRuneInString(s
[:i
])
214 for _
, c
:= range chars
{
223 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
224 func LastIndexByte(s
string, c
byte) int {
225 for i
:= len(s
) - 1; i
>= 0; i
-- {
233 // Generic split: splits after each instance of sep,
234 // including sepSave bytes of sep in the subarrays.
235 func genSplit(s
, sep
string, sepSave
, n
int) []string {
243 n
= Count(s
, sep
) + 1
246 a
:= make([]string, n
)
262 // SplitN slices s into substrings separated by sep and returns a slice of
263 // the substrings between those separators.
265 // The count determines the number of substrings to return:
266 // n > 0: at most n substrings; the last substring will be the unsplit remainder.
267 // n == 0: the result is nil (zero substrings)
268 // n < 0: all substrings
270 // Edge cases for s and sep (for example, empty strings) are handled
271 // as described in the documentation for Split.
272 func SplitN(s
, sep
string, n
int) []string { return genSplit(s
, sep
, 0, n
) }
274 // SplitAfterN slices s into substrings after each instance of sep and
275 // returns a slice of those substrings.
277 // The count determines the number of substrings to return:
278 // n > 0: at most n substrings; the last substring will be the unsplit remainder.
279 // n == 0: the result is nil (zero substrings)
280 // n < 0: all substrings
282 // Edge cases for s and sep (for example, empty strings) are handled
283 // as described in the documentation for SplitAfter.
284 func SplitAfterN(s
, sep
string, n
int) []string {
285 return genSplit(s
, sep
, len(sep
), n
)
288 // Split slices s into all substrings separated by sep and returns a slice of
289 // the substrings between those separators.
291 // If s does not contain sep and sep is not empty, Split returns a
292 // slice of length 1 whose only element is s.
294 // If sep is empty, Split splits after each UTF-8 sequence. If both s
295 // and sep are empty, Split returns an empty slice.
297 // It is equivalent to SplitN with a count of -1.
298 func Split(s
, sep
string) []string { return genSplit(s
, sep
, 0, -1) }
300 // SplitAfter slices s into all substrings after each instance of sep and
301 // returns a slice of those substrings.
303 // If s does not contain sep and sep is not empty, SplitAfter returns
304 // a slice of length 1 whose only element is s.
306 // If sep is empty, SplitAfter splits after each UTF-8 sequence. If
307 // both s and sep are empty, SplitAfter returns an empty slice.
309 // It is equivalent to SplitAfterN with a count of -1.
310 func SplitAfter(s
, sep
string) []string {
311 return genSplit(s
, sep
, len(sep
), -1)
// asciiSpace is a lookup table indexed by byte value. The entries for the
// ASCII white space bytes '\t', '\n', '\v', '\f', '\r' and ' ' are 1; every
// other entry is 0.
var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
316 // Fields splits the string s around each instance of one or more consecutive white space
317 // characters, as defined by unicode.IsSpace, returning a slice of substrings of s or an
318 // empty slice if s contains only white space.
319 func Fields(s
string) []string {
320 // First count the fields.
321 // This is an exact count if s is ASCII, otherwise it is an approximation.
324 // setBits is used to track which bits are set in the bytes of s.
326 for i
:= 0; i
< len(s
); i
++ {
329 isSpace
:= int(asciiSpace
[r
])
330 n
+= wasSpace
& ^isSpace
334 if setBits
< utf8
.RuneSelf
{ // ASCII fast path
335 a
:= make([]string, n
)
339 // Skip spaces in the front of the input.
340 for i
< len(s
) && asciiSpace
[s
[i
]] != 0 {
345 if asciiSpace
[s
[i
]] == 0 {
349 a
[na
] = s
[fieldStart
:i
]
352 // Skip spaces in between fields.
353 for i
< len(s
) && asciiSpace
[s
[i
]] != 0 {
358 if fieldStart
< len(s
) { // Last field might end at EOF.
359 a
[na
] = s
[fieldStart
:]
364 // Some runes in the input string are not ASCII.
365 return FieldsFunc(s
, unicode
.IsSpace
)
368 // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
369 // and returns an array of slices of s. If all code points in s satisfy f(c) or the
370 // string is empty, an empty slice is returned.
371 // FieldsFunc makes no guarantees about the order in which it calls f(c).
372 // If f does not return consistent results for a given c, FieldsFunc may crash.
373 func FieldsFunc(s
string, f
func(rune
) bool) []string {
374 // A span is used to record a slice of s of the form s[start:end].
375 // The start index is inclusive and the end index is exclusive.
380 spans
:= make([]span
, 0, 32)
382 // Find the field start and end indices.
385 for i
, rune
:= range s
{
388 spans
= append(spans
, span
{start
: fromIndex
, end
: i
})
399 // Last field might end at EOF.
401 spans
= append(spans
, span
{fromIndex
, len(s
)})
404 // Create strings from recorded field indices.
405 a
:= make([]string, len(spans
))
406 for i
, span
:= range spans
{
407 a
[i
] = s
[span
.start
:span
.end
]
413 // Join concatenates the elements of a to create a single string. The separator string
414 // sep is placed between elements in the resulting string.
415 func Join(a
[]string, sep
string) string {
422 // Special case for common small values.
423 // Remove if golang.org/issue/6714 is fixed
424 return a
[0] + sep
+ a
[1]
426 // Special case for common small values.
427 // Remove if golang.org/issue/6714 is fixed
428 return a
[0] + sep
+ a
[1] + sep
+ a
[2]
430 n
:= len(sep
) * (len(a
) - 1)
431 for i
:= 0; i
< len(a
); i
++ {
437 for _
, s
:= range a
[1:] {
438 bp
+= copy(b
[bp
:], sep
)
439 bp
+= copy(b
[bp
:], s
)
// HasPrefix tests whether the string s begins with prefix.
// The length check comes first so the slice expression never runs past
// the end of s.
func HasPrefix(s, prefix string) bool {
	return len(s) >= len(prefix) && s[:len(prefix)] == prefix
}
// HasSuffix tests whether the string s ends with suffix.
// The length check comes first so the slice start index is never negative.
func HasSuffix(s, suffix string) bool {
	return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix
}
454 // Map returns a copy of the string s with all its characters modified
455 // according to the mapping function. If mapping returns a negative value, the character is
456 // dropped from the string with no replacement.
457 func Map(mapping
func(rune
) rune
, s
string) string {
458 // In the worst case, the string can grow when mapped, making
459 // things unpleasant. But it's so rare we barge in assuming it's
460 // fine. It could also shrink but that falls out naturally.
462 // The output buffer b is initialized on demand, the first
463 // time a character differs.
465 // nbytes is the number of bytes encoded in b.
468 for i
, c
:= range s
{
474 b
= make([]byte, len(s
)+utf8
.UTFMax
)
475 nbytes
= copy(b
, s
[:i
])
477 if r
< utf8
.RuneSelf
{
481 nbytes
+= utf8
.EncodeRune(b
[nbytes
:], r
)
485 if c
== utf8
.RuneError
{
486 // RuneError is the result of either decoding
487 // an invalid sequence or '\uFFFD'. Determine
488 // the correct number of bytes we need to advance.
489 _
, w
:= utf8
.DecodeRuneInString(s
[i
:])
503 for _
, c
:= range s
{
507 if (0 <= r
&& r
< utf8
.RuneSelf
) && nbytes
< len(b
) {
513 // b is not big enough or r is not a ASCII rune.
515 if nbytes
+utf8
.UTFMax
>= len(b
) {
517 nb
:= make([]byte, 2*len(b
))
521 nbytes
+= utf8
.EncodeRune(b
[nbytes
:], r
)
525 return string(b
[:nbytes
])
528 // Repeat returns a new string consisting of count copies of the string s.
530 // It panics if count is negative or if
531 // the result of (len(s) * count) overflows.
532 func Repeat(s
string, count
int) string {
533 // Since we cannot return an error on overflow,
534 // we should panic if the repeat will generate
536 // See Issue golang.org/issue/16237
538 panic("strings: negative Repeat count")
539 } else if count
> 0 && len(s
)*count
/count
!= len(s
) {
540 panic("strings: Repeat count causes overflow")
543 b
:= make([]byte, len(s
)*count
)
552 // ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case.
553 func ToUpper(s
string) string {
554 isASCII
, hasLower
:= true, false
555 for i
:= 0; i
< len(s
); i
++ {
557 if c
>= utf8
.RuneSelf
{
561 hasLower
= hasLower ||
(c
>= 'a' && c
<= 'z')
564 if isASCII
{ // optimize for ASCII-only strings.
568 b
:= make([]byte, len(s
))
569 for i
:= 0; i
< len(s
); i
++ {
571 if c
>= 'a' && c
<= 'z' {
578 return Map(unicode
.ToUpper
, s
)
581 // ToLower returns a copy of the string s with all Unicode letters mapped to their lower case.
582 func ToLower(s
string) string {
583 isASCII
, hasUpper
:= true, false
584 for i
:= 0; i
< len(s
); i
++ {
586 if c
>= utf8
.RuneSelf
{
590 hasUpper
= hasUpper ||
(c
>= 'A' && c
<= 'Z')
593 if isASCII
{ // optimize for ASCII-only strings.
597 b
:= make([]byte, len(s
))
598 for i
:= 0; i
< len(s
); i
++ {
600 if c
>= 'A' && c
<= 'Z' {
607 return Map(unicode
.ToLower
, s
)
610 // ToTitle returns a copy of the string s with all Unicode letters mapped to their title case.
611 func ToTitle(s
string) string { return Map(unicode
.ToTitle
, s
) }
613 // ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their
614 // upper case, giving priority to the special casing rules.
615 func ToUpperSpecial(c unicode
.SpecialCase
, s
string) string {
616 return Map(func(r rune
) rune
{ return c
.ToUpper(r
) }, s
)
619 // ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their
620 // lower case, giving priority to the special casing rules.
621 func ToLowerSpecial(c unicode
.SpecialCase
, s
string) string {
622 return Map(func(r rune
) rune
{ return c
.ToLower(r
) }, s
)
625 // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their
626 // title case, giving priority to the special casing rules.
627 func ToTitleSpecial(c unicode
.SpecialCase
, s
string) string {
628 return Map(func(r rune
) rune
{ return c
.ToTitle(r
) }, s
)
631 // isSeparator reports whether the rune could mark a word boundary.
632 // TODO: update when package unicode captures more of the properties.
633 func isSeparator(r rune
) bool {
634 // ASCII alphanumerics and underscore are not separators
637 case '0' <= r
&& r
<= '9':
639 case 'a' <= r
&& r
<= 'z':
641 case 'A' <= r
&& r
<= 'Z':
648 // Letters and digits are not separators
649 if unicode
.IsLetter(r
) || unicode
.IsDigit(r
) {
652 // Otherwise, all we can do for now is treat spaces as separators.
653 return unicode
.IsSpace(r
)
656 // Title returns a copy of the string s with all Unicode letters that begin words
657 // mapped to their title case.
659 // BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
660 func Title(s
string) string {
661 // Use a closure here to remember state.
662 // Hackish but effective. Depends on Map scanning in order and calling
663 // the closure once per rune.
667 if isSeparator(prev
) {
669 return unicode
.ToTitle(r
)
677 // TrimLeftFunc returns a slice of the string s with all leading
678 // Unicode code points c satisfying f(c) removed.
679 func TrimLeftFunc(s
string, f
func(rune
) bool) string {
680 i
:= indexFunc(s
, f
, false)
687 // TrimRightFunc returns a slice of the string s with all trailing
688 // Unicode code points c satisfying f(c) removed.
689 func TrimRightFunc(s
string, f
func(rune
) bool) string {
690 i
:= lastIndexFunc(s
, f
, false)
691 if i
>= 0 && s
[i
] >= utf8
.RuneSelf
{
692 _
, wid
:= utf8
.DecodeRuneInString(s
[i
:])
700 // TrimFunc returns a slice of the string s with all leading
701 // and trailing Unicode code points c satisfying f(c) removed.
702 func TrimFunc(s
string, f
func(rune
) bool) string {
703 return TrimRightFunc(TrimLeftFunc(s
, f
), f
)
706 // IndexFunc returns the index into s of the first Unicode
707 // code point satisfying f(c), or -1 if none do.
708 func IndexFunc(s
string, f
func(rune
) bool) int {
709 return indexFunc(s
, f
, true)
712 // LastIndexFunc returns the index into s of the last
713 // Unicode code point satisfying f(c), or -1 if none do.
714 func LastIndexFunc(s
string, f
func(rune
) bool) int {
715 return lastIndexFunc(s
, f
, true)
// indexFunc is the same as IndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
721 func indexFunc(s
string, f
func(rune
) bool, truth
bool) int {
722 for i
, r
:= range s
{
// lastIndexFunc is the same as LastIndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
733 func lastIndexFunc(s
string, f
func(rune
) bool, truth
bool) int {
734 for i
:= len(s
); i
> 0; {
735 r
, size
:= utf8
.DecodeLastRuneInString(s
[0:i
])
// asciiSet is a 32-byte value, where each bit represents the presence of a
// given ASCII character in the set. The 128 bits of the lower 16 bytes,
// starting with the least-significant bit of the lowest word to the
// most-significant bit of the highest word, map to the full range of all
// 128 ASCII characters. The 128 bits of the upper 16 bytes will be zeroed,
// ensuring that any non-ASCII character will be reported as not in the set.
type asciiSet [8]uint32
752 // makeASCIISet creates a set of ASCII characters and reports whether all
753 // characters in chars are ASCII.
754 func makeASCIISet(chars
string) (as asciiSet
, ok
bool) {
755 for i
:= 0; i
< len(chars
); i
++ {
757 if c
>= utf8
.RuneSelf
{
760 as
[c
>>5] |
= 1 << uint(c
&31)
765 // contains reports whether c is inside the set.
766 func (as
*asciiSet
) contains(c
byte) bool {
767 return (as
[c
>>5] & (1 << uint(c
&31))) != 0
770 func makeCutsetFunc(cutset
string) func(rune
) bool {
771 if len(cutset
) == 1 && cutset
[0] < utf8
.RuneSelf
{
772 return func(r rune
) bool {
773 return r
== rune(cutset
[0])
776 if as
, isASCII
:= makeASCIISet(cutset
); isASCII
{
777 return func(r rune
) bool {
778 return r
< utf8
.RuneSelf
&& as
.contains(byte(r
))
781 return func(r rune
) bool { return IndexRune(cutset
, r
) >= 0 }
784 // Trim returns a slice of the string s with all leading and
785 // trailing Unicode code points contained in cutset removed.
786 func Trim(s
string, cutset
string) string {
787 if s
== "" || cutset
== "" {
790 return TrimFunc(s
, makeCutsetFunc(cutset
))
793 // TrimLeft returns a slice of the string s with all leading
794 // Unicode code points contained in cutset removed.
795 func TrimLeft(s
string, cutset
string) string {
796 if s
== "" || cutset
== "" {
799 return TrimLeftFunc(s
, makeCutsetFunc(cutset
))
802 // TrimRight returns a slice of the string s, with all trailing
803 // Unicode code points contained in cutset removed.
804 func TrimRight(s
string, cutset
string) string {
805 if s
== "" || cutset
== "" {
808 return TrimRightFunc(s
, makeCutsetFunc(cutset
))
811 // TrimSpace returns a slice of the string s, with all leading
812 // and trailing white space removed, as defined by Unicode.
813 func TrimSpace(s
string) string {
814 return TrimFunc(s
, unicode
.IsSpace
)
817 // TrimPrefix returns s without the provided leading prefix string.
818 // If s doesn't start with prefix, s is returned unchanged.
819 func TrimPrefix(s
, prefix
string) string {
820 if HasPrefix(s
, prefix
) {
821 return s
[len(prefix
):]
826 // TrimSuffix returns s without the provided trailing suffix string.
827 // If s doesn't end with suffix, s is returned unchanged.
828 func TrimSuffix(s
, suffix
string) string {
829 if HasSuffix(s
, suffix
) {
830 return s
[:len(s
)-len(suffix
)]
835 // Replace returns a copy of the string s with the first n
836 // non-overlapping instances of old replaced by new.
837 // If old is empty, it matches at the beginning of the string
838 // and after each UTF-8 sequence, yielding up to k+1 replacements
839 // for a k-rune string.
840 // If n < 0, there is no limit on the number of replacements.
841 func Replace(s
, old
, new string, n
int) string {
842 if old
== new || n
== 0 {
843 return s
// avoid allocation
846 // Compute number of replacements.
847 if m
:= Count(s
, old
); m
== 0 {
848 return s
// avoid allocation
849 } else if n
< 0 || m
< n
{
853 // Apply replacements to buffer.
854 t
:= make([]byte, len(s
)+n
*(len(new)-len(old
)))
857 for i
:= 0; i
< n
; i
++ {
861 _
, wid
:= utf8
.DecodeRuneInString(s
[start
:])
865 j
+= Index(s
[start
:], old
)
867 w
+= copy(t
[w
:], s
[start
:j
])
868 w
+= copy(t
[w
:], new)
871 w
+= copy(t
[w
:], s
[start
:])
872 return string(t
[0:w
])
875 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
876 // are equal under Unicode case-folding.
877 func EqualFold(s
, t
string) bool {
878 for s
!= "" && t
!= "" {
879 // Extract first rune from each string.
881 if s
[0] < utf8
.RuneSelf
{
882 sr
, s
= rune(s
[0]), s
[1:]
884 r
, size
:= utf8
.DecodeRuneInString(s
)
887 if t
[0] < utf8
.RuneSelf
{
888 tr
, t
= rune(t
[0]), t
[1:]
890 r
, size
:= utf8
.DecodeRuneInString(t
)
894 // If they match, keep going; if not, return false.
901 // Make sr < tr to simplify what follows.
905 // Fast check for ASCII.
906 if tr
< utf8
.RuneSelf
&& 'A' <= sr
&& sr
<= 'Z' {
907 // ASCII, and sr is upper case. tr must be lower case.
908 if tr
== sr
+'a'-'A' {
914 // General case. SimpleFold(x) returns the next equivalent rune > x
915 // or wraps around to smaller values.
916 r
:= unicode
.SimpleFold(sr
)
917 for r
!= sr
&& r
< tr
{
918 r
= unicode
.SimpleFold(r
)
926 // One string is empty. Are both?
930 func indexRabinKarp(s
, substr
string) int {
932 hashss
, pow
:= hashStr(substr
)
935 for i
:= 0; i
< n
; i
++ {
936 h
= h
*primeRK
+ uint32(s
[i
])
938 if h
== hashss
&& s
[:n
] == substr
{
941 for i
:= n
; i
< len(s
); {
944 h
-= pow
* uint32(s
[i
-n
])
946 if h
== hashss
&& s
[i
-n
:i
] == substr
{