1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package strings implements simple functions to manipulate UTF-8 encoded strings.
7 // For information about UTF-8 strings in Go, see https://blog.golang.org/strings.
15 // explode splits s into a slice of UTF-8 strings,
16 // one string per Unicode character up to a maximum of n (n < 0 means no limit).
17 // Invalid UTF-8 sequences become correct encodings of U+FFFD.
18 func explode(s
string, n
int) []string {
19 l
:= utf8
.RuneCountInString(s
)
23 a
:= make([]string, n
)
24 for i
:= 0; i
< n
-1; i
++ {
25 ch
, size
:= utf8
.DecodeRuneInString(s
)
28 if ch
== utf8
.RuneError
{
29 a
[i
] = string(utf8
.RuneError
)
// primeRK is the prime base used in the Rabin-Karp algorithm.
// It is the multiplier applied to the running hash for each byte.
const primeRK = 16777619
41 // hashStr returns the hash and the appropriate multiplicative
42 // factor for use in Rabin-Karp algorithm.
43 func hashStr(sep
string) (uint32, uint32) {
45 for i
:= 0; i
< len(sep
); i
++ {
46 hash
= hash
*primeRK
+ uint32(sep
[i
])
48 var pow
, sq
uint32 = 1, primeRK
49 for i
:= len(sep
); i
> 0; i
>>= 1 {
58 // hashStrRev returns the hash of the reverse of sep and the
59 // appropriate multiplicative factor for use in Rabin-Karp algorithm.
60 func hashStrRev(sep
string) (uint32, uint32) {
62 for i
:= len(sep
) - 1; i
>= 0; i
-- {
63 hash
= hash
*primeRK
+ uint32(sep
[i
])
65 var pow
, sq
uint32 = 1, primeRK
66 for i
:= len(sep
); i
> 0; i
>>= 1 {
75 // Count counts the number of non-overlapping instances of sep in s.
76 // If sep is an empty string, Count returns 1 + the number of Unicode code points in s.
77 func Count(s
, sep
string) int {
82 return utf8
.RuneCountInString(s
) + 1
84 // special case worth making fast
86 for i
:= 0; i
< len(s
); i
++ {
92 case len(sep
) > len(s
):
94 case len(sep
) == len(s
):
101 hashsep
, pow
:= hashStr(sep
)
103 for i
:= 0; i
< len(sep
); i
++ {
104 h
= h
*primeRK
+ uint32(s
[i
])
107 if h
== hashsep
&& s
[:len(sep
)] == sep
{
111 for i
:= len(sep
); i
< len(s
); {
114 h
-= pow
* uint32(s
[i
-len(sep
)])
116 if h
== hashsep
&& lastmatch
<= i
-len(sep
) && s
[i
-len(sep
):i
] == sep
{
124 // Contains reports whether substr is within s.
125 func Contains(s
, substr
string) bool {
126 return Index(s
, substr
) >= 0
129 // ContainsAny reports whether any Unicode code points in chars are within s.
130 func ContainsAny(s
, chars
string) bool {
131 return IndexAny(s
, chars
) >= 0
134 // ContainsRune reports whether the Unicode code point r is within s.
135 func ContainsRune(s
string, r rune
) bool {
136 return IndexRune(s
, r
) >= 0
139 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
140 func LastIndex(s
, sep
string) int {
146 return LastIndexByte(s
, sep
[0])
155 // Rabin-Karp search from the end of the string
156 hashsep
, pow
:= hashStrRev(sep
)
159 for i
:= len(s
) - 1; i
>= last
; i
-- {
160 h
= h
*primeRK
+ uint32(s
[i
])
162 if h
== hashsep
&& s
[last
:] == sep
{
165 for i
:= last
- 1; i
>= 0; i
-- {
168 h
-= pow
* uint32(s
[i
+n
])
169 if h
== hashsep
&& s
[i
:i
+n
] == sep
{
176 // IndexRune returns the index of the first instance of the Unicode code point
177 // r, or -1 if rune is not present in s.
178 func IndexRune(s
string, r rune
) int {
180 case r
< utf8
.RuneSelf
:
181 return IndexByte(s
, byte(r
))
183 for i
, c
:= range s
{
192 // IndexAny returns the index of the first instance of any Unicode code point
193 // from chars in s, or -1 if no Unicode code point from chars is present in s.
194 func IndexAny(s
, chars
string) int {
196 for i
, c
:= range s
{
197 for _
, m
:= range chars
{
// LastIndexAny returns the index of the last instance of any Unicode code
// point from chars in s, or -1 if no Unicode code point from chars is
// present in s.
210 func LastIndexAny(s
, chars
string) int {
212 for i
:= len(s
); i
> 0; {
213 rune
, size
:= utf8
.DecodeLastRuneInString(s
[0:i
])
215 for _
, m
:= range chars
{
225 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
226 func LastIndexByte(s
string, c
byte) int {
227 for i
:= len(s
) - 1; i
>= 0; i
-- {
235 // Generic split: splits after each instance of sep,
236 // including sepSave bytes of sep in the subarrays.
237 func genSplit(s
, sep
string, sepSave
, n
int) []string {
245 n
= Count(s
, sep
) + 1
249 a
:= make([]string, n
)
251 for i
:= 0; i
+len(sep
) <= len(s
) && na
+1 < n
; i
++ {
252 if s
[i
] == c
&& (len(sep
) == 1 || s
[i
:i
+len(sep
)] == sep
) {
253 a
[na
] = s
[start
: i
+sepSave
]
263 // SplitN slices s into substrings separated by sep and returns a slice of
264 // the substrings between those separators.
265 // If sep is empty, SplitN splits after each UTF-8 sequence.
266 // The count determines the number of substrings to return:
267 // n > 0: at most n substrings; the last substring will be the unsplit remainder.
268 // n == 0: the result is nil (zero substrings)
269 // n < 0: all substrings
270 func SplitN(s
, sep
string, n
int) []string { return genSplit(s
, sep
, 0, n
) }
272 // SplitAfterN slices s into substrings after each instance of sep and
273 // returns a slice of those substrings.
274 // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
275 // The count determines the number of substrings to return:
276 // n > 0: at most n substrings; the last substring will be the unsplit remainder.
277 // n == 0: the result is nil (zero substrings)
278 // n < 0: all substrings
279 func SplitAfterN(s
, sep
string, n
int) []string {
280 return genSplit(s
, sep
, len(sep
), n
)
283 // Split slices s into all substrings separated by sep and returns a slice of
284 // the substrings between those separators.
285 // If sep is empty, Split splits after each UTF-8 sequence.
286 // It is equivalent to SplitN with a count of -1.
287 func Split(s
, sep
string) []string { return genSplit(s
, sep
, 0, -1) }
289 // SplitAfter slices s into all substrings after each instance of sep and
290 // returns a slice of those substrings.
291 // If sep is empty, SplitAfter splits after each UTF-8 sequence.
292 // It is equivalent to SplitAfterN with a count of -1.
293 func SplitAfter(s
, sep
string) []string {
294 return genSplit(s
, sep
, len(sep
), -1)
297 // Fields splits the string s around each instance of one or more consecutive white space
298 // characters, as defined by unicode.IsSpace, returning an array of substrings of s or an
299 // empty list if s contains only white space.
300 func Fields(s
string) []string {
301 return FieldsFunc(s
, unicode
.IsSpace
)
304 // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
305 // and returns an array of slices of s. If all code points in s satisfy f(c) or the
306 // string is empty, an empty slice is returned.
307 // FieldsFunc makes no guarantees about the order in which it calls f(c).
308 // If f does not return consistent results for a given c, FieldsFunc may crash.
309 func FieldsFunc(s
string, f
func(rune
) bool) []string {
310 // First count the fields.
313 for _
, rune
:= range s
{
314 wasInField
:= inField
316 if inField
&& !wasInField
{
322 a
:= make([]string, n
)
324 fieldStart
:= -1 // Set to -1 when looking for start of field.
325 for i
, rune
:= range s
{
328 a
[na
] = s
[fieldStart
:i
]
332 } else if fieldStart
== -1 {
336 if fieldStart
>= 0 { // Last field might end at EOF.
337 a
[na
] = s
[fieldStart
:]
342 // Join concatenates the elements of a to create a single string. The separator string
343 // sep is placed between elements in the resulting string.
344 func Join(a
[]string, sep
string) string {
351 n
:= len(sep
) * (len(a
) - 1)
352 for i
:= 0; i
< len(a
); i
++ {
358 for _
, s
:= range a
[1:] {
359 bp
+= copy(b
[bp
:], sep
)
360 bp
+= copy(b
[bp
:], s
)
// HasPrefix tests whether the string s begins with prefix.
// An empty prefix matches every string, including the empty string.
func HasPrefix(s, prefix string) bool {
	// The length check must come first so the slice expression stays in range.
	return len(s) >= len(prefix) && s[:len(prefix)] == prefix
}
// HasSuffix tests whether the string s ends with suffix.
// An empty suffix matches every string, including the empty string.
func HasSuffix(s, suffix string) bool {
	// The length check must come first so the slice expression stays in range.
	return len(s) >= len(suffix) && s[len(s)-len(suffix):] == suffix
}
375 // Map returns a copy of the string s with all its characters modified
376 // according to the mapping function. If mapping returns a negative value, the character is
377 // dropped from the string with no replacement.
378 func Map(mapping
func(rune
) rune
, s
string) string {
379 // In the worst case, the string can grow when mapped, making
380 // things unpleasant. But it's so rare we barge in assuming it's
381 // fine. It could also shrink but that falls out naturally.
382 maxbytes
:= len(s
) // length of b
383 nbytes
:= 0 // number of bytes encoded in b
384 // The output buffer b is initialized on demand, the first
385 // time a character differs.
388 for i
, c
:= range s
{
394 b
= make([]byte, maxbytes
)
395 nbytes
= copy(b
, s
[:i
])
399 if r
>= utf8
.RuneSelf
{
400 wid
= utf8
.RuneLen(r
)
402 if nbytes
+wid
> maxbytes
{
404 maxbytes
= maxbytes
*2 + utf8
.UTFMax
405 nb
:= make([]byte, maxbytes
)
406 copy(nb
, b
[0:nbytes
])
409 nbytes
+= utf8
.EncodeRune(b
[nbytes
:maxbytes
], r
)
415 return string(b
[0:nbytes
])
418 // Repeat returns a new string consisting of count copies of the string s.
419 func Repeat(s
string, count
int) string {
420 b
:= make([]byte, len(s
)*count
)
429 // ToUpper returns a copy of the string s with all Unicode letters mapped to their upper case.
430 func ToUpper(s
string) string { return Map(unicode
.ToUpper
, s
) }
432 // ToLower returns a copy of the string s with all Unicode letters mapped to their lower case.
433 func ToLower(s
string) string { return Map(unicode
.ToLower
, s
) }
435 // ToTitle returns a copy of the string s with all Unicode letters mapped to their title case.
436 func ToTitle(s
string) string { return Map(unicode
.ToTitle
, s
) }
438 // ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their
439 // upper case, giving priority to the special casing rules.
440 func ToUpperSpecial(_case unicode
.SpecialCase
, s
string) string {
441 return Map(func(r rune
) rune
{ return _case
.ToUpper(r
) }, s
)
444 // ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their
445 // lower case, giving priority to the special casing rules.
446 func ToLowerSpecial(_case unicode
.SpecialCase
, s
string) string {
447 return Map(func(r rune
) rune
{ return _case
.ToLower(r
) }, s
)
450 // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their
451 // title case, giving priority to the special casing rules.
452 func ToTitleSpecial(_case unicode
.SpecialCase
, s
string) string {
453 return Map(func(r rune
) rune
{ return _case
.ToTitle(r
) }, s
)
456 // isSeparator reports whether the rune could mark a word boundary.
457 // TODO: update when package unicode captures more of the properties.
458 func isSeparator(r rune
) bool {
459 // ASCII alphanumerics and underscore are not separators
462 case '0' <= r
&& r
<= '9':
464 case 'a' <= r
&& r
<= 'z':
466 case 'A' <= r
&& r
<= 'Z':
473 // Letters and digits are not separators
474 if unicode
.IsLetter(r
) || unicode
.IsDigit(r
) {
477 // Otherwise, all we can do for now is treat spaces as separators.
478 return unicode
.IsSpace(r
)
481 // Title returns a copy of the string s with all Unicode letters that begin words
482 // mapped to their title case.
484 // BUG(rsc): The rule Title uses for word boundaries does not handle Unicode punctuation properly.
485 func Title(s
string) string {
486 // Use a closure here to remember state.
487 // Hackish but effective. Depends on Map scanning in order and calling
488 // the closure once per rune.
492 if isSeparator(prev
) {
494 return unicode
.ToTitle(r
)
502 // TrimLeftFunc returns a slice of the string s with all leading
503 // Unicode code points c satisfying f(c) removed.
504 func TrimLeftFunc(s
string, f
func(rune
) bool) string {
505 i
:= indexFunc(s
, f
, false)
512 // TrimRightFunc returns a slice of the string s with all trailing
513 // Unicode code points c satisfying f(c) removed.
514 func TrimRightFunc(s
string, f
func(rune
) bool) string {
515 i
:= lastIndexFunc(s
, f
, false)
516 if i
>= 0 && s
[i
] >= utf8
.RuneSelf
{
517 _
, wid
:= utf8
.DecodeRuneInString(s
[i
:])
525 // TrimFunc returns a slice of the string s with all leading
526 // and trailing Unicode code points c satisfying f(c) removed.
527 func TrimFunc(s
string, f
func(rune
) bool) string {
528 return TrimRightFunc(TrimLeftFunc(s
, f
), f
)
531 // IndexFunc returns the index into s of the first Unicode
532 // code point satisfying f(c), or -1 if none do.
533 func IndexFunc(s
string, f
func(rune
) bool) int {
534 return indexFunc(s
, f
, true)
537 // LastIndexFunc returns the index into s of the last
538 // Unicode code point satisfying f(c), or -1 if none do.
539 func LastIndexFunc(s
string, f
func(rune
) bool) int {
540 return lastIndexFunc(s
, f
, true)
// indexFunc is the same as IndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
546 func indexFunc(s
string, f
func(rune
) bool, truth
bool) int {
551 if r
>= utf8
.RuneSelf
{
552 r
, wid
= utf8
.DecodeRuneInString(s
[start
:])
// lastIndexFunc is the same as LastIndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
565 func lastIndexFunc(s
string, f
func(rune
) bool, truth
bool) int {
566 for i
:= len(s
); i
> 0; {
567 r
, size
:= utf8
.DecodeLastRuneInString(s
[0:i
])
576 func makeCutsetFunc(cutset
string) func(rune
) bool {
577 return func(r rune
) bool { return IndexRune(cutset
, r
) >= 0 }
580 // Trim returns a slice of the string s with all leading and
581 // trailing Unicode code points contained in cutset removed.
582 func Trim(s
string, cutset
string) string {
583 if s
== "" || cutset
== "" {
586 return TrimFunc(s
, makeCutsetFunc(cutset
))
589 // TrimLeft returns a slice of the string s with all leading
590 // Unicode code points contained in cutset removed.
591 func TrimLeft(s
string, cutset
string) string {
592 if s
== "" || cutset
== "" {
595 return TrimLeftFunc(s
, makeCutsetFunc(cutset
))
598 // TrimRight returns a slice of the string s, with all trailing
599 // Unicode code points contained in cutset removed.
600 func TrimRight(s
string, cutset
string) string {
601 if s
== "" || cutset
== "" {
604 return TrimRightFunc(s
, makeCutsetFunc(cutset
))
607 // TrimSpace returns a slice of the string s, with all leading
608 // and trailing white space removed, as defined by Unicode.
609 func TrimSpace(s
string) string {
610 return TrimFunc(s
, unicode
.IsSpace
)
613 // TrimPrefix returns s without the provided leading prefix string.
614 // If s doesn't start with prefix, s is returned unchanged.
615 func TrimPrefix(s
, prefix
string) string {
616 if HasPrefix(s
, prefix
) {
617 return s
[len(prefix
):]
622 // TrimSuffix returns s without the provided trailing suffix string.
623 // If s doesn't end with suffix, s is returned unchanged.
624 func TrimSuffix(s
, suffix
string) string {
625 if HasSuffix(s
, suffix
) {
626 return s
[:len(s
)-len(suffix
)]
631 // Replace returns a copy of the string s with the first n
632 // non-overlapping instances of old replaced by new.
633 // If old is empty, it matches at the beginning of the string
634 // and after each UTF-8 sequence, yielding up to k+1 replacements
635 // for a k-rune string.
636 // If n < 0, there is no limit on the number of replacements.
637 func Replace(s
, old
, new string, n
int) string {
638 if old
== new || n
== 0 {
639 return s
// avoid allocation
642 // Compute number of replacements.
643 if m
:= Count(s
, old
); m
== 0 {
644 return s
// avoid allocation
645 } else if n
< 0 || m
< n
{
649 // Apply replacements to buffer.
650 t
:= make([]byte, len(s
)+n
*(len(new)-len(old
)))
653 for i
:= 0; i
< n
; i
++ {
657 _
, wid
:= utf8
.DecodeRuneInString(s
[start
:])
661 j
+= Index(s
[start
:], old
)
663 w
+= copy(t
[w
:], s
[start
:j
])
664 w
+= copy(t
[w
:], new)
667 w
+= copy(t
[w
:], s
[start
:])
668 return string(t
[0:w
])
671 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
672 // are equal under Unicode case-folding.
673 func EqualFold(s
, t
string) bool {
674 for s
!= "" && t
!= "" {
675 // Extract first rune from each string.
677 if s
[0] < utf8
.RuneSelf
{
678 sr
, s
= rune(s
[0]), s
[1:]
680 r
, size
:= utf8
.DecodeRuneInString(s
)
683 if t
[0] < utf8
.RuneSelf
{
684 tr
, t
= rune(t
[0]), t
[1:]
686 r
, size
:= utf8
.DecodeRuneInString(t
)
690 // If they match, keep going; if not, return false.
697 // Make sr < tr to simplify what follows.
701 // Fast check for ASCII.
702 if tr
< utf8
.RuneSelf
&& 'A' <= sr
&& sr
<= 'Z' {
703 // ASCII, and sr is upper case. tr must be lower case.
704 if tr
== sr
+'a'-'A' {
710 // General case. SimpleFold(x) returns the next equivalent rune > x
711 // or wraps around to smaller values.
712 r
:= unicode
.SimpleFold(sr
)
713 for r
!= sr
&& r
< tr
{
714 r
= unicode
.SimpleFold(r
)
722 // One string is empty. Are both?