2016-11-05 Richard Biener <rguenther@suse.de>
[official-gcc.git] / libgo / go / regexp / regexp.go
blobfe3db9f78b0f505943890a7c7d14f94d2860fb8d
1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Package regexp implements regular expression search.
6 //
7 // The syntax of the regular expressions accepted is the same
8 // general syntax used by Perl, Python, and other languages.
9 // More precisely, it is the syntax accepted by RE2 and described at
10 // https://golang.org/s/re2syntax, except for \C.
11 // For an overview of the syntax, run
12 // go doc regexp/syntax
14 // The regexp implementation provided by this package is
15 // guaranteed to run in time linear in the size of the input.
16 // (This is a property not guaranteed by most open source
17 // implementations of regular expressions.) For more information
18 // about this property, see
19 // http://swtch.com/~rsc/regexp/regexp1.html
20 // or any book about automata theory.
22 // All characters are UTF-8-encoded code points.
24 // There are 16 methods of Regexp that match a regular expression and identify
25 // the matched text. Their names are matched by this regular expression:
27 // Find(All)?(String)?(Submatch)?(Index)?
29 // If 'All' is present, the routine matches successive non-overlapping
30 // matches of the entire expression. Empty matches abutting a preceding
31 // match are ignored. The return value is a slice containing the successive
32 // return values of the corresponding non-'All' routine. These routines take
33 // an extra integer argument, n; if n >= 0, the function returns at most n
34 // matches/submatches.
36 // If 'String' is present, the argument is a string; otherwise it is a slice
37 // of bytes; return values are adjusted as appropriate.
39 // If 'Submatch' is present, the return value is a slice identifying the
40 // successive submatches of the expression. Submatches are matches of
41 // parenthesized subexpressions (also known as capturing groups) within the
42 // regular expression, numbered from left to right in order of opening
43 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1
44 // the match of the first parenthesized subexpression, and so on.
46 // If 'Index' is present, matches and submatches are identified by byte index
47 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of
48 // the nth submatch. The pair for n==0 identifies the match of the entire
49 // expression. If 'Index' is not present, the match is identified by the
50 // text of the match/submatch. If an index is negative, it means that
51 // subexpression did not match any string in the input.
53 // There is also a subset of the methods that can be applied to text read
54 // from a RuneReader:
56 // MatchReader, FindReaderIndex, FindReaderSubmatchIndex
58 // This set may grow. Note that regular expression matches may need to
59 // examine text beyond the text returned by a match, so the methods that
60 // match text from a RuneReader may read arbitrarily far into the input
61 // before returning.
63 // (There are a few other methods that do not match this pattern.)
65 package regexp
67 import (
68 "bytes"
69 "io"
70 "regexp/syntax"
71 "strconv"
72 "strings"
73 "sync"
74 "unicode"
75 "unicode/utf8"
78 // Regexp is the representation of a compiled regular expression.
79 // A Regexp is safe for concurrent use by multiple goroutines.
80 type Regexp struct {
81 // read-only after Compile
82 regexpRO
84 // cache of machines for running regexp
85 mu sync.Mutex
86 machine []*machine
89 type regexpRO struct {
90 expr string // as passed to Compile
91 prog *syntax.Prog // compiled program
92 onepass *onePassProg // onepass program or nil
93 prefix string // required prefix in unanchored matches
94 prefixBytes []byte // prefix, as a []byte
95 prefixComplete bool // prefix is the entire regexp
96 prefixRune rune // first rune in prefix
97 prefixEnd uint32 // pc for last rune in prefix
98 cond syntax.EmptyOp // empty-width conditions required at start of match
99 numSubexp int
100 subexpNames []string
101 longest bool
104 // String returns the source text used to compile the regular expression.
105 func (re *Regexp) String() string {
106 return re.expr
109 // Copy returns a new Regexp object copied from re.
111 // When using a Regexp in multiple goroutines, giving each goroutine
112 // its own copy helps to avoid lock contention.
113 func (re *Regexp) Copy() *Regexp {
114 // It is not safe to copy Regexp by value
115 // since it contains a sync.Mutex.
116 return &Regexp{
117 regexpRO: re.regexpRO,
121 // Compile parses a regular expression and returns, if successful,
122 // a Regexp object that can be used to match against text.
124 // When matching against text, the regexp returns a match that
125 // begins as early as possible in the input (leftmost), and among those
126 // it chooses the one that a backtracking search would have found first.
127 // This so-called leftmost-first matching is the same semantics
128 // that Perl, Python, and other implementations use, although this
129 // package implements it without the expense of backtracking.
130 // For POSIX leftmost-longest matching, see CompilePOSIX.
131 func Compile(expr string) (*Regexp, error) {
132 return compile(expr, syntax.Perl, false)
135 // CompilePOSIX is like Compile but restricts the regular expression
136 // to POSIX ERE (egrep) syntax and changes the match semantics to
137 // leftmost-longest.
139 // That is, when matching against text, the regexp returns a match that
140 // begins as early as possible in the input (leftmost), and among those
141 // it chooses a match that is as long as possible.
142 // This so-called leftmost-longest matching is the same semantics
143 // that early regular expression implementations used and that POSIX
144 // specifies.
146 // However, there can be multiple leftmost-longest matches, with different
147 // submatch choices, and here this package diverges from POSIX.
148 // Among the possible leftmost-longest matches, this package chooses
149 // the one that a backtracking search would have found first, while POSIX
150 // specifies that the match be chosen to maximize the length of the first
151 // subexpression, then the second, and so on from left to right.
152 // The POSIX rule is computationally prohibitive and not even well-defined.
153 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
154 func CompilePOSIX(expr string) (*Regexp, error) {
155 return compile(expr, syntax.POSIX, true)
158 // Longest makes future searches prefer the leftmost-longest match.
159 // That is, when matching against text, the regexp returns a match that
160 // begins as early as possible in the input (leftmost), and among those
161 // it chooses a match that is as long as possible.
162 func (re *Regexp) Longest() {
163 re.longest = true
166 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
167 re, err := syntax.Parse(expr, mode)
168 if err != nil {
169 return nil, err
171 maxCap := re.MaxCap()
172 capNames := re.CapNames()
174 re = re.Simplify()
175 prog, err := syntax.Compile(re)
176 if err != nil {
177 return nil, err
179 regexp := &Regexp{
180 regexpRO: regexpRO{
181 expr: expr,
182 prog: prog,
183 onepass: compileOnePass(prog),
184 numSubexp: maxCap,
185 subexpNames: capNames,
186 cond: prog.StartCond(),
187 longest: longest,
190 if regexp.onepass == notOnePass {
191 regexp.prefix, regexp.prefixComplete = prog.Prefix()
192 } else {
193 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
195 if regexp.prefix != "" {
196 // TODO(rsc): Remove this allocation by adding
197 // IndexString to package bytes.
198 regexp.prefixBytes = []byte(regexp.prefix)
199 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
201 return regexp, nil
204 // get returns a machine to use for matching re.
205 // It uses the re's machine cache if possible, to avoid
206 // unnecessary allocation.
207 func (re *Regexp) get() *machine {
208 re.mu.Lock()
209 if n := len(re.machine); n > 0 {
210 z := re.machine[n-1]
211 re.machine = re.machine[:n-1]
212 re.mu.Unlock()
213 return z
215 re.mu.Unlock()
216 z := progMachine(re.prog, re.onepass)
217 z.re = re
218 return z
221 // put returns a machine to the re's machine cache.
222 // There is no attempt to limit the size of the cache, so it will
223 // grow to the maximum number of simultaneous matches
224 // run using re. (The cache empties when re gets garbage collected.)
225 func (re *Regexp) put(z *machine) {
226 re.mu.Lock()
227 re.machine = append(re.machine, z)
228 re.mu.Unlock()
231 // MustCompile is like Compile but panics if the expression cannot be parsed.
232 // It simplifies safe initialization of global variables holding compiled regular
233 // expressions.
234 func MustCompile(str string) *Regexp {
235 regexp, error := Compile(str)
236 if error != nil {
237 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
239 return regexp
242 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
243 // It simplifies safe initialization of global variables holding compiled regular
244 // expressions.
245 func MustCompilePOSIX(str string) *Regexp {
246 regexp, error := CompilePOSIX(str)
247 if error != nil {
248 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
250 return regexp
253 func quote(s string) string {
254 if strconv.CanBackquote(s) {
255 return "`" + s + "`"
257 return strconv.Quote(s)
260 // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
261 func (re *Regexp) NumSubexp() int {
262 return re.numSubexp
265 // SubexpNames returns the names of the parenthesized subexpressions
266 // in this Regexp. The name for the first sub-expression is names[1],
267 // so that if m is a match slice, the name for m[i] is SubexpNames()[i].
268 // Since the Regexp as a whole cannot be named, names[0] is always
269 // the empty string. The slice should not be modified.
270 func (re *Regexp) SubexpNames() []string {
271 return re.subexpNames
274 const endOfText rune = -1
276 // input abstracts different representations of the input text. It provides
277 // one-character lookahead.
278 type input interface {
279 step(pos int) (r rune, width int) // advance one rune
280 canCheckPrefix() bool // can we look ahead without losing info?
281 hasPrefix(re *Regexp) bool
282 index(re *Regexp, pos int) int
283 context(pos int) syntax.EmptyOp
286 // inputString scans a string.
287 type inputString struct {
288 str string
291 func (i *inputString) step(pos int) (rune, int) {
292 if pos < len(i.str) {
293 c := i.str[pos]
294 if c < utf8.RuneSelf {
295 return rune(c), 1
297 return utf8.DecodeRuneInString(i.str[pos:])
299 return endOfText, 0
302 func (i *inputString) canCheckPrefix() bool {
303 return true
306 func (i *inputString) hasPrefix(re *Regexp) bool {
307 return strings.HasPrefix(i.str, re.prefix)
310 func (i *inputString) index(re *Regexp, pos int) int {
311 return strings.Index(i.str[pos:], re.prefix)
314 func (i *inputString) context(pos int) syntax.EmptyOp {
315 r1, r2 := endOfText, endOfText
316 if pos > 0 && pos <= len(i.str) {
317 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
319 if pos < len(i.str) {
320 r2, _ = utf8.DecodeRuneInString(i.str[pos:])
322 return syntax.EmptyOpContext(r1, r2)
325 // inputBytes scans a byte slice.
326 type inputBytes struct {
327 str []byte
330 func (i *inputBytes) step(pos int) (rune, int) {
331 if pos < len(i.str) {
332 c := i.str[pos]
333 if c < utf8.RuneSelf {
334 return rune(c), 1
336 return utf8.DecodeRune(i.str[pos:])
338 return endOfText, 0
341 func (i *inputBytes) canCheckPrefix() bool {
342 return true
345 func (i *inputBytes) hasPrefix(re *Regexp) bool {
346 return bytes.HasPrefix(i.str, re.prefixBytes)
349 func (i *inputBytes) index(re *Regexp, pos int) int {
350 return bytes.Index(i.str[pos:], re.prefixBytes)
353 func (i *inputBytes) context(pos int) syntax.EmptyOp {
354 r1, r2 := endOfText, endOfText
355 if pos > 0 && pos <= len(i.str) {
356 r1, _ = utf8.DecodeLastRune(i.str[:pos])
358 if pos < len(i.str) {
359 r2, _ = utf8.DecodeRune(i.str[pos:])
361 return syntax.EmptyOpContext(r1, r2)
364 // inputReader scans a RuneReader.
365 type inputReader struct {
366 r io.RuneReader
367 atEOT bool
368 pos int
371 func (i *inputReader) step(pos int) (rune, int) {
372 if !i.atEOT && pos != i.pos {
373 return endOfText, 0
376 r, w, err := i.r.ReadRune()
377 if err != nil {
378 i.atEOT = true
379 return endOfText, 0
381 i.pos += w
382 return r, w
385 func (i *inputReader) canCheckPrefix() bool {
386 return false
389 func (i *inputReader) hasPrefix(re *Regexp) bool {
390 return false
393 func (i *inputReader) index(re *Regexp, pos int) int {
394 return -1
397 func (i *inputReader) context(pos int) syntax.EmptyOp {
398 return 0
401 // LiteralPrefix returns a literal string that must begin any match
402 // of the regular expression re. It returns the boolean true if the
403 // literal string comprises the entire regular expression.
404 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
405 return re.prefix, re.prefixComplete
408 // MatchReader reports whether the Regexp matches the text read by the
409 // RuneReader.
410 func (re *Regexp) MatchReader(r io.RuneReader) bool {
411 return re.doExecute(r, nil, "", 0, 0) != nil
414 // MatchString reports whether the Regexp matches the string s.
415 func (re *Regexp) MatchString(s string) bool {
416 return re.doExecute(nil, nil, s, 0, 0) != nil
419 // Match reports whether the Regexp matches the byte slice b.
420 func (re *Regexp) Match(b []byte) bool {
421 return re.doExecute(nil, b, "", 0, 0) != nil
424 // MatchReader checks whether a textual regular expression matches the text
425 // read by the RuneReader. More complicated queries need to use Compile and
426 // the full Regexp interface.
427 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
428 re, err := Compile(pattern)
429 if err != nil {
430 return false, err
432 return re.MatchReader(r), nil
435 // MatchString checks whether a textual regular expression
436 // matches a string. More complicated queries need
437 // to use Compile and the full Regexp interface.
438 func MatchString(pattern string, s string) (matched bool, err error) {
439 re, err := Compile(pattern)
440 if err != nil {
441 return false, err
443 return re.MatchString(s), nil
446 // Match checks whether a textual regular expression
447 // matches a byte slice. More complicated queries need
448 // to use Compile and the full Regexp interface.
449 func Match(pattern string, b []byte) (matched bool, err error) {
450 re, err := Compile(pattern)
451 if err != nil {
452 return false, err
454 return re.Match(b), nil
457 // ReplaceAllString returns a copy of src, replacing matches of the Regexp
458 // with the replacement string repl. Inside repl, $ signs are interpreted as
459 // in Expand, so for instance $1 represents the text of the first submatch.
460 func (re *Regexp) ReplaceAllString(src, repl string) string {
461 n := 2
462 if strings.Contains(repl, "$") {
463 n = 2 * (re.numSubexp + 1)
465 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
466 return re.expand(dst, repl, nil, src, match)
468 return string(b)
471 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
472 // with the replacement string repl. The replacement repl is substituted directly,
473 // without using Expand.
474 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
475 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
476 return append(dst, repl...)
480 // ReplaceAllStringFunc returns a copy of src in which all matches of the
481 // Regexp have been replaced by the return value of function repl applied
482 // to the matched substring. The replacement returned by repl is substituted
483 // directly, without using Expand.
484 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
485 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
486 return append(dst, repl(src[match[0]:match[1]])...)
488 return string(b)
491 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
492 lastMatchEnd := 0 // end position of the most recent match
493 searchPos := 0 // position where we next look for a match
494 var buf []byte
495 var endPos int
496 if bsrc != nil {
497 endPos = len(bsrc)
498 } else {
499 endPos = len(src)
501 if nmatch > re.prog.NumCap {
502 nmatch = re.prog.NumCap
505 for searchPos <= endPos {
506 a := re.doExecute(nil, bsrc, src, searchPos, nmatch)
507 if len(a) == 0 {
508 break // no more matches
511 // Copy the unmatched characters before this match.
512 if bsrc != nil {
513 buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
514 } else {
515 buf = append(buf, src[lastMatchEnd:a[0]]...)
518 // Now insert a copy of the replacement string, but not for a
519 // match of the empty string immediately after another match.
520 // (Otherwise, we get double replacement for patterns that
521 // match both empty and nonempty strings.)
522 if a[1] > lastMatchEnd || a[0] == 0 {
523 buf = repl(buf, a)
525 lastMatchEnd = a[1]
527 // Advance past this match; always advance at least one character.
528 var width int
529 if bsrc != nil {
530 _, width = utf8.DecodeRune(bsrc[searchPos:])
531 } else {
532 _, width = utf8.DecodeRuneInString(src[searchPos:])
534 if searchPos+width > a[1] {
535 searchPos += width
536 } else if searchPos+1 > a[1] {
537 // This clause is only needed at the end of the input
538 // string. In that case, DecodeRuneInString returns width=0.
539 searchPos++
540 } else {
541 searchPos = a[1]
545 // Copy the unmatched characters after the last match.
546 if bsrc != nil {
547 buf = append(buf, bsrc[lastMatchEnd:]...)
548 } else {
549 buf = append(buf, src[lastMatchEnd:]...)
552 return buf
555 // ReplaceAll returns a copy of src, replacing matches of the Regexp
556 // with the replacement text repl. Inside repl, $ signs are interpreted as
557 // in Expand, so for instance $1 represents the text of the first submatch.
558 func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
559 n := 2
560 if bytes.IndexByte(repl, '$') >= 0 {
561 n = 2 * (re.numSubexp + 1)
563 srepl := ""
564 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
565 if len(srepl) != len(repl) {
566 srepl = string(repl)
568 return re.expand(dst, srepl, src, "", match)
570 return b
573 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
574 // with the replacement bytes repl. The replacement repl is substituted directly,
575 // without using Expand.
576 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
577 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
578 return append(dst, repl...)
582 // ReplaceAllFunc returns a copy of src in which all matches of the
583 // Regexp have been replaced by the return value of function repl applied
584 // to the matched byte slice. The replacement returned by repl is substituted
585 // directly, without using Expand.
586 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
587 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
588 return append(dst, repl(src[match[0]:match[1]])...)
592 var specialBytes = []byte(`\.+*?()|[]{}^$`)
594 func special(b byte) bool {
595 return bytes.IndexByte(specialBytes, b) >= 0
598 // QuoteMeta returns a string that quotes all regular expression metacharacters
599 // inside the argument text; the returned string is a regular expression matching
600 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
601 func QuoteMeta(s string) string {
602 b := make([]byte, 2*len(s))
604 // A byte loop is correct because all metacharacters are ASCII.
605 j := 0
606 for i := 0; i < len(s); i++ {
607 if special(s[i]) {
608 b[j] = '\\'
611 b[j] = s[i]
614 return string(b[0:j])
617 // The number of capture values in the program may correspond
618 // to fewer capturing expressions than are in the regexp.
619 // For example, "(a){0}" turns into an empty program, so the
620 // maximum capture in the program is 0 but we need to return
621 // an expression for \1. Pad appends -1s to the slice a as needed.
622 func (re *Regexp) pad(a []int) []int {
623 if a == nil {
624 // No match.
625 return nil
627 n := (1 + re.numSubexp) * 2
628 for len(a) < n {
629 a = append(a, -1)
631 return a
634 // Find matches in slice b if b is non-nil, otherwise find matches in string s.
635 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
636 var end int
637 if b == nil {
638 end = len(s)
639 } else {
640 end = len(b)
643 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
644 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap)
645 if len(matches) == 0 {
646 break
649 accept := true
650 if matches[1] == pos {
651 // We've found an empty match.
652 if matches[0] == prevMatchEnd {
653 // We don't allow an empty match right
654 // after a previous match, so ignore it.
655 accept = false
657 var width int
658 // TODO: use step()
659 if b == nil {
660 _, width = utf8.DecodeRuneInString(s[pos:end])
661 } else {
662 _, width = utf8.DecodeRune(b[pos:end])
664 if width > 0 {
665 pos += width
666 } else {
667 pos = end + 1
669 } else {
670 pos = matches[1]
672 prevMatchEnd = matches[1]
674 if accept {
675 deliver(re.pad(matches))
681 // Find returns a slice holding the text of the leftmost match in b of the regular expression.
682 // A return value of nil indicates no match.
683 func (re *Regexp) Find(b []byte) []byte {
684 a := re.doExecute(nil, b, "", 0, 2)
685 if a == nil {
686 return nil
688 return b[a[0]:a[1]]
691 // FindIndex returns a two-element slice of integers defining the location of
692 // the leftmost match in b of the regular expression. The match itself is at
693 // b[loc[0]:loc[1]].
694 // A return value of nil indicates no match.
695 func (re *Regexp) FindIndex(b []byte) (loc []int) {
696 a := re.doExecute(nil, b, "", 0, 2)
697 if a == nil {
698 return nil
700 return a[0:2]
703 // FindString returns a string holding the text of the leftmost match in s of the regular
704 // expression. If there is no match, the return value is an empty string,
705 // but it will also be empty if the regular expression successfully matches
706 // an empty string. Use FindStringIndex or FindStringSubmatch if it is
707 // necessary to distinguish these cases.
708 func (re *Regexp) FindString(s string) string {
709 a := re.doExecute(nil, nil, s, 0, 2)
710 if a == nil {
711 return ""
713 return s[a[0]:a[1]]
716 // FindStringIndex returns a two-element slice of integers defining the
717 // location of the leftmost match in s of the regular expression. The match
718 // itself is at s[loc[0]:loc[1]].
719 // A return value of nil indicates no match.
720 func (re *Regexp) FindStringIndex(s string) (loc []int) {
721 a := re.doExecute(nil, nil, s, 0, 2)
722 if a == nil {
723 return nil
725 return a[0:2]
728 // FindReaderIndex returns a two-element slice of integers defining the
729 // location of the leftmost match of the regular expression in text read from
730 // the RuneReader. The match text was found in the input stream at
731 // byte offset loc[0] through loc[1]-1.
732 // A return value of nil indicates no match.
733 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
734 a := re.doExecute(r, nil, "", 0, 2)
735 if a == nil {
736 return nil
738 return a[0:2]
741 // FindSubmatch returns a slice of slices holding the text of the leftmost
742 // match of the regular expression in b and the matches, if any, of its
743 // subexpressions, as defined by the 'Submatch' descriptions in the package
744 // comment.
745 // A return value of nil indicates no match.
746 func (re *Regexp) FindSubmatch(b []byte) [][]byte {
747 a := re.doExecute(nil, b, "", 0, re.prog.NumCap)
748 if a == nil {
749 return nil
751 ret := make([][]byte, 1+re.numSubexp)
752 for i := range ret {
753 if 2*i < len(a) && a[2*i] >= 0 {
754 ret[i] = b[a[2*i]:a[2*i+1]]
757 return ret
760 // Expand appends template to dst and returns the result; during the
761 // append, Expand replaces variables in the template with corresponding
762 // matches drawn from src. The match slice should have been returned by
763 // FindSubmatchIndex.
765 // In the template, a variable is denoted by a substring of the form
766 // $name or ${name}, where name is a non-empty sequence of letters,
767 // digits, and underscores. A purely numeric name like $1 refers to
768 // the submatch with the corresponding index; other names refer to
769 // capturing parentheses named with the (?P<name>...) syntax. A
770 // reference to an out of range or unmatched index or a name that is not
771 // present in the regular expression is replaced with an empty slice.
773 // In the $name form, name is taken to be as long as possible: $1x is
774 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
776 // To insert a literal $ in the output, use $$ in the template.
777 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
778 return re.expand(dst, string(template), src, "", match)
781 // ExpandString is like Expand but the template and source are strings.
782 // It appends to and returns a byte slice in order to give the calling
783 // code control over allocation.
784 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
785 return re.expand(dst, template, nil, src, match)
788 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
789 for len(template) > 0 {
790 i := strings.Index(template, "$")
791 if i < 0 {
792 break
794 dst = append(dst, template[:i]...)
795 template = template[i:]
796 if len(template) > 1 && template[1] == '$' {
797 // Treat $$ as $.
798 dst = append(dst, '$')
799 template = template[2:]
800 continue
802 name, num, rest, ok := extract(template)
803 if !ok {
804 // Malformed; treat $ as raw text.
805 dst = append(dst, '$')
806 template = template[1:]
807 continue
809 template = rest
810 if num >= 0 {
811 if 2*num+1 < len(match) && match[2*num] >= 0 {
812 if bsrc != nil {
813 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
814 } else {
815 dst = append(dst, src[match[2*num]:match[2*num+1]]...)
818 } else {
819 for i, namei := range re.subexpNames {
820 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
821 if bsrc != nil {
822 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
823 } else {
824 dst = append(dst, src[match[2*i]:match[2*i+1]]...)
826 break
831 dst = append(dst, template...)
832 return dst
835 // extract returns the name from a leading "$name" or "${name}" in str.
836 // If it is a number, extract returns num set to that number; otherwise num = -1.
837 func extract(str string) (name string, num int, rest string, ok bool) {
838 if len(str) < 2 || str[0] != '$' {
839 return
841 brace := false
842 if str[1] == '{' {
843 brace = true
844 str = str[2:]
845 } else {
846 str = str[1:]
848 i := 0
849 for i < len(str) {
850 rune, size := utf8.DecodeRuneInString(str[i:])
851 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
852 break
854 i += size
856 if i == 0 {
857 // empty name is not okay
858 return
860 name = str[:i]
861 if brace {
862 if i >= len(str) || str[i] != '}' {
863 // missing closing brace
864 return
869 // Parse number.
870 num = 0
871 for i := 0; i < len(name); i++ {
872 if name[i] < '0' || '9' < name[i] || num >= 1e8 {
873 num = -1
874 break
876 num = num*10 + int(name[i]) - '0'
878 // Disallow leading zeros.
879 if name[0] == '0' && len(name) > 1 {
880 num = -1
883 rest = str[i:]
884 ok = true
885 return
888 // FindSubmatchIndex returns a slice holding the index pairs identifying the
889 // leftmost match of the regular expression in b and the matches, if any, of
890 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
891 // in the package comment.
892 // A return value of nil indicates no match.
893 func (re *Regexp) FindSubmatchIndex(b []byte) []int {
894 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap))
897 // FindStringSubmatch returns a slice of strings holding the text of the
898 // leftmost match of the regular expression in s and the matches, if any, of
899 // its subexpressions, as defined by the 'Submatch' description in the
900 // package comment.
901 // A return value of nil indicates no match.
902 func (re *Regexp) FindStringSubmatch(s string) []string {
903 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap)
904 if a == nil {
905 return nil
907 ret := make([]string, 1+re.numSubexp)
908 for i := range ret {
909 if 2*i < len(a) && a[2*i] >= 0 {
910 ret[i] = s[a[2*i]:a[2*i+1]]
913 return ret
916 // FindStringSubmatchIndex returns a slice holding the index pairs
917 // identifying the leftmost match of the regular expression in s and the
918 // matches, if any, of its subexpressions, as defined by the 'Submatch' and
919 // 'Index' descriptions in the package comment.
920 // A return value of nil indicates no match.
921 func (re *Regexp) FindStringSubmatchIndex(s string) []int {
922 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap))
925 // FindReaderSubmatchIndex returns a slice holding the index pairs
926 // identifying the leftmost match of the regular expression of text read by
927 // the RuneReader, and the matches, if any, of its subexpressions, as defined
928 // by the 'Submatch' and 'Index' descriptions in the package comment. A
929 // return value of nil indicates no match.
930 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
931 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap))
934 const startSize = 10 // The size at which to start a slice in the 'All' routines.
936 // FindAll is the 'All' version of Find; it returns a slice of all successive
937 // matches of the expression, as defined by the 'All' description in the
938 // package comment.
939 // A return value of nil indicates no match.
940 func (re *Regexp) FindAll(b []byte, n int) [][]byte {
941 if n < 0 {
942 n = len(b) + 1
944 result := make([][]byte, 0, startSize)
945 re.allMatches("", b, n, func(match []int) {
946 result = append(result, b[match[0]:match[1]])
948 if len(result) == 0 {
949 return nil
951 return result
954 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
955 // successive matches of the expression, as defined by the 'All' description
956 // in the package comment.
957 // A return value of nil indicates no match.
958 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
959 if n < 0 {
960 n = len(b) + 1
962 result := make([][]int, 0, startSize)
963 re.allMatches("", b, n, func(match []int) {
964 result = append(result, match[0:2])
966 if len(result) == 0 {
967 return nil
969 return result
972 // FindAllString is the 'All' version of FindString; it returns a slice of all
973 // successive matches of the expression, as defined by the 'All' description
974 // in the package comment.
975 // A return value of nil indicates no match.
976 func (re *Regexp) FindAllString(s string, n int) []string {
977 if n < 0 {
978 n = len(s) + 1
980 result := make([]string, 0, startSize)
981 re.allMatches(s, nil, n, func(match []int) {
982 result = append(result, s[match[0]:match[1]])
984 if len(result) == 0 {
985 return nil
987 return result
990 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
991 // slice of all successive matches of the expression, as defined by the 'All'
992 // description in the package comment.
993 // A return value of nil indicates no match.
994 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
995 if n < 0 {
996 n = len(s) + 1
998 result := make([][]int, 0, startSize)
999 re.allMatches(s, nil, n, func(match []int) {
1000 result = append(result, match[0:2])
1002 if len(result) == 0 {
1003 return nil
1005 return result
1008 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
1009 // of all successive matches of the expression, as defined by the 'All'
1010 // description in the package comment.
1011 // A return value of nil indicates no match.
1012 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
1013 if n < 0 {
1014 n = len(b) + 1
1016 result := make([][][]byte, 0, startSize)
1017 re.allMatches("", b, n, func(match []int) {
1018 slice := make([][]byte, len(match)/2)
1019 for j := range slice {
1020 if match[2*j] >= 0 {
1021 slice[j] = b[match[2*j]:match[2*j+1]]
1024 result = append(result, slice)
1026 if len(result) == 0 {
1027 return nil
1029 return result
1032 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
1033 // a slice of all successive matches of the expression, as defined by the
1034 // 'All' description in the package comment.
1035 // A return value of nil indicates no match.
1036 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
1037 if n < 0 {
1038 n = len(b) + 1
1040 result := make([][]int, 0, startSize)
1041 re.allMatches("", b, n, func(match []int) {
1042 result = append(result, match)
1044 if len(result) == 0 {
1045 return nil
1047 return result
1050 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
1051 // returns a slice of all successive matches of the expression, as defined by
1052 // the 'All' description in the package comment.
1053 // A return value of nil indicates no match.
1054 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1055 if n < 0 {
1056 n = len(s) + 1
1058 result := make([][]string, 0, startSize)
1059 re.allMatches(s, nil, n, func(match []int) {
1060 slice := make([]string, len(match)/2)
1061 for j := range slice {
1062 if match[2*j] >= 0 {
1063 slice[j] = s[match[2*j]:match[2*j+1]]
1066 result = append(result, slice)
1068 if len(result) == 0 {
1069 return nil
1071 return result
1074 // FindAllStringSubmatchIndex is the 'All' version of
1075 // FindStringSubmatchIndex; it returns a slice of all successive matches of
1076 // the expression, as defined by the 'All' description in the package
1077 // comment.
1078 // A return value of nil indicates no match.
1079 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1080 if n < 0 {
1081 n = len(s) + 1
1083 result := make([][]int, 0, startSize)
1084 re.allMatches(s, nil, n, func(match []int) {
1085 result = append(result, match)
1087 if len(result) == 0 {
1088 return nil
1090 return result
1093 // Split slices s into substrings separated by the expression and returns a slice of
1094 // the substrings between those expression matches.
1096 // The slice returned by this method consists of all the substrings of s
1097 // not contained in the slice returned by FindAllString. When called on an expression
1098 // that contains no metacharacters, it is equivalent to strings.SplitN.
1100 // Example:
1101 // s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
1102 // // s: ["", "b", "b", "c", "cadaaae"]
1104 // The count determines the number of substrings to return:
1105 // n > 0: at most n substrings; the last substring will be the unsplit remainder.
1106 // n == 0: the result is nil (zero substrings)
1107 // n < 0: all substrings
1108 func (re *Regexp) Split(s string, n int) []string {
1110 if n == 0 {
1111 return nil
1114 if len(re.expr) > 0 && len(s) == 0 {
1115 return []string{""}
1118 matches := re.FindAllStringIndex(s, n)
1119 strings := make([]string, 0, len(matches))
1121 beg := 0
1122 end := 0
1123 for _, match := range matches {
1124 if n > 0 && len(strings) >= n-1 {
1125 break
1128 end = match[0]
1129 if match[1] != 0 {
1130 strings = append(strings, s[beg:end])
1132 beg = match[1]
1135 if end != len(s) {
1136 strings = append(strings, s[beg:])
1139 return strings