libgo/go/regexp/regexp.go

   1 // Copyright 2009 The Go Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 // Package regexp implements regular expression search.
   6 //
   7 // The syntax of the regular expressions accepted is the same
   8 // general syntax used by Perl, Python, and other languages.
   9 // More precisely, it is the syntax accepted by RE2 and described at
  10 // https://golang.org/s/re2syntax, except for \C.
  11 // For an overview of the syntax, run
  12 //   go doc regexp/syntax
  13 //
  14 // The regexp implementation provided by this package is
  15 // guaranteed to run in time linear in the size of the input.
  16 // (This is a property not guaranteed by most open source
  17 // implementations of regular expressions.) For more information
  18 // about this property, see
  19 //      http://swtch.com/~rsc/regexp/regexp1.html
  20 // or any book about automata theory.
  21 //
  22 // All characters are UTF-8-encoded code points.
  23 //
  24 // There are 16 methods of Regexp that match a regular expression and identify
  25 // the matched text. Their names are matched by this regular expression:
  26 //
  27 //      Find(All)?(String)?(Submatch)?(Index)?
  28 //
  29 // If 'All' is present, the routine matches successive non-overlapping
  30 // matches of the entire expression. Empty matches abutting a preceding
  31 // match are ignored. The return value is a slice containing the successive
  32 // return values of the corresponding non-'All' routine. These routines take
  33 // an extra integer argument, n; if n >= 0, the function returns at most n
  34 // matches/submatches.
  35 //
  36 // If 'String' is present, the argument is a string; otherwise it is a slice
  37 // of bytes; return values are adjusted as appropriate.
  38 //
  39 // If 'Submatch' is present, the return value is a slice identifying the
  40 // successive submatches of the expression. Submatches are matches of
  41 // parenthesized subexpressions (also known as capturing groups) within the
  42 // regular expression, numbered from left to right in order of opening
  43 // parenthesis. Submatch 0 is the match of the entire expression, submatch 1
  44 // the match of the first parenthesized subexpression, and so on.
  45 //
  46 // If 'Index' is present, matches and submatches are identified by byte index
  47 // pairs within the input string: result[2*n:2*n+1] identifies the indexes of
  48 // the nth submatch. The pair for n==0 identifies the match of the entire
  49 // expression. If 'Index' is not present, the match is identified by the
  50 // text of the match/submatch. If an index is negative, it means that
  51 // subexpression did not match any string in the input.
  52 //
  53 // There is also a subset of the methods that can be applied to text read
  54 // from a RuneReader:
  55 //
  56 //      MatchReader, FindReaderIndex, FindReaderSubmatchIndex
  57 //
  58 // This set may grow. Note that regular expression matches may need to
  59 // examine text beyond the text returned by a match, so the methods that
  60 // match text from a RuneReader may read arbitrarily far into the input
  61 // before returning.
  62 //
  63 // (There are a few other methods that do not match this pattern.)
  64 //
  65 package regexp
  66
  67 import (
  68         "bytes"
  69         "io"
  70         "regexp/syntax"
  71         "strconv"
  72         "strings"
  73         "sync"
  74         "unicode"
  75         "unicode/utf8"
  76 )
  77
  78 // Regexp is the representation of a compiled regular expression.
  79 // A Regexp is safe for concurrent use by multiple goroutines,
  80 // except for configuration methods, such as Longest.
  81 type Regexp struct {
  82         // read-only after Compile
  83         regexpRO
  84
  85         // cache of machines for running regexp
  86         mu      sync.Mutex
  87         machine []*machine
  88 }
  89
  90 type regexpRO struct {
  91         expr           string         // as passed to Compile
  92         prog           *syntax.Prog   // compiled program
  93         onepass        *onePassProg   // onepass program or nil
  94         prefix         string         // required prefix in unanchored matches
  95         prefixBytes    []byte         // prefix, as a []byte
  96         prefixComplete bool           // prefix is the entire regexp
  97         prefixRune     rune           // first rune in prefix
  98         prefixEnd      uint32         // pc for last rune in prefix
  99         cond           syntax.EmptyOp // empty-width conditions required at start of match
 100         numSubexp      int
 101         subexpNames    []string
 102         longest        bool
 103 }
 104
 105 // String returns the source text used to compile the regular expression.
 106 func (re *Regexp) String() string {
 107         return re.expr
 108 }
 109
 110 // Copy returns a new Regexp object copied from re.
 111 //
 112 // When using a Regexp in multiple goroutines, giving each goroutine
 113 // its own copy helps to avoid lock contention.
 114 func (re *Regexp) Copy() *Regexp {
 115         // It is not safe to copy Regexp by value
 116         // since it contains a sync.Mutex.
 117         return &Regexp{
 118                 regexpRO: re.regexpRO,
 119         }
 120 }
 121
 122 // Compile parses a regular expression and returns, if successful,
 123 // a Regexp object that can be used to match against text.
 124 //
 125 // When matching against text, the regexp returns a match that
 126 // begins as early as possible in the input (leftmost), and among those
 127 // it chooses the one that a backtracking search would have found first.
 128 // This so-called leftmost-first matching is the same semantics
 129 // that Perl, Python, and other implementations use, although this
 130 // package implements it without the expense of backtracking.
 131 // For POSIX leftmost-longest matching, see CompilePOSIX.
 132 func Compile(expr string) (*Regexp, error) {
 133         return compile(expr, syntax.Perl, false)
 134 }
 135
 136 // CompilePOSIX is like Compile but restricts the regular expression
 137 // to POSIX ERE (egrep) syntax and changes the match semantics to
 138 // leftmost-longest.
 139 //
 140 // That is, when matching against text, the regexp returns a match that
 141 // begins as early as possible in the input (leftmost), and among those
 142 // it chooses a match that is as long as possible.
 143 // This so-called leftmost-longest matching is the same semantics
 144 // that early regular expression implementations used and that POSIX
 145 // specifies.
 146 //
 147 // However, there can be multiple leftmost-longest matches, with different
 148 // submatch choices, and here this package diverges from POSIX.
 149 // Among the possible leftmost-longest matches, this package chooses
 150 // the one that a backtracking search would have found first, while POSIX
 151 // specifies that the match be chosen to maximize the length of the first
 152 // subexpression, then the second, and so on from left to right.
 153 // The POSIX rule is computationally prohibitive and not even well-defined.
 154 // See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
 155 func CompilePOSIX(expr string) (*Regexp, error) {
 156         return compile(expr, syntax.POSIX, true)
 157 }
 158
 159 // Longest makes future searches prefer the leftmost-longest match.
 160 // That is, when matching against text, the regexp returns a match that
 161 // begins as early as possible in the input (leftmost), and among those
 162 // it chooses a match that is as long as possible.
 163 // This method modifies the Regexp and may not be called concurrently
 164 // with any other methods.
 165 func (re *Regexp) Longest() {
 166         re.longest = true
 167 }
 168
 169 func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
 170         re, err := syntax.Parse(expr, mode)
 171         if err != nil {
 172                 return nil, err
 173         }
 174         maxCap := re.MaxCap()
 175         capNames := re.CapNames()
 176
 177         re = re.Simplify()
 178         prog, err := syntax.Compile(re)
 179         if err != nil {
 180                 return nil, err
 181         }
 182         regexp := &Regexp{
 183                 regexpRO: regexpRO{
 184                         expr:        expr,
 185                         prog:        prog,
 186                         onepass:     compileOnePass(prog),
 187                         numSubexp:   maxCap,
 188                         subexpNames: capNames,
 189                         cond:        prog.StartCond(),
 190                         longest:     longest,
 191                 },
 192         }
 193         if regexp.onepass == notOnePass {
 194                 regexp.prefix, regexp.prefixComplete = prog.Prefix()
 195         } else {
 196                 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
 197         }
 198         if regexp.prefix != "" {
 199                 // TODO(rsc): Remove this allocation by adding
 200                 // IndexString to package bytes.
 201                 regexp.prefixBytes = []byte(regexp.prefix)
 202                 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
 203         }
 204         return regexp, nil
 205 }
 206
 207 // get returns a machine to use for matching re.
 208 // It uses the re's machine cache if possible, to avoid
 209 // unnecessary allocation.
 210 func (re *Regexp) get() *machine {
 211         re.mu.Lock()
 212         if n := len(re.machine); n > 0 {
 213                 z := re.machine[n-1]
 214                 re.machine = re.machine[:n-1]
 215                 re.mu.Unlock()
 216                 return z
 217         }
 218         re.mu.Unlock()
 219         z := progMachine(re.prog, re.onepass)
 220         z.re = re
 221         return z
 222 }
 223
 224 // put returns a machine to the re's machine cache.
 225 // There is no attempt to limit the size of the cache, so it will
 226 // grow to the maximum number of simultaneous matches
 227 // run using re.  (The cache empties when re gets garbage collected.)
 228 func (re *Regexp) put(z *machine) {
 229         re.mu.Lock()
 230         re.machine = append(re.machine, z)
 231         re.mu.Unlock()
 232 }
 233
 234 // MustCompile is like Compile but panics if the expression cannot be parsed.
 235 // It simplifies safe initialization of global variables holding compiled regular
 236 // expressions.
 237 func MustCompile(str string) *Regexp {
 238         regexp, error := Compile(str)
 239         if error != nil {
 240                 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
 241         }
 242         return regexp
 243 }
 244
 245 // MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
 246 // It simplifies safe initialization of global variables holding compiled regular
 247 // expressions.
 248 func MustCompilePOSIX(str string) *Regexp {
 249         regexp, error := CompilePOSIX(str)
 250         if error != nil {
 251                 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
 252         }
 253         return regexp
 254 }
 255
 256 func quote(s string) string {
 257         if strconv.CanBackquote(s) {
 258                 return "`" + s + "`"
 259         }
 260         return strconv.Quote(s)
 261 }
 262
 263 // NumSubexp returns the number of parenthesized subexpressions in this Regexp.
 264 func (re *Regexp) NumSubexp() int {
 265         return re.numSubexp
 266 }
 267
 268 // SubexpNames returns the names of the parenthesized subexpressions
 269 // in this Regexp. The name for the first sub-expression is names[1],
 270 // so that if m is a match slice, the name for m[i] is SubexpNames()[i].
 271 // Since the Regexp as a whole cannot be named, names[0] is always
 272 // the empty string. The slice should not be modified.
 273 func (re *Regexp) SubexpNames() []string {
 274         return re.subexpNames
 275 }
 276
 277 const endOfText rune = -1
 278
 279 // input abstracts different representations of the input text. It provides
 280 // one-character lookahead.
 281 type input interface {
 282         step(pos int) (r rune, width int) // advance one rune
 283         canCheckPrefix() bool             // can we look ahead without losing info?
 284         hasPrefix(re *Regexp) bool
 285         index(re *Regexp, pos int) int
 286         context(pos int) syntax.EmptyOp
 287 }
 288
 289 // inputString scans a string.
 290 type inputString struct {
 291         str string
 292 }
 293
 294 func (i *inputString) step(pos int) (rune, int) {
 295         if pos < len(i.str) {
 296                 c := i.str[pos]
 297                 if c < utf8.RuneSelf {
 298                         return rune(c), 1
 299                 }
 300                 return utf8.DecodeRuneInString(i.str[pos:])
 301         }
 302         return endOfText, 0
 303 }
 304
 305 func (i *inputString) canCheckPrefix() bool {
 306         return true
 307 }
 308
 309 func (i *inputString) hasPrefix(re *Regexp) bool {
 310         return strings.HasPrefix(i.str, re.prefix)
 311 }
 312
 313 func (i *inputString) index(re *Regexp, pos int) int {
 314         return strings.Index(i.str[pos:], re.prefix)
 315 }
 316
 317 func (i *inputString) context(pos int) syntax.EmptyOp {
 318         r1, r2 := endOfText, endOfText
 319         // 0 < pos && pos <= len(i.str)
 320         if uint(pos-1) < uint(len(i.str)) {
 321                 r1 = rune(i.str[pos-1])
 322                 if r1 >= utf8.RuneSelf {
 323                         r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
 324                 }
 325         }
 326         // 0 <= pos && pos < len(i.str)
 327         if uint(pos) < uint(len(i.str)) {
 328                 r2 = rune(i.str[pos])
 329                 if r2 >= utf8.RuneSelf {
 330                         r2, _ = utf8.DecodeRuneInString(i.str[pos:])
 331                 }
 332         }
 333         return syntax.EmptyOpContext(r1, r2)
 334 }
 335
 336 // inputBytes scans a byte slice.
 337 type inputBytes struct {
 338         str []byte
 339 }
 340
 341 func (i *inputBytes) step(pos int) (rune, int) {
 342         if pos < len(i.str) {
 343                 c := i.str[pos]
 344                 if c < utf8.RuneSelf {
 345                         return rune(c), 1
 346                 }
 347                 return utf8.DecodeRune(i.str[pos:])
 348         }
 349         return endOfText, 0
 350 }
 351
 352 func (i *inputBytes) canCheckPrefix() bool {
 353         return true
 354 }
 355
 356 func (i *inputBytes) hasPrefix(re *Regexp) bool {
 357         return bytes.HasPrefix(i.str, re.prefixBytes)
 358 }
 359
 360 func (i *inputBytes) index(re *Regexp, pos int) int {
 361         return bytes.Index(i.str[pos:], re.prefixBytes)
 362 }
 363
 364 func (i *inputBytes) context(pos int) syntax.EmptyOp {
 365         r1, r2 := endOfText, endOfText
 366         // 0 < pos && pos <= len(i.str)
 367         if uint(pos-1) < uint(len(i.str)) {
 368                 r1 = rune(i.str[pos-1])
 369                 if r1 >= utf8.RuneSelf {
 370                         r1, _ = utf8.DecodeLastRune(i.str[:pos])
 371                 }
 372         }
 373         // 0 <= pos && pos < len(i.str)
 374         if uint(pos) < uint(len(i.str)) {
 375                 r2 = rune(i.str[pos])
 376                 if r2 >= utf8.RuneSelf {
 377                         r2, _ = utf8.DecodeRune(i.str[pos:])
 378                 }
 379         }
 380         return syntax.EmptyOpContext(r1, r2)
 381 }
 382
 383 // inputReader scans a RuneReader.
 384 type inputReader struct {
 385         r     io.RuneReader
 386         atEOT bool
 387         pos   int
 388 }
 389
 390 func (i *inputReader) step(pos int) (rune, int) {
 391         if !i.atEOT && pos != i.pos {
 392                 return endOfText, 0
 393
 394         }
 395         r, w, err := i.r.ReadRune()
 396         if err != nil {
 397                 i.atEOT = true
 398                 return endOfText, 0
 399         }
 400         i.pos += w
 401         return r, w
 402 }
 403
 404 func (i *inputReader) canCheckPrefix() bool {
 405         return false
 406 }
 407
 408 func (i *inputReader) hasPrefix(re *Regexp) bool {
 409         return false
 410 }
 411
 412 func (i *inputReader) index(re *Regexp, pos int) int {
 413         return -1
 414 }
 415
 416 func (i *inputReader) context(pos int) syntax.EmptyOp {
 417         return 0
 418 }
 419
 420 // LiteralPrefix returns a literal string that must begin any match
 421 // of the regular expression re. It returns the boolean true if the
 422 // literal string comprises the entire regular expression.
 423 func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
 424         return re.prefix, re.prefixComplete
 425 }
 426
 427 // MatchReader reports whether the Regexp matches the text read by the
 428 // RuneReader.
 429 func (re *Regexp) MatchReader(r io.RuneReader) bool {
 430         return re.doMatch(r, nil, "")
 431 }
 432
 433 // MatchString reports whether the Regexp matches the string s.
 434 func (re *Regexp) MatchString(s string) bool {
 435         return re.doMatch(nil, nil, s)
 436 }
 437
 438 // Match reports whether the Regexp matches the byte slice b.
 439 func (re *Regexp) Match(b []byte) bool {
 440         return re.doMatch(nil, b, "")
 441 }
 442
 443 // MatchReader checks whether a textual regular expression matches the text
 444 // read by the RuneReader. More complicated queries need to use Compile and
 445 // the full Regexp interface.
 446 func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
 447         re, err := Compile(pattern)
 448         if err != nil {
 449                 return false, err
 450         }
 451         return re.MatchReader(r), nil
 452 }
 453
 454 // MatchString checks whether a textual regular expression
 455 // matches a string. More complicated queries need
 456 // to use Compile and the full Regexp interface.
 457 func MatchString(pattern string, s string) (matched bool, err error) {
 458         re, err := Compile(pattern)
 459         if err != nil {
 460                 return false, err
 461         }
 462         return re.MatchString(s), nil
 463 }
 464
 465 // Match checks whether a textual regular expression
 466 // matches a byte slice. More complicated queries need
 467 // to use Compile and the full Regexp interface.
 468 func Match(pattern string, b []byte) (matched bool, err error) {
 469         re, err := Compile(pattern)
 470         if err != nil {
 471                 return false, err
 472         }
 473         return re.Match(b), nil
 474 }
 475
 476 // ReplaceAllString returns a copy of src, replacing matches of the Regexp
 477 // with the replacement string repl. Inside repl, $ signs are interpreted as
 478 // in Expand, so for instance $1 represents the text of the first submatch.
 479 func (re *Regexp) ReplaceAllString(src, repl string) string {
 480         n := 2
 481         if strings.Contains(repl, "$") {
 482                 n = 2 * (re.numSubexp + 1)
 483         }
 484         b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
 485                 return re.expand(dst, repl, nil, src, match)
 486         })
 487         return string(b)
 488 }
 489
 490 // ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
 491 // with the replacement string repl. The replacement repl is substituted directly,
 492 // without using Expand.
 493 func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
 494         return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
 495                 return append(dst, repl...)
 496         }))
 497 }
 498
 499 // ReplaceAllStringFunc returns a copy of src in which all matches of the
 500 // Regexp have been replaced by the return value of function repl applied
 501 // to the matched substring. The replacement returned by repl is substituted
 502 // directly, without using Expand.
 503 func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
 504         b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
 505                 return append(dst, repl(src[match[0]:match[1]])...)
 506         })
 507         return string(b)
 508 }
 509
 510 func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
 511         lastMatchEnd := 0 // end position of the most recent match
 512         searchPos := 0    // position where we next look for a match
 513         var buf []byte
 514         var endPos int
 515         if bsrc != nil {
 516                 endPos = len(bsrc)
 517         } else {
 518                 endPos = len(src)
 519         }
 520         if nmatch > re.prog.NumCap {
 521                 nmatch = re.prog.NumCap
 522         }
 523
 524         var dstCap [2]int
 525         for searchPos <= endPos {
 526                 a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0])
 527                 if len(a) == 0 {
 528                         break // no more matches
 529                 }
 530
 531                 // Copy the unmatched characters before this match.
 532                 if bsrc != nil {
 533                         buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
 534                 } else {
 535                         buf = append(buf, src[lastMatchEnd:a[0]]...)
 536                 }
 537
 538                 // Now insert a copy of the replacement string, but not for a
 539                 // match of the empty string immediately after another match.
 540                 // (Otherwise, we get double replacement for patterns that
 541                 // match both empty and nonempty strings.)
 542                 if a[1] > lastMatchEnd || a[0] == 0 {
 543                         buf = repl(buf, a)
 544                 }
 545                 lastMatchEnd = a[1]
 546
 547                 // Advance past this match; always advance at least one character.
 548                 var width int
 549                 if bsrc != nil {
 550                         _, width = utf8.DecodeRune(bsrc[searchPos:])
 551                 } else {
 552                         _, width = utf8.DecodeRuneInString(src[searchPos:])
 553                 }
 554                 if searchPos+width > a[1] {
 555                         searchPos += width
 556                 } else if searchPos+1 > a[1] {
 557                         // This clause is only needed at the end of the input
 558                         // string. In that case, DecodeRuneInString returns width=0.
 559                         searchPos++
 560                 } else {
 561                         searchPos = a[1]
 562                 }
 563         }
 564
 565         // Copy the unmatched characters after the last match.
 566         if bsrc != nil {
 567                 buf = append(buf, bsrc[lastMatchEnd:]...)
 568         } else {
 569                 buf = append(buf, src[lastMatchEnd:]...)
 570         }
 571
 572         return buf
 573 }
 574
 575 // ReplaceAll returns a copy of src, replacing matches of the Regexp
 576 // with the replacement text repl. Inside repl, $ signs are interpreted as
 577 // in Expand, so for instance $1 represents the text of the first submatch.
 578 func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
 579         n := 2
 580         if bytes.IndexByte(repl, '$') >= 0 {
 581                 n = 2 * (re.numSubexp + 1)
 582         }
 583         srepl := ""
 584         b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
 585                 if len(srepl) != len(repl) {
 586                         srepl = string(repl)
 587                 }
 588                 return re.expand(dst, srepl, src, "", match)
 589         })
 590         return b
 591 }
 592
 593 // ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
 594 // with the replacement bytes repl. The replacement repl is substituted directly,
 595 // without using Expand.
 596 func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
 597         return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
 598                 return append(dst, repl...)
 599         })
 600 }
 601
 602 // ReplaceAllFunc returns a copy of src in which all matches of the
 603 // Regexp have been replaced by the return value of function repl applied
 604 // to the matched byte slice. The replacement returned by repl is substituted
 605 // directly, without using Expand.
 606 func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
 607         return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
 608                 return append(dst, repl(src[match[0]:match[1]])...)
 609         })
 610 }
 611
 612 // Bitmap used by func special to check whether a character needs to be escaped.
 613 var specialBytes [16]byte
 614
 615 // special reports whether byte b needs to be escaped by QuoteMeta.
 616 func special(b byte) bool {
 617         return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0
 618 }
 619
 620 func init() {
 621         for _, b := range []byte(`\.+*?()|[]{}^$`) {
 622                 specialBytes[b%16] |= 1 << (b / 16)
 623         }
 624 }
 625
 626 // QuoteMeta returns a string that quotes all regular expression metacharacters
 627 // inside the argument text; the returned string is a regular expression matching
 628 // the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
 629 func QuoteMeta(s string) string {
 630         // A byte loop is correct because all metacharacters are ASCII.
 631         var i int
 632         for i = 0; i < len(s); i++ {
 633                 if special(s[i]) {
 634                         break
 635                 }
 636         }
 637         // No meta characters found, so return original string.
 638         if i >= len(s) {
 639                 return s
 640         }
 641
 642         b := make([]byte, 2*len(s)-i)
 643         copy(b, s[:i])
 644         j := i
 645         for ; i < len(s); i++ {
 646                 if special(s[i]) {
 647                         b[j] = '\\'
 648                         j++
 649                 }
 650                 b[j] = s[i]
 651                 j++
 652         }
 653         return string(b[:j])
 654 }
 655
 656 // The number of capture values in the program may correspond
 657 // to fewer capturing expressions than are in the regexp.
 658 // For example, "(a){0}" turns into an empty program, so the
 659 // maximum capture in the program is 0 but we need to return
 660 // an expression for \1.  Pad appends -1s to the slice a as needed.
 661 func (re *Regexp) pad(a []int) []int {
 662         if a == nil {
 663                 // No match.
 664                 return nil
 665         }
 666         n := (1 + re.numSubexp) * 2
 667         for len(a) < n {
 668                 a = append(a, -1)
 669         }
 670         return a
 671 }
 672
 673 // Find matches in slice b if b is non-nil, otherwise find matches in string s.
 674 func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
 675         var end int
 676         if b == nil {
 677                 end = len(s)
 678         } else {
 679                 end = len(b)
 680         }
 681
 682         for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
 683                 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil)
 684                 if len(matches) == 0 {
 685                         break
 686                 }
 687
 688                 accept := true
 689                 if matches[1] == pos {
 690                         // We've found an empty match.
 691                         if matches[0] == prevMatchEnd {
 692                                 // We don't allow an empty match right
 693                                 // after a previous match, so ignore it.
 694                                 accept = false
 695                         }
 696                         var width int
 697                         // TODO: use step()
 698                         if b == nil {
 699                                 _, width = utf8.DecodeRuneInString(s[pos:end])
 700                         } else {
 701                                 _, width = utf8.DecodeRune(b[pos:end])
 702                         }
 703                         if width > 0 {
 704                                 pos += width
 705                         } else {
 706                                 pos = end + 1
 707                         }
 708                 } else {
 709                         pos = matches[1]
 710                 }
 711                 prevMatchEnd = matches[1]
 712
 713                 if accept {
 714                         deliver(re.pad(matches))
 715                         i++
 716                 }
 717         }
 718 }
 719
 720 // Find returns a slice holding the text of the leftmost match in b of the regular expression.
 721 // A return value of nil indicates no match.
 722 func (re *Regexp) Find(b []byte) []byte {
 723         var dstCap [2]int
 724         a := re.doExecute(nil, b, "", 0, 2, dstCap[:0])
 725         if a == nil {
 726                 return nil
 727         }
 728         return b[a[0]:a[1]]
 729 }
 730
 731 // FindIndex returns a two-element slice of integers defining the location of
 732 // the leftmost match in b of the regular expression. The match itself is at
 733 // b[loc[0]:loc[1]].
 734 // A return value of nil indicates no match.
 735 func (re *Regexp) FindIndex(b []byte) (loc []int) {
 736         a := re.doExecute(nil, b, "", 0, 2, nil)
 737         if a == nil {
 738                 return nil
 739         }
 740         return a[0:2]
 741 }
 742
 743 // FindString returns a string holding the text of the leftmost match in s of the regular
 744 // expression. If there is no match, the return value is an empty string,
 745 // but it will also be empty if the regular expression successfully matches
 746 // an empty string. Use FindStringIndex or FindStringSubmatch if it is
 747 // necessary to distinguish these cases.
 748 func (re *Regexp) FindString(s string) string {
 749         var dstCap [2]int
 750         a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0])
 751         if a == nil {
 752                 return ""
 753         }
 754         return s[a[0]:a[1]]
 755 }
 756
 757 // FindStringIndex returns a two-element slice of integers defining the
 758 // location of the leftmost match in s of the regular expression. The match
 759 // itself is at s[loc[0]:loc[1]].
 760 // A return value of nil indicates no match.
 761 func (re *Regexp) FindStringIndex(s string) (loc []int) {
 762         a := re.doExecute(nil, nil, s, 0, 2, nil)
 763         if a == nil {
 764                 return nil
 765         }
 766         return a[0:2]
 767 }
 768
 769 // FindReaderIndex returns a two-element slice of integers defining the
 770 // location of the leftmost match of the regular expression in text read from
 771 // the RuneReader. The match text was found in the input stream at
 772 // byte offset loc[0] through loc[1]-1.
 773 // A return value of nil indicates no match.
 774 func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
 775         a := re.doExecute(r, nil, "", 0, 2, nil)
 776         if a == nil {
 777                 return nil
 778         }
 779         return a[0:2]
 780 }
 781
 782 // FindSubmatch returns a slice of slices holding the text of the leftmost
 783 // match of the regular expression in b and the matches, if any, of its
 784 // subexpressions, as defined by the 'Submatch' descriptions in the package
 785 // comment.
 786 // A return value of nil indicates no match.
 787 func (re *Regexp) FindSubmatch(b []byte) [][]byte {
 788         var dstCap [4]int
 789         a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0])
 790         if a == nil {
 791                 return nil
 792         }
 793         ret := make([][]byte, 1+re.numSubexp)
 794         for i := range ret {
 795                 if 2*i < len(a) && a[2*i] >= 0 {
 796                         ret[i] = b[a[2*i]:a[2*i+1]]
 797                 }
 798         }
 799         return ret
 800 }
 801
 802 // Expand appends template to dst and returns the result; during the
 803 // append, Expand replaces variables in the template with corresponding
 804 // matches drawn from src. The match slice should have been returned by
 805 // FindSubmatchIndex.
 806 //
 807 // In the template, a variable is denoted by a substring of the form
 808 // $name or ${name}, where name is a non-empty sequence of letters,
 809 // digits, and underscores. A purely numeric name like $1 refers to
 810 // the submatch with the corresponding index; other names refer to
 811 // capturing parentheses named with the (?P<name>...) syntax. A
 812 // reference to an out of range or unmatched index or a name that is not
 813 // present in the regular expression is replaced with an empty slice.
 814 //
 815 // In the $name form, name is taken to be as long as possible: $1x is
 816 // equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
 817 //
 818 // To insert a literal $ in the output, use $$ in the template.
 819 func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
 820         return re.expand(dst, string(template), src, "", match)
 821 }
 822
 823 // ExpandString is like Expand but the template and source are strings.
 824 // It appends to and returns a byte slice in order to give the calling
 825 // code control over allocation.
 826 func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
 827         return re.expand(dst, template, nil, src, match)
 828 }
 829
 830 func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
 831         for len(template) > 0 {
 832                 i := strings.Index(template, "$")
 833                 if i < 0 {
 834                         break
 835                 }
 836                 dst = append(dst, template[:i]...)
 837                 template = template[i:]
 838                 if len(template) > 1 && template[1] == '$' {
 839                         // Treat $$ as $.
 840                         dst = append(dst, '$')
 841                         template = template[2:]
 842                         continue
 843                 }
 844                 name, num, rest, ok := extract(template)
 845                 if !ok {
 846                         // Malformed; treat $ as raw text.
 847                         dst = append(dst, '$')
 848                         template = template[1:]
 849                         continue
 850                 }
 851                 template = rest
 852                 if num >= 0 {
 853                         if 2*num+1 < len(match) && match[2*num] >= 0 {
 854                                 if bsrc != nil {
 855                                         dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
 856                                 } else {
 857                                         dst = append(dst, src[match[2*num]:match[2*num+1]]...)
 858                                 }
 859                         }
 860                 } else {
 861                         for i, namei := range re.subexpNames {
 862                                 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
 863                                         if bsrc != nil {
 864                                                 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
 865                                         } else {
 866                                                 dst = append(dst, src[match[2*i]:match[2*i+1]]...)
 867                                         }
 868                                         break
 869                                 }
 870                         }
 871                 }
 872         }
 873         dst = append(dst, template...)
 874         return dst
 875 }
 876
 877 // extract returns the name from a leading "$name" or "${name}" in str.
 878 // If it is a number, extract returns num set to that number; otherwise num = -1.
 879 func extract(str string) (name string, num int, rest string, ok bool) {
 880         if len(str) < 2 || str[0] != '$' {
 881                 return
 882         }
 883         brace := false
 884         if str[1] == '{' {
 885                 brace = true
 886                 str = str[2:]
 887         } else {
 888                 str = str[1:]
 889         }
 890         i := 0
 891         for i < len(str) {
 892                 rune, size := utf8.DecodeRuneInString(str[i:])
 893                 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
 894                         break
 895                 }
 896                 i += size
 897         }
 898         if i == 0 {
 899                 // empty name is not okay
 900                 return
 901         }
 902         name = str[:i]
 903         if brace {
 904                 if i >= len(str) || str[i] != '}' {
 905                         // missing closing brace
 906                         return
 907                 }
 908                 i++
 909         }
 910
 911         // Parse number.
 912         num = 0
 913         for i := 0; i < len(name); i++ {
 914                 if name[i] < '0' || '9' < name[i] || num >= 1e8 {
 915                         num = -1
 916                         break
 917                 }
 918                 num = num*10 + int(name[i]) - '0'
 919         }
 920         // Disallow leading zeros.
 921         if name[0] == '0' && len(name) > 1 {
 922                 num = -1
 923         }
 924
 925         rest = str[i:]
 926         ok = true
 927         return
 928 }
 929
 930 // FindSubmatchIndex returns a slice holding the index pairs identifying the
 931 // leftmost match of the regular expression in b and the matches, if any, of
 932 // its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
 933 // in the package comment.
 934 // A return value of nil indicates no match.
 935 func (re *Regexp) FindSubmatchIndex(b []byte) []int {
 936         return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil))
 937 }
 938
 939 // FindStringSubmatch returns a slice of strings holding the text of the
 940 // leftmost match of the regular expression in s and the matches, if any, of
 941 // its subexpressions, as defined by the 'Submatch' description in the
 942 // package comment.
 943 // A return value of nil indicates no match.
 944 func (re *Regexp) FindStringSubmatch(s string) []string {
 945         var dstCap [4]int
 946         a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0])
 947         if a == nil {
 948                 return nil
 949         }
 950         ret := make([]string, 1+re.numSubexp)
 951         for i := range ret {
 952                 if 2*i < len(a) && a[2*i] >= 0 {
 953                         ret[i] = s[a[2*i]:a[2*i+1]]
 954                 }
 955         }
 956         return ret
 957 }
 958
 959 // FindStringSubmatchIndex returns a slice holding the index pairs
 960 // identifying the leftmost match of the regular expression in s and the
 961 // matches, if any, of its subexpressions, as defined by the 'Submatch' and
 962 // 'Index' descriptions in the package comment.
 963 // A return value of nil indicates no match.
 964 func (re *Regexp) FindStringSubmatchIndex(s string) []int {
 965         return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil))
 966 }
 967
 968 // FindReaderSubmatchIndex returns a slice holding the index pairs
 969 // identifying the leftmost match of the regular expression of text read by
 970 // the RuneReader, and the matches, if any, of its subexpressions, as defined
 971 // by the 'Submatch' and 'Index' descriptions in the package comment. A
 972 // return value of nil indicates no match.
 973 func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
 974         return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil))
 975 }
 976
 977 const startSize = 10 // The size at which to start a slice in the 'All' routines.
 978
 979 // FindAll is the 'All' version of Find; it returns a slice of all successive
 980 // matches of the expression, as defined by the 'All' description in the
 981 // package comment.
 982 // A return value of nil indicates no match.
 983 func (re *Regexp) FindAll(b []byte, n int) [][]byte {
 984         if n < 0 {
 985                 n = len(b) + 1
 986         }
 987         result := make([][]byte, 0, startSize)
 988         re.allMatches("", b, n, func(match []int) {
 989                 result = append(result, b[match[0]:match[1]])
 990         })
 991         if len(result) == 0 {
 992                 return nil
 993         }
 994         return result
 995 }
 996
 997 // FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
 998 // successive matches of the expression, as defined by the 'All' description
 999 // in the package comment.
1000 // A return value of nil indicates no match.
1001 func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
1002         if n < 0 {
1003                 n = len(b) + 1
1004         }
1005         result := make([][]int, 0, startSize)
1006         re.allMatches("", b, n, func(match []int) {
1007                 result = append(result, match[0:2])
1008         })
1009         if len(result) == 0 {
1010                 return nil
1011         }
1012         return result
1013 }
1014
1015 // FindAllString is the 'All' version of FindString; it returns a slice of all
1016 // successive matches of the expression, as defined by the 'All' description
1017 // in the package comment.
1018 // A return value of nil indicates no match.
1019 func (re *Regexp) FindAllString(s string, n int) []string {
1020         if n < 0 {
1021                 n = len(s) + 1
1022         }
1023         result := make([]string, 0, startSize)
1024         re.allMatches(s, nil, n, func(match []int) {
1025                 result = append(result, s[match[0]:match[1]])
1026         })
1027         if len(result) == 0 {
1028                 return nil
1029         }
1030         return result
1031 }
1032
1033 // FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
1034 // slice of all successive matches of the expression, as defined by the 'All'
1035 // description in the package comment.
1036 // A return value of nil indicates no match.
1037 func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
1038         if n < 0 {
1039                 n = len(s) + 1
1040         }
1041         result := make([][]int, 0, startSize)
1042         re.allMatches(s, nil, n, func(match []int) {
1043                 result = append(result, match[0:2])
1044         })
1045         if len(result) == 0 {
1046                 return nil
1047         }
1048         return result
1049 }
1050
1051 // FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
1052 // of all successive matches of the expression, as defined by the 'All'
1053 // description in the package comment.
1054 // A return value of nil indicates no match.
1055 func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
1056         if n < 0 {
1057                 n = len(b) + 1
1058         }
1059         result := make([][][]byte, 0, startSize)
1060         re.allMatches("", b, n, func(match []int) {
1061                 slice := make([][]byte, len(match)/2)
1062                 for j := range slice {
1063                         if match[2*j] >= 0 {
1064                                 slice[j] = b[match[2*j]:match[2*j+1]]
1065                         }
1066                 }
1067                 result = append(result, slice)
1068         })
1069         if len(result) == 0 {
1070                 return nil
1071         }
1072         return result
1073 }
1074
1075 // FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
1076 // a slice of all successive matches of the expression, as defined by the
1077 // 'All' description in the package comment.
1078 // A return value of nil indicates no match.
1079 func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
1080         if n < 0 {
1081                 n = len(b) + 1
1082         }
1083         result := make([][]int, 0, startSize)
1084         re.allMatches("", b, n, func(match []int) {
1085                 result = append(result, match)
1086         })
1087         if len(result) == 0 {
1088                 return nil
1089         }
1090         return result
1091 }
1092
1093 // FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
1094 // returns a slice of all successive matches of the expression, as defined by
1095 // the 'All' description in the package comment.
1096 // A return value of nil indicates no match.
1097 func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1098         if n < 0 {
1099                 n = len(s) + 1
1100         }
1101         result := make([][]string, 0, startSize)
1102         re.allMatches(s, nil, n, func(match []int) {
1103                 slice := make([]string, len(match)/2)
1104                 for j := range slice {
1105                         if match[2*j] >= 0 {
1106                                 slice[j] = s[match[2*j]:match[2*j+1]]
1107                         }
1108                 }
1109                 result = append(result, slice)
1110         })
1111         if len(result) == 0 {
1112                 return nil
1113         }
1114         return result
1115 }
1116
1117 // FindAllStringSubmatchIndex is the 'All' version of
1118 // FindStringSubmatchIndex; it returns a slice of all successive matches of
1119 // the expression, as defined by the 'All' description in the package
1120 // comment.
1121 // A return value of nil indicates no match.
1122 func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1123         if n < 0 {
1124                 n = len(s) + 1
1125         }
1126         result := make([][]int, 0, startSize)
1127         re.allMatches(s, nil, n, func(match []int) {
1128                 result = append(result, match)
1129         })
1130         if len(result) == 0 {
1131                 return nil
1132         }
1133         return result
1134 }
1135
1136 // Split slices s into substrings separated by the expression and returns a slice of
1137 // the substrings between those expression matches.
1138 //
1139 // The slice returned by this method consists of all the substrings of s
1140 // not contained in the slice returned by FindAllString. When called on an expression
1141 // that contains no metacharacters, it is equivalent to strings.SplitN.
1142 //
1143 // Example:
1144 //   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
1145 //   // s: ["", "b", "b", "c", "cadaaae"]
1146 //
1147 // The count determines the number of substrings to return:
1148 //   n > 0: at most n substrings; the last substring will be the unsplit remainder.
1149 //   n == 0: the result is nil (zero substrings)
1150 //   n < 0: all substrings
1151 func (re *Regexp) Split(s string, n int) []string {
1152
1153         if n == 0 {
1154                 return nil
1155         }
1156
1157         if len(re.expr) > 0 && len(s) == 0 {
1158                 return []string{""}
1159         }
1160
1161         matches := re.FindAllStringIndex(s, n)
1162         strings := make([]string, 0, len(matches))
1163
1164         beg := 0
1165         end := 0
1166         for _, match := range matches {
1167                 if n > 0 && len(strings) >= n-1 {
1168                         break
1169                 }
1170
1171                 end = match[0]
1172                 if match[1] != 0 {
1173                         strings = append(strings, s[beg:end])
1174                 }
1175                 beg = match[1]
1176         }
1177
1178         if end != len(s) {
1179                 strings = append(strings, s[beg:])
1180         }
1181
1182         return strings
1183 }