libgo/go/regexp/onepass.go

   1 // Copyright 2014 The Go Authors.  All rights reserved.
   2
   3 package regexp
   4
   5 import (
   6         "bytes"
   7         "regexp/syntax"
   8         "sort"
   9         "unicode"
  10 )
  11
  12 // Use of this source code is governed by a BSD-style
  13 // license that can be found in the LICENSE file.
  14
  15 // "One-pass" regexp execution.
  16 // Some regexps can be analyzed to determine that they never need
  17 // backtracking: they are guaranteed to run in one pass over the string
  18 // without bothering to save all the usual NFA state.
  19 // Detect those and execute them more quickly.
  20
  21 // A onePassProg is a compiled one-pass regular expression program.
  22 // It is the same as syntax.Prog except for the use of onePassInst.
  23 type onePassProg struct {
  24         Inst   []onePassInst
  25         Start  int // index of start instruction
  26         NumCap int // number of InstCapture insts in re
  27 }
  28
  29 // A onePassInst is a single instruction in a one-pass regular expression program.
  30 // It is the same as syntax.Inst except for the new 'Next' field.
  31 type onePassInst struct {
  32         syntax.Inst
  33         Next []uint32
  34 }
  35
  36 // OnePassPrefix returns a literal string that all matches for the
  37 // regexp must start with.  Complete is true if the prefix
  38 // is the entire match. Pc is the index of the last rune instruction
  39 // in the string. The OnePassPrefix skips over the mandatory
  40 // EmptyBeginText
  41 func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
  42         i := &p.Inst[p.Start]
  43         if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
  44                 return "", i.Op == syntax.InstMatch, uint32(p.Start)
  45         }
  46         pc = i.Out
  47         i = &p.Inst[pc]
  48         for i.Op == syntax.InstNop {
  49                 pc = i.Out
  50                 i = &p.Inst[pc]
  51         }
  52         // Avoid allocation of buffer if prefix is empty.
  53         if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
  54                 return "", i.Op == syntax.InstMatch, uint32(p.Start)
  55         }
  56
  57         // Have prefix; gather characters.
  58         var buf bytes.Buffer
  59         for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
  60                 buf.WriteRune(i.Rune[0])
  61                 pc, i = i.Out, &p.Inst[i.Out]
  62         }
  63         return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc
  64 }
  65
  66 // OnePassNext selects the next actionable state of the prog, based on the input character.
  67 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
  68 // One of the alternates may ultimately lead without input to end of line. If the instruction
  69 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
  70 func onePassNext(i *onePassInst, r rune) uint32 {
  71         next := i.MatchRunePos(r)
  72         if next >= 0 {
  73                 return i.Next[next]
  74         }
  75         if i.Op == syntax.InstAltMatch {
  76                 return i.Out
  77         }
  78         return 0
  79 }
  80
  81 func iop(i *syntax.Inst) syntax.InstOp {
  82         op := i.Op
  83         switch op {
  84         case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
  85                 op = syntax.InstRune
  86         }
  87         return op
  88 }
  89
  90 // Sparse Array implementation is used as a queueOnePass.
  91 type queueOnePass struct {
  92         sparse          []uint32
  93         dense           []uint32
  94         size, nextIndex uint32
  95 }
  96
  97 func (q *queueOnePass) empty() bool {
  98         return q.nextIndex >= q.size
  99 }
 100
 101 func (q *queueOnePass) next() (n uint32) {
 102         n = q.dense[q.nextIndex]
 103         q.nextIndex++
 104         return
 105 }
 106
 107 func (q *queueOnePass) clear() {
 108         q.size = 0
 109         q.nextIndex = 0
 110 }
 111
 112 func (q *queueOnePass) reset() {
 113         q.nextIndex = 0
 114 }
 115
 116 func (q *queueOnePass) contains(u uint32) bool {
 117         if u >= uint32(len(q.sparse)) {
 118                 return false
 119         }
 120         return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
 121 }
 122
 123 func (q *queueOnePass) insert(u uint32) {
 124         if !q.contains(u) {
 125                 q.insertNew(u)
 126         }
 127 }
 128
 129 func (q *queueOnePass) insertNew(u uint32) {
 130         if u >= uint32(len(q.sparse)) {
 131                 return
 132         }
 133         q.sparse[u] = q.size
 134         q.dense[q.size] = u
 135         q.size++
 136 }
 137
 138 func newQueue(size int) (q *queueOnePass) {
 139         return &queueOnePass{
 140                 sparse: make([]uint32, size),
 141                 dense:  make([]uint32, size),
 142         }
 143 }
 144
 145 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
 146 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
 147 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
 148 // NextIp array with the single element mergeFailed is returned.
 149 // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
 150 const mergeFailed = uint32(0xffffffff)
 151
 152 var (
 153         noRune = []rune{}
 154         noNext = []uint32{mergeFailed}
 155 )
 156
 157 func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
 158         leftLen := len(*leftRunes)
 159         rightLen := len(*rightRunes)
 160         if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
 161                 panic("mergeRuneSets odd length []rune")
 162         }
 163         var (
 164                 lx, rx int
 165         )
 166         merged := make([]rune, 0)
 167         next := make([]uint32, 0)
 168         ok := true
 169         defer func() {
 170                 if !ok {
 171                         merged = nil
 172                         next = nil
 173                 }
 174         }()
 175
 176         ix := -1
 177         extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
 178                 if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
 179                         return false
 180                 }
 181                 merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
 182                 *newLow += 2
 183                 ix += 2
 184                 next = append(next, pc)
 185                 return true
 186         }
 187
 188         for lx < leftLen || rx < rightLen {
 189                 switch {
 190                 case rx >= rightLen:
 191                         ok = extend(&lx, leftRunes, leftPC)
 192                 case lx >= leftLen:
 193                         ok = extend(&rx, rightRunes, rightPC)
 194                 case (*rightRunes)[rx] < (*leftRunes)[lx]:
 195                         ok = extend(&rx, rightRunes, rightPC)
 196                 default:
 197                         ok = extend(&lx, leftRunes, leftPC)
 198                 }
 199                 if !ok {
 200                         return noRune, noNext
 201                 }
 202         }
 203         return merged, next
 204 }
 205
 206 // cleanupOnePass drops working memory, and restores certain shortcut instructions.
 207 func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
 208         for ix, instOriginal := range original.Inst {
 209                 switch instOriginal.Op {
 210                 case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
 211                 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
 212                         prog.Inst[ix].Next = nil
 213                 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
 214                         prog.Inst[ix].Next = nil
 215                         prog.Inst[ix] = onePassInst{Inst: instOriginal}
 216                 }
 217         }
 218 }
 219
 220 // onePassCopy creates a copy of the original Prog, as we'll be modifying it
 221 func onePassCopy(prog *syntax.Prog) *onePassProg {
 222         p := &onePassProg{
 223                 Start:  prog.Start,
 224                 NumCap: prog.NumCap,
 225         }
 226         for _, inst := range prog.Inst {
 227                 p.Inst = append(p.Inst, onePassInst{Inst: inst})
 228         }
 229
 230         // rewrites one or more common Prog constructs that enable some otherwise
 231         // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
 232         // ip A, that points to ips B & C.
 233         // A:BC + B:DA => A:BC + B:CD
 234         // A:BC + B:DC => A:DC + B:DC
 235         for pc := range p.Inst {
 236                 switch p.Inst[pc].Op {
 237                 default:
 238                         continue
 239                 case syntax.InstAlt, syntax.InstAltMatch:
 240                         // A:Bx + B:Ay
 241                         p_A_Other := &p.Inst[pc].Out
 242                         p_A_Alt := &p.Inst[pc].Arg
 243                         // make sure a target is another Alt
 244                         instAlt := p.Inst[*p_A_Alt]
 245                         if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
 246                                 p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
 247                                 instAlt = p.Inst[*p_A_Alt]
 248                                 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
 249                                         continue
 250                                 }
 251                         }
 252                         instOther := p.Inst[*p_A_Other]
 253                         // Analyzing both legs pointing to Alts is for another day
 254                         if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
 255                                 // too complicated
 256                                 continue
 257                         }
 258                         // simple empty transition loop
 259                         // A:BC + B:DA => A:BC + B:DC
 260                         p_B_Alt := &p.Inst[*p_A_Alt].Out
 261                         p_B_Other := &p.Inst[*p_A_Alt].Arg
 262                         patch := false
 263                         if instAlt.Out == uint32(pc) {
 264                                 patch = true
 265                         } else if instAlt.Arg == uint32(pc) {
 266                                 patch = true
 267                                 p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
 268                         }
 269                         if patch {
 270                                 *p_B_Alt = *p_A_Other
 271                         }
 272
 273                         // empty transition to common target
 274                         // A:BC + B:DC => A:DC + B:DC
 275                         if *p_A_Other == *p_B_Alt {
 276                                 *p_A_Alt = *p_B_Other
 277                         }
 278                 }
 279         }
 280         return p
 281 }
 282
 283 // runeSlice exists to permit sorting the case-folded rune sets.
 284 type runeSlice []rune
 285
 286 func (p runeSlice) Len() int           { return len(p) }
 287 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
 288 func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
 289
 290 // Sort is a convenience method.
 291 func (p runeSlice) Sort() {
 292         sort.Sort(p)
 293 }
 294
 295 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
 296 var anyRune = []rune{0, unicode.MaxRune}
 297
 298 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
 299 // the match engine can always tell which branch to take. The routine may modify
 300 // p if it is turned into a onepass Prog. If it isn't possible for this to be a
 301 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
 302 // to the size of the Prog.
 303 func makeOnePass(p *onePassProg) *onePassProg {
 304         // If the machine is very long, it's not worth the time to check if we can use one pass.
 305         if len(p.Inst) >= 1000 {
 306                 return notOnePass
 307         }
 308
 309         var (
 310                 instQueue    = newQueue(len(p.Inst))
 311                 visitQueue   = newQueue(len(p.Inst))
 312                 build        func(uint32, *queueOnePass)
 313                 check        func(uint32, map[uint32]bool) bool
 314                 onePassRunes = make([][]rune, len(p.Inst))
 315         )
 316         build = func(pc uint32, q *queueOnePass) {
 317                 if q.contains(pc) {
 318                         return
 319                 }
 320                 inst := p.Inst[pc]
 321                 switch inst.Op {
 322                 case syntax.InstAlt, syntax.InstAltMatch:
 323                         q.insert(inst.Out)
 324                         build(inst.Out, q)
 325                         q.insert(inst.Arg)
 326                 case syntax.InstMatch, syntax.InstFail:
 327                 default:
 328                         q.insert(inst.Out)
 329                 }
 330         }
 331
 332         // check that paths from Alt instructions are unambiguous, and rebuild the new
 333         // program as a onepass program
 334         check = func(pc uint32, m map[uint32]bool) (ok bool) {
 335                 ok = true
 336                 inst := &p.Inst[pc]
 337                 if visitQueue.contains(pc) {
 338                         return
 339                 }
 340                 visitQueue.insert(pc)
 341                 switch inst.Op {
 342                 case syntax.InstAlt, syntax.InstAltMatch:
 343                         ok = check(inst.Out, m) && check(inst.Arg, m)
 344                         // check no-input paths to InstMatch
 345                         matchOut := m[inst.Out]
 346                         matchArg := m[inst.Arg]
 347                         if matchOut && matchArg {
 348                                 ok = false
 349                                 break
 350                         }
 351                         // Match on empty goes in inst.Out
 352                         if matchArg {
 353                                 inst.Out, inst.Arg = inst.Arg, inst.Out
 354                                 matchOut, matchArg = matchArg, matchOut
 355                         }
 356                         if matchOut {
 357                                 m[pc] = true
 358                                 inst.Op = syntax.InstAltMatch
 359                         }
 360
 361                         // build a dispatch operator from the two legs of the alt.
 362                         onePassRunes[pc], inst.Next = mergeRuneSets(
 363                                 &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
 364                         if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
 365                                 ok = false
 366                                 break
 367                         }
 368                 case syntax.InstCapture, syntax.InstNop:
 369                         ok = check(inst.Out, m)
 370                         m[pc] = m[inst.Out]
 371                         // pass matching runes back through these no-ops.
 372                         onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
 373                         inst.Next = []uint32{}
 374                         for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
 375                                 inst.Next = append(inst.Next, inst.Out)
 376                         }
 377                 case syntax.InstEmptyWidth:
 378                         ok = check(inst.Out, m)
 379                         m[pc] = m[inst.Out]
 380                         onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
 381                         inst.Next = []uint32{}
 382                         for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
 383                                 inst.Next = append(inst.Next, inst.Out)
 384                         }
 385                 case syntax.InstMatch, syntax.InstFail:
 386                         m[pc] = inst.Op == syntax.InstMatch
 387                         break
 388                 case syntax.InstRune:
 389                         ok = check(inst.Out, m)
 390                         m[pc] = false
 391                         if len(inst.Next) > 0 {
 392                                 break
 393                         }
 394                         if len(inst.Rune) == 0 {
 395                                 onePassRunes[pc] = []rune{}
 396                                 inst.Next = []uint32{inst.Out}
 397                                 break
 398                         }
 399                         runes := make([]rune, 0)
 400                         if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
 401                                 r0 := inst.Rune[0]
 402                                 runes = append(runes, r0, r0)
 403                                 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
 404                                         runes = append(runes, r1, r1)
 405                                 }
 406                                 sort.Sort(runeSlice(runes))
 407                         } else {
 408                                 runes = append(runes, inst.Rune...)
 409                         }
 410                         onePassRunes[pc] = runes
 411                         inst.Next = []uint32{}
 412                         for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
 413                                 inst.Next = append(inst.Next, inst.Out)
 414                         }
 415                         inst.Op = syntax.InstRune
 416                 case syntax.InstRune1:
 417                         ok = check(inst.Out, m)
 418                         m[pc] = false
 419                         if len(inst.Next) > 0 {
 420                                 break
 421                         }
 422                         runes := []rune{}
 423                         // expand case-folded runes
 424                         if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
 425                                 r0 := inst.Rune[0]
 426                                 runes = append(runes, r0, r0)
 427                                 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
 428                                         runes = append(runes, r1, r1)
 429                                 }
 430                                 sort.Sort(runeSlice(runes))
 431                         } else {
 432                                 runes = append(runes, inst.Rune[0], inst.Rune[0])
 433                         }
 434                         onePassRunes[pc] = runes
 435                         inst.Next = []uint32{}
 436                         for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
 437                                 inst.Next = append(inst.Next, inst.Out)
 438                         }
 439                         inst.Op = syntax.InstRune
 440                 case syntax.InstRuneAny:
 441                         ok = check(inst.Out, m)
 442                         m[pc] = false
 443                         if len(inst.Next) > 0 {
 444                                 break
 445                         }
 446                         onePassRunes[pc] = append([]rune{}, anyRune...)
 447                         inst.Next = []uint32{inst.Out}
 448                 case syntax.InstRuneAnyNotNL:
 449                         ok = check(inst.Out, m)
 450                         m[pc] = false
 451                         if len(inst.Next) > 0 {
 452                                 break
 453                         }
 454                         onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
 455                         inst.Next = []uint32{}
 456                         for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
 457                                 inst.Next = append(inst.Next, inst.Out)
 458                         }
 459                 }
 460                 return
 461         }
 462
 463         instQueue.clear()
 464         instQueue.insert(uint32(p.Start))
 465         m := make(map[uint32]bool, len(p.Inst))
 466         for !instQueue.empty() {
 467                 pc := instQueue.next()
 468                 inst := p.Inst[pc]
 469                 visitQueue.clear()
 470                 if !check(uint32(pc), m) {
 471                         p = notOnePass
 472                         break
 473                 }
 474                 switch inst.Op {
 475                 case syntax.InstAlt, syntax.InstAltMatch:
 476                         instQueue.insert(inst.Out)
 477                         instQueue.insert(inst.Arg)
 478                 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop:
 479                         instQueue.insert(inst.Out)
 480                 case syntax.InstMatch:
 481                 case syntax.InstFail:
 482                 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
 483                 default:
 484                 }
 485         }
 486         if p != notOnePass {
 487                 for i, _ := range p.Inst {
 488                         p.Inst[i].Rune = onePassRunes[i]
 489                 }
 490         }
 491         return p
 492 }
 493
 494 // walk visits each Inst in the prog once, and applies the argument
 495 // function(ip, next), in pre-order.
 496 func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) {
 497         var walk1 func(uint32)
 498         progQueue := newQueue(len(prog.Inst))
 499         walk1 = func(ip uint32) {
 500                 if progQueue.contains(ip) {
 501                         return
 502                 }
 503                 progQueue.insert(ip)
 504                 inst := prog.Inst[ip]
 505                 switch inst.Op {
 506                 case syntax.InstAlt, syntax.InstAltMatch:
 507                         for _, f := range funcs {
 508                                 f(ip, inst.Out)
 509                                 f(ip, inst.Arg)
 510                         }
 511                         walk1(inst.Out)
 512                         walk1(inst.Arg)
 513                 default:
 514                         for _, f := range funcs {
 515                                 f(ip, inst.Out)
 516                         }
 517                         walk1(inst.Out)
 518                 }
 519         }
 520         walk1(uint32(prog.Start))
 521 }
 522
 523 // find returns the Insts that match the argument predicate function
 524 func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) {
 525         matches = []uint32{}
 526
 527         for ip := range prog.Inst {
 528                 if f(prog, ip) {
 529                         matches = append(matches, uint32(ip))
 530                 }
 531         }
 532         return
 533 }
 534
 535 var notOnePass *onePassProg = nil
 536
 537 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
 538 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
 539 // Prog cannot be converted. For a one pass prog, the fundamental condition that must
 540 // be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
 541 func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
 542         if prog.Start == 0 {
 543                 return notOnePass
 544         }
 545         // onepass regexp is anchored
 546         if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
 547                 syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
 548                 return notOnePass
 549         }
 550         // every instruction leading to InstMatch must be EmptyEndText
 551         for _, inst := range prog.Inst {
 552                 opOut := prog.Inst[inst.Out].Op
 553                 switch inst.Op {
 554                 default:
 555                         if opOut == syntax.InstMatch {
 556                                 return notOnePass
 557                         }
 558                 case syntax.InstAlt, syntax.InstAltMatch:
 559                         if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
 560                                 return notOnePass
 561                         }
 562                 case syntax.InstEmptyWidth:
 563                         if opOut == syntax.InstMatch {
 564                                 if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
 565                                         continue
 566                                 }
 567                                 return notOnePass
 568                         }
 569                 }
 570         }
 571         // Creates a slightly optimized copy of the original Prog
 572         // that cleans up some Prog idioms that block valid onepass programs
 573         p = onePassCopy(prog)
 574
 575         // checkAmbiguity on InstAlts, build onepass Prog if possible
 576         p = makeOnePass(p)
 577
 578         if p != notOnePass {
 579                 cleanupOnePass(p, prog)
 580         }
 581         return p
 582 }