PR rtl-optimization/57003
[official-gcc.git] / libgo / go / regexp / onepass.go
blob501fb28af6691367d4f9d55424787ee267f8126f
1 // Copyright 2014 The Go Authors. All rights reserved.
3 package regexp
5 import (
6 "bytes"
7 "regexp/syntax"
8 "sort"
9 "unicode"
12 // Use of this source code is governed by a BSD-style
13 // license that can be found in the LICENSE file.
15 // "One-pass" regexp execution.
16 // Some regexps can be analyzed to determine that they never need
17 // backtracking: they are guaranteed to run in one pass over the string
18 // without bothering to save all the usual NFA state.
19 // Detect those and execute them more quickly.
21 // A onePassProg is a compiled one-pass regular expression program.
22 // It is the same as syntax.Prog except for the use of onePassInst.
23 type onePassProg struct {
24 Inst []onePassInst
25 Start int // index of start instruction
26 NumCap int // number of InstCapture insts in re
29 // A onePassInst is a single instruction in a one-pass regular expression program.
30 // It is the same as syntax.Inst except for the new 'Next' field.
31 type onePassInst struct {
32 syntax.Inst
33 Next []uint32
36 // OnePassPrefix returns a literal string that all matches for the
37 // regexp must start with. Complete is true if the prefix
38 // is the entire match. Pc is the index of the last rune instruction
39 // in the string. The OnePassPrefix skips over the mandatory
40 // EmptyBeginText
41 func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
42 i := &p.Inst[p.Start]
43 if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
44 return "", i.Op == syntax.InstMatch, uint32(p.Start)
46 pc = i.Out
47 i = &p.Inst[pc]
48 for i.Op == syntax.InstNop {
49 pc = i.Out
50 i = &p.Inst[pc]
52 // Avoid allocation of buffer if prefix is empty.
53 if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
54 return "", i.Op == syntax.InstMatch, uint32(p.Start)
57 // Have prefix; gather characters.
58 var buf bytes.Buffer
59 for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
60 buf.WriteRune(i.Rune[0])
61 pc, i = i.Out, &p.Inst[i.Out]
63 return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc
66 // OnePassNext selects the next actionable state of the prog, based on the input character.
67 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
68 // One of the alternates may ultimately lead without input to end of line. If the instruction
69 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
70 func onePassNext(i *onePassInst, r rune) uint32 {
71 next := i.MatchRunePos(r)
72 if next >= 0 {
73 return i.Next[next]
75 if i.Op == syntax.InstAltMatch {
76 return i.Out
78 return 0
81 func iop(i *syntax.Inst) syntax.InstOp {
82 op := i.Op
83 switch op {
84 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
85 op = syntax.InstRune
87 return op
90 // Sparse Array implementation is used as a queueOnePass.
91 type queueOnePass struct {
92 sparse []uint32
93 dense []uint32
94 size, nextIndex uint32
97 func (q *queueOnePass) empty() bool {
98 return q.nextIndex >= q.size
101 func (q *queueOnePass) next() (n uint32) {
102 n = q.dense[q.nextIndex]
103 q.nextIndex++
104 return
107 func (q *queueOnePass) clear() {
108 q.size = 0
109 q.nextIndex = 0
112 func (q *queueOnePass) reset() {
113 q.nextIndex = 0
116 func (q *queueOnePass) contains(u uint32) bool {
117 if u >= uint32(len(q.sparse)) {
118 return false
120 return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
123 func (q *queueOnePass) insert(u uint32) {
124 if !q.contains(u) {
125 q.insertNew(u)
129 func (q *queueOnePass) insertNew(u uint32) {
130 if u >= uint32(len(q.sparse)) {
131 return
133 q.sparse[u] = q.size
134 q.dense[q.size] = u
135 q.size++
138 func newQueue(size int) (q *queueOnePass) {
139 return &queueOnePass{
140 sparse: make([]uint32, size),
141 dense: make([]uint32, size),
145 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
146 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
147 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
148 // NextIp array with the single element mergeFailed is returned.
149 // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
150 const mergeFailed = uint32(0xffffffff)
152 var (
153 noRune = []rune{}
154 noNext = []uint32{mergeFailed}
157 func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
158 leftLen := len(*leftRunes)
159 rightLen := len(*rightRunes)
160 if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
161 panic("mergeRuneSets odd length []rune")
163 var (
164 lx, rx int
166 merged := make([]rune, 0)
167 next := make([]uint32, 0)
168 ok := true
169 defer func() {
170 if !ok {
171 merged = nil
172 next = nil
176 ix := -1
177 extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
178 if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
179 return false
181 merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
182 *newLow += 2
183 ix += 2
184 next = append(next, pc)
185 return true
188 for lx < leftLen || rx < rightLen {
189 switch {
190 case rx >= rightLen:
191 ok = extend(&lx, leftRunes, leftPC)
192 case lx >= leftLen:
193 ok = extend(&rx, rightRunes, rightPC)
194 case (*rightRunes)[rx] < (*leftRunes)[lx]:
195 ok = extend(&rx, rightRunes, rightPC)
196 default:
197 ok = extend(&lx, leftRunes, leftPC)
199 if !ok {
200 return noRune, noNext
203 return merged, next
206 // cleanupOnePass drops working memory, and restores certain shortcut instructions.
207 func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
208 for ix, instOriginal := range original.Inst {
209 switch instOriginal.Op {
210 case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
211 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
212 prog.Inst[ix].Next = nil
213 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
214 prog.Inst[ix].Next = nil
215 prog.Inst[ix] = onePassInst{Inst: instOriginal}
220 // onePassCopy creates a copy of the original Prog, as we'll be modifying it
221 func onePassCopy(prog *syntax.Prog) *onePassProg {
222 p := &onePassProg{
223 Start: prog.Start,
224 NumCap: prog.NumCap,
226 for _, inst := range prog.Inst {
227 p.Inst = append(p.Inst, onePassInst{Inst: inst})
230 // rewrites one or more common Prog constructs that enable some otherwise
231 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
232 // ip A, that points to ips B & C.
233 // A:BC + B:DA => A:BC + B:CD
234 // A:BC + B:DC => A:DC + B:DC
235 for pc := range p.Inst {
236 switch p.Inst[pc].Op {
237 default:
238 continue
239 case syntax.InstAlt, syntax.InstAltMatch:
240 // A:Bx + B:Ay
241 p_A_Other := &p.Inst[pc].Out
242 p_A_Alt := &p.Inst[pc].Arg
243 // make sure a target is another Alt
244 instAlt := p.Inst[*p_A_Alt]
245 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
246 p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
247 instAlt = p.Inst[*p_A_Alt]
248 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
249 continue
252 instOther := p.Inst[*p_A_Other]
253 // Analyzing both legs pointing to Alts is for another day
254 if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
255 // too complicated
256 continue
258 // simple empty transition loop
259 // A:BC + B:DA => A:BC + B:DC
260 p_B_Alt := &p.Inst[*p_A_Alt].Out
261 p_B_Other := &p.Inst[*p_A_Alt].Arg
262 patch := false
263 if instAlt.Out == uint32(pc) {
264 patch = true
265 } else if instAlt.Arg == uint32(pc) {
266 patch = true
267 p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
269 if patch {
270 *p_B_Alt = *p_A_Other
273 // empty transition to common target
274 // A:BC + B:DC => A:DC + B:DC
275 if *p_A_Other == *p_B_Alt {
276 *p_A_Alt = *p_B_Other
280 return p
283 // runeSlice exists to permit sorting the case-folded rune sets.
284 type runeSlice []rune
286 func (p runeSlice) Len() int { return len(p) }
287 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
288 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
290 // Sort is a convenience method.
291 func (p runeSlice) Sort() {
292 sort.Sort(p)
295 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
296 var anyRune = []rune{0, unicode.MaxRune}
298 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
299 // the match engine can always tell which branch to take. The routine may modify
300 // p if it is turned into a onepass Prog. If it isn't possible for this to be a
301 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
302 // to the size of the Prog.
303 func makeOnePass(p *onePassProg) *onePassProg {
304 // If the machine is very long, it's not worth the time to check if we can use one pass.
305 if len(p.Inst) >= 1000 {
306 return notOnePass
309 var (
310 instQueue = newQueue(len(p.Inst))
311 visitQueue = newQueue(len(p.Inst))
312 build func(uint32, *queueOnePass)
313 check func(uint32, map[uint32]bool) bool
314 onePassRunes = make([][]rune, len(p.Inst))
316 build = func(pc uint32, q *queueOnePass) {
317 if q.contains(pc) {
318 return
320 inst := p.Inst[pc]
321 switch inst.Op {
322 case syntax.InstAlt, syntax.InstAltMatch:
323 q.insert(inst.Out)
324 build(inst.Out, q)
325 q.insert(inst.Arg)
326 case syntax.InstMatch, syntax.InstFail:
327 default:
328 q.insert(inst.Out)
332 // check that paths from Alt instructions are unambiguous, and rebuild the new
333 // program as a onepass program
334 check = func(pc uint32, m map[uint32]bool) (ok bool) {
335 ok = true
336 inst := &p.Inst[pc]
337 if visitQueue.contains(pc) {
338 return
340 visitQueue.insert(pc)
341 switch inst.Op {
342 case syntax.InstAlt, syntax.InstAltMatch:
343 ok = check(inst.Out, m) && check(inst.Arg, m)
344 // check no-input paths to InstMatch
345 matchOut := m[inst.Out]
346 matchArg := m[inst.Arg]
347 if matchOut && matchArg {
348 ok = false
349 break
351 // Match on empty goes in inst.Out
352 if matchArg {
353 inst.Out, inst.Arg = inst.Arg, inst.Out
354 matchOut, matchArg = matchArg, matchOut
356 if matchOut {
357 m[pc] = true
358 inst.Op = syntax.InstAltMatch
361 // build a dispatch operator from the two legs of the alt.
362 onePassRunes[pc], inst.Next = mergeRuneSets(
363 &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
364 if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
365 ok = false
366 break
368 case syntax.InstCapture, syntax.InstNop:
369 ok = check(inst.Out, m)
370 m[pc] = m[inst.Out]
371 // pass matching runes back through these no-ops.
372 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
373 inst.Next = []uint32{}
374 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
375 inst.Next = append(inst.Next, inst.Out)
377 case syntax.InstEmptyWidth:
378 ok = check(inst.Out, m)
379 m[pc] = m[inst.Out]
380 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
381 inst.Next = []uint32{}
382 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
383 inst.Next = append(inst.Next, inst.Out)
385 case syntax.InstMatch, syntax.InstFail:
386 m[pc] = inst.Op == syntax.InstMatch
387 break
388 case syntax.InstRune:
389 ok = check(inst.Out, m)
390 m[pc] = false
391 if len(inst.Next) > 0 {
392 break
394 if len(inst.Rune) == 0 {
395 onePassRunes[pc] = []rune{}
396 inst.Next = []uint32{inst.Out}
397 break
399 runes := make([]rune, 0)
400 if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
401 r0 := inst.Rune[0]
402 runes = append(runes, r0, r0)
403 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
404 runes = append(runes, r1, r1)
406 sort.Sort(runeSlice(runes))
407 } else {
408 runes = append(runes, inst.Rune...)
410 onePassRunes[pc] = runes
411 inst.Next = []uint32{}
412 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
413 inst.Next = append(inst.Next, inst.Out)
415 inst.Op = syntax.InstRune
416 case syntax.InstRune1:
417 ok = check(inst.Out, m)
418 m[pc] = false
419 if len(inst.Next) > 0 {
420 break
422 runes := []rune{}
423 // expand case-folded runes
424 if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
425 r0 := inst.Rune[0]
426 runes = append(runes, r0, r0)
427 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
428 runes = append(runes, r1, r1)
430 sort.Sort(runeSlice(runes))
431 } else {
432 runes = append(runes, inst.Rune[0], inst.Rune[0])
434 onePassRunes[pc] = runes
435 inst.Next = []uint32{}
436 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
437 inst.Next = append(inst.Next, inst.Out)
439 inst.Op = syntax.InstRune
440 case syntax.InstRuneAny:
441 ok = check(inst.Out, m)
442 m[pc] = false
443 if len(inst.Next) > 0 {
444 break
446 onePassRunes[pc] = append([]rune{}, anyRune...)
447 inst.Next = []uint32{inst.Out}
448 case syntax.InstRuneAnyNotNL:
449 ok = check(inst.Out, m)
450 m[pc] = false
451 if len(inst.Next) > 0 {
452 break
454 onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
455 inst.Next = []uint32{}
456 for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
457 inst.Next = append(inst.Next, inst.Out)
460 return
463 instQueue.clear()
464 instQueue.insert(uint32(p.Start))
465 m := make(map[uint32]bool, len(p.Inst))
466 for !instQueue.empty() {
467 pc := instQueue.next()
468 inst := p.Inst[pc]
469 visitQueue.clear()
470 if !check(uint32(pc), m) {
471 p = notOnePass
472 break
474 switch inst.Op {
475 case syntax.InstAlt, syntax.InstAltMatch:
476 instQueue.insert(inst.Out)
477 instQueue.insert(inst.Arg)
478 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop:
479 instQueue.insert(inst.Out)
480 case syntax.InstMatch:
481 case syntax.InstFail:
482 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
483 default:
486 if p != notOnePass {
487 for i, _ := range p.Inst {
488 p.Inst[i].Rune = onePassRunes[i]
491 return p
494 // walk visits each Inst in the prog once, and applies the argument
495 // function(ip, next), in pre-order.
496 func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) {
497 var walk1 func(uint32)
498 progQueue := newQueue(len(prog.Inst))
499 walk1 = func(ip uint32) {
500 if progQueue.contains(ip) {
501 return
503 progQueue.insert(ip)
504 inst := prog.Inst[ip]
505 switch inst.Op {
506 case syntax.InstAlt, syntax.InstAltMatch:
507 for _, f := range funcs {
508 f(ip, inst.Out)
509 f(ip, inst.Arg)
511 walk1(inst.Out)
512 walk1(inst.Arg)
513 default:
514 for _, f := range funcs {
515 f(ip, inst.Out)
517 walk1(inst.Out)
520 walk1(uint32(prog.Start))
523 // find returns the Insts that match the argument predicate function
524 func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) {
525 matches = []uint32{}
527 for ip := range prog.Inst {
528 if f(prog, ip) {
529 matches = append(matches, uint32(ip))
532 return
535 var notOnePass *onePassProg = nil
537 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
538 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
539 // Prog cannot be converted. For a one pass prog, the fundamental condition that must
540 // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
541 func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
542 if prog.Start == 0 {
543 return notOnePass
545 // onepass regexp is anchored
546 if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
547 syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
548 return notOnePass
550 // every instruction leading to InstMatch must be EmptyEndText
551 for _, inst := range prog.Inst {
552 opOut := prog.Inst[inst.Out].Op
553 switch inst.Op {
554 default:
555 if opOut == syntax.InstMatch {
556 return notOnePass
558 case syntax.InstAlt, syntax.InstAltMatch:
559 if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
560 return notOnePass
562 case syntax.InstEmptyWidth:
563 if opOut == syntax.InstMatch {
564 if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
565 continue
567 return notOnePass
571 // Creates a slightly optimized copy of the original Prog
572 // that cleans up some Prog idioms that block valid onepass programs
573 p = onePassCopy(prog)
575 // checkAmbiguity on InstAlts, build onepass Prog if possible
576 p = makeOnePass(p)
578 if p != notOnePass {
579 cleanupOnePass(p, prog)
581 return p