MATCH: Add patterns from phiopt's minmax_replacement
[official-gcc.git] / libgo / go / regexp / onepass.go
blobbc47f4c4a830da065ca5053227a97b0a53588ad0
1 // Copyright 2014 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package regexp
7 import (
8 "regexp/syntax"
9 "sort"
10 "strings"
11 "unicode"
12 "unicode/utf8"
15 // "One-pass" regexp execution.
16 // Some regexps can be analyzed to determine that they never need
17 // backtracking: they are guaranteed to run in one pass over the string
18 // without bothering to save all the usual NFA state.
19 // Detect those and execute them more quickly.
21 // A onePassProg is a compiled one-pass regular expression program.
22 // It is the same as syntax.Prog except for the use of onePassInst.
23 type onePassProg struct {
24 Inst []onePassInst
25 Start int // index of start instruction
26 NumCap int // number of InstCapture insts in re
29 // A onePassInst is a single instruction in a one-pass regular expression program.
30 // It is the same as syntax.Inst except for the new 'Next' field.
31 type onePassInst struct {
32 syntax.Inst
33 Next []uint32
36 // OnePassPrefix returns a literal string that all matches for the
37 // regexp must start with. Complete is true if the prefix
38 // is the entire match. Pc is the index of the last rune instruction
39 // in the string. The OnePassPrefix skips over the mandatory
40 // EmptyBeginText
41 func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
42 i := &p.Inst[p.Start]
43 if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
44 return "", i.Op == syntax.InstMatch, uint32(p.Start)
46 pc = i.Out
47 i = &p.Inst[pc]
48 for i.Op == syntax.InstNop {
49 pc = i.Out
50 i = &p.Inst[pc]
52 // Avoid allocation of buffer if prefix is empty.
53 if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
54 return "", i.Op == syntax.InstMatch, uint32(p.Start)
57 // Have prefix; gather characters.
58 var buf strings.Builder
59 for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError {
60 buf.WriteRune(i.Rune[0])
61 pc, i = i.Out, &p.Inst[i.Out]
63 if i.Op == syntax.InstEmptyWidth &&
64 syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
65 p.Inst[i.Out].Op == syntax.InstMatch {
66 complete = true
68 return buf.String(), complete, pc
71 // OnePassNext selects the next actionable state of the prog, based on the input character.
72 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
73 // One of the alternates may ultimately lead without input to end of line. If the instruction
74 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
75 func onePassNext(i *onePassInst, r rune) uint32 {
76 next := i.MatchRunePos(r)
77 if next >= 0 {
78 return i.Next[next]
80 if i.Op == syntax.InstAltMatch {
81 return i.Out
83 return 0
86 func iop(i *syntax.Inst) syntax.InstOp {
87 op := i.Op
88 switch op {
89 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
90 op = syntax.InstRune
92 return op
95 // Sparse Array implementation is used as a queueOnePass.
96 type queueOnePass struct {
97 sparse []uint32
98 dense []uint32
99 size, nextIndex uint32
102 func (q *queueOnePass) empty() bool {
103 return q.nextIndex >= q.size
106 func (q *queueOnePass) next() (n uint32) {
107 n = q.dense[q.nextIndex]
108 q.nextIndex++
109 return
112 func (q *queueOnePass) clear() {
113 q.size = 0
114 q.nextIndex = 0
117 func (q *queueOnePass) contains(u uint32) bool {
118 if u >= uint32(len(q.sparse)) {
119 return false
121 return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
124 func (q *queueOnePass) insert(u uint32) {
125 if !q.contains(u) {
126 q.insertNew(u)
130 func (q *queueOnePass) insertNew(u uint32) {
131 if u >= uint32(len(q.sparse)) {
132 return
134 q.sparse[u] = q.size
135 q.dense[q.size] = u
136 q.size++
139 func newQueue(size int) (q *queueOnePass) {
140 return &queueOnePass{
141 sparse: make([]uint32, size),
142 dense: make([]uint32, size),
146 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
147 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
148 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
149 // NextIp array with the single element mergeFailed is returned.
150 // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
151 const mergeFailed = uint32(0xffffffff)
153 var (
154 noRune = []rune{}
155 noNext = []uint32{mergeFailed}
158 func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
159 leftLen := len(*leftRunes)
160 rightLen := len(*rightRunes)
161 if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
162 panic("mergeRuneSets odd length []rune")
164 var (
165 lx, rx int
167 merged := make([]rune, 0)
168 next := make([]uint32, 0)
169 ok := true
170 defer func() {
171 if !ok {
172 merged = nil
173 next = nil
177 ix := -1
178 extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
179 if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
180 return false
182 merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
183 *newLow += 2
184 ix += 2
185 next = append(next, pc)
186 return true
189 for lx < leftLen || rx < rightLen {
190 switch {
191 case rx >= rightLen:
192 ok = extend(&lx, leftRunes, leftPC)
193 case lx >= leftLen:
194 ok = extend(&rx, rightRunes, rightPC)
195 case (*rightRunes)[rx] < (*leftRunes)[lx]:
196 ok = extend(&rx, rightRunes, rightPC)
197 default:
198 ok = extend(&lx, leftRunes, leftPC)
200 if !ok {
201 return noRune, noNext
204 return merged, next
207 // cleanupOnePass drops working memory, and restores certain shortcut instructions.
208 func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
209 for ix, instOriginal := range original.Inst {
210 switch instOriginal.Op {
211 case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
212 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
213 prog.Inst[ix].Next = nil
214 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
215 prog.Inst[ix].Next = nil
216 prog.Inst[ix] = onePassInst{Inst: instOriginal}
221 // onePassCopy creates a copy of the original Prog, as we'll be modifying it
222 func onePassCopy(prog *syntax.Prog) *onePassProg {
223 p := &onePassProg{
224 Start: prog.Start,
225 NumCap: prog.NumCap,
226 Inst: make([]onePassInst, len(prog.Inst)),
228 for i, inst := range prog.Inst {
229 p.Inst[i] = onePassInst{Inst: inst}
232 // rewrites one or more common Prog constructs that enable some otherwise
233 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
234 // ip A, that points to ips B & C.
235 // A:BC + B:DA => A:BC + B:CD
236 // A:BC + B:DC => A:DC + B:DC
237 for pc := range p.Inst {
238 switch p.Inst[pc].Op {
239 default:
240 continue
241 case syntax.InstAlt, syntax.InstAltMatch:
242 // A:Bx + B:Ay
243 p_A_Other := &p.Inst[pc].Out
244 p_A_Alt := &p.Inst[pc].Arg
245 // make sure a target is another Alt
246 instAlt := p.Inst[*p_A_Alt]
247 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
248 p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
249 instAlt = p.Inst[*p_A_Alt]
250 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
251 continue
254 instOther := p.Inst[*p_A_Other]
255 // Analyzing both legs pointing to Alts is for another day
256 if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
257 // too complicated
258 continue
260 // simple empty transition loop
261 // A:BC + B:DA => A:BC + B:DC
262 p_B_Alt := &p.Inst[*p_A_Alt].Out
263 p_B_Other := &p.Inst[*p_A_Alt].Arg
264 patch := false
265 if instAlt.Out == uint32(pc) {
266 patch = true
267 } else if instAlt.Arg == uint32(pc) {
268 patch = true
269 p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
271 if patch {
272 *p_B_Alt = *p_A_Other
275 // empty transition to common target
276 // A:BC + B:DC => A:DC + B:DC
277 if *p_A_Other == *p_B_Alt {
278 *p_A_Alt = *p_B_Other
282 return p
285 // runeSlice exists to permit sorting the case-folded rune sets.
286 type runeSlice []rune
288 func (p runeSlice) Len() int { return len(p) }
289 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
290 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
292 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
293 var anyRune = []rune{0, unicode.MaxRune}
295 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
296 // the match engine can always tell which branch to take. The routine may modify
297 // p if it is turned into a onepass Prog. If it isn't possible for this to be a
298 // onepass Prog, the Prog nil is returned. makeOnePass is recursive
299 // to the size of the Prog.
300 func makeOnePass(p *onePassProg) *onePassProg {
301 // If the machine is very long, it's not worth the time to check if we can use one pass.
302 if len(p.Inst) >= 1000 {
303 return nil
306 var (
307 instQueue = newQueue(len(p.Inst))
308 visitQueue = newQueue(len(p.Inst))
309 check func(uint32, []bool) bool
310 onePassRunes = make([][]rune, len(p.Inst))
313 // check that paths from Alt instructions are unambiguous, and rebuild the new
314 // program as a onepass program
315 check = func(pc uint32, m []bool) (ok bool) {
316 ok = true
317 inst := &p.Inst[pc]
318 if visitQueue.contains(pc) {
319 return
321 visitQueue.insert(pc)
322 switch inst.Op {
323 case syntax.InstAlt, syntax.InstAltMatch:
324 ok = check(inst.Out, m) && check(inst.Arg, m)
325 // check no-input paths to InstMatch
326 matchOut := m[inst.Out]
327 matchArg := m[inst.Arg]
328 if matchOut && matchArg {
329 ok = false
330 break
332 // Match on empty goes in inst.Out
333 if matchArg {
334 inst.Out, inst.Arg = inst.Arg, inst.Out
335 matchOut, matchArg = matchArg, matchOut
337 if matchOut {
338 m[pc] = true
339 inst.Op = syntax.InstAltMatch
342 // build a dispatch operator from the two legs of the alt.
343 onePassRunes[pc], inst.Next = mergeRuneSets(
344 &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
345 if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
346 ok = false
347 break
349 case syntax.InstCapture, syntax.InstNop:
350 ok = check(inst.Out, m)
351 m[pc] = m[inst.Out]
352 // pass matching runes back through these no-ops.
353 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
354 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
355 for i := range inst.Next {
356 inst.Next[i] = inst.Out
358 case syntax.InstEmptyWidth:
359 ok = check(inst.Out, m)
360 m[pc] = m[inst.Out]
361 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
362 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
363 for i := range inst.Next {
364 inst.Next[i] = inst.Out
366 case syntax.InstMatch, syntax.InstFail:
367 m[pc] = inst.Op == syntax.InstMatch
368 case syntax.InstRune:
369 m[pc] = false
370 if len(inst.Next) > 0 {
371 break
373 instQueue.insert(inst.Out)
374 if len(inst.Rune) == 0 {
375 onePassRunes[pc] = []rune{}
376 inst.Next = []uint32{inst.Out}
377 break
379 runes := make([]rune, 0)
380 if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
381 r0 := inst.Rune[0]
382 runes = append(runes, r0, r0)
383 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
384 runes = append(runes, r1, r1)
386 sort.Sort(runeSlice(runes))
387 } else {
388 runes = append(runes, inst.Rune...)
390 onePassRunes[pc] = runes
391 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
392 for i := range inst.Next {
393 inst.Next[i] = inst.Out
395 inst.Op = syntax.InstRune
396 case syntax.InstRune1:
397 m[pc] = false
398 if len(inst.Next) > 0 {
399 break
401 instQueue.insert(inst.Out)
402 runes := []rune{}
403 // expand case-folded runes
404 if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
405 r0 := inst.Rune[0]
406 runes = append(runes, r0, r0)
407 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
408 runes = append(runes, r1, r1)
410 sort.Sort(runeSlice(runes))
411 } else {
412 runes = append(runes, inst.Rune[0], inst.Rune[0])
414 onePassRunes[pc] = runes
415 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
416 for i := range inst.Next {
417 inst.Next[i] = inst.Out
419 inst.Op = syntax.InstRune
420 case syntax.InstRuneAny:
421 m[pc] = false
422 if len(inst.Next) > 0 {
423 break
425 instQueue.insert(inst.Out)
426 onePassRunes[pc] = append([]rune{}, anyRune...)
427 inst.Next = []uint32{inst.Out}
428 case syntax.InstRuneAnyNotNL:
429 m[pc] = false
430 if len(inst.Next) > 0 {
431 break
433 instQueue.insert(inst.Out)
434 onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
435 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
436 for i := range inst.Next {
437 inst.Next[i] = inst.Out
440 return
443 instQueue.clear()
444 instQueue.insert(uint32(p.Start))
445 m := make([]bool, len(p.Inst))
446 for !instQueue.empty() {
447 visitQueue.clear()
448 pc := instQueue.next()
449 if !check(pc, m) {
450 p = nil
451 break
454 if p != nil {
455 for i := range p.Inst {
456 p.Inst[i].Rune = onePassRunes[i]
459 return p
462 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
463 // can be recharacterized as a one-pass regexp program, or syntax.nil if the
464 // Prog cannot be converted. For a one pass prog, the fundamental condition that must
465 // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
466 func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
467 if prog.Start == 0 {
468 return nil
470 // onepass regexp is anchored
471 if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
472 syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
473 return nil
475 // every instruction leading to InstMatch must be EmptyEndText
476 for _, inst := range prog.Inst {
477 opOut := prog.Inst[inst.Out].Op
478 switch inst.Op {
479 default:
480 if opOut == syntax.InstMatch {
481 return nil
483 case syntax.InstAlt, syntax.InstAltMatch:
484 if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
485 return nil
487 case syntax.InstEmptyWidth:
488 if opOut == syntax.InstMatch {
489 if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
490 continue
492 return nil
496 // Creates a slightly optimized copy of the original Prog
497 // that cleans up some Prog idioms that block valid onepass programs
498 p = onePassCopy(prog)
500 // checkAmbiguity on InstAlts, build onepass Prog if possible
501 p = makeOnePass(p)
503 if p != nil {
504 cleanupOnePass(p, prog)
506 return p