_mulkc3.c (__mulkc3): Add forward declaration.
[official-gcc.git] / libgo / go / regexp / onepass.go
blob3ceb4619058d3a7a8af7a859d868095db96195d8
1 // Copyright 2014 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package regexp
7 import (
8 "bytes"
9 "regexp/syntax"
10 "sort"
11 "unicode"
14 // "One-pass" regexp execution.
15 // Some regexps can be analyzed to determine that they never need
16 // backtracking: they are guaranteed to run in one pass over the string
17 // without bothering to save all the usual NFA state.
18 // Detect those and execute them more quickly.
20 // A onePassProg is a compiled one-pass regular expression program.
21 // It is the same as syntax.Prog except for the use of onePassInst.
22 type onePassProg struct {
23 Inst []onePassInst
24 Start int // index of start instruction
25 NumCap int // number of InstCapture insts in re
28 // A onePassInst is a single instruction in a one-pass regular expression program.
29 // It is the same as syntax.Inst except for the new 'Next' field.
30 type onePassInst struct {
31 syntax.Inst
32 Next []uint32
35 // OnePassPrefix returns a literal string that all matches for the
36 // regexp must start with. Complete is true if the prefix
37 // is the entire match. Pc is the index of the last rune instruction
38 // in the string. The OnePassPrefix skips over the mandatory
39 // EmptyBeginText
40 func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
41 i := &p.Inst[p.Start]
42 if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
43 return "", i.Op == syntax.InstMatch, uint32(p.Start)
45 pc = i.Out
46 i = &p.Inst[pc]
47 for i.Op == syntax.InstNop {
48 pc = i.Out
49 i = &p.Inst[pc]
51 // Avoid allocation of buffer if prefix is empty.
52 if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
53 return "", i.Op == syntax.InstMatch, uint32(p.Start)
56 // Have prefix; gather characters.
57 var buf bytes.Buffer
58 for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
59 buf.WriteRune(i.Rune[0])
60 pc, i = i.Out, &p.Inst[i.Out]
62 if i.Op == syntax.InstEmptyWidth &&
63 syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
64 p.Inst[i.Out].Op == syntax.InstMatch {
65 complete = true
67 return buf.String(), complete, pc
70 // OnePassNext selects the next actionable state of the prog, based on the input character.
71 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
72 // One of the alternates may ultimately lead without input to end of line. If the instruction
73 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
74 func onePassNext(i *onePassInst, r rune) uint32 {
75 next := i.MatchRunePos(r)
76 if next >= 0 {
77 return i.Next[next]
79 if i.Op == syntax.InstAltMatch {
80 return i.Out
82 return 0
85 func iop(i *syntax.Inst) syntax.InstOp {
86 op := i.Op
87 switch op {
88 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
89 op = syntax.InstRune
91 return op
94 // Sparse Array implementation is used as a queueOnePass.
95 type queueOnePass struct {
96 sparse []uint32
97 dense []uint32
98 size, nextIndex uint32
101 func (q *queueOnePass) empty() bool {
102 return q.nextIndex >= q.size
105 func (q *queueOnePass) next() (n uint32) {
106 n = q.dense[q.nextIndex]
107 q.nextIndex++
108 return
111 func (q *queueOnePass) clear() {
112 q.size = 0
113 q.nextIndex = 0
116 func (q *queueOnePass) contains(u uint32) bool {
117 if u >= uint32(len(q.sparse)) {
118 return false
120 return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
123 func (q *queueOnePass) insert(u uint32) {
124 if !q.contains(u) {
125 q.insertNew(u)
129 func (q *queueOnePass) insertNew(u uint32) {
130 if u >= uint32(len(q.sparse)) {
131 return
133 q.sparse[u] = q.size
134 q.dense[q.size] = u
135 q.size++
138 func newQueue(size int) (q *queueOnePass) {
139 return &queueOnePass{
140 sparse: make([]uint32, size),
141 dense: make([]uint32, size),
145 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
146 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
147 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
148 // NextIp array with the single element mergeFailed is returned.
149 // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
150 const mergeFailed = uint32(0xffffffff)
152 var (
153 noRune = []rune{}
154 noNext = []uint32{mergeFailed}
157 func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
158 leftLen := len(*leftRunes)
159 rightLen := len(*rightRunes)
160 if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
161 panic("mergeRuneSets odd length []rune")
163 var (
164 lx, rx int
166 merged := make([]rune, 0)
167 next := make([]uint32, 0)
168 ok := true
169 defer func() {
170 if !ok {
171 merged = nil
172 next = nil
176 ix := -1
177 extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
178 if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
179 return false
181 merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
182 *newLow += 2
183 ix += 2
184 next = append(next, pc)
185 return true
188 for lx < leftLen || rx < rightLen {
189 switch {
190 case rx >= rightLen:
191 ok = extend(&lx, leftRunes, leftPC)
192 case lx >= leftLen:
193 ok = extend(&rx, rightRunes, rightPC)
194 case (*rightRunes)[rx] < (*leftRunes)[lx]:
195 ok = extend(&rx, rightRunes, rightPC)
196 default:
197 ok = extend(&lx, leftRunes, leftPC)
199 if !ok {
200 return noRune, noNext
203 return merged, next
206 // cleanupOnePass drops working memory, and restores certain shortcut instructions.
207 func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
208 for ix, instOriginal := range original.Inst {
209 switch instOriginal.Op {
210 case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
211 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
212 prog.Inst[ix].Next = nil
213 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
214 prog.Inst[ix].Next = nil
215 prog.Inst[ix] = onePassInst{Inst: instOriginal}
220 // onePassCopy creates a copy of the original Prog, as we'll be modifying it
221 func onePassCopy(prog *syntax.Prog) *onePassProg {
222 p := &onePassProg{
223 Start: prog.Start,
224 NumCap: prog.NumCap,
225 Inst: make([]onePassInst, len(prog.Inst)),
227 for i, inst := range prog.Inst {
228 p.Inst[i] = onePassInst{Inst: inst}
231 // rewrites one or more common Prog constructs that enable some otherwise
232 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
233 // ip A, that points to ips B & C.
234 // A:BC + B:DA => A:BC + B:CD
235 // A:BC + B:DC => A:DC + B:DC
236 for pc := range p.Inst {
237 switch p.Inst[pc].Op {
238 default:
239 continue
240 case syntax.InstAlt, syntax.InstAltMatch:
241 // A:Bx + B:Ay
242 p_A_Other := &p.Inst[pc].Out
243 p_A_Alt := &p.Inst[pc].Arg
244 // make sure a target is another Alt
245 instAlt := p.Inst[*p_A_Alt]
246 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
247 p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
248 instAlt = p.Inst[*p_A_Alt]
249 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
250 continue
253 instOther := p.Inst[*p_A_Other]
254 // Analyzing both legs pointing to Alts is for another day
255 if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
256 // too complicated
257 continue
259 // simple empty transition loop
260 // A:BC + B:DA => A:BC + B:DC
261 p_B_Alt := &p.Inst[*p_A_Alt].Out
262 p_B_Other := &p.Inst[*p_A_Alt].Arg
263 patch := false
264 if instAlt.Out == uint32(pc) {
265 patch = true
266 } else if instAlt.Arg == uint32(pc) {
267 patch = true
268 p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
270 if patch {
271 *p_B_Alt = *p_A_Other
274 // empty transition to common target
275 // A:BC + B:DC => A:DC + B:DC
276 if *p_A_Other == *p_B_Alt {
277 *p_A_Alt = *p_B_Other
281 return p
284 // runeSlice exists to permit sorting the case-folded rune sets.
285 type runeSlice []rune
287 func (p runeSlice) Len() int { return len(p) }
288 func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
289 func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
291 var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
292 var anyRune = []rune{0, unicode.MaxRune}
294 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
295 // the match engine can always tell which branch to take. The routine may modify
296 // p if it is turned into a onepass Prog. If it isn't possible for this to be a
297 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
298 // to the size of the Prog.
299 func makeOnePass(p *onePassProg) *onePassProg {
300 // If the machine is very long, it's not worth the time to check if we can use one pass.
301 if len(p.Inst) >= 1000 {
302 return notOnePass
305 var (
306 instQueue = newQueue(len(p.Inst))
307 visitQueue = newQueue(len(p.Inst))
308 check func(uint32, []bool) bool
309 onePassRunes = make([][]rune, len(p.Inst))
312 // check that paths from Alt instructions are unambiguous, and rebuild the new
313 // program as a onepass program
314 check = func(pc uint32, m []bool) (ok bool) {
315 ok = true
316 inst := &p.Inst[pc]
317 if visitQueue.contains(pc) {
318 return
320 visitQueue.insert(pc)
321 switch inst.Op {
322 case syntax.InstAlt, syntax.InstAltMatch:
323 ok = check(inst.Out, m) && check(inst.Arg, m)
324 // check no-input paths to InstMatch
325 matchOut := m[inst.Out]
326 matchArg := m[inst.Arg]
327 if matchOut && matchArg {
328 ok = false
329 break
331 // Match on empty goes in inst.Out
332 if matchArg {
333 inst.Out, inst.Arg = inst.Arg, inst.Out
334 matchOut, matchArg = matchArg, matchOut
336 if matchOut {
337 m[pc] = true
338 inst.Op = syntax.InstAltMatch
341 // build a dispatch operator from the two legs of the alt.
342 onePassRunes[pc], inst.Next = mergeRuneSets(
343 &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
344 if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
345 ok = false
346 break
348 case syntax.InstCapture, syntax.InstNop:
349 ok = check(inst.Out, m)
350 m[pc] = m[inst.Out]
351 // pass matching runes back through these no-ops.
352 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
353 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
354 for i := range inst.Next {
355 inst.Next[i] = inst.Out
357 case syntax.InstEmptyWidth:
358 ok = check(inst.Out, m)
359 m[pc] = m[inst.Out]
360 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
361 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
362 for i := range inst.Next {
363 inst.Next[i] = inst.Out
365 case syntax.InstMatch, syntax.InstFail:
366 m[pc] = inst.Op == syntax.InstMatch
367 case syntax.InstRune:
368 m[pc] = false
369 if len(inst.Next) > 0 {
370 break
372 instQueue.insert(inst.Out)
373 if len(inst.Rune) == 0 {
374 onePassRunes[pc] = []rune{}
375 inst.Next = []uint32{inst.Out}
376 break
378 runes := make([]rune, 0)
379 if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
380 r0 := inst.Rune[0]
381 runes = append(runes, r0, r0)
382 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
383 runes = append(runes, r1, r1)
385 sort.Sort(runeSlice(runes))
386 } else {
387 runes = append(runes, inst.Rune...)
389 onePassRunes[pc] = runes
390 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
391 for i := range inst.Next {
392 inst.Next[i] = inst.Out
394 inst.Op = syntax.InstRune
395 case syntax.InstRune1:
396 m[pc] = false
397 if len(inst.Next) > 0 {
398 break
400 instQueue.insert(inst.Out)
401 runes := []rune{}
402 // expand case-folded runes
403 if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
404 r0 := inst.Rune[0]
405 runes = append(runes, r0, r0)
406 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
407 runes = append(runes, r1, r1)
409 sort.Sort(runeSlice(runes))
410 } else {
411 runes = append(runes, inst.Rune[0], inst.Rune[0])
413 onePassRunes[pc] = runes
414 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
415 for i := range inst.Next {
416 inst.Next[i] = inst.Out
418 inst.Op = syntax.InstRune
419 case syntax.InstRuneAny:
420 m[pc] = false
421 if len(inst.Next) > 0 {
422 break
424 instQueue.insert(inst.Out)
425 onePassRunes[pc] = append([]rune{}, anyRune...)
426 inst.Next = []uint32{inst.Out}
427 case syntax.InstRuneAnyNotNL:
428 m[pc] = false
429 if len(inst.Next) > 0 {
430 break
432 instQueue.insert(inst.Out)
433 onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
434 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
435 for i := range inst.Next {
436 inst.Next[i] = inst.Out
439 return
442 instQueue.clear()
443 instQueue.insert(uint32(p.Start))
444 m := make([]bool, len(p.Inst))
445 for !instQueue.empty() {
446 visitQueue.clear()
447 pc := instQueue.next()
448 if !check(pc, m) {
449 p = notOnePass
450 break
453 if p != notOnePass {
454 for i := range p.Inst {
455 p.Inst[i].Rune = onePassRunes[i]
458 return p
461 var notOnePass *onePassProg = nil
463 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
464 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
465 // Prog cannot be converted. For a one pass prog, the fundamental condition that must
466 // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
467 func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
468 if prog.Start == 0 {
469 return notOnePass
471 // onepass regexp is anchored
472 if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
473 syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
474 return notOnePass
476 // every instruction leading to InstMatch must be EmptyEndText
477 for _, inst := range prog.Inst {
478 opOut := prog.Inst[inst.Out].Op
479 switch inst.Op {
480 default:
481 if opOut == syntax.InstMatch {
482 return notOnePass
484 case syntax.InstAlt, syntax.InstAltMatch:
485 if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
486 return notOnePass
488 case syntax.InstEmptyWidth:
489 if opOut == syntax.InstMatch {
490 if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
491 continue
493 return notOnePass
497 // Creates a slightly optimized copy of the original Prog
498 // that cleans up some Prog idioms that block valid onepass programs
499 p = onePassCopy(prog)
501 // checkAmbiguity on InstAlts, build onepass Prog if possible
502 p = makeOnePass(p)
504 if p != notOnePass {
505 cleanupOnePass(p, prog)
507 return p