1 // Copyright 2014 The Go Authors. All rights reserved.
12 // Use of this source code is governed by a BSD-style
13 // license that can be found in the LICENSE file.
15 // "One-pass" regexp execution.
16 // Some regexps can be analyzed to determine that they never need
17 // backtracking: they are guaranteed to run in one pass over the string
18 // without bothering to save all the usual NFA state.
19 // Detect those and execute them more quickly.
21 // A onePassProg is a compiled one-pass regular expression program.
22 // It is the same as syntax.Prog except for the use of onePassInst.
23 type onePassProg
struct {
25 Start
int // index of start instruction
26 NumCap
int // number of InstCapture insts in re
29 // A onePassInst is a single instruction in a one-pass regular expression program.
30 // It is the same as syntax.Inst except for the new 'Next' field.
31 type onePassInst
struct {
36 // OnePassPrefix returns a literal string that all matches for the
37 // regexp must start with. Complete is true if the prefix
38 // is the entire match. Pc is the index of the last rune instruction
39 // in the string. The OnePassPrefix skips over the mandatory
41 func onePassPrefix(p
*syntax
.Prog
) (prefix
string, complete
bool, pc
uint32) {
43 if i
.Op
!= syntax
.InstEmptyWidth ||
(syntax
.EmptyOp(i
.Arg
))&syntax
.EmptyBeginText
== 0 {
44 return "", i
.Op
== syntax
.InstMatch
, uint32(p
.Start
)
48 for i
.Op
== syntax
.InstNop
{
52 // Avoid allocation of buffer if prefix is empty.
53 if iop(i
) != syntax
.InstRune ||
len(i
.Rune
) != 1 {
54 return "", i
.Op
== syntax
.InstMatch
, uint32(p
.Start
)
57 // Have prefix; gather characters.
59 for iop(i
) == syntax
.InstRune
&& len(i
.Rune
) == 1 && syntax
.Flags(i
.Arg
)&syntax
.FoldCase
== 0 {
60 buf
.WriteRune(i
.Rune
[0])
61 pc
, i
= i
.Out
, &p
.Inst
[i
.Out
]
63 return buf
.String(), i
.Op
== syntax
.InstEmptyWidth
&& (syntax
.EmptyOp(i
.Arg
))&syntax
.EmptyBeginText
!= 0, pc
66 // OnePassNext selects the next actionable state of the prog, based on the input character.
67 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
68 // One of the alternates may ultimately lead without input to end of line. If the instruction
69 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
70 func onePassNext(i
*onePassInst
, r rune
) uint32 {
71 next
:= i
.MatchRunePos(r
)
75 if i
.Op
== syntax
.InstAltMatch
{
81 func iop(i
*syntax
.Inst
) syntax
.InstOp
{
84 case syntax
.InstRune1
, syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
90 // Sparse Array implementation is used as a queueOnePass.
91 type queueOnePass
struct {
94 size
, nextIndex
uint32
97 func (q
*queueOnePass
) empty() bool {
98 return q
.nextIndex
>= q
.size
101 func (q
*queueOnePass
) next() (n
uint32) {
102 n
= q
.dense
[q
.nextIndex
]
107 func (q
*queueOnePass
) clear() {
112 func (q
*queueOnePass
) reset() {
116 func (q
*queueOnePass
) contains(u
uint32) bool {
117 if u
>= uint32(len(q
.sparse
)) {
120 return q
.sparse
[u
] < q
.size
&& q
.dense
[q
.sparse
[u
]] == u
123 func (q
*queueOnePass
) insert(u
uint32) {
129 func (q
*queueOnePass
) insertNew(u
uint32) {
130 if u
>= uint32(len(q
.sparse
)) {
138 func newQueue(size
int) (q
*queueOnePass
) {
139 return &queueOnePass
{
140 sparse
: make([]uint32, size
),
141 dense
: make([]uint32, size
),
145 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
146 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
147 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
148 // NextIp array with the single element mergeFailed is returned.
149 // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
150 const mergeFailed
= uint32(0xffffffff)
154 noNext
= []uint32{mergeFailed
}
157 func mergeRuneSets(leftRunes
, rightRunes
*[]rune
, leftPC
, rightPC
uint32) ([]rune
, []uint32) {
158 leftLen
:= len(*leftRunes
)
159 rightLen
:= len(*rightRunes
)
160 if leftLen
&0x1 != 0 || rightLen
&0x1 != 0 {
161 panic("mergeRuneSets odd length []rune")
166 merged
:= make([]rune
, 0)
167 next
:= make([]uint32, 0)
177 extend
:= func(newLow
*int, newArray
*[]rune
, pc
uint32) bool {
178 if ix
> 0 && (*newArray
)[*newLow
] <= merged
[ix
] {
181 merged
= append(merged
, (*newArray
)[*newLow
], (*newArray
)[*newLow
+1])
184 next
= append(next
, pc
)
188 for lx
< leftLen || rx
< rightLen
{
191 ok
= extend(&lx
, leftRunes
, leftPC
)
193 ok
= extend(&rx
, rightRunes
, rightPC
)
194 case (*rightRunes
)[rx
] < (*leftRunes
)[lx
]:
195 ok
= extend(&rx
, rightRunes
, rightPC
)
197 ok
= extend(&lx
, leftRunes
, leftPC
)
200 return noRune
, noNext
206 // cleanupOnePass drops working memory, and restores certain shortcut instructions.
207 func cleanupOnePass(prog
*onePassProg
, original
*syntax
.Prog
) {
208 for ix
, instOriginal
:= range original
.Inst
{
209 switch instOriginal
.Op
{
210 case syntax
.InstAlt
, syntax
.InstAltMatch
, syntax
.InstRune
:
211 case syntax
.InstCapture
, syntax
.InstEmptyWidth
, syntax
.InstNop
, syntax
.InstMatch
, syntax
.InstFail
:
212 prog
.Inst
[ix
].Next
= nil
213 case syntax
.InstRune1
, syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
214 prog
.Inst
[ix
].Next
= nil
215 prog
.Inst
[ix
] = onePassInst
{Inst
: instOriginal
}
220 // onePassCopy creates a copy of the original Prog, as we'll be modifying it
221 func onePassCopy(prog
*syntax
.Prog
) *onePassProg
{
226 for _
, inst
:= range prog
.Inst
{
227 p
.Inst
= append(p
.Inst
, onePassInst
{Inst
: inst
})
230 // rewrites one or more common Prog constructs that enable some otherwise
231 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
232 // ip A, that points to ips B & C.
233 // A:BC + B:DA => A:BC + B:CD
234 // A:BC + B:DC => A:DC + B:DC
235 for pc
:= range p
.Inst
{
236 switch p
.Inst
[pc
].Op
{
239 case syntax
.InstAlt
, syntax
.InstAltMatch
:
241 p_A_Other
:= &p
.Inst
[pc
].Out
242 p_A_Alt
:= &p
.Inst
[pc
].Arg
243 // make sure a target is another Alt
244 instAlt
:= p
.Inst
[*p_A_Alt
]
245 if !(instAlt
.Op
== syntax
.InstAlt || instAlt
.Op
== syntax
.InstAltMatch
) {
246 p_A_Alt
, p_A_Other
= p_A_Other
, p_A_Alt
247 instAlt
= p
.Inst
[*p_A_Alt
]
248 if !(instAlt
.Op
== syntax
.InstAlt || instAlt
.Op
== syntax
.InstAltMatch
) {
252 instOther
:= p
.Inst
[*p_A_Other
]
253 // Analyzing both legs pointing to Alts is for another day
254 if instOther
.Op
== syntax
.InstAlt || instOther
.Op
== syntax
.InstAltMatch
{
258 // simple empty transition loop
259 // A:BC + B:DA => A:BC + B:DC
260 p_B_Alt
:= &p
.Inst
[*p_A_Alt
].Out
261 p_B_Other
:= &p
.Inst
[*p_A_Alt
].Arg
263 if instAlt
.Out
== uint32(pc
) {
265 } else if instAlt
.Arg
== uint32(pc
) {
267 p_B_Alt
, p_B_Other
= p_B_Other
, p_B_Alt
270 *p_B_Alt
= *p_A_Other
273 // empty transition to common target
274 // A:BC + B:DC => A:DC + B:DC
275 if *p_A_Other
== *p_B_Alt
{
276 *p_A_Alt
= *p_B_Other
283 // runeSlice exists to permit sorting the case-folded rune sets.
284 type runeSlice
[]rune
286 func (p runeSlice
) Len() int { return len(p
) }
287 func (p runeSlice
) Less(i
, j
int) bool { return p
[i
] < p
[j
] }
288 func (p runeSlice
) Swap(i
, j
int) { p
[i
], p
[j
] = p
[j
], p
[i
] }
290 // Sort is a convenience method.
291 func (p runeSlice
) Sort() {
295 var anyRuneNotNL
= []rune
{0, '\n' - 1, '\n' + 1, unicode
.MaxRune
}
296 var anyRune
= []rune
{0, unicode
.MaxRune
}
298 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
299 // the match engine can always tell which branch to take. The routine may modify
300 // p if it is turned into a onepass Prog. If it isn't possible for this to be a
301 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
302 // to the size of the Prog.
303 func makeOnePass(p
*onePassProg
) *onePassProg
{
304 // If the machine is very long, it's not worth the time to check if we can use one pass.
305 if len(p
.Inst
) >= 1000 {
310 instQueue
= newQueue(len(p
.Inst
))
311 visitQueue
= newQueue(len(p
.Inst
))
312 build
func(uint32, *queueOnePass
)
313 check
func(uint32, map[uint32]bool) bool
314 onePassRunes
= make([][]rune
, len(p
.Inst
))
316 build
= func(pc
uint32, q
*queueOnePass
) {
322 case syntax
.InstAlt
, syntax
.InstAltMatch
:
326 case syntax
.InstMatch
, syntax
.InstFail
:
332 // check that paths from Alt instructions are unambiguous, and rebuild the new
333 // program as a onepass program
334 check
= func(pc
uint32, m
map[uint32]bool) (ok
bool) {
337 if visitQueue
.contains(pc
) {
340 visitQueue
.insert(pc
)
342 case syntax
.InstAlt
, syntax
.InstAltMatch
:
343 ok
= check(inst
.Out
, m
) && check(inst
.Arg
, m
)
344 // check no-input paths to InstMatch
345 matchOut
:= m
[inst
.Out
]
346 matchArg
:= m
[inst
.Arg
]
347 if matchOut
&& matchArg
{
351 // Match on empty goes in inst.Out
353 inst
.Out
, inst
.Arg
= inst
.Arg
, inst
.Out
354 matchOut
, matchArg
= matchArg
, matchOut
358 inst
.Op
= syntax
.InstAltMatch
361 // build a dispatch operator from the two legs of the alt.
362 onePassRunes
[pc
], inst
.Next
= mergeRuneSets(
363 &onePassRunes
[inst
.Out
], &onePassRunes
[inst
.Arg
], inst
.Out
, inst
.Arg
)
364 if len(inst
.Next
) > 0 && inst
.Next
[0] == mergeFailed
{
368 case syntax
.InstCapture
, syntax
.InstNop
:
369 ok
= check(inst
.Out
, m
)
371 // pass matching runes back through these no-ops.
372 onePassRunes
[pc
] = append([]rune
{}, onePassRunes
[inst
.Out
]...)
373 inst
.Next
= []uint32{}
374 for i
:= len(onePassRunes
[pc
]) / 2; i
>= 0; i
-- {
375 inst
.Next
= append(inst
.Next
, inst
.Out
)
377 case syntax
.InstEmptyWidth
:
378 ok
= check(inst
.Out
, m
)
380 onePassRunes
[pc
] = append([]rune
{}, onePassRunes
[inst
.Out
]...)
381 inst
.Next
= []uint32{}
382 for i
:= len(onePassRunes
[pc
]) / 2; i
>= 0; i
-- {
383 inst
.Next
= append(inst
.Next
, inst
.Out
)
385 case syntax
.InstMatch
, syntax
.InstFail
:
386 m
[pc
] = inst
.Op
== syntax
.InstMatch
388 case syntax
.InstRune
:
389 ok
= check(inst
.Out
, m
)
391 if len(inst
.Next
) > 0 {
394 if len(inst
.Rune
) == 0 {
395 onePassRunes
[pc
] = []rune
{}
396 inst
.Next
= []uint32{inst
.Out
}
399 runes
:= make([]rune
, 0)
400 if len(inst
.Rune
) == 1 && syntax
.Flags(inst
.Arg
)&syntax
.FoldCase
!= 0 {
402 runes
= append(runes
, r0
, r0
)
403 for r1
:= unicode
.SimpleFold(r0
); r1
!= r0
; r1
= unicode
.SimpleFold(r1
) {
404 runes
= append(runes
, r1
, r1
)
406 sort
.Sort(runeSlice(runes
))
408 runes
= append(runes
, inst
.Rune
...)
410 onePassRunes
[pc
] = runes
411 inst
.Next
= []uint32{}
412 for i
:= len(onePassRunes
[pc
]) / 2; i
>= 0; i
-- {
413 inst
.Next
= append(inst
.Next
, inst
.Out
)
415 inst
.Op
= syntax
.InstRune
416 case syntax
.InstRune1
:
417 ok
= check(inst
.Out
, m
)
419 if len(inst
.Next
) > 0 {
423 // expand case-folded runes
424 if syntax
.Flags(inst
.Arg
)&syntax
.FoldCase
!= 0 {
426 runes
= append(runes
, r0
, r0
)
427 for r1
:= unicode
.SimpleFold(r0
); r1
!= r0
; r1
= unicode
.SimpleFold(r1
) {
428 runes
= append(runes
, r1
, r1
)
430 sort
.Sort(runeSlice(runes
))
432 runes
= append(runes
, inst
.Rune
[0], inst
.Rune
[0])
434 onePassRunes
[pc
] = runes
435 inst
.Next
= []uint32{}
436 for i
:= len(onePassRunes
[pc
]) / 2; i
>= 0; i
-- {
437 inst
.Next
= append(inst
.Next
, inst
.Out
)
439 inst
.Op
= syntax
.InstRune
440 case syntax
.InstRuneAny
:
441 ok
= check(inst
.Out
, m
)
443 if len(inst
.Next
) > 0 {
446 onePassRunes
[pc
] = append([]rune
{}, anyRune
...)
447 inst
.Next
= []uint32{inst
.Out
}
448 case syntax
.InstRuneAnyNotNL
:
449 ok
= check(inst
.Out
, m
)
451 if len(inst
.Next
) > 0 {
454 onePassRunes
[pc
] = append([]rune
{}, anyRuneNotNL
...)
455 inst
.Next
= []uint32{}
456 for i
:= len(onePassRunes
[pc
]) / 2; i
>= 0; i
-- {
457 inst
.Next
= append(inst
.Next
, inst
.Out
)
464 instQueue
.insert(uint32(p
.Start
))
465 m
:= make(map[uint32]bool, len(p
.Inst
))
466 for !instQueue
.empty() {
467 pc
:= instQueue
.next()
470 if !check(uint32(pc
), m
) {
475 case syntax
.InstAlt
, syntax
.InstAltMatch
:
476 instQueue
.insert(inst
.Out
)
477 instQueue
.insert(inst
.Arg
)
478 case syntax
.InstCapture
, syntax
.InstEmptyWidth
, syntax
.InstNop
:
479 instQueue
.insert(inst
.Out
)
480 case syntax
.InstMatch
:
481 case syntax
.InstFail
:
482 case syntax
.InstRune
, syntax
.InstRune1
, syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
487 for i
, _
:= range p
.Inst
{
488 p
.Inst
[i
].Rune
= onePassRunes
[i
]
494 // walk visits each Inst in the prog once, and applies the argument
495 // function(ip, next), in pre-order.
496 func walk(prog
*syntax
.Prog
, funcs
...func(ip
, next
uint32)) {
497 var walk1
func(uint32)
498 progQueue
:= newQueue(len(prog
.Inst
))
499 walk1
= func(ip
uint32) {
500 if progQueue
.contains(ip
) {
504 inst
:= prog
.Inst
[ip
]
506 case syntax
.InstAlt
, syntax
.InstAltMatch
:
507 for _
, f
:= range funcs
{
514 for _
, f
:= range funcs
{
520 walk1(uint32(prog
.Start
))
523 // find returns the Insts that match the argument predicate function
524 func find(prog
*syntax
.Prog
, f
func(*syntax
.Prog
, int) bool) (matches
[]uint32) {
527 for ip
:= range prog
.Inst
{
529 matches
= append(matches
, uint32(ip
))
535 var notOnePass
*onePassProg
= nil
537 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
538 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
539 // Prog cannot be converted. For a one pass prog, the fundamental condition that must
540 // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
541 func compileOnePass(prog
*syntax
.Prog
) (p
*onePassProg
) {
545 // onepass regexp is anchored
546 if prog
.Inst
[prog
.Start
].Op
!= syntax
.InstEmptyWidth ||
547 syntax
.EmptyOp(prog
.Inst
[prog
.Start
].Arg
)&syntax
.EmptyBeginText
!= syntax
.EmptyBeginText
{
550 // every instruction leading to InstMatch must be EmptyEndText
551 for _
, inst
:= range prog
.Inst
{
552 opOut
:= prog
.Inst
[inst
.Out
].Op
555 if opOut
== syntax
.InstMatch
{
558 case syntax
.InstAlt
, syntax
.InstAltMatch
:
559 if opOut
== syntax
.InstMatch || prog
.Inst
[inst
.Arg
].Op
== syntax
.InstMatch
{
562 case syntax
.InstEmptyWidth
:
563 if opOut
== syntax
.InstMatch
{
564 if syntax
.EmptyOp(inst
.Arg
)&syntax
.EmptyEndText
== syntax
.EmptyEndText
{
571 // Creates a slightly optimized copy of the original Prog
572 // that cleans up some Prog idioms that block valid onepass programs
573 p
= onePassCopy(prog
)
575 // checkAmbiguity on InstAlts, build onepass Prog if possible
579 cleanupOnePass(p
, prog
)