1 // Copyright 2014 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
15 // "One-pass" regexp execution.
16 // Some regexps can be analyzed to determine that they never need
17 // backtracking: they are guaranteed to run in one pass over the string
18 // without bothering to save all the usual NFA state.
19 // Detect those and execute them more quickly.
21 // A onePassProg is a compiled one-pass regular expression program.
22 // It is the same as syntax.Prog except for the use of onePassInst.
23 type onePassProg
struct {
25 Start
int // index of start instruction
26 NumCap
int // number of InstCapture insts in re
29 // A onePassInst is a single instruction in a one-pass regular expression program.
30 // It is the same as syntax.Inst except for the new 'Next' field.
31 type onePassInst
struct {
36 // OnePassPrefix returns a literal string that all matches for the
37 // regexp must start with. Complete is true if the prefix
38 // is the entire match. Pc is the index of the last rune instruction
39 // in the string. The OnePassPrefix skips over the mandatory
41 func onePassPrefix(p
*syntax
.Prog
) (prefix
string, complete
bool, pc
uint32) {
43 if i
.Op
!= syntax
.InstEmptyWidth ||
(syntax
.EmptyOp(i
.Arg
))&syntax
.EmptyBeginText
== 0 {
44 return "", i
.Op
== syntax
.InstMatch
, uint32(p
.Start
)
48 for i
.Op
== syntax
.InstNop
{
52 // Avoid allocation of buffer if prefix is empty.
53 if iop(i
) != syntax
.InstRune ||
len(i
.Rune
) != 1 {
54 return "", i
.Op
== syntax
.InstMatch
, uint32(p
.Start
)
57 // Have prefix; gather characters.
58 var buf strings
.Builder
59 for iop(i
) == syntax
.InstRune
&& len(i
.Rune
) == 1 && syntax
.Flags(i
.Arg
)&syntax
.FoldCase
== 0 && i
.Rune
[0] != utf8
.RuneError
{
60 buf
.WriteRune(i
.Rune
[0])
61 pc
, i
= i
.Out
, &p
.Inst
[i
.Out
]
63 if i
.Op
== syntax
.InstEmptyWidth
&&
64 syntax
.EmptyOp(i
.Arg
)&syntax
.EmptyEndText
!= 0 &&
65 p
.Inst
[i
.Out
].Op
== syntax
.InstMatch
{
68 return buf
.String(), complete
, pc
71 // OnePassNext selects the next actionable state of the prog, based on the input character.
72 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
73 // One of the alternates may ultimately lead without input to end of line. If the instruction
74 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
75 func onePassNext(i
*onePassInst
, r rune
) uint32 {
76 next
:= i
.MatchRunePos(r
)
80 if i
.Op
== syntax
.InstAltMatch
{
86 func iop(i
*syntax
.Inst
) syntax
.InstOp
{
89 case syntax
.InstRune1
, syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
95 // Sparse Array implementation is used as a queueOnePass.
96 type queueOnePass
struct {
99 size
, nextIndex
uint32
102 func (q
*queueOnePass
) empty() bool {
103 return q
.nextIndex
>= q
.size
106 func (q
*queueOnePass
) next() (n
uint32) {
107 n
= q
.dense
[q
.nextIndex
]
112 func (q
*queueOnePass
) clear() {
117 func (q
*queueOnePass
) contains(u
uint32) bool {
118 if u
>= uint32(len(q
.sparse
)) {
121 return q
.sparse
[u
] < q
.size
&& q
.dense
[q
.sparse
[u
]] == u
124 func (q
*queueOnePass
) insert(u
uint32) {
130 func (q
*queueOnePass
) insertNew(u
uint32) {
131 if u
>= uint32(len(q
.sparse
)) {
139 func newQueue(size
int) (q
*queueOnePass
) {
140 return &queueOnePass
{
141 sparse
: make([]uint32, size
),
142 dense
: make([]uint32, size
),
146 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
147 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
148 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
149 // NextIp array with the single element mergeFailed is returned.
150 // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
151 const mergeFailed
= uint32(0xffffffff)
155 noNext
= []uint32{mergeFailed
}
158 func mergeRuneSets(leftRunes
, rightRunes
*[]rune
, leftPC
, rightPC
uint32) ([]rune
, []uint32) {
159 leftLen
:= len(*leftRunes
)
160 rightLen
:= len(*rightRunes
)
161 if leftLen
&0x1 != 0 || rightLen
&0x1 != 0 {
162 panic("mergeRuneSets odd length []rune")
167 merged
:= make([]rune
, 0)
168 next
:= make([]uint32, 0)
178 extend
:= func(newLow
*int, newArray
*[]rune
, pc
uint32) bool {
179 if ix
> 0 && (*newArray
)[*newLow
] <= merged
[ix
] {
182 merged
= append(merged
, (*newArray
)[*newLow
], (*newArray
)[*newLow
+1])
185 next
= append(next
, pc
)
189 for lx
< leftLen || rx
< rightLen
{
192 ok
= extend(&lx
, leftRunes
, leftPC
)
194 ok
= extend(&rx
, rightRunes
, rightPC
)
195 case (*rightRunes
)[rx
] < (*leftRunes
)[lx
]:
196 ok
= extend(&rx
, rightRunes
, rightPC
)
198 ok
= extend(&lx
, leftRunes
, leftPC
)
201 return noRune
, noNext
207 // cleanupOnePass drops working memory, and restores certain shortcut instructions.
208 func cleanupOnePass(prog
*onePassProg
, original
*syntax
.Prog
) {
209 for ix
, instOriginal
:= range original
.Inst
{
210 switch instOriginal
.Op
{
211 case syntax
.InstAlt
, syntax
.InstAltMatch
, syntax
.InstRune
:
212 case syntax
.InstCapture
, syntax
.InstEmptyWidth
, syntax
.InstNop
, syntax
.InstMatch
, syntax
.InstFail
:
213 prog
.Inst
[ix
].Next
= nil
214 case syntax
.InstRune1
, syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
215 prog
.Inst
[ix
].Next
= nil
216 prog
.Inst
[ix
] = onePassInst
{Inst
: instOriginal
}
221 // onePassCopy creates a copy of the original Prog, as we'll be modifying it
222 func onePassCopy(prog
*syntax
.Prog
) *onePassProg
{
226 Inst
: make([]onePassInst
, len(prog
.Inst
)),
228 for i
, inst
:= range prog
.Inst
{
229 p
.Inst
[i
] = onePassInst
{Inst
: inst
}
232 // rewrites one or more common Prog constructs that enable some otherwise
233 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
234 // ip A, that points to ips B & C.
235 // A:BC + B:DA => A:BC + B:CD
236 // A:BC + B:DC => A:DC + B:DC
237 for pc
:= range p
.Inst
{
238 switch p
.Inst
[pc
].Op
{
241 case syntax
.InstAlt
, syntax
.InstAltMatch
:
243 p_A_Other
:= &p
.Inst
[pc
].Out
244 p_A_Alt
:= &p
.Inst
[pc
].Arg
245 // make sure a target is another Alt
246 instAlt
:= p
.Inst
[*p_A_Alt
]
247 if !(instAlt
.Op
== syntax
.InstAlt || instAlt
.Op
== syntax
.InstAltMatch
) {
248 p_A_Alt
, p_A_Other
= p_A_Other
, p_A_Alt
249 instAlt
= p
.Inst
[*p_A_Alt
]
250 if !(instAlt
.Op
== syntax
.InstAlt || instAlt
.Op
== syntax
.InstAltMatch
) {
254 instOther
:= p
.Inst
[*p_A_Other
]
255 // Analyzing both legs pointing to Alts is for another day
256 if instOther
.Op
== syntax
.InstAlt || instOther
.Op
== syntax
.InstAltMatch
{
260 // simple empty transition loop
261 // A:BC + B:DA => A:BC + B:DC
262 p_B_Alt
:= &p
.Inst
[*p_A_Alt
].Out
263 p_B_Other
:= &p
.Inst
[*p_A_Alt
].Arg
265 if instAlt
.Out
== uint32(pc
) {
267 } else if instAlt
.Arg
== uint32(pc
) {
269 p_B_Alt
, p_B_Other
= p_B_Other
, p_B_Alt
272 *p_B_Alt
= *p_A_Other
275 // empty transition to common target
276 // A:BC + B:DC => A:DC + B:DC
277 if *p_A_Other
== *p_B_Alt
{
278 *p_A_Alt
= *p_B_Other
285 // runeSlice exists to permit sorting the case-folded rune sets.
286 type runeSlice
[]rune
288 func (p runeSlice
) Len() int { return len(p
) }
289 func (p runeSlice
) Less(i
, j
int) bool { return p
[i
] < p
[j
] }
290 func (p runeSlice
) Swap(i
, j
int) { p
[i
], p
[j
] = p
[j
], p
[i
] }
292 var anyRuneNotNL
= []rune
{0, '\n' - 1, '\n' + 1, unicode
.MaxRune
}
293 var anyRune
= []rune
{0, unicode
.MaxRune
}
295 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
296 // the match engine can always tell which branch to take. The routine may modify
297 // p if it is turned into a onepass Prog. If it isn't possible for this to be a
298 // onepass Prog, the Prog nil is returned. makeOnePass is recursive
299 // to the size of the Prog.
300 func makeOnePass(p
*onePassProg
) *onePassProg
{
301 // If the machine is very long, it's not worth the time to check if we can use one pass.
302 if len(p
.Inst
) >= 1000 {
307 instQueue
= newQueue(len(p
.Inst
))
308 visitQueue
= newQueue(len(p
.Inst
))
309 check
func(uint32, []bool) bool
310 onePassRunes
= make([][]rune
, len(p
.Inst
))
313 // check that paths from Alt instructions are unambiguous, and rebuild the new
314 // program as a onepass program
315 check
= func(pc
uint32, m
[]bool) (ok
bool) {
318 if visitQueue
.contains(pc
) {
321 visitQueue
.insert(pc
)
323 case syntax
.InstAlt
, syntax
.InstAltMatch
:
324 ok
= check(inst
.Out
, m
) && check(inst
.Arg
, m
)
325 // check no-input paths to InstMatch
326 matchOut
:= m
[inst
.Out
]
327 matchArg
:= m
[inst
.Arg
]
328 if matchOut
&& matchArg
{
332 // Match on empty goes in inst.Out
334 inst
.Out
, inst
.Arg
= inst
.Arg
, inst
.Out
335 matchOut
, matchArg
= matchArg
, matchOut
339 inst
.Op
= syntax
.InstAltMatch
342 // build a dispatch operator from the two legs of the alt.
343 onePassRunes
[pc
], inst
.Next
= mergeRuneSets(
344 &onePassRunes
[inst
.Out
], &onePassRunes
[inst
.Arg
], inst
.Out
, inst
.Arg
)
345 if len(inst
.Next
) > 0 && inst
.Next
[0] == mergeFailed
{
349 case syntax
.InstCapture
, syntax
.InstNop
:
350 ok
= check(inst
.Out
, m
)
352 // pass matching runes back through these no-ops.
353 onePassRunes
[pc
] = append([]rune
{}, onePassRunes
[inst
.Out
]...)
354 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
355 for i
:= range inst
.Next
{
356 inst
.Next
[i
] = inst
.Out
358 case syntax
.InstEmptyWidth
:
359 ok
= check(inst
.Out
, m
)
361 onePassRunes
[pc
] = append([]rune
{}, onePassRunes
[inst
.Out
]...)
362 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
363 for i
:= range inst
.Next
{
364 inst
.Next
[i
] = inst
.Out
366 case syntax
.InstMatch
, syntax
.InstFail
:
367 m
[pc
] = inst
.Op
== syntax
.InstMatch
368 case syntax
.InstRune
:
370 if len(inst
.Next
) > 0 {
373 instQueue
.insert(inst
.Out
)
374 if len(inst
.Rune
) == 0 {
375 onePassRunes
[pc
] = []rune
{}
376 inst
.Next
= []uint32{inst
.Out
}
379 runes
:= make([]rune
, 0)
380 if len(inst
.Rune
) == 1 && syntax
.Flags(inst
.Arg
)&syntax
.FoldCase
!= 0 {
382 runes
= append(runes
, r0
, r0
)
383 for r1
:= unicode
.SimpleFold(r0
); r1
!= r0
; r1
= unicode
.SimpleFold(r1
) {
384 runes
= append(runes
, r1
, r1
)
386 sort
.Sort(runeSlice(runes
))
388 runes
= append(runes
, inst
.Rune
...)
390 onePassRunes
[pc
] = runes
391 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
392 for i
:= range inst
.Next
{
393 inst
.Next
[i
] = inst
.Out
395 inst
.Op
= syntax
.InstRune
396 case syntax
.InstRune1
:
398 if len(inst
.Next
) > 0 {
401 instQueue
.insert(inst
.Out
)
403 // expand case-folded runes
404 if syntax
.Flags(inst
.Arg
)&syntax
.FoldCase
!= 0 {
406 runes
= append(runes
, r0
, r0
)
407 for r1
:= unicode
.SimpleFold(r0
); r1
!= r0
; r1
= unicode
.SimpleFold(r1
) {
408 runes
= append(runes
, r1
, r1
)
410 sort
.Sort(runeSlice(runes
))
412 runes
= append(runes
, inst
.Rune
[0], inst
.Rune
[0])
414 onePassRunes
[pc
] = runes
415 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
416 for i
:= range inst
.Next
{
417 inst
.Next
[i
] = inst
.Out
419 inst
.Op
= syntax
.InstRune
420 case syntax
.InstRuneAny
:
422 if len(inst
.Next
) > 0 {
425 instQueue
.insert(inst
.Out
)
426 onePassRunes
[pc
] = append([]rune
{}, anyRune
...)
427 inst
.Next
= []uint32{inst
.Out
}
428 case syntax
.InstRuneAnyNotNL
:
430 if len(inst
.Next
) > 0 {
433 instQueue
.insert(inst
.Out
)
434 onePassRunes
[pc
] = append([]rune
{}, anyRuneNotNL
...)
435 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
436 for i
:= range inst
.Next
{
437 inst
.Next
[i
] = inst
.Out
444 instQueue
.insert(uint32(p
.Start
))
445 m
:= make([]bool, len(p
.Inst
))
446 for !instQueue
.empty() {
448 pc
:= instQueue
.next()
455 for i
:= range p
.Inst
{
456 p
.Inst
[i
].Rune
= onePassRunes
[i
]
462 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
463 // can be recharacterized as a one-pass regexp program, or syntax.nil if the
464 // Prog cannot be converted. For a one pass prog, the fundamental condition that must
465 // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
466 func compileOnePass(prog
*syntax
.Prog
) (p
*onePassProg
) {
470 // onepass regexp is anchored
471 if prog
.Inst
[prog
.Start
].Op
!= syntax
.InstEmptyWidth ||
472 syntax
.EmptyOp(prog
.Inst
[prog
.Start
].Arg
)&syntax
.EmptyBeginText
!= syntax
.EmptyBeginText
{
475 // every instruction leading to InstMatch must be EmptyEndText
476 for _
, inst
:= range prog
.Inst
{
477 opOut
:= prog
.Inst
[inst
.Out
].Op
480 if opOut
== syntax
.InstMatch
{
483 case syntax
.InstAlt
, syntax
.InstAltMatch
:
484 if opOut
== syntax
.InstMatch || prog
.Inst
[inst
.Arg
].Op
== syntax
.InstMatch
{
487 case syntax
.InstEmptyWidth
:
488 if opOut
== syntax
.InstMatch
{
489 if syntax
.EmptyOp(inst
.Arg
)&syntax
.EmptyEndText
== syntax
.EmptyEndText
{
496 // Creates a slightly optimized copy of the original Prog
497 // that cleans up some Prog idioms that block valid onepass programs
498 p
= onePassCopy(prog
)
500 // checkAmbiguity on InstAlts, build onepass Prog if possible
504 cleanupOnePass(p
, prog
)