1 // Copyright 2014 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 // "One-pass" regexp execution.
15 // Some regexps can be analyzed to determine that they never need
16 // backtracking: they are guaranteed to run in one pass over the string
17 // without bothering to save all the usual NFA state.
18 // Detect those and execute them more quickly.
20 // A onePassProg is a compiled one-pass regular expression program.
21 // It is the same as syntax.Prog except for the use of onePassInst.
22 type onePassProg
struct {
24 Start
int // index of start instruction
25 NumCap
int // number of InstCapture insts in re
28 // A onePassInst is a single instruction in a one-pass regular expression program.
29 // It is the same as syntax.Inst except for the new 'Next' field.
30 type onePassInst
struct {
35 // OnePassPrefix returns a literal string that all matches for the
36 // regexp must start with. Complete is true if the prefix
37 // is the entire match. Pc is the index of the last rune instruction
38 // in the string. The OnePassPrefix skips over the mandatory
40 func onePassPrefix(p
*syntax
.Prog
) (prefix
string, complete
bool, pc
uint32) {
42 if i
.Op
!= syntax
.InstEmptyWidth ||
(syntax
.EmptyOp(i
.Arg
))&syntax
.EmptyBeginText
== 0 {
43 return "", i
.Op
== syntax
.InstMatch
, uint32(p
.Start
)
47 for i
.Op
== syntax
.InstNop
{
51 // Avoid allocation of buffer if prefix is empty.
52 if iop(i
) != syntax
.InstRune ||
len(i
.Rune
) != 1 {
53 return "", i
.Op
== syntax
.InstMatch
, uint32(p
.Start
)
56 // Have prefix; gather characters.
58 for iop(i
) == syntax
.InstRune
&& len(i
.Rune
) == 1 && syntax
.Flags(i
.Arg
)&syntax
.FoldCase
== 0 {
59 buf
.WriteRune(i
.Rune
[0])
60 pc
, i
= i
.Out
, &p
.Inst
[i
.Out
]
62 if i
.Op
== syntax
.InstEmptyWidth
&&
63 syntax
.EmptyOp(i
.Arg
)&syntax
.EmptyEndText
!= 0 &&
64 p
.Inst
[i
.Out
].Op
== syntax
.InstMatch
{
67 return buf
.String(), complete
, pc
70 // OnePassNext selects the next actionable state of the prog, based on the input character.
71 // It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
72 // One of the alternates may ultimately lead without input to end of line. If the instruction
73 // is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
74 func onePassNext(i
*onePassInst
, r rune
) uint32 {
75 next
:= i
.MatchRunePos(r
)
79 if i
.Op
== syntax
.InstAltMatch
{
85 func iop(i
*syntax
.Inst
) syntax
.InstOp
{
88 case syntax
.InstRune1
, syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
94 // Sparse Array implementation is used as a queueOnePass.
95 type queueOnePass
struct {
98 size
, nextIndex
uint32
101 func (q
*queueOnePass
) empty() bool {
102 return q
.nextIndex
>= q
.size
105 func (q
*queueOnePass
) next() (n
uint32) {
106 n
= q
.dense
[q
.nextIndex
]
111 func (q
*queueOnePass
) clear() {
116 func (q
*queueOnePass
) contains(u
uint32) bool {
117 if u
>= uint32(len(q
.sparse
)) {
120 return q
.sparse
[u
] < q
.size
&& q
.dense
[q
.sparse
[u
]] == u
123 func (q
*queueOnePass
) insert(u
uint32) {
129 func (q
*queueOnePass
) insertNew(u
uint32) {
130 if u
>= uint32(len(q
.sparse
)) {
138 func newQueue(size
int) (q
*queueOnePass
) {
139 return &queueOnePass
{
140 sparse
: make([]uint32, size
),
141 dense
: make([]uint32, size
),
145 // mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
146 // and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
147 // i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
148 // NextIp array with the single element mergeFailed is returned.
149 // The code assumes that both inputs contain ordered and non-intersecting rune pairs.
150 const mergeFailed
= uint32(0xffffffff)
154 noNext
= []uint32{mergeFailed
}
157 func mergeRuneSets(leftRunes
, rightRunes
*[]rune
, leftPC
, rightPC
uint32) ([]rune
, []uint32) {
158 leftLen
:= len(*leftRunes
)
159 rightLen
:= len(*rightRunes
)
160 if leftLen
&0x1 != 0 || rightLen
&0x1 != 0 {
161 panic("mergeRuneSets odd length []rune")
166 merged
:= make([]rune
, 0)
167 next
:= make([]uint32, 0)
177 extend
:= func(newLow
*int, newArray
*[]rune
, pc
uint32) bool {
178 if ix
> 0 && (*newArray
)[*newLow
] <= merged
[ix
] {
181 merged
= append(merged
, (*newArray
)[*newLow
], (*newArray
)[*newLow
+1])
184 next
= append(next
, pc
)
188 for lx
< leftLen || rx
< rightLen
{
191 ok
= extend(&lx
, leftRunes
, leftPC
)
193 ok
= extend(&rx
, rightRunes
, rightPC
)
194 case (*rightRunes
)[rx
] < (*leftRunes
)[lx
]:
195 ok
= extend(&rx
, rightRunes
, rightPC
)
197 ok
= extend(&lx
, leftRunes
, leftPC
)
200 return noRune
, noNext
206 // cleanupOnePass drops working memory, and restores certain shortcut instructions.
207 func cleanupOnePass(prog
*onePassProg
, original
*syntax
.Prog
) {
208 for ix
, instOriginal
:= range original
.Inst
{
209 switch instOriginal
.Op
{
210 case syntax
.InstAlt
, syntax
.InstAltMatch
, syntax
.InstRune
:
211 case syntax
.InstCapture
, syntax
.InstEmptyWidth
, syntax
.InstNop
, syntax
.InstMatch
, syntax
.InstFail
:
212 prog
.Inst
[ix
].Next
= nil
213 case syntax
.InstRune1
, syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
214 prog
.Inst
[ix
].Next
= nil
215 prog
.Inst
[ix
] = onePassInst
{Inst
: instOriginal
}
220 // onePassCopy creates a copy of the original Prog, as we'll be modifying it
221 func onePassCopy(prog
*syntax
.Prog
) *onePassProg
{
225 Inst
: make([]onePassInst
, len(prog
.Inst
)),
227 for i
, inst
:= range prog
.Inst
{
228 p
.Inst
[i
] = onePassInst
{Inst
: inst
}
231 // rewrites one or more common Prog constructs that enable some otherwise
232 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
233 // ip A, that points to ips B & C.
234 // A:BC + B:DA => A:BC + B:CD
235 // A:BC + B:DC => A:DC + B:DC
236 for pc
:= range p
.Inst
{
237 switch p
.Inst
[pc
].Op
{
240 case syntax
.InstAlt
, syntax
.InstAltMatch
:
242 p_A_Other
:= &p
.Inst
[pc
].Out
243 p_A_Alt
:= &p
.Inst
[pc
].Arg
244 // make sure a target is another Alt
245 instAlt
:= p
.Inst
[*p_A_Alt
]
246 if !(instAlt
.Op
== syntax
.InstAlt || instAlt
.Op
== syntax
.InstAltMatch
) {
247 p_A_Alt
, p_A_Other
= p_A_Other
, p_A_Alt
248 instAlt
= p
.Inst
[*p_A_Alt
]
249 if !(instAlt
.Op
== syntax
.InstAlt || instAlt
.Op
== syntax
.InstAltMatch
) {
253 instOther
:= p
.Inst
[*p_A_Other
]
254 // Analyzing both legs pointing to Alts is for another day
255 if instOther
.Op
== syntax
.InstAlt || instOther
.Op
== syntax
.InstAltMatch
{
259 // simple empty transition loop
260 // A:BC + B:DA => A:BC + B:DC
261 p_B_Alt
:= &p
.Inst
[*p_A_Alt
].Out
262 p_B_Other
:= &p
.Inst
[*p_A_Alt
].Arg
264 if instAlt
.Out
== uint32(pc
) {
266 } else if instAlt
.Arg
== uint32(pc
) {
268 p_B_Alt
, p_B_Other
= p_B_Other
, p_B_Alt
271 *p_B_Alt
= *p_A_Other
274 // empty transition to common target
275 // A:BC + B:DC => A:DC + B:DC
276 if *p_A_Other
== *p_B_Alt
{
277 *p_A_Alt
= *p_B_Other
284 // runeSlice exists to permit sorting the case-folded rune sets.
285 type runeSlice
[]rune
287 func (p runeSlice
) Len() int { return len(p
) }
288 func (p runeSlice
) Less(i
, j
int) bool { return p
[i
] < p
[j
] }
289 func (p runeSlice
) Swap(i
, j
int) { p
[i
], p
[j
] = p
[j
], p
[i
] }
291 var anyRuneNotNL
= []rune
{0, '\n' - 1, '\n' + 1, unicode
.MaxRune
}
292 var anyRune
= []rune
{0, unicode
.MaxRune
}
294 // makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
295 // the match engine can always tell which branch to take. The routine may modify
296 // p if it is turned into a onepass Prog. If it isn't possible for this to be a
297 // onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
298 // to the size of the Prog.
299 func makeOnePass(p
*onePassProg
) *onePassProg
{
300 // If the machine is very long, it's not worth the time to check if we can use one pass.
301 if len(p
.Inst
) >= 1000 {
306 instQueue
= newQueue(len(p
.Inst
))
307 visitQueue
= newQueue(len(p
.Inst
))
308 check
func(uint32, []bool) bool
309 onePassRunes
= make([][]rune
, len(p
.Inst
))
312 // check that paths from Alt instructions are unambiguous, and rebuild the new
313 // program as a onepass program
314 check
= func(pc
uint32, m
[]bool) (ok
bool) {
317 if visitQueue
.contains(pc
) {
320 visitQueue
.insert(pc
)
322 case syntax
.InstAlt
, syntax
.InstAltMatch
:
323 ok
= check(inst
.Out
, m
) && check(inst
.Arg
, m
)
324 // check no-input paths to InstMatch
325 matchOut
:= m
[inst
.Out
]
326 matchArg
:= m
[inst
.Arg
]
327 if matchOut
&& matchArg
{
331 // Match on empty goes in inst.Out
333 inst
.Out
, inst
.Arg
= inst
.Arg
, inst
.Out
334 matchOut
, matchArg
= matchArg
, matchOut
338 inst
.Op
= syntax
.InstAltMatch
341 // build a dispatch operator from the two legs of the alt.
342 onePassRunes
[pc
], inst
.Next
= mergeRuneSets(
343 &onePassRunes
[inst
.Out
], &onePassRunes
[inst
.Arg
], inst
.Out
, inst
.Arg
)
344 if len(inst
.Next
) > 0 && inst
.Next
[0] == mergeFailed
{
348 case syntax
.InstCapture
, syntax
.InstNop
:
349 ok
= check(inst
.Out
, m
)
351 // pass matching runes back through these no-ops.
352 onePassRunes
[pc
] = append([]rune
{}, onePassRunes
[inst
.Out
]...)
353 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
354 for i
:= range inst
.Next
{
355 inst
.Next
[i
] = inst
.Out
357 case syntax
.InstEmptyWidth
:
358 ok
= check(inst
.Out
, m
)
360 onePassRunes
[pc
] = append([]rune
{}, onePassRunes
[inst
.Out
]...)
361 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
362 for i
:= range inst
.Next
{
363 inst
.Next
[i
] = inst
.Out
365 case syntax
.InstMatch
, syntax
.InstFail
:
366 m
[pc
] = inst
.Op
== syntax
.InstMatch
367 case syntax
.InstRune
:
369 if len(inst
.Next
) > 0 {
372 instQueue
.insert(inst
.Out
)
373 if len(inst
.Rune
) == 0 {
374 onePassRunes
[pc
] = []rune
{}
375 inst
.Next
= []uint32{inst
.Out
}
378 runes
:= make([]rune
, 0)
379 if len(inst
.Rune
) == 1 && syntax
.Flags(inst
.Arg
)&syntax
.FoldCase
!= 0 {
381 runes
= append(runes
, r0
, r0
)
382 for r1
:= unicode
.SimpleFold(r0
); r1
!= r0
; r1
= unicode
.SimpleFold(r1
) {
383 runes
= append(runes
, r1
, r1
)
385 sort
.Sort(runeSlice(runes
))
387 runes
= append(runes
, inst
.Rune
...)
389 onePassRunes
[pc
] = runes
390 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
391 for i
:= range inst
.Next
{
392 inst
.Next
[i
] = inst
.Out
394 inst
.Op
= syntax
.InstRune
395 case syntax
.InstRune1
:
397 if len(inst
.Next
) > 0 {
400 instQueue
.insert(inst
.Out
)
402 // expand case-folded runes
403 if syntax
.Flags(inst
.Arg
)&syntax
.FoldCase
!= 0 {
405 runes
= append(runes
, r0
, r0
)
406 for r1
:= unicode
.SimpleFold(r0
); r1
!= r0
; r1
= unicode
.SimpleFold(r1
) {
407 runes
= append(runes
, r1
, r1
)
409 sort
.Sort(runeSlice(runes
))
411 runes
= append(runes
, inst
.Rune
[0], inst
.Rune
[0])
413 onePassRunes
[pc
] = runes
414 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
415 for i
:= range inst
.Next
{
416 inst
.Next
[i
] = inst
.Out
418 inst
.Op
= syntax
.InstRune
419 case syntax
.InstRuneAny
:
421 if len(inst
.Next
) > 0 {
424 instQueue
.insert(inst
.Out
)
425 onePassRunes
[pc
] = append([]rune
{}, anyRune
...)
426 inst
.Next
= []uint32{inst
.Out
}
427 case syntax
.InstRuneAnyNotNL
:
429 if len(inst
.Next
) > 0 {
432 instQueue
.insert(inst
.Out
)
433 onePassRunes
[pc
] = append([]rune
{}, anyRuneNotNL
...)
434 inst
.Next
= make([]uint32, len(onePassRunes
[pc
])/2+1)
435 for i
:= range inst
.Next
{
436 inst
.Next
[i
] = inst
.Out
443 instQueue
.insert(uint32(p
.Start
))
444 m
:= make([]bool, len(p
.Inst
))
445 for !instQueue
.empty() {
447 pc
:= instQueue
.next()
454 for i
:= range p
.Inst
{
455 p
.Inst
[i
].Rune
= onePassRunes
[i
]
461 var notOnePass
*onePassProg
= nil
463 // compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
464 // can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
465 // Prog cannot be converted. For a one pass prog, the fundamental condition that must
466 // be true is: at any InstAlt, there must be no ambiguity about what branch to take.
467 func compileOnePass(prog
*syntax
.Prog
) (p
*onePassProg
) {
471 // onepass regexp is anchored
472 if prog
.Inst
[prog
.Start
].Op
!= syntax
.InstEmptyWidth ||
473 syntax
.EmptyOp(prog
.Inst
[prog
.Start
].Arg
)&syntax
.EmptyBeginText
!= syntax
.EmptyBeginText
{
476 // every instruction leading to InstMatch must be EmptyEndText
477 for _
, inst
:= range prog
.Inst
{
478 opOut
:= prog
.Inst
[inst
.Out
].Op
481 if opOut
== syntax
.InstMatch
{
484 case syntax
.InstAlt
, syntax
.InstAltMatch
:
485 if opOut
== syntax
.InstMatch || prog
.Inst
[inst
.Arg
].Op
== syntax
.InstMatch
{
488 case syntax
.InstEmptyWidth
:
489 if opOut
== syntax
.InstMatch
{
490 if syntax
.EmptyOp(inst
.Arg
)&syntax
.EmptyEndText
== syntax
.EmptyEndText
{
497 // Creates a slightly optimized copy of the original Prog
498 // that cleans up some Prog idioms that block valid onepass programs
499 p
= onePassCopy(prog
)
501 // checkAmbiguity on InstAlts, build onepass Prog if possible
505 cleanupOnePass(p
, prog
)