1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // backtrack is a regular expression search with submatch
6 // tracking for small regular expressions and texts. It allocates
7 // a bit vector with (length of input) * (length of prog) bits,
8 // to make sure it never explores the same (character position, instruction)
9 // state multiple times. This limits the search to run in time linear in
10 // the length of the test.
12 // backtrack is a fast replacement for the NFA code on small
13 // regexps when onepass cannot be used.
17 import "regexp/syntax"
19 // A job is an entry on the backtracker's job stack. It holds
20 // the instruction pc and the position in the input.
29 maxBacktrackProg
= 500 // len(prog.Inst) <= max
30 maxBacktrackVector
= 256 * 1024 // bit vector size <= max (bits)
33 // bitState holds state for the backtracker.
34 type bitState
struct {
43 var notBacktrack
*bitState
= nil
45 // maxBitStateLen returns the maximum length of a string to search with
46 // the backtracker using prog.
47 func maxBitStateLen(prog
*syntax
.Prog
) int {
48 if !shouldBacktrack(prog
) {
51 return maxBacktrackVector
/ len(prog
.Inst
)
54 // newBitState returns a new bitState for the given prog,
55 // or notBacktrack if the size of the prog exceeds the maximum size that
56 // the backtracker will be run for.
57 func newBitState(prog
*syntax
.Prog
) *bitState
{
58 if !shouldBacktrack(prog
) {
66 // shouldBacktrack reports whether the program is too
67 // long for the backtracker to run.
68 func shouldBacktrack(prog
*syntax
.Prog
) bool {
69 return len(prog
.Inst
) <= maxBacktrackProg
72 // reset resets the state of the backtracker.
73 // end is the end position in the input.
74 // ncap is the number of captures.
75 func (b
*bitState
) reset(end
int, ncap
int) {
79 b
.jobs
= make([]job
, 0, 256)
84 visitedSize
:= (len(b
.prog
.Inst
)*(end
+1) + visitedBits
- 1) / visitedBits
85 if cap(b
.visited
) < visitedSize
{
86 b
.visited
= make([]uint32, visitedSize
, maxBacktrackVector
/visitedBits
)
88 b
.visited
= b
.visited
[:visitedSize
]
89 for i
:= range b
.visited
{
94 if cap(b
.cap) < ncap
{
95 b
.cap = make([]int, ncap
)
99 for i
:= range b
.cap {
104 // shouldVisit reports whether the combination of (pc, pos) has not
106 func (b
*bitState
) shouldVisit(pc
uint32, pos
int) bool {
107 n
:= uint(int(pc
)*(b
.end
+1) + pos
)
108 if b
.visited
[n
/visitedBits
]&(1<<(n
&(visitedBits
-1))) != 0 {
111 b
.visited
[n
/visitedBits
] |
= 1 << (n
& (visitedBits
- 1))
115 // push pushes (pc, pos, arg) onto the job stack if it should be
117 func (b
*bitState
) push(pc
uint32, pos
int, arg
int) {
118 if b
.prog
.Inst
[pc
].Op
== syntax
.InstFail
{
122 // Only check shouldVisit when arg == 0.
123 // When arg > 0, we are continuing a previous visit.
124 if arg
== 0 && !b
.shouldVisit(pc
, pos
) {
128 b
.jobs
= append(b
.jobs
, job
{pc
: pc
, arg
: arg
, pos
: pos
})
131 // tryBacktrack runs a backtracking search starting at pos.
132 func (m
*machine
) tryBacktrack(b
*bitState
, i input
, pc
uint32, pos
int) bool {
133 longest
:= m
.re
.longest
137 for len(b
.jobs
) > 0 {
139 // Pop job off the stack.
145 // Optimization: rather than push and pop,
146 // code that is going to Push and continue
147 // the loop simply updates ip, p, and arg
148 // and jumps to CheckAndLoop. We have to
149 // do the ShouldVisit check that Push
150 // would have, but we avoid the stack
154 if !b
.shouldVisit(pc
, pos
) {
159 inst
:= b
.prog
.Inst
[pc
]
164 case syntax
.InstFail
:
165 panic("unexpected InstFail")
168 // b.push(inst.Out, pos, 0)
169 // b.push(inst.Arg, pos, 0)
170 // If during the processing of inst.Out, we encounter
171 // inst.Arg via another path, we want to process it then.
172 // Pushing it here will inhibit that. Instead, re-push
173 // inst with arg==1 as a reminder to push inst.Arg out
181 // Finished inst.Out; try inst.Arg.
186 panic("bad arg in InstAlt")
188 case syntax
.InstAltMatch
:
189 // One opcode consumes runes; the other leads to match.
190 switch b
.prog
.Inst
[inst
.Out
].Op
{
191 case syntax
.InstRune
, syntax
.InstRune1
, syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
192 // inst.Arg is the match.
193 b
.push(inst
.Arg
, pos
, 0)
198 // inst.Out is the match - non-greedy
199 b
.push(inst
.Out
, b
.end
, 0)
203 case syntax
.InstRune
:
204 r
, width
:= i
.step(pos
)
205 if !inst
.MatchRune(r
) {
212 case syntax
.InstRune1
:
213 r
, width
:= i
.step(pos
)
214 if r
!= inst
.Rune
[0] {
221 case syntax
.InstRuneAnyNotNL
:
222 r
, width
:= i
.step(pos
)
223 if r
== '\n' || r
== endOfText
{
230 case syntax
.InstRuneAny
:
231 r
, width
:= i
.step(pos
)
239 case syntax
.InstCapture
:
242 if 0 <= inst
.Arg
&& inst
.Arg
< uint32(len(b
.cap)) {
243 // Capture pos to register, but save old value.
244 b
.push(pc
, b
.cap[inst
.Arg
], 1) // come back when we're done.
245 b
.cap[inst
.Arg
] = pos
250 // Finished inst.Out; restore the old value.
251 b
.cap[inst
.Arg
] = pos
255 panic("bad arg in InstCapture")
257 case syntax
.InstEmptyWidth
:
258 if syntax
.EmptyOp(inst
.Arg
)&^i
.context(pos
) != 0 {
268 case syntax
.InstMatch
:
269 // We found a match. If the caller doesn't care
270 // where the match is, no point going further.
276 // Record best match so far.
277 // Only need to check end point, because this entire
278 // call is only considering one start position.
282 if !m
.matched ||
(longest
&& pos
> 0 && pos
> m
.matchcap
[1]) {
283 copy(m
.matchcap
, b
.cap)
287 // If going for first match, we're done.
292 // If we used the entire text, no longer match is possible.
297 // Otherwise, continue on in hope of a longer match.
305 // backtrack runs a backtracking search of prog on the input starting at pos.
306 func (m
*machine
) backtrack(i input
, pos
int, end
int, ncap
int) bool {
307 if !i
.canCheckPrefix() {
308 panic("backtrack called for a RuneReader")
311 startCond
:= m
.re
.cond
312 if startCond
== ^syntax
.EmptyOp(0) { // impossible
315 if startCond
&syntax
.EmptyBeginText
!= 0 && pos
!= 0 {
316 // Anchored match, past beginning of text.
323 m
.matchcap
= m
.matchcap
[:ncap
]
324 for i
:= range m
.matchcap
{
328 // Anchored search must start at the beginning of the input
329 if startCond
&syntax
.EmptyBeginText
!= 0 {
333 return m
.tryBacktrack(b
, i
, uint32(m
.p
.Start
), pos
)
336 // Unanchored search, starting from each possible text position.
337 // Notice that we have to try the empty string at the end of
338 // the text, so the loop condition is pos <= end, not pos < end.
339 // This looks like it's quadratic in the size of the text,
340 // but we are not clearing visited between calls to TrySearch,
341 // so no work is duplicated and it ends up still being linear.
343 for ; pos
<= end
&& width
!= 0; pos
+= width
{
344 if len(m
.re
.prefix
) > 0 {
345 // Match requires literal prefix; fast search for it.
346 advance
:= i
.index(m
.re
, pos
)
356 if m
.tryBacktrack(b
, i
, uint32(m
.p
.Start
), pos
) {
357 // Match must be leftmost; done.
360 _
, width
= i
.step(pos
)