PR middle-end/81768
[official-gcc.git] / libgo / go / regexp / backtrack.go
blob29f624b54c7da8e37e0f7b06ae058bb02d64c552
1 // Copyright 2015 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // backtrack is a regular expression search with submatch
6 // tracking for small regular expressions and texts. It allocates
7 // a bit vector with (length of input) * (length of prog) bits,
8 // to make sure it never explores the same (character position, instruction)
9 // state multiple times. This limits the search to run in time linear in
10 // the length of the test.
12 // backtrack is a fast replacement for the NFA code on small
13 // regexps when onepass cannot be used.
15 package regexp
17 import "regexp/syntax"
19 // A job is an entry on the backtracker's job stack. It holds
20 // the instruction pc and the position in the input.
21 type job struct {
22 pc uint32
23 arg int
24 pos int
27 const (
28 visitedBits = 32
29 maxBacktrackProg = 500 // len(prog.Inst) <= max
30 maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits)
33 // bitState holds state for the backtracker.
34 type bitState struct {
35 prog *syntax.Prog
37 end int
38 cap []int
39 jobs []job
40 visited []uint32
43 var notBacktrack *bitState = nil
45 // maxBitStateLen returns the maximum length of a string to search with
46 // the backtracker using prog.
47 func maxBitStateLen(prog *syntax.Prog) int {
48 if !shouldBacktrack(prog) {
49 return 0
51 return maxBacktrackVector / len(prog.Inst)
54 // newBitState returns a new bitState for the given prog,
55 // or notBacktrack if the size of the prog exceeds the maximum size that
56 // the backtracker will be run for.
57 func newBitState(prog *syntax.Prog) *bitState {
58 if !shouldBacktrack(prog) {
59 return notBacktrack
61 return &bitState{
62 prog: prog,
66 // shouldBacktrack reports whether the program is too
67 // long for the backtracker to run.
68 func shouldBacktrack(prog *syntax.Prog) bool {
69 return len(prog.Inst) <= maxBacktrackProg
72 // reset resets the state of the backtracker.
73 // end is the end position in the input.
74 // ncap is the number of captures.
75 func (b *bitState) reset(end int, ncap int) {
76 b.end = end
78 if cap(b.jobs) == 0 {
79 b.jobs = make([]job, 0, 256)
80 } else {
81 b.jobs = b.jobs[:0]
84 visitedSize := (len(b.prog.Inst)*(end+1) + visitedBits - 1) / visitedBits
85 if cap(b.visited) < visitedSize {
86 b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits)
87 } else {
88 b.visited = b.visited[:visitedSize]
89 for i := range b.visited {
90 b.visited[i] = 0
94 if cap(b.cap) < ncap {
95 b.cap = make([]int, ncap)
96 } else {
97 b.cap = b.cap[:ncap]
99 for i := range b.cap {
100 b.cap[i] = -1
104 // shouldVisit reports whether the combination of (pc, pos) has not
105 // been visited yet.
106 func (b *bitState) shouldVisit(pc uint32, pos int) bool {
107 n := uint(int(pc)*(b.end+1) + pos)
108 if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 {
109 return false
111 b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1))
112 return true
115 // push pushes (pc, pos, arg) onto the job stack if it should be
116 // visited.
117 func (b *bitState) push(pc uint32, pos int, arg int) {
118 if b.prog.Inst[pc].Op == syntax.InstFail {
119 return
122 // Only check shouldVisit when arg == 0.
123 // When arg > 0, we are continuing a previous visit.
124 if arg == 0 && !b.shouldVisit(pc, pos) {
125 return
128 b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos})
131 // tryBacktrack runs a backtracking search starting at pos.
132 func (m *machine) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool {
133 longest := m.re.longest
134 m.matched = false
136 b.push(pc, pos, 0)
137 for len(b.jobs) > 0 {
138 l := len(b.jobs) - 1
139 // Pop job off the stack.
140 pc := b.jobs[l].pc
141 pos := b.jobs[l].pos
142 arg := b.jobs[l].arg
143 b.jobs = b.jobs[:l]
145 // Optimization: rather than push and pop,
146 // code that is going to Push and continue
147 // the loop simply updates ip, p, and arg
148 // and jumps to CheckAndLoop. We have to
149 // do the ShouldVisit check that Push
150 // would have, but we avoid the stack
151 // manipulation.
152 goto Skip
153 CheckAndLoop:
154 if !b.shouldVisit(pc, pos) {
155 continue
157 Skip:
159 inst := b.prog.Inst[pc]
161 switch inst.Op {
162 default:
163 panic("bad inst")
164 case syntax.InstFail:
165 panic("unexpected InstFail")
166 case syntax.InstAlt:
167 // Cannot just
168 // b.push(inst.Out, pos, 0)
169 // b.push(inst.Arg, pos, 0)
170 // If during the processing of inst.Out, we encounter
171 // inst.Arg via another path, we want to process it then.
172 // Pushing it here will inhibit that. Instead, re-push
173 // inst with arg==1 as a reminder to push inst.Arg out
174 // later.
175 switch arg {
176 case 0:
177 b.push(pc, pos, 1)
178 pc = inst.Out
179 goto CheckAndLoop
180 case 1:
181 // Finished inst.Out; try inst.Arg.
182 arg = 0
183 pc = inst.Arg
184 goto CheckAndLoop
186 panic("bad arg in InstAlt")
188 case syntax.InstAltMatch:
189 // One opcode consumes runes; the other leads to match.
190 switch b.prog.Inst[inst.Out].Op {
191 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
192 // inst.Arg is the match.
193 b.push(inst.Arg, pos, 0)
194 pc = inst.Arg
195 pos = b.end
196 goto CheckAndLoop
198 // inst.Out is the match - non-greedy
199 b.push(inst.Out, b.end, 0)
200 pc = inst.Out
201 goto CheckAndLoop
203 case syntax.InstRune:
204 r, width := i.step(pos)
205 if !inst.MatchRune(r) {
206 continue
208 pos += width
209 pc = inst.Out
210 goto CheckAndLoop
212 case syntax.InstRune1:
213 r, width := i.step(pos)
214 if r != inst.Rune[0] {
215 continue
217 pos += width
218 pc = inst.Out
219 goto CheckAndLoop
221 case syntax.InstRuneAnyNotNL:
222 r, width := i.step(pos)
223 if r == '\n' || r == endOfText {
224 continue
226 pos += width
227 pc = inst.Out
228 goto CheckAndLoop
230 case syntax.InstRuneAny:
231 r, width := i.step(pos)
232 if r == endOfText {
233 continue
235 pos += width
236 pc = inst.Out
237 goto CheckAndLoop
239 case syntax.InstCapture:
240 switch arg {
241 case 0:
242 if 0 <= inst.Arg && inst.Arg < uint32(len(b.cap)) {
243 // Capture pos to register, but save old value.
244 b.push(pc, b.cap[inst.Arg], 1) // come back when we're done.
245 b.cap[inst.Arg] = pos
247 pc = inst.Out
248 goto CheckAndLoop
249 case 1:
250 // Finished inst.Out; restore the old value.
251 b.cap[inst.Arg] = pos
252 continue
255 panic("bad arg in InstCapture")
257 case syntax.InstEmptyWidth:
258 if syntax.EmptyOp(inst.Arg)&^i.context(pos) != 0 {
259 continue
261 pc = inst.Out
262 goto CheckAndLoop
264 case syntax.InstNop:
265 pc = inst.Out
266 goto CheckAndLoop
268 case syntax.InstMatch:
269 // We found a match. If the caller doesn't care
270 // where the match is, no point going further.
271 if len(b.cap) == 0 {
272 m.matched = true
273 return m.matched
276 // Record best match so far.
277 // Only need to check end point, because this entire
278 // call is only considering one start position.
279 if len(b.cap) > 1 {
280 b.cap[1] = pos
282 if !m.matched || (longest && pos > 0 && pos > m.matchcap[1]) {
283 copy(m.matchcap, b.cap)
285 m.matched = true
287 // If going for first match, we're done.
288 if !longest {
289 return m.matched
292 // If we used the entire text, no longer match is possible.
293 if pos == b.end {
294 return m.matched
297 // Otherwise, continue on in hope of a longer match.
298 continue
302 return m.matched
305 // backtrack runs a backtracking search of prog on the input starting at pos.
306 func (m *machine) backtrack(i input, pos int, end int, ncap int) bool {
307 if !i.canCheckPrefix() {
308 panic("backtrack called for a RuneReader")
311 startCond := m.re.cond
312 if startCond == ^syntax.EmptyOp(0) { // impossible
313 return false
315 if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
316 // Anchored match, past beginning of text.
317 return false
320 b := m.b
321 b.reset(end, ncap)
323 m.matchcap = m.matchcap[:ncap]
324 for i := range m.matchcap {
325 m.matchcap[i] = -1
328 // Anchored search must start at the beginning of the input
329 if startCond&syntax.EmptyBeginText != 0 {
330 if len(b.cap) > 0 {
331 b.cap[0] = pos
333 return m.tryBacktrack(b, i, uint32(m.p.Start), pos)
336 // Unanchored search, starting from each possible text position.
337 // Notice that we have to try the empty string at the end of
338 // the text, so the loop condition is pos <= end, not pos < end.
339 // This looks like it's quadratic in the size of the text,
340 // but we are not clearing visited between calls to TrySearch,
341 // so no work is duplicated and it ends up still being linear.
342 width := -1
343 for ; pos <= end && width != 0; pos += width {
344 if len(m.re.prefix) > 0 {
345 // Match requires literal prefix; fast search for it.
346 advance := i.index(m.re, pos)
347 if advance < 0 {
348 return false
350 pos += advance
353 if len(b.cap) > 0 {
354 b.cap[0] = pos
356 if m.tryBacktrack(b, i, uint32(m.p.Start), pos) {
357 // Match must be leftmost; done.
358 return true
360 _, width = i.step(pos)
362 return false