1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
13 // A queue is a 'sparse array' holding pending threads of execution.
14 // See https://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
20 // An entry is an entry on a queue.
21 // It holds both the instruction pc and the actual thread.
22 // Some queue entries are just place holders so that the machine
23 // knows it has considered that pc. Such entries have t == nil.
29 // A thread is the state of a single path through the machine:
30 // an instruction and a corresponding capture array.
31 // See https://swtch.com/~rsc/regexp/regexp2.html
37 // A machine holds all the state during an NFA simulation for p.
39 re
*Regexp
// corresponding Regexp
40 p
*syntax
.Prog
// compiled program
41 q0
, q1 queue
// two queues for runq, nextq
42 pool
[]*thread
// pool of available threads
43 matched
bool // whether a match was found
44 matchcap
[]int // capture information for the match
50 // cached inputs, to avoid allocation
56 func (i
*inputs
) newBytes(b
[]byte) input
{
61 func (i
*inputs
) newString(s
string) input
{
66 func (i
*inputs
) newReader(r io
.RuneReader
) input
{
68 i
.reader
.atEOT
= false
73 func (i
*inputs
) clear() {
74 // We need to clear 1 of these.
75 // Avoid the expense of clearing the others (pointer write barrier).
76 if i
.bytes
.str
!= nil {
78 } else if i
.reader
.r
!= nil {
85 func (i
*inputs
) init(r io
.RuneReader
, b
[]byte, s
string) (input
, int) {
87 return i
.newReader(r
), 0
90 return i
.newBytes(b
), len(b
)
92 return i
.newString(s
), len(s
)
95 func (m
*machine
) init(ncap
int) {
96 for _
, t
:= range m
.pool
{
99 m
.matchcap
= m
.matchcap
[:ncap
]
102 // alloc allocates a new thread with the given instruction.
103 // It uses the free pool if possible.
104 func (m
*machine
) alloc(i
*syntax
.Inst
) *thread
{
106 if n
:= len(m
.pool
); n
> 0 {
108 m
.pool
= m
.pool
[:n
-1]
111 t
.cap = make([]int, len(m
.matchcap
), cap(m
.matchcap
))
117 // A lazyFlag is a lazily-evaluated syntax.EmptyOp,
118 // for checking zero-width flags like ^ $ \A \z \B \b.
119 // It records the pair of relevant runes and does not
120 // determine the implied flags until absolutely necessary
121 // (most of the time, that means never).
124 func newLazyFlag(r1
, r2 rune
) lazyFlag
{
125 return lazyFlag(uint64(r1
)<<32 |
uint64(uint32(r2
)))
128 func (f lazyFlag
) match(op syntax
.EmptyOp
) bool {
133 if op
&syntax
.EmptyBeginLine
!= 0 {
134 if r1
!= '\n' && r1
>= 0 {
137 op
&^= syntax
.EmptyBeginLine
139 if op
&syntax
.EmptyBeginText
!= 0 {
143 op
&^= syntax
.EmptyBeginText
149 if op
&syntax
.EmptyEndLine
!= 0 {
150 if r2
!= '\n' && r2
>= 0 {
153 op
&^= syntax
.EmptyEndLine
155 if op
&syntax
.EmptyEndText
!= 0 {
159 op
&^= syntax
.EmptyEndText
164 if syntax
.IsWordChar(r1
) != syntax
.IsWordChar(r2
) {
165 op
&^= syntax
.EmptyWordBoundary
167 op
&^= syntax
.EmptyNoWordBoundary
172 // match runs the machine over the input starting at pos.
173 // It reports whether a match was found.
174 // If so, m.matchcap holds the submatch information.
175 func (m
*machine
) match(i input
, pos
int) bool {
176 startCond
:= m
.re
.cond
177 if startCond
== ^syntax
.EmptyOp(0) { // impossible
181 for i
:= range m
.matchcap
{
184 runq
, nextq
:= &m
.q0
, &m
.q1
185 r
, r1
:= endOfText
, endOfText
186 width
, width1
:= 0, 0
187 r
, width
= i
.step(pos
)
189 r1
, width1
= i
.step(pos
+ width
)
193 flag
= newLazyFlag(-1, r
)
195 flag
= i
.context(pos
)
198 if len(runq
.dense
) == 0 {
199 if startCond
&syntax
.EmptyBeginText
!= 0 && pos
!= 0 {
200 // Anchored match, past beginning of text.
204 // Have match; finished exploring alternatives.
207 if len(m
.re
.prefix
) > 0 && r1
!= m
.re
.prefixRune
&& i
.canCheckPrefix() {
208 // Match requires literal prefix; fast search for it.
209 advance
:= i
.index(m
.re
, pos
)
214 r
, width
= i
.step(pos
)
215 r1
, width1
= i
.step(pos
+ width
)
219 if len(m
.matchcap
) > 0 {
222 m
.add(runq
, uint32(m
.p
.Start
), pos
, m
.matchcap
, &flag
, nil)
224 flag
= newLazyFlag(r
, r1
)
225 m
.step(runq
, nextq
, pos
, pos
+width
, r
, &flag
)
229 if len(m
.matchcap
) == 0 && m
.matched
{
230 // Found a match and not paying attention
231 // to where it is, so any match will do.
235 r
, width
= r1
, width1
237 r1
, width1
= i
.step(pos
+ width
)
239 runq
, nextq
= nextq
, runq
245 // clear frees all threads on the thread queue.
246 func (m
*machine
) clear(q
*queue
) {
247 for _
, d
:= range q
.dense
{
249 m
.pool
= append(m
.pool
, d
.t
)
252 q
.dense
= q
.dense
[:0]
255 // step executes one step of the machine, running each of the threads
256 // on runq and appending new threads to nextq.
257 // The step processes the rune c (which may be endOfText),
258 // which starts at position pos and ends at nextPos.
259 // nextCond gives the setting for the empty-width flags after c.
260 func (m
*machine
) step(runq
, nextq
*queue
, pos
, nextPos
int, c rune
, nextCond
*lazyFlag
) {
261 longest
:= m
.re
.longest
262 for j
:= 0; j
< len(runq
.dense
); j
++ {
268 if longest
&& m
.matched
&& len(t
.cap) > 0 && m
.matchcap
[0] < t
.cap[0] {
269 m
.pool
= append(m
.pool
, t
)
278 case syntax
.InstMatch
:
279 if len(t
.cap) > 0 && (!longest ||
!m
.matched || m
.matchcap
[1] < pos
) {
281 copy(m
.matchcap
, t
.cap)
284 // First-match mode: cut off all lower-priority threads.
285 for _
, d
:= range runq
.dense
[j
+1:] {
287 m
.pool
= append(m
.pool
, d
.t
)
290 runq
.dense
= runq
.dense
[:0]
294 case syntax
.InstRune
:
296 case syntax
.InstRune1
:
298 case syntax
.InstRuneAny
:
300 case syntax
.InstRuneAnyNotNL
:
304 t
= m
.add(nextq
, i
.Out
, nextPos
, t
.cap, nextCond
, t
)
307 m
.pool
= append(m
.pool
, t
)
310 runq
.dense
= runq
.dense
[:0]
313 // add adds an entry to q for pc, unless the q already has such an entry.
314 // It also recursively adds an entry for all instructions reachable from pc by following
315 // empty-width conditions satisfied by cond. pos gives the current position
317 func (m
*machine
) add(q
*queue
, pc
uint32, pos
int, cap []int, cond
*lazyFlag
, t
*thread
) *thread
{
322 if j
:= q
.sparse
[pc
]; j
< uint32(len(q
.dense
)) && q
.dense
[j
].pc
== pc
{
327 q
.dense
= q
.dense
[:j
+1]
331 q
.sparse
[pc
] = uint32(j
)
337 case syntax
.InstFail
:
339 case syntax
.InstAlt
, syntax
.InstAltMatch
:
340 t
= m
.add(q
, i
.Out
, pos
, cap, cond
, t
)
343 case syntax
.InstEmptyWidth
:
344 if cond
.match(syntax
.EmptyOp(i
.Arg
)) {
351 case syntax
.InstCapture
:
352 if int(i
.Arg
) < len(cap) {
355 m
.add(q
, i
.Out
, pos
, cap, cond
, nil)
361 case syntax
.InstMatch
, syntax
.InstRune
, syntax
.InstRune1
, syntax
.InstRuneAny
, syntax
.InstRuneAnyNotNL
:
367 if len(cap) > 0 && &t
.cap[0] != &cap[0] {
376 type onePassMachine
struct {
381 var onePassPool sync
.Pool
383 func newOnePassMachine() *onePassMachine
{
384 m
, ok
:= onePassPool
.Get().(*onePassMachine
)
386 m
= new(onePassMachine
)
391 func freeOnePassMachine(m
*onePassMachine
) {
396 // doOnePass implements r.doExecute using the one-pass execution engine.
397 func (re
*Regexp
) doOnePass(ir io
.RuneReader
, ib
[]byte, is
string, pos
, ncap
int, dstCap
[]int) []int {
399 if startCond
== ^syntax
.EmptyOp(0) { // impossible
403 m
:= newOnePassMachine()
404 if cap(m
.matchcap
) < ncap
{
405 m
.matchcap
= make([]int, ncap
)
407 m
.matchcap
= m
.matchcap
[:ncap
]
411 for i
:= range m
.matchcap
{
415 i
, _
:= m
.inputs
.init(ir
, ib
, is
)
417 r
, r1
:= endOfText
, endOfText
418 width
, width1
:= 0, 0
419 r
, width
= i
.step(pos
)
421 r1
, width1
= i
.step(pos
+ width
)
425 flag
= newLazyFlag(-1, r
)
427 flag
= i
.context(pos
)
429 pc
:= re
.onepass
.Start
430 inst
:= re
.onepass
.Inst
[pc
]
431 // If there is a simple literal prefix, skip over it.
432 if pos
== 0 && flag
.match(syntax
.EmptyOp(inst
.Arg
)) &&
433 len(re
.prefix
) > 0 && i
.canCheckPrefix() {
434 // Match requires literal prefix; fast search for it.
435 if !i
.hasPrefix(re
) {
438 pos
+= len(re
.prefix
)
439 r
, width
= i
.step(pos
)
440 r1
, width1
= i
.step(pos
+ width
)
441 flag
= i
.context(pos
)
442 pc
= int(re
.prefixEnd
)
445 inst
= re
.onepass
.Inst
[pc
]
450 case syntax
.InstMatch
:
452 if len(m
.matchcap
) > 0 {
457 case syntax
.InstRune
:
458 if !inst
.MatchRune(r
) {
461 case syntax
.InstRune1
:
462 if r
!= inst
.Rune
[0] {
465 case syntax
.InstRuneAny
:
467 case syntax
.InstRuneAnyNotNL
:
471 // peek at the input rune to see which branch of the Alt to take
472 case syntax
.InstAlt
, syntax
.InstAltMatch
:
473 pc
= int(onePassNext(&inst
, r
))
475 case syntax
.InstFail
:
479 case syntax
.InstEmptyWidth
:
480 if !flag
.match(syntax
.EmptyOp(inst
.Arg
)) {
484 case syntax
.InstCapture
:
485 if int(inst
.Arg
) < len(m
.matchcap
) {
486 m
.matchcap
[inst
.Arg
] = pos
493 flag
= newLazyFlag(r
, r1
)
495 r
, width
= r1
, width1
497 r1
, width1
= i
.step(pos
+ width
)
503 freeOnePassMachine(m
)
507 dstCap
= append(dstCap
, m
.matchcap
...)
508 freeOnePassMachine(m
)
512 // doMatch reports whether either r, b or s match the regexp.
513 func (re
*Regexp
) doMatch(r io
.RuneReader
, b
[]byte, s
string) bool {
514 return re
.doExecute(r
, b
, s
, 0, 0, nil) != nil
517 // doExecute finds the leftmost match in the input, appends the position
518 // of its subexpressions to dstCap and returns dstCap.
520 // nil is returned if no matches are found and non-nil if matches are found.
521 func (re
*Regexp
) doExecute(r io
.RuneReader
, b
[]byte, s
string, pos
int, ncap
int, dstCap
[]int) []int {
523 // Make sure 'return dstCap' is non-nil.
524 dstCap
= arrayNoInts
[:0:0]
527 if r
== nil && len(b
)+len(s
) < re
.minInputLen
{
531 if re
.onepass
!= nil {
532 return re
.doOnePass(r
, b
, s
, pos
, ncap
, dstCap
)
534 if r
== nil && len(b
)+len(s
) < re
.maxBitStateLen
{
535 return re
.backtrack(b
, s
, pos
, ncap
, dstCap
)
539 i
, _
:= m
.inputs
.init(r
, b
, s
)
542 if !m
.match(i
, pos
) {
547 dstCap
= append(dstCap
, m
.matchcap
...)
552 // arrayNoInts is returned by doExecute match if nil dstCap is passed
553 // to it with ncap=0.
554 var arrayNoInts
[0]int