1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
9 // A patchList is a list of instruction pointers that need to be filled in (patched).
10 // Because the pointers haven't been filled in yet, we can reuse their storage
11 // to hold the list. It's kind of sleazy, but works well in practice.
12 // See https://swtch.com/~rsc/regexp/regexp1.html for inspiration.
14 // These aren't really pointers: they're integers, so we can reinterpret them
15 // this way without using package unsafe. A value l denotes
16 // p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1).
17 // l == 0 denotes the empty list, okay because we start every program
18 // with a fail instruction, so we'll never want to point at its output link.
21 func (l patchList
) next(p
*Prog
) patchList
{
24 return patchList(i
.Out
)
26 return patchList(i
.Arg
)
29 func (l patchList
) patch(p
*Prog
, val
uint32) {
42 func (l1 patchList
) append(p
*Prog
, l2 patchList
) patchList
{
68 // A frag represents a compiled program fragment.
70 i
uint32 // index of first instruction
71 out patchList
// where to record end instruction
74 type compiler
struct {
78 // Compile compiles the regexp into a program to be executed.
79 // The regexp should have been simplified already (returned from re.Simplify).
80 func Compile(re
*Regexp
) (*Prog
, error
) {
84 f
.out
.patch(c
.p
, c
.inst(InstMatch
).i
)
89 func (c
*compiler
) init() {
91 c
.p
.NumCap
= 2 // implicit ( and ) for whole match $0
95 var anyRuneNotNL
= []rune
{0, '\n' - 1, '\n' + 1, unicode
.MaxRune
}
96 var anyRune
= []rune
{0, unicode
.MaxRune
}
98 func (c
*compiler
) compile(re
*Regexp
) frag
{
105 if len(re
.Rune
) == 0 {
109 for j
:= range re
.Rune
{
110 f1
:= c
.rune(re
.Rune
[j
:j
+1], re
.Flags
)
119 return c
.rune(re
.Rune
, re
.Flags
)
121 return c
.rune(anyRuneNotNL
, 0)
123 return c
.rune(anyRune
, 0)
125 return c
.empty(EmptyBeginLine
)
127 return c
.empty(EmptyEndLine
)
129 return c
.empty(EmptyBeginText
)
131 return c
.empty(EmptyEndText
)
133 return c
.empty(EmptyWordBoundary
)
134 case OpNoWordBoundary
:
135 return c
.empty(EmptyNoWordBoundary
)
137 bra
:= c
.cap(uint32(re
.Cap
<< 1))
138 sub
:= c
.compile(re
.Sub
[0])
139 ket
:= c
.cap(uint32(re
.Cap
<<1 |
1))
140 return c
.cat(c
.cat(bra
, sub
), ket
)
142 return c
.star(c
.compile(re
.Sub
[0]), re
.Flags
&NonGreedy
!= 0)
144 return c
.plus(c
.compile(re
.Sub
[0]), re
.Flags
&NonGreedy
!= 0)
146 return c
.quest(c
.compile(re
.Sub
[0]), re
.Flags
&NonGreedy
!= 0)
148 if len(re
.Sub
) == 0 {
152 for i
, sub
:= range re
.Sub
{
156 f
= c
.cat(f
, c
.compile(sub
))
162 for _
, sub
:= range re
.Sub
{
163 f
= c
.alt(f
, c
.compile(sub
))
167 panic("regexp: unhandled case in compile")
170 func (c
*compiler
) inst(op InstOp
) frag
{
171 // TODO: impose length limit
172 f
:= frag
{i
: uint32(len(c
.p
.Inst
))}
173 c
.p
.Inst
= append(c
.p
.Inst
, Inst
{Op
: op
})
177 func (c
*compiler
) nop() frag
{
179 f
.out
= patchList(f
.i
<< 1)
183 func (c
*compiler
) fail() frag
{
187 func (c
*compiler
) cap(arg
uint32) frag
{
188 f
:= c
.inst(InstCapture
)
189 f
.out
= patchList(f
.i
<< 1)
190 c
.p
.Inst
[f
.i
].Arg
= arg
192 if c
.p
.NumCap
< int(arg
)+1 {
193 c
.p
.NumCap
= int(arg
) + 1
198 func (c
*compiler
) cat(f1
, f2 frag
) frag
{
199 // concat of failure is failure
200 if f1
.i
== 0 || f2
.i
== 0 {
206 f1
.out
.patch(c
.p
, f2
.i
)
207 return frag
{f1
.i
, f2
.out
}
210 func (c
*compiler
) alt(f1
, f2 frag
) frag
{
211 // alt of failure is other
223 f
.out
= f1
.out
.append(c
.p
, f2
.out
)
227 func (c
*compiler
) quest(f1 frag
, nongreedy
bool) frag
{
232 f
.out
= patchList(f
.i
<< 1)
235 f
.out
= patchList(f
.i
<<1 |
1)
237 f
.out
= f
.out
.append(c
.p
, f1
.out
)
241 func (c
*compiler
) star(f1 frag
, nongreedy
bool) frag
{
246 f
.out
= patchList(f
.i
<< 1)
249 f
.out
= patchList(f
.i
<<1 |
1)
251 f1
.out
.patch(c
.p
, f
.i
)
255 func (c
*compiler
) plus(f1 frag
, nongreedy
bool) frag
{
256 return frag
{f1
.i
, c
.star(f1
, nongreedy
).out
}
259 func (c
*compiler
) empty(op EmptyOp
) frag
{
260 f
:= c
.inst(InstEmptyWidth
)
261 c
.p
.Inst
[f
.i
].Arg
= uint32(op
)
262 f
.out
= patchList(f
.i
<< 1)
266 func (c
*compiler
) rune(r
[]rune
, flags Flags
) frag
{
267 f
:= c
.inst(InstRune
)
270 flags
&= FoldCase
// only relevant flag is FoldCase
271 if len(r
) != 1 || unicode
.SimpleFold(r
[0]) == r
[0] {
272 // and sometimes not even that
275 i
.Arg
= uint32(flags
)
276 f
.out
= patchList(f
.i
<< 1)
278 // Special cases for exec machine.
280 case flags
&FoldCase
== 0 && (len(r
) == 1 ||
len(r
) == 2 && r
[0] == r
[1]):
282 case len(r
) == 2 && r
[0] == 0 && r
[1] == unicode
.MaxRune
:
284 case len(r
) == 4 && r
[0] == 0 && r
[1] == '\n'-1 && r
[2] == '\n'+1 && r
[3] == unicode
.MaxRune
:
285 i
.Op
= InstRuneAnyNotNL