1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
14 // May not belong in this package, but convenient for now.
16 // A Prog is a compiled regular expression program.
19 Start
int // index of start instruction
20 NumCap
int // number of InstCapture insts in re
23 // An InstOp is an instruction opcode.
40 var instOpNames
= []string{
54 func (i InstOp
) String() string {
55 if uint(i
) >= uint(len(instOpNames
)) {
61 // An EmptyOp specifies a kind or mixture of zero-width assertions.
65 EmptyBeginLine EmptyOp
= 1 << iota
73 // EmptyOpContext returns the zero-width assertions
74 // satisfied at the position between the runes r1 and r2.
75 // Passing r1 == -1 indicates that the position is
76 // at the beginning of the text.
77 // Passing r2 == -1 indicates that the position is
78 // at the end of the text.
79 func EmptyOpContext(r1
, r2 rune
) EmptyOp
{
80 var op EmptyOp
= EmptyNoWordBoundary
88 op |
= EmptyBeginText | EmptyBeginLine
96 op |
= EmptyEndText | EmptyEndLine
98 if boundary
!= 0 { // IsWordChar(r1) != IsWordChar(r2)
99 op
^= (EmptyWordBoundary | EmptyNoWordBoundary
)
104 // IsWordChar reports whether r is consider a ``word character''
105 // during the evaluation of the \b and \B zero-width assertions.
106 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
107 func IsWordChar(r rune
) bool {
108 return 'A' <= r
&& r
<= 'Z' ||
'a' <= r
&& r
<= 'z' ||
'0' <= r
&& r
<= '9' || r
== '_'
111 // An Inst is a single instruction in a regular expression program.
114 Out
uint32 // all but InstMatch, InstFail
115 Arg
uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
119 func (p
*Prog
) String() string {
125 // skipNop follows any no-op or capturing instructions
126 // and returns the resulting pc.
127 func (p
*Prog
) skipNop(pc
uint32) (*Inst
, uint32) {
129 for i
.Op
== InstNop || i
.Op
== InstCapture
{
136 // op returns i.Op but merges all the Rune special cases into InstRune
137 func (i
*Inst
) op() InstOp
{
140 case InstRune1
, InstRuneAny
, InstRuneAnyNotNL
:
146 // Prefix returns a literal string that all matches for the
147 // regexp must start with. Complete is true if the prefix
148 // is the entire match.
149 func (p
*Prog
) Prefix() (prefix
string, complete
bool) {
150 i
, _
:= p
.skipNop(uint32(p
.Start
))
152 // Avoid allocation of buffer if prefix is empty.
153 if i
.op() != InstRune ||
len(i
.Rune
) != 1 {
154 return "", i
.Op
== InstMatch
157 // Have prefix; gather characters.
159 for i
.op() == InstRune
&& len(i
.Rune
) == 1 && Flags(i
.Arg
)&FoldCase
== 0 {
160 buf
.WriteRune(i
.Rune
[0])
161 i
, _
= p
.skipNop(i
.Out
)
163 return buf
.String(), i
.Op
== InstMatch
166 // StartCond returns the leading empty-width conditions that must
167 // be true in any match. It returns ^EmptyOp(0) if no matches are possible.
168 func (p
*Prog
) StartCond() EmptyOp
{
170 pc
:= uint32(p
.Start
)
176 flag |
= EmptyOp(i
.Arg
)
179 case InstCapture
, InstNop
:
192 // MatchRune reports whether the instruction matches (and consumes) r.
193 // It should only be called when i.Op == InstRune.
194 func (i
*Inst
) MatchRune(r rune
) bool {
195 return i
.MatchRunePos(r
) != noMatch
198 // MatchRunePos checks whether the instruction matches (and consumes) r.
199 // If so, MatchRunePos returns the index of the matching rune pair
200 // (or, when len(i.Rune) == 1, rune singleton).
201 // If not, MatchRunePos returns -1.
202 // MatchRunePos should only be called when i.Op == InstRune.
203 func (i
*Inst
) MatchRunePos(r rune
) int {
206 // Special case: single-rune slice is from literal string, not char class.
212 if Flags(i
.Arg
)&FoldCase
!= 0 {
213 for r1
:= unicode
.SimpleFold(r0
); r1
!= r0
; r1
= unicode
.SimpleFold(r1
) {
222 // Peek at the first few pairs.
223 // Should handle ASCII well.
224 for j
:= 0; j
< len(rune
) && j
<= 8; j
+= 2 {
233 // Otherwise binary search.
238 if c
:= rune
[2*m
]; c
<= r
{
239 if r
<= rune
[2*m
+1] {
250 // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
251 // Since we act on runes, it would be easy to support Unicode here.
252 func wordRune(r rune
) bool {
254 ('A' <= r
&& r
<= 'Z') ||
255 ('a' <= r
&& r
<= 'z') ||
256 ('0' <= r
&& r
<= '9')
259 // MatchEmptyWidth reports whether the instruction matches
260 // an empty string between the runes before and after.
261 // It should only be called when i.Op == InstEmptyWidth.
262 func (i
*Inst
) MatchEmptyWidth(before rune
, after rune
) bool {
263 switch EmptyOp(i
.Arg
) {
265 return before
== '\n' || before
== -1
267 return after
== '\n' || after
== -1
272 case EmptyWordBoundary
:
273 return wordRune(before
) != wordRune(after
)
274 case EmptyNoWordBoundary
:
275 return wordRune(before
) == wordRune(after
)
277 panic("unknown empty width arg")
280 func (i
*Inst
) String() string {
286 func bw(b
*bytes
.Buffer
, args
...string) {
287 for _
, s
:= range args
{
292 func dumpProg(b
*bytes
.Buffer
, p
*Prog
) {
293 for j
:= range p
.Inst
{
295 pc
:= strconv
.Itoa(j
)
297 b
.WriteString(" "[len(pc
):])
308 func u32(i
uint32) string {
309 return strconv
.FormatUint(uint64(i
), 10)
312 func dumpInst(b
*bytes
.Buffer
, i
*Inst
) {
315 bw(b
, "alt -> ", u32(i
.Out
), ", ", u32(i
.Arg
))
317 bw(b
, "altmatch -> ", u32(i
.Out
), ", ", u32(i
.Arg
))
319 bw(b
, "cap ", u32(i
.Arg
), " -> ", u32(i
.Out
))
321 bw(b
, "empty ", u32(i
.Arg
), " -> ", u32(i
.Out
))
327 bw(b
, "nop -> ", u32(i
.Out
))
333 bw(b
, "rune ", strconv
.QuoteToASCII(string(i
.Rune
)))
334 if Flags(i
.Arg
)&FoldCase
!= 0 {
337 bw(b
, " -> ", u32(i
.Out
))
339 bw(b
, "rune1 ", strconv
.QuoteToASCII(string(i
.Rune
)), " -> ", u32(i
.Out
))
341 bw(b
, "any -> ", u32(i
.Out
))
342 case InstRuneAnyNotNL
:
343 bw(b
, "anynotnl -> ", u32(i
.Out
))