1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
7 // Note to implementers:
8 // In this package, re is always a *Regexp and r is always a rune.
17 // A Regexp is a node in a regular expression syntax tree.
21 Sub
[]*Regexp
// subexpressions, if any
22 Sub0
[1]*Regexp
// storage for short Sub
23 Rune
[]rune
// matched runes, for OpLiteral, OpCharClass
24 Rune0
[2]rune
// storage for short Rune
25 Min
, Max
int // min, max for OpRepeat
26 Cap
int // capturing index, for OpCapture
27 Name
string // capturing name, for OpCapture
30 // An Op is a single regular expression operator.
33 // Operators are listed in precedence order, tightest binding to weakest.
34 // Character class operators are listed simplest to most complex
35 // (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
38 OpNoMatch Op
= 1 + iota // matches no strings
39 OpEmptyMatch
// matches empty string
40 OpLiteral
// matches Runes sequence
41 OpCharClass
// matches Runes interpreted as range pair list
42 OpAnyCharNotNL
// matches any character except newline
43 OpAnyChar
// matches any character
44 OpBeginLine
// matches empty string at beginning of line
45 OpEndLine
// matches empty string at end of line
46 OpBeginText
// matches empty string at beginning of text
47 OpEndText
// matches empty string at end of text
48 OpWordBoundary
// matches word boundary `\b`
49 OpNoWordBoundary
// matches word non-boundary `\B`
50 OpCapture
// capturing subexpression with index Cap, optional name Name
51 OpStar
// matches Sub[0] zero or more times
52 OpPlus
// matches Sub[0] one or more times
53 OpQuest
// matches Sub[0] zero or one times
54 OpRepeat
// matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
55 OpConcat
// matches concatenation of Subs
56 OpAlternate
// matches alternation of Subs
59 const opPseudo Op
= 128 // where pseudo-ops start
61 // Equal returns true if x and y have identical structure.
62 func (x
*Regexp
) Equal(y
*Regexp
) bool {
63 if x
== nil || y
== nil {
71 // The parse flags remember whether this is \z or \Z.
72 if x
.Flags
&WasDollar
!= y
.Flags
&WasDollar
{
76 case OpLiteral
, OpCharClass
:
77 if len(x
.Rune
) != len(y
.Rune
) {
80 for i
, r
:= range x
.Rune
{
86 case OpAlternate
, OpConcat
:
87 if len(x
.Sub
) != len(y
.Sub
) {
90 for i
, sub
:= range x
.Sub
{
91 if !sub
.Equal(y
.Sub
[i
]) {
96 case OpStar
, OpPlus
, OpQuest
:
97 if x
.Flags
&NonGreedy
!= y
.Flags
&NonGreedy ||
!x
.Sub
[0].Equal(y
.Sub
[0]) {
102 if x
.Flags
&NonGreedy
!= y
.Flags
&NonGreedy || x
.Min
!= y
.Min || x
.Max
!= y
.Max ||
!x
.Sub
[0].Equal(y
.Sub
[0]) {
107 if x
.Cap
!= y
.Cap || x
.Name
!= y
.Name ||
!x
.Sub
[0].Equal(y
.Sub
[0]) {
114 // writeRegexp writes the Perl syntax for the regular expression re to b.
115 func writeRegexp(b
*bytes
.Buffer
, re
*Regexp
) {
118 b
.WriteString("<invalid op" + strconv
.Itoa(int(re
.Op
)) + ">")
120 b
.WriteString(`[^\x00-\x{10FFFF}]`)
122 b
.WriteString(`(?:)`)
124 if re
.Flags
&FoldCase
!= 0 {
125 b
.WriteString(`(?i:`)
127 for _
, r
:= range re
.Rune
{
130 if re
.Flags
&FoldCase
!= 0 {
134 if len(re
.Rune
)%2
!= 0 {
135 b
.WriteString(`[invalid char class]`)
139 if len(re
.Rune
) == 0 {
140 b
.WriteString(`^\x00-\x{10FFFF}`)
141 } else if re
.Rune
[0] == 0 && re
.Rune
[len(re
.Rune
)-1] == unicode
.MaxRune
{
142 // Contains 0 and MaxRune. Probably a negated class.
145 for i
:= 1; i
< len(re
.Rune
)-1; i
+= 2 {
146 lo
, hi
:= re
.Rune
[i
]+1, re
.Rune
[i
+1]-1
147 escape(b
, lo
, lo
== '-')
150 escape(b
, hi
, hi
== '-')
154 for i
:= 0; i
< len(re
.Rune
); i
+= 2 {
155 lo
, hi
:= re
.Rune
[i
], re
.Rune
[i
+1]
156 escape(b
, lo
, lo
== '-')
159 escape(b
, hi
, hi
== '-')
165 b
.WriteString(`(?-s:.)`)
167 b
.WriteString(`(?s:.)`)
175 if re
.Flags
&WasDollar
!= 0 {
176 b
.WriteString(`(?-m:$)`)
182 case OpNoWordBoundary
:
186 b
.WriteString(`(?P<`)
187 b
.WriteString(re
.Name
)
192 if re
.Sub
[0].Op
!= OpEmptyMatch
{
193 writeRegexp(b
, re
.Sub
[0])
196 case OpStar
, OpPlus
, OpQuest
, OpRepeat
:
197 if sub
:= re
.Sub
[0]; sub
.Op
> OpCapture || sub
.Op
== OpLiteral
&& len(sub
.Rune
) > 1 {
213 b
.WriteString(strconv
.Itoa(re
.Min
))
214 if re
.Max
!= re
.Min
{
217 b
.WriteString(strconv
.Itoa(re
.Max
))
222 if re
.Flags
&NonGreedy
!= 0 {
226 for _
, sub
:= range re
.Sub
{
227 if sub
.Op
== OpAlternate
{
236 for i
, sub
:= range re
.Sub
{
245 func (re
*Regexp
) String() string {
251 const meta
= `\.+*?()|[]{}^$`
253 func escape(b
*bytes
.Buffer
, r rune
, force
bool) {
254 if unicode
.IsPrint(r
) {
255 if strings
.IndexRune(meta
, r
) >= 0 || force
{
278 s
:= strconv
.FormatInt(int64(r
), 16)
286 b
.WriteString(strconv
.FormatInt(int64(r
), 16))
291 // MaxCap walks the regexp to find the maximum capture index.
292 func (re
*Regexp
) MaxCap() int {
294 if re
.Op
== OpCapture
{
297 for _
, sub
:= range re
.Sub
{
298 if n
:= sub
.MaxCap(); m
< n
{
305 // CapNames walks the regexp to find the names of capturing groups.
306 func (re
*Regexp
) CapNames() []string {
307 names
:= make([]string, re
.MaxCap()+1)
312 func (re
*Regexp
) capNames(names
[]string) {
313 if re
.Op
== OpCapture
{
314 names
[re
.Cap
] = re
.Name
316 for _
, sub
:= range re
.Sub
{