libgo: Update to Go 1.3 release.
[official-gcc.git] / libgo / go / regexp / syntax / prog.go
blob29bd282d0d96c7c0c235037b9f0bcac33c592bfe
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package syntax
7 import (
8 "bytes"
9 "strconv"
10 "unicode"
13 // Compiled program.
14 // May not belong in this package, but convenient for now.
16 // A Prog is a compiled regular expression program.
17 type Prog struct {
18 Inst []Inst
19 Start int // index of start instruction
20 NumCap int // number of InstCapture insts in re
23 // An InstOp is an instruction opcode.
24 type InstOp uint8
26 const (
27 InstAlt InstOp = iota
28 InstAltMatch
29 InstCapture
30 InstEmptyWidth
31 InstMatch
32 InstFail
33 InstNop
34 InstRune
35 InstRune1
36 InstRuneAny
37 InstRuneAnyNotNL
40 var instOpNames = []string{
41 "InstAlt",
42 "InstAltMatch",
43 "InstCapture",
44 "InstEmptyWidth",
45 "InstMatch",
46 "InstFail",
47 "InstNop",
48 "InstRune",
49 "InstRune1",
50 "InstRuneAny",
51 "InstRuneAnyNotNL",
54 func (i InstOp) String() string {
55 if uint(i) >= uint(len(instOpNames)) {
56 return ""
58 return instOpNames[i]
61 // An EmptyOp specifies a kind or mixture of zero-width assertions.
62 type EmptyOp uint8
64 const (
65 EmptyBeginLine EmptyOp = 1 << iota
66 EmptyEndLine
67 EmptyBeginText
68 EmptyEndText
69 EmptyWordBoundary
70 EmptyNoWordBoundary
73 // EmptyOpContext returns the zero-width assertions
74 // satisfied at the position between the runes r1 and r2.
75 // Passing r1 == -1 indicates that the position is
76 // at the beginning of the text.
77 // Passing r2 == -1 indicates that the position is
78 // at the end of the text.
79 func EmptyOpContext(r1, r2 rune) EmptyOp {
80 var op EmptyOp = EmptyNoWordBoundary
81 var boundary byte
82 switch {
83 case IsWordChar(r1):
84 boundary = 1
85 case r1 == '\n':
86 op |= EmptyBeginLine
87 case r1 < 0:
88 op |= EmptyBeginText | EmptyBeginLine
90 switch {
91 case IsWordChar(r2):
92 boundary ^= 1
93 case r2 == '\n':
94 op |= EmptyEndLine
95 case r2 < 0:
96 op |= EmptyEndText | EmptyEndLine
98 if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
99 op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
101 return op
104 // IsWordChar reports whether r is consider a ``word character''
105 // during the evaluation of the \b and \B zero-width assertions.
106 // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
107 func IsWordChar(r rune) bool {
108 return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
111 // An Inst is a single instruction in a regular expression program.
112 type Inst struct {
113 Op InstOp
114 Out uint32 // all but InstMatch, InstFail
115 Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
116 Rune []rune
119 func (p *Prog) String() string {
120 var b bytes.Buffer
121 dumpProg(&b, p)
122 return b.String()
125 // skipNop follows any no-op or capturing instructions
126 // and returns the resulting pc.
127 func (p *Prog) skipNop(pc uint32) (*Inst, uint32) {
128 i := &p.Inst[pc]
129 for i.Op == InstNop || i.Op == InstCapture {
130 pc = i.Out
131 i = &p.Inst[pc]
133 return i, pc
136 // op returns i.Op but merges all the Rune special cases into InstRune
137 func (i *Inst) op() InstOp {
138 op := i.Op
139 switch op {
140 case InstRune1, InstRuneAny, InstRuneAnyNotNL:
141 op = InstRune
143 return op
146 // Prefix returns a literal string that all matches for the
147 // regexp must start with. Complete is true if the prefix
148 // is the entire match.
149 func (p *Prog) Prefix() (prefix string, complete bool) {
150 i, _ := p.skipNop(uint32(p.Start))
152 // Avoid allocation of buffer if prefix is empty.
153 if i.op() != InstRune || len(i.Rune) != 1 {
154 return "", i.Op == InstMatch
157 // Have prefix; gather characters.
158 var buf bytes.Buffer
159 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
160 buf.WriteRune(i.Rune[0])
161 i, _ = p.skipNop(i.Out)
163 return buf.String(), i.Op == InstMatch
166 // StartCond returns the leading empty-width conditions that must
167 // be true in any match. It returns ^EmptyOp(0) if no matches are possible.
168 func (p *Prog) StartCond() EmptyOp {
169 var flag EmptyOp
170 pc := uint32(p.Start)
171 i := &p.Inst[pc]
172 Loop:
173 for {
174 switch i.Op {
175 case InstEmptyWidth:
176 flag |= EmptyOp(i.Arg)
177 case InstFail:
178 return ^EmptyOp(0)
179 case InstCapture, InstNop:
180 // skip
181 default:
182 break Loop
184 pc = i.Out
185 i = &p.Inst[pc]
187 return flag
190 const noMatch = -1
192 // MatchRune returns true if the instruction matches (and consumes) r.
193 // It should only be called when i.Op == InstRune.
194 func (i *Inst) MatchRune(r rune) bool {
195 return i.MatchRunePos(r) != noMatch
198 // MatchRunePos checks whether the instruction matches (and consumes) r.
199 // If so, MatchRunePos returns the index of the matching rune pair
200 // (or, when len(i.Rune) == 1, rune singleton).
201 // If not, MatchRunePos returns -1.
202 // MatchRunePos should only be called when i.Op == InstRune.
203 func (i *Inst) MatchRunePos(r rune) int {
204 rune := i.Rune
206 // Special case: single-rune slice is from literal string, not char class.
207 if len(rune) == 1 {
208 r0 := rune[0]
209 if r == r0 {
210 return 0
212 if Flags(i.Arg)&FoldCase != 0 {
213 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
214 if r == r1 {
215 return 0
219 return noMatch
222 // Peek at the first few pairs.
223 // Should handle ASCII well.
224 for j := 0; j < len(rune) && j <= 8; j += 2 {
225 if r < rune[j] {
226 return noMatch
228 if r <= rune[j+1] {
229 return j / 2
233 // Otherwise binary search.
234 lo := 0
235 hi := len(rune) / 2
236 for lo < hi {
237 m := lo + (hi-lo)/2
238 if c := rune[2*m]; c <= r {
239 if r <= rune[2*m+1] {
240 return m
242 lo = m + 1
243 } else {
244 hi = m
247 return noMatch
250 // As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
251 // Since we act on runes, it would be easy to support Unicode here.
252 func wordRune(r rune) bool {
253 return r == '_' ||
254 ('A' <= r && r <= 'Z') ||
255 ('a' <= r && r <= 'z') ||
256 ('0' <= r && r <= '9')
259 // MatchEmptyWidth returns true if the instruction matches
260 // an empty string between the runes before and after.
261 // It should only be called when i.Op == InstEmptyWidth.
262 func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
263 switch EmptyOp(i.Arg) {
264 case EmptyBeginLine:
265 return before == '\n' || before == -1
266 case EmptyEndLine:
267 return after == '\n' || after == -1
268 case EmptyBeginText:
269 return before == -1
270 case EmptyEndText:
271 return after == -1
272 case EmptyWordBoundary:
273 return wordRune(before) != wordRune(after)
274 case EmptyNoWordBoundary:
275 return wordRune(before) == wordRune(after)
277 panic("unknown empty width arg")
280 func (i *Inst) String() string {
281 var b bytes.Buffer
282 dumpInst(&b, i)
283 return b.String()
286 func bw(b *bytes.Buffer, args ...string) {
287 for _, s := range args {
288 b.WriteString(s)
292 func dumpProg(b *bytes.Buffer, p *Prog) {
293 for j := range p.Inst {
294 i := &p.Inst[j]
295 pc := strconv.Itoa(j)
296 if len(pc) < 3 {
297 b.WriteString(" "[len(pc):])
299 if j == p.Start {
300 pc += "*"
302 bw(b, pc, "\t")
303 dumpInst(b, i)
304 bw(b, "\n")
308 func u32(i uint32) string {
309 return strconv.FormatUint(uint64(i), 10)
312 func dumpInst(b *bytes.Buffer, i *Inst) {
313 switch i.Op {
314 case InstAlt:
315 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
316 case InstAltMatch:
317 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
318 case InstCapture:
319 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
320 case InstEmptyWidth:
321 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
322 case InstMatch:
323 bw(b, "match")
324 case InstFail:
325 bw(b, "fail")
326 case InstNop:
327 bw(b, "nop -> ", u32(i.Out))
328 case InstRune:
329 if i.Rune == nil {
330 // shouldn't happen
331 bw(b, "rune <nil>")
333 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
334 if Flags(i.Arg)&FoldCase != 0 {
335 bw(b, "/i")
337 bw(b, " -> ", u32(i.Out))
338 case InstRune1:
339 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
340 case InstRuneAny:
341 bw(b, "any -> ", u32(i.Out))
342 case InstRuneAnyNotNL:
343 bw(b, "anynotnl -> ", u32(i.Out))