1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
18 "github.com/google/codesearch/sparse"
21 // A matcher holds the state for running regular expression search.
23 prog
*syntax
.Prog
// compiled program
24 dstate
map[string]*dstate
// dstate cache
25 start
*dstate
// start state
26 startLine
*dstate
// start state for beginning of line
27 z1
, z2 nstate
// two temporary nstates
30 // An nstate corresponds to an NFA state.
32 q sparse
.Set
// queue of program instructions
33 partial rune
// partially decoded rune (TODO)
34 flag flags
// flags (TODO)
37 // The flags record state about a position between bytes in the text.
41 flagBOL flags
= 1 << iota // beginning of line
42 flagEOL
// end of line
43 flagBOT
// beginning of text
44 flagEOT
// end of text
45 flagWord
// last byte was word byte
48 // A dstate corresponds to a DFA state.
50 next
[256]*dstate
// next state, per byte
51 enc
string // encoded nstate
52 matchNL
bool // match when next byte is \n
53 matchEOT
bool // match in this state at end of text
56 func (z
*nstate
) String() string {
57 return fmt
.Sprintf("%v/%#x+%#x", z
.q
.Dense(), z
.flag
, z
.partial
)
60 // enc encodes z as a string.
61 func (z
*nstate
) enc() string {
65 n
:= binary
.PutUvarint(v
[:], uint64(z
.partial
))
66 buf
= append(buf
, v
[:n
]...)
67 n
= binary
.PutUvarint(v
[:], uint64(z
.flag
))
68 buf
= append(buf
, v
[:n
]...)
70 ids
:= make([]int, 0, len(dense
))
71 for _
, id
:= range z
.q
.Dense() {
72 ids
= append(ids
, int(id
))
75 for _
, id
:= range ids
{
76 n
:= binary
.PutUvarint(v
[:], uint64(uint32(id
)-last
))
77 buf
= append(buf
, v
[:n
]...)
83 // dec decodes the encoding s into z.
84 func (z
*nstate
) dec(s
string) {
86 i
, n
:= binary
.Uvarint(b
)
92 i
, n
= binary
.Uvarint(b
)
101 i
, n
= binary
.Uvarint(b
)
111 // dmatch is the state we're in when we've seen a match and are just
112 // waiting for the end of the line.
121 for i
:= range dmatch
.next
{
123 dmatch
.next
[i
] = &dmatch
128 // init initializes the matcher.
129 func (m
*matcher
) init(prog
*syntax
.Prog
) error
{
131 m
.dstate
= make(map[string]*dstate
)
133 m
.z1
.q
.Init(uint32(len(prog
.Inst
)))
134 m
.z2
.q
.Init(uint32(len(prog
.Inst
)))
136 m
.addq(&m
.z1
.q
, uint32(prog
.Start
), syntax
.EmptyBeginLine|syntax
.EmptyBeginText
)
137 m
.z1
.flag
= flagBOL | flagBOT
138 m
.start
= m
.cache(&m
.z1
)
141 m
.addq(&m
.z1
.q
, uint32(prog
.Start
), syntax
.EmptyBeginLine
)
143 m
.startLine
= m
.cache(&m
.z1
)
148 // stepEmpty steps runq to nextq expanding according to flag.
149 func (m
*matcher
) stepEmpty(runq
, nextq
*sparse
.Set
, flag syntax
.EmptyOp
) {
151 for _
, id
:= range runq
.Dense() {
152 m
.addq(nextq
, id
, flag
)
156 // stepByte steps runq to nextq consuming c and then expanding according to flag.
157 // It returns true if a match ends immediately before c.
158 // c is either an input byte or endText.
159 func (m
*matcher
) stepByte(runq
, nextq
*sparse
.Set
, c
int, flag syntax
.EmptyOp
) (match
bool) {
161 m
.addq(nextq
, uint32(m
.prog
.Start
), flag
)
162 for _
, id
:= range runq
.Dense() {
163 i
:= &m
.prog
.Inst
[id
]
167 case syntax
.InstMatch
:
174 lo
:= int((i
.Arg
>> 8) & 0xFF)
175 hi
:= int(i
.Arg
& 0xFF)
177 if i
.Arg
&argFold
!= 0 && 'a' <= ch
&& ch
<= 'z' {
180 if lo
<= ch
&& ch
<= hi
{
181 m
.addq(nextq
, i
.Out
, flag
)
188 // addq adds id to the queue, expanding according to flag.
189 func (m
*matcher
) addq(q
*sparse
.Set
, id
uint32, flag syntax
.EmptyOp
) {
194 i
:= &m
.prog
.Inst
[id
]
196 case syntax
.InstCapture
, syntax
.InstNop
:
197 m
.addq(q
, i
.Out
, flag
)
198 case syntax
.InstAlt
, syntax
.InstAltMatch
:
199 m
.addq(q
, i
.Out
, flag
)
200 m
.addq(q
, i
.Arg
, flag
)
201 case syntax
.InstEmptyWidth
:
202 if syntax
.EmptyOp(i
.Arg
)&^flag
== 0 {
203 m
.addq(q
, i
.Out
, flag
)
210 // computeNext computes the next DFA state if we're in d reading c (an input byte or endText).
211 func (m
*matcher
) computeNext(d
*dstate
, c
int) *dstate
{
212 this
, next
:= &m
.z1
, &m
.z2
215 // compute flags in effect before c
216 flag
:= syntax
.EmptyOp(0)
217 if this
.flag
&flagBOL
!= 0 {
218 flag |
= syntax
.EmptyBeginLine
220 if this
.flag
&flagBOT
!= 0 {
221 flag |
= syntax
.EmptyBeginText
223 if this
.flag
&flagWord
!= 0 {
225 flag |
= syntax
.EmptyWordBoundary
227 flag |
= syntax
.EmptyNoWordBoundary
231 flag |
= syntax
.EmptyWordBoundary
233 flag |
= syntax
.EmptyNoWordBoundary
237 flag |
= syntax
.EmptyEndLine
240 flag |
= syntax
.EmptyEndLine | syntax
.EmptyEndText
243 // re-expand queue using new flags.
244 // TODO: only do this when it matters
245 // (something is gating on word boundaries).
246 m
.stepEmpty(&this
.q
, &next
.q
, flag
)
247 this
, next
= next
, this
249 // now compute flags after c.
253 flag |
= syntax
.EmptyBeginLine
257 next
.flag |
= flagWord
260 // re-add start, process rune + expand according to flags.
261 if m
.stepByte(&this
.q
, &next
.q
, c
, flag
) {
267 func (m
*matcher
) cache(z
*nstate
) *dstate
{
274 d
= &dstate
{enc
: enc
}
276 d
.matchNL
= m
.computeNext(d
, '\n') == &dmatch
277 d
.matchEOT
= m
.computeNext(d
, endText
) == &dmatch
281 func (m
*matcher
) match(b
[]byte, beginText
, endText
bool) (end
int) {
282 // fmt.Printf("%v\n", m.prog)
289 // fmt.Printf("%v (%v)\n", &m.z1, d==&dmatch)
290 for i
, c
:= range b
{
299 d1
= m
.computeNext(d
, int(c
))
305 // fmt.Printf("%#U: %v (%v, %v, %v)\n", c, &m.z1, d==&dmatch, d.matchNL, d.matchEOT)
307 if d
.matchNL || endText
&& d
.matchEOT
{
313 func (m
*matcher
) matchString(b
string, beginText
, endText
bool) (end
int) {
318 for i
:= 0; i
< len(b
); i
++ {
328 d1
= m
.computeNext(d
, int(c
))
334 if d
.matchNL || endText
&& d
.matchEOT
{
340 // isWordByte reports whether the byte c is a word character: ASCII only.
341 // This is used to implement \b and \B. This is not right for Unicode, but:
342 // - it's hard to get right in a byte-at-a-time matching world
343 // (the DFA has only one-byte lookahead)
344 // - this crude approximation is the same one PCRE uses
345 func isWordByte(c
int) bool {
346 return 'A' <= c
&& c
<= 'Z' ||
347 'a' <= c
&& c
<= 'z' ||
348 '0' <= c
&& c
<= '9' ||
354 Regexp
*Regexp
// regexp to search for
355 Stdout io
.Writer
// output target
356 Stderr io
.Writer
// error target
358 L
bool // L flag - print file names only
359 C
bool // C flag - print count of matches
360 N
bool // N flag - print line numbers
361 H
bool // H flag - do not print file names
368 func (g
*Grep
) AddFlags() {
369 flag
.BoolVar(&g
.L
, "l", false, "list matching files only")
370 flag
.BoolVar(&g
.C
, "c", false, "print match counts only")
371 flag
.BoolVar(&g
.N
, "n", false, "show line numbers")
372 flag
.BoolVar(&g
.H
, "h", false, "omit file names")
375 func (g
*Grep
) File(name
string) []Match
{
376 f
, err
:= os
.Open(name
)
378 fmt
.Fprintf(g
.Stderr
, "%s\n", err
)
382 return g
.Reader(f
, name
)
385 var nl
= []byte{'\n'}
387 func countNL(b
[]byte) int {
390 i
:= bytes
.IndexByte(b
, '\n')
404 // contents of line (Line - 2)
406 // contents of line (Line - 1)
408 // XXX: The following will most likely change after we figure out how
409 // to highlight the found text properly :).
411 // contents of line (Line + 1)
413 // contents of line (Line + 2)
416 // This will be filled in by the source backend
421 func (g
*Grep
) Reader(r io
.Reader
, name
string) []Match
{
425 g
.buf
= make([]byte, 1<<20)
438 n
, err
:= io
.ReadFull(r
, buf
[len(buf
):cap(buf
)])
439 buf
= buf
[:len(buf
)+n
]
442 // We only look at complete lines
443 end
= bytes
.LastIndex(buf
, nl
) + 1
449 //fmt.Printf("need to add %d context lines to the last match\n", needContext)
451 lineEnd
:= bytes
.Index(buf
[:end
], nl
)
453 result
[len(result
)-1].Ctxn1
= string(buf
[:lineEnd
])
454 //fmt.Printf("afterwards: ctxn1 = *%s*\n", result[len(result)-1].Ctxn1)
456 nextLineEnd
:= bytes
.Index(buf
[lineEnd
+1:end
], nl
)
457 if nextLineEnd
!= -1 {
458 result
[len(result
)-1].Ctxn2
= string(buf
[lineEnd
+1 : lineEnd
+1+nextLineEnd
])
459 //fmt.Printf("afterwards: ctxn2 = *%s*\n", result[len(result)-1].Ctxn2)
466 //fmt.Printf("looking at line *%s*\n", buf[0:end])
467 for chunkStart
< end
{
468 m1
:= g
.Regexp
.Match(buf
[chunkStart
:end
], beginText
, endText
) + chunkStart
474 lineStart
:= bytes
.LastIndex(buf
[chunkStart
:m1
], nl
) + 1 + chunkStart
479 //fmt.Printf("matching line: %s", buf[lineStart:lineEnd])
481 lineno
+= countNL(buf
[chunkStart
:lineStart
])
482 line
:= html
.EscapeString(string(buf
[lineStart
: lineEnd
-1]))
486 Context
: string(line
),
488 // Let’s find the previous two lines, if possible.
489 bufLineNo
= countNL(buf
[:lineStart
])
491 prev1Start
:= bytes
.LastIndex(buf
[:lineStart
-1], nl
) + 1
492 match
.Ctxp1
= html
.EscapeString(string(buf
[prev1Start
: lineStart
-1]))
494 prev2Start
:= bytes
.LastIndex(buf
[:prev1Start
-1], nl
) + 1
495 match
.Ctxp2
= html
.EscapeString(string(buf
[prev2Start
: prev1Start
-1]))
505 //fmt.Printf("lineEnd = %d, end = %d\n", lineEnd, end)
506 next1Start
:= bytes
.Index(buf
[lineEnd
:end
], nl
)
507 //fmt.Printf("next1Start = %d\n", next1Start)
508 if next1Start
!= -1 {
509 next1Start
= next1Start
+ lineEnd
+ 1
510 match
.Ctxn1
= html
.EscapeString(string(buf
[lineEnd
: next1Start
-1]))
511 if next1Start
< end
{
512 next2Start
:= bytes
.Index(buf
[next1Start
:end
], nl
)
513 if next2Start
!= -1 {
514 match
.Ctxn2
= html
.EscapeString(string(buf
[next1Start
: next1Start
+next2Start
]))
523 //fmt.Printf("ctxn1 = *%s*\n", match.Ctxn1)
524 //fmt.Printf("ctxn2 = *%s*\n", match.Ctxn2)
525 result
= append(result
, match
)
530 lineno
+= countNL(buf
[chunkStart
:end
])
533 // We are about to read again, so let’s store the last two lines in
534 // case the next match needs them.
536 // This could be because there was no match and we didn’t count, so
537 // make sure we count.
538 bufLineNo
= countNL(buf
[:end
])
539 //fmt.Printf("lineno = %d\n", bufLineNo)
542 prev1Start
:= bytes
.LastIndex(buf
[:end
-1], nl
) + 1
543 lastp1
= html
.EscapeString(string(buf
[prev1Start
: end
-1]))
545 prev2Start
:= bytes
.LastIndex(buf
[:prev1Start
-1], nl
) + 1
546 lastp2
= html
.EscapeString(string(buf
[prev2Start
: prev1Start
-1]))
550 // Copy the remaining elements to the front (everything after the next newline)
551 n
= copy(buf
, buf
[end
:])
553 if len(buf
) == 0 && err
!= nil {
554 if err
!= io
.EOF
&& err
!= io
.ErrUnexpectedEOF
{
555 fmt
.Fprintf(g
.Stderr
, "%s: %v\n", name
, err
)