Serve package list as plain text by default
[debiancodesearch.git] / regexp / match.go
blobda0ff0bffe5c44c7a4e999b8ca6f8a5f7db56aa3
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package regexp
7 import (
8 "bytes"
9 "encoding/binary"
10 "flag"
11 "fmt"
12 "html"
13 "io"
14 "os"
15 "regexp/syntax"
16 "sort"
18 "github.com/google/codesearch/sparse"
21 // A matcher holds the state for running regular expression search.
22 type matcher struct {
23 prog *syntax.Prog // compiled program
24 dstate map[string]*dstate // dstate cache
25 start *dstate // start state
26 startLine *dstate // start state for beginning of line
27 z1, z2 nstate // two temporary nstates
30 // An nstate corresponds to an NFA state.
31 type nstate struct {
32 q sparse.Set // queue of program instructions
33 partial rune // partially decoded rune (TODO)
34 flag flags // flags (TODO)
37 // The flags record state about a position between bytes in the text.
38 type flags uint32
40 const (
41 flagBOL flags = 1 << iota // beginning of line
42 flagEOL // end of line
43 flagBOT // beginning of text
44 flagEOT // end of text
45 flagWord // last byte was word byte
48 // A dstate corresponds to a DFA state.
49 type dstate struct {
50 next [256]*dstate // next state, per byte
51 enc string // encoded nstate
52 matchNL bool // match when next byte is \n
53 matchEOT bool // match in this state at end of text
56 func (z *nstate) String() string {
57 return fmt.Sprintf("%v/%#x+%#x", z.q.Dense(), z.flag, z.partial)
60 // enc encodes z as a string.
61 func (z *nstate) enc() string {
62 var buf []byte
63 var v [10]byte
64 last := ^uint32(0)
65 n := binary.PutUvarint(v[:], uint64(z.partial))
66 buf = append(buf, v[:n]...)
67 n = binary.PutUvarint(v[:], uint64(z.flag))
68 buf = append(buf, v[:n]...)
69 dense := z.q.Dense()
70 ids := make([]int, 0, len(dense))
71 for _, id := range z.q.Dense() {
72 ids = append(ids, int(id))
74 sort.Ints(ids)
75 for _, id := range ids {
76 n := binary.PutUvarint(v[:], uint64(uint32(id)-last))
77 buf = append(buf, v[:n]...)
78 last = uint32(id)
80 return string(buf)
83 // dec decodes the encoding s into z.
84 func (z *nstate) dec(s string) {
85 b := []byte(s)
86 i, n := binary.Uvarint(b)
87 if n <= 0 {
88 bug()
90 b = b[n:]
91 z.partial = rune(i)
92 i, n = binary.Uvarint(b)
93 if n <= 0 {
94 bug()
96 b = b[n:]
97 z.flag = flags(i)
98 z.q.Reset()
99 last := ^uint32(0)
100 for len(b) > 0 {
101 i, n = binary.Uvarint(b)
102 if n <= 0 {
103 bug()
105 b = b[n:]
106 last += uint32(i)
107 z.q.Add(last)
111 // dmatch is the state we're in when we've seen a match and are just
112 // waiting for the end of the line.
113 var dmatch = dstate{
114 matchNL: true,
115 matchEOT: true,
118 func init() {
119 var z nstate
120 dmatch.enc = z.enc()
121 for i := range dmatch.next {
122 if i != '\n' {
123 dmatch.next[i] = &dmatch
128 // init initializes the matcher.
129 func (m *matcher) init(prog *syntax.Prog) error {
130 m.prog = prog
131 m.dstate = make(map[string]*dstate)
133 m.z1.q.Init(uint32(len(prog.Inst)))
134 m.z2.q.Init(uint32(len(prog.Inst)))
136 m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine|syntax.EmptyBeginText)
137 m.z1.flag = flagBOL | flagBOT
138 m.start = m.cache(&m.z1)
140 m.z1.q.Reset()
141 m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine)
142 m.z1.flag = flagBOL
143 m.startLine = m.cache(&m.z1)
145 return nil
148 // stepEmpty steps runq to nextq expanding according to flag.
149 func (m *matcher) stepEmpty(runq, nextq *sparse.Set, flag syntax.EmptyOp) {
150 nextq.Reset()
151 for _, id := range runq.Dense() {
152 m.addq(nextq, id, flag)
156 // stepByte steps runq to nextq consuming c and then expanding according to flag.
157 // It returns true if a match ends immediately before c.
158 // c is either an input byte or endText.
159 func (m *matcher) stepByte(runq, nextq *sparse.Set, c int, flag syntax.EmptyOp) (match bool) {
160 nextq.Reset()
161 m.addq(nextq, uint32(m.prog.Start), flag)
162 for _, id := range runq.Dense() {
163 i := &m.prog.Inst[id]
164 switch i.Op {
165 default:
166 continue
167 case syntax.InstMatch:
168 match = true
169 continue
170 case instByteRange:
171 if c == endText {
172 break
174 lo := int((i.Arg >> 8) & 0xFF)
175 hi := int(i.Arg & 0xFF)
176 ch := c
177 if i.Arg&argFold != 0 && 'a' <= ch && ch <= 'z' {
178 ch += 'A' - 'a'
180 if lo <= ch && ch <= hi {
181 m.addq(nextq, i.Out, flag)
185 return
188 // addq adds id to the queue, expanding according to flag.
189 func (m *matcher) addq(q *sparse.Set, id uint32, flag syntax.EmptyOp) {
190 if q.Has(id) {
191 return
193 q.Add(id)
194 i := &m.prog.Inst[id]
195 switch i.Op {
196 case syntax.InstCapture, syntax.InstNop:
197 m.addq(q, i.Out, flag)
198 case syntax.InstAlt, syntax.InstAltMatch:
199 m.addq(q, i.Out, flag)
200 m.addq(q, i.Arg, flag)
201 case syntax.InstEmptyWidth:
202 if syntax.EmptyOp(i.Arg)&^flag == 0 {
203 m.addq(q, i.Out, flag)
208 const endText = -1
210 // computeNext computes the next DFA state if we're in d reading c (an input byte or endText).
211 func (m *matcher) computeNext(d *dstate, c int) *dstate {
212 this, next := &m.z1, &m.z2
213 this.dec(d.enc)
215 // compute flags in effect before c
216 flag := syntax.EmptyOp(0)
217 if this.flag&flagBOL != 0 {
218 flag |= syntax.EmptyBeginLine
220 if this.flag&flagBOT != 0 {
221 flag |= syntax.EmptyBeginText
223 if this.flag&flagWord != 0 {
224 if !isWordByte(c) {
225 flag |= syntax.EmptyWordBoundary
226 } else {
227 flag |= syntax.EmptyNoWordBoundary
229 } else {
230 if isWordByte(c) {
231 flag |= syntax.EmptyWordBoundary
232 } else {
233 flag |= syntax.EmptyNoWordBoundary
236 if c == '\n' {
237 flag |= syntax.EmptyEndLine
239 if c == endText {
240 flag |= syntax.EmptyEndLine | syntax.EmptyEndText
243 // re-expand queue using new flags.
244 // TODO: only do this when it matters
245 // (something is gating on word boundaries).
246 m.stepEmpty(&this.q, &next.q, flag)
247 this, next = next, this
249 // now compute flags after c.
250 flag = 0
251 next.flag = 0
252 if c == '\n' {
253 flag |= syntax.EmptyBeginLine
254 next.flag |= flagBOL
256 if isWordByte(c) {
257 next.flag |= flagWord
260 // re-add start, process rune + expand according to flags.
261 if m.stepByte(&this.q, &next.q, c, flag) {
262 return &dmatch
264 return m.cache(next)
267 func (m *matcher) cache(z *nstate) *dstate {
268 enc := z.enc()
269 d := m.dstate[enc]
270 if d != nil {
271 return d
274 d = &dstate{enc: enc}
275 m.dstate[enc] = d
276 d.matchNL = m.computeNext(d, '\n') == &dmatch
277 d.matchEOT = m.computeNext(d, endText) == &dmatch
278 return d
281 func (m *matcher) match(b []byte, beginText, endText bool) (end int) {
282 // fmt.Printf("%v\n", m.prog)
284 d := m.startLine
285 if beginText {
286 d = m.start
288 // m.z1.dec(d.enc)
289 // fmt.Printf("%v (%v)\n", &m.z1, d==&dmatch)
290 for i, c := range b {
291 d1 := d.next[c]
292 if d1 == nil {
293 if c == '\n' {
294 if d.matchNL {
295 return i
297 d1 = m.startLine
298 } else {
299 d1 = m.computeNext(d, int(c))
301 d.next[c] = d1
303 d = d1
304 // m.z1.dec(d.enc)
305 // fmt.Printf("%#U: %v (%v, %v, %v)\n", c, &m.z1, d==&dmatch, d.matchNL, d.matchEOT)
307 if d.matchNL || endText && d.matchEOT {
308 return len(b)
310 return -1
313 func (m *matcher) matchString(b string, beginText, endText bool) (end int) {
314 d := m.startLine
315 if beginText {
316 d = m.start
318 for i := 0; i < len(b); i++ {
319 c := b[i]
320 d1 := d.next[c]
321 if d1 == nil {
322 if c == '\n' {
323 if d.matchNL {
324 return i
326 d1 = m.startLine
327 } else {
328 d1 = m.computeNext(d, int(c))
330 d.next[c] = d1
332 d = d1
334 if d.matchNL || endText && d.matchEOT {
335 return len(b)
337 return -1
340 // isWordByte reports whether the byte c is a word character: ASCII only.
341 // This is used to implement \b and \B. This is not right for Unicode, but:
342 // - it's hard to get right in a byte-at-a-time matching world
343 // (the DFA has only one-byte lookahead)
344 // - this crude approximation is the same one PCRE uses
345 func isWordByte(c int) bool {
346 return 'A' <= c && c <= 'Z' ||
347 'a' <= c && c <= 'z' ||
348 '0' <= c && c <= '9' ||
349 c == '_'
352 // TODO:
353 type Grep struct {
354 Regexp *Regexp // regexp to search for
355 Stdout io.Writer // output target
356 Stderr io.Writer // error target
358 L bool // L flag - print file names only
359 C bool // C flag - print count of matches
360 N bool // N flag - print line numbers
361 H bool // H flag - do not print file names
363 Match bool
365 buf []byte
368 func (g *Grep) AddFlags() {
369 flag.BoolVar(&g.L, "l", false, "list matching files only")
370 flag.BoolVar(&g.C, "c", false, "print match counts only")
371 flag.BoolVar(&g.N, "n", false, "show line numbers")
372 flag.BoolVar(&g.H, "h", false, "omit file names")
375 func (g *Grep) File(name string) []Match {
376 f, err := os.Open(name)
377 if err != nil {
378 fmt.Fprintf(g.Stderr, "%s\n", err)
379 return []Match{}
381 defer f.Close()
382 return g.Reader(f, name)
385 var nl = []byte{'\n'}
387 func countNL(b []byte) int {
388 n := 0
389 for {
390 i := bytes.IndexByte(b, '\n')
391 if i < 0 {
392 break
395 b = b[i+1:]
397 return n
400 type Match struct {
401 Path string
402 Line int
404 // contents of line (Line - 2)
405 Ctxp2 string
406 // contents of line (Line - 1)
407 Ctxp1 string
408 // XXX: The following will most likely change after we figure out how
409 // to highlight the found text properly :).
410 Context string
411 // contents of line (Line + 1)
412 Ctxn1 string
413 // contents of line (Line + 2)
414 Ctxn2 string
416 // This will be filled in by the source backend
417 PathRank float32
418 Ranking float32
421 func (g *Grep) Reader(r io.Reader, name string) []Match {
422 var result []Match
423 if g.buf == nil {
424 // 1024KB
425 g.buf = make([]byte, 1<<20)
427 var (
428 buf = g.buf[:0]
429 lineno = 1
430 bufLineNo = 0
431 beginText = true
432 endText = false
433 needContext = 0
434 lastp1 = ""
435 lastp2 = ""
437 for {
438 n, err := io.ReadFull(r, buf[len(buf):cap(buf)])
439 buf = buf[:len(buf)+n]
440 end := len(buf)
441 if err == nil {
442 // We only look at complete lines
443 end = bytes.LastIndex(buf, nl) + 1
444 } else {
445 endText = true
447 chunkStart := 0
448 bufLineNo = 0
449 //fmt.Printf("need to add %d context lines to the last match\n", needContext)
450 if needContext > 0 {
451 lineEnd := bytes.Index(buf[:end], nl)
452 if lineEnd != -1 {
453 result[len(result)-1].Ctxn1 = string(buf[:lineEnd])
454 //fmt.Printf("afterwards: ctxn1 = *%s*\n", result[len(result)-1].Ctxn1)
455 if needContext > 1 {
456 nextLineEnd := bytes.Index(buf[lineEnd+1:end], nl)
457 if nextLineEnd != -1 {
458 result[len(result)-1].Ctxn2 = string(buf[lineEnd+1 : lineEnd+1+nextLineEnd])
459 //fmt.Printf("afterwards: ctxn2 = *%s*\n", result[len(result)-1].Ctxn2)
463 needContext = 0
466 //fmt.Printf("looking at line *%s*\n", buf[0:end])
467 for chunkStart < end {
468 m1 := g.Regexp.Match(buf[chunkStart:end], beginText, endText) + chunkStart
469 beginText = false
470 if m1 < chunkStart {
471 break
473 g.Match = true
474 lineStart := bytes.LastIndex(buf[chunkStart:m1], nl) + 1 + chunkStart
475 lineEnd := m1 + 1
476 if lineEnd > end {
477 lineEnd = end
479 //fmt.Printf("matching line: %s", buf[lineStart:lineEnd])
481 lineno += countNL(buf[chunkStart:lineStart])
482 line := html.EscapeString(string(buf[lineStart : lineEnd-1]))
483 match := Match{
484 Path: name,
485 Line: lineno,
486 Context: string(line),
488 // Let’s find the previous two lines, if possible.
489 bufLineNo = countNL(buf[:lineStart])
490 if bufLineNo >= 1 {
491 prev1Start := bytes.LastIndex(buf[:lineStart-1], nl) + 1
492 match.Ctxp1 = html.EscapeString(string(buf[prev1Start : lineStart-1]))
493 if bufLineNo >= 2 {
494 prev2Start := bytes.LastIndex(buf[:prev1Start-1], nl) + 1
495 match.Ctxp2 = html.EscapeString(string(buf[prev2Start : prev1Start-1]))
496 } else {
497 match.Ctxp2 = lastp1
499 } else {
500 match.Ctxp1 = lastp1
501 match.Ctxp2 = lastp2
503 needContext = 0
504 if lineEnd < end {
505 //fmt.Printf("lineEnd = %d, end = %d\n", lineEnd, end)
506 next1Start := bytes.Index(buf[lineEnd:end], nl)
507 //fmt.Printf("next1Start = %d\n", next1Start)
508 if next1Start != -1 {
509 next1Start = next1Start + lineEnd + 1
510 match.Ctxn1 = html.EscapeString(string(buf[lineEnd : next1Start-1]))
511 if next1Start < end {
512 next2Start := bytes.Index(buf[next1Start:end], nl)
513 if next2Start != -1 {
514 match.Ctxn2 = html.EscapeString(string(buf[next1Start : next1Start+next2Start]))
516 } else {
517 needContext = 1
520 } else {
521 needContext = 2
523 //fmt.Printf("ctxn1 = *%s*\n", match.Ctxn1)
524 //fmt.Printf("ctxn2 = *%s*\n", match.Ctxn2)
525 result = append(result, match)
526 lineno++
527 chunkStart = lineEnd
529 if err == nil {
530 lineno += countNL(buf[chunkStart:end])
533 // We are about to read again, so let’s store the last two lines in
534 // case the next match needs them.
535 if bufLineNo == 0 {
536 // This could be because there was no match and we didn’t count, so
537 // make sure we count.
538 bufLineNo = countNL(buf[:end])
539 //fmt.Printf("lineno = %d\n", bufLineNo)
541 if bufLineNo > 1 {
542 prev1Start := bytes.LastIndex(buf[:end-1], nl) + 1
543 lastp1 = html.EscapeString(string(buf[prev1Start : end-1]))
544 if bufLineNo > 2 {
545 prev2Start := bytes.LastIndex(buf[:prev1Start-1], nl) + 1
546 lastp2 = html.EscapeString(string(buf[prev2Start : prev1Start-1]))
550 // Copy the remaining elements to the front (everything after the next newline)
551 n = copy(buf, buf[end:])
552 buf = buf[:n]
553 if len(buf) == 0 && err != nil {
554 if err != io.EOF && err != io.ErrUnexpectedEOF {
555 fmt.Fprintf(g.Stderr, "%s: %v\n", name, err)
557 break
560 return result