regexp/match.go

   1 // Copyright 2011 The Go Authors.  All rights reserved.
   2 // Use of this source code is governed by a BSD-style
   3 // license that can be found in the LICENSE file.
   4
   5 package regexp
   6
   7 import (
   8         "bytes"
   9         "encoding/binary"
  10         "flag"
  11         "fmt"
  12         "html"
  13         "io"
  14         "os"
  15         "regexp/syntax"
  16         "sort"
  17
  18         "github.com/google/codesearch/sparse"
  19 )
  20
  21 // A matcher holds the state for running regular expression search.
  22 type matcher struct {
  23         prog      *syntax.Prog       // compiled program
  24         dstate    map[string]*dstate // dstate cache
  25         start     *dstate            // start state
  26         startLine *dstate            // start state for beginning of line
  27         z1, z2    nstate             // two temporary nstates
  28 }
  29
  30 // An nstate corresponds to an NFA state.
  31 type nstate struct {
  32         q       sparse.Set // queue of program instructions
  33         partial rune       // partially decoded rune (TODO)
  34         flag    flags      // flags (TODO)
  35 }
  36
  37 // The flags record state about a position between bytes in the text.
  38 type flags uint32
  39
  40 const (
  41         flagBOL  flags = 1 << iota // beginning of line
  42         flagEOL                    // end of line
  43         flagBOT                    // beginning of text
  44         flagEOT                    // end of text
  45         flagWord                   // last byte was word byte
  46 )
  47
  48 // A dstate corresponds to a DFA state.
  49 type dstate struct {
  50         next     [256]*dstate // next state, per byte
  51         enc      string       // encoded nstate
  52         matchNL  bool         // match when next byte is \n
  53         matchEOT bool         // match in this state at end of text
  54 }
  55
  56 func (z *nstate) String() string {
  57         return fmt.Sprintf("%v/%#x+%#x", z.q.Dense(), z.flag, z.partial)
  58 }
  59
  60 // enc encodes z as a string.
  61 func (z *nstate) enc() string {
  62         var buf []byte
  63         var v [10]byte
  64         last := ^uint32(0)
  65         n := binary.PutUvarint(v[:], uint64(z.partial))
  66         buf = append(buf, v[:n]...)
  67         n = binary.PutUvarint(v[:], uint64(z.flag))
  68         buf = append(buf, v[:n]...)
  69         dense := z.q.Dense()
  70         ids := make([]int, 0, len(dense))
  71         for _, id := range z.q.Dense() {
  72                 ids = append(ids, int(id))
  73         }
  74         sort.Ints(ids)
  75         for _, id := range ids {
  76                 n := binary.PutUvarint(v[:], uint64(uint32(id)-last))
  77                 buf = append(buf, v[:n]...)
  78                 last = uint32(id)
  79         }
  80         return string(buf)
  81 }
  82
  83 // dec decodes the encoding s into z.
  84 func (z *nstate) dec(s string) {
  85         b := []byte(s)
  86         i, n := binary.Uvarint(b)
  87         if n <= 0 {
  88                 bug()
  89         }
  90         b = b[n:]
  91         z.partial = rune(i)
  92         i, n = binary.Uvarint(b)
  93         if n <= 0 {
  94                 bug()
  95         }
  96         b = b[n:]
  97         z.flag = flags(i)
  98         z.q.Reset()
  99         last := ^uint32(0)
 100         for len(b) > 0 {
 101                 i, n = binary.Uvarint(b)
 102                 if n <= 0 {
 103                         bug()
 104                 }
 105                 b = b[n:]
 106                 last += uint32(i)
 107                 z.q.Add(last)
 108         }
 109 }
 110
 111 // dmatch is the state we're in when we've seen a match and are just
 112 // waiting for the end of the line.
 113 var dmatch = dstate{
 114         matchNL:  true,
 115         matchEOT: true,
 116 }
 117
 118 func init() {
 119         var z nstate
 120         dmatch.enc = z.enc()
 121         for i := range dmatch.next {
 122                 if i != '\n' {
 123                         dmatch.next[i] = &dmatch
 124                 }
 125         }
 126 }
 127
 128 // init initializes the matcher.
 129 func (m *matcher) init(prog *syntax.Prog) error {
 130         m.prog = prog
 131         m.dstate = make(map[string]*dstate)
 132
 133         m.z1.q.Init(uint32(len(prog.Inst)))
 134         m.z2.q.Init(uint32(len(prog.Inst)))
 135
 136         m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine|syntax.EmptyBeginText)
 137         m.z1.flag = flagBOL | flagBOT
 138         m.start = m.cache(&m.z1)
 139
 140         m.z1.q.Reset()
 141         m.addq(&m.z1.q, uint32(prog.Start), syntax.EmptyBeginLine)
 142         m.z1.flag = flagBOL
 143         m.startLine = m.cache(&m.z1)
 144
 145         return nil
 146 }
 147
 148 // stepEmpty steps runq to nextq expanding according to flag.
 149 func (m *matcher) stepEmpty(runq, nextq *sparse.Set, flag syntax.EmptyOp) {
 150         nextq.Reset()
 151         for _, id := range runq.Dense() {
 152                 m.addq(nextq, id, flag)
 153         }
 154 }
 155
 156 // stepByte steps runq to nextq consuming c and then expanding according to flag.
 157 // It returns true if a match ends immediately before c.
 158 // c is either an input byte or endText.
 159 func (m *matcher) stepByte(runq, nextq *sparse.Set, c int, flag syntax.EmptyOp) (match bool) {
 160         nextq.Reset()
 161         m.addq(nextq, uint32(m.prog.Start), flag)
 162         for _, id := range runq.Dense() {
 163                 i := &m.prog.Inst[id]
 164                 switch i.Op {
 165                 default:
 166                         continue
 167                 case syntax.InstMatch:
 168                         match = true
 169                         continue
 170                 case instByteRange:
 171                         if c == endText {
 172                                 break
 173                         }
 174                         lo := int((i.Arg >> 8) & 0xFF)
 175                         hi := int(i.Arg & 0xFF)
 176                         ch := c
 177                         if i.Arg&argFold != 0 && 'a' <= ch && ch <= 'z' {
 178                                 ch += 'A' - 'a'
 179                         }
 180                         if lo <= ch && ch <= hi {
 181                                 m.addq(nextq, i.Out, flag)
 182                         }
 183                 }
 184         }
 185         return
 186 }
 187
 188 // addq adds id to the queue, expanding according to flag.
 189 func (m *matcher) addq(q *sparse.Set, id uint32, flag syntax.EmptyOp) {
 190         if q.Has(id) {
 191                 return
 192         }
 193         q.Add(id)
 194         i := &m.prog.Inst[id]
 195         switch i.Op {
 196         case syntax.InstCapture, syntax.InstNop:
 197                 m.addq(q, i.Out, flag)
 198         case syntax.InstAlt, syntax.InstAltMatch:
 199                 m.addq(q, i.Out, flag)
 200                 m.addq(q, i.Arg, flag)
 201         case syntax.InstEmptyWidth:
 202                 if syntax.EmptyOp(i.Arg)&^flag == 0 {
 203                         m.addq(q, i.Out, flag)
 204                 }
 205         }
 206 }
 207
 208 const endText = -1
 209
 210 // computeNext computes the next DFA state if we're in d reading c (an input byte or endText).
 211 func (m *matcher) computeNext(d *dstate, c int) *dstate {
 212         this, next := &m.z1, &m.z2
 213         this.dec(d.enc)
 214
 215         // compute flags in effect before c
 216         flag := syntax.EmptyOp(0)
 217         if this.flag&flagBOL != 0 {
 218                 flag |= syntax.EmptyBeginLine
 219         }
 220         if this.flag&flagBOT != 0 {
 221                 flag |= syntax.EmptyBeginText
 222         }
 223         if this.flag&flagWord != 0 {
 224                 if !isWordByte(c) {
 225                         flag |= syntax.EmptyWordBoundary
 226                 } else {
 227                         flag |= syntax.EmptyNoWordBoundary
 228                 }
 229         } else {
 230                 if isWordByte(c) {
 231                         flag |= syntax.EmptyWordBoundary
 232                 } else {
 233                         flag |= syntax.EmptyNoWordBoundary
 234                 }
 235         }
 236         if c == '\n' {
 237                 flag |= syntax.EmptyEndLine
 238         }
 239         if c == endText {
 240                 flag |= syntax.EmptyEndLine | syntax.EmptyEndText
 241         }
 242
 243         // re-expand queue using new flags.
 244         // TODO: only do this when it matters
 245         // (something is gating on word boundaries).
 246         m.stepEmpty(&this.q, &next.q, flag)
 247         this, next = next, this
 248
 249         // now compute flags after c.
 250         flag = 0
 251         next.flag = 0
 252         if c == '\n' {
 253                 flag |= syntax.EmptyBeginLine
 254                 next.flag |= flagBOL
 255         }
 256         if isWordByte(c) {
 257                 next.flag |= flagWord
 258         }
 259
 260         // re-add start, process rune + expand according to flags.
 261         if m.stepByte(&this.q, &next.q, c, flag) {
 262                 return &dmatch
 263         }
 264         return m.cache(next)
 265 }
 266
 267 func (m *matcher) cache(z *nstate) *dstate {
 268         enc := z.enc()
 269         d := m.dstate[enc]
 270         if d != nil {
 271                 return d
 272         }
 273
 274         d = &dstate{enc: enc}
 275         m.dstate[enc] = d
 276         d.matchNL = m.computeNext(d, '\n') == &dmatch
 277         d.matchEOT = m.computeNext(d, endText) == &dmatch
 278         return d
 279 }
 280
 281 func (m *matcher) match(b []byte, beginText, endText bool) (end int) {
 282         //      fmt.Printf("%v\n", m.prog)
 283
 284         d := m.startLine
 285         if beginText {
 286                 d = m.start
 287         }
 288         //      m.z1.dec(d.enc)
 289         //      fmt.Printf("%v (%v)\n", &m.z1, d==&dmatch)
 290         for i, c := range b {
 291                 d1 := d.next[c]
 292                 if d1 == nil {
 293                         if c == '\n' {
 294                                 if d.matchNL {
 295                                         return i
 296                                 }
 297                                 d1 = m.startLine
 298                         } else {
 299                                 d1 = m.computeNext(d, int(c))
 300                         }
 301                         d.next[c] = d1
 302                 }
 303                 d = d1
 304                 //              m.z1.dec(d.enc)
 305                 //              fmt.Printf("%#U: %v (%v, %v, %v)\n", c, &m.z1, d==&dmatch, d.matchNL, d.matchEOT)
 306         }
 307         if d.matchNL || endText && d.matchEOT {
 308                 return len(b)
 309         }
 310         return -1
 311 }
 312
 313 func (m *matcher) matchString(b string, beginText, endText bool) (end int) {
 314         d := m.startLine
 315         if beginText {
 316                 d = m.start
 317         }
 318         for i := 0; i < len(b); i++ {
 319                 c := b[i]
 320                 d1 := d.next[c]
 321                 if d1 == nil {
 322                         if c == '\n' {
 323                                 if d.matchNL {
 324                                         return i
 325                                 }
 326                                 d1 = m.startLine
 327                         } else {
 328                                 d1 = m.computeNext(d, int(c))
 329                         }
 330                         d.next[c] = d1
 331                 }
 332                 d = d1
 333         }
 334         if d.matchNL || endText && d.matchEOT {
 335                 return len(b)
 336         }
 337         return -1
 338 }
 339
 340 // isWordByte reports whether the byte c is a word character: ASCII only.
 341 // This is used to implement \b and \B.  This is not right for Unicode, but:
 342 //      - it's hard to get right in a byte-at-a-time matching world
 343 //        (the DFA has only one-byte lookahead)
 344 //      - this crude approximation is the same one PCRE uses
 345 func isWordByte(c int) bool {
 346         return 'A' <= c && c <= 'Z' ||
 347                 'a' <= c && c <= 'z' ||
 348                 '0' <= c && c <= '9' ||
 349                 c == '_'
 350 }
 351
 352 // TODO:
 353 type Grep struct {
 354         Regexp *Regexp   // regexp to search for
 355         Stdout io.Writer // output target
 356         Stderr io.Writer // error target
 357
 358         L bool // L flag - print file names only
 359         C bool // C flag - print count of matches
 360         N bool // N flag - print line numbers
 361         H bool // H flag - do not print file names
 362
 363         Match bool
 364
 365         buf []byte
 366 }
 367
 368 func (g *Grep) AddFlags() {
 369         flag.BoolVar(&g.L, "l", false, "list matching files only")
 370         flag.BoolVar(&g.C, "c", false, "print match counts only")
 371         flag.BoolVar(&g.N, "n", false, "show line numbers")
 372         flag.BoolVar(&g.H, "h", false, "omit file names")
 373 }
 374
 375 func (g *Grep) File(name string) []Match {
 376         f, err := os.Open(name)
 377         if err != nil {
 378                 fmt.Fprintf(g.Stderr, "%s\n", err)
 379                 return []Match{}
 380         }
 381         defer f.Close()
 382         return g.Reader(f, name)
 383 }
 384
 385 var nl = []byte{'\n'}
 386
 387 func countNL(b []byte) int {
 388         n := 0
 389         for {
 390                 i := bytes.IndexByte(b, '\n')
 391                 if i < 0 {
 392                         break
 393                 }
 394                 n++
 395                 b = b[i+1:]
 396         }
 397         return n
 398 }
 399
 400 type Match struct {
 401         Path string
 402         Line int
 403
 404         // contents of line (Line - 2)
 405         Ctxp2 string
 406         // contents of line (Line - 1)
 407         Ctxp1 string
 408         // XXX: The following will most likely change after we figure out how
 409         // to highlight the found text properly :).
 410         Context string
 411         // contents of line (Line + 1)
 412         Ctxn1 string
 413         // contents of line (Line + 2)
 414         Ctxn2 string
 415
 416         // This will be filled in by the source backend
 417         PathRank float32
 418         Ranking  float32
 419 }
 420
 421 func (g *Grep) Reader(r io.Reader, name string) []Match {
 422         var result []Match
 423         if g.buf == nil {
 424                 // 1024KB
 425                 g.buf = make([]byte, 1<<20)
 426         }
 427         var (
 428                 buf         = g.buf[:0]
 429                 lineno      = 1
 430                 bufLineNo   = 0
 431                 beginText   = true
 432                 endText     = false
 433                 needContext = 0
 434                 lastp1      = ""
 435                 lastp2      = ""
 436         )
 437         for {
 438                 n, err := io.ReadFull(r, buf[len(buf):cap(buf)])
 439                 buf = buf[:len(buf)+n]
 440                 end := len(buf)
 441                 if err == nil {
 442                         // We only look at complete lines
 443                         end = bytes.LastIndex(buf, nl) + 1
 444                 } else {
 445                         endText = true
 446                 }
 447                 chunkStart := 0
 448                 bufLineNo = 0
 449                 //fmt.Printf("need to add %d context lines to the last match\n", needContext)
 450                 if needContext > 0 {
 451                         lineEnd := bytes.Index(buf[:end], nl)
 452                         if lineEnd != -1 {
 453                                 result[len(result)-1].Ctxn1 = string(buf[:lineEnd])
 454                                 //fmt.Printf("afterwards: ctxn1 = *%s*\n", result[len(result)-1].Ctxn1)
 455                                 if needContext > 1 {
 456                                         nextLineEnd := bytes.Index(buf[lineEnd+1:end], nl)
 457                                         if nextLineEnd != -1 {
 458                                                 result[len(result)-1].Ctxn2 = string(buf[lineEnd+1 : lineEnd+1+nextLineEnd])
 459                                                 //fmt.Printf("afterwards: ctxn2 = *%s*\n", result[len(result)-1].Ctxn2)
 460                                         }
 461                                 }
 462                         }
 463                         needContext = 0
 464                 }
 465
 466                 //fmt.Printf("looking at line *%s*\n", buf[0:end])
 467                 for chunkStart < end {
 468                         m1 := g.Regexp.Match(buf[chunkStart:end], beginText, endText) + chunkStart
 469                         beginText = false
 470                         if m1 < chunkStart {
 471                                 break
 472                         }
 473                         g.Match = true
 474                         lineStart := bytes.LastIndex(buf[chunkStart:m1], nl) + 1 + chunkStart
 475                         lineEnd := m1 + 1
 476                         if lineEnd > end {
 477                                 lineEnd = end
 478                         }
 479                         //fmt.Printf("matching line: %s", buf[lineStart:lineEnd])
 480
 481                         lineno += countNL(buf[chunkStart:lineStart])
 482                         line := html.EscapeString(string(buf[lineStart : lineEnd-1]))
 483                         match := Match{
 484                                 Path:    name,
 485                                 Line:    lineno,
 486                                 Context: string(line),
 487                         }
 488                         // Let’s find the previous two lines, if possible.
 489                         bufLineNo = countNL(buf[:lineStart])
 490                         if bufLineNo >= 1 {
 491                                 prev1Start := bytes.LastIndex(buf[:lineStart-1], nl) + 1
 492                                 match.Ctxp1 = html.EscapeString(string(buf[prev1Start : lineStart-1]))
 493                                 if bufLineNo >= 2 {
 494                                         prev2Start := bytes.LastIndex(buf[:prev1Start-1], nl) + 1
 495                                         match.Ctxp2 = html.EscapeString(string(buf[prev2Start : prev1Start-1]))
 496                                 } else {
 497                                         match.Ctxp2 = lastp1
 498                                 }
 499                         } else {
 500                                 match.Ctxp1 = lastp1
 501                                 match.Ctxp2 = lastp2
 502                         }
 503                         needContext = 0
 504                         if lineEnd < end {
 505                                 //fmt.Printf("lineEnd = %d, end = %d\n", lineEnd, end)
 506                                 next1Start := bytes.Index(buf[lineEnd:end], nl)
 507                                 //fmt.Printf("next1Start = %d\n", next1Start)
 508                                 if next1Start != -1 {
 509                                         next1Start = next1Start + lineEnd + 1
 510                                         match.Ctxn1 = html.EscapeString(string(buf[lineEnd : next1Start-1]))
 511                                         if next1Start < end {
 512                                                 next2Start := bytes.Index(buf[next1Start:end], nl)
 513                                                 if next2Start != -1 {
 514                                                         match.Ctxn2 = html.EscapeString(string(buf[next1Start : next1Start+next2Start]))
 515                                                 }
 516                                         } else {
 517                                                 needContext = 1
 518                                         }
 519                                 }
 520                         } else {
 521                                 needContext = 2
 522                         }
 523                         //fmt.Printf("ctxn1 = *%s*\n", match.Ctxn1)
 524                         //fmt.Printf("ctxn2 = *%s*\n", match.Ctxn2)
 525                         result = append(result, match)
 526                         lineno++
 527                         chunkStart = lineEnd
 528                 }
 529                 if err == nil {
 530                         lineno += countNL(buf[chunkStart:end])
 531                 }
 532
 533                 // We are about to read again, so let’s store the last two lines in
 534                 // case the next match needs them.
 535                 if bufLineNo == 0 {
 536                         // This could be because there was no match and we didn’t count, so
 537                         // make sure we count.
 538                         bufLineNo = countNL(buf[:end])
 539                         //fmt.Printf("lineno = %d\n", bufLineNo)
 540                 }
 541                 if bufLineNo > 1 {
 542                         prev1Start := bytes.LastIndex(buf[:end-1], nl) + 1
 543                         lastp1 = html.EscapeString(string(buf[prev1Start : end-1]))
 544                         if bufLineNo > 2 {
 545                                 prev2Start := bytes.LastIndex(buf[:prev1Start-1], nl) + 1
 546                                 lastp2 = html.EscapeString(string(buf[prev2Start : prev1Start-1]))
 547                         }
 548                 }
 549
 550                 // Copy the remaining elements to the front (everything after the next newline)
 551                 n = copy(buf, buf[end:])
 552                 buf = buf[:n]
 553                 if len(buf) == 0 && err != nil {
 554                         if err != io.EOF && err != io.ErrUnexpectedEOF {
 555                                 fmt.Fprintf(g.Stderr, "%s: %v\n", name, err)
 556                         }
 557                         break
 558                 }
 559         }
 560         return result
 561 }