Bugfix: construct redirect URLs using net/url
[debiancodesearch.git] / index / regexp.go
blob4f336feafe5d96a490b858b444a32e39f76211ac
1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 package index
7 import (
8 "regexp/syntax"
9 "sort"
10 "strconv"
11 "strings"
12 "unicode"
15 // A Query is a matching machine, like a regular expression,
16 // that matches some text and not other text. When we compute a
17 // Query from a regexp, the Query is a conservative version of the
18 // regexp: it matches everything the regexp would match, and probably
19 // quite a bit more. We can then filter target files by whether they match
20 // the Query (using a trigram index) before running the comparatively
21 // more expensive regexp machinery.
22 type Query struct {
23 Op QueryOp
24 Trigram []string
25 Sub []*Query
28 type QueryOp int
30 const (
31 QAll QueryOp = iota // Everything matches
32 QNone // Nothing matches
33 QAnd // All in Sub and Trigram must match
34 QOr // At least one in Sub or Trigram must match
37 var allQuery = &Query{Op: QAll}
38 var noneQuery = &Query{Op: QNone}
40 // and returns the query q AND r, possibly reusing q's and r's storage.
41 func (q *Query) and(r *Query) *Query {
42 return q.andOr(r, QAnd)
45 // or returns the query q OR r, possibly reusing q's and r's storage.
46 func (q *Query) or(r *Query) *Query {
47 return q.andOr(r, QOr)
50 // andOr returns the query q AND r or q OR r, possibly reusing q's and r's storage.
51 // It works hard to avoid creating unnecessarily complicated structures.
52 func (q *Query) andOr(r *Query, op QueryOp) (out *Query) {
53 opstr := "&"
54 if op == QOr {
55 opstr = "|"
57 //println("andOr", q.String(), opstr, r.String())
58 //defer func() { println(" ->", out.String()) }()
59 _ = opstr
61 if len(q.Trigram) == 0 && len(q.Sub) == 1 {
62 q = q.Sub[0]
64 if len(r.Trigram) == 0 && len(r.Sub) == 1 {
65 r = r.Sub[0]
68 // Boolean simplification.
69 // If q ⇒ r, q AND r ≡ q.
70 // If q ⇒ r, q OR r ≡ r.
71 if q.implies(r) {
72 //println(q.String(), "implies", r.String())
73 if op == QAnd {
74 return q
76 return r
78 if r.implies(q) {
79 //println(r.String(), "implies", q.String())
80 if op == QAnd {
81 return r
83 return q
86 // Both q and r are QAnd or QOr.
87 // If they match or can be made to match, merge.
88 qAtom := len(q.Trigram) == 1 && len(q.Sub) == 0
89 rAtom := len(r.Trigram) == 1 && len(r.Sub) == 0
90 if q.Op == op && (r.Op == op || rAtom) {
91 q.Trigram = stringSet.union(q.Trigram, r.Trigram, false)
92 q.Sub = append(q.Sub, r.Sub...)
93 return q
95 if r.Op == op && qAtom {
96 r.Trigram = stringSet.union(r.Trigram, q.Trigram, false)
97 return r
99 if qAtom && rAtom {
100 q.Op = op
101 q.Trigram = append(q.Trigram, r.Trigram...)
102 return q
105 // If one matches the op, add the other to it.
106 if q.Op == op {
107 q.Sub = append(q.Sub, r)
108 return q
110 if r.Op == op {
111 r.Sub = append(r.Sub, q)
112 return r
115 // We are creating an AND of ORs or an OR of ANDs.
116 // Factor out common trigrams, if any.
117 common := stringSet{}
118 i, j := 0, 0
119 wi, wj := 0, 0
120 for i < len(q.Trigram) && j < len(r.Trigram) {
121 qt, rt := q.Trigram[i], r.Trigram[j]
122 if qt < rt {
123 q.Trigram[wi] = qt
124 wi++
126 } else if qt > rt {
127 r.Trigram[wj] = rt
128 wj++
130 } else {
131 common = append(common, qt)
136 for ; i < len(q.Trigram); i++ {
137 q.Trigram[wi] = q.Trigram[i]
138 wi++
140 for ; j < len(r.Trigram); j++ {
141 r.Trigram[wj] = r.Trigram[j]
142 wj++
144 q.Trigram = q.Trigram[:wi]
145 r.Trigram = r.Trigram[:wj]
146 if len(common) > 0 {
147 // If there were common trigrams, rewrite
149 // (abc|def|ghi|jkl) AND (abc|def|mno|prs) =>
150 // (abc|def) OR ((ghi|jkl) AND (mno|prs))
152 // (abc&def&ghi&jkl) OR (abc&def&mno&prs) =>
153 // (abc&def) AND ((ghi&jkl) OR (mno&prs))
155 // Build up the right one of
156 // (ghi|jkl) AND (mno|prs)
157 // (ghi&jkl) OR (mno&prs)
158 // Call andOr recursively in case q and r can now be simplified
159 // (we removed some trigrams).
160 s := q.andOr(r, op)
162 // Add in factored trigrams.
163 otherOp := QAnd + QOr - op
164 t := &Query{Op: otherOp, Trigram: common}
165 return t.andOr(s, t.Op)
168 // Otherwise just create the op.
169 return &Query{Op: op, Sub: []*Query{q, r}}
172 // implies reports whether q implies r.
173 // It is okay for it to return false negatives.
174 func (q *Query) implies(r *Query) bool {
175 if q.Op == QNone || r.Op == QAll {
176 // False implies everything.
177 // Everything implies True.
178 return true
180 if q.Op == QAll || r.Op == QNone {
181 // True implies nothing.
182 // Nothing implies False.
183 return false
186 if q.Op == QAnd || (q.Op == QOr && len(q.Trigram) == 1 && len(q.Sub) == 0) {
187 return trigramsImply(q.Trigram, r)
190 if q.Op == QOr && r.Op == QOr &&
191 len(q.Trigram) > 0 && len(q.Sub) == 0 &&
192 stringSet.isSubsetOf(q.Trigram, r.Trigram) {
193 return true
195 return false
198 func trigramsImply(t []string, q *Query) bool {
199 switch q.Op {
200 case QOr:
201 for _, qq := range q.Sub {
202 if trigramsImply(t, qq) {
203 return true
206 for i := range t {
207 if stringSet.isSubsetOf(t[i:i+1], q.Trigram) {
208 return true
211 return false
212 case QAnd:
213 for _, qq := range q.Sub {
214 if !trigramsImply(t, qq) {
215 return false
218 if !stringSet.isSubsetOf(q.Trigram, t) {
219 return false
221 return true
223 return false
226 // maybeRewrite rewrites q to use op if it is possible to do so
227 // without changing the meaning. It also simplifies if the node
228 // is an empty OR or AND.
229 func (q *Query) maybeRewrite(op QueryOp) {
230 if q.Op != QAnd && q.Op != QOr {
231 return
234 // AND/OR doing real work? Can't rewrite.
235 n := len(q.Sub) + len(q.Trigram)
236 if n > 1 {
237 return
240 // Nothing left in the AND/OR?
241 if n == 0 {
242 if q.Op == QAnd {
243 q.Op = QAll
244 } else {
245 q.Op = QNone
247 return
250 // Just a sub-node: throw away wrapper.
251 if len(q.Sub) == 1 {
252 *q = *q.Sub[0]
255 // Just a trigram: can use either op.
256 q.Op = op
259 // andTrigrams returns q AND the OR of the AND of the trigrams present in each string.
260 func (q *Query) andTrigrams(t stringSet) *Query {
261 if t.minLen() < 3 {
262 // If there is a short string, we can't guarantee
263 // that any trigrams must be present, so use ALL.
264 // q AND ALL = q.
265 return q
268 //println("andtrigrams", strings.Join(t, ","))
269 or := noneQuery
270 for _, tt := range t {
271 var trig stringSet
272 for i := 0; i+3 <= len(tt); i++ {
273 trig.add(tt[i : i+3])
275 trig.clean(false)
276 //println(tt, "trig", strings.Join(trig, ","))
277 or = or.or(&Query{Op: QAnd, Trigram: trig})
279 q = q.and(or)
280 return q
283 func (q *Query) String() string {
284 if q == nil {
285 return "?"
287 if q.Op == QNone {
288 return "-"
290 if q.Op == QAll {
291 return "+"
294 if len(q.Sub) == 0 && len(q.Trigram) == 1 {
295 return strconv.Quote(q.Trigram[0])
298 var (
299 s string
300 sjoin string
301 end string
302 tjoin string
304 if q.Op == QAnd {
305 sjoin = " "
306 tjoin = " "
307 } else {
308 s = "("
309 sjoin = ")|("
310 end = ")"
311 tjoin = "|"
313 for i, t := range q.Trigram {
314 if i > 0 {
315 s += tjoin
317 s += strconv.Quote(t)
319 if len(q.Sub) > 0 {
320 if len(q.Trigram) > 0 {
321 s += sjoin
323 s += q.Sub[0].String()
324 for i := 1; i < len(q.Sub); i++ {
325 s += sjoin + q.Sub[i].String()
328 s += end
329 return s
332 // RegexpQuery returns a Query for the given regexp.
333 func RegexpQuery(re *syntax.Regexp) *Query {
334 info := analyze(re)
335 info.simplify(true)
336 info.addExact()
337 return info.match
340 // A regexpInfo summarizes the results of analyzing a regexp.
341 type regexpInfo struct {
342 // canEmpty records whether the regexp matches the empty string
343 canEmpty bool
345 // exact is the exact set of strings matching the regexp.
346 exact stringSet
348 // if exact is nil, prefix is the set of possible match prefixes,
349 // and suffix is the set of possible match suffixes.
350 prefix stringSet // otherwise: the exact set of matching prefixes ...
351 suffix stringSet // ... and suffixes
353 // match records a query that must be satisfied by any
354 // match for the regexp, in addition to the information
355 // recorded above.
356 match *Query
359 const (
360 // Exact sets are limited to maxExact strings.
361 // If they get too big, simplify will rewrite the regexpInfo
362 // to use prefix and suffix instead. It's not worthwhile for
363 // this to be bigger than maxSet.
364 // Because we allow the maximum length of an exact string
365 // to grow to 5 below (see simplify), it helps to avoid ridiculous
366 // alternations if maxExact is sized so that 3 case-insensitive letters
367 // triggers a flush.
368 maxExact = 7
370 // Prefix and suffix sets are limited to maxSet strings.
371 // If they get too big, simplify will replace groups of strings
372 // sharing a common leading prefix (or trailing suffix) with
373 // that common prefix (or suffix). It is useful for maxSet
374 // to be at least 2³ = 8 so that we can exactly
375 // represent a case-insensitive abc by the set
376 // {abc, abC, aBc, aBC, Abc, AbC, ABc, ABC}.
377 maxSet = 20
380 // anyMatch returns the regexpInfo describing a regexp that
381 // matches any string.
382 func anyMatch() regexpInfo {
383 return regexpInfo{
384 canEmpty: true,
385 prefix: []string{""},
386 suffix: []string{""},
387 match: allQuery,
391 // anyChar returns the regexpInfo describing a regexp that
392 // matches any single character.
393 func anyChar() regexpInfo {
394 return regexpInfo{
395 prefix: []string{""},
396 suffix: []string{""},
397 match: allQuery,
401 // noMatch returns the regexpInfo describing a regexp that
402 // matches no strings at all.
403 func noMatch() regexpInfo {
404 return regexpInfo{
405 match: noneQuery,
409 // emptyString returns the regexpInfo describing a regexp that
410 // matches only the empty string.
411 func emptyString() regexpInfo {
412 return regexpInfo{
413 canEmpty: true,
414 exact: []string{""},
415 match: allQuery,
419 // analyze returns the regexpInfo for the regexp re.
420 func analyze(re *syntax.Regexp) (ret regexpInfo) {
421 //println("analyze", re.String())
422 //defer func() { println("->", ret.String()) }()
423 var info regexpInfo
424 switch re.Op {
425 case syntax.OpNoMatch:
426 return noMatch()
428 case syntax.OpEmptyMatch,
429 syntax.OpBeginLine, syntax.OpEndLine,
430 syntax.OpBeginText, syntax.OpEndText,
431 syntax.OpWordBoundary, syntax.OpNoWordBoundary:
432 return emptyString()
434 case syntax.OpLiteral:
435 if re.Flags&syntax.FoldCase != 0 {
436 switch len(re.Rune) {
437 case 0:
438 return emptyString()
439 case 1:
440 // Single-letter case-folded string:
441 // rewrite into char class and analyze.
442 re1 := &syntax.Regexp{
443 Op: syntax.OpCharClass,
445 re1.Rune = re1.Rune0[:0]
446 r0 := re.Rune[0]
447 re1.Rune = append(re1.Rune, r0, r0)
448 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
449 re1.Rune = append(re1.Rune, r1, r1)
451 info = analyze(re1)
452 return info
454 // Multi-letter case-folded string:
455 // treat as concatenation of single-letter case-folded strings.
456 re1 := &syntax.Regexp{
457 Op: syntax.OpLiteral,
458 Flags: syntax.FoldCase,
460 info = emptyString()
461 for i := range re.Rune {
462 re1.Rune = re.Rune[i : i+1]
463 info = concat(info, analyze(re1))
465 return info
467 info.exact = stringSet{string(re.Rune)}
468 info.match = allQuery
470 case syntax.OpAnyCharNotNL, syntax.OpAnyChar:
471 return anyChar()
473 case syntax.OpCapture:
474 return analyze(re.Sub[0])
476 case syntax.OpConcat:
477 return fold(concat, re.Sub, emptyString())
479 case syntax.OpAlternate:
480 return fold(alternate, re.Sub, noMatch())
482 case syntax.OpQuest:
483 return alternate(analyze(re.Sub[0]), emptyString())
485 case syntax.OpStar:
486 // We don't know anything, so assume the worst.
487 return anyMatch()
489 case syntax.OpRepeat:
490 if re.Min == 0 {
491 // Like OpStar
492 return anyMatch()
494 fallthrough
495 case syntax.OpPlus:
496 // x+
497 // Since there has to be at least one x, the prefixes and suffixes
498 // stay the same. If x was exact, it isn't anymore.
499 info = analyze(re.Sub[0])
500 if info.exact.have() {
501 info.prefix = info.exact
502 info.suffix = info.exact.copy()
503 info.exact = nil
506 case syntax.OpCharClass:
507 info.match = allQuery
509 // Special case.
510 if len(re.Rune) == 0 {
511 return noMatch()
514 // Special case.
515 if len(re.Rune) == 1 {
516 info.exact = stringSet{string(re.Rune[0])}
517 break
520 n := 0
521 for i := 0; i < len(re.Rune); i += 2 {
522 n += int(re.Rune[i+1] - re.Rune[i])
524 // If the class is too large, it's okay to overestimate.
525 if n > 100 {
526 return anyChar()
529 info.exact = []string{}
530 for i := 0; i < len(re.Rune); i += 2 {
531 lo, hi := re.Rune[i], re.Rune[i+1]
532 for rr := lo; rr <= hi; rr++ {
533 info.exact.add(string(rr))
538 info.simplify(false)
539 return info
542 // fold is the usual higher-order function.
543 func fold(f func(x, y regexpInfo) regexpInfo, sub []*syntax.Regexp, zero regexpInfo) regexpInfo {
544 if len(sub) == 0 {
545 return zero
547 if len(sub) == 1 {
548 return analyze(sub[0])
550 info := f(analyze(sub[0]), analyze(sub[1]))
551 for i := 2; i < len(sub); i++ {
552 info = f(info, analyze(sub[i]))
554 return info
557 // concat returns the regexp info for xy given x and y.
558 func concat(x, y regexpInfo) (out regexpInfo) {
559 //println("concat", x.String(), "...", y.String())
560 //defer func() { println("->", out.String()) }()
561 var xy regexpInfo
562 xy.match = x.match.and(y.match)
563 if x.exact.have() && y.exact.have() {
564 xy.exact = x.exact.cross(y.exact, false)
565 } else {
566 if x.exact.have() {
567 xy.prefix = x.exact.cross(y.prefix, false)
568 } else {
569 xy.prefix = x.prefix
570 if x.canEmpty {
571 xy.prefix = xy.prefix.union(y.prefix, false)
574 if y.exact.have() {
575 xy.suffix = x.suffix.cross(y.exact, true)
576 } else {
577 xy.suffix = y.suffix
578 if y.canEmpty {
579 xy.suffix = xy.suffix.union(x.suffix, true)
584 // If all the possible strings in the cross product of x.suffix
585 // and y.prefix are long enough, then the trigram for one
586 // of them must be present and would not necessarily be
587 // accounted for in xy.prefix or xy.suffix yet. Cut things off
588 // at maxSet just to keep the sets manageable.
589 if !x.exact.have() && !y.exact.have() &&
590 x.suffix.size() <= maxSet && y.prefix.size() <= maxSet &&
591 x.suffix.minLen()+y.prefix.minLen() >= 3 {
592 xy.match = xy.match.andTrigrams(x.suffix.cross(y.prefix, false))
595 xy.simplify(false)
596 return xy
599 // alternate returns the regexpInfo for x|y given x and y.
600 func alternate(x, y regexpInfo) (out regexpInfo) {
601 //println("alternate", x.String(), "...", y.String())
602 //defer func() { println("->", out.String()) }()
603 var xy regexpInfo
604 if x.exact.have() && y.exact.have() {
605 xy.exact = x.exact.union(y.exact, false)
606 } else if x.exact.have() {
607 xy.prefix = x.exact.union(y.prefix, false)
608 xy.suffix = x.exact.union(y.suffix, true)
609 x.addExact()
610 } else if y.exact.have() {
611 xy.prefix = x.prefix.union(y.exact, false)
612 xy.suffix = x.suffix.union(y.exact.copy(), true)
613 y.addExact()
614 } else {
615 xy.prefix = x.prefix.union(y.prefix, false)
616 xy.suffix = x.suffix.union(y.suffix, true)
618 xy.canEmpty = x.canEmpty || y.canEmpty
619 xy.match = x.match.or(y.match)
621 xy.simplify(false)
622 return xy
625 // addExact adds to the match query the trigrams for matching info.exact.
626 func (info *regexpInfo) addExact() {
627 if info.exact.have() {
628 info.match = info.match.andTrigrams(info.exact)
632 // simplify simplifies the regexpInfo when the exact set gets too large.
633 func (info *regexpInfo) simplify(force bool) {
634 //println(" simplify", info.String(), " force=", force)
635 //defer func() { println(" ->", info.String()) }()
636 // If there are now too many exact strings,
637 // loop over them, adding trigrams and moving
638 // the relevant pieces into prefix and suffix.
639 info.exact.clean(false)
640 if len(info.exact) > maxExact || (info.exact.minLen() >= 3 && force) || info.exact.minLen() >= 4 {
641 info.addExact()
642 for _, s := range info.exact {
643 n := len(s)
644 if n < 3 {
645 info.prefix.add(s)
646 info.suffix.add(s)
647 } else {
648 info.prefix.add(s[:2])
649 info.suffix.add(s[n-2:])
652 info.exact = nil
655 if !info.exact.have() {
656 info.simplifySet(&info.prefix)
657 info.simplifySet(&info.suffix)
661 // simplifySet reduces the size of the given set (either prefix or suffix).
662 // There is no need to pass around enormous prefix or suffix sets, since
663 // they will only be used to create trigrams. As they get too big, simplifySet
664 // moves the information they contain into the match query, which is
665 // more efficient to pass around.
666 func (info *regexpInfo) simplifySet(s *stringSet) {
667 t := *s
668 t.clean(s == &info.suffix)
670 // Add the OR of the current prefix/suffix set to the query.
671 info.match = info.match.andTrigrams(t)
673 for n := 3; n == 3 || t.size() > maxSet; n-- {
674 // Replace set by strings of length n-1.
675 w := 0
676 for _, str := range t {
677 if len(str) >= n {
678 if s == &info.prefix {
679 str = str[:n-1]
680 } else {
681 str = str[len(str)-n+1:]
684 if w == 0 || t[w-1] != str {
685 t[w] = str
689 t = t[:w]
690 t.clean(s == &info.suffix)
693 // Now make sure that the prefix/suffix sets aren't redundant.
694 // For example, if we know "ab" is a possible prefix, then it
695 // doesn't help at all to know that "abc" is also a possible
696 // prefix, so delete "abc".
697 w := 0
698 f := strings.HasPrefix
699 if s == &info.suffix {
700 f = strings.HasSuffix
702 for _, str := range t {
703 if w == 0 || !f(str, t[w-1]) {
704 t[w] = str
708 t = t[:w]
710 *s = t
713 func (info regexpInfo) String() string {
714 s := ""
715 if info.canEmpty {
716 s += "canempty "
718 if info.exact.have() {
719 s += "exact:" + strings.Join(info.exact, ",")
720 } else {
721 s += "prefix:" + strings.Join(info.prefix, ",")
722 s += " suffix:" + strings.Join(info.suffix, ",")
724 s += " match: " + info.match.String()
725 return s
728 // A stringSet is a set of strings.
729 // The nil stringSet indicates not having a set.
730 // The non-nil but empty stringSet is the empty set.
731 type stringSet []string
733 // have reports whether we have a stringSet.
734 func (s stringSet) have() bool {
735 return s != nil
738 // contains reports whether s contains str.
739 func (s stringSet) contains(str string) bool {
740 for _, ss := range s {
741 if ss == str {
742 return true
745 return false
748 type byPrefix []string
750 func (x *byPrefix) Len() int { return len(*x) }
751 func (x *byPrefix) Swap(i, j int) { (*x)[i], (*x)[j] = (*x)[j], (*x)[i] }
752 func (x *byPrefix) Less(i, j int) bool { return (*x)[i] < (*x)[j] }
754 type bySuffix []string
756 func (x *bySuffix) Len() int { return len(*x) }
757 func (x *bySuffix) Swap(i, j int) { (*x)[i], (*x)[j] = (*x)[j], (*x)[i] }
758 func (x *bySuffix) Less(i, j int) bool {
759 s := (*x)[i]
760 t := (*x)[j]
761 for i := 1; i <= len(s) && i <= len(t); i++ {
762 si := s[len(s)-i]
763 ti := t[len(t)-i]
764 if si < ti {
765 return true
767 if si > ti {
768 return false
771 return len(s) < len(t)
774 // add adds str to the set.
775 func (s *stringSet) add(str string) {
776 *s = append(*s, str)
779 // clean removes duplicates from the stringSet.
780 func (s *stringSet) clean(isSuffix bool) {
781 t := *s
782 if isSuffix {
783 sort.Sort((*bySuffix)(s))
784 } else {
785 sort.Sort((*byPrefix)(s))
787 w := 0
788 for _, str := range t {
789 if w == 0 || t[w-1] != str {
790 t[w] = str
794 *s = t[:w]
797 // size returns the number of strings in s.
798 func (s stringSet) size() int {
799 return len(s)
802 // minLen returns the length of the shortest string in s.
803 func (s stringSet) minLen() int {
804 if len(s) == 0 {
805 return 0
807 m := len(s[0])
808 for _, str := range s {
809 if m > len(str) {
810 m = len(str)
813 return m
816 // maxLen returns the length of the longest string in s.
817 func (s stringSet) maxLen() int {
818 if len(s) == 0 {
819 return 0
821 m := len(s[0])
822 for _, str := range s {
823 if m < len(str) {
824 m = len(str)
827 return m
830 // union returns the union of s and t, reusing s's storage.
831 func (s stringSet) union(t stringSet, isSuffix bool) stringSet {
832 s = append(s, t...)
833 s.clean(isSuffix)
834 return s
837 // cross returns the cross product of s and t.
838 func (s stringSet) cross(t stringSet, isSuffix bool) stringSet {
839 p := stringSet{}
840 for _, ss := range s {
841 for _, tt := range t {
842 p.add(ss + tt)
845 p.clean(isSuffix)
846 return p
849 // clear empties the set but preserves the storage.
850 func (s *stringSet) clear() {
851 *s = (*s)[:0]
854 // copy returns a copy of the set that does not share storage with the original.
855 func (s stringSet) copy() stringSet {
856 return append(stringSet{}, s...)
859 // isSubsetOf returns true if all strings in s are also in t.
860 // It assumes both sets are sorted.
861 func (s stringSet) isSubsetOf(t stringSet) bool {
862 j := 0
863 for _, ss := range s {
864 for j < len(t) && t[j] < ss {
867 if j >= len(t) || t[j] != ss {
868 return false
871 return true