1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
15 // A Query is a matching machine, like a regular expression,
16 // that matches some text and not other text. When we compute a
17 // Query from a regexp, the Query is a conservative version of the
18 // regexp: it matches everything the regexp would match, and probably
19 // quite a bit more. We can then filter target files by whether they match
20 // the Query (using a trigram index) before running the comparatively
21 // more expensive regexp machinery.
31 QAll QueryOp
= iota // Everything matches
32 QNone
// Nothing matches
33 QAnd
// All in Sub and Trigram must match
34 QOr
// At least one in Sub or Trigram must match
37 var allQuery
= &Query
{Op
: QAll
}
38 var noneQuery
= &Query
{Op
: QNone
}
40 // and returns the query q AND r, possibly reusing q's and r's storage.
41 func (q
*Query
) and(r
*Query
) *Query
{
42 return q
.andOr(r
, QAnd
)
45 // or returns the query q OR r, possibly reusing q's and r's storage.
46 func (q
*Query
) or(r
*Query
) *Query
{
47 return q
.andOr(r
, QOr
)
50 // andOr returns the query q AND r or q OR r, possibly reusing q's and r's storage.
51 // It works hard to avoid creating unnecessarily complicated structures.
52 func (q
*Query
) andOr(r
*Query
, op QueryOp
) (out
*Query
) {
57 //println("andOr", q.String(), opstr, r.String())
58 //defer func() { println(" ->", out.String()) }()
61 if len(q
.Trigram
) == 0 && len(q
.Sub
) == 1 {
64 if len(r
.Trigram
) == 0 && len(r
.Sub
) == 1 {
68 // Boolean simplification.
69 // If q ⇒ r, q AND r ≡ q.
70 // If q ⇒ r, q OR r ≡ r.
72 //println(q.String(), "implies", r.String())
79 //println(r.String(), "implies", q.String())
86 // Both q and r are QAnd or QOr.
87 // If they match or can be made to match, merge.
88 qAtom
:= len(q
.Trigram
) == 1 && len(q
.Sub
) == 0
89 rAtom
:= len(r
.Trigram
) == 1 && len(r
.Sub
) == 0
90 if q
.Op
== op
&& (r
.Op
== op || rAtom
) {
91 q
.Trigram
= stringSet
.union(q
.Trigram
, r
.Trigram
, false)
92 q
.Sub
= append(q
.Sub
, r
.Sub
...)
95 if r
.Op
== op
&& qAtom
{
96 r
.Trigram
= stringSet
.union(r
.Trigram
, q
.Trigram
, false)
101 q
.Trigram
= append(q
.Trigram
, r
.Trigram
...)
105 // If one matches the op, add the other to it.
107 q
.Sub
= append(q
.Sub
, r
)
111 r
.Sub
= append(r
.Sub
, q
)
115 // We are creating an AND of ORs or an OR of ANDs.
116 // Factor out common trigrams, if any.
117 common
:= stringSet
{}
120 for i
< len(q
.Trigram
) && j
< len(r
.Trigram
) {
121 qt
, rt
:= q
.Trigram
[i
], r
.Trigram
[j
]
131 common
= append(common
, qt
)
136 for ; i
< len(q
.Trigram
); i
++ {
137 q
.Trigram
[wi
] = q
.Trigram
[i
]
140 for ; j
< len(r
.Trigram
); j
++ {
141 r
.Trigram
[wj
] = r
.Trigram
[j
]
144 q
.Trigram
= q
.Trigram
[:wi
]
145 r
.Trigram
= r
.Trigram
[:wj
]
147 // If there were common trigrams, rewrite
149 // (abc|def|ghi|jkl) AND (abc|def|mno|prs) =>
150 // (abc|def) OR ((ghi|jkl) AND (mno|prs))
152 // (abc&def&ghi&jkl) OR (abc&def&mno&prs) =>
153 // (abc&def) AND ((ghi&jkl) OR (mno&prs))
155 // Build up the right one of
156 // (ghi|jkl) AND (mno|prs)
157 // (ghi&jkl) OR (mno&prs)
158 // Call andOr recursively in case q and r can now be simplified
159 // (we removed some trigrams).
162 // Add in factored trigrams.
163 otherOp
:= QAnd
+ QOr
- op
164 t
:= &Query
{Op
: otherOp
, Trigram
: common
}
165 return t
.andOr(s
, t
.Op
)
168 // Otherwise just create the op.
169 return &Query
{Op
: op
, Sub
: []*Query
{q
, r
}}
172 // implies reports whether q implies r.
173 // It is okay for it to return false negatives.
174 func (q
*Query
) implies(r
*Query
) bool {
175 if q
.Op
== QNone || r
.Op
== QAll
{
176 // False implies everything.
177 // Everything implies True.
180 if q
.Op
== QAll || r
.Op
== QNone
{
181 // True implies nothing.
182 // Nothing implies False.
186 if q
.Op
== QAnd ||
(q
.Op
== QOr
&& len(q
.Trigram
) == 1 && len(q
.Sub
) == 0) {
187 return trigramsImply(q
.Trigram
, r
)
190 if q
.Op
== QOr
&& r
.Op
== QOr
&&
191 len(q
.Trigram
) > 0 && len(q
.Sub
) == 0 &&
192 stringSet
.isSubsetOf(q
.Trigram
, r
.Trigram
) {
198 func trigramsImply(t
[]string, q
*Query
) bool {
201 for _
, qq
:= range q
.Sub
{
202 if trigramsImply(t
, qq
) {
207 if stringSet
.isSubsetOf(t
[i
:i
+1], q
.Trigram
) {
213 for _
, qq
:= range q
.Sub
{
214 if !trigramsImply(t
, qq
) {
218 if !stringSet
.isSubsetOf(q
.Trigram
, t
) {
226 // maybeRewrite rewrites q to use op if it is possible to do so
227 // without changing the meaning. It also simplifies if the node
228 // is an empty OR or AND.
229 func (q
*Query
) maybeRewrite(op QueryOp
) {
230 if q
.Op
!= QAnd
&& q
.Op
!= QOr
{
234 // AND/OR doing real work? Can't rewrite.
235 n
:= len(q
.Sub
) + len(q
.Trigram
)
240 // Nothing left in the AND/OR?
250 // Just a sub-node: throw away wrapper.
255 // Just a trigram: can use either op.
259 // andTrigrams returns q AND the OR of the AND of the trigrams present in each string.
260 func (q
*Query
) andTrigrams(t stringSet
) *Query
{
262 // If there is a short string, we can't guarantee
263 // that any trigrams must be present, so use ALL.
268 //println("andtrigrams", strings.Join(t, ","))
270 for _
, tt
:= range t
{
272 for i
:= 0; i
+3 <= len(tt
); i
++ {
273 trig
.add(tt
[i
: i
+3])
276 //println(tt, "trig", strings.Join(trig, ","))
277 or
= or
.or(&Query
{Op
: QAnd
, Trigram
: trig
})
283 func (q
*Query
) String() string {
294 if len(q
.Sub
) == 0 && len(q
.Trigram
) == 1 {
295 return strconv
.Quote(q
.Trigram
[0])
313 for i
, t
:= range q
.Trigram
{
317 s
+= strconv
.Quote(t
)
320 if len(q
.Trigram
) > 0 {
323 s
+= q
.Sub
[0].String()
324 for i
:= 1; i
< len(q
.Sub
); i
++ {
325 s
+= sjoin
+ q
.Sub
[i
].String()
332 // RegexpQuery returns a Query for the given regexp.
333 func RegexpQuery(re
*syntax
.Regexp
) *Query
{
340 // A regexpInfo summarizes the results of analyzing a regexp.
341 type regexpInfo
struct {
342 // canEmpty records whether the regexp matches the empty string
345 // exact is the exact set of strings matching the regexp.
348 // if exact is nil, prefix is the set of possible match prefixes,
349 // and suffix is the set of possible match suffixes.
350 prefix stringSet
// otherwise: the exact set of matching prefixes ...
351 suffix stringSet
// ... and suffixes
353 // match records a query that must be satisfied by any
354 // match for the regexp, in addition to the information
360 // Exact sets are limited to maxExact strings.
361 // If they get too big, simplify will rewrite the regexpInfo
362 // to use prefix and suffix instead. It's not worthwhile for
363 // this to be bigger than maxSet.
364 // Because we allow the maximum length of an exact string
365 // to grow to 5 below (see simplify), it helps to avoid ridiculous
366 // alternations if maxExact is sized so that 3 case-insensitive letters
370 // Prefix and suffix sets are limited to maxSet strings.
371 // If they get too big, simplify will replace groups of strings
372 // sharing a common leading prefix (or trailing suffix) with
373 // that common prefix (or suffix). It is useful for maxSet
374 // to be at least 2³ = 8 so that we can exactly
375 // represent a case-insensitive abc by the set
376 // {abc, abC, aBc, aBC, Abc, AbC, ABc, ABC}.
380 // anyMatch returns the regexpInfo describing a regexp that
381 // matches any string.
382 func anyMatch() regexpInfo
{
385 prefix
: []string{""},
386 suffix
: []string{""},
391 // anyChar returns the regexpInfo describing a regexp that
392 // matches any single character.
393 func anyChar() regexpInfo
{
395 prefix
: []string{""},
396 suffix
: []string{""},
401 // noMatch returns the regexpInfo describing a regexp that
402 // matches no strings at all.
403 func noMatch() regexpInfo
{
409 // emptyString returns the regexpInfo describing a regexp that
410 // matches only the empty string.
411 func emptyString() regexpInfo
{
419 // analyze returns the regexpInfo for the regexp re.
420 func analyze(re
*syntax
.Regexp
) (ret regexpInfo
) {
421 //println("analyze", re.String())
422 //defer func() { println("->", ret.String()) }()
425 case syntax
.OpNoMatch
:
428 case syntax
.OpEmptyMatch
,
429 syntax
.OpBeginLine
, syntax
.OpEndLine
,
430 syntax
.OpBeginText
, syntax
.OpEndText
,
431 syntax
.OpWordBoundary
, syntax
.OpNoWordBoundary
:
434 case syntax
.OpLiteral
:
435 if re
.Flags
&syntax
.FoldCase
!= 0 {
436 switch len(re
.Rune
) {
440 // Single-letter case-folded string:
441 // rewrite into char class and analyze.
442 re1
:= &syntax
.Regexp
{
443 Op
: syntax
.OpCharClass
,
445 re1
.Rune
= re1
.Rune0
[:0]
447 re1
.Rune
= append(re1
.Rune
, r0
, r0
)
448 for r1
:= unicode
.SimpleFold(r0
); r1
!= r0
; r1
= unicode
.SimpleFold(r1
) {
449 re1
.Rune
= append(re1
.Rune
, r1
, r1
)
454 // Multi-letter case-folded string:
455 // treat as concatenation of single-letter case-folded strings.
456 re1
:= &syntax
.Regexp
{
457 Op
: syntax
.OpLiteral
,
458 Flags
: syntax
.FoldCase
,
461 for i
:= range re
.Rune
{
462 re1
.Rune
= re
.Rune
[i
: i
+1]
463 info
= concat(info
, analyze(re1
))
467 info
.exact
= stringSet
{string(re
.Rune
)}
468 info
.match
= allQuery
470 case syntax
.OpAnyCharNotNL
, syntax
.OpAnyChar
:
473 case syntax
.OpCapture
:
474 return analyze(re
.Sub
[0])
476 case syntax
.OpConcat
:
477 return fold(concat
, re
.Sub
, emptyString())
479 case syntax
.OpAlternate
:
480 return fold(alternate
, re
.Sub
, noMatch())
483 return alternate(analyze(re
.Sub
[0]), emptyString())
486 // We don't know anything, so assume the worst.
489 case syntax
.OpRepeat
:
497 // Since there has to be at least one x, the prefixes and suffixes
498 // stay the same. If x was exact, it isn't anymore.
499 info
= analyze(re
.Sub
[0])
500 if info
.exact
.have() {
501 info
.prefix
= info
.exact
502 info
.suffix
= info
.exact
.copy()
506 case syntax
.OpCharClass
:
507 info
.match
= allQuery
510 if len(re
.Rune
) == 0 {
515 if len(re
.Rune
) == 1 {
516 info
.exact
= stringSet
{string(re
.Rune
[0])}
521 for i
:= 0; i
< len(re
.Rune
); i
+= 2 {
522 n
+= int(re
.Rune
[i
+1] - re
.Rune
[i
])
524 // If the class is too large, it's okay to overestimate.
529 info
.exact
= []string{}
530 for i
:= 0; i
< len(re
.Rune
); i
+= 2 {
531 lo
, hi
:= re
.Rune
[i
], re
.Rune
[i
+1]
532 for rr
:= lo
; rr
<= hi
; rr
++ {
533 info
.exact
.add(string(rr
))
542 // fold is the usual higher-order function.
543 func fold(f
func(x
, y regexpInfo
) regexpInfo
, sub
[]*syntax
.Regexp
, zero regexpInfo
) regexpInfo
{
548 return analyze(sub
[0])
550 info
:= f(analyze(sub
[0]), analyze(sub
[1]))
551 for i
:= 2; i
< len(sub
); i
++ {
552 info
= f(info
, analyze(sub
[i
]))
557 // concat returns the regexp info for xy given x and y.
558 func concat(x
, y regexpInfo
) (out regexpInfo
) {
559 //println("concat", x.String(), "...", y.String())
560 //defer func() { println("->", out.String()) }()
562 xy
.match
= x
.match
.and(y
.match
)
563 if x
.exact
.have() && y
.exact
.have() {
564 xy
.exact
= x
.exact
.cross(y
.exact
, false)
567 xy
.prefix
= x
.exact
.cross(y
.prefix
, false)
571 xy
.prefix
= xy
.prefix
.union(y
.prefix
, false)
575 xy
.suffix
= x
.suffix
.cross(y
.exact
, true)
579 xy
.suffix
= xy
.suffix
.union(x
.suffix
, true)
584 // If all the possible strings in the cross product of x.suffix
585 // and y.prefix are long enough, then the trigram for one
586 // of them must be present and would not necessarily be
587 // accounted for in xy.prefix or xy.suffix yet. Cut things off
588 // at maxSet just to keep the sets manageable.
589 if !x
.exact
.have() && !y
.exact
.have() &&
590 x
.suffix
.size() <= maxSet
&& y
.prefix
.size() <= maxSet
&&
591 x
.suffix
.minLen()+y
.prefix
.minLen() >= 3 {
592 xy
.match
= xy
.match
.andTrigrams(x
.suffix
.cross(y
.prefix
, false))
599 // alternate returns the regexpInfo for x|y given x and y.
600 func alternate(x
, y regexpInfo
) (out regexpInfo
) {
601 //println("alternate", x.String(), "...", y.String())
602 //defer func() { println("->", out.String()) }()
604 if x
.exact
.have() && y
.exact
.have() {
605 xy
.exact
= x
.exact
.union(y
.exact
, false)
606 } else if x
.exact
.have() {
607 xy
.prefix
= x
.exact
.union(y
.prefix
, false)
608 xy
.suffix
= x
.exact
.union(y
.suffix
, true)
610 } else if y
.exact
.have() {
611 xy
.prefix
= x
.prefix
.union(y
.exact
, false)
612 xy
.suffix
= x
.suffix
.union(y
.exact
.copy(), true)
615 xy
.prefix
= x
.prefix
.union(y
.prefix
, false)
616 xy
.suffix
= x
.suffix
.union(y
.suffix
, true)
618 xy
.canEmpty
= x
.canEmpty || y
.canEmpty
619 xy
.match
= x
.match
.or(y
.match
)
625 // addExact adds to the match query the trigrams for matching info.exact.
626 func (info
*regexpInfo
) addExact() {
627 if info
.exact
.have() {
628 info
.match
= info
.match
.andTrigrams(info
.exact
)
632 // simplify simplifies the regexpInfo when the exact set gets too large.
633 func (info
*regexpInfo
) simplify(force
bool) {
634 //println(" simplify", info.String(), " force=", force)
635 //defer func() { println(" ->", info.String()) }()
636 // If there are now too many exact strings,
637 // loop over them, adding trigrams and moving
638 // the relevant pieces into prefix and suffix.
639 info
.exact
.clean(false)
640 if len(info
.exact
) > maxExact ||
(info
.exact
.minLen() >= 3 && force
) || info
.exact
.minLen() >= 4 {
642 for _
, s
:= range info
.exact
{
648 info
.prefix
.add(s
[:2])
649 info
.suffix
.add(s
[n
-2:])
655 if !info
.exact
.have() {
656 info
.simplifySet(&info
.prefix
)
657 info
.simplifySet(&info
.suffix
)
661 // simplifySet reduces the size of the given set (either prefix or suffix).
662 // There is no need to pass around enormous prefix or suffix sets, since
663 // they will only be used to create trigrams. As they get too big, simplifySet
664 // moves the information they contain into the match query, which is
665 // more efficient to pass around.
666 func (info
*regexpInfo
) simplifySet(s
*stringSet
) {
668 t
.clean(s
== &info
.suffix
)
670 // Add the OR of the current prefix/suffix set to the query.
671 info
.match
= info
.match
.andTrigrams(t
)
673 for n
:= 3; n
== 3 || t
.size() > maxSet
; n
-- {
674 // Replace set by strings of length n-1.
676 for _
, str
:= range t
{
678 if s
== &info
.prefix
{
681 str
= str
[len(str
)-n
+1:]
684 if w
== 0 || t
[w
-1] != str
{
690 t
.clean(s
== &info
.suffix
)
693 // Now make sure that the prefix/suffix sets aren't redundant.
694 // For example, if we know "ab" is a possible prefix, then it
695 // doesn't help at all to know that "abc" is also a possible
696 // prefix, so delete "abc".
698 f
:= strings
.HasPrefix
699 if s
== &info
.suffix
{
700 f
= strings
.HasSuffix
702 for _
, str
:= range t
{
703 if w
== 0 ||
!f(str
, t
[w
-1]) {
713 func (info regexpInfo
) String() string {
718 if info
.exact
.have() {
719 s
+= "exact:" + strings
.Join(info
.exact
, ",")
721 s
+= "prefix:" + strings
.Join(info
.prefix
, ",")
722 s
+= " suffix:" + strings
.Join(info
.suffix
, ",")
724 s
+= " match: " + info
.match
.String()
728 // A stringSet is a set of strings.
729 // The nil stringSet indicates not having a set.
730 // The non-nil but empty stringSet is the empty set.
731 type stringSet
[]string
733 // have reports whether we have a stringSet.
734 func (s stringSet
) have() bool {
738 // contains reports whether s contains str.
739 func (s stringSet
) contains(str
string) bool {
740 for _
, ss
:= range s
{
748 type byPrefix
[]string
750 func (x
*byPrefix
) Len() int { return len(*x
) }
751 func (x
*byPrefix
) Swap(i
, j
int) { (*x
)[i
], (*x
)[j
] = (*x
)[j
], (*x
)[i
] }
752 func (x
*byPrefix
) Less(i
, j
int) bool { return (*x
)[i
] < (*x
)[j
] }
754 type bySuffix
[]string
756 func (x
*bySuffix
) Len() int { return len(*x
) }
757 func (x
*bySuffix
) Swap(i
, j
int) { (*x
)[i
], (*x
)[j
] = (*x
)[j
], (*x
)[i
] }
758 func (x
*bySuffix
) Less(i
, j
int) bool {
761 for i
:= 1; i
<= len(s
) && i
<= len(t
); i
++ {
771 return len(s
) < len(t
)
774 // add adds str to the set.
775 func (s
*stringSet
) add(str
string) {
779 // clean removes duplicates from the stringSet.
780 func (s
*stringSet
) clean(isSuffix
bool) {
783 sort
.Sort((*bySuffix
)(s
))
785 sort
.Sort((*byPrefix
)(s
))
788 for _
, str
:= range t
{
789 if w
== 0 || t
[w
-1] != str
{
797 // size returns the number of strings in s.
798 func (s stringSet
) size() int {
802 // minLen returns the length of the shortest string in s.
803 func (s stringSet
) minLen() int {
808 for _
, str
:= range s
{
816 // maxLen returns the length of the longest string in s.
817 func (s stringSet
) maxLen() int {
822 for _
, str
:= range s
{
830 // union returns the union of s and t, reusing s's storage.
831 func (s stringSet
) union(t stringSet
, isSuffix
bool) stringSet
{
837 // cross returns the cross product of s and t.
838 func (s stringSet
) cross(t stringSet
, isSuffix
bool) stringSet
{
840 for _
, ss
:= range s
{
841 for _
, tt
:= range t
{
849 // clear empties the set but preserves the storage.
850 func (s
*stringSet
) clear() {
854 // copy returns a copy of the set that does not share storage with the original.
855 func (s stringSet
) copy() stringSet
{
856 return append(stringSet
{}, s
...)
859 // isSubsetOf returns true if all strings in s are also in t.
860 // It assumes both sets are sorted.
861 func (s stringSet
) isSubsetOf(t stringSet
) bool {
863 for _
, ss
:= range s
{
864 for j
< len(t
) && t
[j
] < ss
{
867 if j
>= len(t
) || t
[j
] != ss
{