1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
22 // TestRE2 tests this package's regexp API against test cases
23 // considered during RE2's exhaustive tests, which run all possible
24 // regexps over a given set of atoms and operators, up to a given
25 // complexity, over all possible strings over a given alphabet,
26 // up to a given size. Rather than try to link with RE2, we read a
27 // log file containing the test cases and the expected matches.
28 // The log file, re2-exhaustive.txt, is generated by running 'make log'
29 // in the open source RE2 distribution https://github.com/google/re2/.
31 // The test file format is a sequence of stanzas like:
40 // "([0-9])([0-9])([0-9])"
44 // The stanza begins by defining a set of strings, quoted
45 // using Go double-quote syntax, one per line. Then the
46 // regexps section gives a sequence of regexps to run on
47 // the strings. In the block that follows a regexp, each line
48 // gives the semicolon-separated match results of running
49 // the regexp on the corresponding string.
50 // Each match result is either a single -, meaning no match, or a
51 // space-separated sequence of pairs giving the match and
52 // submatch indices. An unmatched subexpression formats
53 // its pair as a single - (not illustrated above). For now
54 // each regexp run produces two match results, one for a
55 // ``full match'' that restricts the regexp to matching the entire
56 // string or nothing, and one for a ``partial match'' that gives
57 // the leftmost first match found in the string.
59 // Lines beginning with # are comments. Lines beginning with
60 // a capital letter are test names printed during RE2's test suite
61 // and are echoed into t but otherwise ignored.
63 // At time of writing, re2-exhaustive.txt is 59 MB but compresses to 385 kB,
64 // so we store re2-exhaustive.txt.bz2 in the repository and decompress it on the fly.
66 func TestRE2Search(t
*testing
.T
) {
67 testRE2(t
, "testdata/re2-search.txt")
70 func testRE2(t
*testing
.T
, file
string) {
71 f
, err
:= os
.Open(file
)
77 if strings
.HasSuffix(file
, ".bz2") {
78 z
:= bzip2
.NewReader(f
)
80 file
= file
[:len(file
)-len(".bz2")] // for error messages
85 scanner
:= bufio
.NewScanner(txt
)
95 for lineno
:= 1; scanner
.Scan(); lineno
++ {
96 line
:= scanner
.Text()
99 t
.Fatalf("%s:%d: unexpected blank line", file
, lineno
)
102 case 'A' <= line
[0] && line
[0] <= 'Z':
106 case line
== "strings":
109 case line
== "regexps":
112 q
, err
:= strconv
.Unquote(line
)
114 // Fatal because we'll get out of sync.
115 t
.Fatalf("%s:%d: unquote %s: %v", file
, lineno
, line
, err
)
123 t
.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file
, lineno
, len(input
), q
)
125 re
, err
= tryCompile(q
)
127 if err
.Error() == "error parsing regexp: invalid escape sequence: `\\C`" {
128 // We don't and likely never will support \C; keep going.
131 t
.Errorf("%s:%d: compile %#q: %v", file
, lineno
, q
, err
)
132 if nfail
++; nfail
>= 100 {
133 t
.Fatalf("stopping after %d errors", nfail
)
137 full
:= `\A(?:` + q
+ `)\z`
138 refull
, err
= tryCompile(full
)
140 // Fatal because q worked, so this should always work.
141 t
.Fatalf("%s:%d: compile full %#q: %v", file
, lineno
, full
, err
)
144 case line
[0] == '-' ||
'0' <= line
[0] && line
[0] <= '9':
145 // A sequence of match results.
148 // Failed to compile: skip results.
152 t
.Fatalf("%s:%d: out of sync: no input remaining", file
, lineno
)
155 text
, input
= input
[0], input
[1:]
156 if !isSingleBytes(text
) && strings
.Contains(re
.String(), `\B`) {
157 // RE2's \B considers every byte position,
158 // so it sees 'not word boundary' in the
159 // middle of UTF-8 sequences. This package
160 // only considers the positions between runes,
161 // so it disagrees. Skip those cases.
164 res
:= strings
.Split(line
, ";")
165 if len(res
) != len(run
) {
166 t
.Fatalf("%s:%d: have %d test results, want %d", file
, lineno
, len(res
), len(run
))
169 have
, suffix
:= run
[i
](re
, refull
, text
)
170 want
:= parseResult(t
, file
, lineno
, res
[i
])
171 if !same(have
, want
) {
172 t
.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file
, lineno
, re
, suffix
, text
, have
, want
)
173 if nfail
++; nfail
>= 100 {
174 t
.Fatalf("stopping after %d errors", nfail
)
178 b
, suffix
:= match
[i
](re
, refull
, text
)
179 if b
!= (want
!= nil) {
180 t
.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file
, lineno
, re
, suffix
, text
, b
, !b
)
181 if nfail
++; nfail
>= 100 {
182 t
.Fatalf("stopping after %d errors", nfail
)
189 t
.Fatalf("%s:%d: out of sync: %s\n", file
, lineno
, line
)
192 if err
:= scanner
.Err(); err
!= nil {
193 t
.Fatalf("%s:%d: %v", file
, lineno
, err
)
196 t
.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file
, lineno
, len(input
))
198 t
.Logf("%d cases tested", ncase
)
201 var run
= []func(*Regexp
, *Regexp
, string) ([]int, string){
208 func runFull(re
, refull
*Regexp
, text
string) ([]int, string) {
209 refull
.longest
= false
210 return refull
.FindStringSubmatchIndex(text
), "[full]"
213 func runPartial(re
, refull
*Regexp
, text
string) ([]int, string) {
215 return re
.FindStringSubmatchIndex(text
), ""
218 func runFullLongest(re
, refull
*Regexp
, text
string) ([]int, string) {
219 refull
.longest
= true
220 return refull
.FindStringSubmatchIndex(text
), "[full,longest]"
223 func runPartialLongest(re
, refull
*Regexp
, text
string) ([]int, string) {
225 return re
.FindStringSubmatchIndex(text
), "[longest]"
228 var match
= []func(*Regexp
, *Regexp
, string) (bool, string){
235 func matchFull(re
, refull
*Regexp
, text
string) (bool, string) {
236 refull
.longest
= false
237 return refull
.MatchString(text
), "[full]"
240 func matchPartial(re
, refull
*Regexp
, text
string) (bool, string) {
242 return re
.MatchString(text
), ""
245 func matchFullLongest(re
, refull
*Regexp
, text
string) (bool, string) {
246 refull
.longest
= true
247 return refull
.MatchString(text
), "[full,longest]"
250 func matchPartialLongest(re
, refull
*Regexp
, text
string) (bool, string) {
252 return re
.MatchString(text
), "[longest]"
255 func isSingleBytes(s
string) bool {
256 for _
, c
:= range s
{
257 if c
>= utf8
.RuneSelf
{
264 func tryCompile(s
string) (re
*Regexp
, err error
) {
265 // Protect against panic during Compile.
267 if r
:= recover(); r
!= nil {
268 err
= fmt
.Errorf("panic: %v", r
)
274 func parseResult(t
*testing
.T
, file
string, lineno
int, res
string) []int {
275 // A single - indicates no match.
279 // Otherwise, a space-separated list of pairs.
281 for j
:= 0; j
< len(res
); j
++ {
286 out
:= make([]int, 2*n
)
289 for j
:= 0; j
<= len(res
); j
++ {
290 if j
== len(res
) || res
[j
] == ' ' {
291 // Process a single pair. - means no submatch.
297 k
:= strings
.Index(pair
, "-")
299 t
.Fatalf("%s:%d: invalid pair %s", file
, lineno
, pair
)
301 lo
, err1
:= strconv
.Atoi(pair
[:k
])
302 hi
, err2
:= strconv
.Atoi(pair
[k
+1:])
303 if err1
!= nil || err2
!= nil || lo
> hi
{
304 t
.Fatalf("%s:%d: invalid pair %s", file
, lineno
, pair
)
316 func same(x
, y
[]int) bool {
317 if len(x
) != len(y
) {
320 for i
, xi
:= range x
{
328 // TestFowler runs this package's regexp API against the
329 // POSIX regular expression tests collected by Glenn Fowler
330 // at http://www2.research.att.com/~astopen/testregex/testregex.html.
331 func TestFowler(t
*testing
.T
) {
332 files
, err
:= filepath
.Glob("testdata/*.dat")
336 for _
, file
:= range files
{
342 var notab
= MustCompilePOSIX(`[^\t]+`)
344 func testFowler(t
*testing
.T
, file
string) {
345 f
, err
:= os
.Open(file
)
351 b
:= bufio
.NewReader(f
)
357 line
, err
:= b
.ReadString('\n')
360 t
.Errorf("%s:%d: %v", file
, lineno
, err
)
365 // http://www2.research.att.com/~astopen/man/man1/testregex.html
368 // Input lines may be blank, a comment beginning with #, or a test
369 // specification. A specification is five fields separated by one
370 // or more tabs. NULL denotes the empty string and NIL denotes the
372 if line
[0] == '#' || line
[0] == '\n' {
375 line
= line
[:len(line
)-1]
376 field
:= notab
.FindAllString(line
, -1)
377 for i
, f
:= range field
{
382 t
.Logf("%s:%d: skip: %s", file
, lineno
, line
)
390 // Field 1: the regex(3) flags to apply, one character per REG_feature
391 // flag. The test is skipped if REG_feature is not supported by the
392 // implementation. If the first character is not [BEASKLP] then the
393 // specification is a global control line. One or more of [BEASKLP] may be
394 // specified; the test will be repeated for each mode.
396 // B basic BRE (grep, ed, sed)
397 // E REG_EXTENDED ERE (egrep)
398 // A REG_AUGMENTED ARE (egrep with negation)
399 // S REG_SHELL SRE (sh glob)
400 // K REG_SHELL|REG_AUGMENTED KRE (ksh glob)
401 // L REG_LITERAL LRE (fgrep)
403 // a REG_LEFT|REG_RIGHT implicit ^...$
404 // b REG_NOTBOL lhs does not match ^
405 // c REG_COMMENT ignore space and #...\n
406 // d REG_SHELL_DOT explicit leading . match
407 // e REG_NOTEOL rhs does not match $
408 // f REG_MULTIPLE multiple \n separated patterns
409 // g FNM_LEADING_DIR testfnmatch only -- match until /
410 // h REG_MULTIREF multiple digit backref
411 // i REG_ICASE ignore case
412 // j REG_SPAN . matches \n
413 // k REG_ESCAPE \ to escape [...] delimiter
414 // l REG_LEFT implicit ^...
415 // m REG_MINIMAL minimal match
416 // n REG_NEWLINE explicit \n match
417 // o REG_ENCLOSED (|&) magic inside [@|&](...)
418 // p REG_SHELL_PATH explicit / match
419 // q REG_DELIMITED delimited pattern
420 // r REG_RIGHT implicit ...$
421 // s REG_SHELL_ESCAPED \ not special
422 // t REG_MUSTDELIM all delimiters must be specified
423 // u standard unspecified behavior -- errors not counted
424 // v REG_CLASS_ESCAPE \ special inside [...]
425 // w REG_NOSUB no subexpression match array
426 // x REG_LENIENT let some errors slide
427 // y REG_LEFT regexec() implicit ^...
428 // z REG_NULL NULL subexpressions ok
429 // $ expand C \c escapes in fields 2 and 3
430 // / field 2 is a regsubcomp() expression
431 // = field 3 is a regdecomp() expression
433 // Field 1 control lines:
435 // C set LC_COLLATE and LC_CTYPE to locale in field 2
437 // ?test ... output field 5 if passed and != EXPECTED, silent otherwise
438 // &test ... output field 5 if current and previous passed
439 // |test ... output field 5 if current passed and previous failed
440 // ; ... output field 2 if previous failed
441 // {test ... skip if failed until }
444 // : comment comment copied as output NOTE
445 // :comment:test :comment: ignored
446 // N[OTE] comment comment copied as output NOTE
447 // T[EST] comment comment
449 // number use number for nmatch (20 by default)
452 case '?', '&', '|', ';', '{', '}':
453 // Ignore all the control operators.
454 // Just run everything.
460 i
:= strings
.Index(flag
[1:], ":")
462 t
.Logf("skip: %s", line
)
466 case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
467 t
.Logf("skip: %s", line
)
471 // Can check field count now that we've handled the myriad comment formats.
473 t
.Errorf("%s:%d: too few fields: %s", file
, lineno
, line
)
477 // Expand C escapes (a.k.a. Go escapes).
478 if strings
.Contains(flag
, "$") {
479 f
:= `"` + field
[1] + `"`
480 if field
[1], err
= strconv
.Unquote(f
); err
!= nil {
481 t
.Errorf("%s:%d: cannot unquote %s", file
, lineno
, f
)
483 f
= `"` + field
[2] + `"`
484 if field
[2], err
= strconv
.Unquote(f
); err
!= nil {
485 t
.Errorf("%s:%d: cannot unquote %s", file
, lineno
, f
)
489 // Field 2: the regular expression pattern; SAME uses the pattern from
490 // the previous specification.
492 if field
[1] == "SAME" {
493 field
[1] = lastRegexp
495 lastRegexp
= field
[1]
497 // Field 3: the string to match.
500 // Field 4: the test outcome...
501 ok
, shouldCompile
, shouldMatch
, pos
:= parseFowlerResult(field
[3])
503 t
.Errorf("%s:%d: cannot parse result %#q", file
, lineno
, field
[3])
507 // Field 5: optional comment appended to the report.
510 // Run test once for each specified capital letter mode that we support.
511 for _
, c
:= range flag
{
513 syn
:= syntax
.POSIX | syntax
.ClassNL
518 // extended regexp (what we support)
521 pattern
= QuoteMeta(pattern
)
524 for _
, c
:= range flag
{
527 syn |
= syntax
.FoldCase
531 re
, err
:= compile(pattern
, syn
, true)
534 t
.Errorf("%s:%d: %#q did not compile", file
, lineno
, pattern
)
539 t
.Errorf("%s:%d: %#q should not compile", file
, lineno
, pattern
)
542 match
:= re
.MatchString(text
)
543 if match
!= shouldMatch
{
544 t
.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file
, lineno
, pattern
, text
, match
, shouldMatch
)
547 have
:= re
.FindStringSubmatchIndex(text
)
548 if (len(have
) > 0) != match
{
549 t
.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file
, lineno
, pattern
, text
, match
, pattern
, text
, have
)
552 if len(have
) > len(pos
) {
553 have
= have
[:len(pos
)]
555 if !same(have
, pos
) {
556 t
.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file
, lineno
, pattern
, text
, have
, pos
)
562 func parseFowlerResult(s
string) (ok
, compiled
, matched
bool, pos
[]int) {
563 // Field 4: the test outcome. This is either one of the posix error
564 // codes (with REG_ omitted) or the match array, a list of (m,n)
565 // entries with m and n being first and last+1 positions in the
566 // field 3 string, or NULL if REG_NOSUB is in effect and success
567 // is expected. BADPAT is acceptable in place of any regcomp(3)
568 // error code. The match[] array is initialized to (-2,-2) before
569 // each test. All array elements from 0 to nmatch-1 must be specified
570 // in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
571 // Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
572 // matched (?{...}) expression, where x is the text enclosed by {...},
573 // o is the expression ordinal counting from 1, and n is the length of
574 // the unmatched portion of the subject string. If x starts with a
575 // number then that is the return value of re_execf(), otherwise 0 is
579 // Match with no position information.
590 case 'A' <= s
[0] && s
[0] <= 'Z':
591 // All the other error codes are compile errors.
610 for i
< len(s
) && s
[i
] != end
{
613 if i
== 0 || i
== len(s
) {
620 v
, err
= strconv
.Atoi(s
[:i
])
641 func makeText(n
int) []byte {
645 text
= make([]byte, n
)
647 for i
:= range text
{
656 text
[i
] = byte(x
%(0x7E+1-0x20) + 0x20)
662 func BenchmarkMatch(b
*testing
.B
) {
663 isRaceBuilder
:= strings
.HasSuffix(testenv
.Builder(), "-race")
665 for _
, data
:= range benchData
{
666 r
:= MustCompile(data
.re
)
667 for _
, size
:= range benchSizes
{
668 if isRaceBuilder
&& size
.n
> 1<<10 {
671 t
:= makeText(size
.n
)
672 b
.Run(data
.name
+"/"+size
.name
, func(b
*testing
.B
) {
673 b
.SetBytes(int64(size
.n
))
674 for i
:= 0; i
< b
.N
; i
++ {
684 func BenchmarkMatch_onepass_regex(b
*testing
.B
) {
685 isRaceBuilder
:= strings
.HasSuffix(testenv
.Builder(), "-race")
686 r
:= MustCompile(`(?s)\A.*\z`)
687 if r
.get().op
== notOnePass
{
688 b
.Fatalf("want onepass regex, but %q is not onepass", r
)
690 for _
, size
:= range benchSizes
{
691 if isRaceBuilder
&& size
.n
> 1<<10 {
694 t
:= makeText(size
.n
)
695 bs
:= make([][]byte, len(t
))
696 for i
, s
:= range t
{
699 b
.Run(size
.name
, func(b
*testing
.B
) {
700 b
.SetBytes(int64(size
.n
))
702 for i
:= 0; i
< b
.N
; i
++ {
703 for _
, byts
:= range bs
{
705 b
.Fatal("not match!")
// benchData is the table of named patterns that BenchmarkMatch compiles
// and runs; name becomes the leading component of the sub-benchmark name
// and re is the expression handed to MustCompile.
var benchData = []struct {
	name string
	re   string
}{
	{name: "Easy0", re: "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"},
	{name: "Easy0i", re: "(?i)ABCDEFGHIJklmnopqrstuvwxyz$"},
	{name: "Easy1", re: "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"},
	{name: "Medium", re: "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"},
	{name: "Hard", re: "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"},
	{name: "Hard1", re: "ABCD|CDEF|EFGH|GHIJ|IJKL|KLMN|MNOP|OPQR|QRST|STUV|UVWX|WXYZ"},
}
722 var benchSizes
= []struct {
733 func TestLongest(t
*testing
.T
) {
734 re
, err
:= Compile(`a(|b)`)
738 if g
, w
:= re
.FindString("ab"), "a"; g
!= w
{
739 t
.Errorf("first match was %q, want %q", g
, w
)
742 if g
, w
:= re
.FindString("ab"), "ab"; g
!= w
{
743 t
.Errorf("longest match was %q, want %q", g
, w
)
747 // TestProgramTooLongForBacktrack tests that a regex which is too long
748 // for the backtracker still executes properly.
749 func TestProgramTooLongForBacktrack(t
*testing
.T
) {
750 longRegex
:= MustCompile(`(one|two|three|four|five|six|seven|eight|nine|ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|twenty|twentyone|twentytwo|twentythree|twentyfour|twentyfive|twentysix|twentyseven|twentyeight|twentynine|thirty|thirtyone|thirtytwo|thirtythree|thirtyfour|thirtyfive|thirtysix|thirtyseven|thirtyeight|thirtynine|forty|fortyone|fortytwo|fortythree|fortyfour|fortyfive|fortysix|fortyseven|fortyeight|fortynine|fifty|fiftyone|fiftytwo|fiftythree|fiftyfour|fiftyfive|fiftysix|fiftyseven|fiftyeight|fiftynine|sixty|sixtyone|sixtytwo|sixtythree|sixtyfour|sixtyfive|sixtysix|sixtyseven|sixtyeight|sixtynine|seventy|seventyone|seventytwo|seventythree|seventyfour|seventyfive|seventysix|seventyseven|seventyeight|seventynine|eighty|eightyone|eightytwo|eightythree|eightyfour|eightyfive|eightysix|eightyseven|eightyeight|eightynine|ninety|ninetyone|ninetytwo|ninetythree|ninetyfour|ninetyfive|ninetysix|ninetyseven|ninetyeight|ninetynine|onehundred)`)
751 if !longRegex
.MatchString("two") {
752 t
.Errorf("longRegex.MatchString(\"two\") was false, want true")
754 if longRegex
.MatchString("xxx") {
755 t
.Errorf("longRegex.MatchString(\"xxx\") was true, want false")