1 // Copyright 2010 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
24 // TestRE2 tests this package's regexp API against test cases
25 // considered during RE2's exhaustive tests, which run all possible
26 // regexps over a given set of atoms and operators, up to a given
27 // complexity, over all possible strings over a given alphabet,
28 // up to a given size. Rather than try to link with RE2, we read a
29 // log file containing the test cases and the expected matches.
30 // The log file, re2.txt, is generated by running 'make exhaustive-log'
31 // in the open source RE2 distribution. http://code.google.com/p/re2/
33 // The test file format is a sequence of stanzas like:
42 // "([0-9])([0-9])([0-9])"
46 // The stanza begins by defining a set of strings, quoted
47 // using Go double-quote syntax, one per line. Then the
48 // regexps section gives a sequence of regexps to run on
49 // the strings. In the block that follows a regexp, each line
50 // gives the semicolon-separated match results of running
51 // the regexp on the corresponding string.
52 // Each match result is either a single -, meaning no match, or a
53 // space-separated sequence of pairs giving the match and
54 // submatch indices. An unmatched subexpression formats
55 // its pair as a single - (not illustrated above). For now
56 // each regexp run produces two match results, one for a
57 // ``full match'' that restricts the regexp to matching the entire
58 // string or nothing, and one for a ``partial match'' that gives
59 // the leftmost first match found in the string.
61 // Lines beginning with # are comments. Lines beginning with
62 // a capital letter are test names printed during RE2's test suite
63 // and are echoed into t but otherwise ignored.
65 // At time of writing, re2.txt is 32 MB but compresses to 760 kB,
66 // so we store re2.txt.gz in the repository and decompress it on the fly.
68 func TestRE2Search(t
*testing
.T
) {
69 testRE2(t
, "testdata/re2-search.txt")
72 func TestRE2Exhaustive(t
*testing
.T
) {
74 t
.Log("skipping TestRE2Exhaustive during short test")
77 testRE2(t
, "testdata/re2-exhaustive.txt.bz2")
80 func testRE2(t
*testing
.T
, file
string) {
81 f
, err
:= os
.Open(file
)
87 if strings
.HasSuffix(file
, ".bz2") {
88 z
:= bzip2
.NewReader(f
)
90 file
= file
[:len(file
)-len(".bz2")] // for error messages
95 r
:= bufio
.NewReader(txt
)
106 line
, err
:= r
.ReadString('\n')
111 t
.Fatalf("%s:%d: %v", file
, lineno
, err
)
113 line
= line
[:len(line
)-1] // chop \n
117 t
.Fatalf("%s:%d: unexpected blank line", file
, lineno
)
120 case 'A' <= line
[0] && line
[0] <= 'Z':
124 case line
== "strings":
127 case line
== "regexps":
130 q
, err
:= strconv
.Unquote(line
)
132 // Fatal because we'll get out of sync.
133 t
.Fatalf("%s:%d: unquote %s: %v", file
, lineno
, line
, err
)
141 t
.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file
, lineno
, len(input
), q
)
143 re
, err
= tryCompile(q
)
145 if err
.Error() == "error parsing regexp: invalid escape sequence: `\\C`" {
146 // We don't and likely never will support \C; keep going.
149 t
.Errorf("%s:%d: compile %#q: %v", file
, lineno
, q
, err
)
150 if nfail
++; nfail
>= 100 {
151 t
.Fatalf("stopping after %d errors", nfail
)
155 full
:= `\A(?:` + q
+ `)\z`
156 refull
, err
= tryCompile(full
)
158 // Fatal because q worked, so this should always work.
159 t
.Fatalf("%s:%d: compile full %#q: %v", file
, lineno
, full
, err
)
162 case line
[0] == '-' ||
'0' <= line
[0] && line
[0] <= '9':
163 // A sequence of match results.
166 // Failed to compile: skip results.
170 t
.Fatalf("%s:%d: out of sync: no input remaining", file
, lineno
)
173 text
, input
= input
[0], input
[1:]
174 if !isSingleBytes(text
) && strings
.Contains(re
.String(), `\B`) {
175 // RE2's \B considers every byte position,
176 // so it sees 'not word boundary' in the
177 // middle of UTF-8 sequences. This package
178 // only considers the positions between runes,
179 // so it disagrees. Skip those cases.
182 res
:= strings
.Split(line
, ";")
183 if len(res
) != len(run
) {
184 t
.Fatalf("%s:%d: have %d test results, want %d", file
, lineno
, len(res
), len(run
))
187 have
, suffix
:= run
[i
](re
, refull
, text
)
188 want
:= parseResult(t
, file
, lineno
, res
[i
])
189 if !same(have
, want
) {
190 t
.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file
, lineno
, re
, suffix
, text
, have
, want
)
191 if nfail
++; nfail
>= 100 {
192 t
.Fatalf("stopping after %d errors", nfail
)
196 b
, suffix
:= match
[i
](re
, refull
, text
)
197 if b
!= (want
!= nil) {
198 t
.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file
, lineno
, re
, suffix
, text
, b
, !b
)
199 if nfail
++; nfail
>= 100 {
200 t
.Fatalf("stopping after %d errors", nfail
)
207 t
.Fatalf("%s:%d: out of sync: %s\n", file
, lineno
, line
)
211 t
.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file
, lineno
, len(input
))
213 t
.Logf("%d cases tested", ncase
)
216 var run
= []func(*Regexp
, *Regexp
, string) ([]int, string){
223 func runFull(re
, refull
*Regexp
, text
string) ([]int, string) {
224 refull
.SetLongest(false)
225 return refull
.FindStringSubmatchIndex(text
), "[full]"
228 func runPartial(re
, refull
*Regexp
, text
string) ([]int, string) {
230 return re
.FindStringSubmatchIndex(text
), ""
233 func runFullLongest(re
, refull
*Regexp
, text
string) ([]int, string) {
234 refull
.SetLongest(true)
235 return refull
.FindStringSubmatchIndex(text
), "[full,longest]"
238 func runPartialLongest(re
, refull
*Regexp
, text
string) ([]int, string) {
240 return re
.FindStringSubmatchIndex(text
), "[longest]"
243 var match
= []func(*Regexp
, *Regexp
, string) (bool, string){
250 func matchFull(re
, refull
*Regexp
, text
string) (bool, string) {
251 refull
.SetLongest(false)
252 return refull
.MatchString(text
), "[full]"
255 func matchPartial(re
, refull
*Regexp
, text
string) (bool, string) {
257 return re
.MatchString(text
), ""
260 func matchFullLongest(re
, refull
*Regexp
, text
string) (bool, string) {
261 refull
.SetLongest(true)
262 return refull
.MatchString(text
), "[full,longest]"
265 func matchPartialLongest(re
, refull
*Regexp
, text
string) (bool, string) {
267 return re
.MatchString(text
), "[longest]"
270 func isSingleBytes(s
string) bool {
271 for _
, c
:= range s
{
272 if c
>= utf8
.RuneSelf
{
279 func tryCompile(s
string) (re
*Regexp
, err error
) {
280 // Protect against panic during Compile.
282 if r
:= recover(); r
!= nil {
283 err
= fmt
.Errorf("panic: %v", r
)
289 func parseResult(t
*testing
.T
, file
string, lineno
int, res
string) []int {
290 // A single - indicates no match.
294 // Otherwise, a space-separated list of pairs.
296 for j
:= 0; j
< len(res
); j
++ {
301 out
:= make([]int, 2*n
)
304 for j
:= 0; j
<= len(res
); j
++ {
305 if j
== len(res
) || res
[j
] == ' ' {
306 // Process a single pair. - means no submatch.
312 k
:= strings
.Index(pair
, "-")
314 t
.Fatalf("%s:%d: invalid pair %s", file
, lineno
, pair
)
316 lo
, err1
:= strconv
.Atoi(pair
[:k
])
317 hi
, err2
:= strconv
.Atoi(pair
[k
+1:])
318 if err1
!= nil || err2
!= nil || lo
> hi
{
319 t
.Fatalf("%s:%d: invalid pair %s", file
, lineno
, pair
)
331 func same(x
, y
[]int) bool {
332 if len(x
) != len(y
) {
335 for i
, xi
:= range x
{
343 // TestFowler runs this package's regexp API against the
344 // POSIX regular expression tests collected by Glenn Fowler
345 // at http://www2.research.att.com/~gsf/testregex/.
346 func TestFowler(t
*testing
.T
) {
347 files
, err
:= filepath
.Glob("testdata/*.dat")
351 for _
, file
:= range files
{
357 var notab
= MustCompilePOSIX(`[^\t]+`)
359 func testFowler(t
*testing
.T
, file
string) {
360 f
, err
:= os
.Open(file
)
366 b
:= bufio
.NewReader(f
)
372 line
, err
:= b
.ReadString('\n')
375 t
.Errorf("%s:%d: %v", file
, lineno
, err
)
380 // http://www2.research.att.com/~gsf/man/man1/testregex.html
383 // Input lines may be blank, a comment beginning with #, or a test
384 // specification. A specification is five fields separated by one
385 // or more tabs. NULL denotes the empty string and NIL denotes the
387 if line
[0] == '#' || line
[0] == '\n' {
390 line
= line
[:len(line
)-1]
391 field
:= notab
.FindAllString(line
, -1)
392 for i
, f
:= range field
{
397 t
.Logf("%s:%d: skip: %s", file
, lineno
, line
)
405 // Field 1: the regex(3) flags to apply, one character per REG_feature
406 // flag. The test is skipped if REG_feature is not supported by the
407 // implementation. If the first character is not [BEASKLP] then the
408 // specification is a global control line. One or more of [BEASKLP] may be
409 // specified; the test will be repeated for each mode.
411 // B basic BRE (grep, ed, sed)
412 // E REG_EXTENDED ERE (egrep)
413 // A REG_AUGMENTED ARE (egrep with negation)
414 // S REG_SHELL SRE (sh glob)
415 // K REG_SHELL|REG_AUGMENTED KRE (ksh glob)
416 // L REG_LITERAL LRE (fgrep)
418 // a REG_LEFT|REG_RIGHT implicit ^...$
419 // b REG_NOTBOL lhs does not match ^
420 // c REG_COMMENT ignore space and #...\n
421 // d REG_SHELL_DOT explicit leading . match
422 // e REG_NOTEOL rhs does not match $
423 // f REG_MULTIPLE multiple \n separated patterns
424 // g FNM_LEADING_DIR testfnmatch only -- match until /
425 // h REG_MULTIREF multiple digit backref
426 // i REG_ICASE ignore case
427 // j REG_SPAN . matches \n
428 // k REG_ESCAPE \ to ecape [...] delimiter
429 // l REG_LEFT implicit ^...
430 // m REG_MINIMAL minimal match
431 // n REG_NEWLINE explicit \n match
432 // o REG_ENCLOSED (|&) magic inside [@|&](...)
433 // p REG_SHELL_PATH explicit / match
434 // q REG_DELIMITED delimited pattern
435 // r REG_RIGHT implicit ...$
436 // s REG_SHELL_ESCAPED \ not special
437 // t REG_MUSTDELIM all delimiters must be specified
438 // u standard unspecified behavior -- errors not counted
439 // v REG_CLASS_ESCAPE \ special inside [...]
440 // w REG_NOSUB no subexpression match array
441 // x REG_LENIENT let some errors slide
442 // y REG_LEFT regexec() implicit ^...
443 // z REG_NULL NULL subexpressions ok
444 // $ expand C \c escapes in fields 2 and 3
445 // / field 2 is a regsubcomp() expression
446 // = field 3 is a regdecomp() expression
448 // Field 1 control lines:
450 // C set LC_COLLATE and LC_CTYPE to locale in field 2
452 // ?test ... output field 5 if passed and != EXPECTED, silent otherwise
453 // &test ... output field 5 if current and previous passed
454 // |test ... output field 5 if current passed and previous failed
455 // ; ... output field 2 if previous failed
456 // {test ... skip if failed until }
459 // : comment comment copied as output NOTE
460 // :comment:test :comment: ignored
461 // N[OTE] comment comment copied as output NOTE
462 // T[EST] comment comment
464 // number use number for nmatch (20 by default)
467 case '?', '&', '|', ';', '{', '}':
468 // Ignore all the control operators.
469 // Just run everything.
475 i
:= strings
.Index(flag
[1:], ":")
477 t
.Logf("skip: %s", line
)
481 case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
482 t
.Logf("skip: %s", line
)
486 // Can check field count now that we've handled the myriad comment formats.
488 t
.Errorf("%s:%d: too few fields: %s", file
, lineno
, line
)
492 // Expand C escapes (a.k.a. Go escapes).
493 if strings
.Contains(flag
, "$") {
494 f
:= `"` + field
[1] + `"`
495 if field
[1], err
= strconv
.Unquote(f
); err
!= nil {
496 t
.Errorf("%s:%d: cannot unquote %s", file
, lineno
, f
)
498 f
= `"` + field
[2] + `"`
499 if field
[2], err
= strconv
.Unquote(f
); err
!= nil {
500 t
.Errorf("%s:%d: cannot unquote %s", file
, lineno
, f
)
504 // Field 2: the regular expression pattern; SAME uses the pattern from
505 // the previous specification.
507 if field
[1] == "SAME" {
508 field
[1] = lastRegexp
510 lastRegexp
= field
[1]
512 // Field 3: the string to match.
515 // Field 4: the test outcome...
516 ok
, shouldCompile
, shouldMatch
, pos
:= parseFowlerResult(field
[3])
518 t
.Errorf("%s:%d: cannot parse result %#q", file
, lineno
, field
[3])
522 // Field 5: optional comment appended to the report.
525 // Run test once for each specified capital letter mode that we support.
526 for _
, c
:= range flag
{
528 syn
:= syntax
.POSIX | syntax
.ClassNL
533 // extended regexp (what we support)
536 pattern
= QuoteMeta(pattern
)
539 for _
, c
:= range flag
{
542 syn |
= syntax
.FoldCase
546 re
, err
:= CompileInternal(pattern
, syn
, true)
549 t
.Errorf("%s:%d: %#q did not compile", file
, lineno
, pattern
)
554 t
.Errorf("%s:%d: %#q should not compile", file
, lineno
, pattern
)
557 match
:= re
.MatchString(text
)
558 if match
!= shouldMatch
{
559 t
.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file
, lineno
, pattern
, text
, match
, shouldMatch
)
562 have
:= re
.FindStringSubmatchIndex(text
)
563 if (len(have
) > 0) != match
{
564 t
.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file
, lineno
, pattern
, text
, match
, pattern
, text
, have
)
567 if len(have
) > len(pos
) {
568 have
= have
[:len(pos
)]
570 if !same(have
, pos
) {
571 t
.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file
, lineno
, pattern
, text
, have
, pos
)
577 func parseFowlerResult(s
string) (ok
, compiled
, matched
bool, pos
[]int) {
578 // Field 4: the test outcome. This is either one of the posix error
579 // codes (with REG_ omitted) or the match array, a list of (m,n)
580 // entries with m and n being first and last+1 positions in the
581 // field 3 string, or NULL if REG_NOSUB is in effect and success
582 // is expected. BADPAT is acceptable in place of any regcomp(3)
583 // error code. The match[] array is initialized to (-2,-2) before
584 // each test. All array elements from 0 to nmatch-1 must be specified
585 // in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
586 // Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
587 // matched (?{...}) expression, where x is the text enclosed by {...},
588 // o is the expression ordinal counting from 1, and n is the length of
589 // the unmatched portion of the subject string. If x starts with a
590 // number then that is the return value of re_execf(), otherwise 0 is
594 // Match with no position information.
605 case 'A' <= s
[0] && s
[0] <= 'Z':
606 // All the other error codes are compile errors.
625 for i
< len(s
) && s
[i
] != end
{
628 if i
== 0 || i
== len(s
) {
635 v
, err
= strconv
.Atoi(s
[:i
])
656 func makeText(n
int) []byte {
660 text
= make([]byte, n
)
661 for i
:= range text
{
662 if rand
.Intn(30) == 0 {
665 text
[i
] = byte(rand
.Intn(0x7E+1-0x20) + 0x20)
671 func benchmark(b
*testing
.B
, re
string, n
int) {
676 for i
:= 0; i
< b
.N
; i
++ {
684 easy0
= "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
685 easy1
= "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
686 medium
= "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
687 hard
= "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
688 parens
= "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" +
689 "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
692 func BenchmarkMatchEasy0_32(b
*testing
.B
) { benchmark(b
, easy0
, 32<<0) }
693 func BenchmarkMatchEasy0_1K(b
*testing
.B
) { benchmark(b
, easy0
, 1<<10) }
694 func BenchmarkMatchEasy0_32K(b
*testing
.B
) { benchmark(b
, easy0
, 32<<10) }
695 func BenchmarkMatchEasy0_1M(b
*testing
.B
) { benchmark(b
, easy0
, 1<<20) }
696 func BenchmarkMatchEasy0_32M(b
*testing
.B
) { benchmark(b
, easy0
, 32<<20) }
697 func BenchmarkMatchEasy1_32(b
*testing
.B
) { benchmark(b
, easy1
, 32<<0) }
698 func BenchmarkMatchEasy1_1K(b
*testing
.B
) { benchmark(b
, easy1
, 1<<10) }
699 func BenchmarkMatchEasy1_32K(b
*testing
.B
) { benchmark(b
, easy1
, 32<<10) }
700 func BenchmarkMatchEasy1_1M(b
*testing
.B
) { benchmark(b
, easy1
, 1<<20) }
701 func BenchmarkMatchEasy1_32M(b
*testing
.B
) { benchmark(b
, easy1
, 32<<20) }
702 func BenchmarkMatchMedium_32(b
*testing
.B
) { benchmark(b
, medium
, 1<<0) }
703 func BenchmarkMatchMedium_1K(b
*testing
.B
) { benchmark(b
, medium
, 1<<10) }
704 func BenchmarkMatchMedium_32K(b
*testing
.B
) { benchmark(b
, medium
, 32<<10) }
705 func BenchmarkMatchMedium_1M(b
*testing
.B
) { benchmark(b
, medium
, 1<<20) }
706 func BenchmarkMatchMedium_32M(b
*testing
.B
) { benchmark(b
, medium
, 32<<20) }
707 func BenchmarkMatchHard_32(b
*testing
.B
) { benchmark(b
, hard
, 32<<0) }
708 func BenchmarkMatchHard_1K(b
*testing
.B
) { benchmark(b
, hard
, 1<<10) }
709 func BenchmarkMatchHard_32K(b
*testing
.B
) { benchmark(b
, hard
, 32<<10) }
710 func BenchmarkMatchHard_1M(b
*testing
.B
) { benchmark(b
, hard
, 1<<20) }
711 func BenchmarkMatchHard_32M(b
*testing
.B
) { benchmark(b
, hard
, 32<<20) }