1 ;;; regex-tests.el --- tests for regex.c functions -*- lexical-binding: t -*-
3 ;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 ;; This file is part of GNU Emacs.
7 ;; GNU Emacs is free software: you can redistribute it and/or modify
8 ;; it under the terms of the GNU General Public License as published by
9 ;; the Free Software Foundation, either version 3 of the License, or
10 ;; (at your option) any later version.
12 ;; GNU Emacs is distributed in the hope that it will be useful,
13 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;; GNU General Public License for more details.
17 ;; You should have received a copy of the GNU General Public License
18 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
24 (defvar regex-tests--resources-dir
25 (concat (concat (file-name-directory (or load-file-name buffer-file-name
))
27 "Path to regex-resources directory next to the \"regex-tests.el\" file.")
29 (ert-deftest regex-word-cc-fallback-test
()
30 "Test that ‘[[:cc:]]*x’ matches ‘x’ (bug#24020).
32 Test that a regex of the form \"[[:cc:]]*x\" where CC is
33 a character class which matches a multibyte character X, matches
36 For example, ‘[[:word:]]*\u2620’ regex (note: \u2620 is a word
37 character) must match a string \"\u2420\"."
38 (dolist (class '("[[:word:]]" "\\sw"))
39 (dolist (repeat '("*" "+"))
40 (dolist (suffix '("" "b" "bar" "\u2620"))
41 (dolist (string '("" "foo"))
42 (when (not (and (string-equal repeat
"+")
43 (string-equal string
"")))
44 (should (string-match (concat "^" class repeat suffix
"$")
45 (concat string suffix
)))))))))
47 (defun regex--test-cc (name matching not-matching
)
48 (let (case-fold-search)
49 (should (string-match-p (concat "^[[:" name
":]]*$") matching
))
50 (should (string-match-p (concat "^[[:" name
":]]*?\u2622$")
51 (concat matching
"\u2622")))
52 (should (string-match-p (concat "^[^[:" name
":]]*$") not-matching
))
53 (should (string-match-p (concat "^[^[:" name
":]]*\u2622$")
54 (concat not-matching
"\u2622")))
59 (goto-char (point-min))
60 (skip-chars-forward (concat "[:" name
":]"))
61 (should (equal (point) p
))
62 (skip-chars-forward (concat "^[:" name
":]"))
63 (should (equal (point) (point-max)))
64 (goto-char (point-min))
65 (skip-chars-forward (concat "[:" name
":]\u2622"))
66 (should (or (equal (point) p
) (equal (point) (1+ p
))))))))
68 (dolist (test '(("alnum" "abcABC012łąka" "-, \t\n")
69 ("alpha" "abcABCłąka" "-,012 \t\n")
70 ("digit" "012" "abcABCłąka-, \t\n")
71 ("xdigit" "0123aBc" "łąk-, \t\n")
72 ("upper" "ABCŁĄKA" "abc012-, \t\n")
73 ("lower" "abcłąka" "ABC012-, \t\n")
75 ("word" "abcABC012\u2620" "-, \t\n")
77 ("punct" ".,-" "abcABC012\u2620 \t\n")
78 ("cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ")
79 ("graph" "abcłąka\u2620-," " \t\n\1")
80 ("print" "abcłąka\u2620-, " "\t\n\1")
82 ("space" " \t\n\u2001" "abcABCł0123")
83 ("blank" " \t" "\n\u2001")
85 ("ascii" "abcABC012 \t\n\1" "łą\u2620")
86 ("nonascii" "łą\u2622" "abcABC012 \t\n\1")
87 ("unibyte" "abcABC012 \t\n\1" "łą\u2622")
88 ("multibyte" "łą\u2622" "abcABC012 \t\n\1")))
89 (let ((name (intern (concat "regex-tests-" (car test
) "-character-class")))
90 (doc (concat "Perform sanity test of regexes using " (car test
)
93 Go over all the supported character classes and test whether the
94 classes and their inversions match what they are supposed to
95 match. The test is done using `string-match-p' as well as
96 `skip-chars-forward'.")))
97 (eval `(ert-deftest ,name
() ,doc
,(cons 'regex--test-cc test
)) t
)))
100 (defmacro regex-tests-generic-line
(comment-char test-file whitelist
&rest body
)
101 "Reads a line of the test file TEST-FILE, skipping
102 comments (defined by COMMENT-CHAR), and evaluates the tests in
103 this line as defined in the BODY. Line numbers in the WHITELIST
104 are known failures, and are skipped."
107 (modify-syntax-entry ?_
"w;; ") ; tests expect _ to be a word
108 (insert-file-contents (concat regex-tests--resources-dir
,test-file
))
109 (let ((case-fold-search nil
)
113 (goto-char (point-min))
116 (let ((start (point)))
118 (narrow-to-region start
(point))
120 (goto-char (point-min))
126 (re-search-forward ,(concat "^[^" (string comment-char
) "]") nil t
))
128 ;; skip lines in the whitelist
129 (let ((whitelist-next
131 (aref ,whitelist whitelist-idx
) (args-out-of-range nil
))))
133 ;; whitelist exhausted. do process this line
134 ((null whitelist-next
) t
)
136 ;; we're not yet at the next whitelist element. do
138 ((< line-number whitelist-next
) t
)
140 ;; we're past the next whitelist element. This
142 ((> line-number whitelist-next
)
145 "We somehow skipped the next whitelist element: line %d" whitelist-next
)))
147 ;; we're at the next whitelist element. Skip this
148 ;; line, and advance the whitelist index
150 (setq whitelist-idx
(1+ whitelist-idx
)) nil
))))
156 (setq line-number
(1+ line-number
)))))))
158 (defun regex-tests-compare (string what-failed bounds-ref
&optional substring-ref
)
159 "I just ran a search, looking at STRING. WHAT-FAILED describes
160 what failed, if anything; valid values are 'search-failed,
161 'compilation-failed and nil. I compare the beginning/end of each
162 group with their expected values. This is done with either
163 BOUNDS-REF or SUBSTRING-REF; one of those should be non-nil.
164 BOUNDS-REF is a sequence \[start-ref0 end-ref0 start-ref1
165 end-ref1 ....] while SUBSTRING-REF is the expected substring
166 obtained by indexing the input string by start/end-ref.
168 If the search was supposed to fail then start-ref0/substring-ref0
169 is 'search-failed. If the search wasn't even supposed to compile
170 successfully, then start-ref0/substring-ref0 is
171 'compilation-failed. If I only care about a match succeeding,
172 this can be set to t.
174 This function returns a string that describes the failure, or nil
178 (and bounds-ref substring-ref
)
179 (not (or bounds-ref substring-ref
)))
180 (error "Exactly one of bounds-ref and bounds-ref should be non-nil"))
182 (let ((what-failed-ref (car (or bounds-ref substring-ref
))))
185 ((eq what-failed
'search-failed
)
187 ((eq what-failed-ref
'search-failed
)
189 ((eq what-failed-ref
'compilation-failed
)
190 "Expected pattern failure; but no match")
192 "Expected match; but no match")))
194 ((eq what-failed
'compilation-failed
)
196 ((eq what-failed-ref
'search-failed
)
197 "Expected no match; but pattern failure")
198 ((eq what-failed-ref
'compilation-failed
)
201 "Expected match; but pattern failure")))
203 ;; The regex match succeeded
204 ((eq what-failed-ref
'search-failed
)
205 "Expected no match; but match")
206 ((eq what-failed-ref
'compilation-failed
)
207 "Expected pattern failure; but match")
209 ;; The regex match succeeded, as expected. I now check all the
214 ref next-ref-function compare-ref-function mismatched-ref-function
)
218 next-ref-function
(lambda (x) (cddr x
))
219 compare-ref-function
(lambda (ref start-pos end-pos
)
221 (and (eq start-pos
(car ref
))
222 (eq end-pos
(cadr ref
)))))
223 mismatched-ref-function
(lambda (ref start-pos end-pos
)
225 "beginning/end positions: %d/%s and %d/%s"
226 start-pos
(car ref
) end-pos
(cadr ref
))))
227 (setq ref substring-ref
228 next-ref-function
(lambda (x) (cdr x
))
229 compare-ref-function
(lambda (ref start-pos end-pos
)
231 (string= (substring string start-pos end-pos
) (car ref
))))
232 mismatched-ref-function
(lambda (ref start-pos end-pos
)
234 "beginning/end positions: %d/%s and %d/%s"
235 start-pos
(car ref
) end-pos
(cadr ref
)))))
237 (while (not (or (null ref
) msg
))
239 (let ((start (match-beginning idx
))
240 (end (match-end idx
)))
242 (when (not (funcall compare-ref-function ref start end
))
245 "Have expected match, but mismatch in group %d: %s" idx
(funcall mismatched-ref-function ref start end
))))
247 (setq ref
(funcall next-ref-function ref
)
255 (defun regex-tests-match (pattern string bounds-ref
&optional substring-ref
)
256 "I match the given STRING against PATTERN. I compare the
257 beginning/end of each group with their expected values.
258 BOUNDS-REF is a sequence [start-ref0 end-ref0 start-ref1 end-ref1
261 If the search was supposed to fail then start-ref0 is
262 'search-failed. If the search wasn't even supposed to compile
263 successfully, then start-ref0 is 'compilation-failed.
265 This function returns a string that describes the failure, or nil
268 (if (string-match "\\[\\([\\.=]\\)..?\\1\\]" pattern
)
269 ;; Skipping test: [.x.] and [=x=] forms not supported by emacs
275 (if (string-match pattern string
) nil
'search-failed
)
276 ('invalid-regexp
'compilation-failed
))
277 bounds-ref substring-ref
)))
280 (defconst regex-tests-re-even-escapes
281 "\\(?:^\\|[^\\\\]\\)\\(?:\\\\\\\\\\)*"
282 "Regex that matches an even number of \\ characters")
284 (defconst regex-tests-re-odd-escapes
285 (concat regex-tests-re-even-escapes
"\\\\")
286 "Regex that matches an odd number of \\ characters")
289 (defun regex-tests-unextend (pattern)
290 "Basic conversion from extended regexes to emacs ones. This is
291 mostly a hack that adds \\ to () and | and {}, and removes it if
292 it already exists. We also change \\S (and \\s) to \\S- (and
293 \\s-) because extended regexes see the former as whitespace, but
294 emacs requires an extra symbol character"
298 (goto-char (point-min))
300 (while (re-search-forward "[()|{}]" nil t
)
301 ;; point is past special character. If it is escaped, unescape
305 (re-search-backward (concat regex-tests-re-odd-escapes
".\\=") nil t
))
307 ;; This special character is preceded by an odd number of \,
308 ;; so I unescape it by removing the last one
314 ;; This special character is preceded by an even (possibly 0)
315 ;; number of \. I add an escape
321 (goto-char (point-min))
322 (while (re-search-forward (concat regex-tests-re-odd-escapes
"[Ss]") nil t
)
327 (defun regex-tests-BOOST-frob-escapes (s ispattern
)
328 "Mangle \\ the way it is done in frob_escapes() in
329 regex-tests-BOOST.c in glibc: \\t, \\n, \\r are interpreted;
330 \\\\, \\^, \{, \\|, \} are unescaped for the string (not
333 ;; this is all similar to (regex-tests-unextend)
337 (let ((interpret-list (list "t" "n" "r")))
338 (while interpret-list
339 (goto-char (point-min))
340 (while (re-search-forward
341 (concat "\\(" regex-tests-re-even-escapes
"\\)"
342 "\\\\" (car interpret-list
))
344 (replace-match (concat "\\1" (car (read-from-string
345 (concat "\"\\" (car interpret-list
) "\""))))))
347 (setq interpret-list
(cdr interpret-list
))))
349 (when (not ispattern
)
350 ;; unescape \\, \^, \{, \|, \}
351 (let ((unescape-list (list "\\\\" "^" "{" "|" "}")))
353 (goto-char (point-min))
354 (while (re-search-forward
355 (concat "\\(" regex-tests-re-even-escapes
"\\)"
356 "\\\\" (car unescape-list
))
358 (replace-match (concat "\\1" (car unescape-list
))))
360 (setq unescape-list
(cdr unescape-list
))))
367 (defconst regex-tests-BOOST-whitelist
369 ;; emacs is more stringent with regexes involving unbalanced )
372 ;; in emacs, regex . doesn't match \n
375 ;; emacs is more forgiving with * and ? that don't apply to
377 107 108 109 122 123 124 140 141 142
379 ;; emacs accepts regexes with {}
382 ;; emacs doesn't fail on bogus ranges such as [3-1] or [1-3-5]
385 ;; emacs doesn't match (ab*)[ab]*\1 greedily: only 4 chars of
389 ;; ambiguous groupings are ambiguous
390 443 444 445 446 448 449 450
392 ;; emacs doesn't know how to handle weird ranges such as [a-Z] and
396 ;; emacs matches non-greedy regex ab.*? non-greedily
399 "Line numbers in the boost test that should be skipped. These
400 are false-positive test failures that represent known/benign
401 differences in behavior.")
404 ;; - Comments are lines starting with ;
405 ;; - Lines starting with - set options passed to regcomp() and regexec():
406 ;; - if no "REG_BASIC" is found, with have an extended regex
407 ;; - These set a flag:
409 ;; - REG_NEWLINE (ignored by this function)
414 ;; pattern string start0 end0 start1 end1 ...
416 ;; - pattern, string can have escapes
417 ;; - string can have whitespace if enclosed in ""
418 ;; - if string is "!", then the pattern is supposed to fail compilation
419 ;; - start/end are of group0, group1, etc. group 0 is the full match
420 ;; - start<0 indicates "no match"
421 ;; - start is the 0-based index of the first character
422 ;; - end is the 0-based index of the first character past the group
423 (defun regex-tests-BOOST ()
425 basic icase notbol noteol
)
426 (regex-tests-generic-line
427 ?
; "BOOST.tests" regex-tests-BOOST-whitelist
428 (if (save-excursion (re-search-forward "^-" nil t
))
429 (setq basic
(save-excursion (re-search-forward "REG_BASIC" nil t
))
430 icase
(save-excursion (re-search-forward "REG_ICASE" nil t
))
431 notbol
(save-excursion (re-search-forward "REG_NOTBOL" nil t
))
432 noteol
(save-excursion (re-search-forward "REG_NOTEOL" nil t
)))
435 (or (re-search-forward "\\(\\S-+\\)\\s-+\"\\(.*\\)\"\\s-+?\\(.+\\)" nil t
)
436 (re-search-forward "\\(\\S-+\\)\\s-+\\(\\S-+\\)\\s-+?\\(.+\\)" nil t
)
437 (re-search-forward "\\(\\S-+\\)\\s-+\\(!\\)" nil t
)))
439 (let* ((pattern-raw (match-string 1))
440 (string-raw (match-string 2))
441 (positions-raw (match-string 3))
442 (pattern (regex-tests-BOOST-frob-escapes pattern-raw t
))
443 (string (regex-tests-BOOST-frob-escapes string-raw nil
))
445 (if (string= string
"!")
446 (list 'compilation-failed
0)
449 (let ((x (string-to-number x
)))
451 (split-string positions-raw
)))))
453 (when (null (car positions
))
454 (setcar positions
'search-failed
))
457 (setq pattern
(regex-tests-unextend pattern
)))
459 ;; great. I now have all the data parsed. Let's use it to do
461 (let* ((case-fold-search icase
)
462 (msg (regex-tests-match pattern string positions
)))
465 ;; Skipping test: notbol/noteol not supported
466 (not notbol
) (not noteol
)
472 (cons (format "line number %d: Regex '%s': %s"
473 line-number pattern msg
)
478 (defconst regex-tests-PCRE-whitelist
480 ;; ambiguous groupings are ambiguous
481 610 611 1154 1157 1160 1168 1171 1176 1179 1182 1185 1188 1193 1196 1203
483 "Line numbers in the PCRE test that should be skipped. These
484 are false-positive test failures that represent known/benign
485 differences in behavior.")
491 ;; group_num: group_match | "No match"
493 ;; group_num: group_match | "No match"
495 ;; group_num: group_match | "No match"
497 ;; group_num: group_match | "No match"
499 (defun regex-tests-PCRE ()
501 pattern icase string what-failed matches-observed
)
502 (regex-tests-generic-line
503 ?
# "PCRE.tests" regex-tests-PCRE-whitelist
508 ((save-excursion (re-search-forward "^/\\(.*\\)/\\(.*i?\\)$" nil t
))
509 (setq icase
(string= "i" (match-string 2))
510 pattern
(regex-tests-unextend (match-string 1))))
512 ;; string. read it in, match against pattern, and save all the results
513 ((save-excursion (re-search-forward "^ \\(.*\\)" nil t
))
514 (let ((case-fold-search icase
))
515 (setq string
(match-string 1)
517 ;; the regex match under test
520 (if (string-match pattern string
) nil
'search-failed
)
521 ('invalid-regexp
'compilation-failed
))
524 (cl-loop for x from
0 to
20
525 collect
(and (not what-failed
)
526 (or (match-string x string
) "<unset>")))))
529 ;; verification line: failed match
530 ((save-excursion (re-search-forward "^No match" nil t
))
533 (cons (format "line number %d: Regex '%s': Expected no match; but match"
537 ;; verification line: succeeded match
538 ((save-excursion (re-search-forward "^ *\\([0-9]+\\): \\(.*\\)" nil t
))
539 (let* ((match-ref (match-string 2))
540 (idx (string-to-number (match-string 1))))
543 "Expected match; but no match"
544 (unless (string= match-ref
(elt matches-observed idx
))
546 (cons (format "line number %d: Regex '%s': Have expected match, but group %d is wrong: '%s'/'%s'"
548 idx match-ref
(elt matches-observed idx
))
552 (t (setq pattern nil
) nil
)))
556 (defconst regex-tests-PTESTS-whitelist
558 ;; emacs doesn't barf on weird ranges such as [b-a], but simply
562 ;; emacs doesn't see DEL (0x78) as a [:cntrl:] character
565 "Line numbers in the PTESTS test that should be skipped. These
566 are false-positive test failures that represent known/benign
567 differences in behavior.")
570 ;; - fields separated by ¦ (note: this is not a |)
571 ;; - start¦end¦pattern¦string
572 ;; - start is the 1-based index of the first character
573 ;; - end is the 1-based index of the last character
574 (defun regex-tests-PTESTS ()
576 (regex-tests-generic-line
577 ?
# "PTESTS" regex-tests-PTESTS-whitelist
578 (let* ((fields (split-string (buffer-string) "¦"))
580 ;; string has 1-based index of first char in the
581 ;; match. -1 means "no match". -2 means "invalid
584 ;; start-ref is 0-based index of first char in the
587 ;; string==0 is a special case, and I have to treat
588 ;; it as start-ref = 0
589 (start-ref (let ((raw (string-to-number (elt fields
0))))
591 ((= raw -
2) 'compilation-failed
)
592 ((= raw -
1) 'search-failed
)
596 ;; string has 1-based index of last char in the
597 ;; match. end-ref is 0-based index of first char past
599 (end-ref (string-to-number (elt fields
1)))
600 (pattern (elt fields
2))
601 (string (elt fields
3)))
603 (let ((msg (regex-tests-match pattern string
(list start-ref end-ref
))))
606 (cons (format "line number %d: Regex '%s': %s"
607 line-number pattern msg
)
611 (defconst regex-tests-TESTS-whitelist
613 ;; emacs doesn't barf on weird ranges such as [b-a], but simply
617 ;; emacs is more forgiving with * and ? that don't apply to
621 ;; emacs is more stringent with regexes involving unbalanced )
624 "Line numbers in the TESTS test that should be skipped. These
625 are false-positive test failures that represent known/benign
626 differences in behavior.")
629 ;; - fields separated by :. Watch for [\[:xxx:]]
630 ;; - expected:pattern:string
633 ;; | 0 | successful match |
634 ;; | 1 | failed match |
635 ;; | 2 | regcomp() should fail |
636 (defun regex-tests-TESTS ()
638 (regex-tests-generic-line
639 ?
# "TESTS" regex-tests-TESTS-whitelist
640 (if (save-excursion (re-search-forward "^\\([^:]+\\):\\(.*\\):\\([^:]*\\)$" nil t
))
642 (let ((raw (string-to-number (match-string 1))))
644 ((= raw
2) 'compilation-failed
)
645 ((= raw
1) 'search-failed
)
647 (string (match-string 3))
648 (pattern (regex-tests-unextend (match-string 2))))
650 (let ((msg (regex-tests-match pattern string nil
(list what-failed
))))
653 (cons (format "line number %d: Regex '%s': %s"
654 line-number pattern msg
)
657 (error "Error parsing TESTS file line: '%s'" (buffer-string))))
660 (ert-deftest regex-tests-BOOST
()
661 "Tests of the regular expression engine.
662 This evaluates the BOOST test cases from glibc."
663 (should-not (regex-tests-BOOST)))
665 (ert-deftest regex-tests-PCRE
()
666 "Tests of the regular expression engine.
667 This evaluates the PCRE test cases from glibc."
668 (should-not (regex-tests-PCRE)))
670 (ert-deftest regex-tests-PTESTS
()
671 "Tests of the regular expression engine.
672 This evaluates the PTESTS test cases from glibc."
673 (should-not (regex-tests-PTESTS)))
675 (ert-deftest regex-tests-TESTS
()
676 "Tests of the regular expression engine.
677 This evaluates the TESTS test cases from glibc."
678 (should-not (regex-tests-TESTS)))
680 ;;; regex-tests.el ends here