1 ;;; regex-tests.el --- tests for regex.c functions -*- lexical-binding: t -*-
3 ;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 ;; This file is part of GNU Emacs.
7 ;; GNU Emacs is free software: you can redistribute it and/or modify
8 ;; it under the terms of the GNU General Public License as published by
9 ;; the Free Software Foundation, either version 3 of the License, or
10 ;; (at your option) any later version.
12 ;; GNU Emacs is distributed in the hope that it will be useful,
13 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;; GNU General Public License for more details.
17 ;; You should have received a copy of the GNU General Public License
18 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
24 (defvar regex-tests--resources-dir
25 (concat (concat (file-name-directory (or load-file-name buffer-file-name
))
27 "Path to regex-resources directory next to the \"regex-tests.el\" file.")
29 (ert-deftest regex-word-cc-fallback-test
()
30 "Test that ‘[[:cc:]]*x’ matches ‘x’ (bug#24020).
32 Test that a regex of the form \"[[:cc:]]*x\" where CC is
33 a character class which matches a multibyte character X, matches
36 For example, ‘[[:word:]]*\u2620’ regex (note: \u2620 is a word
37 character) must match a string \"\u2420\"."
38 (dolist (class '("[[:word:]]" "\\sw"))
39 (dolist (repeat '("*" "+"))
40 (dolist (suffix '("" "b" "bar" "\u2620"))
41 (dolist (string '("" "foo"))
42 (when (not (and (string-equal repeat
"+")
43 (string-equal string
"")))
44 (should (string-match (concat "^" class repeat suffix
"$")
45 (concat string suffix
)))))))))
47 (defun regex--test-cc (name matching not-matching
)
48 (should (string-match-p (concat "^[[:" name
":]]*$") matching
))
49 (should (string-match-p (concat "^[[:" name
":]]*?\u2622$")
50 (concat matching
"\u2622")))
51 (should (string-match-p (concat "^[^[:" name
":]]*$") not-matching
))
52 (should (string-match-p (concat "^[^[:" name
":]]*\u2622$")
53 (concat not-matching
"\u2622")))
58 (goto-char (point-min))
59 (skip-chars-forward (concat "[:" name
":]"))
60 (should (equal (point) p
))
61 (skip-chars-forward (concat "^[:" name
":]"))
62 (should (equal (point) (point-max)))
63 (goto-char (point-min))
64 (skip-chars-forward (concat "[:" name
":]\u2622"))
65 (should (or (equal (point) p
) (equal (point) (1+ p
)))))))
67 (ert-deftest regex-character-classes
()
68 "Perform sanity test of regexes using character classes.
70 Go over all the supported character classes and test whether the
71 classes and their inversions match what they are supposed to
72 match. The test is done using `string-match-p' as well as
73 `skip-chars-forward'."
74 (let (case-fold-search)
75 (regex--test-cc "alnum" "abcABC012łąka" "-, \t\n")
76 (regex--test-cc "alpha" "abcABCłąka" "-,012 \t\n")
77 (regex--test-cc "digit" "012" "abcABCłąka-, \t\n")
78 (regex--test-cc "xdigit" "0123aBc" "łąk-, \t\n")
79 (regex--test-cc "upper" "ABCŁĄKA" "abc012-, \t\n")
80 (regex--test-cc "lower" "abcłąka" "ABC012-, \t\n")
82 (regex--test-cc "word" "abcABC012\u2620" "-, \t\n")
84 (regex--test-cc "punct" ".,-" "abcABC012\u2620 \t\n")
85 (regex--test-cc "cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ")
86 (regex--test-cc "graph" "abcłąka\u2620-," " \t\n\1")
87 (regex--test-cc "print" "abcłąka\u2620-, " "\t\n\1")
89 (regex--test-cc "space" " \t\n\u2001" "abcABCł0123")
90 (regex--test-cc "blank" " \t" "\n\u2001")
92 (regex--test-cc "ascii" "abcABC012 \t\n\1" "łą\u2620")
93 (regex--test-cc "nonascii" "łą\u2622" "abcABC012 \t\n\1")
94 (regex--test-cc "unibyte" "abcABC012 \t\n\1" "łą\u2622")
95 (regex--test-cc "multibyte" "łą\u2622" "abcABC012 \t\n\1")))
98 (defmacro regex-tests-generic-line
(comment-char test-file whitelist
&rest body
)
99 "Reads a line of the test file TEST-FILE, skipping
100 comments (defined by COMMENT-CHAR), and evaluates the tests in
101 this line as defined in the BODY. Line numbers in the WHITELIST
102 are known failures, and are skipped."
105 (modify-syntax-entry ?_
"w;; ") ; tests expect _ to be a word
106 (insert-file-contents (concat regex-tests--resources-dir
,test-file
))
107 (let ((case-fold-search nil
)
111 (goto-char (point-min))
114 (let ((start (point)))
116 (narrow-to-region start
(point))
118 (goto-char (point-min))
124 (re-search-forward ,(concat "^[^" (string comment-char
) "]") nil t
))
126 ;; skip lines in the whitelist
127 (let ((whitelist-next
129 (aref ,whitelist whitelist-idx
) (args-out-of-range nil
))))
131 ;; whitelist exhausted. do process this line
132 ((null whitelist-next
) t
)
134 ;; we're not yet at the next whitelist element. do
136 ((< line-number whitelist-next
) t
)
138 ;; we're past the next whitelist element. This
140 ((> line-number whitelist-next
)
143 "We somehow skipped the next whitelist element: line %d" whitelist-next
)))
145 ;; we're at the next whitelist element. Skip this
146 ;; line, and advance the whitelist index
148 (setq whitelist-idx
(1+ whitelist-idx
)) nil
))))
154 (setq line-number
(1+ line-number
)))))))
156 (defun regex-tests-compare (string what-failed bounds-ref
&optional substring-ref
)
157 "I just ran a search, looking at STRING. WHAT-FAILED describes
158 what failed, if anything; valid values are 'search-failed,
159 'compilation-failed and nil. I compare the beginning/end of each
160 group with their expected values. This is done with either
161 BOUNDS-REF or SUBSTRING-REF; one of those should be non-nil.
162 BOUNDS-REF is a sequence \[start-ref0 end-ref0 start-ref1
163 end-ref1 ....] while SUBSTRING-REF is the expected substring
164 obtained by indexing the input string by start/end-ref.
166 If the search was supposed to fail then start-ref0/substring-ref0
167 is 'search-failed. If the search wasn't even supposed to compile
168 successfully, then start-ref0/substring-ref0 is
169 'compilation-failed. If I only care about a match succeeding,
170 this can be set to t.
172 This function returns a string that describes the failure, or nil
176 (and bounds-ref substring-ref
)
177 (not (or bounds-ref substring-ref
)))
178 (error "Exactly one of bounds-ref and bounds-ref should be non-nil"))
180 (let ((what-failed-ref (car (or bounds-ref substring-ref
))))
183 ((eq what-failed
'search-failed
)
185 ((eq what-failed-ref
'search-failed
)
187 ((eq what-failed-ref
'compilation-failed
)
188 "Expected pattern failure; but no match")
190 "Expected match; but no match")))
192 ((eq what-failed
'compilation-failed
)
194 ((eq what-failed-ref
'search-failed
)
195 "Expected no match; but pattern failure")
196 ((eq what-failed-ref
'compilation-failed
)
199 "Expected match; but pattern failure")))
201 ;; The regex match succeeded
202 ((eq what-failed-ref
'search-failed
)
203 "Expected no match; but match")
204 ((eq what-failed-ref
'compilation-failed
)
205 "Expected pattern failure; but match")
207 ;; The regex match succeeded, as expected. I now check all the
212 ref next-ref-function compare-ref-function mismatched-ref-function
)
216 next-ref-function
(lambda (x) (cddr x
))
217 compare-ref-function
(lambda (ref start-pos end-pos
)
219 (and (eq start-pos
(car ref
))
220 (eq end-pos
(cadr ref
)))))
221 mismatched-ref-function
(lambda (ref start-pos end-pos
)
223 "beginning/end positions: %d/%s and %d/%s"
224 start-pos
(car ref
) end-pos
(cadr ref
))))
225 (setq ref substring-ref
226 next-ref-function
(lambda (x) (cdr x
))
227 compare-ref-function
(lambda (ref start-pos end-pos
)
229 (string= (substring string start-pos end-pos
) (car ref
))))
230 mismatched-ref-function
(lambda (ref start-pos end-pos
)
232 "beginning/end positions: %d/%s and %d/%s"
233 start-pos
(car ref
) end-pos
(cadr ref
)))))
235 (while (not (or (null ref
) msg
))
237 (let ((start (match-beginning idx
))
238 (end (match-end idx
)))
240 (when (not (funcall compare-ref-function ref start end
))
243 "Have expected match, but mismatch in group %d: %s" idx
(funcall mismatched-ref-function ref start end
))))
245 (setq ref
(funcall next-ref-function ref
)
253 (defun regex-tests-match (pattern string bounds-ref
&optional substring-ref
)
254 "I match the given STRING against PATTERN. I compare the
255 beginning/end of each group with their expected values.
256 BOUNDS-REF is a sequence [start-ref0 end-ref0 start-ref1 end-ref1
259 If the search was supposed to fail then start-ref0 is
260 'search-failed. If the search wasn't even supposed to compile
261 successfully, then start-ref0 is 'compilation-failed.
263 This function returns a string that describes the failure, or nil
266 (if (string-match "\\[\\([\\.=]\\)..?\\1\\]" pattern
)
267 ;; Skipping test: [.x.] and [=x=] forms not supported by emacs
273 (if (string-match pattern string
) nil
'search-failed
)
274 ('invalid-regexp
'compilation-failed
))
275 bounds-ref substring-ref
)))
278 (defconst regex-tests-re-even-escapes
279 "\\(?:^\\|[^\\\\]\\)\\(?:\\\\\\\\\\)*"
280 "Regex that matches an even number of \\ characters")
282 (defconst regex-tests-re-odd-escapes
283 (concat regex-tests-re-even-escapes
"\\\\")
284 "Regex that matches an odd number of \\ characters")
287 (defun regex-tests-unextend (pattern)
288 "Basic conversion from extended regexes to emacs ones. This is
289 mostly a hack that adds \\ to () and | and {}, and removes it if
290 it already exists. We also change \\S (and \\s) to \\S- (and
291 \\s-) because extended regexes see the former as whitespace, but
292 emacs requires an extra symbol character"
296 (goto-char (point-min))
298 (while (re-search-forward "[()|{}]" nil t
)
299 ;; point is past special character. If it is escaped, unescape
303 (re-search-backward (concat regex-tests-re-odd-escapes
".\\=") nil t
))
305 ;; This special character is preceded by an odd number of \,
306 ;; so I unescape it by removing the last one
312 ;; This special character is preceded by an even (possibly 0)
313 ;; number of \. I add an escape
319 (goto-char (point-min))
320 (while (re-search-forward (concat regex-tests-re-odd-escapes
"[Ss]") nil t
)
325 (defun regex-tests-BOOST-frob-escapes (s ispattern
)
326 "Mangle \\ the way it is done in frob_escapes() in
327 regex-tests-BOOST.c in glibc: \\t, \\n, \\r are interpreted;
328 \\\\, \\^, \{, \\|, \} are unescaped for the string (not
331 ;; this is all similar to (regex-tests-unextend)
335 (let ((interpret-list (list "t" "n" "r")))
336 (while interpret-list
337 (goto-char (point-min))
338 (while (re-search-forward
339 (concat "\\(" regex-tests-re-even-escapes
"\\)"
340 "\\\\" (car interpret-list
))
342 (replace-match (concat "\\1" (car (read-from-string
343 (concat "\"\\" (car interpret-list
) "\""))))))
345 (setq interpret-list
(cdr interpret-list
))))
347 (when (not ispattern
)
348 ;; unescape \\, \^, \{, \|, \}
349 (let ((unescape-list (list "\\\\" "^" "{" "|" "}")))
351 (goto-char (point-min))
352 (while (re-search-forward
353 (concat "\\(" regex-tests-re-even-escapes
"\\)"
354 "\\\\" (car unescape-list
))
356 (replace-match (concat "\\1" (car unescape-list
))))
358 (setq unescape-list
(cdr unescape-list
))))
365 (defconst regex-tests-BOOST-whitelist
367 ;; emacs is more stringent with regexes involving unbalanced )
370 ;; in emacs, regex . doesn't match \n
373 ;; emacs is more forgiving with * and ? that don't apply to
375 107 108 109 122 123 124 140 141 142
377 ;; emacs accepts regexes with {}
380 ;; emacs doesn't fail on bogus ranges such as [3-1] or [1-3-5]
383 ;; emacs doesn't match (ab*)[ab]*\1 greedily: only 4 chars of
387 ;; ambiguous groupings are ambiguous
388 443 444 445 446 448 449 450
390 ;; emacs doesn't know how to handle weird ranges such as [a-Z] and
394 ;; emacs matches non-greedy regex ab.*? non-greedily
397 "Line numbers in the boost test that should be skipped. These
398 are false-positive test failures that represent known/benign
399 differences in behavior.")
402 ;; - Comments are lines starting with ;
403 ;; - Lines starting with - set options passed to regcomp() and regexec():
404 ;; - if no "REG_BASIC" is found, with have an extended regex
405 ;; - These set a flag:
407 ;; - REG_NEWLINE (ignored by this function)
412 ;; pattern string start0 end0 start1 end1 ...
414 ;; - pattern, string can have escapes
415 ;; - string can have whitespace if enclosed in ""
416 ;; - if string is "!", then the pattern is supposed to fail compilation
417 ;; - start/end are of group0, group1, etc. group 0 is the full match
418 ;; - start<0 indicates "no match"
419 ;; - start is the 0-based index of the first character
420 ;; - end is the 0-based index of the first character past the group
421 (defun regex-tests-BOOST ()
423 basic icase notbol noteol
)
424 (regex-tests-generic-line
425 ?
; "BOOST.tests" regex-tests-BOOST-whitelist
426 (if (save-excursion (re-search-forward "^-" nil t
))
427 (setq basic
(save-excursion (re-search-forward "REG_BASIC" nil t
))
428 icase
(save-excursion (re-search-forward "REG_ICASE" nil t
))
429 notbol
(save-excursion (re-search-forward "REG_NOTBOL" nil t
))
430 noteol
(save-excursion (re-search-forward "REG_NOTEOL" nil t
)))
433 (or (re-search-forward "\\(\\S-+\\)\\s-+\"\\(.*\\)\"\\s-+?\\(.+\\)" nil t
)
434 (re-search-forward "\\(\\S-+\\)\\s-+\\(\\S-+\\)\\s-+?\\(.+\\)" nil t
)
435 (re-search-forward "\\(\\S-+\\)\\s-+\\(!\\)" nil t
)))
437 (let* ((pattern-raw (match-string 1))
438 (string-raw (match-string 2))
439 (positions-raw (match-string 3))
440 (pattern (regex-tests-BOOST-frob-escapes pattern-raw t
))
441 (string (regex-tests-BOOST-frob-escapes string-raw nil
))
443 (if (string= string
"!")
444 (list 'compilation-failed
0)
447 (let ((x (string-to-number x
)))
449 (split-string positions-raw
)))))
451 (when (null (car positions
))
452 (setcar positions
'search-failed
))
455 (setq pattern
(regex-tests-unextend pattern
)))
457 ;; great. I now have all the data parsed. Let's use it to do
459 (let* ((case-fold-search icase
)
460 (msg (regex-tests-match pattern string positions
)))
463 ;; Skipping test: notbol/noteol not supported
464 (not notbol
) (not noteol
)
470 (cons (format "line number %d: Regex '%s': %s"
471 line-number pattern msg
)
476 (defconst regex-tests-PCRE-whitelist
478 ;; ambiguous groupings are ambiguous
479 610 611 1154 1157 1160 1168 1171 1176 1179 1182 1185 1188 1193 1196 1203
481 "Line numbers in the PCRE test that should be skipped. These
482 are false-positive test failures that represent known/benign
483 differences in behavior.")
489 ;; group_num: group_match | "No match"
491 ;; group_num: group_match | "No match"
493 ;; group_num: group_match | "No match"
495 ;; group_num: group_match | "No match"
497 (defun regex-tests-PCRE ()
499 pattern icase string what-failed matches-observed
)
500 (regex-tests-generic-line
501 ?
# "PCRE.tests" regex-tests-PCRE-whitelist
506 ((save-excursion (re-search-forward "^/\\(.*\\)/\\(.*i?\\)$" nil t
))
507 (setq icase
(string= "i" (match-string 2))
508 pattern
(regex-tests-unextend (match-string 1))))
510 ;; string. read it in, match against pattern, and save all the results
511 ((save-excursion (re-search-forward "^ \\(.*\\)" nil t
))
512 (let ((case-fold-search icase
))
513 (setq string
(match-string 1)
515 ;; the regex match under test
518 (if (string-match pattern string
) nil
'search-failed
)
519 ('invalid-regexp
'compilation-failed
))
522 (cl-loop for x from
0 to
20
523 collect
(and (not what-failed
)
524 (or (match-string x string
) "<unset>")))))
527 ;; verification line: failed match
528 ((save-excursion (re-search-forward "^No match" nil t
))
531 (cons (format "line number %d: Regex '%s': Expected no match; but match"
535 ;; verification line: succeeded match
536 ((save-excursion (re-search-forward "^ *\\([0-9]+\\): \\(.*\\)" nil t
))
537 (let* ((match-ref (match-string 2))
538 (idx (string-to-number (match-string 1))))
541 "Expected match; but no match"
542 (unless (string= match-ref
(elt matches-observed idx
))
544 (cons (format "line number %d: Regex '%s': Have expected match, but group %d is wrong: '%s'/'%s'"
546 idx match-ref
(elt matches-observed idx
))
550 (t (setq pattern nil
) nil
)))
554 (defconst regex-tests-PTESTS-whitelist
556 ;; emacs doesn't barf on weird ranges such as [b-a], but simply
560 ;; emacs doesn't see DEL (0x78) as a [:cntrl:] character
563 "Line numbers in the PTESTS test that should be skipped. These
564 are false-positive test failures that represent known/benign
565 differences in behavior.")
568 ;; - fields separated by ¦ (note: this is not a |)
569 ;; - start¦end¦pattern¦string
570 ;; - start is the 1-based index of the first character
571 ;; - end is the 1-based index of the last character
572 (defun regex-tests-PTESTS ()
574 (regex-tests-generic-line
575 ?
# "PTESTS" regex-tests-PTESTS-whitelist
576 (let* ((fields (split-string (buffer-string) "¦"))
578 ;; string has 1-based index of first char in the
579 ;; match. -1 means "no match". -2 means "invalid
582 ;; start-ref is 0-based index of first char in the
585 ;; string==0 is a special case, and I have to treat
586 ;; it as start-ref = 0
587 (start-ref (let ((raw (string-to-number (elt fields
0))))
589 ((= raw -
2) 'compilation-failed
)
590 ((= raw -
1) 'search-failed
)
594 ;; string has 1-based index of last char in the
595 ;; match. end-ref is 0-based index of first char past
597 (end-ref (string-to-number (elt fields
1)))
598 (pattern (elt fields
2))
599 (string (elt fields
3)))
601 (let ((msg (regex-tests-match pattern string
(list start-ref end-ref
))))
604 (cons (format "line number %d: Regex '%s': %s"
605 line-number pattern msg
)
609 (defconst regex-tests-TESTS-whitelist
611 ;; emacs doesn't barf on weird ranges such as [b-a], but simply
615 ;; emacs is more forgiving with * and ? that don't apply to
619 ;; emacs is more stringent with regexes involving unbalanced )
622 "Line numbers in the TESTS test that should be skipped. These
623 are false-positive test failures that represent known/benign
624 differences in behavior.")
627 ;; - fields separated by :. Watch for [\[:xxx:]]
628 ;; - expected:pattern:string
631 ;; | 0 | successful match |
632 ;; | 1 | failed match |
633 ;; | 2 | regcomp() should fail |
634 (defun regex-tests-TESTS ()
636 (regex-tests-generic-line
637 ?
# "TESTS" regex-tests-TESTS-whitelist
638 (if (save-excursion (re-search-forward "^\\([^:]+\\):\\(.*\\):\\([^:]*\\)$" nil t
))
640 (let ((raw (string-to-number (match-string 1))))
642 ((= raw
2) 'compilation-failed
)
643 ((= raw
1) 'search-failed
)
645 (string (match-string 3))
646 (pattern (regex-tests-unextend (match-string 2))))
648 (let ((msg (regex-tests-match pattern string nil
(list what-failed
))))
651 (cons (format "line number %d: Regex '%s': %s"
652 line-number pattern msg
)
655 (error "Error parsing TESTS file line: '%s'" (buffer-string))))
658 (ert-deftest regex-tests-BOOST
()
659 "Tests of the regular expression engine.
660 This evaluates the BOOST test cases from glibc."
661 (should-not (regex-tests-BOOST)))
663 (ert-deftest regex-tests-PCRE
()
664 "Tests of the regular expression engine.
665 This evaluates the PCRE test cases from glibc."
666 (should-not (regex-tests-PCRE)))
668 (ert-deftest regex-tests-PTESTS
()
669 "Tests of the regular expression engine.
670 This evaluates the PTESTS test cases from glibc."
671 (should-not (regex-tests-PTESTS)))
673 (ert-deftest regex-tests-TESTS
()
674 "Tests of the regular expression engine.
675 This evaluates the TESTS test cases from glibc."
676 (should-not (regex-tests-TESTS)))
678 ;;; regex-tests.el ends here