Fix typo in lisp.h reordering patch
[emacs.git] / test / src / regex-tests.el
blobc4844c7cdbc7ed15a9a0bbe45df607fcc7869256
1 ;;; regex-tests.el --- tests for regex.c functions -*- lexical-binding: t -*-
3 ;; Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 ;; This file is part of GNU Emacs.
7 ;; GNU Emacs is free software: you can redistribute it and/or modify
8 ;; it under the terms of the GNU General Public License as published by
9 ;; the Free Software Foundation, either version 3 of the License, or
10 ;; (at your option) any later version.
12 ;; GNU Emacs is distributed in the hope that it will be useful,
13 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ;; GNU General Public License for more details.
17 ;; You should have received a copy of the GNU General Public License
18 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
20 ;;; Code:
22 (require 'ert)
24 (defvar regex-tests--resources-dir
25 (concat (concat (file-name-directory (or load-file-name buffer-file-name))
26 "/regex-resources/"))
27 "Path to regex-resources directory next to the \"regex-tests.el\" file.")
29 (ert-deftest regex-word-cc-fallback-test ()
30 "Test that ‘[[:cc:]]*x’ matches ‘x’ (bug#24020).
32 Test that a regex of the form \"[[:cc:]]*x\" where CC is
33 a character class which matches a multibyte character X, matches
34 string \"x\".
36 For example, ‘[[:word:]]*\u2620’ regex (note: \u2620 is a word
37 character) must match a string \"\u2420\"."
38 (dolist (class '("[[:word:]]" "\\sw"))
39 (dolist (repeat '("*" "+"))
40 (dolist (suffix '("" "b" "bar" "\u2620"))
41 (dolist (string '("" "foo"))
42 (when (not (and (string-equal repeat "+")
43 (string-equal string "")))
44 (should (string-match (concat "^" class repeat suffix "$")
45 (concat string suffix)))))))))
47 (defun regex--test-cc (name matching not-matching)
48 (let (case-fold-search)
49 (should (string-match-p (concat "^[[:" name ":]]*$") matching))
50 (should (string-match-p (concat "^[[:" name ":]]*?\u2622$")
51 (concat matching "\u2622")))
52 (should (string-match-p (concat "^[^[:" name ":]]*$") not-matching))
53 (should (string-match-p (concat "^[^[:" name ":]]*\u2622$")
54 (concat not-matching "\u2622")))
55 (with-temp-buffer
56 (insert matching)
57 (let ((p (point)))
58 (insert not-matching)
59 (goto-char (point-min))
60 (skip-chars-forward (concat "[:" name ":]"))
61 (should (equal (point) p))
62 (skip-chars-forward (concat "^[:" name ":]"))
63 (should (equal (point) (point-max)))
64 (goto-char (point-min))
65 (skip-chars-forward (concat "[:" name ":]\u2622"))
66 (should (or (equal (point) p) (equal (point) (1+ p))))))))
68 (dolist (test '(("alnum" "abcABC012łąka" "-, \t\n")
69 ("alpha" "abcABCłąka" "-,012 \t\n")
70 ("digit" "012" "abcABCłąka-, \t\n")
71 ("xdigit" "0123aBc" "łąk-, \t\n")
72 ("upper" "ABCŁĄKA" "abc012-, \t\n")
73 ("lower" "abcłąka" "ABC012-, \t\n")
75 ("word" "abcABC012\u2620" "-, \t\n")
77 ("punct" ".,-" "abcABC012\u2620 \t\n")
78 ("cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ")
79 ("graph" "abcłąka\u2620-," " \t\n\1")
80 ("print" "abcłąka\u2620-, " "\t\n\1")
82 ("space" " \t\n\u2001" "abcABCł0123")
83 ("blank" " \t" "\n\u2001")
85 ("ascii" "abcABC012 \t\n\1" "łą\u2620")
86 ("nonascii" "łą\u2622" "abcABC012 \t\n\1")
87 ("unibyte" "abcABC012 \t\n\1" "łą\u2622")
88 ("multibyte" "łą\u2622" "abcABC012 \t\n\1")))
89 (let ((name (intern (concat "regex-tests-" (car test) "-character-class")))
90 (doc (concat "Perform sanity test of regexes using " (car test)
91 " character class.
93 Go over all the supported character classes and test whether the
94 classes and their inversions match what they are supposed to
95 match. The test is done using `string-match-p' as well as
96 `skip-chars-forward'.")))
97 (eval `(ert-deftest ,name () ,doc ,(cons 'regex--test-cc test)) t)))
100 (defmacro regex-tests-generic-line (comment-char test-file whitelist &rest body)
101 "Reads a line of the test file TEST-FILE, skipping
102 comments (defined by COMMENT-CHAR), and evaluates the tests in
103 this line as defined in the BODY. Line numbers in the WHITELIST
104 are known failures, and are skipped."
106 `(with-temp-buffer
107 (modify-syntax-entry ?_ "w;; ") ; tests expect _ to be a word
108 (insert-file-contents (concat regex-tests--resources-dir ,test-file))
109 (let ((case-fold-search nil)
110 (line-number 1)
111 (whitelist-idx 0))
113 (goto-char (point-min))
115 (while (not (eobp))
116 (let ((start (point)))
117 (end-of-line)
118 (narrow-to-region start (point))
120 (goto-char (point-min))
122 (when
123 (and
124 ;; ignore comments
125 (save-excursion
126 (re-search-forward ,(concat "^[^" (string comment-char) "]") nil t))
128 ;; skip lines in the whitelist
129 (let ((whitelist-next
130 (condition-case nil
131 (aref ,whitelist whitelist-idx) (args-out-of-range nil))))
132 (cond
133 ;; whitelist exhausted. do process this line
134 ((null whitelist-next) t)
136 ;; we're not yet at the next whitelist element. do
137 ;; process this line
138 ((< line-number whitelist-next) t)
140 ;; we're past the next whitelist element. This
141 ;; shouldn't happen
142 ((> line-number whitelist-next)
143 (error
144 (format
145 "We somehow skipped the next whitelist element: line %d" whitelist-next)))
147 ;; we're at the next whitelist element. Skip this
148 ;; line, and advance the whitelist index
150 (setq whitelist-idx (1+ whitelist-idx)) nil))))
151 ,@body)
153 (widen)
154 (forward-line)
155 (beginning-of-line)
156 (setq line-number (1+ line-number)))))))
158 (defun regex-tests-compare (string what-failed bounds-ref &optional substring-ref)
159 "I just ran a search, looking at STRING. WHAT-FAILED describes
160 what failed, if anything; valid values are 'search-failed,
161 'compilation-failed and nil. I compare the beginning/end of each
162 group with their expected values. This is done with either
163 BOUNDS-REF or SUBSTRING-REF; one of those should be non-nil.
164 BOUNDS-REF is a sequence \[start-ref0 end-ref0 start-ref1
165 end-ref1 ....] while SUBSTRING-REF is the expected substring
166 obtained by indexing the input string by start/end-ref.
168 If the search was supposed to fail then start-ref0/substring-ref0
169 is 'search-failed. If the search wasn't even supposed to compile
170 successfully, then start-ref0/substring-ref0 is
171 'compilation-failed. If I only care about a match succeeding,
172 this can be set to t.
174 This function returns a string that describes the failure, or nil
175 on success"
177 (when (or
178 (and bounds-ref substring-ref)
179 (not (or bounds-ref substring-ref)))
180 (error "Exactly one of bounds-ref and bounds-ref should be non-nil"))
182 (let ((what-failed-ref (car (or bounds-ref substring-ref))))
184 (cond
185 ((eq what-failed 'search-failed)
186 (cond
187 ((eq what-failed-ref 'search-failed)
188 nil)
189 ((eq what-failed-ref 'compilation-failed)
190 "Expected pattern failure; but no match")
192 "Expected match; but no match")))
194 ((eq what-failed 'compilation-failed)
195 (cond
196 ((eq what-failed-ref 'search-failed)
197 "Expected no match; but pattern failure")
198 ((eq what-failed-ref 'compilation-failed)
199 nil)
201 "Expected match; but pattern failure")))
203 ;; The regex match succeeded
204 ((eq what-failed-ref 'search-failed)
205 "Expected no match; but match")
206 ((eq what-failed-ref 'compilation-failed)
207 "Expected pattern failure; but match")
209 ;; The regex match succeeded, as expected. I now check all the
210 ;; bounds
212 (let ((idx 0)
214 ref next-ref-function compare-ref-function mismatched-ref-function)
216 (if bounds-ref
217 (setq ref bounds-ref
218 next-ref-function (lambda (x) (cddr x))
219 compare-ref-function (lambda (ref start-pos end-pos)
220 (or (eq (car ref) t)
221 (and (eq start-pos (car ref))
222 (eq end-pos (cadr ref)))))
223 mismatched-ref-function (lambda (ref start-pos end-pos)
224 (format
225 "beginning/end positions: %d/%s and %d/%s"
226 start-pos (car ref) end-pos (cadr ref))))
227 (setq ref substring-ref
228 next-ref-function (lambda (x) (cdr x))
229 compare-ref-function (lambda (ref start-pos end-pos)
230 (or (eq (car ref) t)
231 (string= (substring string start-pos end-pos) (car ref))))
232 mismatched-ref-function (lambda (ref start-pos end-pos)
233 (format
234 "beginning/end positions: %d/%s and %d/%s"
235 start-pos (car ref) end-pos (cadr ref)))))
237 (while (not (or (null ref) msg))
239 (let ((start (match-beginning idx))
240 (end (match-end idx)))
242 (when (not (funcall compare-ref-function ref start end))
243 (setq msg
244 (format
245 "Have expected match, but mismatch in group %d: %s" idx (funcall mismatched-ref-function ref start end))))
247 (setq ref (funcall next-ref-function ref)
248 idx (1+ idx))))
250 (or msg
251 nil))))))
255 (defun regex-tests-match (pattern string bounds-ref &optional substring-ref)
256 "I match the given STRING against PATTERN. I compare the
257 beginning/end of each group with their expected values.
258 BOUNDS-REF is a sequence [start-ref0 end-ref0 start-ref1 end-ref1
259 ....].
261 If the search was supposed to fail then start-ref0 is
262 'search-failed. If the search wasn't even supposed to compile
263 successfully, then start-ref0 is 'compilation-failed.
265 This function returns a string that describes the failure, or nil
266 on success"
268 (if (string-match "\\[\\([\\.=]\\)..?\\1\\]" pattern)
269 ;; Skipping test: [.x.] and [=x=] forms not supported by emacs
272 (regex-tests-compare
273 string
274 (condition-case nil
275 (if (string-match pattern string) nil 'search-failed)
276 ('invalid-regexp 'compilation-failed))
277 bounds-ref substring-ref)))
280 (defconst regex-tests-re-even-escapes
281 "\\(?:^\\|[^\\\\]\\)\\(?:\\\\\\\\\\)*"
282 "Regex that matches an even number of \\ characters")
284 (defconst regex-tests-re-odd-escapes
285 (concat regex-tests-re-even-escapes "\\\\")
286 "Regex that matches an odd number of \\ characters")
289 (defun regex-tests-unextend (pattern)
290 "Basic conversion from extended regexes to emacs ones. This is
291 mostly a hack that adds \\ to () and | and {}, and removes it if
292 it already exists. We also change \\S (and \\s) to \\S- (and
293 \\s-) because extended regexes see the former as whitespace, but
294 emacs requires an extra symbol character"
296 (with-temp-buffer
297 (insert pattern)
298 (goto-char (point-min))
300 (while (re-search-forward "[()|{}]" nil t)
301 ;; point is past special character. If it is escaped, unescape
302 ;; it
304 (if (save-excursion
305 (re-search-backward (concat regex-tests-re-odd-escapes ".\\=") nil t))
307 ;; This special character is preceded by an odd number of \,
308 ;; so I unescape it by removing the last one
309 (progn
310 (forward-char -2)
311 (delete-char 1)
312 (forward-char 1))
314 ;; This special character is preceded by an even (possibly 0)
315 ;; number of \. I add an escape
316 (forward-char -1)
317 (insert "\\")
318 (forward-char 1)))
320 ;; convert \s to \s-
321 (goto-char (point-min))
322 (while (re-search-forward (concat regex-tests-re-odd-escapes "[Ss]") nil t)
323 (insert "-"))
325 (buffer-string)))
327 (defun regex-tests-BOOST-frob-escapes (s ispattern)
328 "Mangle \\ the way it is done in frob_escapes() in
329 regex-tests-BOOST.c in glibc: \\t, \\n, \\r are interpreted;
330 \\\\, \\^, \{, \\|, \} are unescaped for the string (not
331 pattern)"
333 ;; this is all similar to (regex-tests-unextend)
334 (with-temp-buffer
335 (insert s)
337 (let ((interpret-list (list "t" "n" "r")))
338 (while interpret-list
339 (goto-char (point-min))
340 (while (re-search-forward
341 (concat "\\(" regex-tests-re-even-escapes "\\)"
342 "\\\\" (car interpret-list))
343 nil t)
344 (replace-match (concat "\\1" (car (read-from-string
345 (concat "\"\\" (car interpret-list) "\""))))))
347 (setq interpret-list (cdr interpret-list))))
349 (when (not ispattern)
350 ;; unescape \\, \^, \{, \|, \}
351 (let ((unescape-list (list "\\\\" "^" "{" "|" "}")))
352 (while unescape-list
353 (goto-char (point-min))
354 (while (re-search-forward
355 (concat "\\(" regex-tests-re-even-escapes "\\)"
356 "\\\\" (car unescape-list))
357 nil t)
358 (replace-match (concat "\\1" (car unescape-list))))
360 (setq unescape-list (cdr unescape-list))))
362 (buffer-string)))
367 (defconst regex-tests-BOOST-whitelist
369 ;; emacs is more stringent with regexes involving unbalanced )
370 63 65 69
372 ;; in emacs, regex . doesn't match \n
375 ;; emacs is more forgiving with * and ? that don't apply to
376 ;; characters
377 107 108 109 122 123 124 140 141 142
379 ;; emacs accepts regexes with {}
382 ;; emacs doesn't fail on bogus ranges such as [3-1] or [1-3-5]
383 222 223
385 ;; emacs doesn't match (ab*)[ab]*\1 greedily: only 4 chars of
386 ;; ababaaa match
387 284 294
389 ;; ambiguous groupings are ambiguous
390 443 444 445 446 448 449 450
392 ;; emacs doesn't know how to handle weird ranges such as [a-Z] and
393 ;; [[:alpha:]-a]
394 539 580 581
396 ;; emacs matches non-greedy regex ab.*? non-greedily
397 639 677 712
399 "Line numbers in the boost test that should be skipped. These
400 are false-positive test failures that represent known/benign
401 differences in behavior.")
403 ;; - Format
404 ;; - Comments are lines starting with ;
405 ;; - Lines starting with - set options passed to regcomp() and regexec():
406 ;; - if no "REG_BASIC" is found, with have an extended regex
407 ;; - These set a flag:
408 ;; - REG_ICASE
409 ;; - REG_NEWLINE (ignored by this function)
410 ;; - REG_NOTBOL
411 ;; - REG_NOTEOL
413 ;; - Test lines are
414 ;; pattern string start0 end0 start1 end1 ...
416 ;; - pattern, string can have escapes
417 ;; - string can have whitespace if enclosed in ""
418 ;; - if string is "!", then the pattern is supposed to fail compilation
419 ;; - start/end are of group0, group1, etc. group 0 is the full match
420 ;; - start<0 indicates "no match"
421 ;; - start is the 0-based index of the first character
422 ;; - end is the 0-based index of the first character past the group
423 (defun regex-tests-BOOST ()
424 (let (failures
425 basic icase notbol noteol)
426 (regex-tests-generic-line
427 ?; "BOOST.tests" regex-tests-BOOST-whitelist
428 (if (save-excursion (re-search-forward "^-" nil t))
429 (setq basic (save-excursion (re-search-forward "REG_BASIC" nil t))
430 icase (save-excursion (re-search-forward "REG_ICASE" nil t))
431 notbol (save-excursion (re-search-forward "REG_NOTBOL" nil t))
432 noteol (save-excursion (re-search-forward "REG_NOTEOL" nil t)))
434 (save-excursion
435 (or (re-search-forward "\\(\\S-+\\)\\s-+\"\\(.*\\)\"\\s-+?\\(.+\\)" nil t)
436 (re-search-forward "\\(\\S-+\\)\\s-+\\(\\S-+\\)\\s-+?\\(.+\\)" nil t)
437 (re-search-forward "\\(\\S-+\\)\\s-+\\(!\\)" nil t)))
439 (let* ((pattern-raw (match-string 1))
440 (string-raw (match-string 2))
441 (positions-raw (match-string 3))
442 (pattern (regex-tests-BOOST-frob-escapes pattern-raw t))
443 (string (regex-tests-BOOST-frob-escapes string-raw nil))
444 (positions
445 (if (string= string "!")
446 (list 'compilation-failed 0)
447 (mapcar
448 (lambda (x)
449 (let ((x (string-to-number x)))
450 (if (< x 0) nil x)))
451 (split-string positions-raw)))))
453 (when (null (car positions))
454 (setcar positions 'search-failed))
456 (when (not basic)
457 (setq pattern (regex-tests-unextend pattern)))
459 ;; great. I now have all the data parsed. Let's use it to do
460 ;; stuff
461 (let* ((case-fold-search icase)
462 (msg (regex-tests-match pattern string positions)))
464 (if (and
465 ;; Skipping test: notbol/noteol not supported
466 (not notbol) (not noteol)
468 msg)
470 ;; store failure
471 (setq failures
472 (cons (format "line number %d: Regex '%s': %s"
473 line-number pattern msg)
474 failures)))))))
476 failures))
478 (defconst regex-tests-PCRE-whitelist
480 ;; ambiguous groupings are ambiguous
481 610 611 1154 1157 1160 1168 1171 1176 1179 1182 1185 1188 1193 1196 1203
483 "Line numbers in the PCRE test that should be skipped. These
484 are false-positive test failures that represent known/benign
485 differences in behavior.")
487 ;; - Format
489 ;; regex
490 ;; input_string
491 ;; group_num: group_match | "No match"
492 ;; input_string
493 ;; group_num: group_match | "No match"
494 ;; input_string
495 ;; group_num: group_match | "No match"
496 ;; input_string
497 ;; group_num: group_match | "No match"
498 ;; ...
499 (defun regex-tests-PCRE ()
500 (let (failures
501 pattern icase string what-failed matches-observed)
502 (regex-tests-generic-line
503 ?# "PCRE.tests" regex-tests-PCRE-whitelist
505 (cond
507 ;; pattern
508 ((save-excursion (re-search-forward "^/\\(.*\\)/\\(.*i?\\)$" nil t))
509 (setq icase (string= "i" (match-string 2))
510 pattern (regex-tests-unextend (match-string 1))))
512 ;; string. read it in, match against pattern, and save all the results
513 ((save-excursion (re-search-forward "^ \\(.*\\)" nil t))
514 (let ((case-fold-search icase))
515 (setq string (match-string 1)
517 ;; the regex match under test
518 what-failed
519 (condition-case nil
520 (if (string-match pattern string) nil 'search-failed)
521 ('invalid-regexp 'compilation-failed))
523 matches-observed
524 (cl-loop for x from 0 to 20
525 collect (and (not what-failed)
526 (or (match-string x string) "<unset>")))))
527 nil)
529 ;; verification line: failed match
530 ((save-excursion (re-search-forward "^No match" nil t))
531 (unless what-failed
532 (setq failures
533 (cons (format "line number %d: Regex '%s': Expected no match; but match"
534 line-number pattern)
535 failures))))
537 ;; verification line: succeeded match
538 ((save-excursion (re-search-forward "^ *\\([0-9]+\\): \\(.*\\)" nil t))
539 (let* ((match-ref (match-string 2))
540 (idx (string-to-number (match-string 1))))
542 (if what-failed
543 "Expected match; but no match"
544 (unless (string= match-ref (elt matches-observed idx))
545 (setq failures
546 (cons (format "line number %d: Regex '%s': Have expected match, but group %d is wrong: '%s'/'%s'"
547 line-number pattern
548 idx match-ref (elt matches-observed idx))
549 failures))))))
551 ;; reset
552 (t (setq pattern nil) nil)))
554 failures))
556 (defconst regex-tests-PTESTS-whitelist
558 ;; emacs doesn't barf on weird ranges such as [b-a], but simply
559 ;; fails to match
562 ;; emacs doesn't see DEL (0x78) as a [:cntrl:] character
565 "Line numbers in the PTESTS test that should be skipped. These
566 are false-positive test failures that represent known/benign
567 differences in behavior.")
569 ;; - Format
570 ;; - fields separated by ¦ (note: this is not a |)
571 ;; - start¦end¦pattern¦string
572 ;; - start is the 1-based index of the first character
573 ;; - end is the 1-based index of the last character
574 (defun regex-tests-PTESTS ()
575 (let (failures)
576 (regex-tests-generic-line
577 ?# "PTESTS" regex-tests-PTESTS-whitelist
578 (let* ((fields (split-string (buffer-string) "¦"))
580 ;; string has 1-based index of first char in the
581 ;; match. -1 means "no match". -2 means "invalid
582 ;; regex".
584 ;; start-ref is 0-based index of first char in the
585 ;; match
587 ;; string==0 is a special case, and I have to treat
588 ;; it as start-ref = 0
589 (start-ref (let ((raw (string-to-number (elt fields 0))))
590 (cond
591 ((= raw -2) 'compilation-failed)
592 ((= raw -1) 'search-failed)
593 ((= raw 0) 0)
594 (t (1- raw)))))
596 ;; string has 1-based index of last char in the
597 ;; match. end-ref is 0-based index of first char past
598 ;; the match
599 (end-ref (string-to-number (elt fields 1)))
600 (pattern (elt fields 2))
601 (string (elt fields 3)))
603 (let ((msg (regex-tests-match pattern string (list start-ref end-ref))))
604 (when msg
605 (setq failures
606 (cons (format "line number %d: Regex '%s': %s"
607 line-number pattern msg)
608 failures))))))
609 failures))
611 (defconst regex-tests-TESTS-whitelist
613 ;; emacs doesn't barf on weird ranges such as [b-a], but simply
614 ;; fails to match
617 ;; emacs is more forgiving with * and ? that don't apply to
618 ;; characters
619 57 58 59 60
621 ;; emacs is more stringent with regexes involving unbalanced )
624 "Line numbers in the TESTS test that should be skipped. These
625 are false-positive test failures that represent known/benign
626 differences in behavior.")
628 ;; - Format
629 ;; - fields separated by :. Watch for [\[:xxx:]]
630 ;; - expected:pattern:string
632 ;; expected:
633 ;; | 0 | successful match |
634 ;; | 1 | failed match |
635 ;; | 2 | regcomp() should fail |
636 (defun regex-tests-TESTS ()
637 (let (failures)
638 (regex-tests-generic-line
639 ?# "TESTS" regex-tests-TESTS-whitelist
640 (if (save-excursion (re-search-forward "^\\([^:]+\\):\\(.*\\):\\([^:]*\\)$" nil t))
641 (let* ((what-failed
642 (let ((raw (string-to-number (match-string 1))))
643 (cond
644 ((= raw 2) 'compilation-failed)
645 ((= raw 1) 'search-failed)
646 (t t))))
647 (string (match-string 3))
648 (pattern (regex-tests-unextend (match-string 2))))
650 (let ((msg (regex-tests-match pattern string nil (list what-failed))))
651 (when msg
652 (setq failures
653 (cons (format "line number %d: Regex '%s': %s"
654 line-number pattern msg)
655 failures)))))
657 (error "Error parsing TESTS file line: '%s'" (buffer-string))))
658 failures))
660 (ert-deftest regex-tests-BOOST ()
661 "Tests of the regular expression engine.
662 This evaluates the BOOST test cases from glibc."
663 (should-not (regex-tests-BOOST)))
665 (ert-deftest regex-tests-PCRE ()
666 "Tests of the regular expression engine.
667 This evaluates the PCRE test cases from glibc."
668 (should-not (regex-tests-PCRE)))
670 (ert-deftest regex-tests-PTESTS ()
671 "Tests of the regular expression engine.
672 This evaluates the PTESTS test cases from glibc."
673 (should-not (regex-tests-PTESTS)))
675 (ert-deftest regex-tests-TESTS ()
676 "Tests of the regular expression engine.
677 This evaluates the TESTS test cases from glibc."
678 (should-not (regex-tests-TESTS)))
680 ;;; regex-tests.el ends here