(latexenc-find-file-coding-system): Don't inherit the EOL part of the
[emacs.git] / lisp / emacs-lisp / sregex.el
blob3f7aaa16bceb525933f1ffa30f8f0254f80a6c6d
1 ;;; sregex.el --- symbolic regular expressions
3 ;; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
5 ;; Author: Bob Glickstein <bobg+sregex@zanshin.com>
6 ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com>
7 ;; Keywords: extensions
9 ;; This file is part of GNU Emacs.
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 2, or (at your option)
14 ;; any later version.
16 ;; GNU Emacs is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 ;; Boston, MA 02111-1307, USA.
26 ;;; Commentary:
28 ;; This package allows you to write regular expressions using a
29 ;; totally new, Lisp-like syntax.
31 ;; A "symbolic regular expression" (sregex for short) is a Lisp form
32 ;; that, when evaluated, produces the string form of the specified
33 ;; regular expression. Here's a simple example:
35 ;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert"
37 ;; As you can see, an sregex is specified by placing one or more
38 ;; special clauses in a call to `sregexq'. The clause in this case is
39 ;; the `or' of two strings (not to be confused with the Lisp function
40 ;; `or'). The list of allowable clauses appears below.
42 ;; With sregex, it is never necessary to "escape" magic characters
43 ;; that are meant to be taken literally; that happens automatically.
44 ;; For example:
46 ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H"
48 ;; It is also unnecessary to "group" parts of the expression together
49 ;; to overcome operator precedence; that also happens automatically.
50 ;; For example:
52 ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(?:Bob\\|Robert\\)?"
54 ;; It *is* possible to group parts of the expression in order to refer
55 ;; to them with numbered backreferences:
57 ;; (sregexq (group (or "Go" "Run"))
58 ;; ", Spot, "
59 ;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1"
61 ;; `sregexq' is a macro. Each time it is used, it constructs a simple
62 ;; Lisp expression that then invokes a moderately complex engine to
63 ;; interpret the sregex and render the string form. Because of this,
64 ;; I don't recommend sprinkling calls to `sregexq' throughout your
65 ;; code, the way one normally does with string regexes (which are
66 ;; cheap to evaluate). Instead, it's wiser to precompute the regexes
67 ;; you need wherever possible instead of repeatedly constructing the
68 ;; same ones over and over. Example:
70 ;; (let ((field-regex (sregexq (opt "resent-")
71 ;; (or "to" "cc" "bcc"))))
72 ;; ...
73 ;; (while ...
74 ;; ...
75 ;; (re-search-forward field-regex ...)
76 ;; ...))
78 ;; The arguments to `sregexq' are automatically quoted, but the
79 ;; flipside of this is that it is not straightforward to include
80 ;; computed (i.e., non-constant) values in `sregexq' expressions. So
81 ;; `sregex' is a function that is like `sregexq' but which does not
82 ;; automatically quote its values. Literal sregex clauses must be
83 ;; explicitly quoted like so:
85 ;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert"
87 ;; but computed clauses can be included easily, allowing for the reuse
88 ;; of common clauses:
90 ;; (let ((dotstar '(0+ any))
91 ;; (whitespace '(1+ (syntax ?-)))
92 ;; (digits '(1+ (char (?0 . ?9)))))
93 ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+"
95 ;; To use this package in a Lisp program, simply (require 'sregex).
97 ;; Here are the clauses allowed in an `sregex' or `sregexq'
98 ;; expression:
100 ;; - a string
101 ;; This stands for the literal string. If it contains
102 ;; metacharacters, they will be escaped in the resulting regex
103 ;; (using `regexp-quote').
105 ;; - the symbol `any'
106 ;; This stands for ".", a regex matching any character except
107 ;; newline.
109 ;; - the symbol `bol'
110 ;; Stands for "^", matching the empty string at the beginning of a line
112 ;; - the symbol `eol'
113 ;; Stands for "$", matching the empty string at the end of a line
115 ;; - (group CLAUSE ...)
116 ;; Groups the given CLAUSEs using "\\(" and "\\)".
118 ;; - (sequence CLAUSE ...)
120 ;; Groups the given CLAUSEs; may or may not use "\\(?:" and "\\)".
121 ;; Clauses grouped by `sequence' do not count for purposes of
122 ;; numbering backreferences. Use `sequence' in situations like
123 ;; this:
125 ;; (sregexq (or "dog" "cat"
126 ;; (sequence (opt "sea ") "monkey")))
127 ;; => "dog\\|cat\\|\\(?:sea \\)?monkey"
129 ;; where a single `or' alternate needs to contain multiple
130 ;; subclauses.
132 ;; - (backref N)
133 ;; Matches the same string previously matched by the Nth "group" in
134 ;; the same sregex. N is a positive integer.
136 ;; - (or CLAUSE ...)
137 ;; Matches any one of the CLAUSEs by separating them with "\\|".
139 ;; - (0+ CLAUSE ...)
140 ;; Concatenates the given CLAUSEs and matches zero or more
141 ;; occurrences by appending "*".
143 ;; - (1+ CLAUSE ...)
144 ;; Concatenates the given CLAUSEs and matches one or more
145 ;; occurrences by appending "+".
147 ;; - (opt CLAUSE ...)
148 ;; Concatenates the given CLAUSEs and matches zero or one occurrence
149 ;; by appending "?".
151 ;; - (repeat MIN MAX CLAUSE ...)
152 ;; Concatenates the given CLAUSEs and constructs a regex matching at
153 ;; least MIN occurrences and at most MAX occurrences. MIN must be a
154 ;; non-negative integer. MAX must be a non-negative integer greater
155 ;; than or equal to MIN; or MAX can be nil to mean "infinity."
157 ;; - (char CHAR-CLAUSE ...)
158 ;; Creates a "character class" matching one character from the given
159 ;; set. See below for how to construct a CHAR-CLAUSE.
161 ;; - (not-char CHAR-CLAUSE ...)
162 ;; Creates a "character class" matching any one character not in the
163 ;; given set. See below for how to construct a CHAR-CLAUSE.
165 ;; - the symbol `bot'
166 ;; Stands for "\\`", matching the empty string at the beginning of
167 ;; text (beginning of a string or of a buffer).
169 ;; - the symbol `eot'
170 ;; Stands for "\\'", matching the empty string at the end of text.
172 ;; - the symbol `point'
173 ;; Stands for "\\=", matching the empty string at point.
175 ;; - the symbol `word-boundary'
176 ;; Stands for "\\b", matching the empty string at the beginning or
177 ;; end of a word.
179 ;; - the symbol `not-word-boundary'
180 ;; Stands for "\\B", matching the empty string not at the beginning
181 ;; or end of a word.
183 ;; - the symbol `bow'
184 ;; Stands for "\\<", matching the empty string at the beginning of a
185 ;; word.
187 ;; - the symbol `eow'
188 ;; Stands for "\\>", matching the empty string at the end of a word.
190 ;; - the symbol `wordchar'
191 ;; Stands for the regex "\\w", matching a word-constituent character
192 ;; (as determined by the current syntax table)
194 ;; - the symbol `not-wordchar'
195 ;; Stands for the regex "\\W", matching a non-word-constituent
196 ;; character.
198 ;; - (syntax CODE)
199 ;; Stands for the regex "\\sCODE", where CODE is a syntax table code
200 ;; (a single character). Matches any character with the requested
201 ;; syntax.
203 ;; - (not-syntax CODE)
204 ;; Stands for the regex "\\SCODE", where CODE is a syntax table code
205 ;; (a single character). Matches any character without the
206 ;; requested syntax.
208 ;; - (regex REGEX)
209 ;; This is a "trapdoor" for including ordinary regular expression
210 ;; strings in the result. Some regular expressions are clearer when
211 ;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
212 ;; instance. However, see the note under "Bugs," below.
214 ;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
215 ;; has one of the following forms:
217 ;; - a character
218 ;; Adds that character to the set.
220 ;; - a string
221 ;; Adds all the characters in the string to the set.
223 ;; - A pair (MIN . MAX)
224 ;; Where MIN and MAX are characters, adds the range of characters
225 ;; from MIN through MAX to the set.
227 ;;; To do:
229 ;; An earlier version of this package could optionally translate the
230 ;; symbolic regex into other languages' syntaxes, e.g. Perl. For
231 ;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
232 ;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore
233 ;; such a facility.
235 ;; - handle multibyte chars in sregex--char-aux
236 ;; - add support for character classes ([:blank:], ...)
237 ;; - add support for non-greedy operators *? and +?
238 ;; - bug: (sregexq (opt (opt ?a))) returns "a??" which is a non-greedy "a?"
240 ;;; Bugs:
242 ;;; Code:
244 (eval-when-compile (require 'cl))
246 ;; Compatibility code for when we didn't have shy-groups
247 (defvar sregex--current-sregex nil)
248 (defun sregex-info () nil)
249 (defmacro sregex-save-match-data (&rest forms) (cons 'save-match-data forms))
250 (defun sregex-replace-match (r &optional f l str subexp x)
251 (replace-match r f l str subexp))
252 (defun sregex-match-string (c &optional i x) (match-string c i))
253 (defun sregex-match-string-no-properties (count &optional in-string sregex)
254 (match-string-no-properties count in-string))
255 (defun sregex-match-beginning (count &optional sregex) (match-beginning count))
256 (defun sregex-match-end (count &optional sregex) (match-end count))
257 (defun sregex-match-data (&optional sregex) (match-data))
258 (defun sregex-backref-num (n &optional sregex) n)
261 (defun sregex (&rest exps)
262 "Symbolic regular expression interpreter.
263 This is exactly like `sregexq' (q.v.) except that it evaluates all its
264 arguments, so literal sregex clauses must be quoted. For example:
266 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
268 An argument-evaluating sregex interpreter lets you reuse sregex
269 subexpressions:
271 (let ((dotstar '(0+ any))
272 (whitespace '(1+ (syntax ?-)))
273 (digits '(1+ (char (?0 . ?9)))))
274 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\""
275 (sregex--sequence exps nil))
277 (defmacro sregexq (&rest exps)
278 "Symbolic regular expression interpreter.
279 This macro allows you to specify a regular expression (regexp) in
280 symbolic form, and converts it into the string form required by Emacs's
281 regex functions such as `re-search-forward' and `looking-at'. Here is
282 a simple example:
284 (sregexq (or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
286 As you can see, an sregex is specified by placing one or more special
287 clauses in a call to `sregexq'. The clause in this case is the `or'
288 of two strings (not to be confused with the Lisp function `or'). The
289 list of allowable clauses appears below.
291 With `sregex', it is never necessary to \"escape\" magic characters
292 that are meant to be taken literally; that happens automatically.
293 For example:
295 (sregexq \"M*A*S*H\") => \"M\\\\*A\\\\*S\\\\*H\"
297 It is also unnecessary to \"group\" parts of the expression together
298 to overcome operator precedence; that also happens automatically.
299 For example:
301 (sregexq (opt (or \"Bob\" \"Robert\"))) => \"\\\\(Bob\\\\|Robert\\\\)?\"
303 It *is* possible to group parts of the expression in order to refer
304 to them with numbered backreferences:
306 (sregexq (group (or \"Go\" \"Run\"))
307 \", Spot, \"
308 (backref 1)) => \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"
310 If `sregexq' needs to introduce its own grouping parentheses, it will
311 automatically renumber your backreferences:
313 (sregexq (opt \"resent-\")
314 (group (or \"to\" \"cc\" \"bcc\"))
315 \": \"
316 (backref 1)) => \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\"
318 `sregexq' is a macro. Each time it is used, it constructs a simple
319 Lisp expression that then invokes a moderately complex engine to
320 interpret the sregex and render the string form. Because of this, I
321 don't recommend sprinkling calls to `sregexq' throughout your code,
322 the way one normally does with string regexes (which are cheap to
323 evaluate). Instead, it's wiser to precompute the regexes you need
324 wherever possible instead of repeatedly constructing the same ones
325 over and over. Example:
327 (let ((field-regex (sregexq (opt \"resent-\")
328 (or \"to\" \"cc\" \"bcc\"))))
330 (while ...
332 (re-search-forward field-regex ...)
333 ...))
335 The arguments to `sregexq' are automatically quoted, but the
336 flipside of this is that it is not straightforward to include
337 computed (i.e., non-constant) values in `sregexq' expressions. So
338 `sregex' is a function that is like `sregexq' but which does not
339 automatically quote its values. Literal sregex clauses must be
340 explicitly quoted like so:
342 (sregex '(or \"Bob\" \"Robert\")) => \"Bob\\\\|Robert\"
344 but computed clauses can be included easily, allowing for the reuse
345 of common clauses:
347 (let ((dotstar '(0+ any))
348 (whitespace '(1+ (syntax ?-)))
349 (digits '(1+ (char (?0 . ?9)))))
350 (sregex 'bol dotstar \":\" whitespace digits)) => \"^.*:\\\\s-+[0-9]+\"
352 Here are the clauses allowed in an `sregex' or `sregexq' expression:
354 - a string
355 This stands for the literal string. If it contains
356 metacharacters, they will be escaped in the resulting regex
357 (using `regexp-quote').
359 - the symbol `any'
360 This stands for \".\", a regex matching any character except
361 newline.
363 - the symbol `bol'
364 Stands for \"^\", matching the empty string at the beginning of a line
366 - the symbol `eol'
367 Stands for \"$\", matching the empty string at the end of a line
369 - (group CLAUSE ...)
370 Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".
372 - (sequence CLAUSE ...)
374 Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\".
375 Clauses grouped by `sequence' do not count for purposes of
376 numbering backreferences. Use `sequence' in situations like
377 this:
379 (sregexq (or \"dog\" \"cat\"
380 (sequence (opt \"sea \") \"monkey\")))
381 => \"dog\\\\|cat\\\\|\\\\(?:sea \\\\)?monkey\"
383 where a single `or' alternate needs to contain multiple
384 subclauses.
386 - (backref N)
387 Matches the same string previously matched by the Nth \"group\" in
388 the same sregex. N is a positive integer.
390 - (or CLAUSE ...)
391 Matches any one of the CLAUSEs by separating them with \"\\\\|\".
393 - (0+ CLAUSE ...)
394 Concatenates the given CLAUSEs and matches zero or more
395 occurrences by appending \"*\".
397 - (1+ CLAUSE ...)
398 Concatenates the given CLAUSEs and matches one or more
399 occurrences by appending \"+\".
401 - (opt CLAUSE ...)
402 Concatenates the given CLAUSEs and matches zero or one occurrence
403 by appending \"?\".
405 - (repeat MIN MAX CLAUSE ...)
406 Concatenates the given CLAUSEs and constructs a regex matching at
407 least MIN occurrences and at most MAX occurrences. MIN must be a
408 non-negative integer. MAX must be a non-negative integer greater
409 than or equal to MIN; or MAX can be nil to mean \"infinity.\"
411 - (char CHAR-CLAUSE ...)
412 Creates a \"character class\" matching one character from the given
413 set. See below for how to construct a CHAR-CLAUSE.
415 - (not-char CHAR-CLAUSE ...)
416 Creates a \"character class\" matching any one character not in the
417 given set. See below for how to construct a CHAR-CLAUSE.
419 - the symbol `bot'
420 Stands for \"\\\\`\", matching the empty string at the beginning of
421 text (beginning of a string or of a buffer).
423 - the symbol `eot'
424 Stands for \"\\\\'\", matching the empty string at the end of text.
426 - the symbol `point'
427 Stands for \"\\\\=\", matching the empty string at point.
429 - the symbol `word-boundary'
430 Stands for \"\\\\b\", matching the empty string at the beginning or
431 end of a word.
433 - the symbol `not-word-boundary'
434 Stands for \"\\\\B\", matching the empty string not at the beginning
435 or end of a word.
437 - the symbol `bow'
438 Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
439 word.
441 - the symbol `eow'
442 Stands for \"\\\\\\=>\", matching the empty string at the end of a word.
444 - the symbol `wordchar'
445 Stands for the regex \"\\\\w\", matching a word-constituent character
446 (as determined by the current syntax table)
448 - the symbol `not-wordchar'
449 Stands for the regex \"\\\\W\", matching a non-word-constituent
450 character.
452 - (syntax CODE)
453 Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
454 (a single character). Matches any character with the requested
455 syntax.
457 - (not-syntax CODE)
458 Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
459 (a single character). Matches any character without the
460 requested syntax.
462 - (regex REGEX)
463 This is a \"trapdoor\" for including ordinary regular expression
464 strings in the result. Some regular expressions are clearer when
465 written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
466 instance.
468 Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
469 has one of the following forms:
471 - a character
472 Adds that character to the set.
474 - a string
475 Adds all the characters in the string to the set.
477 - A pair (MIN . MAX)
478 Where MIN and MAX are characters, adds the range of characters
479 from MIN through MAX to the set."
480 `(apply 'sregex ',exps))
482 (defun sregex--engine (exp combine)
483 (cond
484 ((stringp exp)
485 (if (and combine
486 (eq combine 'suffix)
487 (/= (length exp) 1))
488 (concat "\\(?:" (regexp-quote exp) "\\)")
489 (regexp-quote exp)))
490 ((symbolp exp)
491 (ecase exp
492 (any ".")
493 (bol "^")
494 (eol "$")
495 (wordchar "\\w")
496 (not-wordchar "\\W")
497 (bot "\\`")
498 (eot "\\'")
499 (point "\\=")
500 (word-boundary "\\b")
501 (not-word-boundary "\\B")
502 (bow "\\<")
503 (eow "\\>")))
504 ((consp exp)
505 (funcall (intern (concat "sregex--"
506 (symbol-name (car exp))))
507 (cdr exp)
508 combine))
509 (t (error "Invalid expression: %s" exp))))
511 (defun sregex--sequence (exps combine)
512 (if (= (length exps) 1) (sregex--engine (car exps) combine)
513 (let ((re (mapconcat
514 (lambda (e) (sregex--engine e 'concat))
515 exps "")))
516 (if (eq combine 'suffix)
517 (concat "\\(?:" re "\\)")
518 re))))
520 (defun sregex--or (exps combine)
521 (if (= (length exps) 1) (sregex--engine (car exps) combine)
522 (let ((re (mapconcat
523 (lambda (e) (sregex--engine e 'or))
524 exps "\\|")))
525 (if (not (eq combine 'or))
526 (concat "\\(?:" re "\\)")
527 re))))
529 (defun sregex--group (exps combine) (concat "\\(" (sregex--sequence exps nil) "\\)"))
531 (defun sregex--backref (exps combine) (concat "\\" (int-to-string (car exps))))
532 (defun sregex--opt (exps combine) (concat (sregex--sequence exps 'suffix) "?"))
533 (defun sregex--0+ (exps combine) (concat (sregex--sequence exps 'suffix) "*"))
534 (defun sregex--1+ (exps combine) (concat (sregex--sequence exps 'suffix) "+"))
536 (defun sregex--char (exps combine) (sregex--char-aux nil exps))
537 (defun sregex--not-char (exps combine) (sregex--char-aux t exps))
539 (defun sregex--syntax (exps combine) (format "\\s%c" (car exps)))
540 (defun sregex--not-syntax (exps combine) (format "\\S%c" (car exps)))
542 (defun sregex--regex (exps combine)
543 (if combine (concat "\\(?:" (car exps) "\\)") (car exps)))
545 (defun sregex--repeat (exps combine)
546 (let* ((min (or (pop exps) 0))
547 (minstr (number-to-string min))
548 (max (pop exps)))
549 (concat (sregex--sequence exps 'suffix)
550 (concat "\\{" minstr ","
551 (when max (number-to-string max)) "\\}"))))
553 (defun sregex--char-range (start end)
554 (let ((startc (char-to-string start))
555 (endc (char-to-string end)))
556 (cond
557 ((> end (+ start 2)) (concat startc "-" endc))
558 ((> end (+ start 1)) (concat startc (char-to-string (1+ start)) endc))
559 ((> end start) (concat startc endc))
560 (t startc))))
562 (defun sregex--char-aux (complement args)
563 ;; regex-opt does the same, we should join effort.
564 (let ((chars (make-bool-vector 256 nil))) ; Yeah, right!
565 (dolist (arg args)
566 (cond ((integerp arg) (aset chars arg t))
567 ((stringp arg) (mapcar (lambda (c) (aset chars c t)) arg))
568 ((consp arg)
569 (let ((start (car arg))
570 (end (cdr arg)))
571 (when (> start end)
572 (let ((tmp start)) (setq start end) (setq end tmp)))
573 ;; now start <= end
574 (let ((i start))
575 (while (<= i end)
576 (aset chars i t)
577 (setq i (1+ i))))))))
578 ;; now chars is a map of the characters in the class
579 (let ((caret (aref chars ?^))
580 (dash (aref chars ?-))
581 (class (if (aref chars ?\]) "]" "")))
582 (aset chars ?^ nil)
583 (aset chars ?- nil)
584 (aset chars ?\] nil)
586 (let (start end)
587 (dotimes (i 256)
588 (if (aref chars i)
589 (progn
590 (unless start (setq start i))
591 (setq end i)
592 (aset chars i nil))
593 (when start
594 (setq class (concat class (sregex--char-range start end)))
595 (setq start nil))))
596 (if start
597 (setq class (concat class (sregex--char-range start end)))))
599 (if (> (length class) 0)
600 (setq class (concat class (if caret "^") (if dash "-")))
601 (setq class (concat class (if dash "-") (if caret "^"))))
602 (if (and (not complement) (= (length class) 1))
603 (regexp-quote class)
604 (concat "[" (if complement "^") class "]")))))
606 (provide 'sregex)
608 ;;; arch-tag: 460c1f5a-eb6e-42ec-a451-ffac78bdf492
609 ;;; sregex.el ends here