1 ;;; sregex.el --- symbolic regular expressions
3 ;; Copyright (C) 1997, 1998, 2000 Free Software Foundation, Inc.
5 ;; Author: Bob Glickstein <bobg+sregex@zanshin.com>
6 ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com>
7 ;; Keywords: extensions
9 ;; This file is part of GNU Emacs.
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; GNU Emacs is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 ;; Boston, MA 02111-1307, USA.
28 ;; This package allows you to write regular expressions using a
29 ;; totally new, Lisp-like syntax.
31 ;; A "symbolic regular expression" (sregex for short) is a Lisp form
32 ;; that, when evaluated, produces the string form of the specified
33 ;; regular expression. Here's a simple example:
35 ;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert"
37 ;; As you can see, an sregex is specified by placing one or more
38 ;; special clauses in a call to `sregexq'. The clause in this case is
39 ;; the `or' of two strings (not to be confused with the Lisp function
40 ;; `or'). The list of allowable clauses appears below.
42 ;; With sregex, it is never necessary to "escape" magic characters
43 ;; that are meant to be taken literally; that happens automatically.
46 ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H"
48 ;; It is also unnecessary to "group" parts of the expression together
49 ;; to overcome operator precedence; that also happens automatically.
52 ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(?:Bob\\|Robert\\)?"
54 ;; It *is* possible to group parts of the expression in order to refer
55 ;; to them with numbered backreferences:
;; (sregexq (group (or "Go" "Run"))
;;          ", Spot, "
;;          (backref 1))  =>  "\\(Go\\|Run\\), Spot, \\1"
61 ;; `sregexq' is a macro. Each time it is used, it constructs a simple
62 ;; Lisp expression that then invokes a moderately complex engine to
63 ;; interpret the sregex and render the string form. Because of this,
64 ;; I don't recommend sprinkling calls to `sregexq' throughout your
65 ;; code, the way one normally does with string regexes (which are
66 ;; cheap to evaluate). Instead, it's wiser to precompute the regexes
67 ;; you need wherever possible instead of repeatedly constructing the
68 ;; same ones over and over. Example:
;; (let ((field-regex (sregexq (opt "resent-")
;;                             (or "to" "cc" "bcc"))))
;;   ...
;;   (while ...
;;     ...
;;     (re-search-forward field-regex ...)
;;     ...))
78 ;; The arguments to `sregexq' are automatically quoted, but the
79 ;; flipside of this is that it is not straightforward to include
80 ;; computed (i.e., non-constant) values in `sregexq' expressions. So
81 ;; `sregex' is a function that is like `sregexq' but which does not
82 ;; automatically quote its values. Literal sregex clauses must be
83 ;; explicitly quoted like so:
85 ;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert"
;; but computed clauses can be included easily, allowing for the reuse
;; of common clauses:
;;
;; (let ((dotstar '(0+ any))
91 ;; (whitespace '(1+ (syntax ?-)))
92 ;; (digits '(1+ (char (?0 . ?9)))))
93 ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+"
95 ;; To use this package in a Lisp program, simply (require 'sregex).
;; Here are the clauses allowed in an `sregex' or `sregexq'
;; expression:
;;
;; - a string
;;   This stands for the literal string.  If it contains
;;   metacharacters, they will be escaped in the resulting regex
;;   (using `regexp-quote').
;; - the symbol `any'
;;   This stands for ".", a regex matching any character except
;;   newline.
109 ;; - the symbol `bol'
110 ;; Stands for "^", matching the empty string at the beginning of a line
112 ;; - the symbol `eol'
113 ;; Stands for "$", matching the empty string at the end of a line
115 ;; - (group CLAUSE ...)
116 ;; Groups the given CLAUSEs using "\\(" and "\\)".
118 ;; - (sequence CLAUSE ...)
120 ;; Groups the given CLAUSEs; may or may not use "\\(?:" and "\\)".
;;   Clauses grouped by `sequence' do not count for purposes of
;;   numbering backreferences.  Use `sequence' in situations like
;;   this:
125 ;; (sregexq (or "dog" "cat"
126 ;; (sequence (opt "sea ") "monkey")))
127 ;; => "dog\\|cat\\|\\(?:sea \\)?monkey"
;;   where a single `or' alternate needs to contain multiple
;;   subclauses.
;;
;; - (backref N)
;;   Matches the same string previously matched by the Nth "group" in
;;   the same sregex.  N is a positive integer.
;;
;; - (or CLAUSE ...)
;;   Matches any one of the CLAUSEs by separating them with "\\|".
;;
;; - (0+ CLAUSE ...)
;;   Concatenates the given CLAUSEs and matches zero or more
;;   occurrences by appending "*".
;;
;; - (1+ CLAUSE ...)
;;   Concatenates the given CLAUSEs and matches one or more
;;   occurrences by appending "+".
;;
;; - (opt CLAUSE ...)
;;   Concatenates the given CLAUSEs and matches zero or one occurrence
;;   by appending "?".
151 ;; - (repeat MIN MAX CLAUSE ...)
152 ;; Concatenates the given CLAUSEs and constructs a regex matching at
153 ;; least MIN occurrences and at most MAX occurrences. MIN must be a
154 ;; non-negative integer. MAX must be a non-negative integer greater
155 ;; than or equal to MIN; or MAX can be nil to mean "infinity."
157 ;; - (char CHAR-CLAUSE ...)
158 ;; Creates a "character class" matching one character from the given
159 ;; set. See below for how to construct a CHAR-CLAUSE.
161 ;; - (not-char CHAR-CLAUSE ...)
162 ;; Creates a "character class" matching any one character not in the
163 ;; given set. See below for how to construct a CHAR-CLAUSE.
165 ;; - the symbol `bot'
166 ;; Stands for "\\`", matching the empty string at the beginning of
167 ;; text (beginning of a string or of a buffer).
169 ;; - the symbol `eot'
170 ;; Stands for "\\'", matching the empty string at the end of text.
172 ;; - the symbol `point'
173 ;; Stands for "\\=", matching the empty string at point.
;; - the symbol `word-boundary'
;;   Stands for "\\b", matching the empty string at the beginning or
;;   end of a word.
;;
;; - the symbol `not-word-boundary'
;;   Stands for "\\B", matching the empty string not at the beginning
;;   or end of a word.
;;
;; - the symbol `bow'
;;   Stands for "\\<", matching the empty string at the beginning of a
;;   word.
;;
;; - the symbol `eow'
;;   Stands for "\\>", matching the empty string at the end of a word.
190 ;; - the symbol `wordchar'
191 ;; Stands for the regex "\\w", matching a word-constituent character
192 ;; (as determined by the current syntax table)
;; - the symbol `not-wordchar'
;;   Stands for the regex "\\W", matching a non-word-constituent
;;   character.
;;
;; - (syntax CODE)
;;   Stands for the regex "\\sCODE", where CODE is a syntax table code
;;   (a single character).  Matches any character with the requested
;;   syntax.
;;
;; - (not-syntax CODE)
;;   Stands for the regex "\\SCODE", where CODE is a syntax table code
;;   (a single character).  Matches any character without the
;;   requested syntax.
;;
;; - (regex REGEX)
;;   This is a "trapdoor" for including ordinary regular expression
;;   strings in the result.  Some regular expressions are clearer when
;;   written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
;;   instance.  However, see the note under "Bugs," below.
;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
;; has one of the following forms:
;;
;; - A character
;;   Adds that character to the set.
;;
;; - A string
;;   Adds all the characters in the string to the set.
;;
;; - A pair (MIN . MAX)
;;   Where MIN and MAX are characters, adds the range of characters
;;   from MIN through MAX to the set.
;; An earlier version of this package could optionally translate the
;; symbolic regex into other languages' syntaxes, e.g. Perl.  For
;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
;; yield "ab|cd" instead of "ab\\|cd".  It might be useful to restore
;; this ability.
235 ;; - handle multibyte chars in sregex--char-aux
236 ;; - add support for character classes ([:blank:], ...)
237 ;; - add support for non-greedy operators *? and +?
238 ;; - bug: (sregexq (opt (opt ?a))) returns "a??" which is a non-greedy "a?"
244 (eval-when-compile (require 'cl
))
246 ;; Compatibility code for when we didn't have shy-groups
(defvar sregex--current-sregex nil
  "Compatibility variable kept from the pre-shy-group implementation.
It is initialized to nil; none of the definitions that accompany it
read or set it.")
(defun sregex-info ()
  "Return nil.
This is a compatibility stub: it takes no arguments and always
yields nil."
  nil)
(defmacro sregex-save-match-data (&rest forms)
  "Evaluate FORMS inside `save-match-data'.
The expansion is exactly (save-match-data FORMS...), so the value is
that of the last form and the match data is restored afterwards."
  `(save-match-data ,@forms))
(defun sregex-replace-match (r &optional f l str subexp x)
  "Call `replace-match' with R, F, L, STR and SUBEXP, in that order.
Return whatever `replace-match' returns.  The trailing argument X is
accepted for compatibility and ignored."
  (replace-match r f l str subexp))
(defun sregex-match-string (c &optional i x)
  "Return the result of `match-string' for subexpression C.
I, when non-nil, is passed to `match-string' as the string the last
match was made against.  The trailing argument X is accepted for
compatibility and ignored."
  (match-string c i))
(defun sregex-match-string-no-properties (count &optional in-string sregex)
  "Return the result of `match-string-no-properties' for COUNT.
IN-STRING is passed through as that function's second argument.
The trailing argument SREGEX is accepted for compatibility and
ignored."
  (match-string-no-properties count in-string))
(defun sregex-match-beginning (count &optional sregex)
  "Return the result of `match-beginning' for subexpression COUNT.
The trailing argument SREGEX is accepted for compatibility and
ignored."
  (match-beginning count))
(defun sregex-match-end (count &optional sregex)
  "Return the result of `match-end' for subexpression COUNT.
The trailing argument SREGEX is accepted for compatibility and
ignored."
  (match-end count))
(defun sregex-match-data (&optional sregex)
  "Return the result of calling `match-data' with no arguments.
The argument SREGEX is accepted for compatibility and ignored."
  (match-data))
(defun sregex-backref-num (n &optional sregex)
  "Return N unchanged.
The trailing argument SREGEX is accepted for compatibility and
ignored."
  n)
(defun sregex (&rest exps)
  "Symbolic regular expression interpreter.
This is exactly like `sregexq' (q.v.) except that it evaluates all its
arguments, so literal sregex clauses must be quoted.  For example:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

An argument-evaluating sregex interpreter lets you reuse sregex
subexpressions:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\"

EXPS is the list of clauses; they are rendered as one sequence with
no surrounding combination context."
  (sregex--sequence exps nil))
(defmacro sregexq (&rest exps)
  "Symbolic regular expression interpreter.
This macro allows you to specify a regular expression (regexp) in
symbolic form, and converts it into the string form required by Emacs's
regex functions such as `re-search-forward' and `looking-at'.  Here is
a simple example:

  (sregexq (or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

As you can see, an sregex is specified by placing one or more special
clauses in a call to `sregexq'.  The clause in this case is the `or'
of two strings (not to be confused with the Lisp function `or').  The
list of allowable clauses appears below.

With `sregex', it is never necessary to \"escape\" magic characters
that are meant to be taken literally; that happens automatically.
For example:

  (sregexq \"M*A*S*H\")  =>  \"M\\\\*A\\\\*S\\\\*H\"

It is also unnecessary to \"group\" parts of the expression together
to overcome operator precedence; that also happens automatically.
For example:

  (sregexq (opt (or \"Bob\" \"Robert\")))  =>  \"\\\\(?:Bob\\\\|Robert\\\\)?\"

The interpreter is free to add shy groups (\"\\\\(?:\" ... \"\\\\)\") that
are not shown in these examples, for instance around an `or' clause;
such groups change neither what the regexp matches nor the numbering
of backreferences.

It *is* possible to group parts of the expression in order to refer
to them with numbered backreferences:

  (sregexq (group (or \"Go\" \"Run\"))
           \", Spot, \"
           (backref 1))  =>  \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"

Grouping parentheses that `sregexq' introduces on its own are always
shy groups, so your backreference numbers are used exactly as written:

  (sregexq (opt \"resent-\")
           (group (or \"to\" \"cc\" \"bcc\"))
           \": \"
           (backref 1))  =>  \"\\\\(?:resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\1\"

`sregexq' is a macro.  Each time it is used, it constructs a simple
Lisp expression that then invokes a moderately complex engine to
interpret the sregex and render the string form.  Because of this, I
don't recommend sprinkling calls to `sregexq' throughout your code,
the way one normally does with string regexes (which are cheap to
evaluate).  Instead, it's wiser to precompute the regexes you need
wherever possible instead of repeatedly constructing the same ones
over and over.  Example:

  (let ((field-regex (sregexq (opt \"resent-\")
                              (or \"to\" \"cc\" \"bcc\"))))
    ...
    (while ...
      ...
      (re-search-forward field-regex ...)
      ...))

The arguments to `sregexq' are automatically quoted, but the
flipside of this is that it is not straightforward to include
computed (i.e., non-constant) values in `sregexq' expressions.  So
`sregex' is a function that is like `sregexq' but which does not
automatically quote its values.  Literal sregex clauses must be
explicitly quoted like so:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

but computed clauses can be included easily, allowing for the reuse
of common clauses:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\"

Here are the clauses allowed in an `sregex' or `sregexq' expression:

- a string
  This stands for the literal string.  If it contains
  metacharacters, they will be escaped in the resulting regex
  (using `regexp-quote').

- the symbol `any'
  This stands for \".\", a regex matching any character except
  newline.

- the symbol `bol'
  Stands for \"^\", matching the empty string at the beginning of a line

- the symbol `eol'
  Stands for \"$\", matching the empty string at the end of a line

- (group CLAUSE ...)
  Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".

- (sequence CLAUSE ...)
  Groups the given CLAUSEs; may or may not use \"\\\\(?:\" and \"\\\\)\".
  Clauses grouped by `sequence' do not count for purposes of
  numbering backreferences.  Use `sequence' in situations like
  this:

    (sregexq (or \"dog\" \"cat\"
                 (sequence (opt \"sea \") \"monkey\")))
                                 =>  \"dog\\\\|cat\\\\|\\\\(?:sea \\\\)?monkey\"

  where a single `or' alternate needs to contain multiple
  subclauses.

- (backref N)
  Matches the same string previously matched by the Nth \"group\" in
  the same sregex.  N is a positive integer.

- (or CLAUSE ...)
  Matches any one of the CLAUSEs by separating them with \"\\\\|\".

- (0+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or more
  occurrences by appending \"*\".

- (1+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches one or more
  occurrences by appending \"+\".

- (opt CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or one occurrence
  by appending \"?\".

- (repeat MIN MAX CLAUSE ...)
  Concatenates the given CLAUSEs and constructs a regex matching at
  least MIN occurrences and at most MAX occurrences.  MIN must be a
  non-negative integer.  MAX must be a non-negative integer greater
  than or equal to MIN; or MAX can be nil to mean \"infinity.\"

- (char CHAR-CLAUSE ...)
  Creates a \"character class\" matching one character from the given
  set.  See below for how to construct a CHAR-CLAUSE.

- (not-char CHAR-CLAUSE ...)
  Creates a \"character class\" matching any one character not in the
  given set.  See below for how to construct a CHAR-CLAUSE.

- the symbol `bot'
  Stands for \"\\\\`\", matching the empty string at the beginning of
  text (beginning of a string or of a buffer).

- the symbol `eot'
  Stands for \"\\\\'\", matching the empty string at the end of text.

- the symbol `point'
  Stands for \"\\\\\\==\", matching the empty string at point.

- the symbol `word-boundary'
  Stands for \"\\\\b\", matching the empty string at the beginning or
  end of a word.

- the symbol `not-word-boundary'
  Stands for \"\\\\B\", matching the empty string not at the beginning
  or end of a word.

- the symbol `bow'
  Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
  word.

- the symbol `eow'
  Stands for \"\\\\\\=>\", matching the empty string at the end of a word.

- the symbol `wordchar'
  Stands for the regex \"\\\\w\", matching a word-constituent character
  (as determined by the current syntax table)

- the symbol `not-wordchar'
  Stands for the regex \"\\\\W\", matching a non-word-constituent
  character.

- (syntax CODE)
  Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
  (a single character).  Matches any character with the requested
  syntax.

- (not-syntax CODE)
  Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
  (a single character).  Matches any character without the
  requested syntax.

- (regex REGEX)
  This is a \"trapdoor\" for including ordinary regular expression
  strings in the result.  Some regular expressions are clearer when
  written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
  instance.

Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
has one of the following forms:

- a character
  Adds that character to the set.

- a string
  Adds all the characters in the string to the set.

- A pair (MIN . MAX)
  Where MIN and MAX are characters, adds the range of characters
  from MIN through MAX to the set."
  ;; Quote the clause list as a whole so that none of the clauses is
  ;; evaluated, then hand it to the function form of the interpreter.
  `(apply 'sregex ',exps))
(defun sregex--engine (exp combine)
  "Render the single sregex clause EXP as a regexp string.

EXP is one of:
- a string, which is quoted with `regexp-quote';
- one of the symbols `any', `bol', `eol', `wordchar',
  `not-wordchar', `bot', `eot', `point', `word-boundary',
  `not-word-boundary', `bow' or `eow';
- a list (NAME ARG ...), which is rendered by calling the function
  named sregex--NAME with the list of ARGs and COMBINE.

COMBINE describes how the caller will use the result.  When it is the
symbol `suffix', a postfix operator is about to be appended, so a
string that is not exactly one character long is wrapped in a shy
group to make the operator apply to all of it.

Signal an error for any other kind of EXP, including an unknown
symbol."
  (cond
   ((stringp exp)
    (if (and (eq combine 'suffix)
             (/= (length exp) 1))
        (concat "\\(?:" (regexp-quote exp) "\\)")
      (regexp-quote exp)))
   ((symbolp exp)
    (let ((entry (assq exp '((any ".")
                             (bol "^")
                             (eol "$")
                             (wordchar "\\w")
                             (not-wordchar "\\W")
                             (bot "\\`")
                             (eot "\\'")
                             (point "\\=")
                             (word-boundary "\\b")
                             (not-word-boundary "\\B")
                             (bow "\\<")
                             (eow "\\>")))))
      (if entry
          (cadr entry)
        (error "Invalid expression: %s" exp))))
   ((consp exp)
    ;; Dispatch on the clause name: (opt ...) is handled by
    ;; `sregex--opt', (char ...) by `sregex--char', and so on.
    (funcall (intern (concat "sregex--"
                             (symbol-name (car exp))))
             (cdr exp)
             combine))
   (t (error "Invalid expression: %s" exp))))
(defun sregex--sequence (exps combine)
  "Render the list of clauses EXPS as one concatenated regexp string.

When EXPS holds exactly one clause, that clause is rendered directly
and COMBINE is handed down to it unchanged.  Otherwise every clause
is rendered in a `concat' context and the pieces are joined; if
COMBINE is the symbol `suffix' the joined result is wrapped in a shy
group so that a following postfix operator applies to the whole
sequence."
  (if (= (length exps) 1)
      (sregex--engine (car exps) combine)
    (let ((re (mapconcat (lambda (e) (sregex--engine e 'concat))
                         exps
                         "")))
      (if (eq combine 'suffix)
          (concat "\\(?:" re "\\)")
        re))))
(defun sregex--or (exps combine)
  "Render the list of clauses EXPS as a regexp alternation.

When EXPS holds exactly one clause, that clause is rendered directly
and COMBINE is handed down to it unchanged.  Otherwise every clause
is rendered in an `or' context and the pieces are joined with
\"\\\\|\".  Unless COMBINE is itself the symbol `or' (that is, unless
the result becomes an alternate of an enclosing alternation), the
joined result is wrapped in a shy group so that it can be combined
with its surroundings safely."
  (if (= (length exps) 1)
      (sregex--engine (car exps) combine)
    (let ((re (mapconcat (lambda (e) (sregex--engine e 'or))
                         exps
                         "\\|")))
      (if (not (eq combine 'or))
          (concat "\\(?:" re "\\)")
        re))))
(defun sregex--group (exps combine)
  "Render the clauses EXPS as a numbered group.
The clauses are rendered as one sequence with no combination context
and the result is enclosed in \"\\\\(\" and \"\\\\)\".  COMBINE is
ignored."
  (let ((inner (sregex--sequence exps nil)))
    (concat "\\(" inner "\\)")))
(defun sregex--backref (exps combine)
  "Render a backreference to the group whose number is the car of EXPS.
The result is a backslash followed by that number.  COMBINE is
ignored."
  (let ((group-number (car exps)))
    (concat "\\" (number-to-string group-number))))
(defun sregex--opt (exps combine)
  "Render the clauses EXPS as an optional (zero or one) match.
The clauses are rendered as one sequence in `suffix' context and
followed by \"?\".

COMBINE is the context in which the result will be used.  When it is
the symbol `suffix', another postfix operator is about to be appended
to the result, so the result is wrapped in a shy group.  Without the
group, (opt (opt \"a\")) would render as \"a??\", which Emacs reads as
a single non-greedy \"a?\"."
  (let ((re (concat (sregex--sequence exps 'suffix) "?")))
    (if (eq combine 'suffix)
        (concat "\\(?:" re "\\)")
      re)))
(defun sregex--0+ (exps combine)
  "Render the clauses EXPS as a zero-or-more repetition.
The clauses are rendered as one sequence in `suffix' context and
followed by \"*\".

COMBINE is the context in which the result will be used.  When it is
the symbol `suffix', another postfix operator is about to be appended
to the result, so the result is wrapped in a shy group.  Without the
group, (opt (0+ \"a\")) would render as \"a*?\", which Emacs reads as
a non-greedy \"a*\"."
  (let ((re (concat (sregex--sequence exps 'suffix) "*")))
    (if (eq combine 'suffix)
        (concat "\\(?:" re "\\)")
      re)))
(defun sregex--1+ (exps combine)
  "Render the clauses EXPS as a one-or-more repetition.
The clauses are rendered as one sequence in `suffix' context and
followed by \"+\".

COMBINE is the context in which the result will be used.  When it is
the symbol `suffix', another postfix operator is about to be appended
to the result, so the result is wrapped in a shy group.  Without the
group, (opt (1+ \"a\")) would render as \"a+?\", which Emacs reads as
a non-greedy \"a+\" and which therefore no longer matches the empty
string."
  (let ((re (concat (sregex--sequence exps 'suffix) "+")))
    (if (eq combine 'suffix)
        (concat "\\(?:" re "\\)")
      re)))
(defun sregex--char (exps combine)
  "Render the CHAR-CLAUSEs in EXPS as a character class.
This calls `sregex--char-aux' with a nil complement flag.  COMBINE is
ignored."
  (let ((complement nil))
    (sregex--char-aux complement exps)))
(defun sregex--not-char (exps combine)
  "Render the CHAR-CLAUSEs in EXPS as a complemented character class.
This calls `sregex--char-aux' with a non-nil complement flag.
COMBINE is ignored."
  (let ((complement t))
    (sregex--char-aux complement exps)))
(defun sregex--syntax (exps combine)
  "Render a syntax-class match for the syntax code in the car of EXPS.
The code is a character; the result is \"\\\\s\" followed by that
character.  COMBINE is ignored."
  (let ((code (car exps)))
    (format "\\s%c" code)))
(defun sregex--not-syntax (exps combine)
  "Render a negated syntax-class match for the code in the car of EXPS.
The code is a character; the result is \"\\\\S\" followed by that
character.  COMBINE is ignored."
  (let ((code (car exps)))
    (format "\\S%c" code)))
(defun sregex--regex (exps combine)
  "Return the regexp string in the car of EXPS, grouped if necessary.
The string is inserted verbatim, without quoting.  When COMBINE is
nil it is returned as is; for any other COMBINE it is wrapped in a
shy group so that it combines safely with its surroundings."
  (let ((raw (car exps)))
    (if (null combine)
        raw
      (concat "\\(?:" raw "\\)"))))
(defun sregex--repeat (exps combine)
  "Render a bounded repetition from EXPS, which is (MIN MAX CLAUSE ...).

MIN is the minimum number of occurrences; nil is treated as 0.  MAX
is the maximum number of occurrences, or nil for no upper bound.  The
remaining CLAUSEs are rendered as one sequence in `suffix' context
and followed by the interval \"\\\\{MIN,MAX\\\\}\" (with nothing after
the comma when MAX is nil).  COMBINE is ignored."
  (let* ((min (or (pop exps) 0))
         (minstr (number-to-string min))
         ;; The second element is the upper bound; what is left in
         ;; EXPS afterwards is the list of clauses to repeat.
         (max (pop exps)))
    (concat (sregex--sequence exps 'suffix)
            "\\{" minstr ","
            (when max (number-to-string max))
            "\\}")))
(defun sregex--char-range (start end)
  "Return the bracket-expression text for the characters START..END.
START and END are characters with START <= END.  A run of four or
more characters is written as a range \"S-E\"; shorter runs are
written out character by character, and a run of one character is
just that character."
  (let ((startc (char-to-string start))
        (endc (char-to-string end)))
    (cond
     ;; Four or more characters: a range is shorter than listing them.
     ((> end (+ start 2)) (concat startc "-" endc))
     ;; Exactly three characters.
     ((> end (+ start 1)) (concat startc (char-to-string (1+ start)) endc))
     ;; Exactly two characters.
     ((> end start) (concat startc endc))
     ;; A single character.
     (t startc))))
(defun sregex--char-aux (complement args)
  "Render the CHAR-CLAUSEs in ARGS as a regexp character class.

Each element of ARGS is a character, a string (every character of
which is added), or a pair (MIN . MAX) of characters denoting an
inclusive range; a pair given in reverse order is swapped.  Only
character codes 0 through 255 are supported.

When COMPLEMENT is non-nil the class is negated with a leading \"^\".
When COMPLEMENT is nil and the class holds a single character, that
character is returned quoted with `regexp-quote' instead of as a
bracket expression.

The characters \"]\", \"^\" and \"-\" are special inside brackets, so
they are placed where they are taken literally: \"]\" first, \"-\"
last, and \"^\" anywhere but first."
  ;; regex-opt does the same, we should join effort.
  (let ((chars (make-bool-vector 256 nil))) ; Yeah, right!
    (dolist (arg args)
      (cond ((integerp arg) (aset chars arg t))
            ((stringp arg) (mapc (lambda (c) (aset chars c t)) arg))
            ((consp arg)
             (let ((start (car arg))
                   (end (cdr arg)))
               (when (> start end)
                 (let ((tmp start)) (setq start end) (setq end tmp)))
               ;; now start <= end
               (let ((i start))
                 (while (<= i end)
                   (aset chars i t)
                   (setq i (1+ i))))))))
    ;; now chars is a map of the characters in the class
    (let ((caret (aref chars ?^))
          (dash (aref chars ?-))
          (class (if (aref chars ?\]) "]" "")))
      ;; The special characters are emitted separately, so take them
      ;; out of the map before collecting runs.
      (aset chars ?^ nil)
      (aset chars ?- nil)
      (aset chars ?\] nil)

      ;; Collect maximal runs of consecutive members and append each
      ;; one in its shortest form.
      (let (start end)
        (dotimes (i 256)
          (if (aref chars i)
              (progn
                (unless start (setq start i))
                (setq end i))
            (when start
              (setq class (concat class (sregex--char-range start end)))
              (setq start nil))))
        ;; Flush a run that extends to the last character code.
        (if start
            (setq class (concat class (sregex--char-range start end)))))

      (if (> (length class) 0)
          (setq class (concat class (if caret "^") (if dash "-")))
        (setq class (concat class (if dash "-") (if caret "^"))))
      (if (and (not complement) (= (length class) 1))
          (regexp-quote class)
        (concat "[" (if complement "^") class "]")))))
608 ;;; arch-tag: 460c1f5a-eb6e-42ec-a451-ffac78bdf492
609 ;;; sregex.el ends here