Fix maintainer address.
[emacs.git] / lisp / emacs-lisp / sregex.el
blob09fc23136754a6eeb50679bf236747089191a2ad
1 ;;; sregex.el --- symbolic regular expressions
3 ;; Copyright (C) 1997, 1998 Free Software Foundation, Inc.
5 ;; Author: Bob Glickstein <bobg+sregex@zanshin.com>
6 ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
25 ;;; Commentary:
27 ;; This package allows you to write regular expressions using a
28 ;; totally new, Lisp-like syntax.
30 ;; A "symbolic regular expression" (sregex for short) is a Lisp form
31 ;; that, when evaluated, produces the string form of the specified
32 ;; regular expression. Here's a simple example:
34 ;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert"
36 ;; As you can see, an sregex is specified by placing one or more
37 ;; special clauses in a call to `sregexq'. The clause in this case is
38 ;; the `or' of two strings (not to be confused with the Lisp function
39 ;; `or'). The list of allowable clauses appears below.
41 ;; With sregex, it is never necessary to "escape" magic characters
42 ;; that are meant to be taken literally; that happens automatically.
43 ;; For example:
45 ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H"
47 ;; It is also unnecessary to "group" parts of the expression together
48 ;; to overcome operator precedence; that also happens automatically.
49 ;; For example:
51 ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(Bob\\|Robert\\)?"
53 ;; It *is* possible to group parts of the expression in order to refer
54 ;; to them with numbered backreferences:
56 ;; (sregexq (group (or "Go" "Run"))
57 ;; ", Spot, "
58 ;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1"
60 ;; If `sregexq' needs to introduce its own grouping parentheses, it
61 ;; will automatically renumber your backreferences:
63 ;; (sregexq (opt "resent-")
64 ;; (group (or "to" "cc" "bcc"))
65 ;; ": "
66 ;; (backref 1)) => "\\(resent-\\)?\\(to\\|cc\\|bcc\\): \\2"
68 ;; `sregexq' is a macro. Each time it is used, it constructs a simple
69 ;; Lisp expression that then invokes a moderately complex engine to
70 ;; interpret the sregex and render the string form. Because of this,
71 ;; I don't recommend sprinkling calls to `sregexq' throughout your
72 ;; code, the way one normally does with string regexes (which are
73 ;; cheap to evaluate). Instead, it's wiser to precompute the regexes
74 ;; you need wherever possible instead of repeatedly constructing the
75 ;; same ones over and over. Example:
77 ;; (let ((field-regex (sregexq (opt "resent-")
78 ;; (or "to" "cc" "bcc"))))
79 ;; ...
80 ;; (while ...
81 ;; ...
82 ;; (re-search-forward field-regex ...)
83 ;; ...))
85 ;; The arguments to `sregexq' are automatically quoted, but the
86 ;; flipside of this is that it is not straightforward to include
87 ;; computed (i.e., non-constant) values in `sregexq' expressions. So
88 ;; `sregex' is a function that is like `sregexq' but which does not
89 ;; automatically quote its values. Literal sregex clauses must be
90 ;; explicitly quoted like so:
92 ;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert"
94 ;; but computed clauses can be included easily, allowing for the reuse
95 ;; of common clauses:
97 ;; (let ((dotstar '(0+ any))
98 ;; (whitespace '(1+ (syntax ?-)))
99 ;; (digits '(1+ (char (?0 . ?9)))))
100 ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+"
102 ;; This package also provides sregex-specific versions of the Emacs
103 ;; functions `replace-match', `match-string',
104 ;; `match-string-no-properties', `match-beginning', `match-end', and
105 ;; `match-data'. In each case, the sregex version's name begins with
106 ;; `sregex-' and takes one additional optional parameter, an sregex
107 ;; "info" object. Each of these functions is concerned with numbered
108 ;; submatches. Since sregex may renumber submatches, alternate
109 ;; versions of these functions are needed that know how to adjust the
110 ;; supplied number.
112 ;; The sregex info object for the most recently evaluated sregex can
113 ;; be obtained with `sregex-info'; so if you precompute your sregexes
114 ;; and you plan to use `replace-match' or one of the others with it,
115 ;; you need to record the info object for later use:
117 ;; (let* ((regex (sregexq (opt "resent-")
118 ;; (group (or "to" "cc" "bcc"))
119 ;; ":"))
120 ;; (regex-info (sregex-info)))
121 ;; ...
122 ;; (if (re-search-forward regex ...)
123 ;; (let ((which (sregex-match-string 1 nil regex-info)))
124 ;; ...)))
126 ;; In this example, `regex' is "\\(resent-\\)?\\(to\\|cc\\|bcc\\):",
127 ;; so the call to (sregex-match-string 1 ...) is automatically turned
128 ;; into a call to (match-string 2 ...).
130 ;; If the sregex info argument to `sregex-replace-match',
131 ;; `sregex-match-string', `sregex-match-string-no-properties',
132 ;; `sregex-match-beginning', `sregex-match-end', or
133 ;; `sregex-match-data' is omitted, the current value of (sregex-info)
134 ;; is used.
136 ;; You can do your own sregex submatch renumbering with
137 ;; `sregex-backref-num'.
139 ;; Finally, `sregex-save-match-data' is like `save-match-data' but
140 ;; also saves and restores the information maintained by
141 ;; `sregex-info'.
143 ;; To use this package in a Lisp program, simply (require 'sregex).
145 ;; Here are the clauses allowed in an `sregex' or `sregexq'
146 ;; expression:
148 ;; - a string
149 ;; This stands for the literal string. If it contains
150 ;; metacharacters, they will be escaped in the resulting regex
151 ;; (using `regexp-quote').
153 ;; - the symbol `any'
154 ;; This stands for ".", a regex matching any character except
155 ;; newline.
157 ;; - the symbol `bol'
158 ;; Stands for "^", matching the empty string at the beginning of a line
160 ;; - the symbol `eol'
161 ;; Stands for "$", matching the empty string at the end of a line
163 ;; - (group CLAUSE ...)
164 ;; Groups the given CLAUSEs using "\\(" and "\\)".
166 ;; - (sequence CLAUSE ...)
168 ;; Groups the given CLAUSEs; may or may not use "\\(" and "\\)".
169 ;; Clauses grouped by `sequence' do not count for purposes of
170 ;; numbering backreferences. Use `sequence' in situations like
171 ;; this:
173 ;; (sregexq (or "dog" "cat"
174 ;; (sequence (opt "sea ") "monkey")))
175 ;; => "dog\\|cat\\|\\(sea \\)?monkey"
177 ;; where a single `or' alternate needs to contain multiple
178 ;; subclauses.
180 ;; - (backref N)
181 ;; Matches the same string previously matched by the Nth "group" in
182 ;; the same sregex. N is a positive integer. In the resulting
183 ;; regex, N may be adjusted to account for automatically introduced
184 ;; groups.
186 ;; - (or CLAUSE ...)
187 ;; Matches any one of the CLAUSEs by separating them with "\\|".
189 ;; - (0+ CLAUSE ...)
190 ;; Concatenates the given CLAUSEs and matches zero or more
191 ;; occurrences by appending "*".
193 ;; - (1+ CLAUSE ...)
194 ;; Concatenates the given CLAUSEs and matches one or more
195 ;; occurrences by appending "+".
197 ;; - (opt CLAUSE ...)
198 ;; Concatenates the given CLAUSEs and matches zero or one occurrence
199 ;; by appending "?".
201 ;; - (repeat MIN MAX CLAUSE ...)
202 ;; Concatenates the given CLAUSEs and constructs a regex matching at
203 ;; least MIN occurrences and at most MAX occurrences. MIN must be a
204 ;; non-negative integer. MAX must be a non-negative integer greater
205 ;; than or equal to MIN; or MAX can be nil to mean "infinity."
207 ;; - (char CHAR-CLAUSE ...)
208 ;; Creates a "character class" matching one character from the given
209 ;; set. See below for how to construct a CHAR-CLAUSE.
211 ;; - (not-char CHAR-CLAUSE ...)
212 ;; Creates a "character class" matching any one character not in the
213 ;; given set. See below for how to construct a CHAR-CLAUSE.
215 ;; - the symbol `bot'
216 ;; Stands for "\\`", matching the empty string at the beginning of
217 ;; text (beginning of a string or of a buffer).
219 ;; - the symbol `eot'
220 ;; Stands for "\\'", matching the empty string at the end of text.
222 ;; - the symbol `point'
223 ;; Stands for "\\=", matching the empty string at point.
225 ;; - the symbol `word-boundary'
226 ;; Stands for "\\b", matching the empty string at the beginning or
227 ;; end of a word.
229 ;; - the symbol `not-word-boundary'
230 ;; Stands for "\\B", matching the empty string not at the beginning
231 ;; or end of a word.
233 ;; - the symbol `bow'
234 ;; Stands for "\\<", matching the empty string at the beginning of a
235 ;; word.
237 ;; - the symbol `eow'
238 ;; Stands for "\\>", matching the empty string at the end of a word.
240 ;; - the symbol `wordchar'
241 ;; Stands for the regex "\\w", matching a word-constituent character
242 ;; (as determined by the current syntax table)
244 ;; - the symbol `not-wordchar'
245 ;; Stands for the regex "\\W", matching a non-word-constituent
246 ;; character.
248 ;; - (syntax CODE)
249 ;; Stands for the regex "\\sCODE", where CODE is a syntax table code
250 ;; (a single character). Matches any character with the requested
251 ;; syntax.
253 ;; - (not-syntax CODE)
254 ;; Stands for the regex "\\SCODE", where CODE is a syntax table code
255 ;; (a single character). Matches any character without the
256 ;; requested syntax.
258 ;; - (regex REGEX)
259 ;; This is a "trapdoor" for including ordinary regular expression
260 ;; strings in the result. Some regular expressions are clearer when
261 ;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
262 ;; instance. However, see the note under "Bugs," below.
264 ;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
265 ;; has one of the following forms:
267 ;; - a character
268 ;; Adds that character to the set.
270 ;; - a string
271 ;; Adds all the characters in the string to the set.
273 ;; - A pair (MIN . MAX)
274 ;; Where MIN and MAX are characters, adds the range of characters
275 ;; from MIN through MAX to the set.
277 ;;; To do:
279 ;; Make (sregexq (or "a" (sequence "b" "c"))) return "a\\|bc" instead
280 ;; of "a\\|\\(bc\\)"
282 ;; An earlier version of this package could optionally translate the
283 ;; symbolic regex into other languages' syntaxes, e.g. Perl. For
284 ;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
285 ;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore
286 ;; such a facility.
288 ;;; Bugs:
290 ;; The (regex REGEX) form can confuse the code that distinguishes
291 ;; introduced groups from user-specified groups. Try to avoid using
292 ;; grouping within a `regex' form. Failing that, try to avoid using
293 ;; backrefs if you're using `regex'.
295 ;;; Code:
;; An sregex "value" is a three-element list (UNITP GROUPS TREE); see
;; `sregex--make-value'.  These accessors pick out the three fields.
(defsubst sregex--value-unitp (val)
  "Return the UNITP field (first element) of the sregex value VAL."
  (car val))
(defsubst sregex--value-groups (val)
  "Return the GROUPS field (second element) of the sregex value VAL."
  (car (cdr val)))
(defsubst sregex--value-tree (val)
  "Return the TREE field (third element) of the sregex value VAL."
  (car (cdr (cdr val))))
(defun sregex--make-value (unitp groups tree)
  "Build an sregex value, the list (UNITP GROUPS TREE).
The fields are read back with `sregex--value-unitp',
`sregex--value-groups' and `sregex--value-tree'."
  (cons unitp (cons groups (cons tree nil))))
(defvar sregex--current-sregex nil
  "The sregex value built by the most recent call to `sregex'.
This is the global state returned by `sregex-info'.")

(defun sregex-info ()
  "Return the extra information recorded by the latest call to `sregex'.
Functions such as `sregex-match-string' and `sregex-backref-num' need
this information to translate the backreference numbers requested by
the user into the numbers used in the generated regexp."
  (symbol-value 'sregex--current-sregex))
314 ; (require 'advice)
315 ; (defadvice save-match-data (around sregex-save-match-data protect)
316 ; (let ((sregex--saved-sregex sregex--current-sregex))
317 ; (unwind-protect
318 ; ad-do-it
319 ; (setq sregex--current-sregex sregex--saved-sregex))))
(defmacro sregex-save-match-data (&rest forms)
  "Like `save-match-data', but also save and restore `sregex-info' data.
FORMS are evaluated inside `save-match-data'; the value of the last
form is returned.  The sregex info that was current on entry is put
back afterwards, even if FORMS exit non-locally."
  ;; Hold the saved info in an uninterned symbol so that FORMS cannot
  ;; read or overwrite it by accident, whatever variables they use.
  (let ((saved (make-symbol "saved-sregex")))
    `(let ((,saved sregex--current-sregex))
       (unwind-protect
           (save-match-data ,@forms)
         (setq sregex--current-sregex ,saved)))))
(defun sregex-replace-match (replacement
                             &optional fixedcase literal string subexp sregex)
  "Like `replace-match', for a regexp made with `sregex'.
REPLACEMENT, FIXEDCASE, LITERAL and STRING are handed to
`replace-match' unchanged.  SUBEXP, when non-nil, is first adjusted
with `sregex-backref-num' to allow for \"introduced groups\".  The
additional optional argument SREGEX is the sregex info, which can be
obtained with `sregex-info'; if it is omitted or nil, it defaults to
the current value of (sregex-info)."
  (let ((adjusted (if subexp
                      (sregex-backref-num subexp sregex))))
    (replace-match replacement fixedcase literal string adjusted)))
(defun sregex-match-string (count &optional in-string sregex)
  "Like `match-string', for a regexp made with `sregex'.
COUNT is adjusted with `sregex-backref-num' to allow for \"introduced
groups\" before it is handed to `match-string' together with
IN-STRING.  The additional optional argument SREGEX is the sregex
info, which can be obtained with `sregex-info'; if it is omitted or
nil, it defaults to the current value of (sregex-info)."
  (let ((adjusted (if count
                      (sregex-backref-num count sregex))))
    (match-string adjusted in-string)))
(defun sregex-match-string-no-properties (count &optional in-string sregex)
  "Like `match-string-no-properties', for a regexp made with `sregex'.
COUNT is adjusted with `sregex-backref-num' to allow for \"introduced
groups\" before it is handed to `match-string-no-properties' together
with IN-STRING.  The additional optional argument SREGEX is the sregex
info, which can be obtained with `sregex-info'; if it is omitted or
nil, it defaults to the current value of (sregex-info)."
  (let ((adjusted (if count
                      (sregex-backref-num count sregex))))
    (match-string-no-properties adjusted in-string)))
(defun sregex-match-beginning (count &optional sregex)
  "Like `match-beginning', for a regexp made with `sregex'.
COUNT is adjusted with `sregex-backref-num' to allow for \"introduced
groups\".  The additional optional argument SREGEX is the sregex info,
which can be obtained with `sregex-info'; if it is omitted or nil, it
defaults to the current value of (sregex-info)."
  (let ((adjusted (sregex-backref-num count sregex)))
    (match-beginning adjusted)))
(defun sregex-match-end (count &optional sregex)
  "Like `match-end', for a regexp made with `sregex'.
COUNT is adjusted with `sregex-backref-num' to allow for \"introduced
groups\".  The additional optional argument SREGEX is the sregex info,
which can be obtained with `sregex-info'; if it is omitted or nil, it
defaults to the current value of (sregex-info)."
  (let ((adjusted (sregex-backref-num count sregex)))
    (match-end adjusted)))
(defun sregex-match-data (&optional sregex)
  "Like `match-data', for a regexp made with `sregex'.
The result always starts with the bounds of the whole match.  After
that, the bounds of a submatch are kept only when the corresponding
entry of the sregex info's group list is non-nil, so \"introduced
groups\" are removed from the result.  The optional argument SREGEX is
the sregex info, which can be obtained with `sregex-info'; if it is
omitted or nil, it defaults to the current value of (sregex-info)."
  (let* ((md (match-data))
         (flags (sregex--value-groups (or sregex
                                          sregex--current-sregex)))
         ;; ACC is built back to front, so each pair goes in as END, START.
         (acc (list (nth 1 md) (nth 0 md)))
         (rest (nthcdr 2 md)))
    (while rest
      (if (car flags)
          (setq acc (cons (nth 1 rest)
                          (cons (car rest) acc))))
      (setq flags (cdr flags)
            rest (nthcdr 2 rest)))
    (nreverse acc)))
(defun sregex--render-tree (tree sregex)
  "Return the regexp string described by TREE.
TREE is a cons whose car names the node type:

  (str . STRING)       STRING itself, already in regexp syntax
  (or TREE ...)        the rendered TREEs joined with \"\\\\|\"
  (sequence TREE ...)  the rendered TREEs concatenated
  (group . TREE)       TREE wrapped in \"\\\\(\" and \"\\\\)\"
  (opt . TREE)         TREE followed by \"?\"
  (0+ . TREE)          TREE followed by \"*\"
  (1+ . TREE)          TREE followed by \"+\"
  (backref . N)        a backreference to user group N

SREGEX is the sregex value that `sregex-backref-num' uses to renumber
backreferences.  Signal an error if an adjusted backreference number
exceeds 9 or if the node type is not one of the above."
  (let ((key (car tree)))
    (cond ((eq key 'str)
           (cdr tree))
          ((eq key 'or)
           ;; `function' rather than a quoted lambda, so that the
           ;; lambda is byte-compiled and sees SREGEX under either
           ;; binding discipline.
           (mapconcat (function (lambda (x)
                                  (sregex--render-tree x sregex)))
                      (cdr tree)
                      "\\|"))
          ((eq key 'sequence)
           (mapconcat (function (lambda (x)
                                  (sregex--render-tree x sregex)))
                      (cdr tree)
                      ""))
          ((eq key 'group)
           (concat "\\("
                   (sregex--render-tree (cdr tree) sregex)
                   "\\)"))
          ((eq key 'opt)
           (concat (sregex--render-tree (cdr tree) sregex)
                   "?"))
          ((eq key '0+)
           (concat (sregex--render-tree (cdr tree) sregex)
                   "*"))
          ((eq key '1+)
           (concat (sregex--render-tree (cdr tree) sregex)
                   "+"))
          ((eq key 'backref)
           (let ((num (sregex-backref-num (cdr tree) sregex)))
             ;; Regexp backreferences are a single digit.
             (if (> num 9)
                 (error "sregex: backref number %d too high after adjustment"
                        num)
               (concat "\\" (number-to-string num)))))
          (t (error "sregex internal error: unknown tree type %S"
                    key)))))
(defun sregex (&rest exps)
  "Symbolic regular expression interpreter.
This is exactly like `sregexq' (q.v.) except that it evaluates all its
arguments, so literal sregex clauses must be quoted.  For example:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

An argument-evaluating sregex interpreter lets you reuse sregex
subexpressions:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\"

The sregex value built for EXPS is also recorded, so that a later call
to `sregex-info' returns it."
  (let ((value (sregex--sequence exps nil)))
    (setq sregex--current-sregex value)
    (sregex--render-tree (sregex--value-tree value) value)))
(defmacro sregexq (&rest exps)
  "Symbolic regular expression interpreter.
This macro allows you to specify a regular expression (regexp) in
symbolic form, and converts it into the string form required by Emacs's
regex functions such as `re-search-forward' and `looking-at'.  Here is
a simple example:

  (sregexq (or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

As you can see, an sregex is specified by placing one or more special
clauses in a call to `sregexq'.  The clause in this case is the `or'
of two strings (not to be confused with the Lisp function `or').  The
list of allowable clauses appears below.

With `sregex', it is never necessary to \"escape\" magic characters
that are meant to be taken literally; that happens automatically.
For example:

  (sregexq \"M*A*S*H\")  =>  \"M\\\\*A\\\\*S\\\\*H\"

It is also unnecessary to \"group\" parts of the expression together
to overcome operator precedence; that also happens automatically.
For example:

  (sregexq (opt (or \"Bob\" \"Robert\")))  =>  \"\\\\(Bob\\\\|Robert\\\\)?\"

It *is* possible to group parts of the expression in order to refer
to them with numbered backreferences:

  (sregexq (group (or \"Go\" \"Run\"))
           \", Spot, \"
           (backref 1))  =>  \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"

If `sregexq' needs to introduce its own grouping parentheses, it will
automatically renumber your backreferences:

  (sregexq (opt \"resent-\")
           (group (or \"to\" \"cc\" \"bcc\"))
           \": \"
           (backref 1))  =>  \"\\\\(resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\2\"

`sregexq' is a macro.  Each time it is used, it constructs a simple
Lisp expression that then invokes a moderately complex engine to
interpret the sregex and render the string form.  Because of this, I
don't recommend sprinkling calls to `sregexq' throughout your code,
the way one normally does with string regexes (which are cheap to
evaluate).  Instead, it's wiser to precompute the regexes you need
wherever possible instead of repeatedly constructing the same ones
over and over.  Example:

  (let ((field-regex (sregexq (opt \"resent-\")
                              (or \"to\" \"cc\" \"bcc\"))))
    ...
    (while ...
      ...
      (re-search-forward field-regex ...)
      ...))

The arguments to `sregexq' are automatically quoted, but the
flipside of this is that it is not straightforward to include
computed (i.e., non-constant) values in `sregexq' expressions.  So
`sregex' is a function that is like `sregexq' but which does not
automatically quote its values.  Literal sregex clauses must be
explicitly quoted like so:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

but computed clauses can be included easily, allowing for the reuse
of common clauses:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\"

Here are the clauses allowed in an `sregex' or `sregexq' expression:

- a string
  This stands for the literal string.  If it contains
  metacharacters, they will be escaped in the resulting regex
  (using `regexp-quote').

- the symbol `any'
  This stands for \".\", a regex matching any character except
  newline.

- the symbol `bol'
  Stands for \"^\", matching the empty string at the beginning of a line

- the symbol `eol'
  Stands for \"$\", matching the empty string at the end of a line

- (group CLAUSE ...)
  Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".

- (sequence CLAUSE ...)
  Groups the given CLAUSEs; may or may not use \"\\\\(\" and \"\\\\)\".
  Clauses grouped by `sequence' do not count for purposes of
  numbering backreferences.  Use `sequence' in situations like
  this:

    (sregexq (or \"dog\" \"cat\"
                 (sequence (opt \"sea \") \"monkey\")))
                                 =>  \"dog\\\\|cat\\\\|\\\\(sea \\\\)?monkey\"

  where a single `or' alternate needs to contain multiple
  subclauses.

- (backref N)
  Matches the same string previously matched by the Nth \"group\" in
  the same sregex.  N is a positive integer.  In the resulting
  regex, N may be adjusted to account for automatically introduced
  groups.

- (or CLAUSE ...)
  Matches any one of the CLAUSEs by separating them with \"\\\\|\".

- (0+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or more
  occurrences by appending \"*\".

- (1+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches one or more
  occurrences by appending \"+\".

- (opt CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or one occurrence
  by appending \"?\".

- (repeat MIN MAX CLAUSE ...)
  Concatenates the given CLAUSEs and constructs a regex matching at
  least MIN occurrences and at most MAX occurrences.  MIN must be a
  non-negative integer.  MAX must be a non-negative integer greater
  than or equal to MIN; or MAX can be nil to mean \"infinity.\"

- (char CHAR-CLAUSE ...)
  Creates a \"character class\" matching one character from the given
  set.  See below for how to construct a CHAR-CLAUSE.

- (not-char CHAR-CLAUSE ...)
  Creates a \"character class\" matching any one character not in the
  given set.  See below for how to construct a CHAR-CLAUSE.

- the symbol `bot'
  Stands for \"\\\\`\", matching the empty string at the beginning of
  text (beginning of a string or of a buffer).

- the symbol `eot'
  Stands for \"\\\\'\", matching the empty string at the end of text.

- the symbol `point'
  Stands for \"\\\\=\", matching the empty string at point.

- the symbol `word-boundary'
  Stands for \"\\\\b\", matching the empty string at the beginning or
  end of a word.

- the symbol `not-word-boundary'
  Stands for \"\\\\B\", matching the empty string not at the beginning
  or end of a word.

- the symbol `bow'
  Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
  word.

- the symbol `eow'
  Stands for \"\\\\\\=>\", matching the empty string at the end of a word.

- the symbol `wordchar'
  Stands for the regex \"\\\\w\", matching a word-constituent character
  (as determined by the current syntax table)

- the symbol `not-wordchar'
  Stands for the regex \"\\\\W\", matching a non-word-constituent
  character.

- (syntax CODE)
  Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
  (a single character).  Matches any character with the requested
  syntax.

- (not-syntax CODE)
  Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
  (a single character).  Matches any character without the
  requested syntax.

- (regex REGEX)
  This is a \"trapdoor\" for including ordinary regular expression
  strings in the result.  Some regular expressions are clearer when
  written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
  instance.  However, using this can confuse the code that
  distinguishes introduced groups from user-specified groups.  Avoid
  using grouping within a `regex' form.  Failing that, avoid using
  backrefs if you're using `regex'.

Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
has one of the following forms:

- a character
  Adds that character to the set.

- a string
  Adds all the characters in the string to the set.

- A pair (MIN . MAX)
  Where MIN and MAX are characters, adds the range of characters
  from MIN through MAX to the set."
  ;; Expand to (apply 'sregex '(CLAUSE ...)): the clauses reach
  ;; `sregex' unevaluated, as one quoted list.
  (list 'apply ''sregex (list 'quote exps)))
(defun sregex--clause-handler (name exp)
  "Return the function that implements the sregex clause called NAME.
The function is the one named \"sregex--\" followed by NAME.  Signal an
error mentioning the whole clause EXP if NAME is not a symbol or if no
such function is defined."
  (let ((fn (and (symbolp name)
                 ;; `intern-soft', so that a misspelled clause name does
                 ;; not create a new symbol as a side effect.
                 (intern-soft (concat "sregex--" (symbol-name name))))))
    (if (and fn (fboundp fn))
        fn
      (error "Invalid expression: %s" exp))))

(defun sregex--engine (exp combine)
  "Translate the single sregex clause EXP into an sregex value.
EXP is a string, a symbol naming a clause, or a list (NAME ARG ...).
A string is quoted with `regexp-quote'.  A symbol is handled by
calling the function `sregex--SYMBOL' with COMBINE, a list by calling
`sregex--NAME' with the list of ARGs and COMBINE.

COMBINE says how the caller is going to use the result: the callers in
this file pass nil, `concat', `or' or `suffix'.  If COMBINE is non-nil
and the translated clause is not a unit, the result is wrapped in an
introduced group, which is recorded as nil at the front of the value's
group list.

Signal an error if EXP is none of the above or names no known clause."
  (let* ((val (cond ((stringp exp)
                     ;; A string is a unit unless an operator is about
                     ;; to be appended to it and it is not exactly one
                     ;; character long.
                     (sregex--make-value (or (not (eq combine 'suffix))
                                             (= (length exp) 1))
                                         nil ;a string contains no groups
                                         (cons 'str
                                               (regexp-quote exp))))
                    ((symbolp exp)
                     (funcall (sregex--clause-handler exp exp)
                              combine))
                    ((consp exp)
                     (funcall (sregex--clause-handler (car exp) exp)
                              (cdr exp)
                              combine))
                    (t (error "Invalid expression: %s" exp))))
         (unitp (sregex--value-unitp val))
         (groups (sregex--value-groups val))
         (tree (sregex--value-tree val)))
    (if (and combine (not unitp))
        (sregex--make-value t
                            (cons nil groups)
                            (cons 'group tree))
      (sregex--make-value unitp groups tree))))
(defun sregex--sequence (exps combine)
  "Translate the clause list EXPS, taken as a concatenation.
A list of exactly one clause is passed straight to `sregex--engine'
together with COMBINE.  Otherwise every clause is translated with
`concat' as its combine mode and the results are collected under a
`sequence' tree node, with their group lists appended in order.  When
COMBINE is `suffix', that sequence is additionally wrapped in an
introduced group, recorded as nil at the front of the group list.
The value returned in the multi-clause case is always marked as a
unit."
  (cond ((= (length exps) 1)
         (sregex--engine (car exps) combine))
        (t
         (let ((remaining exps)
               (group-flags nil)
               (subtrees nil))          ;collected in reverse order
           (while remaining
             (let ((item (sregex--engine (car remaining) 'concat)))
               (setq group-flags (append group-flags
                                         (sregex--value-groups item))
                     subtrees (cons (sregex--value-tree item) subtrees)
                     remaining (cdr remaining))))
           (let ((seq-tree (cons 'sequence (nreverse subtrees))))
             (if (eq combine 'suffix)
                 (sregex--make-value t
                                     (cons nil group-flags)
                                     (cons 'group seq-tree))
               (sregex--make-value t group-flags seq-tree)))))))
(defun sregex--group (exps combine)
  "Translate the clause (group . EXPS), an explicit user group.
EXPS are translated as a sequence with a nil combine mode and wrapped
in a `group' tree node.  The group is recorded as t at the front of
the value's group list, and the value is marked as a unit.  COMBINE
is not used."
  (let* ((inner (sregex--sequence exps nil))
         (flags (cons t (sregex--value-groups inner)))
         (tree (cons 'group (sregex--value-tree inner))))
    (sregex--make-value t flags tree)))
(defun sregex-backref-num (n &optional sregex)
  "Adjust backreference number N according to SREGEX.
When `sregex' introduces parenthesized groups that the user didn't ask
for, the numbering of the groups that the user *did* ask for gets all
out of whack.  This function accounts for introduced groups.  Example:

  (sregexq (opt \"ab\")
           (group (or \"c\" \"d\")))  =>  \"\\\\(ab\\\\)?\\\\(c\\\\|d\\\\)\"
  (setq info (sregex-info))
  (sregex-backref-num 1 info)  =>  2

The SREGEX parameter is optional and defaults to the current value of
`sregex-info'."
  (let ((flags (sregex--value-groups (or sregex
                                         sregex--current-sregex)))
        (wanted n)
        (position 0))
    ;; Walk the group list, counting every group in POSITION but
    ;; using up WANTED only on groups the user asked for.
    (while (and flags (> wanted 0))
      (if (car flags)
          (setq wanted (1- wanted)))
      (setq position (1+ position)
            flags (cdr flags)))
    position))
(defun sregex--backref (exps combine)
  "Translate the clause (backref N), where N is the first of EXPS.
The value is a unit without groups whose tree is (backref . N).
COMBINE is not used."
  (let ((tree (cons 'backref (car exps))))
    (sregex--make-value t nil tree)))
(defun sregex--any (combine)
  "Return the sregex value for the symbol `any': the regexp \".\".
The value is a unit without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str ".")))
(defun sregex--opt (exps combine)
  "Translate the clause (opt . EXPS).
EXPS are translated as a sequence in `suffix' mode and the resulting
tree is put under an `opt' node.  The value is marked as a unit and
keeps the group list of the sequence.  COMBINE is not used."
  (let* ((operand (sregex--sequence exps 'suffix))
         (flags (sregex--value-groups operand))
         (tree (cons 'opt (sregex--value-tree operand))))
    (sregex--make-value t flags tree)))
(defun sregex--0+ (exps combine)
  "Translate the clause (0+ . EXPS).
EXPS are translated as a sequence in `suffix' mode and the resulting
tree is put under a `0+' node.  The value is marked as a unit and
keeps the group list of the sequence.  COMBINE is not used."
  (let* ((operand (sregex--sequence exps 'suffix))
         (flags (sregex--value-groups operand))
         (tree (cons '0+ (sregex--value-tree operand))))
    (sregex--make-value t flags tree)))
(defun sregex--1+ (exps combine)
  "Translate the clause (1+ . EXPS).
EXPS are translated as a sequence in `suffix' mode and the resulting
tree is put under a `1+' node.  The value is marked as a unit and
keeps the group list of the sequence.  COMBINE is not used."
  (let* ((operand (sregex--sequence exps 'suffix))
         (flags (sregex--value-groups operand))
         (tree (cons '1+ (sregex--value-tree operand))))
    (sregex--make-value t flags tree)))
(defun sregex--repeat (exps combine)
  "Translate the clause (repeat MIN MAX CLAUSE ...).
EXPS is the list (MIN MAX CLAUSE ...).  MIN defaults to 0 when nil and
must be a non-negative integer; MAX must be nil, meaning no upper
bound, or an integer not less than MIN.  An error is signaled
otherwise.

The repetition is rewritten in terms of other clauses and translated
with COMBINE:

  MIN = 0, MAX = 0     the empty regexp
  MIN = 0, MAX = 1     (opt CLAUSE ...)
  MIN = 0, MAX = nil   (0+ CLAUSE ...)
  MIN = 0, otherwise   MAX copies of (opt CLAUSE ...)
  MIN = 1, MAX = 1     the CLAUSEs themselves
  MIN = 1, MAX = nil   (1+ CLAUSE ...)
  MIN = 1, otherwise   the CLAUSEs, then MAX-1 copies of (opt CLAUSE ...)
  MIN > 1              the CLAUSEs, then (repeat MIN-1 MAX-1 CLAUSE ...)"
  (let ((min (or (car exps) 0))
        (max (car (cdr exps))))
    (or (and (integerp min) (>= min 0))
        (error "sregex: repeat MIN must be a non-negative integer: %S" min))
    (or (null max)
        (and (integerp max) (>= max min))
        (error "sregex: repeat MAX must be nil or an integer >= MIN: %S" max))
    (setq exps (cdr (cdr exps)))
    (cond ((zerop min)
           (cond ((equal max 0)         ;degenerate
                  ;; Zero repetitions match the empty string.  Build
                  ;; the same value `sregex--engine' builds for the
                  ;; literal string \"\", so that it renders as an
                  ;; empty regexp and gets grouped before a suffix
                  ;; operator.
                  (sregex--make-value (not (eq combine 'suffix))
                                      nil
                                      (cons 'str "")))
                 ((equal max 1)
                  (sregex--opt exps combine))
                 ((not max)
                  (sregex--0+ exps combine))
                 (t (sregex--sequence (make-list max
                                                 (cons 'opt exps))
                                      combine))))
          ((= min 1)
           (cond ((equal max 1)
                  (sregex--sequence exps combine))
                 ((not max)
                  (sregex--1+ exps combine))
                 (t (sregex--sequence (append exps
                                              (make-list (1- max)
                                                         (cons 'opt exps)))
                                      combine))))
          ;; MIN > 1: peel off one mandatory copy and recurse.
          (t (sregex--sequence (append exps
                                       (list (append (list 'repeat
                                                           (1- min)
                                                           (and max
                                                                (1- max)))
                                                     exps)))
                               combine)))))
(defun sregex--or (exps combine)
  "Translate the clause (or . EXPS), an alternation.
A single alternative is passed straight to `sregex--engine' together
with COMBINE.  Otherwise every alternative is translated with `or' as
its combine mode and the results are collected under an `or' tree
node, with their group lists appended in order.  The resulting value
is marked as a unit only when COMBINE is itself `or'."
  (if (/= (length exps) 1)
      (let ((remaining exps)
            (group-flags nil)
            (alternatives nil))         ;collected in reverse order
        (while remaining
          (let ((item (sregex--engine (car remaining) 'or)))
            (setq group-flags (append group-flags
                                      (sregex--value-groups item))
                  alternatives (cons (sregex--value-tree item)
                                     alternatives)
                  remaining (cdr remaining))))
        (sregex--make-value (eq combine 'or)
                            group-flags
                            (cons 'or (nreverse alternatives))))
    (sregex--engine (car exps) combine)))
(defmacro sregex--char-range-aux ()
  "Expand to code that appends the pending run of characters to `class'.
The expansion uses the variables `start', `end' and `class' of the
code it is expanded into, and does nothing when `start' is nil.  A run
of one character is appended as that character, a run of two as both
characters, and a longer run as START, \"-\", END.

The characters are inserted as themselves.  Emacs regexps have no
octal escape syntax and backslash is an ordinary character inside a
character alternative, so writing a character as a backslash followed
by octal digits would match the backslash and the digits instead."
  '(if start
       (let ((startc (char-to-string start))
             (endc (char-to-string end)))
         (if (> end start)
             (if (> end (+ start 1))
                 (setq class (concat class startc "-" endc))
               (setq class (concat class startc endc)))
           (setq class (concat class startc))))))
(defmacro sregex--char-range (rstart rend)
  "Expand to code that moves the characters RSTART..REND into `class'.
The expansion uses the variables `chars' and `class' of the code it is
expanded into.  It scans the slots RSTART through REND of `chars',
clears every slot that is set, and hands each maximal run of
consecutive set slots to `sregex--char-range-aux' through the
variables `start' and `end'."
  `(let ((i ,rstart)
         start end)
     (while (<= i ,rend)
       (cond ((aref chars i)
              ;; Open a new run or extend the current one.
              (or start (setq start i))
              (setq end i)
              (aset chars i nil))
             (t
              ;; A gap: emit the run collected so far, if any.
              (sregex--char-range-aux)
              (setq start nil
                    end nil)))
       (setq i (1+ i)))
     ;; Emit a run that reaches all the way to REND.
     (sregex--char-range-aux)))
(defun sregex--char-aux (complement args)
  "Return a regexp character alternative for the characters in ARGS.
Each element of ARGS is a character, a string (all of whose characters
are added), or a pair (MIN . MAX) naming an inclusive range; a pair
given in the wrong order is swapped.  If COMPLEMENT is non-nil the
result is a complemented alternative.

Only character codes 0 through 255 are supported, because the set is
collected in a 256-element vector.

Inside a character alternative `]', `^' and `-' are special only by
position, and backslash is not special at all.  The result is
therefore laid out like this: `]' comes first; `-' comes first as
well, unless `]' is present, in which case `-' goes last so that it
cannot form a range with `]'; `^' is never put first.  A
non-complemented set that holds only `^' cannot be written as a
character alternative and is returned as a regexp matching a literal
`^' instead."
  (let ((chars (make-vector 256 nil)))
    ;; Pass 1: mark every requested character in CHARS.
    (while args
      (let ((arg (car args)))
        (cond ((integerp arg)
               (aset chars arg t))
              ((stringp arg)
               (let ((idx 0)
                     (len (length arg)))
                 (while (< idx len)
                   (aset chars (aref arg idx) t)
                   (setq idx (1+ idx)))))
              ((consp arg)
               (let ((code (min (car arg) (cdr arg)))
                     (last (max (car arg) (cdr arg))))
                 (while (<= code last)
                   (aset chars code t)
                   (setq code (1+ code)))))))
      (setq args (cdr args)))
    ;; Pass 2: move the marked characters into CLASS, clearing each
    ;; slot of CHARS as it is consumed.
    (let ((class "")
          (caret (aref chars ?^))
          (rbracket (aref chars ?\]))
          (dash (aref chars ?-)))
      (aset chars ?^ nil)
      (aset chars ?\] nil)
      (aset chars ?- nil)
      (if rbracket
          (setq class "]"))
      (if (and dash (not rbracket))
          (setq class (concat class "-")))
      (if (aref chars ?\\)
          (progn
            ;; One backslash is enough: it needs no escaping here.
            (setq class (concat class "\\"))
            (aset chars ?\\ nil)))

      (sregex--char-range ?A ?Z)
      (sregex--char-range ?a ?z)
      (sregex--char-range ?0 ?9)

      ;; The remaining printable characters, one by one.
      (let ((code 32))
        (while (< code 128)
          (if (aref chars code)
              (progn
                (setq class (concat class (char-to-string code)))
                (aset chars code nil)))
          (setq code (1+ code))))

      (sregex--char-range 0 31)
      (sregex--char-range 128 255)
      ;; Every slot of CHARS has been consumed by now.

      (if caret
          (setq class (concat class "^")))
      (if (and dash rbracket)
          (setq class (concat class "-")))
      (if (and (not complement) (string= class "^"))
          ;; \"[^]\" is not a valid alternative; match `^' literally.
          "\\^"
        (concat "[" (if complement "^") class "]")))))
(defun sregex--char (exps combine)
  "Translate the clause (char . EXPS) into a character alternative.
The regexp text comes from `sregex--char-aux', called without
complementing.  The value is a unit without groups.  COMBINE is not
used."
  (let ((text (sregex--char-aux nil exps)))
    (sregex--make-value t nil (cons 'str text))))
(defun sregex--not-char (exps combine)
  "Translate the clause (not-char . EXPS) into a complemented alternative.
The regexp text comes from `sregex--char-aux', called with
complementing requested.  The value is a unit without groups.
COMBINE is not used."
  (let ((text (sregex--char-aux t exps)))
    (sregex--make-value t nil (cons 'str text))))
(defun sregex--bol (combine)
  "Return the sregex value for the symbol `bol': the regexp \"^\".
The value is a unit without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str "^")))
(defun sregex--eol (combine)
  "Return the sregex value for the symbol `eol': the regexp \"$\".
The value is a unit without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str "$")))
(defun sregex--wordchar (combine)
  "Return the sregex value for the symbol `wordchar'.
Its regexp is a backslash followed by `w'.  The value is a unit
without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str "\\w")))
(defun sregex--not-wordchar (combine)
  "Return the sregex value for the symbol `not-wordchar'.
Its regexp is a backslash followed by `W'.  The value is a unit
without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str "\\W")))
(defun sregex--syntax (exps combine)
  "Translate the clause (syntax CODE), where CODE is the first of EXPS.
The regexp is a backslash, `s' and the character CODE.  The value is
a unit without groups.  COMBINE is not used."
  (let ((text (format "\\s%c" (car exps))))
    (sregex--make-value t nil (cons 'str text))))
(defun sregex--not-syntax (exps combine)
  "Translate the clause (not-syntax CODE), where CODE is the first of EXPS.
The regexp is a backslash, `S' and the character CODE.  The value is
a unit without groups.  COMBINE is not used."
  (let ((text (format "\\S%c" (car exps))))
    (sregex--make-value t nil (cons 'str text))))
(defun sregex--bot (combine)
  "Return the sregex value for the symbol `bot'.
Its regexp is a backslash followed by a backquote.  The value is a
unit without groups.  COMBINE is not used."
  (sregex--make-value t nil '(str . "\\`")))
(defun sregex--eot (combine)
  "Return the sregex value for the symbol `eot'.
Its regexp is a backslash followed by an apostrophe.  The value is a
unit without groups.  COMBINE is not used."
  (sregex--make-value t nil '(str . "\\'")))
(defun sregex--point (combine)
  "Return the sregex value for the symbol `point'.
Its regexp is a backslash followed by an equals sign.  The value is a
unit without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str "\\=")))
(defun sregex--word-boundary (combine)
  "Return the sregex value for the symbol `word-boundary'.
Its regexp is a backslash followed by `b'.  The value is a unit
without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str "\\b")))
(defun sregex--not-word-boundary (combine)
  "Return the sregex value for the symbol `not-word-boundary'.
Its regexp is a backslash followed by `B'.  The value is a unit
without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str "\\B")))
(defun sregex--bow (combine)
  "Return the sregex value for the symbol `bow'.
Its regexp is a backslash followed by a less-than sign.  The value is
a unit without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str "\\<")))
(defun sregex--eow (combine)
  "Return the sregex value for the symbol `eow'.
Its regexp is a backslash followed by a greater-than sign.  The value
is a unit without groups.  COMBINE is not used."
  (sregex--make-value t nil (cons 'str "\\>")))
;; trapdoor - usage discouraged
(defun sregex--regex (exps combine)
  "Translate the clause (regex REGEX), where REGEX is the first of EXPS.
REGEX is an ordinary regexp string and is used as is, without quoting.
The value is not marked as a unit, so `sregex--engine' wraps it in an
introduced group whenever it is combined with other clauses.  Its
group list is empty: groups written inside REGEX are not accounted
for.  COMBINE is not used."
  ;; The tree must be a `str' node; `sregex--render-tree' dispatches on
  ;; the car of the tree and cannot take a bare string.
  (sregex--make-value nil nil (cons 'str (car exps))))
947 (provide 'sregex)
949 ;;; sregex.el ends here