1 ;;; make-regexp.el --- generate efficient regexps to match strings.
3 ;; Copyright (C) 1994, 1995 Simon Marshall.
5 ;; Author: Simon Marshall <simon@gnu.ai.mit.edu>
6 ;; Keywords: strings, regexps
10 ;; make-regexp|Simon Marshall|simon@gnu.ai.mit.edu|
11 ;; Generate efficient regexps to match strings.|
12 ;; 11-Jul-1995|1.02|~/functions/make-regexp.el.gz|
14 ;; The archive is archive.cis.ohio-state.edu in /pub/gnu/emacs/elisp-archive.
16 ;;; This file is not part of GNU Emacs.
18 ;;; This program is free software; you can redistribute it and/or modify
19 ;;; it under the terms of the GNU General Public License as published by
20 ;;; the Free Software Foundation; either version 2, or (at your option)
21 ;;; any later version.
23 ;;; This program is distributed in the hope that it will be useful,
24 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
25 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 ;;; GNU General Public License for more details.
28 ;;; You should have received a copy of the GNU General Public License
29 ;;; along with GNU Emacs; see the file COPYING. If not, write to
30 ;;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
36 ;; To make efficient regexps from lists of strings.
40 ;; (let ((strings '("cond" "if" "while" "let\\*?" "prog1" "prog2" "progn"
41 ;; "catch" "throw" "save-restriction" "save-excursion"
42 ;; "save-window-excursion" "save-match-data"
43 ;; "unwind-protect" "condition-case" "track-mouse")))
44 ;; (concat "(" (make-regexp strings t)))
46 ;; => "(\\(c\\(atch\\|ond\\(\\|ition-case\\)\\)\\|if\\|let\\*?\\|prog[12n]\\|save-\\(excursion\\|match-data\\|restriction\\|window-excursion\\)\\|t\\(hrow\\|rack-mouse\\)\\|unwind-protect\\|while\\)"
48 ;; To search for the above regexp takes about 70% of the time as for the simple
49 ;; (concat "(\\(" (mapconcat 'identity strings "\\|") "\\)") regexp.
51 ;; Obviously, the more the similarity between strings, the faster the regexp:
53 ;; (make-regexp '("abort" "abs" "accept" "access" "array" "begin" "body" "case"
54 ;; "constant" "declare" "delay" "delta" "digits" "else" "elsif"
55 ;; "entry" "exception" "exit" "function" "generic" "goto" "if"
56 ;; "others" "limited" "loop" "mod" "new" "null" "out" "subtype"
57 ;; "package" "pragma" "private" "procedure" "raise" "range"
58 ;; "record" "rem" "renames" "return" "reverse" "select"
59 ;; "separate" "task" "terminate" "then" "type" "when" "while"
62 ;; => "a\\(b\\(ort\\|s\\)\\|cce\\(pt\\|ss\\)\\|rray\\)\\|b\\(egin\\|ody\\)\\|c\\(ase\\|onstant\\)\\|d\\(e\\(clare\\|l\\(ay\\|ta\\)\\)\\|igits\\)\\|e\\(ls\\(e\\|if\\)\\|ntry\\|x\\(ception\\|it\\)\\)\\|function\\|g\\(eneric\\|oto\\)\\|if\\|l\\(imited\\|oop\\)\\|mod\\|n\\(ew\\|ull\\)\\|o\\(thers\\|ut\\)\\|p\\(ackage\\|r\\(agma\\|ivate\\|ocedure\\)\\)\\|r\\(a\\(ise\\|nge\\)\\|e\\(cord\\|m\\|names\\|turn\\|verse\\)\\)\\|s\\(e\\(lect\\|parate\\)\\|ubtype\\)\\|t\\(ask\\|erminate\\|hen\\|ype\\)\\|w\\(h\\(en\\|ile\\)\\|ith\\)\\|xor"
64 ;; To search for the above regexp takes less than 60% of the time of the simple
65 ;; mapconcat equivalent.
67 ;; But even small regexps may be worth it:
69 ;; (make-regexp '("and" "at" "do" "end" "for" "in" "is" "not" "of" "or" "use"))
70 ;; => "a\\(nd\\|t\\)\\|do\\|end\\|for\\|i[ns]\\|not\\|o[fr]\\|use"
72 ;; as this is 10% faster than the mapconcat equivalent.
76 ;; (autoload 'make-regexp "make-regexp"
77 ;; "Return a regexp to match a string item in STRINGS.")
79 ;; (autoload 'make-regexps "make-regexp"
80 ;; "Return a regexp to REGEXPS.")
82 ;; Since these functions were written to produce efficient regexps, not regexps
83 ;; efficiently, it is probably not a good idea to in-line too many calls in
84 ;; your code, unless you use the following neat trick with `eval-when-compile':
86 ;; (defvar definition-regexp
87 ;; (let ((regexp (eval-when-compile
88 ;; (make-regexp '("defun" "defsubst" "defmacro" "defalias"
89 ;; "defvar" "defconst" "defadvice") t))))
90 ;; (concat "^(" regexp)))
92 ;; The `byte-compile' code will be as if you had defined the variable thus:
94 ;; (defvar definition-regexp
95 ;; "^(\\(def\\(a\\(dvice\\|lias\\)\\|const\\|macro\\|subst\\|un\\|var\\)\\)")
99 ;; Originally written for font-lock, from an idea from Stig's hl319.
100 ;; Please don't tell me that it doesn't produce optimal regexps; I know that
101 ;; already. But (ideas or) code to improve things (are) is welcome. Please
102 ;; test your code and tell me the speed up in searching an appropriate buffer.
104 ;; Please send me bug reports, bug fixes, and extensions, etc.
105 ;; Simon Marshall <simon@gnu.ai.mit.edu>
110 ;; - Made `make-regexp' take `lax' to force top-level parentheses.
111 ;; - Fixed `make-regexps' for MATCH bug and new `font-lock-keywords'.
112 ;; - Added `unfontify' to user timing functions.
114 ;; - Made `make-regexp' `let' a big `max-lisp-eval-depth'.
116 ;; The basic idea is to find the shortest common non-"" prefix each time, and
117 ;; squirrel it out. If there is no such prefix, we divide the list into two so
118 ;; that (at least) one half will have at least a one-character common prefix.
120 ;; In addition, we (a) delay the addition of () parenthesis as long as possible
121 ;; (until we're sure we need them), and (b) try to squirrel out one-character
122 ;; sequences (so we can use [] rather than ()).
124 (defun make-regexp (strings &optional paren lax
)
125 "Return a regexp to match a string item in STRINGS.
126 If optional PAREN non-nil, output regexp parentheses around returned regexp.
127 If optional LAX non-nil, don't output parentheses if it doesn't require them.
128 Merges keywords to avoid backtracking in Emacs' regexp matcher."
129 (let* ((max-lisp-eval-depth (* 1024 1024))
130 (strings (let ((l strings
)) ; Paranoia---make strings unique!
131 (while l
(setq l
(setcdr l
(delete (car l
) (cdr l
)))))
132 (sort strings
'string-lessp
)))
133 (open-paren (if paren
"\\(" "")) (close-paren (if paren
"\\)" ""))
134 (open-lax (if lax
"" open-paren
)) (close-lax (if lax
"" close-paren
))
135 (completion-ignore-case nil
))
137 ;; If there's only one string, just return it.
138 ((= (length strings
) 1)
139 (concat open-lax
(car strings
) close-lax
))
140 ;; If there's an empty string, pull it out.
141 ((string= (car strings
) "")
142 (if (and (= (length strings
) 2) (= (length (nth 1 strings
)) 1))
143 (concat open-lax
(nth 1 strings
) "?" close-lax
)
144 (concat open-paren
"\\|" (make-regexp (cdr strings
)) close-paren
)))
145 ;; If there are only one-character strings, make a [] list instead.
146 ((= (length strings
) (apply '+ (mapcar 'length strings
)))
147 (concat open-lax
"[" (mapconcat 'identity strings
"") "]" close-lax
))
149 ;; We have a list of strings. Is there a common prefix?
150 (let ((prefix (try-completion "" (mapcar 'list strings
))))
151 (if (> (length prefix
) 0)
152 ;; Common prefix! Squirrel it out and recurse with the suffixes.
153 (let* ((len (length prefix
))
154 (sufs (mapcar '(lambda (str) (substring str len
)) strings
)))
155 (concat open-paren prefix
(make-regexp sufs t t
) close-paren
))
156 ;; No common prefix. Is there a one-character sequence?
157 (let ((letters (let ((completion-regexp-list '("^.$")))
158 (all-completions "" (mapcar 'list strings
)))))
159 (if (> (length letters
) 1)
160 ;; Do the one-character sequences, then recurse on the rest.
161 (let ((rest (let ((completion-regexp-list '("^..+$")))
162 (all-completions "" (mapcar 'list strings
)))))
164 (make-regexp letters
) "\\|" (make-regexp rest
)
166 ;; No one-character sequence, so divide the list into two by
167 ;; dividing into those that start with a particular letter, and
168 ;; those that do not.
169 (let* ((char (substring (car strings
) 0 1))
170 (half1 (all-completions char
(mapcar 'list strings
)))
171 (half2 (nthcdr (length half1
) strings
)))
173 (make-regexp half1
) "\\|" (make-regexp half2
)
174 close-paren
))))))))))
176 ;; This stuff is realy for font-lock...
178 ;; Ahhh, the wonders of lisp...
179 (defun regexp-span (regexp &optional start
)
180 "Return the span or depth of REGEXP.
181 This means the number of \"\\\\(...\\\\)\" pairs in REGEXP, optionally from START."
182 (let ((match (string-match (regexp-quote "\\(") regexp
(or start
0))))
183 (if (not match
) 0 (1+ (regexp-span regexp
(match-end 0))))))
185 ;; The basic idea is to concat the regexps together, keeping count of the span
186 ;; of the regexps so that we can get the correct match for hilighting.
187 (defun make-regexps (&rest regexps
)
188 "Return a regexp to match REGEXPS
189 Each item of REGEXPS should be of the form:
191 STRING ; A STRING to be used literally.
192 (STRING MATCH FACE DATA) ; Match STRING at depth MATCH with FACE
193 ; and highlight according to DATA.
194 (STRINGS FACE DATA) ; STRINGS is a list of strings FACE is
195 ; to highlight according to DATA.
197 Returns a list of the form:
199 (REGEXP (MATCH FACE DATA) ...)
204 '((\"defun\" \"defalias\" \"defsubst\" \"defadvice\") keyword)
206 '(\"\\\\([a-zA-Z-]+\\\\)?\" 1 function-name nil t))
210 (\"^(\\\\(def\\\\(a\\\\(dvice\\\\|lias\\\\)\\\\|subst\\\\|un\\\\)\\\\)[ ]*\\\\([a-zA-Z-]+\\\\)?\"
211 (1 keyword) (4 function-name nil t))
213 Uses `make-regexp' to make efficient regexps."
214 (let ((regexp "") (data ()))
216 (cond ((stringp (car regexps
))
217 (setq regexp
(concat regexp
(car regexps
))))
218 ((stringp (nth 0 (car regexps
)))
219 (setq data
(cons (cons (+ (regexp-span regexp
)
220 (nth 1 (car regexps
)))
221 (nthcdr 2 (car regexps
)))
223 regexp
(concat regexp
(nth 0 (car regexps
)))))
225 (setq data
(cons (cons (1+ (regexp-span regexp
))
228 regexp
(concat regexp
(make-regexp (nth 0 (car regexps
))
230 (setq regexps
(cdr regexps
)))
231 (cons regexp
(nreverse data
))))
233 ;; timing functions removed due to name collisions with Gnus
235 (provide 'make-regexp
)
236 ;;; make-regexp.el ends here