1 ;;; -*- Mode: Lisp; Package:USER; Base:10 -*-
3 ;;; This code was written by:
5 ;;; Lawrence E. Freil <lef@nscf.org>
6 ;;; National Science Center Foundation
7 ;;; Augusta, Georgia 30909
9 ;;; If you modify this code, please comment your modifications
10 ;;; clearly and inform the author of any improvements so they
11 ;;; can be incorporated in future releases.
13 ;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
16 ;;; This regular expression parser operates by taking a
17 ;;; regular expression and breaking it down into a list
18 ;;; consisting of lisp expressions and flags. The list
19 ;;; of lisp expressions is then taken in turned into a
20 ;;; lambda expression that can be later applied to a
21 ;;; string argument for parsing.
24 ;;; First we create a copy of macros to help debug the beast
26 (eval-when #-gcl
(:compile-toplevel
:load-toplevel
:execute
)
27 #+gcl
(load compile eval
)
28 (defpackage :maxima-nregex
32 #:*regex-debug
* #:*regex-groups
* #:*regex-groupings
*
38 (in-package :maxima-nregex
)
40 (eval-when (:compile-toplevel
:load-toplevel
:execute
)
41 (defvar *regex-debug
* nil
) ; Set to nil for no debugging code
43 (defmacro info
(message &rest args
)
45 `(format *trace-output
* ,message
,@args
)))
48 ;;; Declare the global variables for storing the paren index list.
50 (defvar *regex-groups
* (make-array 10))
51 (defvar *regex-groupings
* 0)
55 ;;; Declare a simple interface for testing. You probably wouldn't want
56 ;;; to use this interface unless you were just calling this once.
58 (defun regex (expression string
)
59 "Usage: (regex <expression> <string)
60 This function will call regex-compile on the expression and then apply
61 the string to the returned lambda list."
62 (let ((findit (cond ((stringp expression
)
63 (regex-compile expression
))
67 (if (not (funcall (if (functionp findit
)
69 (eval `(function ,findit
))) string
))
70 (return-from regex nil
))
71 (if (= *regex-groupings
* 0)
72 (return-from regex t
))
73 (dotimes (i *regex-groupings
*)
74 (push (funcall 'subseq
76 (car (aref *regex-groups
* i
))
77 (cadr (aref *regex-groups
* i
)))
81 ;;; Declare some simple macros to make the code more readable.
83 (defvar *regex-special-chars
* "?*+.()[]\\${}")
85 (defmacro add-exp
(list)
86 "Add an item to the end of expression"
87 `(setf expression
(append expression
,list
)))
90 ;;; Now for the main regex compiler routine.
92 (defun regex-compile (source &key
(anchored nil
) (case-sensitive t
))
93 "Usage: (regex-compile <expression> [ :anchored (t/nil) ] [ :case-sensitive (t/nil) ])
94 This function take a regular expression (supplied as source) and
95 compiles this into a lambda list that a string argument can then
96 be applied to. It is also possible to compile this lambda list
97 for better performance or to save it as a named function for later
99 (info "Now entering regex-compile with \"~A\"~%" source
)
101 ;; This routine works in two parts.
102 ;; The first pass take the regular expression and produces a list of
103 ;; operators and lisp expressions for the entire regular expression.
104 ;; The second pass takes this list and produces the lambda expression.
105 (let ((expression '()) ; holder for expressions
106 (group 1) ; Current group index
107 (group-stack nil
) ; Stack of current group endings
108 (result nil
) ; holder for built expression.
109 (fast-first nil
)) ; holder for quick unanchored scan
111 ;; If the expression was an empty string then it alway
112 ;; matches (so lets leave early)
114 (if (= (length source
) 0)
115 (return-from regex-compile
116 '(lambda (&rest args
)
117 (declare (ignore args
))
120 ;; If the first character is a caret then set the anchored
121 ;; flags and remove if from the expression string.
123 (cond ((eql (char source
0) #\^
)
124 (setf source
(subseq source
1))
127 ;; If the first sequence is .* then also set the anchored flags.
128 ;; (This is purely for optimization, it will work without this).
130 (if (>= (length source
) 2)
131 (if (string= source
".*" :start1
0 :end1
2)
134 ;; Also, If this is not an anchored search and the first character is
135 ;; a literal, then do a quick scan to see if it is even in the string.
136 ;; If not then we can issue a quick nil,
137 ;; otherwise we can start the search at the matching character to skip
138 ;; the checks of the non-matching characters anyway.
140 ;; If I really wanted to speed up this section of code it would be
141 ;; easy to recognize the case of a fairly long multi-character literal
142 ;; and generate a Boyer-Moore search for the entire literal.
144 ;; I generate the code to do a loop because on CMU Lisp this is about
145 ;; twice as fast a calling position.
147 (if (and (not anchored
)
148 (not (position (char source
0) *regex-special-chars
*))
149 (not (and (> (length source
) 1)
150 (position (char source
1) *regex-special-chars
*))))
151 (setf fast-first
`((if (not (do ((i start
(+ i
1)))
153 (if (,(if case-sensitive
'eql
'char-equal
)
156 (return (setf start i
)))))
157 (return-from final-return nil
)))))
159 ;; Generate the very first expression to save the starting index
160 ;; so that group 0 will be the entire string matched always
162 (add-exp '((setf (aref *regex-groups
* 0)
165 ;; Loop over each character in the regular expression building the
166 ;; expression list as we go.
168 (do ((eindex 0 (1+ eindex
)))
169 ((= eindex
(length source
)))
170 (let ((current (char source eindex
)))
171 (info "Now processing character ~A index = ~A~%" current eindex
)
175 ;; Generate code for a single wild character
177 (add-exp '((if (>= index length
)
178 (return-from compare nil
)
182 ;; If this is the last character of the expression then
183 ;; anchor the end of the expression, otherwise let it slide
184 ;; as a standard character (even though it should be quoted).
186 (if (= eindex
(1- (length source
)))
187 (add-exp '((if (not (= index length
))
188 (return-from compare nil
))))
189 (add-exp '((if (not (and (< index length
)
190 (eql (char string index
) #\$
)))
191 (return-from compare nil
)
194 (add-exp '(astrisk)))
200 (add-exp '(question)))
207 (push group group-stack
)
208 (add-exp `((setf (aref *regex-groups
* ,(1- group
))
215 (let ((group (pop group-stack
)))
216 (add-exp `((setf (cadr (aref *regex-groups
* ,(1- group
)))
218 (add-exp `(,(- group
)))))
221 ;; Start of a range operation.
222 ;; Generate a bit-vector that has one bit per possible character
223 ;; and then on each character or range, set the possible bits.
225 ;; If the first character is carat then invert the set.
226 (let* ((invert (eql (char source
(1+ eindex
)) #\^
))
227 (bitstring (make-array 256 :element-type
'bit
230 (set-char (if invert
0 1)))
231 (if invert
(incf eindex
))
232 (do ((x (1+ eindex
) (1+ x
)))
233 ((eql (char source x
) #\
]) (setf eindex x
))
234 (info "Building range with character ~A~%" (char source x
))
235 (cond ((and (eql (char source
(1+ x
)) #\-
)
236 (not (eql (char source
(+ x
2)) #\
])))
237 (if (>= (char-code (char source x
))
238 (char-code (char source
(+ 2 x
))))
239 (error (intl:gettext
"regex: ranges must be in ascending order; found: \"~A-~A\"")
240 (char source x
) (char source
(+ 2 x
))))
241 (do ((j (char-code (char source x
)) (1+ j
)))
242 ((> j
(char-code (char source
(+ 2 x
))))
244 (info "Setting bit for char ~A code ~A~%" (code-char j
) j
)
245 (setf (sbit bitstring j
) set-char
)))
247 (cond ((not (eql (char source x
) #\
]))
248 (let ((char (char source x
)))
250 ;; If the character is quoted then find out what
251 ;; it should have been
253 (if (eql (char source x
) #\\ )
255 (multiple-value-setq (char length
)
256 (regex-quoted (subseq source x
) invert
))
258 (info "Setting bit for char ~A code ~A~%" char
(char-code char
))
259 (if (not (vectorp char
))
260 (setf (sbit bitstring
(char-code (char source x
))) set-char
)
261 (bit-ior bitstring char t
))))))))
262 (add-exp `((let ((range ,bitstring
))
263 (if (>= index length
)
264 (return-from compare nil
))
265 (if (= 1 (sbit range
(char-code (char string index
))))
267 (return-from compare nil
)))))))
270 ;; Intreprete the next character as a special, range, octal, group or
271 ;; just the character itself.
275 (multiple-value-setq (value length
)
276 (regex-quoted (subseq source
(1+ eindex
)) nil
))
280 (add-exp `((if (not (and (< index length
)
281 (eql (char string index
)
283 (return-from compare nil
)
286 (add-exp `((let ((range ,value
))
287 (if (>= index length
)
288 (return-from compare nil
))
289 (if (= 1 (sbit range
(char-code (char string index
))))
291 (return-from compare nil
)))))))
292 (incf eindex length
)))
295 ;; We have a literal character.
296 ;; Scan to see how many we have and if it is more than one
297 ;; generate a string= verses as single eql.
300 (term (dotimes (litindex (- (length source
) eindex
) nil
)
301 (let ((litchar (char source
(+ eindex litindex
))))
302 (if (position litchar
*regex-special-chars
*)
305 (info "Now adding ~A index ~A to lit~%" litchar
307 (setf lit
(concatenate 'string lit
308 (string litchar
)))))))))
309 (if (= (length lit
) 1)
310 (add-exp `((if (not (and (< index length
)
311 (,(if case-sensitive
'eql
'char-equal
)
312 (char string index
) ,current
)))
313 (return-from compare nil
)
316 ;; If we have a multi-character literal then we must
317 ;; check to see if the next character (if there is one)
318 ;; is an astrisk or a plus. If so then we must not use this
319 ;; character in the big literal.
321 (if (or (eql term
#\
*) (eql term
#\
+))
322 (setf lit
(subseq lit
0 (1- (length lit
)))))
323 (add-exp `((if (< length
(+ index
,(length lit
)))
324 (return-from compare nil
))
325 (if (not (,(if case-sensitive
'string
= 'string-equal
)
326 string
,lit
:start1 index
327 :end1
(+ index
,(length lit
))))
328 (return-from compare nil
)
329 (incf index
,(length lit
)))))))
330 (incf eindex
(1- (length lit
))))))))
332 ;; Plug end of list to return t. If we made it this far then
334 (add-exp '((setf (cadr (aref *regex-groups
* 0))
336 (add-exp '((return-from final-return t
)))
338 ;;; (print expression)
340 ;; Now take the expression list and turn it into a lambda expression
341 ;; replacing the special flags with lisp code.
342 ;; For example: A BEGIN needs to be replace by an expression that
343 ;; saves the current index, then evaluates everything till it gets to
344 ;; the END then save the new index if it didn't fail.
345 ;; On an ASTRISK I need to take the previous expression and wrap
346 ;; it in a do that will evaluate the expression till an error
347 ;; occurs and then another do that encompases the remainder of the
348 ;; regular expression and iterates decrementing the index by one
349 ;; of the matched expression sizes and then returns nil. After
350 ;; the last expression insert a form that does a return t so that
351 ;; if the entire nested sub-expression succeeds then the loop
352 ;; is broken manually.
354 (setf result
(copy-tree nil
))
356 ;; Reversing the current expression makes building up the
357 ;; lambda list easier due to the nexting of expressions when
358 ;; and astrisk has been encountered.
359 (setf expression
(reverse expression
))
360 (do ((elt 0 (1+ elt
)))
361 ((>= elt
(length expression
)))
362 (let ((piece (nth elt expression
)))
364 ;; Now check for PLUS, if so then ditto the expression and then let the
365 ;; ASTRISK below handle the rest.
367 (cond ((eql piece
'plus
)
368 (cond ((listp (nth (1+ elt
) expression
))
369 (setf result
(append (list (nth (1+ elt
) expression
))
372 ;; duplicate the entire group
373 ;; NOTE: This hasn't been implemented yet!!
375 (format *standard-output
* "`group' repeat hasn't been implemented yet~%")))))
376 (cond ((listp piece
) ;Just append the list
377 (setf result
(append (list piece
) result
)))
378 ((eql piece
'question
) ; Wrap it in a block that won't fail
379 (cond ((listp (nth (1+ elt
) expression
))
381 (append `((progn (block compare
388 ;; This is a QUESTION on an entire group which
389 ;; hasn't been implemented yet!!!
392 (format *standard-output
* "Optional groups not implemented yet~%"))))
393 ((or (eql piece
'astrisk
) ; Do the wild thing!
395 (cond ((listp (nth (1+ elt
) expression
))
397 ;; This is a single character wild card so
398 ;; do the simple form.
401 `((let ((oindex index
))
402 (declare (fixnum oindex
))
406 ,(nth (1+ elt
) expression
)))
407 (do ((start index
(1- start
)))
408 ((< start oindex
) nil
)
409 (declare (fixnum start
))
411 (declare (fixnum index
))
417 ;; This is a subgroup repeated so I must build
418 ;; the loop using several values.
422 (t t
)))) ; Just ignore everything else.
424 ;; Now wrap the result in a lambda list that can then be
425 ;; invoked or compiled, however the user wishes.
429 `(lambda (string &key
(start 0) (end (length string
)))
430 (declare (string string
)
433 (optimize (speed 0) (compilation-speed 3)))
434 (setf *regex-groupings
* ,group
)
439 (declare (fixnum index length
))
442 `(lambda (string &key
(start 0) (end (length string
)))
443 (declare (string string
)
446 (optimize (speed 0) (compilation-speed 3)))
447 (setf *regex-groupings
* ,group
)
450 (declare (fixnum length
))
452 (do ((marker start
(1+ marker
)))
454 (declare (fixnum marker
))
455 (let ((index marker
))
456 (declare (fixnum index
))
463 ;;; Define a function that will take a quoted character and return
464 ;;; what the real character should be plus how much of the source
465 ;;; string was used. If the result is a set of characters, return an
466 ;;; array of bits indicating which characters should be set. If the
467 ;;; expression is one of the sub-group matches return a
468 ;;; list-expression that will provide the match.
470 (defun regex-quoted (char-string &optional
(invert nil
))
471 "Usage: (regex-quoted <char-string> &optional invert)
472 Returns either the quoted character or a simple bit vector of bits set for
474 (let ((first (char char-string
0))
475 (result (char char-string
0))
477 (cond ((eql first
#\n)
478 (setf result
#\newline
))
480 (setf result
#\return
))
484 (setf result
#*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
486 (setf result
#*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
488 (setf result
#*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
490 (setf result
#*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
492 (setf result
#*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
494 (setf result
#*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
496 (setf result
#*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
498 (setf result
#*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
499 ((and (>= (char-code first
) (char-code #\
0))
500 (<= (char-code first
) (char-code #\
9)))
501 (if (and (> (length char-string
) 2)
502 (and (>= (char-code (char char-string
1)) (char-code #\
0))
503 (<= (char-code (char char-string
1)) (char-code #\
9))
504 (>= (char-code (char char-string
2)) (char-code #\
0))
505 (<= (char-code (char char-string
2)) (char-code #\
9))))
507 ;; It is a single character specified in octal
510 (setf result
(do ((x 0 (1+ x
))
513 (setf return
(+ (* return
8)
514 (- (char-code (char char-string x
))
516 (setf used-length
3))
518 ;; We have a group number replacement.
520 (let ((group (- (char-code first
) (char-code #\
0))))
521 (setf result
`((let ((nstring (subseq string
(car (aref *regex-groups
* ,group
))
522 (cadr (aref *regex-groups
* ,group
)))))
523 (if (< length
(+ index
(length nstring
)))
524 (return-from compare nil
))
525 (if (not (string= string nstring
527 :end1
(+ index
(length nstring
))))
528 (return-from compare nil
)
529 (incf index
(length nstring
)))))))))
531 (setf result first
)))
532 (if (and (vectorp result
) invert
)
533 (bit-xor result
#*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t
))
534 (values result used-length
)))