Replace Python program auto-generated for documentation categories
[maxima/cygwin.git] / src / nregex.lisp
blobed1fbfb6354fa029d1895fae2db5b7f22451aad8
1 ;;; -*- Mode: Lisp; Package:USER; Base:10 -*-
2 ;;;
3 ;;; This code was written by:
4 ;;;
5 ;;; Lawrence E. Freil <lef@nscf.org>
6 ;;; National Science Center Foundation
7 ;;; Augusta, Georgia 30909
8 ;;;
9 ;;; If you modify this code, please comment your modifications
10 ;;; clearly and inform the author of any improvements so they
11 ;;; can be incorporated in future releases.
12 ;;;
13 ;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression
14 ;;; parser.
15 ;;;
16 ;;; This regular expression parser operates by taking a
17 ;;; regular expression and breaking it down into a list
18 ;;; consisting of lisp expressions and flags. The list
19 ;;; of lisp expressions is then taken in turned into a
20 ;;; lambda expression that can be later applied to a
21 ;;; string argument for parsing.
23 ;;;
24 ;;; First we create a copy of macros to help debug the beast
26 (eval-when #-gcl(:compile-toplevel :load-toplevel :execute)
27 #+gcl(load compile eval)
28 (defpackage :maxima-nregex
29 (:use :common-lisp)
30 (:export
31 ;; Vars
32 #:*regex-debug* #:*regex-groups* #:*regex-groupings*
33 ;; Functions
34 #:regex-compile
38 (in-package :maxima-nregex)
40 (eval-when (:compile-toplevel :load-toplevel :execute)
41 (defvar *regex-debug* nil) ; Set to nil for no debugging code
43 (defmacro info (message &rest args)
44 (if *regex-debug*
45 `(format *trace-output* ,message ,@args)))
47 ;;;
48 ;;; Declare the global variables for storing the paren index list.
49 ;;;
50 (defvar *regex-groups* (make-array 10))
51 (defvar *regex-groupings* 0)
54 ;;;
55 ;;; Declare a simple interface for testing. You probably wouldn't want
56 ;;; to use this interface unless you were just calling this once.
57 ;;;
58 (defun regex (expression string)
59 "Usage: (regex <expression> <string)
60 This function will call regex-compile on the expression and then apply
61 the string to the returned lambda list."
62 (let ((findit (cond ((stringp expression)
63 (regex-compile expression))
64 ((listp expression)
65 expression)))
66 (result nil))
67 (if (not (funcall (if (functionp findit)
68 findit
69 (eval `(function ,findit))) string))
70 (return-from regex nil))
71 (if (= *regex-groupings* 0)
72 (return-from regex t))
73 (dotimes (i *regex-groupings*)
74 (push (funcall 'subseq
75 string
76 (car (aref *regex-groups* i))
77 (cadr (aref *regex-groups* i)))
78 result))
79 (reverse result)))
80 ;;;
81 ;;; Declare some simple macros to make the code more readable.
82 ;;;
83 (defvar *regex-special-chars* "?*+.()[]\\${}")
85 (defmacro add-exp (list)
86 "Add an item to the end of expression"
87 `(setf expression (append expression ,list)))
89 ;;;
90 ;;; Now for the main regex compiler routine.
91 ;;;
92 (defun regex-compile (source &key (anchored nil) (case-sensitive t))
93 "Usage: (regex-compile <expression> [ :anchored (t/nil) ] [ :case-sensitive (t/nil) ])
94 This function take a regular expression (supplied as source) and
95 compiles this into a lambda list that a string argument can then
96 be applied to. It is also possible to compile this lambda list
97 for better performance or to save it as a named function for later
98 use"
99 (info "Now entering regex-compile with \"~A\"~%" source)
101 ;; This routine works in two parts.
102 ;; The first pass take the regular expression and produces a list of
103 ;; operators and lisp expressions for the entire regular expression.
104 ;; The second pass takes this list and produces the lambda expression.
105 (let ((expression '()) ; holder for expressions
106 (group 1) ; Current group index
107 (group-stack nil) ; Stack of current group endings
108 (result nil) ; holder for built expression.
109 (fast-first nil)) ; holder for quick unanchored scan
111 ;; If the expression was an empty string then it alway
112 ;; matches (so lets leave early)
114 (if (= (length source) 0)
115 (return-from regex-compile
116 '(lambda (&rest args)
117 (declare (ignore args))
118 t)))
120 ;; If the first character is a caret then set the anchored
121 ;; flags and remove if from the expression string.
123 (cond ((eql (char source 0) #\^)
124 (setf source (subseq source 1))
125 (setf anchored t)))
127 ;; If the first sequence is .* then also set the anchored flags.
128 ;; (This is purely for optimization, it will work without this).
130 (if (>= (length source) 2)
131 (if (string= source ".*" :start1 0 :end1 2)
132 (setf anchored t)))
134 ;; Also, If this is not an anchored search and the first character is
135 ;; a literal, then do a quick scan to see if it is even in the string.
136 ;; If not then we can issue a quick nil,
137 ;; otherwise we can start the search at the matching character to skip
138 ;; the checks of the non-matching characters anyway.
140 ;; If I really wanted to speed up this section of code it would be
141 ;; easy to recognize the case of a fairly long multi-character literal
142 ;; and generate a Boyer-Moore search for the entire literal.
144 ;; I generate the code to do a loop because on CMU Lisp this is about
145 ;; twice as fast a calling position.
147 (if (and (not anchored)
148 (not (position (char source 0) *regex-special-chars*))
149 (not (and (> (length source) 1)
150 (position (char source 1) *regex-special-chars*))))
151 (setf fast-first `((if (not (do ((i start (+ i 1)))
152 ((>= i length))
153 (if (,(if case-sensitive 'eql 'char-equal)
154 (char string i)
155 ,(char source 0))
156 (return (setf start i)))))
157 (return-from final-return nil)))))
159 ;; Generate the very first expression to save the starting index
160 ;; so that group 0 will be the entire string matched always
162 (add-exp '((setf (aref *regex-groups* 0)
163 (list index nil))))
165 ;; Loop over each character in the regular expression building the
166 ;; expression list as we go.
168 (do ((eindex 0 (1+ eindex)))
169 ((= eindex (length source)))
170 (let ((current (char source eindex)))
171 (info "Now processing character ~A index = ~A~%" current eindex)
172 (case current
173 ((#\.)
175 ;; Generate code for a single wild character
177 (add-exp '((if (>= index length)
178 (return-from compare nil)
179 (incf index)))))
180 ((#\$)
182 ;; If this is the last character of the expression then
183 ;; anchor the end of the expression, otherwise let it slide
184 ;; as a standard character (even though it should be quoted).
186 (if (= eindex (1- (length source)))
187 (add-exp '((if (not (= index length))
188 (return-from compare nil))))
189 (add-exp '((if (not (and (< index length)
190 (eql (char string index) #\$)))
191 (return-from compare nil)
192 (incf index))))))
193 ((#\*)
194 (add-exp '(astrisk)))
196 ((#\+)
197 (add-exp '(plus)))
199 ((#\?)
200 (add-exp '(question)))
202 ((#\()
204 ;; Start a grouping.
206 (incf group)
207 (push group group-stack)
208 (add-exp `((setf (aref *regex-groups* ,(1- group))
209 (list index nil))))
210 (add-exp `(,group)))
211 ((#\))
213 ;; End a grouping
215 (let ((group (pop group-stack)))
216 (add-exp `((setf (cadr (aref *regex-groups* ,(1- group)))
217 index)))
218 (add-exp `(,(- group)))))
219 ((#\[)
221 ;; Start of a range operation.
222 ;; Generate a bit-vector that has one bit per possible character
223 ;; and then on each character or range, set the possible bits.
225 ;; If the first character is carat then invert the set.
226 (let* ((invert (eql (char source (1+ eindex)) #\^))
227 (bitstring (make-array 256 :element-type 'bit
228 :initial-element
229 (if invert 1 0)))
230 (set-char (if invert 0 1)))
231 (if invert (incf eindex))
232 (do ((x (1+ eindex) (1+ x)))
233 ((eql (char source x) #\]) (setf eindex x))
234 (info "Building range with character ~A~%" (char source x))
235 (cond ((and (eql (char source (1+ x)) #\-)
236 (not (eql (char source (+ x 2)) #\])))
237 (if (>= (char-code (char source x))
238 (char-code (char source (+ 2 x))))
239 (error (intl:gettext "regex: ranges must be in ascending order; found: \"~A-~A\"")
240 (char source x) (char source (+ 2 x))))
241 (do ((j (char-code (char source x)) (1+ j)))
242 ((> j (char-code (char source (+ 2 x))))
243 (incf x 2))
244 (info "Setting bit for char ~A code ~A~%" (code-char j) j)
245 (setf (sbit bitstring j) set-char)))
247 (cond ((not (eql (char source x) #\]))
248 (let ((char (char source x)))
250 ;; If the character is quoted then find out what
251 ;; it should have been
253 (if (eql (char source x) #\\ )
254 (let ((length))
255 (multiple-value-setq (char length)
256 (regex-quoted (subseq source x) invert))
257 (incf x length)))
258 (info "Setting bit for char ~A code ~A~%" char (char-code char))
259 (if (not (vectorp char))
260 (setf (sbit bitstring (char-code (char source x))) set-char)
261 (bit-ior bitstring char t))))))))
262 (add-exp `((let ((range ,bitstring))
263 (if (>= index length)
264 (return-from compare nil))
265 (if (= 1 (sbit range (char-code (char string index))))
266 (incf index)
267 (return-from compare nil)))))))
268 ((#\\ )
270 ;; Intreprete the next character as a special, range, octal, group or
271 ;; just the character itself.
273 (let ((length)
274 (value))
275 (multiple-value-setq (value length)
276 (regex-quoted (subseq source (1+ eindex)) nil))
277 (cond ((listp value)
278 (add-exp value))
279 ((characterp value)
280 (add-exp `((if (not (and (< index length)
281 (eql (char string index)
282 ,value)))
283 (return-from compare nil)
284 (incf index)))))
285 ((vectorp value)
286 (add-exp `((let ((range ,value))
287 (if (>= index length)
288 (return-from compare nil))
289 (if (= 1 (sbit range (char-code (char string index))))
290 (incf index)
291 (return-from compare nil)))))))
292 (incf eindex length)))
295 ;; We have a literal character.
296 ;; Scan to see how many we have and if it is more than one
297 ;; generate a string= verses as single eql.
299 (let* ((lit "")
300 (term (dotimes (litindex (- (length source) eindex) nil)
301 (let ((litchar (char source (+ eindex litindex))))
302 (if (position litchar *regex-special-chars*)
303 (return litchar)
304 (progn
305 (info "Now adding ~A index ~A to lit~%" litchar
306 litindex)
307 (setf lit (concatenate 'string lit
308 (string litchar)))))))))
309 (if (= (length lit) 1)
310 (add-exp `((if (not (and (< index length)
311 (,(if case-sensitive 'eql 'char-equal)
312 (char string index) ,current)))
313 (return-from compare nil)
314 (incf index))))
316 ;; If we have a multi-character literal then we must
317 ;; check to see if the next character (if there is one)
318 ;; is an astrisk or a plus. If so then we must not use this
319 ;; character in the big literal.
320 (progn
321 (if (or (eql term #\*) (eql term #\+))
322 (setf lit (subseq lit 0 (1- (length lit)))))
323 (add-exp `((if (< length (+ index ,(length lit)))
324 (return-from compare nil))
325 (if (not (,(if case-sensitive 'string= 'string-equal)
326 string ,lit :start1 index
327 :end1 (+ index ,(length lit))))
328 (return-from compare nil)
329 (incf index ,(length lit)))))))
330 (incf eindex (1- (length lit))))))))
332 ;; Plug end of list to return t. If we made it this far then
333 ;; We have matched!
334 (add-exp '((setf (cadr (aref *regex-groups* 0))
335 index)))
336 (add-exp '((return-from final-return t)))
338 ;;; (print expression)
340 ;; Now take the expression list and turn it into a lambda expression
341 ;; replacing the special flags with lisp code.
342 ;; For example: A BEGIN needs to be replace by an expression that
343 ;; saves the current index, then evaluates everything till it gets to
344 ;; the END then save the new index if it didn't fail.
345 ;; On an ASTRISK I need to take the previous expression and wrap
346 ;; it in a do that will evaluate the expression till an error
347 ;; occurs and then another do that encompases the remainder of the
348 ;; regular expression and iterates decrementing the index by one
349 ;; of the matched expression sizes and then returns nil. After
350 ;; the last expression insert a form that does a return t so that
351 ;; if the entire nested sub-expression succeeds then the loop
352 ;; is broken manually.
354 (setf result (copy-tree nil))
356 ;; Reversing the current expression makes building up the
357 ;; lambda list easier due to the nexting of expressions when
358 ;; and astrisk has been encountered.
359 (setf expression (reverse expression))
360 (do ((elt 0 (1+ elt)))
361 ((>= elt (length expression)))
362 (let ((piece (nth elt expression)))
364 ;; Now check for PLUS, if so then ditto the expression and then let the
365 ;; ASTRISK below handle the rest.
367 (cond ((eql piece 'plus)
368 (cond ((listp (nth (1+ elt) expression))
369 (setf result (append (list (nth (1+ elt) expression))
370 result)))
372 ;; duplicate the entire group
373 ;; NOTE: This hasn't been implemented yet!!
375 (format *standard-output* "`group' repeat hasn't been implemented yet~%")))))
376 (cond ((listp piece) ;Just append the list
377 (setf result (append (list piece) result)))
378 ((eql piece 'question) ; Wrap it in a block that won't fail
379 (cond ((listp (nth (1+ elt) expression))
380 (setf result
381 (append `((progn (block compare
382 ,(nth (1+ elt)
383 expression))
385 result))
386 (incf elt))
388 ;; This is a QUESTION on an entire group which
389 ;; hasn't been implemented yet!!!
392 (format *standard-output* "Optional groups not implemented yet~%"))))
393 ((or (eql piece 'astrisk) ; Do the wild thing!
394 (eql piece 'plus))
395 (cond ((listp (nth (1+ elt) expression))
397 ;; This is a single character wild card so
398 ;; do the simple form.
400 (setf result
401 `((let ((oindex index))
402 (declare (fixnum oindex))
403 (block compare
404 (do ()
405 (nil)
406 ,(nth (1+ elt) expression)))
407 (do ((start index (1- start)))
408 ((< start oindex) nil)
409 (declare (fixnum start))
410 (let ((index start))
411 (declare (fixnum index))
412 (block compare
413 ,@result))))))
414 (incf elt))
417 ;; This is a subgroup repeated so I must build
418 ;; the loop using several values.
422 (t t)))) ; Just ignore everything else.
424 ;; Now wrap the result in a lambda list that can then be
425 ;; invoked or compiled, however the user wishes.
427 (if anchored
428 (setf result
429 `(lambda (string &key (start 0) (end (length string)))
430 (declare (string string)
431 (fixnum start end)
432 (ignorable start)
433 (optimize (speed 0) (compilation-speed 3)))
434 (setf *regex-groupings* ,group)
435 (block final-return
436 (block compare
437 (let ((index start)
438 (length end))
439 (declare (fixnum index length))
440 ,@result)))))
441 (setf result
442 `(lambda (string &key (start 0) (end (length string)))
443 (declare (string string)
444 (fixnum start end)
445 (ignorable start)
446 (optimize (speed 0) (compilation-speed 3)))
447 (setf *regex-groupings* ,group)
448 (block final-return
449 (let ((length end))
450 (declare (fixnum length))
451 ,@fast-first
452 (do ((marker start (1+ marker)))
453 ((> marker end) nil)
454 (declare (fixnum marker))
455 (let ((index marker))
456 (declare (fixnum index))
457 (if (block compare
458 ,@result)
459 (return t)))))))))))
463 ;;; Define a function that will take a quoted character and return
464 ;;; what the real character should be plus how much of the source
465 ;;; string was used. If the result is a set of characters, return an
466 ;;; array of bits indicating which characters should be set. If the
467 ;;; expression is one of the sub-group matches return a
468 ;;; list-expression that will provide the match.
470 (defun regex-quoted (char-string &optional (invert nil))
471 "Usage: (regex-quoted <char-string> &optional invert)
472 Returns either the quoted character or a simple bit vector of bits set for
473 the matching values"
474 (let ((first (char char-string 0))
475 (result (char char-string 0))
476 (used-length 1))
477 (cond ((eql first #\n)
478 (setf result #\newline))
479 ((eql first #\c)
480 (setf result #\return))
481 ((eql first #\t)
482 (setf result #\tab))
483 ((eql first #\d)
484 (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
485 ((eql first #\D)
486 (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
487 ((eql first #\w)
488 (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
489 ((eql first #\W)
490 (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
491 ((eql first #\b)
492 (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
493 ((eql first #\B)
494 (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
495 ((eql first #\s)
496 (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000))
497 ((eql first #\S)
498 (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111))
499 ((and (>= (char-code first) (char-code #\0))
500 (<= (char-code first) (char-code #\9)))
501 (if (and (> (length char-string) 2)
502 (and (>= (char-code (char char-string 1)) (char-code #\0))
503 (<= (char-code (char char-string 1)) (char-code #\9))
504 (>= (char-code (char char-string 2)) (char-code #\0))
505 (<= (char-code (char char-string 2)) (char-code #\9))))
507 ;; It is a single character specified in octal
509 (progn
510 (setf result (do ((x 0 (1+ x))
511 (return 0))
512 ((= x 2) return)
513 (setf return (+ (* return 8)
514 (- (char-code (char char-string x))
515 (char-code #\0))))))
516 (setf used-length 3))
518 ;; We have a group number replacement.
520 (let ((group (- (char-code first) (char-code #\0))))
521 (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group))
522 (cadr (aref *regex-groups* ,group)))))
523 (if (< length (+ index (length nstring)))
524 (return-from compare nil))
525 (if (not (string= string nstring
526 :start1 index
527 :end1 (+ index (length nstring))))
528 (return-from compare nil)
529 (incf index (length nstring)))))))))
531 (setf result first)))
532 (if (and (vectorp result) invert)
533 (bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t))
534 (values result used-length)))