whitespace fixes
[cxml-rng.git] / nppcre.lisp
blob106eb89cf9243f4cea4ba5f01b97cd7a03c33f78
1 ;;; -*- show-trailing-whitespace: t; indent-tabs: nil -*-
3 ;;; Copyright (c) 2002-2005, Dr. Edmund Weitz. All rights reserved.
5 ;;; hacked for XSD regular expressions by David Lichteblau in 2007:
7 ;;; - no comments and extended stuff
8 ;;; - no (?
9 ;;; - no greedyness modifier
10 ;;; - fewer and different backslash-escapes: \i \I \c \C \.
11 ;;; - character set substraction: [foo-[bar]]
12 ;;; - no ^ and $, but always wrap those around the complete parse tree
13 ;;; - ...
15 ;;; Derived from:
16 ;;; /usr/local/cvsrep/cl-ppcre/lexer.lisp,v 1.24 2005/04/01 21:29:09 edi Exp
17 ;;; and
18 ;;; /usr/local/cvsrep/cl-ppcre/parser.lisp,v 1.21 2005/08/03 21:11:27 edi Exp
20 ;;; Redistribution and use in source and binary forms, with or without
21 ;;; modification, are permitted provided that the following conditions
22 ;;; are met:
24 ;;; * Redistributions of source code must retain the above copyright
25 ;;; notice, this list of conditions and the following disclaimer.
27 ;;; * Redistributions in binary form must reproduce the above
28 ;;; copyright notice, this list of conditions and the following
29 ;;; disclaimer in the documentation and/or other materials
30 ;;; provided with the distribution.
32 ;;; THIS SOFTWARE IS PROVIDED BY THE AUTHOR 'AS IS' AND ANY EXPRESSED
33 ;;; OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
34 ;;; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
35 ;;; ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
36 ;;; DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
37 ;;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
38 ;;; GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
39 ;;; INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
40 ;;; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
41 ;;; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42 ;;; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44 (in-package :cxml-types)
46 (eval-when (:compile-toplevel :load-toplevel :execute)
47 (defparameter *standard-optimize-settings* '(optimize)))
49 (defvar *in-pattern-parser-p* nil)
51 (defvar *convert-char-class-to-hash* #'cl-ppcre::convert-char-class-to-hash)
53 ;;; zzz Evil hack!
55 (format t "Patching CL-PPCRE::CONVERT-CHAR-CLASS-TO-HASH~%")
56 (setf (fdefinition 'cl-ppcre::convert-char-class-to-hash)
57 (lambda (list)
58 (when *in-pattern-parser-p*
59 (setf list (mapcan (lambda (x)
60 (if (symbolp x)
61 (symbol-value x)
62 x))
63 list)))
64 (funcall *convert-char-class-to-hash* list)))
66 (defun signal-ppcre-syntax-error (fmt &rest args)
67 (error "invalid pattern: ~?" fmt args))
69 (defun signal-ppcre-syntax-error* (pos fmt &rest args)
70 (error "invalid pattern at ~D: ~?" pos fmt args))
72 (defmacro maybe-coerce-to-simple-string (string)
73 (let ((=string= (gensym)))
74 `(let ((,=string= ,string))
75 (cond ((simple-string-p ,=string=)
76 ,=string=)
78 (coerce ,=string= 'simple-string))))))
80 (defun map-char-to-special-char-class (chr lexer)
81 (declare #.*standard-optimize-settings*)
82 "Maps escaped characters like \"\\d\" to the tokens which represent
83 their associated character classes."
84 (case chr
85 (#\. '\.)
86 (#\s '\\s) (#\i '\\i) (#\c '\\c) (#\d '\\d) (#\w '\\w)
87 (#\S '^s) (#\I '^i) (#\C '^c) (#\D '^d) (#\W '^w)
88 (#\p
89 (unless (eql (next-char lexer) #\{)
90 (signal-ppcre-syntax-error "Missing open brace after \\p"))
91 (let* ((bag (loop
92 for c = (next-char lexer)
93 for last = (eql c #\})
94 and done = nil then last
95 until done
96 unless c do
97 (signal-ppcre-syntax-error
98 "Missing close brace after \\p")
99 collect c))
100 (bag (coerce (list* #\p #\{ bag) 'string)))
101 (or (find-symbol bag 'cxml-types)
102 (signal-ppcre-syntax-error "Invalid character property: ~A"
103 bag))))))
105 (locally
106 (declare #.*standard-optimize-settings*)
107 (defstruct (lexer (:constructor make-lexer-internal))
108 "LEXER structures are used to hold the regex string which is
109 currently lexed and to keep track of the lexer's state."
110 (str ""
111 :type string
112 :read-only t)
113 (len 0
114 :type fixnum
115 :read-only t)
116 (pos 0
117 :type fixnum)
118 (last-pos nil
119 :type list)))
121 (defun make-lexer (string)
122 (declare (inline make-lexer-internal)
123 #-genera (type string string))
124 (make-lexer-internal :str (maybe-coerce-to-simple-string string)
125 :len (length string)))
127 (declaim (inline end-of-string-p))
128 (defun end-of-string-p (lexer)
129 (declare #.*standard-optimize-settings*)
130 "Tests whether we're at the end of the regex string."
131 (<= (lexer-len lexer)
132 (lexer-pos lexer)))
134 (declaim (inline looking-at-p))
135 (defun looking-at-p (lexer chr)
136 (declare #.*standard-optimize-settings*)
137 "Tests whether the next character the lexer would see is CHR.
138 Does not respect extended mode."
139 (and (not (end-of-string-p lexer))
140 (char= (schar (lexer-str lexer) (lexer-pos lexer))
141 chr)))
143 (declaim (inline next-char-non-extended))
144 (defun next-char-non-extended (lexer)
145 (declare #.*standard-optimize-settings*)
146 "Returns the next character which is to be examined and updates the
147 POS slot. Does not respect extended mode."
148 (cond ((end-of-string-p lexer)
149 nil)
151 (prog1
152 (schar (lexer-str lexer) (lexer-pos lexer))
153 (incf (lexer-pos lexer))))))
155 (defun next-char (lexer)
156 (declare #.*standard-optimize-settings*)
157 (next-char-non-extended lexer))
159 (declaim (inline fail))
160 (defun fail (lexer)
161 (declare #.*standard-optimize-settings*)
162 "Moves (LEXER-POS LEXER) back to the last position stored in
163 \(LEXER-LAST-POS LEXER) and pops the LAST-POS stack."
164 (unless (lexer-last-pos lexer)
165 (signal-ppcre-syntax-error "LAST-POS stack of LEXER ~A is empty" lexer))
166 (setf (lexer-pos lexer) (pop (lexer-last-pos lexer)))
167 nil)
169 (defun get-number (lexer &key (radix 10) max-length no-whitespace-p)
170 (declare #.*standard-optimize-settings*)
171 "Read and consume the number the lexer is currently looking at and
172 return it. Returns NIL if no number could be identified.
173 RADIX is used as in PARSE-INTEGER. If MAX-LENGTH is not NIL we'll read
174 at most the next MAX-LENGTH characters. If NO-WHITESPACE-P is not NIL
175 we don't tolerate whitespace in front of the number."
176 (when (or (end-of-string-p lexer)
177 (and no-whitespace-p
178 (not (find (schar (lexer-str lexer) (lexer-pos lexer))
179 "0123456789"))))
180 (return-from get-number nil))
181 (multiple-value-bind (integer new-pos)
182 (parse-integer (lexer-str lexer)
183 :start (lexer-pos lexer)
184 :end (if max-length
185 (let ((end-pos (+ (lexer-pos lexer)
186 (the fixnum max-length)))
187 (lexer-len (lexer-len lexer)))
188 (if (< end-pos lexer-len)
189 end-pos
190 lexer-len))
191 (lexer-len lexer))
192 :radix radix
193 :junk-allowed t)
194 (cond ((and integer (>= (the fixnum integer) 0))
195 (setf (lexer-pos lexer) new-pos)
196 integer)
197 (t nil))))
199 (declaim (inline make-char-from-code))
200 (defun make-char-from-code (number error-pos)
201 (declare #.*standard-optimize-settings*)
202 "Create character from char-code NUMBER. NUMBER can be NIL
203 which is interpreted as 0. ERROR-POS is the position where
204 the corresponding number started within the regex string."
205 ;; only look at rightmost eight bits in compliance with Perl
206 (let ((code (logand #o377 (the fixnum (or number 0)))))
207 (or (and (< code char-code-limit)
208 (code-char code))
209 (signal-ppcre-syntax-error*
210 error-pos
211 "No character for hex-code ~X"
212 number))))
214 (defun unescape-char (lexer)
215 (declare #.*standard-optimize-settings*)
216 "Convert the characters(s) following a backslash into a token
217 which is returned. This function is to be called when the backslash
218 has already been consumed. Special character classes like \\W are
219 handled elsewhere."
220 (when (end-of-string-p lexer)
221 (signal-ppcre-syntax-error "String ends with backslash"))
222 (let ((chr (next-char-non-extended lexer)))
223 (case chr
224 ;; the following five character names are 'semi-standard'
225 ;; according to the CLHS but I'm not aware of any implementation
226 ;; that doesn't implement them
227 ((#\t)
228 #\Tab)
229 ((#\n)
230 #\Newline)
231 ((#\r)
232 #\Return)
233 (otherwise
234 ;; all other characters aren't affected by a backslash
235 chr))))
237 (defun convert-substraction (r s)
238 (flet ((rangify (x)
239 (etypecase x
240 (character `((:range ,x ,x)))
241 (list (assert (eq (car x) :range)) (list x))
242 (symbol (copy-list (symbol-value x))))))
243 (ranges- (mapcan #'rangify r) (mapcan #'rangify s))))
245 (defun collect-char-class (lexer)
246 (declare #.*standard-optimize-settings*)
247 "Reads and consumes characters from regex string until a right
248 bracket is seen. Assembles them into a list \(which is returned) of
249 characters, character ranges, like \(:RANGE #\\A #\\E) for a-e, and
250 tokens representing special character classes."
251 (let ((start-pos (lexer-pos lexer)) ; remember start for error message
252 hyphen-seen
253 last-char
254 list)
255 (flet ((handle-char (c)
256 "Do the right thing with character C depending on whether
257 we're inside a range or not."
258 (cond ((and hyphen-seen last-char)
259 (setf (car list) (list :range last-char c)
260 last-char nil))
262 (push c list)
263 (setq last-char c)))
264 (setq hyphen-seen nil)))
265 (loop for first = t then nil
266 for c = (next-char-non-extended lexer)
267 ;; leave loop if at end of string
268 while c
269 do (cond
270 ((char= c #\\)
271 ;; we've seen a backslash
272 (let ((next-char (next-char-non-extended lexer)))
273 (case next-char
274 ((#\. #\i #\I #\c #\C #\d #\D #\w #\W #\s #\S #\p)
275 ;; a special character class
276 (push (map-char-to-special-char-class next-char lexer)
277 list)
278 ;; if the last character was a hyphen
279 ;; just collect it literally
280 (when hyphen-seen
281 (push #\- list))
282 ;; if the next character is a hyphen do the same
283 (when (looking-at-p lexer #\-)
284 (incf (lexer-pos lexer))
285 (when (looking-at-p lexer #\[)
286 (incf (lexer-pos lexer))
287 (return-from collect-char-class
288 (prog1
289 (convert-substraction
290 (nreverse list)
291 (collect-char-class lexer))
292 (unless
293 (eql (next-char-non-extended lexer) #\])
294 (signal-ppcre-syntax-error*
295 start-pos
296 "Missing right bracket to close character class")))))
297 (push #\- list))
298 (setq hyphen-seen nil))
299 (otherwise
300 ;; otherwise unescape the following character(s)
301 (decf (lexer-pos lexer))
302 (handle-char (unescape-char lexer))))))
303 (first
304 ;; the first character must not be a right bracket
305 ;; and isn't treated specially if it's a hyphen
306 (handle-char c))
307 ((char= c #\])
308 ;; end of character class
309 ;; make sure we collect a pending hyphen
310 (when hyphen-seen
311 (setq hyphen-seen nil)
312 (handle-char #\-))
313 ;; reverse the list to preserve the order intended
314 ;; by the author of the regex string
315 (return-from collect-char-class (nreverse list)))
316 ((and hyphen-seen (char= c #\[))
317 (return-from collect-char-class
318 (prog1
319 (convert-substraction
320 (nreverse list)
321 (collect-char-class lexer))
322 (unless (eql (next-char-non-extended lexer) #\])
323 (signal-ppcre-syntax-error*
324 start-pos
325 "Missing right bracket to close character class")))))
326 ((and (char= c #\-)
327 last-char
328 (not hyphen-seen))
329 ;; if the last character was 'just a character'
330 ;; we expect to be in the middle of a range
331 (setq hyphen-seen t))
332 ((char= c #\-)
333 ;; otherwise this is just an ordinary hyphen
334 (handle-char #\-))
336 ;; default case - just collect the character
337 (handle-char c))))
338 ;; we can only exit the loop normally if we've reached the end
339 ;; of the regex string without seeing a right bracket
340 (signal-ppcre-syntax-error*
341 start-pos
342 "Missing right bracket to close character class"))))
344 (defun get-quantifier (lexer)
345 (declare #.*standard-optimize-settings*)
346 "Returns a list of two values (min max) if what the lexer is looking
347 at can be interpreted as a quantifier. Otherwise returns NIL and
348 resets the lexer to its old position."
349 ;; remember starting position for FAIL and UNGET-TOKEN functions
350 (push (lexer-pos lexer) (lexer-last-pos lexer))
351 (let ((next-char (next-char lexer)))
352 (case next-char
353 ((#\*)
354 ;; * (Kleene star): match 0 or more times
355 '(0 nil))
356 ((#\+)
357 ;; +: match 1 or more times
358 '(1 nil))
359 ((#\?)
360 ;; ?: match 0 or 1 times
361 '(0 1))
362 ((#\{)
363 ;; one of
364 ;; {n}: match exactly n times
365 ;; {n,}: match at least n times
366 ;; {n,m}: match at least n but not more than m times
367 ;; note that anything not matching one of these patterns will
368 ;; be interpreted literally - even whitespace isn't allowed
369 (let ((num1 (get-number lexer :no-whitespace-p t)))
370 (if num1
371 (let ((next-char (next-char-non-extended lexer)))
372 (case next-char
373 ((#\,)
374 (let* ((num2 (get-number lexer :no-whitespace-p t))
375 (next-char (next-char-non-extended lexer)))
376 (case next-char
377 ((#\})
378 ;; this is the case {n,} (NUM2 is NIL) or {n,m}
379 (list num1 num2))
380 (otherwise
381 (fail lexer)))))
382 ((#\})
383 ;; this is the case {n}
384 (list num1 num1))
385 (otherwise
386 (fail lexer))))
387 ;; no number following left curly brace, so we treat it
388 ;; like a normal character
389 (fail lexer))))
390 ;; cannot be a quantifier
391 (otherwise
392 (fail lexer)))))
394 (defun get-token (lexer)
395 (declare #.*standard-optimize-settings*)
396 "Returns and consumes the next token from the regex string (or NIL)."
397 ;; remember starting position for UNGET-TOKEN function
398 (push (lexer-pos lexer)
399 (lexer-last-pos lexer))
400 (let ((next-char (next-char lexer)))
401 (cond (next-char
402 (case next-char
403 ;; the easy cases first - the following six characters
404 ;; always have a special meaning and get translated
405 ;; into tokens immediately
406 ((#\))
407 :close-paren)
408 ((#\|)
409 :vertical-bar)
410 ((#\?)
411 :question-mark)
412 ((#\.)
413 :everything)
414 ((#\+ #\*)
415 ;; quantifiers will always be consumend by
416 ;; GET-QUANTIFIER, they must not appear here
417 (signal-ppcre-syntax-error*
418 (1- (lexer-pos lexer))
419 "Quantifier '~A' not allowed"
420 next-char))
421 ((#\{)
422 ;; left brace isn't a special character in it's own
423 ;; right but we must check if what follows might
424 ;; look like a quantifier
425 (let ((this-pos (lexer-pos lexer))
426 (this-last-pos (lexer-last-pos lexer)))
427 (unget-token lexer)
428 (when (get-quantifier lexer)
429 (signal-ppcre-syntax-error*
430 (car this-last-pos)
431 "Quantifier '~A' not allowed"
432 (subseq (lexer-str lexer)
433 (car this-last-pos)
434 (lexer-pos lexer))))
435 (setf (lexer-pos lexer) this-pos
436 (lexer-last-pos lexer) this-last-pos)
437 next-char))
438 ((#\[)
439 ;; left bracket always starts a character class
440 (cons (cond ((looking-at-p lexer #\^)
441 (incf (lexer-pos lexer))
442 :inverted-char-class)
444 :char-class))
445 (collect-char-class lexer)))
446 ((#\\)
447 ;; backslash might mean different things so we have
448 ;; to peek one char ahead:
449 (let ((next-char (next-char-non-extended lexer)))
450 (case next-char
451 ((#\. #\i #\I #\c #\C #\d #\D #\w #\W #\s #\S #\p)
452 ;; these will be treated like character classes
453 (map-char-to-special-char-class next-char lexer))
454 (otherwise
455 ;; in all other cases just unescape the
456 ;; character
457 (decf (lexer-pos lexer))
458 (unescape-char lexer)))))
459 ((#\()
460 :open-paren)
461 (otherwise
462 ;; all other characters are their own tokens
463 next-char)))
464 ;; we didn't get a character (this if the "else" branch from
465 ;; the first IF), so we don't return a token but NIL
467 (pop (lexer-last-pos lexer))
468 nil))))
470 (declaim (notinline unget-token)) ;FIXME: else AVER in GET-TOKEN
471 (defun unget-token (lexer)
472 (declare #.*standard-optimize-settings*)
473 "Moves the lexer back to the last position stored in the LAST-POS stack."
474 (if (lexer-last-pos lexer)
475 (setf (lexer-pos lexer)
476 (pop (lexer-last-pos lexer)))
477 (error "No token to unget \(this should not happen)")))
479 (declaim (inline start-of-subexpr-p))
480 (defun start-of-subexpr-p (lexer)
481 (declare #.*standard-optimize-settings*)
482 "Tests whether the next token can start a valid sub-expression, i.e.
483 a stand-alone regex."
484 (let* ((pos (lexer-pos lexer))
485 (next-char (next-char lexer)))
486 (not (or (null next-char)
487 (prog1
488 (member (the character next-char)
489 '(#\) #\|)
490 :test #'char=)
491 (setf (lexer-pos lexer) pos))))))
493 (defun group (lexer)
494 (declare #.*standard-optimize-settings*)
495 "Parses and consumes a <group>.
496 The productions are: <group> -> \"(\"<regex>\")\"
497 <legal-token>
498 Will return <parse-tree> or (<grouping-type> <parse-tree>) where
499 <grouping-type> is one of six keywords - see source for details."
500 (let ((open-token (get-token lexer)))
501 (cond ((eq open-token :open-paren)
502 (let* ((open-paren-pos (car (lexer-last-pos lexer)))
503 (reg-expr (reg-expr lexer))
504 (close-token (get-token lexer)))
505 (unless (eq close-token :close-paren)
506 ;; the token following <regex> must be the closing
507 ;; parenthesis or this is a syntax error
508 (signal-ppcre-syntax-error*
509 open-paren-pos
510 "Opening paren has no matching closing paren"))
511 (list :register reg-expr)))
513 ;; this is the <legal-token> production; <legal-token> is
514 ;; any token which passes START-OF-SUBEXPR-P (otherwise
515 ;; parsing had already stopped in the SEQ method)
516 open-token))))
518 (defun greedy-quant (lexer)
519 (declare #.*standard-optimize-settings*)
520 "Parses and consumes a <greedy-quant>.
521 The productions are: <greedy-quant> -> <group> | <group><quantifier>
522 where <quantifier> is parsed by the lexer function GET-QUANTIFIER.
523 Will return <parse-tree> or (:GREEDY-REPETITION <min> <max> <parse-tree>)."
524 (let* ((group (group lexer))
525 (token (get-quantifier lexer)))
526 (if token
527 ;; if GET-QUANTIFIER returned a non-NIL value it's the
528 ;; two-element list (<min> <max>)
529 (list :greedy-repetition (first token) (second token) group)
530 group)))
532 (defun quant (lexer)
533 (declare #.*standard-optimize-settings*)
534 (greedy-quant lexer))
536 (defun seq (lexer)
537 (declare #.*standard-optimize-settings*)
538 "Parses and consumes a <seq>.
539 The productions are: <seq> -> <quant> | <quant><seq>.
540 Will return <parse-tree> or (:SEQUENCE <parse-tree> <parse-tree>)."
541 (flet ((make-array-from-two-chars (char1 char2)
542 (let ((string (make-array 2
543 :element-type 'character
544 :fill-pointer t
545 :adjustable t)))
546 (setf (aref string 0) char1)
547 (setf (aref string 1) char2)
548 string)))
549 ;; Note that we're calling START-OF-SUBEXPR-P before we actually try
550 ;; to parse a <seq> or <quant> in order to catch empty regular
551 ;; expressions
552 (if (start-of-subexpr-p lexer)
553 (let ((quant (quant lexer)))
554 (if (start-of-subexpr-p lexer)
555 (let* ((seq (seq lexer))
556 (quant-is-char-p (characterp quant))
557 (seq-is-sequence-p (and (consp seq)
558 (eq (first seq) :sequence))))
559 (cond ((and quant-is-char-p
560 (characterp seq))
561 (make-array-from-two-chars seq quant))
562 ((and quant-is-char-p
563 (stringp seq))
564 (vector-push-extend quant seq)
565 seq)
566 ((and quant-is-char-p
567 seq-is-sequence-p
568 (characterp (second seq)))
569 (cond ((cddr seq)
570 (setf (cdr seq)
571 (cons
572 (make-array-from-two-chars (second seq)
573 quant)
574 (cddr seq)))
575 seq)
576 (t (make-array-from-two-chars (second seq) quant))))
577 ((and quant-is-char-p
578 seq-is-sequence-p
579 (stringp (second seq)))
580 (cond ((cddr seq)
581 (setf (cdr seq)
582 (cons
583 (progn
584 (vector-push-extend quant (second seq))
585 (second seq))
586 (cddr seq)))
587 seq)
589 (vector-push-extend quant (second seq))
590 (second seq))))
591 (seq-is-sequence-p
592 ;; if <seq> is also a :SEQUENCE parse tree we merge
593 ;; both lists into one to avoid unnecessary consing
594 (setf (cdr seq)
595 (cons quant (cdr seq)))
596 seq)
597 (t (list :sequence quant seq))))
598 quant))
599 :void)))
601 (defun reg-expr (lexer)
602 (declare #.*standard-optimize-settings*)
603 "Parses and consumes a <regex>, a complete regular expression.
604 The productions are: <regex> -> <seq> | <seq>\"|\"<regex>.
605 Will return <parse-tree> or (:ALTERNATION <parse-tree> <parse-tree>)."
606 (let ((pos (lexer-pos lexer)))
607 (case (next-char lexer)
608 ((nil)
609 ;; if we didn't get any token we return :VOID which stands for
610 ;; "empty regular expression"
611 :void)
612 ((#\|)
613 ;; now check whether the expression started with a vertical
614 ;; bar, i.e. <seq> - the left alternation - is empty
615 (list :alternation :void (reg-expr lexer)))
616 (otherwise
617 ;; otherwise un-read the character we just saw and parse a
618 ;; <seq> plus the character following it
619 (setf (lexer-pos lexer) pos)
620 (let* ((seq (seq lexer))
621 (pos (lexer-pos lexer)))
622 (case (next-char lexer)
623 ((nil)
624 ;; no further character, just a <seq>
625 seq)
626 ((#\|)
627 ;; if the character was a vertical bar, this is an
628 ;; alternation and we have the second production
629 (let ((reg-expr (reg-expr lexer)))
630 (cond ((and (consp reg-expr)
631 (eq (first reg-expr) :alternation))
632 ;; again we try to merge as above in SEQ
633 (setf (cdr reg-expr)
634 (cons seq (cdr reg-expr)))
635 reg-expr)
636 (t (list :alternation seq reg-expr)))))
637 (otherwise
638 ;; a character which is not a vertical bar - this is
639 ;; either a syntax error or we're inside of a group and
640 ;; the next character is a closing parenthesis; so we
641 ;; just un-read the character and let another function
642 ;; take care of it
643 (setf (lexer-pos lexer) pos)
644 seq)))))))
646 (defun reverse-strings (parse-tree)
647 (declare #.*standard-optimize-settings*)
648 (cond ((stringp parse-tree)
649 (nreverse parse-tree))
650 ((consp parse-tree)
651 (loop for parse-tree-rest on parse-tree
652 while parse-tree-rest
653 do (setf (car parse-tree-rest)
654 (reverse-strings (car parse-tree-rest))))
655 parse-tree)
656 (t parse-tree)))
658 (defun parse-pattern (string)
659 (declare #.*standard-optimize-settings*)
660 "Translate the regex string STRING into a parse tree."
661 (let* ((*in-pattern-parser-p* t)
662 (lexer (make-lexer string))
663 (parse-tree (reverse-strings (reg-expr lexer))))
664 ;; check whether we've consumed the whole regex string
665 (if (end-of-string-p lexer)
666 `(:sequence :start-anchor ,parse-tree :end-anchor)
667 (signal-ppcre-syntax-error*
668 (lexer-pos lexer)
669 "Expected end of string"))))
671 (defmethod pattern-scanner ((str string))
672 (cl-ppcre:create-scanner (parse-pattern str)))
674 (defmethod pattern-scanner ((scanner function))
675 scanner)