base64 und hex
[cxml-rng.git] / clex.lisp
blobdff32bca18d2681d774b83683de463875f24361c
1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CLEX; -*-
2 ;;; --------------------------------------------------------------------------------------
3 ;;; Title: A flex like scanner generator for Common LISP
4 ;;; Created: 1997-10-12
5 ;;; Author: Gilbert Baumann <gilbert@base-engineering.com>
6 ;;; License: LGPL (See file COPYING for details).
7 ;;; --------------------------------------------------------------------------------------
8 ;;; (c) copyright 1997-1999 by Gilbert Baumann
10 ;;; Permission is hereby granted, free of charge, to any person obtaining
11 ;;; a copy of this software and associated documentation files (the
12 ;;; "Software"), to deal in the Software without restriction, including
13 ;;; without limitation the rights to use, copy, modify, merge, publish,
14 ;;; distribute, sublicense, and/or sell copies of the Software, and to
15 ;;; permit persons to whom the Software is furnished to do so, subject to
16 ;;; the following conditions:
17 ;;;
18 ;;; The above copyright notice and this permission notice shall be
19 ;;; included in all copies or substantial portions of the Software.
20 ;;;
21 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 ;;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 ;;;
30 ;;; Changes
32 ;; When Who What
33 ;; ----------------------------------------------------------------------------
34 ;; 2007-04-29 DFL - Represent RANGE directly to cope with character
35 ;; set sizes typical for Unicode.
36 ;; - Disable *full-table-p* by default.
37 ;; - Added SBCL case to the CMUCL workarounds.
39 (defpackage :clex
40 (:use :cl :runes)
41 (:export
42 #:deflexer #:backup #:begin #:initial #:bag))
44 (in-package :CLEX)
46 ;;; NOTE -- It turns out that this code is a magintude slower under CMUCL
47 ;;; compared to CLISP or ACL. Probably they do not have a good implementation of
48 ;;; bit vectors.
50 ;;; We encode our FSA's directly as linked datastructures; A state is represented by:
52 (defstruct (state (:type vector))
53 (final 0)
54 transitions ;simple alist of (sigma . next-state)
55 id ;numeric id of state
56 eps-transitions) ;list of all states reached by epsilon (empty transitions)
58 (defun destructure-range (x)
59 (if (listp x)
60 (values (car x) (cadr x))
61 (values x x)))
63 (defun range- (a b)
64 (multiple-value-bind (amin amax) (destructure-range a)
65 (multiple-value-bind (bmin bmax) (destructure-range b)
66 (incf amax)
67 (incf bmax)
68 (let ((result nil))
69 (flet ((range* (min max)
70 (when (< min max)
71 (push (list min (1- max)) result))))
72 (range* amin (min bmin amax))
73 (range* (max amin bmax) amax))
74 result))))
76 (defun ranges- (aa b)
77 (mapcan (lambda (a) (range- a b)) aa))
79 (defun partition-range (a pos)
80 (multiple-value-bind (min max) (destructure-range a)
81 (if (and (< min pos) (<= pos max))
82 (list (list min (1- pos))
83 (list pos max))
84 (list a))))
86 (defun code (x)
87 (typecase x
88 (integer x)
89 (character (char-code x))))
91 (defun parse-range (range)
92 (if (listp range)
93 (list (code (car range)) (code (cadr range)))
94 (list (code range) (code range))))
96 (defun state-add-link (this range that)
97 "Add a transition to state `this'; reading `range' proceeds to `that'."
98 (cond ((eq range 'eps)
99 (pushnew that (state-eps-transitions this)))
101 (let ((new (list (parse-range range))))
102 (dolist (k (state-transitions this)
103 (push (cons new that) (state-transitions this)))
104 (when (eq (cdr k) that)
105 (dolist (l (car k)) ;avoid duplicates
106 (setf new (ranges- new l)))
107 (setf (car k) (append new (car k)))
108 (return nil)))
109 ;; split existing ranges to remove overlap
110 (dolist (k (state-transitions this))
111 (flet ((doit (pos)
112 (setf (car k)
113 (mapcan (lambda (l)
114 (partition-range l pos))
115 (car k)))))
116 (dolist (n new)
117 (doit (car n))
118 (doit (1+ (cadr n))))))))))
120 ;;; When constructing FSA's from regular expressions we abstract by the notation
121 ;;; of FSA's as boxen with an entry and an exit state.
123 (defstruct fsa
124 start ;entry state
125 end) ;exit state
127 (defun fsa-empty ()
128 "Accepts the empty word."
129 (let ((q (make-state)))
130 (make-fsa :start q :end q)))
132 (defun fsa-trivial (char)
133 "Accepts the trivial word consisting out of exactly one `char'."
134 (let ((q0 (make-state))
135 (q1 (make-state)))
136 (state-add-link q0 char q1)
137 (make-fsa :start q0 :end q1)))
139 (defun fsa-concat (a1 a2)
140 "Concatenation of `a1' and `a2'. Hence `a1 a2'."
141 (state-add-link (fsa-end a1) 'eps (fsa-start a2))
142 (make-fsa :start (fsa-start a1)
143 :end (fsa-end a2)))
145 (defun fsa-iterate (a)
146 "Iteration of `a'. Hence `a*'"
147 (let ((q0 (make-state))
148 (q1 (make-state)))
149 (state-add-link q0 'eps (fsa-start a))
150 (state-add-link q0 'eps q1)
151 (state-add-link q1 'eps q0)
152 (state-add-link (fsa-end a) 'eps q1)
153 (make-fsa :start q0 :end q1)))
155 (defun fsa-branch (&rest as)
156 "Alternation of a0..an; Hence `a0 | a1 | ... | an'."
157 (let ((q0 (make-state))
158 (q1 (make-state)))
159 (dolist (a as)
160 (state-add-link q0 'eps (fsa-start a))
161 (state-add-link (fsa-end a) 'eps q1))
162 (make-fsa :start q0 :end q1)))
164 ;;;; ----------------------------------------------------------------------------------------------------
165 ;;;; Converting regular expressions to (ND)FSA
166 ;;;;
168 ;;; However we choose here a Lispy syntax for regular expressions:
170 ;;; a singelton
171 ;;; (and a0 .. an) concatation
172 ;;; (or a0 .. an) alternation
173 ;;; (* a) iteration
175 ;;; Further the abbrevs.:
176 ;;; (+ a) == (and a (* a))
177 ;;; (? a) == (or a (and))
178 ;;; (a0 ... an) == (and a0 ... an)
180 ;;; When a string embeded into a regular expression is seen, the list
181 ;;; of characters is spliced in. So formally:
182 ;;; (a0 .. ai "xyz" aj .. an) == (a0 .. ai #\x #\y #\z aj .. an)
184 ;;; This is useful for matching words:
185 ;;; "foo" --> (and "foo") --> (and #\f #\o #\o) == The word 'foo'
186 ;;; or for denoting small sets:
187 ;;; (or "+-") --> (or #\+ #\-) == One of '+' or '-'
189 (defun loose-eq (x y)
190 (cond ((eq x y))
191 ((and (symbolp x) (symbolp y))
192 (string= (symbol-name x) (symbol-name y)))))
194 (defun regexp->fsa (term)
195 (setf term (regexp-expand-splicing term))
196 (cond ((and (atom term) (not (stringp term)))
197 (fsa-trivial term))
198 ((loose-eq (car term) 'RANGE)
199 (fsa-trivial (cdr term)))
200 ((loose-eq (car term) 'AND) (regexp/and->fsa term))
201 ((loose-eq (car term) 'OR) (regexp/or->fsa term))
202 ((loose-eq (car term) '*) (fsa-iterate (regexp->fsa (cadr term))))
203 ((loose-eq (car term) '+) (regexp->fsa `(AND ,(cadr term) (* ,(cadr term)))))
204 ((loose-eq (car term) '?) (regexp->fsa `(OR (AND) ,(cadr term))))
206 (regexp->fsa `(AND .,term))) ))
208 (defun regexp/or->fsa (term)
209 ;; I optimize here a bit: I recognized, that ORs are mainly just
210 ;; (large) sets of characters. The extra epsilon transitions are not
211 ;; neccessary on single atoms, so I omit them here. -- This reduces the
212 ;; number of states quite a bit in the first place.
213 (let ((q0 (make-state))
214 (q1 (make-state)))
215 (dolist (a (cdr term))
216 (cond ((atom a)
217 (state-add-link q0 a q1))
218 ((let ((a (regexp->fsa a)))
219 (state-add-link q0 'eps (fsa-start a))
220 (state-add-link (fsa-end a) 'eps q1)))))
221 (make-fsa :start q0 :end q1)))
223 (defun regexp/and->fsa (term)
224 (cond ((null (cdr term)) (fsa-empty))
225 ((null (cddr term)) (regexp->fsa (cadr term)))
226 ((fsa-concat (regexp->fsa (cadr term)) (regexp->fsa `(and .,(cddr term)))))))
228 (defun regexp-expand-splicing (term)
229 (cond ((consp term)
230 (mapcan #'(lambda (x)
231 (cond ((stringp x) (coerce x 'list))
232 ((list x))))
233 term))
234 (t term)))
236 ;;;; ----------------------------------------------------------------------------------------------------
237 ;;;; Converting a ND-FSA to a D-FSA
238 ;;;;
240 ;;; Since we have to compare and unionfy sets of states a lot, I use bit-vectors
241 ;;; to represent these sets for speed. However let me abstract that a bit:
243 ;;; (All these are defined as macros simply for speed. Inlining would be an
244 ;;; option here, when it would be reliable. With defining macros I enforce
245 ;;; inlining).
247 (defmacro make-empty-set (n)
248 "Create the empty set on the domain [0,n)."
249 `(make-array ,n :element-type 'bit :initial-element 0))
251 (defmacro nset-put (bag new)
252 "Destructively calculate bag = bag U {new}."
253 `(setf (sbit (the (simple-array bit (*)) ,bag) (the fixnum ,new)) 1))
255 (defmacro element-of-set-p (elm set)
256 "Determine whether `elm' is element of the set `set'."
257 `(eq 1 (sbit (the (simple-array bit (*)) ,set) (the fixnum ,elm))))
259 (defmacro set-size (set)
260 "Return the upper bound of the domain of `set'."
261 `(length ,set))
263 (defmacro do-bits ((var set &optional result) &body body)
264 "Iterate body with `var' over all elements of `set'."
265 (let ((g/set (gensym)))
266 `(let ((,g/set ,set))
267 (dotimes (,var (set-size ,g/set) ,result)
268 (when (element-of-set-p ,var ,g/set)
269 ,@body)))))
271 ;;; Since the sets we defined above only take non-negative integers, we have to
272 ;;; number our states. This is done once by NUMBER-STATES.
274 (defun number-states (starts)
275 "Number all state reachable form `starts', continuosly from 0. Each state got
276 it's number stuck into the `id' slot.
277 Returns two values: `n' the number of states and `tab' a table to lookup a
278 state given the number it got attached to."
279 (let ((n 0)
280 (tab (make-array 0 :adjustable t :fill-pointer 0 :initial-element nil)))
281 (labels ((walk (x)
282 (unless (state-id x)
283 (vector-push-extend x tab 300)
284 (setf (state-id x) (prog1 n (incf n)))
285 (dolist (tr (state-transitions x))
286 (walk (cdr tr)))
287 (dolist (y (state-eps-transitions x))
288 (walk y)))))
289 (dolist (s starts) (walk s))
290 (values n tab))))
292 ;;; We need to calculate the epsilon closure of a given state. Due to the
293 ;;; precise workings of our algorithm below, we only need this augmenting
294 ;;; version.
296 (defun fsa-epsilon-closure/set (x state-set)
297 "Augment the epsilon closure of the state `state' into `state-set'."
298 (unless (element-of-set-p (state-id x) state-set)
299 (nset-put state-set (state-id x))
300 (dolist (k (state-eps-transitions x))
301 (fsa-epsilon-closure/set k state-set))))
303 (defun ndfsa->dfsa (starts)
304 (let ((batch nil)
305 (known nil))
306 (multiple-value-bind (n tab) (number-states starts)
307 (labels ((name-state-set (state-set)
308 (or (cdr (assoc state-set known :test #'equal))
309 (let ((new (make-state)))
310 (push (cons state-set new) known)
311 (push state-set batch)
312 new)))
313 (add-state-set (state-set)
314 (let ((new-tr (make-hash-table :test 'equal))
315 (new-tr-real nil)
316 (name (name-state-set state-set))
317 (new-final 0))
318 (do-bits (s0 state-set)
319 (let ((s (aref tab s0)))
320 (setf new-final (max new-final (state-final s)))
321 (dolist (tr (state-transitions s))
322 (let ((to (cdr tr)))
323 (dolist (z (car tr))
324 (let ((looked (gethash z new-tr)))
325 (if looked
326 (fsa-epsilon-closure/set to looked)
327 (let ((sts (make-empty-set n)))
328 (fsa-epsilon-closure/set to sts)
329 (setf (gethash z new-tr) sts)))))))))
330 (do ((q (frob2 new-tr) (cddr q)))
331 ((null q))
332 (let ((z (car q))
333 (to (cadr q)))
334 (push (cons z (name-state-set to)) new-tr-real)))
335 (setf (state-transitions name) new-tr-real
336 (state-final name) new-final))))
337 (prog1
338 (mapcar #'(lambda (s)
339 (name-state-set (let ((sts (make-empty-set n)))
340 (fsa-epsilon-closure/set s sts)
341 sts)))
342 starts)
343 (do ()
344 ((null batch))
345 (add-state-set (pop batch)))) ))))
347 (defun frob2 (res &aux res2)
348 (maphash (lambda (z to)
349 (do ((p res2 (cddr p)))
350 ((null p)
351 (setf res2 (list* (list z) to res2)))
352 (when (equal to (cadr p))
353 (setf (car p) (cons z (car p)))
354 (return))))
355 res)
356 res2)
358 ;;;; ----------------------------------------------------------------------------------------------------
359 ;;;; API
360 ;;;;
362 ;;; Features to think about:
363 ;;; - case insensitive scanner
364 ;;; - compression of tables
365 ;;; - debugging aids
366 ;;; - non-interactive high speed scanning?
367 ;;; - make BAG a macro? So that non used bags are not considered?
368 ;;; - REJECT?
369 ;;; - support for include?
370 ;;; - support for putting back input?
371 ;;; - count lines/columns? Track source?
372 ;;; - richer set of regexp primitives e.g. "[a-z]" style sets
373 ;;; - could we offer complement regexp?
374 ;;; - trailing context
375 ;;; - sub-state stacks?
376 ;;; - user variables to include ['global' / 'lexical']
377 ;;; - identifing sub-expression of regexps (ala \(..\) and \n)
380 #-(OR CMU SBCL GCL)
381 (defun loadable-states-form (starts)
382 `',starts)
384 #+(OR CMU SBCL GCL)
385 ;; Leider ist das CMUCL so dumm, dass es scheinbar nicht faehig ist die
386 ;; selbstbezuegliche Structur ',starts in ein FASL file zu dumpen ;-(
387 ;; Deswegen hier dieser read-from-string Hack.
388 (defun loadable-states-form (starts)
389 `(LET ((*PACKAGE* (FIND-PACKAGE ',(package-name *package*))))
390 (READ-FROM-STRING ',(let ((*print-circle* t)
391 (*print-readably* t)
392 (*print-pretty* nil))
393 (prin1-to-string starts)))))
395 ;;;; ----------------------------------------------------------------------------------------------------
396 ;;;;
398 (defun parse-char-set (string i)
399 (let ((res nil)
400 (complement-p nil))
401 (incf i) ;skip '['
402 ;;the first char is special
403 (cond ((char= (char string i) #\]) (incf i) (push #\] res))
404 ((char= (char string i) #\^) (incf i) (setq complement-p t))
405 ((char= (char string i) #\-) (incf i) (push #\- res)))
406 (do ()
407 ((char= (char string i) #\])
408 (values (if complement-p (cons 'cset res) (cons 'set res)) (+ i 1)))
409 (cond ((char= (char string (+ i 1)) #\-)
410 ;;it's a range
411 (push (cons (char string i) (char string (+ i 2))) res)
412 (incf i 3))
414 ;;singleton
415 (push (char string i) res)
416 (incf i))))))
418 ;;;; ------------------------------------------------------------------------------------------
420 (defparameter *full-table-p* nil)
422 (defun mungle-transitions (trs)
423 (if *full-table-p*
424 (let ((res (make-array 256 :initial-element nil)))
425 (dolist (tr trs)
426 (dolist (range (car tr))
427 (loop
428 for code from (car range) to (cadr range)
429 do (setf (aref res code) (cdr tr)))))
430 res)
431 trs))
433 (defun over-all-states (fun starts)
434 ;; Apply `fun' to each state reachable from starts.
435 (let ((yet nil))
436 (labels ((walk (q)
437 (unless (member q yet)
438 (push q yet)
439 (let ((trs (state-transitions q)))
440 (funcall fun q)
441 (dolist (tr trs)
442 (walk (cdr tr)))))))
443 (mapc #'walk starts))))
445 (defmacro deflexer (name macro-defs &rest rule-defs)
446 (let ((macros nil) starts clauses (n-fin 0))
447 (dolist (k macro-defs)
448 (push (cons (car k) (sublis macros (cadr k))) macros))
449 ;;canon clauses -- each element of rule-defs becomes (start expr end action)
450 (setq rule-defs
451 (mapcar #'(lambda (x)
452 (cond ((and (consp (car x)) (string-equal (caar x) :in))
453 (list (cadar x) (sublis macros (caddar x)) (progn (incf n-fin) n-fin) (cdr x)))
454 ((list 'initial (sublis macros (car x)) (progn (incf n-fin) n-fin) (cdr x)))))
455 (reverse rule-defs)))
456 ;;collect all start states in alist (<name> . <state>)
457 (setq starts (mapcar #'(lambda (name)
458 (cons name (make-state)))
459 (remove-duplicates (mapcar #'car rule-defs))))
460 ;;build the nd-fsa's
461 (dolist (r rule-defs)
462 (destructuring-bind (start expr end action) r
463 (let ((q0 (cdr (assoc start starts)))
464 (fsa (regexp->fsa `(and ,expr))))
465 ;;link start state
466 (state-add-link q0 'eps (fsa-start fsa))
467 ;;mark final state
468 (setf (state-final (fsa-end fsa)) end)
469 ;; build a clause for CASE
470 (push `((,end) .,action) clauses))))
471 ;; hmm... we have to sort the final states after building the dfsa
472 ;; or introduce fixnum identifier and instead of union take the minimum
473 ;; above in ndfsa->dfsa.
474 (progn
475 (mapcar #'(lambda (x y) (setf (cdr x) y))
476 starts (ndfsa->dfsa (mapcar #'cdr starts))))
477 ;;(terpri)(princ `(,(number-states starts) states))(finish-output)
478 (let ((n 0))
479 (over-all-states (lambda (state)
480 (incf n)
481 (setf (state-transitions state)
482 (mungle-transitions (state-transitions state))))
483 (mapcar #'cdr starts))
484 (format T "~&~D states." n))
485 `(DEFUN ,(intern (format nil "MAKE-~A-LEXER" name)) (INPUT)
486 (LET* ((STARTS ,(loadable-states-form starts))
487 (SUB-STATE 'INITIAL)
488 (STATE NIL)
489 (LOOK-AHEAD NIL)
490 (BAGG/CH (MAKE-ARRAY 100 :FILL-POINTER 0 :ADJUSTABLE T
491 :ELEMENT-TYPE 'CHARACTER))
492 (BAGG/STATE (MAKE-ARRAY 100 :FILL-POINTER 0 :ADJUSTABLE T))
493 (CH NIL))
494 #'(LAMBDA ()
495 (BLOCK NIL
496 (LABELS ((BEGIN (X)
497 (SETQ SUB-STATE X))
498 (BACKUP (CH)
499 (COND ((STRINGP CH)
500 (WHEN (> (LENGTH CH) 0)
501 (PUSH (CONS 0 CH) LOOK-AHEAD)))
502 (T (PUSH CH LOOK-AHEAD))))
503 (PUSH* (CH STATE)
504 (VECTOR-PUSH-EXTEND CH BAGG/CH 10)
505 (VECTOR-PUSH-EXTEND STATE BAGG/STATE 10) )
506 (POP*/CH ()
507 (LET ((FP (LENGTH BAGG/CH)))
508 (PROG1 (CHAR BAGG/CH (1- FP))
509 (SETF (FILL-POINTER BAGG/STATE) (1- FP))
510 (SETF (FILL-POINTER BAGG/CH) (1- FP)))))
511 (TOS*/STATE ()
512 (AREF BAGG/STATE (1- (LENGTH BAGG/STATE))) )
513 (EMPTY*? ()
514 (= (LENGTH BAGG/CH) 0))
515 (REWIND* ()
516 (SETF (FILL-POINTER BAGG/CH) 0)
517 (SETF (FILL-POINTER BAGG/STATE) 0) )
518 (STRING* ()
519 (COPY-SEQ BAGG/CH))
520 (GETCH ()
521 (COND ((NULL LOOK-AHEAD) (READ-CHAR INPUT NIL NIL))
522 ((CONSP (CAR LOOK-AHEAD))
523 (LET ((S (CDAR LOOK-AHEAD)))
524 (PROG1
525 (CHAR S (CAAR LOOK-AHEAD))
526 (INCF (CAAR LOOK-AHEAD))
527 (WHEN (= (CAAR LOOK-AHEAD) (LENGTH S))
528 (POP LOOK-AHEAD)))))
529 (T (POP LOOK-AHEAD)) ))
530 ,(if *full-table-p*
531 `(FIND-NEXT-STATE (STATE CH)
532 (IF (CHARACTERP CH)
533 (SVREF (STATE-TRANSITIONS STATE) (CHAR-CODE CH))
534 NIL))
535 `(FIND-NEXT-STATE (STATE CH)
536 (WHEN ch
537 (BLOCK FOO
538 (DOLIST (K (STATE-TRANSITIONS STATE))
539 (DOLIST (Q (CAR K))
540 (WHEN (<= (CAR Q) (CHAR-CODE CH) (CADR q))
541 (RETURN-FROM FOO (CDR K))))))))) )
542 (DECLARE (INLINE BACKUP GETCH FIND-NEXT-STATE)
543 (IGNORABLE #'BEGIN))
544 (TAGBODY
545 START (SETQ STATE (CDR (ASSOC SUB-STATE STARTS)))
546 (WHEN (NULL STATE)
547 (ERROR "Sub-state ~S is not defined." SUB-STATE))
548 (REWIND*)
549 LOOP (SETQ CH (GETCH))
550 (LET ((NEXT-STATE (FIND-NEXT-STATE STATE CH)) )
551 (COND ((NULL NEXT-STATE)
552 (BACKUP CH)
553 (DO ()
554 ((OR (EMPTY*?) (NOT (EQ 0 (TOS*/STATE)))))
555 (BACKUP (POP*/CH)))
556 (COND ((AND (EMPTY*?) (NULL CH))
557 (RETURN :EOF))
558 ((EMPTY*?)
559 (ERROR "oops at ~A: ~S ~S"
560 (file-position (cxml-rng::stream-source INPUT))
562 (mapcar #'car (state-transitions state))))
564 (LET ((HALTING-STATE (TOS*/STATE)))
565 (LET ((BAG* NIL))
566 (SYMBOL-MACROLET ((BAG (IF BAG*
567 BAG*
568 (SETF BAG* (STRING*)))))
569 (CASE HALTING-STATE
570 ,@clauses)))
571 (GO START)))))
573 (PUSH* CH (STATE-FINAL NEXT-STATE))
574 (SETQ STATE NEXT-STATE)
575 (GO LOOP))))))))))))