1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CLEX; -*-
2 ;;; --------------------------------------------------------------------------------------
3 ;;; Title: A flex like scanner generator for Common LISP
4 ;;; Created: 1997-10-12
5 ;;; Author: Gilbert Baumann <gilbert@base-engineering.com>
6 ;;; License: LGPL (See file COPYING for details).
7 ;;; --------------------------------------------------------------------------------------
8 ;;; (c) copyright 1997-1999 by Gilbert Baumann
10 ;;; Permission is hereby granted, free of charge, to any person obtaining
11 ;;; a copy of this software and associated documentation files (the
12 ;;; "Software"), to deal in the Software without restriction, including
13 ;;; without limitation the rights to use, copy, modify, merge, publish,
14 ;;; distribute, sublicense, and/or sell copies of the Software, and to
15 ;;; permit persons to whom the Software is furnished to do so, subject to
16 ;;; the following conditions:
18 ;;; The above copyright notice and this permission notice shall be
19 ;;; included in all copies or substantial portions of the Software.
21 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 ;;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32 #:deflexer
#:backup
#:begin
#:initial
#:bag
))
36 ;;; NOTE -- It turns out that this code is a magintude slower under CMUCL
37 ;;; compared to CLISP or ACL. Probably they do not have a good implementation of
40 ;;; We encode our FSA's directly as linked datastructures; A state is represented by:
42 (defstruct (state (:type vector
))
44 transitions
;simple alist of (sigma . next-state)
45 id
;numeric id of state
46 eps-transitions
) ;list of all states reached by epsilon (empty transitions)
48 (defun state-add-link (this char that
)
49 "Add a transition to state `this'; reading `char' proceeds to `that'."
51 (pushnew that
(state-eps-transitions this
)))
53 (dolist (k (state-transitions this
)
54 (push (cons (list char
) that
) (state-transitions this
)))
55 (when (eq (cdr k
) that
)
56 (pushnew char
(car k
))
59 ;;; When constructing FSA's from regular expressions we abstract by the notation
60 ;;; of FSA's as boxen with an entry and an exit state.
67 "Accepts the empty word."
68 (let ((q (make-state)))
69 (make-fsa :start q
:end q
)))
71 (defun fsa-trivial (char)
72 "Accepts the trivial word consisting out of exactly one `char'."
73 (let ((q0 (make-state))
75 (state-add-link q0 char q1
)
76 (make-fsa :start q0
:end q1
)))
78 (defun fsa-concat (a1 a2
)
79 "Concatenation of `a1' and `a2'. Hence `a1 a2'."
80 (state-add-link (fsa-end a1
) 'eps
(fsa-start a2
))
81 (make-fsa :start
(fsa-start a1
)
84 (defun fsa-iterate (a)
85 "Iteration of `a'. Hence `a*'"
86 (let ((q0 (make-state))
88 (state-add-link q0
'eps
(fsa-start a
))
89 (state-add-link q0
'eps q1
)
90 (state-add-link q1
'eps q0
)
91 (state-add-link (fsa-end a
) 'eps q1
)
92 (make-fsa :start q0
:end q1
)))
94 (defun fsa-branch (&rest as
)
95 "Alternation of a0..an; Hence `a0 | a1 | ... | an'."
96 (let ((q0 (make-state))
99 (state-add-link q0
'eps
(fsa-start a
))
100 (state-add-link (fsa-end a
) 'eps q1
))
101 (make-fsa :start q0
:end q1
)))
103 ;;;; ----------------------------------------------------------------------------------------------------
104 ;;;; Converting regular expressions to (ND)FSA
107 ;;; However we choose here a Lispy syntax for regular expressions:
110 ;;; (and a0 .. an) concatation
111 ;;; (or a0 .. an) alternation
114 ;;; Further the abbrevs.:
115 ;;; (+ a) == (and a (* a))
116 ;;; (? a) == (or a (and))
117 ;;; (a0 ... an) == (and a0 ... an)
119 ;;; When a string embeded into a regular expression is seen, the list
120 ;;; of characters is spliced in. So formally:
121 ;;; (a0 .. ai "xyz" aj .. an) == (a0 .. ai #\x #\y #\z aj .. an)
123 ;;; This is useful for matching words:
124 ;;; "foo" --> (and "foo") --> (and #\f #\o #\o) == The word 'foo'
125 ;;; or for denoting small sets:
126 ;;; (or "+-") --> (or #\+ #\-) == One of '+' or '-'
128 (defun loose-eq (x y
)
130 ((and (symbolp x
) (symbolp y
))
131 (string= (symbol-name x
) (symbol-name y
)))))
133 (defun regexp->fsa
(term)
134 (setf term
(regexp-expand-splicing term
))
135 (cond ((and (atom term
) (not (stringp term
)))
137 ((loose-eq (car term
) 'AND
) (regexp/and-
>fsa term
))
138 ((loose-eq (car term
) 'OR
) (regexp/or-
>fsa term
))
139 ((loose-eq (car term
) '*) (fsa-iterate (regexp->fsa
(cadr term
))))
140 ((loose-eq (car term
) '+) (regexp->fsa
`(AND ,(cadr term
) (* ,(cadr term
)))))
141 ((loose-eq (car term
) '?
) (regexp->fsa
`(OR (AND) ,(cadr term
))))
142 ((loose-eq (car term
) 'RANGE
)
143 (regexp->fsa
`(OR .
,(loop for i from
(char-code (cadr term
)) to
(char-code (caddr term
))
144 collect
(code-char i
)))))
146 (regexp->fsa
`(AND .
,term
))) ))
148 (defun regexp/or-
>fsa
(term)
149 ;; I optimize here a bit: I recognized, that ORs are mainly just
150 ;; (large) sets of characters. The extra epsilon transitions are not
151 ;; neccessary on single atoms, so I omit them here. -- This reduces the
152 ;; number of states quite a bit in the first place.
153 (let ((q0 (make-state))
155 (dolist (a (cdr term
))
157 (state-add-link q0 a q1
))
158 ((let ((a (regexp->fsa a
)))
159 (state-add-link q0
'eps
(fsa-start a
))
160 (state-add-link (fsa-end a
) 'eps q1
)))))
161 (make-fsa :start q0
:end q1
)))
163 (defun regexp/and-
>fsa
(term)
164 (cond ((null (cdr term
)) (fsa-empty))
165 ((null (cddr term
)) (regexp->fsa
(cadr term
)))
166 ((fsa-concat (regexp->fsa
(cadr term
)) (regexp->fsa
`(and .
,(cddr term
)))))))
168 (defun regexp-expand-splicing (term)
170 (mapcan #'(lambda (x)
171 (cond ((stringp x
) (coerce x
'list
))
176 ;;;; ----------------------------------------------------------------------------------------------------
177 ;;;; Converting a ND-FSA to a D-FSA
180 ;;; Since we have to compare and unionfy sets of states a lot, I use bit-vectors
181 ;;; to represent these sets for speed. However let me abstract that a bit:
183 ;;; (All these are defined as macros simply for speed. Inlining would be an
184 ;;; option here, when it would be reliable. With defining macros I enforce
187 (defmacro make-empty-set
(n)
188 "Create the empty set on the domain [0,n)."
189 `(make-array ,n
:element-type
'bit
:initial-element
0))
191 (defmacro nset-put
(bag new
)
192 "Destructively calculate bag = bag U {new}."
193 `(setf (sbit (the (simple-array bit
(*)) ,bag
) (the fixnum
,new
)) 1))
195 (defmacro element-of-set-p
(elm set
)
196 "Determine whether `elm' is element of the set `set'."
197 `(eq 1 (sbit (the (simple-array bit
(*)) ,set
) (the fixnum
,elm
))))
199 (defmacro set-size
(set)
200 "Return the upper bound of the domain of `set'."
203 (defmacro do-bits
((var set
&optional result
) &body body
)
204 "Iterate body with `var' over all elements of `set'."
205 (let ((g/set
(gensym)))
206 `(let ((,g
/set
,set
))
207 (dotimes (,var
(set-size ,g
/set
) ,result
)
208 (when (element-of-set-p ,var
,g
/set
)
211 ;;; Since the sets we defined above only take non-negative integers, we have to
212 ;;; number our states. This is done once by NUMBER-STATES.
214 (defun number-states (starts)
215 "Number all state reachable form `starts', continuosly from 0. Each state got
216 it's number stuck into the `id' slot.
217 Returns two values: `n' the number of states and `tab' a table to lookup a
218 state given the number it got attached to."
220 (tab (make-array 0 :adjustable t
:fill-pointer
0 :initial-element nil
)))
223 (vector-push-extend x tab
300)
224 (setf (state-id x
) (prog1 n
(incf n
)))
225 (dolist (tr (state-transitions x
))
227 (dolist (y (state-eps-transitions x
))
229 (dolist (s starts
) (walk s
))
232 ;;; We need to calculate the epsilon closure of a given state. Due to the
233 ;;; precise workings of our algorithm below, we only need this augmenting
236 (defun fsa-epsilon-closure/set
(x state-set
)
237 "Augment the epsilon closure of the state `state' into `state-set'."
238 (unless (element-of-set-p (state-id x
) state-set
)
239 (nset-put state-set
(state-id x
))
240 (dolist (k (state-eps-transitions x
))
241 (fsa-epsilon-closure/set k state-set
))))
243 (defun ndfsa->dfsa
(starts)
246 (multiple-value-bind (n tab
) (number-states starts
)
247 (labels ((name-state-set (state-set)
248 (or (cdr (assoc state-set known
:test
#'equal
))
249 (let ((new (make-state)))
250 (push (cons state-set new
) known
)
251 (push state-set batch
)
253 (add-state-set (state-set)
256 (name (name-state-set state-set
))
258 (do-bits (s0 state-set
)
259 (let ((s (aref tab s0
)))
260 (setf new-final
(max new-final
(state-final s
)))
261 (dolist (tr (state-transitions s
))
264 (let ((looked (getf new-tr z nil
)))
266 (fsa-epsilon-closure/set to looked
)
267 (let ((sts (make-empty-set n
)))
268 (fsa-epsilon-closure/set to sts
)
269 (setf (getf new-tr z
) sts
) ))))))))
270 (setq new-tr
(frob2 new-tr
))
271 (do ((q new-tr
(cddr q
)))
275 (push (cons z
(name-state-set to
)) new-tr-real
)))
276 (setf (state-transitions name
) new-tr-real
277 (state-final name
) new-final
))))
279 (mapcar #'(lambda (s)
280 (name-state-set (let ((sts (make-empty-set n
)))
281 (fsa-epsilon-closure/set s sts
)
286 (add-state-set (pop batch
)))) ))))
288 (defun frob2 (res &aux res2
)
289 (do ((q res
(cddr q
)))
291 (do ((p res2
(cddr p
)))
293 (setf res2
(list* (list (car q
)) (cadr q
) res2
)))
294 (when (equal (cadr q
) (cadr p
))
295 (setf (car p
) (cons (car q
) (car p
)))
298 ;;;; ----------------------------------------------------------------------------------------------------
302 ;;; Features to think about:
303 ;;; - case insensitive scanner
304 ;;; - compression of tables
306 ;;; - non-interactive high speed scanning?
307 ;;; - make BAG a macro? So that non used bags are not considered?
309 ;;; - support for include?
310 ;;; - support for putting back input?
311 ;;; - count lines/columns? Track source?
312 ;;; - richer set of regexp primitives e.g. "[a-z]" style sets
313 ;;; - could we offer complement regexp?
314 ;;; - trailing context
315 ;;; - sub-state stacks?
316 ;;; - user variables to include ['global' / 'lexical']
317 ;;; - identifing sub-expression of regexps (ala \(..\) and \n)
321 (defun loadable-states-form (starts)
325 ;; Leider ist das CMUCL so dumm, dass es scheinbar nicht faehig ist die
326 ;; selbstbezuegliche Structur ',starts in ein FASL file zu dumpen ;-(
327 ;; Deswegen hier dieser read-from-string Hack.
328 (defun loadable-states-form (starts)
329 `(LET ((*PACKAGE
* (FIND-PACKAGE ',(package-name *package
*))))
330 (READ-FROM-STRING ',(let ((*print-circle
* t
)
332 (*print-pretty
* nil
))
333 (prin1-to-string starts
)))))
335 (defmacro old
/deflexer
(name macro-defs
&rest rule-defs
)
336 (let ((macros nil
) starts clauses
(n-fin 0))
337 (dolist (k macro-defs
)
338 (push (cons (car k
) (sublis macros
(cadr k
))) macros
))
339 ;;canon clauses -- each element of rule-defs becomes (start expr end action)
341 (mapcar #'(lambda (x)
342 (cond ((and (consp (car x
)) (eq (caar x
) 'in
))
343 (list (cadar x
) (sublis macros
(caddar x
)) (progn (incf n-fin
) n-fin
) (cdr x
)))
344 ((list 'initial
(sublis macros
(car x
)) (progn (incf n-fin
) n-fin
) (cdr x
)))))
345 (reverse rule-defs
)))
346 ;;collect all start states in alist (<name> . <state>)
347 (setq starts
(mapcar #'(lambda (name)
348 (cons name
(make-state)))
349 (remove-duplicates (mapcar #'car rule-defs
))))
351 (dolist (r rule-defs
)
352 (destructuring-bind (start expr end action
) r
353 (let ((q0 (cdr (assoc start starts
)))
354 (fsa (regexp->fsa
`(and ,expr
))))
356 (state-add-link q0
'eps
(fsa-start fsa
))
358 (setf (state-final (fsa-end fsa
)) end
)
359 ;; build a clause for CASE
360 (push `((,end
) .
,action
) clauses
))))
361 ;; hmm... we have to sort the final states after building the dfsa
362 ;; or introduce fixnum identifier and instead of union take the minimum
363 ;; above in ndfsa->dfsa.
365 (mapcar #'(lambda (x y
) (setf (cdr x
) y
))
366 starts
(ndfsa->dfsa
(mapcar #'cdr starts
))))
367 ;; (print (number-states starts))
368 `(DEFUN ,(intern (format nil
"MAKE-~A-LEXER" name
)) (INPUT)
369 (LET* ((STARTS ,(loadable-states-form starts
))
373 (BAGG/CH
(G/MAKE-STRING
100 :FILL-POINTER
0 :ADJUSTABLE T
))
374 (BAGG/STATE
(MAKE-ARRAY 100 :FILL-POINTER
0 :ADJUSTABLE T
))
382 (WHEN (> (LENGTH CH
) 0)
383 (PUSH (CONS 0 CH
) LOOK-AHEAD
)))
384 (T (PUSH CH LOOK-AHEAD
))))
386 (VECTOR-PUSH-EXTEND CH BAGG
/CH
10)
387 (VECTOR-PUSH-EXTEND STATE BAGG
/STATE
10) )
389 (LET ((FP (LENGTH BAGG
/CH
)))
390 (PROG1 (AREF BAGG
/CH
(1- FP
))
391 (SETF (FILL-POINTER BAGG
/STATE
) (1- FP
))
392 (SETF (FILL-POINTER BAGG
/CH
) (1- FP
)))))
394 (AREF BAGG
/STATE
(1- (LENGTH BAGG
/STATE
))) )
396 (= (LENGTH BAGG
/CH
) 0))
398 (SETF (FILL-POINTER BAGG
/CH
) 0)
399 (SETF (FILL-POINTER BAGG
/STATE
) 0) )
403 (FIND-NEXT-STATE (CH STATE
)
404 (DOLIST (K (STATE-TRANSITIONS STATE
))
405 (WHEN (MEMBER CH
(CAR K
))
408 (COND ((NULL LOOK-AHEAD
) (READ-CHAR INPUT NIL NIL
))
409 ((CONSP (CAR LOOK-AHEAD
))
410 (LET ((S (CDAR LOOK-AHEAD
)))
412 (AREF S
(CAAR LOOK-AHEAD
))
413 (INCF (CAAR LOOK-AHEAD
))
414 (WHEN (= (CAAR LOOK-AHEAD
) (LENGTH S
))
416 (T (POP LOOK-AHEAD
)) )))
417 (DECLARE (INLINE BACKUP GETCH
))
419 START
(SETQ STATE
(CDR (ASSOC SUB-STATE STARTS
)))
421 (ERROR "Sub-state ~S is not defined." SUB-STATE
))
423 LOOP
(SETQ CH
(GETCH))
426 (DOLIST (K (STATE-TRANSITIONS STATE
))
429 (RETURN-FROM FOO
(CDR K
)))))) ))
430 (COND ((NULL NEXT-STATE
)
433 ((OR (EMPTY*?
) (NOT (EQ 0 (TOS*/STATE
)))))
435 (COND ((AND (EMPTY*?
) (NULL CH
))
438 (ERROR "oops ~S ~S" ch
(mapcar #'car
(state-transitions state
))))
440 (LET ((HALTING-STATE (TOS*/STATE
)))
442 (SYMBOL-MACROLET ((BAG (IF BAG
*
444 (SETF BAG
* (STRING*)))))
449 (PUSH* CH
(STATE-FINAL NEXT-STATE
))
450 (SETQ STATE NEXT-STATE
)
453 ;;;; ----------------------------------------------------------------------------------------------------
456 (defun parse-char-set (string i
)
460 ;;the first char is special
461 (cond ((char= (char string i
) #\
]) (incf i
) (push #\
] res
))
462 ((char= (char string i
) #\^
) (incf i
) (setq complement-p t
))
463 ((char= (char string i
) #\-
) (incf i
) (push #\- res
)))
465 ((char= (char string i
) #\
])
466 (values (if complement-p
(cons 'cset res
) (cons 'set res
)) (+ i
1)))
467 (cond ((char= (char string
(+ i
1)) #\-
)
469 (push (cons (char string i
) (char string
(+ i
2))) res
)
473 (push (char string i
) res
)
476 ;;;; ------------------------------------------------------------------------------------------
478 (defparameter *full-table-p
* t
)
480 (defun mungle-transitions (trs)
482 (let ((res (make-array 256 :initial-element nil
)))
484 (dolist (ch (car tr
))
485 (setf (aref res
(char-code ch
)) (cdr tr
))))
489 (defun over-all-states (fun starts
)
490 ;; Apply `fun' to each state reachable from starts.
493 (unless (member q yet
)
495 (let ((trs (state-transitions q
)))
499 (mapc #'walk starts
))))
501 (defmacro deflexer
(name macro-defs
&rest rule-defs
)
502 (let ((macros nil
) starts clauses
(n-fin 0))
503 (dolist (k macro-defs
)
504 (push (cons (car k
) (sublis macros
(cadr k
))) macros
))
505 ;;canon clauses -- each element of rule-defs becomes (start expr end action)
507 (mapcar #'(lambda (x)
508 (cond ((and (consp (car x
)) (string-equal (caar x
) :in
))
509 (list (cadar x
) (sublis macros
(caddar x
)) (progn (incf n-fin
) n-fin
) (cdr x
)))
510 ((list 'initial
(sublis macros
(car x
)) (progn (incf n-fin
) n-fin
) (cdr x
)))))
511 (reverse rule-defs
)))
512 ;;collect all start states in alist (<name> . <state>)
513 (setq starts
(mapcar #'(lambda (name)
514 (cons name
(make-state)))
515 (remove-duplicates (mapcar #'car rule-defs
))))
517 (dolist (r rule-defs
)
518 (destructuring-bind (start expr end action
) r
519 (let ((q0 (cdr (assoc start starts
)))
520 (fsa (regexp->fsa
`(and ,expr
))))
522 (state-add-link q0
'eps
(fsa-start fsa
))
524 (setf (state-final (fsa-end fsa
)) end
)
525 ;; build a clause for CASE
526 (push `((,end
) .
,action
) clauses
))))
527 ;; hmm... we have to sort the final states after building the dfsa
528 ;; or introduce fixnum identifier and instead of union take the minimum
529 ;; above in ndfsa->dfsa.
531 (mapcar #'(lambda (x y
) (setf (cdr x
) y
))
532 starts
(ndfsa->dfsa
(mapcar #'cdr starts
))))
533 ;;(terpri)(princ `(,(number-states starts) states))(finish-output)
535 (over-all-states (lambda (state)
537 (setf (state-transitions state
)
538 (mungle-transitions (state-transitions state
))))
539 (mapcar #'cdr starts
))
540 (format T
"~&~D states." n
))
541 `(DEFUN ,(intern (format nil
"MAKE-~A-LEXER" name
)) (INPUT)
542 (LET* ((STARTS ,(loadable-states-form starts
))
546 (BAGG/CH
(G/MAKE-STRING
100 :FILL-POINTER
0 :ADJUSTABLE T
))
547 (BAGG/STATE
(MAKE-ARRAY 100 :FILL-POINTER
0 :ADJUSTABLE T
))
555 (WHEN (> (LENGTH CH
) 0)
556 (PUSH (CONS 0 CH
) LOOK-AHEAD
)))
557 (T (PUSH CH LOOK-AHEAD
))))
559 (VECTOR-PUSH-EXTEND CH BAGG
/CH
10)
560 (VECTOR-PUSH-EXTEND STATE BAGG
/STATE
10) )
562 (LET ((FP (LENGTH BAGG
/CH
)))
563 (PROG1 (CHAR BAGG
/CH
(1- FP
))
564 (SETF (FILL-POINTER BAGG
/STATE
) (1- FP
))
565 (SETF (FILL-POINTER BAGG
/CH
) (1- FP
)))))
567 (AREF BAGG
/STATE
(1- (LENGTH BAGG
/STATE
))) )
569 (= (LENGTH BAGG
/CH
) 0))
571 (SETF (FILL-POINTER BAGG
/CH
) 0)
572 (SETF (FILL-POINTER BAGG
/STATE
) 0) )
576 (COND ((NULL LOOK-AHEAD
) (READ-CHAR INPUT NIL NIL
))
577 ((CONSP (CAR LOOK-AHEAD
))
578 (LET ((S (CDAR LOOK-AHEAD
)))
580 (CHAR S
(CAAR LOOK-AHEAD
))
581 (INCF (CAAR LOOK-AHEAD
))
582 (WHEN (= (CAAR LOOK-AHEAD
) (LENGTH S
))
584 (T (POP LOOK-AHEAD
)) ))
586 `(FIND-NEXT-STATE (STATE CH
)
588 (SVREF (STATE-TRANSITIONS STATE
) (CHAR-CODE CH
))
590 `(FIND-NEXT-STATE (STATE CH
)
592 (DOLIST (K (STATE-TRANSITIONS STATE
))
595 (RETURN-FROM FOO
(CDR K
)))))))) )
596 (DECLARE (INLINE BACKUP GETCH FIND-NEXT-STATE
))
598 START
(SETQ STATE
(CDR (ASSOC SUB-STATE STARTS
)))
600 (ERROR "Sub-state ~S is not defined." SUB-STATE
))
602 LOOP
(SETQ CH
(GETCH))
603 (LET ((NEXT-STATE (FIND-NEXT-STATE STATE CH
)) )
604 (COND ((NULL NEXT-STATE
)
607 ((OR (EMPTY*?
) (NOT (EQ 0 (TOS*/STATE
)))))
609 (COND ((AND (EMPTY*?
) (NULL CH
))
612 (ERROR "oops ~S ~S" ch
(mapcar #'car
(state-transitions state
))))
614 (LET ((HALTING-STATE (TOS*/STATE
)))
616 (SYMBOL-MACROLET ((BAG (IF BAG
*
618 (SETF BAG
* (STRING*)))))
623 (PUSH* CH
(STATE-FINAL NEXT-STATE
))
624 (SETQ STATE NEXT-STATE
)