1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CLEX; -*-
2 ;;; --------------------------------------------------------------------------------------
3 ;;; Title: A flex like scanner generator for Common LISP
4 ;;; Created: 1997-10-12
5 ;;; Author: Gilbert Baumann <gilbert@base-engineering.com>
6 ;;; License: LGPL (See file COPYING for details).
7 ;;; --------------------------------------------------------------------------------------
8 ;;; (c) copyright 1997-1999 by Gilbert Baumann
10 ;;; Permission is hereby granted, free of charge, to any person obtaining
11 ;;; a copy of this software and associated documentation files (the
12 ;;; "Software"), to deal in the Software without restriction, including
13 ;;; without limitation the rights to use, copy, modify, merge, publish,
14 ;;; distribute, sublicense, and/or sell copies of the Software, and to
15 ;;; permit persons to whom the Software is furnished to do so, subject to
16 ;;; the following conditions:
18 ;;; The above copyright notice and this permission notice shall be
19 ;;; included in all copies or substantial portions of the Software.
21 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 ;;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33 ;; ----------------------------------------------------------------------------
34 ;; 2007-04-29 DFL - Represent RANGE directly to cope with character
35 ;; set sizes typical for Unicode.
36 ;; - Disable *full-table-p* by default.
37 ;; - Added SBCL case to the CMUCL workarounds.
42 #:deflexer
#:backup
#:begin
#:initial
#:bag
))
46 ;;; NOTE -- It turns out that this code is a magintude slower under CMUCL
47 ;;; compared to CLISP or ACL. Probably they do not have a good implementation of
50 ;;; We encode our FSA's directly as linked datastructures; A state is represented by:
52 (defstruct (state (:type vector
))
54 transitions
;simple alist of (sigma . next-state)
55 id
;numeric id of state
56 eps-transitions
) ;list of all states reached by epsilon (empty transitions)
58 (defun destructure-range (x)
60 (values (car x
) (cadr x
))
64 (multiple-value-bind (amin amax
) (destructure-range a
)
65 (multiple-value-bind (bmin bmax
) (destructure-range b
)
69 (flet ((range* (min max
)
71 (push (list min
(1- max
)) result
))))
72 (range* amin
(min bmin amax
))
73 (range* (max amin bmax
) amax
))
77 (mapcan (lambda (a) (range- a b
)) aa
))
79 (defun partition-range (a pos
)
80 (multiple-value-bind (min max
) (destructure-range a
)
81 (if (and (< min pos
) (<= pos max
))
82 (list (list min
(1- pos
))
89 (character (char-code x
))))
91 (defun parse-range (range)
93 (list (code (car range
)) (code (cadr range
)))
94 (list (code range
) (code range
))))
96 (defun state-add-link (this range that
)
97 "Add a transition to state `this'; reading `range' proceeds to `that'."
98 (cond ((eq range
'eps
)
99 (pushnew that
(state-eps-transitions this
)))
101 (let ((new (list (parse-range range
))))
102 (dolist (k (state-transitions this
)
103 (push (cons new that
) (state-transitions this
)))
104 (when (eq (cdr k
) that
)
105 (dolist (l (car k
)) ;avoid duplicates
106 (setf new
(ranges- new l
)))
107 (setf (car k
) (append new
(car k
)))
109 ;; split existing ranges to remove overlap
110 (dolist (k (state-transitions this
))
114 (partition-range l pos
))
118 (doit (1+ (cadr n
))))))))))
120 ;;; When constructing FSA's from regular expressions we abstract by the notation
121 ;;; of FSA's as boxen with an entry and an exit state.
128 "Accepts the empty word."
129 (let ((q (make-state)))
130 (make-fsa :start q
:end q
)))
132 (defun fsa-trivial (char)
133 "Accepts the trivial word consisting out of exactly one `char'."
134 (let ((q0 (make-state))
136 (state-add-link q0 char q1
)
137 (make-fsa :start q0
:end q1
)))
139 (defun fsa-concat (a1 a2
)
140 "Concatenation of `a1' and `a2'. Hence `a1 a2'."
141 (state-add-link (fsa-end a1
) 'eps
(fsa-start a2
))
142 (make-fsa :start
(fsa-start a1
)
145 (defun fsa-iterate (a)
146 "Iteration of `a'. Hence `a*'"
147 (let ((q0 (make-state))
149 (state-add-link q0
'eps
(fsa-start a
))
150 (state-add-link q0
'eps q1
)
151 (state-add-link q1
'eps q0
)
152 (state-add-link (fsa-end a
) 'eps q1
)
153 (make-fsa :start q0
:end q1
)))
155 (defun fsa-branch (&rest as
)
156 "Alternation of a0..an; Hence `a0 | a1 | ... | an'."
157 (let ((q0 (make-state))
160 (state-add-link q0
'eps
(fsa-start a
))
161 (state-add-link (fsa-end a
) 'eps q1
))
162 (make-fsa :start q0
:end q1
)))
164 ;;;; ----------------------------------------------------------------------------------------------------
165 ;;;; Converting regular expressions to (ND)FSA
168 ;;; However we choose here a Lispy syntax for regular expressions:
171 ;;; (and a0 .. an) concatation
172 ;;; (or a0 .. an) alternation
175 ;;; Further the abbrevs.:
176 ;;; (+ a) == (and a (* a))
177 ;;; (? a) == (or a (and))
178 ;;; (a0 ... an) == (and a0 ... an)
180 ;;; When a string embeded into a regular expression is seen, the list
181 ;;; of characters is spliced in. So formally:
182 ;;; (a0 .. ai "xyz" aj .. an) == (a0 .. ai #\x #\y #\z aj .. an)
184 ;;; This is useful for matching words:
185 ;;; "foo" --> (and "foo") --> (and #\f #\o #\o) == The word 'foo'
186 ;;; or for denoting small sets:
187 ;;; (or "+-") --> (or #\+ #\-) == One of '+' or '-'
189 (defun loose-eq (x y
)
191 ((and (symbolp x
) (symbolp y
))
192 (string= (symbol-name x
) (symbol-name y
)))))
194 (defun regexp->fsa
(term)
195 (setf term
(regexp-expand-splicing term
))
196 (cond ((and (atom term
) (not (stringp term
)))
198 ((loose-eq (car term
) 'RANGE
)
199 (fsa-trivial (cdr term
)))
200 ((loose-eq (car term
) 'AND
) (regexp/and-
>fsa term
))
201 ((loose-eq (car term
) 'OR
) (regexp/or-
>fsa term
))
202 ((loose-eq (car term
) '*) (fsa-iterate (regexp->fsa
(cadr term
))))
203 ((loose-eq (car term
) '+) (regexp->fsa
`(AND ,(cadr term
) (* ,(cadr term
)))))
204 ((loose-eq (car term
) '?
) (regexp->fsa
`(OR (AND) ,(cadr term
))))
206 (regexp->fsa
`(AND .
,term
))) ))
208 (defun regexp/or-
>fsa
(term)
209 ;; I optimize here a bit: I recognized, that ORs are mainly just
210 ;; (large) sets of characters. The extra epsilon transitions are not
211 ;; neccessary on single atoms, so I omit them here. -- This reduces the
212 ;; number of states quite a bit in the first place.
213 (let ((q0 (make-state))
215 (dolist (a (cdr term
))
217 (state-add-link q0 a q1
))
218 ((let ((a (regexp->fsa a
)))
219 (state-add-link q0
'eps
(fsa-start a
))
220 (state-add-link (fsa-end a
) 'eps q1
)))))
221 (make-fsa :start q0
:end q1
)))
223 (defun regexp/and-
>fsa
(term)
224 (cond ((null (cdr term
)) (fsa-empty))
225 ((null (cddr term
)) (regexp->fsa
(cadr term
)))
226 ((fsa-concat (regexp->fsa
(cadr term
)) (regexp->fsa
`(and .
,(cddr term
)))))))
228 (defun regexp-expand-splicing (term)
230 (mapcan #'(lambda (x)
231 (cond ((stringp x
) (coerce x
'list
))
236 ;;;; ----------------------------------------------------------------------------------------------------
237 ;;;; Converting a ND-FSA to a D-FSA
240 ;;; Since we have to compare and unionfy sets of states a lot, I use bit-vectors
241 ;;; to represent these sets for speed. However let me abstract that a bit:
243 ;;; (All these are defined as macros simply for speed. Inlining would be an
244 ;;; option here, when it would be reliable. With defining macros I enforce
247 (defmacro make-empty-set
(n)
248 "Create the empty set on the domain [0,n)."
249 `(make-array ,n
:element-type
'bit
:initial-element
0))
251 (defmacro nset-put
(bag new
)
252 "Destructively calculate bag = bag U {new}."
253 `(setf (sbit (the (simple-array bit
(*)) ,bag
) (the fixnum
,new
)) 1))
255 (defmacro element-of-set-p
(elm set
)
256 "Determine whether `elm' is element of the set `set'."
257 `(eq 1 (sbit (the (simple-array bit
(*)) ,set
) (the fixnum
,elm
))))
259 (defmacro set-size
(set)
260 "Return the upper bound of the domain of `set'."
263 (defmacro do-bits
((var set
&optional result
) &body body
)
264 "Iterate body with `var' over all elements of `set'."
265 (let ((g/set
(gensym)))
266 `(let ((,g
/set
,set
))
267 (dotimes (,var
(set-size ,g
/set
) ,result
)
268 (when (element-of-set-p ,var
,g
/set
)
271 ;;; Since the sets we defined above only take non-negative integers, we have to
272 ;;; number our states. This is done once by NUMBER-STATES.
274 (defun number-states (starts)
275 "Number all state reachable form `starts', continuosly from 0. Each state got
276 it's number stuck into the `id' slot.
277 Returns two values: `n' the number of states and `tab' a table to lookup a
278 state given the number it got attached to."
280 (tab (make-array 0 :adjustable t
:fill-pointer
0 :initial-element nil
)))
283 (vector-push-extend x tab
300)
284 (setf (state-id x
) (prog1 n
(incf n
)))
285 (dolist (tr (state-transitions x
))
287 (dolist (y (state-eps-transitions x
))
289 (dolist (s starts
) (walk s
))
292 ;;; We need to calculate the epsilon closure of a given state. Due to the
293 ;;; precise workings of our algorithm below, we only need this augmenting
296 (defun fsa-epsilon-closure/set
(x state-set
)
297 "Augment the epsilon closure of the state `state' into `state-set'."
298 (unless (element-of-set-p (state-id x
) state-set
)
299 (nset-put state-set
(state-id x
))
300 (dolist (k (state-eps-transitions x
))
301 (fsa-epsilon-closure/set k state-set
))))
303 (defun ndfsa->dfsa
(starts)
306 (multiple-value-bind (n tab
) (number-states starts
)
307 (labels ((name-state-set (state-set)
308 (or (cdr (assoc state-set known
:test
#'equal
))
309 (let ((new (make-state)))
310 (push (cons state-set new
) known
)
311 (push state-set batch
)
313 (add-state-set (state-set)
314 (let ((new-tr (make-hash-table :test
'equal
))
316 (name (name-state-set state-set
))
318 (do-bits (s0 state-set
)
319 (let ((s (aref tab s0
)))
320 (setf new-final
(max new-final
(state-final s
)))
321 (dolist (tr (state-transitions s
))
324 (let ((looked (gethash z new-tr
)))
326 (fsa-epsilon-closure/set to looked
)
327 (let ((sts (make-empty-set n
)))
328 (fsa-epsilon-closure/set to sts
)
329 (setf (gethash z new-tr
) sts
)))))))))
330 (do ((q (frob2 new-tr
) (cddr q
)))
334 (push (cons z
(name-state-set to
)) new-tr-real
)))
335 (setf (state-transitions name
) new-tr-real
336 (state-final name
) new-final
))))
338 (mapcar #'(lambda (s)
339 (name-state-set (let ((sts (make-empty-set n
)))
340 (fsa-epsilon-closure/set s sts
)
345 (add-state-set (pop batch
)))) ))))
347 (defun frob2 (res &aux res2
)
348 (maphash (lambda (z to
)
349 (do ((p res2
(cddr p
)))
351 (setf res2
(list* (list z
) to res2
)))
352 (when (equal to
(cadr p
))
353 (setf (car p
) (cons z
(car p
)))
358 ;;;; ----------------------------------------------------------------------------------------------------
362 ;;; Features to think about:
363 ;;; - case insensitive scanner
364 ;;; - compression of tables
366 ;;; - non-interactive high speed scanning?
367 ;;; - make BAG a macro? So that non used bags are not considered?
369 ;;; - support for include?
370 ;;; - support for putting back input?
371 ;;; - count lines/columns? Track source?
372 ;;; - richer set of regexp primitives e.g. "[a-z]" style sets
373 ;;; - could we offer complement regexp?
374 ;;; - trailing context
375 ;;; - sub-state stacks?
376 ;;; - user variables to include ['global' / 'lexical']
377 ;;; - identifing sub-expression of regexps (ala \(..\) and \n)
381 (defun loadable-states-form (starts)
385 ;; Leider ist das CMUCL so dumm, dass es scheinbar nicht faehig ist die
386 ;; selbstbezuegliche Structur ',starts in ein FASL file zu dumpen ;-(
387 ;; Deswegen hier dieser read-from-string Hack.
388 (defun loadable-states-form (starts)
389 `(LET ((*PACKAGE
* (FIND-PACKAGE ',(package-name *package
*))))
390 (READ-FROM-STRING ',(let ((*print-circle
* t
)
392 (*print-pretty
* nil
))
393 (prin1-to-string starts
)))))
395 ;;;; ----------------------------------------------------------------------------------------------------
398 (defun parse-char-set (string i
)
402 ;;the first char is special
403 (cond ((char= (char string i
) #\
]) (incf i
) (push #\
] res
))
404 ((char= (char string i
) #\^
) (incf i
) (setq complement-p t
))
405 ((char= (char string i
) #\-
) (incf i
) (push #\- res
)))
407 ((char= (char string i
) #\
])
408 (values (if complement-p
(cons 'cset res
) (cons 'set res
)) (+ i
1)))
409 (cond ((char= (char string
(+ i
1)) #\-
)
411 (push (cons (char string i
) (char string
(+ i
2))) res
)
415 (push (char string i
) res
)
418 ;;;; ------------------------------------------------------------------------------------------
420 (defparameter *full-table-p
* nil
)
422 (defun mungle-transitions (trs)
424 (let ((res (make-array 256 :initial-element nil
)))
426 (dolist (range (car tr
))
428 for code from
(car range
) to
(cadr range
)
429 do
(setf (aref res code
) (cdr tr
)))))
433 (defun over-all-states (fun starts
)
434 ;; Apply `fun' to each state reachable from starts.
437 (unless (member q yet
)
439 (let ((trs (state-transitions q
)))
443 (mapc #'walk starts
))))
445 (defmacro deflexer
(name macro-defs
&rest rule-defs
)
446 (let ((macros nil
) starts clauses
(n-fin 0))
447 (dolist (k macro-defs
)
448 (push (cons (car k
) (sublis macros
(cadr k
))) macros
))
449 ;;canon clauses -- each element of rule-defs becomes (start expr end action)
451 (mapcar #'(lambda (x)
452 (cond ((and (consp (car x
)) (string-equal (caar x
) :in
))
453 (list (cadar x
) (sublis macros
(caddar x
)) (progn (incf n-fin
) n-fin
) (cdr x
)))
454 ((list 'initial
(sublis macros
(car x
)) (progn (incf n-fin
) n-fin
) (cdr x
)))))
455 (reverse rule-defs
)))
456 ;;collect all start states in alist (<name> . <state>)
457 (setq starts
(mapcar #'(lambda (name)
458 (cons name
(make-state)))
459 (remove-duplicates (mapcar #'car rule-defs
))))
461 (dolist (r rule-defs
)
462 (destructuring-bind (start expr end action
) r
463 (let ((q0 (cdr (assoc start starts
)))
464 (fsa (regexp->fsa
`(and ,expr
))))
466 (state-add-link q0
'eps
(fsa-start fsa
))
468 (setf (state-final (fsa-end fsa
)) end
)
469 ;; build a clause for CASE
470 (push `((,end
) .
,action
) clauses
))))
471 ;; hmm... we have to sort the final states after building the dfsa
472 ;; or introduce fixnum identifier and instead of union take the minimum
473 ;; above in ndfsa->dfsa.
475 (mapcar #'(lambda (x y
) (setf (cdr x
) y
))
476 starts
(ndfsa->dfsa
(mapcar #'cdr starts
))))
477 ;;(terpri)(princ `(,(number-states starts) states))(finish-output)
479 (over-all-states (lambda (state)
481 (setf (state-transitions state
)
482 (mungle-transitions (state-transitions state
))))
483 (mapcar #'cdr starts
))
484 (format T
"~&~D states." n
))
485 `(DEFUN ,(intern (format nil
"MAKE-~A-LEXER" name
)) (INPUT)
486 (LET* ((STARTS ,(loadable-states-form starts
))
490 (BAGG/CH
(MAKE-ARRAY 100 :FILL-POINTER
0 :ADJUSTABLE T
491 :ELEMENT-TYPE
'CHARACTER
))
492 (BAGG/STATE
(MAKE-ARRAY 100 :FILL-POINTER
0 :ADJUSTABLE T
))
500 (WHEN (> (LENGTH CH
) 0)
501 (PUSH (CONS 0 CH
) LOOK-AHEAD
)))
502 (T (PUSH CH LOOK-AHEAD
))))
504 (VECTOR-PUSH-EXTEND CH BAGG
/CH
10)
505 (VECTOR-PUSH-EXTEND STATE BAGG
/STATE
10) )
507 (LET ((FP (LENGTH BAGG
/CH
)))
508 (PROG1 (CHAR BAGG
/CH
(1- FP
))
509 (SETF (FILL-POINTER BAGG
/STATE
) (1- FP
))
510 (SETF (FILL-POINTER BAGG
/CH
) (1- FP
)))))
512 (AREF BAGG
/STATE
(1- (LENGTH BAGG
/STATE
))) )
514 (= (LENGTH BAGG
/CH
) 0))
516 (SETF (FILL-POINTER BAGG
/CH
) 0)
517 (SETF (FILL-POINTER BAGG
/STATE
) 0) )
521 (COND ((NULL LOOK-AHEAD
) (READ-CHAR INPUT NIL NIL
))
522 ((CONSP (CAR LOOK-AHEAD
))
523 (LET ((S (CDAR LOOK-AHEAD
)))
525 (CHAR S
(CAAR LOOK-AHEAD
))
526 (INCF (CAAR LOOK-AHEAD
))
527 (WHEN (= (CAAR LOOK-AHEAD
) (LENGTH S
))
529 (T (POP LOOK-AHEAD
)) ))
531 `(FIND-NEXT-STATE (STATE CH
)
533 (SVREF (STATE-TRANSITIONS STATE
) (CHAR-CODE CH
))
535 `(FIND-NEXT-STATE (STATE CH
)
538 (DOLIST (K (STATE-TRANSITIONS STATE
))
540 (WHEN (<= (CAR Q
) (CHAR-CODE CH
) (CADR q
))
541 (RETURN-FROM FOO
(CDR K
))))))))) )
542 (DECLARE (INLINE BACKUP GETCH FIND-NEXT-STATE
)
545 START
(SETQ STATE
(CDR (ASSOC SUB-STATE STARTS
)))
547 (ERROR "Sub-state ~S is not defined." SUB-STATE
))
549 LOOP
(SETQ CH
(GETCH))
550 (LET ((NEXT-STATE (FIND-NEXT-STATE STATE CH
)) )
551 (COND ((NULL NEXT-STATE
)
554 ((OR (EMPTY*?
) (NOT (EQ 0 (TOS*/STATE
)))))
556 (COND ((AND (EMPTY*?
) (NULL CH
))
559 (ERROR "oops at ~A: ~S ~S"
560 (file-position (cxml-rng::stream-source INPUT
))
562 (mapcar #'car
(state-transitions state
))))
564 (LET ((HALTING-STATE (TOS*/STATE
)))
566 (SYMBOL-MACROLET ((BAG (IF BAG
*
568 (SETF BAG
* (STRING*)))))
573 (PUSH* CH
(STATE-FINAL NEXT-STATE
))
574 (SETQ STATE NEXT-STATE
)