1 ;;; LALR parser generator.
2 ;;; Julian Squires / 2005
6 ;;; When we preprocess the grammar, give every symbol a unique
7 ;;; integer, and then use bitvectors for all set operations. Keep a
8 ;;; bitvector to track terminal/nonterminal-ness.
9 ;;; (any benefit to doing this? should do some profiling.)
11 ;;; Add a suite of tests using RT.
13 ;;; Write some usage information.
15 ;;; Add some operator precedence controls.
17 ;;; Code to convert yacc file into suitable grammar.
19 (in-package :lalr-parser-generator
)
21 ;;;; Special variables.
23 (defparameter *start-symbol
* 'start
)
24 (defparameter *end-symbol
* '$
)
27 "The default grammar used by the LALR parser generator; set by
29 (defvar *first-set
* nil
)
30 (defvar *follow-set
* nil
)
31 (defvar *nullable-set
* nil
)
36 (defstruct item
(lhs) (rhs) (dot) (lookahead))
38 ;;; XXX should these dot functions operate on the dot itself, rather
39 ;;; than calling item-dot? That would make it easier to hide the fact
40 ;;; that dot is just a list.
41 (defun dot-at-end-p (item) (endp (item-dot item
)))
42 (defun symbol-at-dot (item) (car (item-dot item
)))
43 (defun advance-dot (item)
44 "Returns the item dot, advanced by one symbol. Note:
46 (cdr (item-dot item
)))
49 ;;; item sets -- see also macros.lisp.
51 (defun make-item-set (&rest items
)
52 (let ((set (make-array '(0) :adjustable t
:fill-pointer
0)))
53 (dolist (i items
) (add-to-set i set
))
57 (defun items-equal-except-lookahead-p (a b
)
58 (every (lambda (x) (equal (funcall x a
) (funcall x b
)))
59 (list #'item-lhs
#'item-rhs
#'item-dot
)))
61 (defun add-to-set (item set
)
62 "Returns position of ITEM in SET."
63 (let* ((i (or (position item set
:test
#'items-equal-except-lookahead-p
)
64 (vector-push-extend item set
)))
65 (la-of-a (item-lookahead item
))
66 (la-of-b (item-lookahead (aref set i
))))
67 (unless (equal la-of-a la-of-b
)
68 (setf (item-lookahead (aref set i
))
69 (union la-of-a la-of-b
)))))
71 (defun item-set-equal-ignoring-la (set-a set-b
)
72 (when (= (length set-a
) (length set-b
))
73 (every #'items-equal-except-lookahead-p set-a set-b
)))
78 (defun process-grammar (grammar)
79 "Processes GRAMMAR, returns a grammar suitable for binding to
80 *GRAMMAR*. Augments the grammar with a new start rule."
81 ;; we compile the basic hash table of non-terminals by iterating
82 ;; through the grammar, storing the lists of productions.
83 (let ((grammar-hash (make-hash-table)))
84 (do ((list-> grammar
(cddr list-
>)))
86 (setf (gethash (car list-
>) grammar-hash
)
89 (augment-grammar grammar-hash
(car grammar
))
92 (defun augment-grammar (hash first-real-symbol
)
93 ;; augment grammar with start symbol
94 (dolist (i (list *start-symbol
* *end-symbol
*))
95 (assert (null (gethash i hash
)) nil
96 "~A is a reserved non-terminal, unfortunately. Try
97 calling MAKE-PARSER with a different END-SYMBOL or START-SYMBOL
99 (setf (gethash *start-symbol
* hash
)
100 (list (list first-real-symbol
*end-symbol
*))))
102 (defun non-terminal-p (symbol) (gethash symbol
*grammar
*))
103 (defun grammar-productions (symbol) (gethash symbol
*grammar
*))
106 ;;;; PARSE TABLE CONSTRUCTION
108 (defun first-sets (symbol-list)
109 "Returns the union of the first sets of each symbol in SYMBOL-LIST,
110 until either a nullable symbol is found or we run out of symbols."
111 (do* ((x-> symbol-list
(cdr x-
>))
112 (s (and x-
> (gethash (car x-
>) *first-set
*))
113 (union s
(and x-
> (gethash (car x-
>) *first-set
*)))))
114 ((or (null x-
>) (not (gethash (car x-
>) *nullable-set
*)))
117 (defun lalr-closure (item-set)
118 "Returns the closure of ITEM-SET."
119 (do-until-unchanged (item-set)
120 (do-for-each-item (i item-set
)
121 (when (non-terminal-p (symbol-at-dot i
))
122 (dolist (r (grammar-productions (symbol-at-dot i
)))
123 (add-to-set (make-item :lhs
(symbol-at-dot i
) :rhs r
:dot r
124 :lookahead
(union (first-sets (advance-dot i
))
129 (defun lalr-goto (item-set grammar-symbol
)
130 "Returns the closure of ITEM-SET after having read GRAMMAR-SYMBOL."
131 (let ((j (make-item-set)))
132 (do-for-each-item (i item-set
)
133 (when (eql (symbol-at-dot i
) grammar-symbol
)
134 (add-to-set (make-item :lhs
(item-lhs i
) :rhs
(item-rhs i
)
136 :lookahead
(item-lookahead i
))
140 (defun make-start-item ()
141 "Makes the item S' -> .S$, as appropriate for the grammar."
142 (make-item :lhs
*start-symbol
*
143 :rhs
(first (gethash *start-symbol
* *grammar
*))
144 :dot
(first (gethash *start-symbol
* *grammar
*))))
146 (defun make-almost-done-item ()
147 "Makes the item S' -> S.$, as appropriate for the grammar."
148 (let* ((start-item (make-start-item))
149 (dot (do ((dot (advance-dot start-item
) (cdr dot
)))
150 ((or (null dot
) (eql (car dot
) *end-symbol
*)) dot
))))
151 (assert (not (null dot
)))
152 (make-item :lhs
(item-lhs start-item
) :rhs
(item-rhs start-item
)
157 (defun merge-lookahead-in-sets (src dst
)
158 (macrolet ((la (set) `(item-lookahead (aref ,set i
))))
159 (dotimes (i (length dst
))
160 (unless (equal (la dst
) (la src
))
161 (setf (la dst
) (union (la dst
) (la src
)))))))
163 (defun add-to-states (set states
)
164 "Adds SET to STATES, either by merging it with another set which is
165 identical save for look-ahead, or push it onto the end. Returns the
167 (flet ((merge-existing ()
168 (loop for i below
(length states
)
169 and other-set across states
170 when
(item-set-equal-ignoring-la set other-set
)
171 do
(merge-lookahead-in-sets set other-set
)
173 (or (merge-existing) (vector-push-extend set states
))))
175 (defun make-initial-state ()
176 (lalr-closure (make-item-set (make-start-item))))
178 (defun compute-shifts (table)
179 "Computes shift actions and states for the generated parser. Adds
180 shifts to the parse table as we find them. Returns the state table."
181 (let ((states (make-array '(1) :adjustable t
:fill-pointer
1
182 :initial-element
(make-initial-state))))
183 (do-until-unchanged (states) ;; XXX also table?
184 (dotimes (i (length states
))
185 (do-for-each-item (item (aref states i
))
186 (maybe-shift table states item i
))))
189 ;; XXX awful name; refactor.
190 (defun maybe-shift (table states item i
)
191 (unless (or (dot-at-end-p item
)
192 (eql (symbol-at-dot item
) *end-symbol
*))
193 (let* ((symbol (symbol-at-dot item
))
194 (new-set (lalr-goto (aref states i
) symbol
))
195 (j (add-to-states new-set states
))
196 (action (list (if (non-terminal-p symbol
) 'goto
'shift
) j
)))
197 (add-to-parse-table table i symbol action
))))
200 (defun compute-reductions (table states
)
201 "Compute reduce actions for the generated parser, based on STATES.
202 Fills in TABLE with the reduce actions."
203 (dotimes (i (length states
))
204 (do-for-each-item (item (aref states i
))
205 (when (dot-at-end-p item
)
206 (dolist (symbol (item-lookahead item
))
207 (let ((action `(reduce ,(item-lhs item
)
208 ,(length (item-rhs item
)))))
209 (add-to-parse-table table i symbol action
)))))))
212 (defun add-accept-actions (parse-table states
)
213 "Finds states whose next token should be $ (EOF) and adds accept
214 actions to the parse table for those states."
215 (loop with n-states
= (length states
)
216 and item
= (make-almost-done-item)
217 for i from
0 below n-states
218 when
(find item
(aref states i
) :test
#'equalp
)
219 do
(add-to-parse-table parse-table i
*end-symbol
* '(accept))))
222 (defun add-to-parse-table (parse-table state symbol action
)
223 "Adds ACTION to the parse table at (SYMBOL,STATE). Applies braindead
224 conflict resolution rule to any conflicts detected."
225 (sunless (gethash symbol parse-table
)
226 (setf it
(make-hash-table :test
'equal
)))
227 (let ((row (gethash symbol parse-table
)))
228 (awhen (gethash state row
)
229 (when (equal action it
) (return-from add-to-parse-table
))
230 (setf action
(resolve-collision action it symbol state
)))
231 (setf (gethash state row
) action
)))
234 (defun resolve-collision (new-action old-action symbol state
)
235 ;; XXX should probably collate the number of conflicts
237 (warn "~&Conflict at ~A,~A -> last action ~A, new action ~A."
238 symbol state old-action new-action
)
239 (cond ((and (eql (car old-action
) 'shift
) (eql (car new-action
) 'reduce
))
240 old-action
) ; S/R => prefer shift.
241 ((and (eql (car old-action
) 'reduce
) (eql (car new-action
) 'reduce
))
242 ;; R/R => prefer longer reduction.
243 (if (>= (third new-action
) (third old-action
))
246 (t (error "This is an unexpected conflict (~A, ~A). Call a wizard."
247 old-action new-action
))))
250 (defun create-parse-table ()
251 "Constructs a parse table usable by PARSE."
252 (let* ((parse-table (make-hash-table))
253 (states (compute-shifts parse-table
)))
254 (compute-reductions parse-table states
)
255 (add-accept-actions parse-table states
)
259 ;;; XXX certainly not the most attractive way to do this, but I've
261 (defun write-parser-function (table package stream fn-name
)
262 (let* ((*package
* (find-package "LALR-PARSER-GENERATOR"))
263 (fn-name (intern (if (stringp fn-name
)
265 (symbol-name fn-name
)))))
266 (format stream
";; Automatically generated by LALR-PARSER-GENERATOR.")
267 (format stream
"~&(in-package ~S)~%" (package-name package
))
268 (pprint `(flet ((unmash (entries)
269 (let ((ht (make-hash-table)))
271 (setf (gethash (car e
) ht
) (cdr e
)))
273 (let ((table (unmash ',(let ((untable))
274 (maphash (lambda (k v
)
279 (defun ,fn-name
(next-token)
280 "NEXT-TOKEN is a function which returns a cons of the next token in
281 the input (the CAR being the symbol name, the CDR being any
282 information the lexer would like to preserve), and advances the input
283 one token. Returns what might pass for a parse tree in some
285 (loop with stack
= (list 0)
286 and token
= (funcall next-token
)
288 for row
= (gethash (car token
) table
)
290 (gethash (first stack
) row
)
291 (error "~A is not a valid token in this grammar."
293 do
(case (first action
)
294 (shift (push token result-stack
)
295 (setf token
(funcall next-token
))
296 (push (second action
) stack
))
297 (reduce (push (list (second action
)) result-stack
)
298 (dotimes (i (third action
))
300 (push (pop (cdr result-stack
)) (cdar result-stack
)))
301 (destructuring-bind (goto state
)
302 (gethash (first stack
) (gethash (second action
) table
))
303 (assert (eql goto
'goto
) () "Malformed parse table!")
305 (accept (return (car result-stack
)))
306 (t (error "Parse error at ~A" token
))))))))))
309 (defun parse (table next-token
)
310 "TABLE is a table generated by CREATE-PARSE-TABLE, NEXT-TOKEN is a
311 function which returns a cons of the next token in the input (the CAR
312 being the symbol name, the CDR being any information the lexer would
313 like to preserve), and advances the input one token. Returns what
314 might pass for a parse tree in some countries."
315 (loop with stack
= (list 0)
316 and token
= (funcall next-token
)
318 for row
= (gethash (car token
) table
)
320 (gethash (first stack
) row
)
321 (error "~A is not a valid token in this grammar."
323 do
(case (first action
)
324 (shift (push token result-stack
)
325 (setf token
(funcall next-token
))
326 (push (second action
) stack
))
327 (reduce (push (list (second action
)) result-stack
)
328 (dotimes (i (third action
))
330 (push (pop (cdr result-stack
)) (cdar result-stack
)))
331 (destructuring-bind (goto state
)
332 (gethash (first stack
) (gethash (second action
) table
))
333 (assert (eql goto
'goto
) () "Malformed parse table!")
335 (accept (return (car result-stack
)))
336 (t (error "Parse error at ~A" token
)))))
338 ;;;; External functions
340 ;; XXX document this, improve interface
341 (defun make-parser (grammar &key end-symbol start-symbol
342 (stream *standard-output
*)
345 "Writes a parser for GRAMMAR onto STREAM, with symbols in PACKAGE;
346 notably, with the parser name being FN-NAME (default of PARSE)."
347 (awhen end-symbol
(setf *end-symbol
* it
))
348 (awhen start-symbol
(setf *start-symbol
* it
))
349 (let ((*grammar
* (process-grammar grammar
)))
350 (multiple-value-bind (*first-set
* *follow-set
* *nullable-set
*)
351 (compute-prediction-sets *grammar
*)
352 (let ((table (create-parse-table)))
353 (write-parser-function table package stream fn-name
)))))
358 (defun test-parser (grammar string
)
359 (let ((*grammar
* (process-grammar grammar
))
361 (multiple-value-bind (*first-set
* *follow-set
* *nullable-set
*)
362 (compute-prediction-sets *grammar
*)
363 (with-input-from-string (*standard-input
* string
)
364 (parse (create-parse-table)
365 #'(lambda () (cons (handler-case (read)
366 (end-of-file () *end-symbol
*))