Fixed write-parse-table fu.
[lalr-parser-generator.git] / parser.lisp
blob8a25f14339fdc118cac082d2982d4dc59457dbd0
1 ;;; LALR parser generator.
2 ;;; Julian Squires / 2005
4 ;;; Things to do:
5 ;;;
6 ;;; Add some operator precedence controls.
7 ;;;
8 ;;; Write some usage information.
9 ;;;
10 ;;; When we preprocess the grammar, give every symbol a unique
11 ;;; integer, and then use bitvectors for all set operations. Keep a
12 ;;; bitvector to track terminal/nonterminal-ness.
13 ;;;
14 ;;; Code to convert yacc file into suitable grammar.
15 ;;;
16 ;;; Refactor heavily.
18 (in-package :lalr-parser-generator)
20 ;;;; Special variables.
22 (defparameter *start-symbol* 'start)
23 (defparameter *end-symbol* '$)
25 (defvar *grammar* nil
26 "The default grammar used by the LALR parser generator; set by
27 PROCESS-GRAMMAR.")
28 (defvar *first-set* nil)
29 (defvar *follow-set* nil)
30 (defvar *nullable-set* nil)
33 ;;;; LALR ITEMS
35 (defstruct item (lhs) (rhs) (dot) (lookahead))
37 ;;; XXX should these dot functions operate on the dot itself, rather
38 ;;; than calling item-dot? That would make it easier to hide the fact
39 ;;; that dot is just a list.
40 (defun dot-at-end-p (item) (endp (item-dot item)))
41 (defun symbol-at-dot (item) (car (item-dot item)))
42 (defun advance-dot (item)
43 "Returns the item dot, advanced by one symbol. Note:
44 non-destructive."
45 (cdr (item-dot item)))
48 ;;; item sets -- see also macros.lisp.
50 (defun make-item-set (&rest items)
51 (let ((set (make-array '(0) :adjustable t :fill-pointer 0)))
52 (dolist (i items) (add-to-set i set))
53 set))
56 (defun items-equal-except-lookahead-p (a b)
57 (every (lambda (x) (equal (funcall x a) (funcall x b)))
58 (list #'item-lhs #'item-rhs #'item-dot)))
60 (defun add-to-set (item set)
61 "Returns position of ITEM in SET."
62 (let* ((i (or (position item set :test #'items-equal-except-lookahead-p)
63 (vector-push-extend item set)))
64 (la-of-a (item-lookahead item))
65 (la-of-b (item-lookahead (aref set i))))
66 (unless (equal la-of-a la-of-b)
67 (setf (item-lookahead (aref set i))
68 (union la-of-a la-of-b)))))
70 (defun item-set-equal-ignoring-la (set-a set-b)
71 (when (= (length set-a) (length set-b))
72 (every #'items-equal-except-lookahead-p set-a set-b)))
75 ;;;; GRAMMAR
77 (defun process-grammar (grammar)
78 "Processes GRAMMAR, returns a grammar suitable for binding to
79 *GRAMMAR*. Augments the grammar with a new start rule."
80 ;; we compile the basic hash table of non-terminals by iterating
81 ;; through the grammar, storing the lists of productions.
82 (let ((grammar-hash (make-hash-table)))
83 (do ((list-> grammar (cddr list->)))
84 ((null list->))
85 (setf (gethash (car list->) grammar-hash)
86 (cadr list->)))
88 (augment-grammar grammar-hash (car grammar))
89 grammar-hash))
91 (defun augment-grammar (hash first-real-symbol)
92 ;; augment grammar with start symbol
93 (dolist (i (list *start-symbol* *end-symbol*))
94 (assert (null (gethash i hash)) nil
95 "~A is a reserved non-terminal, unfortunately. Try
96 calling MAKE-PARSER with a different END-SYMBOL or START-SYMBOL
97 specified." i))
98 (setf (gethash *start-symbol* hash)
99 (list (list first-real-symbol *end-symbol*))))
101 (defun non-terminal-p (symbol) (gethash symbol *grammar*))
102 (defun grammar-productions (symbol) (gethash symbol *grammar*))
105 ;;;; PARSE TABLE CONSTRUCTION
107 (defun first-sets (symbol-list)
108 "Returns the union of the first sets of each symbol in SYMBOL-LIST,
109 until either a nullable symbol is found or we run out of symbols."
110 (do* ((x-> symbol-list (cdr x->))
111 (s (and x-> (gethash (car x->) *first-set*))
112 (union s (and x-> (gethash (car x->) *first-set*)))))
113 ((or (null x->) (not (gethash (car x->) *nullable-set*)))
114 s)))
116 (defun lalr-closure (item-set)
117 "Returns the closure of ITEM-SET."
118 (do-until-unchanged (item-set)
119 (do-for-each-item (i item-set)
120 (when (non-terminal-p (symbol-at-dot i))
121 (dolist (r (grammar-productions (symbol-at-dot i)))
122 (add-to-set (make-item :lhs (symbol-at-dot i) :rhs r :dot r
123 :lookahead (union (first-sets (advance-dot i))
124 (item-lookahead i)))
125 item-set)))))
126 item-set)
128 (defun lalr-goto (item-set grammar-symbol)
129 "Returns the closure of ITEM-SET after having read GRAMMAR-SYMBOL."
130 (let ((j (make-item-set)))
131 (do-for-each-item (i item-set)
132 (when (eql (symbol-at-dot i) grammar-symbol)
133 (add-to-set (make-item :lhs (item-lhs i) :rhs (item-rhs i)
134 :dot (advance-dot i)
135 :lookahead (item-lookahead i))
136 j)))
137 (lalr-closure j)))
139 (defun make-start-item ()
140 "Makes the item S' -> .S$, as appropriate for the grammar."
141 (make-item :lhs *start-symbol*
142 :rhs (first (gethash *start-symbol* *grammar*))
143 :dot (first (gethash *start-symbol* *grammar*))))
145 (defun make-almost-done-item ()
146 "Makes the item S' -> S.$, as appropriate for the grammar."
147 (let* ((start-item (make-start-item))
148 (dot (do ((dot (advance-dot start-item) (cdr dot)))
149 ((or (null dot) (eql (car dot) *end-symbol*)) dot))))
150 (assert (not (null dot)))
151 (make-item :lhs (item-lhs start-item) :rhs (item-rhs start-item)
152 :dot dot)))
156 (defun merge-lookahead-in-sets (src dst)
157 (macrolet ((la (set) `(item-lookahead (aref ,set i))))
158 (dotimes (i (length dst))
159 (unless (equal (la dst) (la src))
160 (setf (la dst) (union (la dst) (la src)))))))
162 (defun add-to-states (set states)
163 "Adds SET to STATES, either by merging it with another set which is
164 identical save for look-ahead, or push it onto the end. Returns the
165 index in STATES."
166 (flet ((merge-existing ()
167 (loop for i below (length states)
168 and other-set across states
169 when (item-set-equal-ignoring-la set other-set)
170 do (merge-lookahead-in-sets set other-set)
171 (return i))))
172 (or (merge-existing) (vector-push-extend set states))))
174 (defun make-initial-state ()
175 (lalr-closure (make-item-set (make-start-item))))
177 (defun compute-shifts (table)
178 "Computes shift actions and states for the generated parser. Adds
179 shifts to the parse table as we find them. Returns the state table."
180 (let ((states (make-array '(1) :adjustable t :fill-pointer 1
181 :initial-element (make-initial-state))))
182 (do-until-unchanged (states) ;; XXX also table?
183 (dotimes (i (length states))
184 (do-for-each-item (item (aref states i))
185 (maybe-shift table states item i))))
186 states))
188 ;; XXX awful name; refactor.
189 (defun maybe-shift (table states item i)
190 (unless (or (dot-at-end-p item)
191 (eql (symbol-at-dot item) *end-symbol*))
192 (let* ((symbol (symbol-at-dot item))
193 (new-set (lalr-goto (aref states i) symbol))
194 (j (add-to-states new-set states))
195 (action (list (if (non-terminal-p symbol) 'goto 'shift) j)))
196 (add-to-parse-table table i symbol action))))
199 (defun compute-reductions (table states)
200 "Compute reduce actions for the generated parser, based on STATES.
201 Fills in TABLE with the reduce actions."
202 (dotimes (i (length states))
203 (do-for-each-item (item (aref states i))
204 (when (dot-at-end-p item)
205 (dolist (symbol (item-lookahead item))
206 (let ((action `(reduce ,(item-lhs item)
207 ,(length (item-rhs item)))))
208 (add-to-parse-table table i symbol action)))))))
211 (defun add-accept-actions (parse-table states)
212 "Finds states whose next token should be $ (EOF) and adds accept
213 actions to the parse table for those states."
214 (loop with n-states = (length states)
215 and item = (make-almost-done-item)
216 for i from 0 below n-states
217 when (find item (aref states i) :test #'equalp)
218 do (add-to-parse-table parse-table i *end-symbol* '(accept))))
221 (defun add-to-parse-table (parse-table state symbol action)
222 "Adds ACTION to the parse table at (SYMBOL,STATE). Applies braindead
223 conflict resolution rule to any conflicts detected."
224 (sunless (gethash symbol parse-table)
225 (setf it (make-hash-table :test 'equal)))
226 (let ((row (gethash symbol parse-table)))
227 (awhen (gethash state row)
228 (when (equal action it) (return-from add-to-parse-table))
229 (setf action (resolve-collision action it symbol state))
230 (warn "Preferring ~A." action))
231 (setf (gethash state row) action)))
234 (defun resolve-collision (new-action old-action symbol state)
235 ;; XXX should probably collate the number of conflicts
236 ;; somewhere.
237 (warn "~&Conflict at ~A,~A -> last action ~A, new action ~A."
238 symbol state old-action new-action)
239 (cond ((and (eql (car old-action) 'shift) (eql (car new-action) 'reduce))
240 old-action) ; S/R => prefer shift.
241 ((and (eql (car old-action) 'reduce) (eql (car new-action) 'reduce))
242 ;; R/R => prefer longer reduction.
243 (if (>= (third new-action) (third old-action))
244 new-action
245 old-action))
246 (t (error "This is an unexpected conflict (~A, ~A). Call a wizard."
247 old-action new-action))))
250 (defun create-parse-table ()
251 "Constructs a parse table usable by PARSE."
252 (let* ((parse-table (make-hash-table))
253 (states (compute-shifts parse-table)))
254 (compute-reductions parse-table states)
255 (add-accept-actions parse-table states)
256 parse-table))
260 (defun hash->tree (table)
261 (let ((acc))
262 (maphash #'(lambda (k v) (push (cons k (typecase v
263 (hash-table (hash->tree v))
264 (t v)))
265 acc))
266 table)
267 acc))
269 (defun tree->hash (tree)
270 (let ((ht (make-hash-table)))
271 (dolist (x tree)
272 (setf (gethash (car x) ht)
273 (if (listp (second x))
274 (tree->hash (cdr x))
275 (cdr x))))
276 ht))
278 ;;; XXX certainly not the most attractive way to do this, but I've
279 ;;; done worse...
280 (defun write-parser-function (table package stream fn-name)
281 (let* ((*package* (find-package "LALR-PARSER-GENERATOR"))
282 (fn-name (intern (if (stringp fn-name)
283 fn-name
284 (symbol-name fn-name)))))
285 (format stream ";; Automatically generated by LALR-PARSER-GENERATOR.")
286 (format stream "~&(in-package ~S)~%" (package-name package))
287 (pprint `(labels ((tree->hash (tree)
288 (let ((ht (make-hash-table)))
289 (dolist (x tree)
290 (setf (gethash (car x) ht)
291 (if (listp (second x))
292 (tree->hash (cdr x))
293 (cdr x))))
294 ht)))
295 (let ((table (tree->hash ',(hash->tree table))))
296 (defun ,fn-name (next-token)
297 "NEXT-TOKEN is a function which returns a cons of the next token in
298 the input (the CAR being the symbol name, the CDR being any
299 information the lexer would like to preserve), and advances the input
300 one token. Returns what might pass for a parse tree in some
301 countries."
302 (loop with stack = (list 0)
303 and token = (funcall next-token)
304 and result-stack
305 for row = (gethash (car token) table)
306 for action = (if row
307 (gethash (first stack) row)
308 (error "~A is not a valid token in this grammar."
309 token))
310 do (case (first action)
311 (shift (push token result-stack)
312 (setf token (funcall next-token))
313 (push (second action) stack))
314 (reduce (push (list (second action)) result-stack)
315 (dotimes (i (third action))
316 (pop stack)
317 (push (pop (cdr result-stack)) (cdar result-stack)))
318 (destructuring-bind (goto state)
319 (gethash (first stack) (gethash (second action) table))
320 (assert (eql goto 'goto) () "Malformed parse table!")
321 (push state stack)))
322 (accept (return (car result-stack)))
323 (t (error "Parse error at ~A" token))))))) stream)))
326 (defun parse (table next-token)
327 "TABLE is a table generated by CREATE-PARSE-TABLE, NEXT-TOKEN is a
328 function which returns a cons of the next token in the input (the CAR
329 being the symbol name, the CDR being any information the lexer would
330 like to preserve), and advances the input one token. Returns what
331 might pass for a parse tree in some countries."
332 (loop with stack = (list 0)
333 and token = (funcall next-token)
334 and result-stack
335 for row = (gethash (car token) table)
336 for action = (if row
337 (gethash (first stack) row)
338 (error "~A is not a valid token in this grammar."
339 token))
340 do (case (first action)
341 (shift (push token result-stack)
342 (setf token (funcall next-token))
343 (push (second action) stack))
344 (reduce (push (list (second action)) result-stack)
345 (dotimes (i (third action))
346 (pop stack)
347 (push (pop (cdr result-stack)) (cdar result-stack)))
348 (destructuring-bind (goto state)
349 (gethash (first stack) (gethash (second action) table))
350 (assert (eql goto 'goto) () "Malformed parse table!")
351 (push state stack)))
352 (accept (return (car result-stack)))
353 (t (error "Parse error at ~A" token)))))
355 ;;;; External functions
357 ;; XXX document this, improve interface
358 (defun make-parser (grammar &key end-symbol start-symbol
359 (stream *standard-output*)
360 (package *package*)
361 (fn-name 'parse))
362 "Writes a parser for GRAMMAR onto STREAM, with symbols in PACKAGE;
363 notably, with the parser name being FN-NAME (default of PARSE)."
364 (awhen end-symbol (setf *end-symbol* it))
365 (awhen start-symbol (setf *start-symbol* it))
366 (let ((*grammar* (process-grammar grammar)))
367 (multiple-value-bind (*first-set* *follow-set* *nullable-set*)
368 (compute-prediction-sets *grammar*)
369 (let ((table (create-parse-table)))
370 (write-parser-function table package stream fn-name)))))
373 ;;;; Testing stuff.
375 (defun test-parser (grammar string)
376 (let ((*grammar* (process-grammar grammar))
377 (*read-eval* nil))
378 (multiple-value-bind (*first-set* *follow-set* *nullable-set*)
379 (compute-prediction-sets *grammar*)
380 (with-input-from-string (*standard-input* string)
381 (parse (create-parse-table)
382 #'(lambda () (cons (handler-case (read)
383 (end-of-file () *end-symbol*))
384 nil)))))))