Refactored ugly setf-union stuff to unionf.
[lalr-parser-generator.git] / parser.lisp
blob7c42799d4c41433376a07a2eea6bce38fc0838fc
1 ;;; LALR parser generator.
2 ;;; Julian Squires / 2005
4 ;;; Things to do:
5 ;;;
6 ;;; Add some operator precedence controls.
7 ;;;
8 ;;; Write some usage information.
9 ;;;
10 ;;; When we preprocess the grammar, give every symbol a unique
11 ;;; integer, and then use bitvectors for all set operations. Keep a
12 ;;; bitvector to track terminal/nonterminal-ness.
13 ;;;
14 ;;; Code to convert yacc file into suitable grammar.
15 ;;;
16 ;;; Refactor heavily.
18 (in-package :lalr-parser-generator)
20 ;;;; Special variables.
22 (defparameter *start-symbol* 'start)
23 (defparameter *end-symbol* '$)
25 (defvar *grammar* nil
26 "The default grammar used by the LALR parser generator; set by
27 PROCESS-GRAMMAR.")
28 (defvar *first-set* nil)
29 (defvar *follow-set* nil)
30 (defvar *nullable-set* nil)
33 ;;;; LALR ITEMS
35 (defstruct item (lhs) (rhs) (dot) (lookahead))
37 ;;; XXX should these dot functions operate on the dot itself, rather
38 ;;; than calling item-dot? That would make it easier to hide the fact
39 ;;; that dot is just a list.
40 (defun dot-at-end-p (item) (endp (item-dot item)))
41 (defun symbol-at-dot (item) (car (item-dot item)))
42 (defun advance-dot (item)
43 "Returns the item dot, advanced by one symbol. Note:
44 non-destructive."
45 (cdr (item-dot item)))
48 ;;; item sets -- see also macros.lisp.
50 (defun make-item-set (&rest items)
51 (let ((set (make-array '(0) :adjustable t :fill-pointer 0)))
52 (dolist (i items) (add-to-set i set))
53 set))
56 (defun items-equal-except-lookahead-p (a b)
57 (every (lambda (x) (equal (funcall x a) (funcall x b)))
58 (list #'item-lhs #'item-rhs #'item-dot)))
60 (defun add-to-set (item set)
61 "Returns position of ITEM in SET."
62 (let* ((i (or (position item set :test #'items-equal-except-lookahead-p)
63 (vector-push-extend item set)))
64 (la-of-a (item-lookahead item))
65 (la-of-b (item-lookahead (aref set i))))
66 (unless (equal la-of-a la-of-b)
67 (unionf (item-lookahead (aref set i)) la-of-a))))
69 (defun item-set-equal-ignoring-la (set-a set-b)
70 (when (= (length set-a) (length set-b))
71 (every #'items-equal-except-lookahead-p set-a set-b)))
74 ;;;; GRAMMAR
76 (defun process-grammar (grammar)
77 "Processes GRAMMAR, returns a grammar suitable for binding to
78 *GRAMMAR*. Augments the grammar with a new start rule."
79 ;; we compile the basic hash table of non-terminals by iterating
80 ;; through the grammar, storing the lists of productions.
81 (let ((grammar-hash (make-hash-table)))
82 (do ((list-> grammar (cddr list->)))
83 ((null list->))
84 (setf (gethash (car list->) grammar-hash)
85 (cadr list->)))
87 (augment-grammar grammar-hash (car grammar))
88 grammar-hash))
90 (defun augment-grammar (hash first-real-symbol)
91 ;; augment grammar with start symbol
92 (dolist (i (list *start-symbol* *end-symbol*))
93 (assert (null (gethash i hash)) nil
94 "~A is a reserved non-terminal, unfortunately. Try
95 calling MAKE-PARSER with a different END-SYMBOL or START-SYMBOL
96 specified." i))
97 (setf (gethash *start-symbol* hash)
98 (list (list first-real-symbol *end-symbol*))))
100 (defun non-terminal-p (symbol) (gethash symbol *grammar*))
101 (defun grammar-productions (symbol) (gethash symbol *grammar*))
104 ;;;; PARSE TABLE CONSTRUCTION
106 (defun first-sets (symbol-list)
107 "Returns the union of the first sets of each symbol in SYMBOL-LIST,
108 until either a nullable symbol is found or we run out of symbols."
109 (do* ((x-> symbol-list (cdr x->))
110 (s (and x-> (gethash (car x->) *first-set*))
111 (union s (and x-> (gethash (car x->) *first-set*)))))
112 ((or (null x->) (not (gethash (car x->) *nullable-set*)))
113 s)))
115 (defun lalr-closure (item-set)
116 "Returns the closure of ITEM-SET."
117 (do-until-unchanged (item-set)
118 (do-for-each-item (i item-set)
119 (when (non-terminal-p (symbol-at-dot i))
120 (dolist (r (grammar-productions (symbol-at-dot i)))
121 (add-to-set (make-item :lhs (symbol-at-dot i) :rhs r :dot r
122 :lookahead (union (first-sets (advance-dot i))
123 (item-lookahead i)))
124 item-set)))))
125 item-set)
127 (defun lalr-goto (item-set grammar-symbol)
128 "Returns the closure of ITEM-SET after having read GRAMMAR-SYMBOL."
129 (let ((j (make-item-set)))
130 (do-for-each-item (i item-set)
131 (when (eql (symbol-at-dot i) grammar-symbol)
132 (add-to-set (make-item :lhs (item-lhs i) :rhs (item-rhs i)
133 :dot (advance-dot i)
134 :lookahead (item-lookahead i))
135 j)))
136 (lalr-closure j)))
138 (defun make-start-item ()
139 "Makes the item S' -> .S$, as appropriate for the grammar."
140 (make-item :lhs *start-symbol*
141 :rhs (first (gethash *start-symbol* *grammar*))
142 :dot (first (gethash *start-symbol* *grammar*))))
144 (defun make-almost-done-item ()
145 "Makes the item S' -> S.$, as appropriate for the grammar."
146 (let* ((start-item (make-start-item))
147 (dot (do ((dot (advance-dot start-item) (cdr dot)))
148 ((or (null dot) (eql (car dot) *end-symbol*)) dot))))
149 (assert (not (null dot)))
150 (make-item :lhs (item-lhs start-item) :rhs (item-rhs start-item)
151 :dot dot)))
155 (defun merge-lookahead-in-sets (src dst)
156 (macrolet ((la (set) `(item-lookahead (aref ,set i))))
157 (dotimes (i (length dst))
158 (unless (equal (la dst) (la src))
159 (unionf (la dst) (la src))))))
161 (defun add-to-states (set states)
162 "Adds SET to STATES, either by merging it with another set which is
163 identical save for look-ahead, or push it onto the end. Returns the
164 index in STATES."
165 (flet ((merge-existing ()
166 (loop for i below (length states)
167 and other-set across states
168 when (item-set-equal-ignoring-la set other-set)
169 do (merge-lookahead-in-sets set other-set)
170 (return i))))
171 (or (merge-existing) (vector-push-extend set states))))
173 (defun make-initial-state ()
174 (lalr-closure (make-item-set (make-start-item))))
176 (defun compute-shifts (table)
177 "Computes shift actions and states for the generated parser. Adds
178 shifts to the parse table as we find them. Returns the state table."
179 (let ((states (make-array '(1) :adjustable t :fill-pointer 1
180 :initial-element (make-initial-state))))
181 (do-until-unchanged (states) ;; XXX also table?
182 (dotimes (i (length states))
183 (do-for-each-item (item (aref states i))
184 (maybe-shift table states item i))))
185 states))
187 ;; XXX awful name; refactor.
188 (defun maybe-shift (table states item i)
189 (unless (or (dot-at-end-p item)
190 (eql (symbol-at-dot item) *end-symbol*))
191 (let* ((symbol (symbol-at-dot item))
192 (new-set (lalr-goto (aref states i) symbol))
193 (j (add-to-states new-set states))
194 (action (list (if (non-terminal-p symbol) 'goto 'shift) j)))
195 (add-to-parse-table table i symbol action))))
198 (defun compute-reductions (table states)
199 "Compute reduce actions for the generated parser, based on STATES.
200 Fills in TABLE with the reduce actions."
201 (dotimes (i (length states))
202 (do-for-each-item (item (aref states i))
203 (when (dot-at-end-p item)
204 (dolist (symbol (item-lookahead item))
205 (let ((action `(reduce ,(item-lhs item)
206 ,(length (item-rhs item)))))
207 (add-to-parse-table table i symbol action)))))))
210 (defun add-accept-actions (parse-table states)
211 "Finds states whose next token should be $ (EOF) and adds accept
212 actions to the parse table for those states."
213 (loop with n-states = (length states)
214 and item = (make-almost-done-item)
215 for i from 0 below n-states
216 when (find item (aref states i) :test #'equalp)
217 do (add-to-parse-table parse-table i *end-symbol* '(accept))))
220 (defun add-to-parse-table (parse-table state symbol action)
221 "Adds ACTION to the parse table at (SYMBOL,STATE). Applies braindead
222 conflict resolution rule to any conflicts detected."
223 (sunless (gethash symbol parse-table)
224 (setf it (make-hash-table :test 'equal)))
225 (let ((row (gethash symbol parse-table)))
226 (awhen (gethash state row)
227 (when (equal action it) (return-from add-to-parse-table))
228 (setf action (resolve-collision action it symbol state))
229 (warn "Preferring ~A." action))
230 (setf (gethash state row) action)))
233 (defun resolve-collision (new-action old-action symbol state)
234 ;; XXX should probably collate the number of conflicts
235 ;; somewhere.
236 (warn "~&Conflict at ~A,~A -> last action ~A, new action ~A."
237 symbol state old-action new-action)
238 (cond ((and (eql (car old-action) 'shift) (eql (car new-action) 'reduce))
239 old-action) ; S/R => prefer shift.
240 ((and (eql (car old-action) 'reduce) (eql (car new-action) 'reduce))
241 ;; R/R => prefer longer reduction.
242 (if (>= (third new-action) (third old-action))
243 new-action
244 old-action))
245 (t (error "This is an unexpected conflict (~A, ~A). Call a wizard."
246 old-action new-action))))
249 (defun create-parse-table ()
250 "Constructs a parse table usable by PARSE."
251 (let* ((parse-table (make-hash-table))
252 (states (compute-shifts parse-table)))
253 (compute-reductions parse-table states)
254 (add-accept-actions parse-table states)
255 parse-table))
259 (defun hash->tree (table)
260 (let ((acc))
261 (maphash #'(lambda (k v) (push (cons k (typecase v
262 (hash-table (hash->tree v))
263 (t v)))
264 acc))
265 table)
266 acc))
268 (defun tree->hash (tree)
269 (let ((ht (make-hash-table)))
270 (dolist (x tree)
271 (setf (gethash (car x) ht)
272 (if (listp (second x))
273 (tree->hash (cdr x))
274 (cdr x))))
275 ht))
277 ;;; XXX certainly not the most attractive way to do this, but I've
278 ;;; done worse...
279 (defun write-parser-function (table package stream fn-name)
280 (let* ((*package* (find-package "LALR-PARSER-GENERATOR"))
281 (fn-name (intern (if (stringp fn-name)
282 fn-name
283 (symbol-name fn-name)))))
284 (format stream ";; Automatically generated by LALR-PARSER-GENERATOR.")
285 (format stream "~&(in-package ~S)~%" (package-name package))
286 (pprint `(labels ((tree->hash (tree)
287 (let ((ht (make-hash-table)))
288 (dolist (x tree)
289 (setf (gethash (car x) ht)
290 (if (listp (second x))
291 (tree->hash (cdr x))
292 (cdr x))))
293 ht)))
294 (let ((table (tree->hash ',(hash->tree table))))
295 (defun ,fn-name (next-token)
296 "NEXT-TOKEN is a function which returns a cons of the next token in
297 the input (the CAR being the symbol name, the CDR being any
298 information the lexer would like to preserve), and advances the input
299 one token. Returns what might pass for a parse tree in some
300 countries."
301 (loop with stack = (list 0)
302 and token = (funcall next-token)
303 and result-stack
304 for row = (gethash (car token) table)
305 for action = (if row
306 (gethash (first stack) row)
307 (error "~A is not a valid token in this grammar."
308 token))
309 do (case (first action)
310 (shift (push token result-stack)
311 (setf token (funcall next-token))
312 (push (second action) stack))
313 (reduce (push (list (second action)) result-stack)
314 (dotimes (i (third action))
315 (pop stack)
316 (push (pop (cdr result-stack)) (cdar result-stack)))
317 (destructuring-bind (goto state)
318 (gethash (first stack) (gethash (second action) table))
319 (assert (eql goto 'goto) () "Malformed parse table!")
320 (push state stack)))
321 (accept (return (car result-stack)))
322 (t (error "Parse error at ~A" token))))))) stream)))
325 (defun parse (table next-token)
326 "TABLE is a table generated by CREATE-PARSE-TABLE, NEXT-TOKEN is a
327 function which returns a cons of the next token in the input (the CAR
328 being the symbol name, the CDR being any information the lexer would
329 like to preserve), and advances the input one token. Returns what
330 might pass for a parse tree in some countries."
331 (loop with stack = (list 0)
332 and token = (funcall next-token)
333 and result-stack
334 for row = (gethash (car token) table)
335 for action = (if row
336 (gethash (first stack) row)
337 (error "~A is not a valid token in this grammar."
338 token))
339 do (case (first action)
340 (shift (push token result-stack)
341 (setf token (funcall next-token))
342 (push (second action) stack))
343 (reduce (push (list (second action)) result-stack)
344 (dotimes (i (third action))
345 (pop stack)
346 (push (pop (cdr result-stack)) (cdar result-stack)))
347 (destructuring-bind (goto state)
348 (gethash (first stack) (gethash (second action) table))
349 (assert (eql goto 'goto) () "Malformed parse table!")
350 (push state stack)))
351 (accept (return (car result-stack)))
352 (t (error "Parse error at ~A" token)))))
354 ;;;; External functions
356 ;; XXX document this, improve interface
357 (defun make-parser (grammar &key end-symbol start-symbol
358 (stream *standard-output*)
359 (package *package*)
360 (fn-name 'parse))
361 "Writes a parser for GRAMMAR onto STREAM, with symbols in PACKAGE;
362 notably, with the parser name being FN-NAME (default of PARSE)."
363 (awhen end-symbol (setf *end-symbol* it))
364 (awhen start-symbol (setf *start-symbol* it))
365 (let ((*grammar* (process-grammar grammar)))
366 (multiple-value-bind (*first-set* *follow-set* *nullable-set*)
367 (compute-prediction-sets *grammar*)
368 (let ((table (create-parse-table)))
369 (write-parser-function table package stream fn-name)))))
372 ;;;; Testing stuff.
374 (defun test-parser (grammar string)
375 (let ((*grammar* (process-grammar grammar))
376 (*read-eval* nil))
377 (multiple-value-bind (*first-set* *follow-set* *nullable-set*)
378 (compute-prediction-sets *grammar*)
379 (with-input-from-string (*standard-input* string)
380 (parse (create-parse-table)
381 #'(lambda () (cons (handler-case (read)
382 (end-of-file () *end-symbol*))
383 nil)))))))