Continuing refactoring. Also fixed test fu.
[lalr-parser-generator.git] / parser.lisp
blob355fd9229bba347dc2f2ffaf9887fc5e38e832d0
1 ;;; LALR parser generator.
2 ;;; Julian Squires / 2005
4 ;;; notes for rewrite:
5 ;;;
6 ;;; When we preprocess the grammar, give every symbol a unique
7 ;;; integer, and then use bitvectors for all set operations. Keep a
8 ;;; bitvector to track terminal/nonterminal-ness.
9 ;;; (any benefit to doing this? should do some profiling.)
10 ;;;
11 ;;; Add a suite of tests using RT.
12 ;;;
13 ;;; Write some usage information.
14 ;;;
15 ;;; Add some operator precedence controls.
16 ;;;
17 ;;; Code to convert yacc file into suitable grammar.
19 (in-package :lalr-parser-generator)
21 ;;;; Special variables.
23 (defparameter *start-symbol* 'start)
24 (defparameter *end-symbol* '$)
26 (defvar *grammar* nil
27 "The default grammar used by the LALR parser generator; set by
28 PROCESS-GRAMMAR.")
29 (defvar *first-set* nil)
30 (defvar *follow-set* nil)
31 (defvar *nullable-set* nil)
34 ;;;; LALR ITEMS
36 (defstruct item (lhs) (rhs) (dot) (lookahead))
38 ;;; XXX should these dot functions operate on the dot itself, rather
39 ;;; than calling item-dot? That would make it easier to hide the fact
40 ;;; that dot is just a list.
41 (defun dot-at-end-p (item) (endp (item-dot item)))
42 (defun symbol-at-dot (item) (car (item-dot item)))
43 (defun advance-dot (item)
44 "Returns the item dot, advanced by one symbol. Note:
45 non-destructive."
46 (cdr (item-dot item)))
49 ;;; item sets -- see also macros.lisp.
51 (defun make-item-set (&rest items)
52 (let ((set (make-array '(0) :adjustable t :fill-pointer 0)))
53 (dolist (i items) (add-to-set i set))
54 set))
57 (defun items-equal-except-lookahead-p (a b)
58 (every (lambda (x) (equal (funcall x a) (funcall x b)))
59 (list #'item-lhs #'item-rhs #'item-dot)))
61 (defun add-to-set (item set)
62 "Returns position of ITEM in SET."
63 (let* ((i (or (position item set :test #'items-equal-except-lookahead-p)
64 (vector-push-extend item set)))
65 (la-of-a (item-lookahead item))
66 (la-of-b (item-lookahead (aref set i))))
67 (unless (equal la-of-a la-of-b)
68 (setf (item-lookahead (aref set i))
69 (union la-of-a la-of-b)))))
71 (defun item-set-equal-ignoring-la (set-a set-b)
72 (when (= (length set-a) (length set-b))
73 (every #'items-equal-except-lookahead-p set-a set-b)))
76 ;;;; GRAMMAR
78 (defun process-grammar (grammar)
79 "Processes GRAMMAR, returns a grammar suitable for binding to
80 *GRAMMAR*. Augments the grammar with a new start rule."
81 ;; we compile the basic hash table of non-terminals by iterating
82 ;; through the grammar, storing the lists of productions.
83 (let ((grammar-hash (make-hash-table)))
84 (do ((list-> grammar (cddr list->)))
85 ((null list->))
86 (setf (gethash (car list->) grammar-hash)
87 (cadr list->)))
89 (augment-grammar grammar-hash (car grammar))
90 grammar-hash))
92 (defun augment-grammar (hash first-real-symbol)
93 ;; augment grammar with start symbol
94 (dolist (i (list *start-symbol* *end-symbol*))
95 (assert (null (gethash i hash)) nil
96 "~A is a reserved non-terminal, unfortunately. Try
97 calling MAKE-PARSER with a different END-SYMBOL or START-SYMBOL
98 specified." i))
99 (setf (gethash *start-symbol* hash)
100 (list (list first-real-symbol *end-symbol*))))
102 (defun non-terminal-p (symbol) (gethash symbol *grammar*))
103 (defun grammar-productions (symbol) (gethash symbol *grammar*))
106 ;;;; PARSE TABLE CONSTRUCTION
108 (defun first-sets (symbol-list)
109 "Returns the union of the first sets of each symbol in SYMBOL-LIST,
110 until either a nullable symbol is found or we run out of symbols."
111 (do* ((x-> symbol-list (cdr x->))
112 (s (and x-> (gethash (car x->) *first-set*))
113 (union s (and x-> (gethash (car x->) *first-set*)))))
114 ((or (null x->) (not (gethash (car x->) *nullable-set*)))
115 s)))
117 (defun lalr-closure (item-set)
118 "Returns the closure of ITEM-SET."
119 (do-until-unchanged (item-set)
120 (do-for-each-item (i item-set)
121 (when (non-terminal-p (symbol-at-dot i))
122 (dolist (r (grammar-productions (symbol-at-dot i)))
123 (add-to-set (make-item :lhs (symbol-at-dot i) :rhs r :dot r
124 :lookahead (union (first-sets (advance-dot i))
125 (item-lookahead i)))
126 item-set)))))
127 item-set)
129 (defun lalr-goto (item-set grammar-symbol)
130 "Returns the closure of ITEM-SET after having read GRAMMAR-SYMBOL."
131 (let ((j (make-item-set)))
132 (do-for-each-item (i item-set)
133 (when (eql (symbol-at-dot i) grammar-symbol)
134 (add-to-set (make-item :lhs (item-lhs i) :rhs (item-rhs i)
135 :dot (advance-dot i)
136 :lookahead (item-lookahead i))
137 j)))
138 (lalr-closure j)))
140 (defun make-start-item ()
141 "Makes the item S' -> .S$, as appropriate for the grammar."
142 (make-item :lhs *start-symbol*
143 :rhs (first (gethash *start-symbol* *grammar*))
144 :dot (first (gethash *start-symbol* *grammar*))))
146 (defun make-almost-done-item ()
147 "Makes the item S' -> S.$, as appropriate for the grammar."
148 (let* ((start-item (make-start-item))
149 (dot (do ((dot (advance-dot start-item) (cdr dot)))
150 ((or (null dot) (eql (car dot) *end-symbol*)) dot))))
151 (assert (not (null dot)))
152 (make-item :lhs (item-lhs start-item) :rhs (item-rhs start-item)
153 :dot dot)))
157 (defun merge-lookahead-in-sets (src dst)
158 (macrolet ((la (set) `(item-lookahead (aref ,set i))))
159 (dotimes (i (length dst))
160 (unless (equal (la dst) (la src))
161 (setf (la dst) (union (la dst) (la src)))))))
163 (defun add-to-states (set states)
164 "Adds SET to STATES, either by merging it with another set which is
165 identical save for look-ahead, or push it onto the end. Returns the
166 index in STATES."
167 (flet ((merge-existing ()
168 (loop for i below (length states)
169 and other-set across states
170 when (item-set-equal-ignoring-la set other-set)
171 do (merge-lookahead-in-sets set other-set)
172 (return i))))
173 (or (merge-existing) (vector-push-extend set states))))
175 (defun make-initial-state ()
176 (lalr-closure (make-item-set (make-start-item))))
178 (defun compute-shifts (table)
179 "Computes shift actions and states for the generated parser. Adds
180 shifts to the parse table as we find them. Returns the state table."
181 (let ((states (make-array '(1) :adjustable t :fill-pointer 1
182 :initial-element (make-initial-state))))
183 (do-until-unchanged (states) ;; XXX also table?
184 (dotimes (i (length states))
185 (do-for-each-item (item (aref states i))
186 (maybe-shift table states item i))))
187 states))
189 ;; XXX awful name; refactor.
190 (defun maybe-shift (table states item i)
191 (unless (or (dot-at-end-p item)
192 (eql (symbol-at-dot item) *end-symbol*))
193 (let* ((symbol (symbol-at-dot item))
194 (new-set (lalr-goto (aref states i) symbol))
195 (j (add-to-states new-set states))
196 (action (list (if (non-terminal-p symbol) 'goto 'shift) j)))
197 (add-to-parse-table table i symbol action))))
200 (defun compute-reductions (table states)
201 "Compute reduce actions for the generated parser, based on STATES.
202 Fills in TABLE with the reduce actions."
203 (dotimes (i (length states))
204 (do-for-each-item (item (aref states i))
205 (when (dot-at-end-p item)
206 (dolist (symbol (item-lookahead item))
207 (let ((action `(reduce ,(item-lhs item)
208 ,(length (item-rhs item)))))
209 (add-to-parse-table table i symbol action)))))))
212 (defun add-accept-actions (parse-table states)
213 "Finds states whose next token should be $ (EOF) and adds accept
214 actions to the parse table for those states."
215 (loop with n-states = (length states)
216 and item = (make-almost-done-item)
217 for i from 0 below n-states
218 when (find item (aref states i) :test #'equalp)
219 do (add-to-parse-table parse-table i *end-symbol* '(accept))))
222 (defun add-to-parse-table (parse-table state symbol action)
223 "Adds ACTION to the parse table at (SYMBOL,STATE). Applies braindead
224 conflict resolution rule to any conflicts detected."
225 (sunless (gethash symbol parse-table)
226 (setf it (make-hash-table :test 'equal)))
227 (let ((row (gethash symbol parse-table)))
228 (awhen (gethash state row)
229 (when (equal action it) (return-from add-to-parse-table))
230 (setf action (resolve-collision action it symbol state)))
231 (setf (gethash state row) action)))
234 (defun resolve-collision (new-action old-action symbol state)
235 ;; XXX should probably collate the number of conflicts
236 ;; somewhere.
237 (warn "~&Conflict at ~A,~A -> last action ~A, new action ~A."
238 symbol state old-action new-action)
239 (cond ((and (eql (car old-action) 'shift) (eql (car new-action) 'reduce))
240 old-action) ; S/R => prefer shift.
241 ((and (eql (car old-action) 'reduce) (eql (car new-action) 'reduce))
242 ;; R/R => prefer longer reduction.
243 (if (>= (third new-action) (third old-action))
244 new-action
245 old-action))
246 (t (error "This is an unexpected conflict (~A, ~A). Call a wizard."
247 old-action new-action))))
250 (defun create-parse-table ()
251 "Constructs a parse table usable by PARSE."
252 (let* ((parse-table (make-hash-table))
253 (states (compute-shifts parse-table)))
254 (compute-reductions parse-table states)
255 (add-accept-actions parse-table states)
256 parse-table))
259 ;;; XXX certainly not the most attractive way to do this, but I've
260 ;;; done worse...
261 (defun write-parser-function (table package stream fn-name)
262 (let* ((*package* (find-package "LALR-PARSER-GENERATOR"))
263 (fn-name (intern (if (stringp fn-name)
264 fn-name
265 (symbol-name fn-name)))))
266 (format stream ";; Automatically generated by LALR-PARSER-GENERATOR.")
267 (format stream "~&(in-package ~S)~%" (package-name package))
268 (pprint `(flet ((unmash (entries)
269 (let ((ht (make-hash-table)))
270 (dolist (e entries)
271 (setf (gethash (car e) ht) (cdr e)))
272 ht)))
273 (let ((table (unmash ',(let ((untable))
274 (maphash (lambda (k v)
275 (push (cons k v)
276 untable))
277 table)
278 untable))))
279 (defun ,fn-name (next-token)
280 "NEXT-TOKEN is a function which returns a cons of the next token in
281 the input (the CAR being the symbol name, the CDR being any
282 information the lexer would like to preserve), and advances the input
283 one token. Returns what might pass for a parse tree in some
284 countries."
285 (loop with stack = (list 0)
286 and token = (funcall next-token)
287 and result-stack
288 for row = (gethash (car token) table)
289 for action = (if row
290 (gethash (first stack) row)
291 (error "~A is not a valid token in this grammar."
292 token))
293 do (case (first action)
294 (shift (push token result-stack)
295 (setf token (funcall next-token))
296 (push (second action) stack))
297 (reduce (push (list (second action)) result-stack)
298 (dotimes (i (third action))
299 (pop stack)
300 (push (pop (cdr result-stack)) (cdar result-stack)))
301 (destructuring-bind (goto state)
302 (gethash (first stack) (gethash (second action) table))
303 (assert (eql goto 'goto) () "Malformed parse table!")
304 (push state stack)))
305 (accept (return (car result-stack)))
306 (t (error "Parse error at ~A" token))))))))))
309 (defun parse (table next-token)
310 "TABLE is a table generated by CREATE-PARSE-TABLE, NEXT-TOKEN is a
311 function which returns a cons of the next token in the input (the CAR
312 being the symbol name, the CDR being any information the lexer would
313 like to preserve), and advances the input one token. Returns what
314 might pass for a parse tree in some countries."
315 (loop with stack = (list 0)
316 and token = (funcall next-token)
317 and result-stack
318 for row = (gethash (car token) table)
319 for action = (if row
320 (gethash (first stack) row)
321 (error "~A is not a valid token in this grammar."
322 token))
323 do (case (first action)
324 (shift (push token result-stack)
325 (setf token (funcall next-token))
326 (push (second action) stack))
327 (reduce (push (list (second action)) result-stack)
328 (dotimes (i (third action))
329 (pop stack)
330 (push (pop (cdr result-stack)) (cdar result-stack)))
331 (destructuring-bind (goto state)
332 (gethash (first stack) (gethash (second action) table))
333 (assert (eql goto 'goto) () "Malformed parse table!")
334 (push state stack)))
335 (accept (return (car result-stack)))
336 (t (error "Parse error at ~A" token)))))
338 ;;;; External functions
340 ;; XXX document this, improve interface
341 (defun make-parser (grammar &key end-symbol start-symbol
342 (stream *standard-output*)
343 (package *package*)
344 (fn-name 'parse))
345 "Writes a parser for GRAMMAR onto STREAM, with symbols in PACKAGE;
346 notably, with the parser name being FN-NAME (default of PARSE)."
347 (awhen end-symbol (setf *end-symbol* it))
348 (awhen start-symbol (setf *start-symbol* it))
349 (let ((*grammar* (process-grammar grammar)))
350 (multiple-value-bind (*first-set* *follow-set* *nullable-set*)
351 (compute-prediction-sets *grammar*)
352 (let ((table (create-parse-table)))
353 (write-parser-function table package stream fn-name)))))
356 ;;;; Testing stuff.
358 (defun test-parser (grammar string)
359 (let ((*grammar* (process-grammar grammar))
360 (*read-eval* nil))
361 (multiple-value-bind (*first-set* *follow-set* *nullable-set*)
362 (compute-prediction-sets *grammar*)
363 (with-input-from-string (*standard-input* string)
364 (parse (create-parse-table)
365 #'(lambda () (cons (handler-case (read)
366 (end-of-file () *end-symbol*))
367 nil)))))))