Some cleanups, improved #'MAKE-PARSER.
[lalr-parser-generator.git] / parser.lisp
blob83086625f22e6380687a76613c0f16122a1a39b4
1 ;;; LALR parser generator.
2 ;;; Julian Squires / 2005
4 ;;; notes for rewrite:
5 ;;;
6 ;;; When we preprocess the grammar, give every symbol a unique
7 ;;; integer, and then use bitvectors for all set operations. Keep a
8 ;;; bitvector to track terminal/nonterminal-ness.
9 ;;; (any benefit to doing this?)
10 ;;;
11 ;;; Add a suite of tests using RT.
12 ;;;
13 ;;; Write some usage information.
14 ;;;
15 ;;; Add some operator precedence controls.
16 ;;;
17 ;;; Code to convert yacc file into suitable grammar.
19 (in-package :lalr-parser-generator)
21 ;;;; Special variables.
23 (defparameter *start-symbol* 'start)
24 (defparameter *end-symbol* '$)
26 (defvar *grammar* nil
27 "The default grammar used by the LALR parser generator; set by
28 PROCESS-GRAMMAR.")
29 (defvar *first-set* nil)
30 (defvar *follow-set* nil)
31 (defvar *nullable-set* nil)
34 ;;;; LALR ITEMS
35 ;;; We could use a structure instead of a list here, and it would
36 ;;; probably be much more efficient. For the moment, it doesn't
37 ;;; matter.
39 (defstruct item (lhs) (rhs) (dot) (lookahead))
41 (defun dot-at-end-p (item) (endp (item-dot item)))
43 (defun symbol-at-dot (item) (car (item-dot item)))
45 (defun advance-dot (item)
46 "Returns the item dot, advanced by one symbol. Note:
47 non-destructive."
48 (cdr (item-dot item)))
51 ;;; item sets -- see also macros.lisp.
53 (defun make-item-set (&rest items)
54 (let ((set (make-array '(0) :adjustable t :fill-pointer 0)))
55 (dolist (i items)
56 (add-to-set i set))
57 set))
59 (defun add-to-set (item set)
60 "Returns position of ITEM in SET."
61 (or (dotimes (i (length set))
62 (when (and (equal (item-lhs item) (item-lhs (aref set i)))
63 (equal (item-rhs item) (item-rhs (aref set i)))
64 (equal (item-dot item) (item-dot (aref set i))))
65 (unless (equal (item-lookahead item) (item-lookahead
66 (aref set i)))
67 (setf (item-lookahead (aref set i))
68 (union (item-lookahead item)
69 (item-lookahead (aref set i)))))
70 (return i)))
71 ;(position item set :test #'equalp)
72 (vector-push-extend item set)))
75 ;;;; GRAMMAR
77 (defun process-grammar (grammar)
78 "Processes GRAMMAR, returns a grammar suitable for binding to
79 *GRAMMAR. Augments the grammar with a new start rule."
80 ;; split grammar into hash table of non-terminals, terminals.
82 ;; the grammar is a list of non-terminals followed by their
83 ;; productions.
85 ;; we compile the basic hash table of non-terminals by iterating
86 ;; through the grammar, storing the lists of productions.
87 (let ((grammar-hash (make-hash-table)))
88 (do ((list-> grammar (cddr list->)))
89 ((null list->))
90 (setf (gethash (car list->) grammar-hash)
91 (cadr list->)))
93 ;; augment grammar with start symbol
94 (dolist (i (list *start-symbol* *end-symbol*))
95 (assert (null (gethash i grammar-hash)) nil
96 "~A is a reserved non-terminal, unfortunately. Try
97 calling MAKE-PARSER with a different END-SYMBOL or START-SYMBOL
98 specified." i))
99 (setf (gethash *start-symbol* grammar-hash)
100 (list (list (car grammar) *end-symbol*)))
101 grammar-hash))
103 (defun non-terminal-p (symbol) (gethash symbol *grammar*))
104 (defun grammar-productions (symbol) (gethash symbol *grammar*))
107 ;;;; PARSE TABLE CONSTRUCTION
109 (defun first-sets (symbol-list)
110 (do* ((x-> symbol-list (cdr x->))
111 (s (and x-> (gethash (car x->) *first-set*))
112 (union s (and x-> (gethash (car x->) *first-set*)))))
113 ((or (null x->) (not (gethash (car x->) *nullable-set*)))
114 s)))
116 (defun lalr-closure (item-set)
117 "Returns the closure of ITEM-SET."
118 (do-until-unchanged (item-set)
119 (do-for-each-item (i item-set)
120 (when (non-terminal-p (symbol-at-dot i))
121 (dolist (r (grammar-productions (symbol-at-dot i)))
122 (add-to-set (make-item :lhs (symbol-at-dot i) :rhs r :dot r
123 :lookahead (union (first-sets (advance-dot i))
124 (item-lookahead i)))
125 item-set)))))
126 item-set)
128 (defun lalr-goto (item-set grammar-symbol)
129 "Returns the closure of ITEM-SET after having read GRAMMAR-SYMBOL."
130 (let ((j (make-item-set)))
131 (do-for-each-item (i item-set)
132 (when (eql (symbol-at-dot i) grammar-symbol)
133 (add-to-set (make-item :lhs (item-lhs i) :rhs (item-rhs i)
134 :dot (advance-dot i)
135 :lookahead (item-lookahead i))
136 j)))
137 (lalr-closure j)))
139 (defun make-start-item ()
140 "Makes the item S' -> .S$, as appropriate for the grammar."
141 (make-item :lhs *start-symbol*
142 :rhs (first (gethash *start-symbol* *grammar*))
143 :dot (first (gethash *start-symbol* *grammar*))))
145 (defun make-almost-done-item ()
146 "Makes the item S' -> S.$, as appropriate for the grammar."
147 (let* ((start-item (make-start-item))
148 (dot (do ((dot (advance-dot start-item) (cdr dot)))
149 ((or (null dot) (eql (car dot) *end-symbol*)) dot))))
150 (assert (not (null dot)))
151 (make-item :lhs (item-lhs start-item) :rhs (item-rhs start-item)
152 :dot dot)))
154 ;;; The code gets progressively uglier as I refine the data
155 ;;; structures. Shame on me.
157 #+nil
158 (defun item-set-equal (set-a set-b)
159 (do ((a set-a (cdr a))
160 (b set-b (cdr b)))
161 ((and (endp a) (endp b)) t)
162 (unless (and (equal (item-lhs (car a)) (item-lhs (car b)))
163 (equal (item-rhs (car a)) (item-rhs (car b)))
164 (equal (item-dot (car a)) (item-dot (car b))))
165 (return nil))))
167 (defun item-set-equal-ignoring-la (set-a set-b)
168 (block body
169 (when (= (length set-a) (length set-b))
170 (dotimes (i (length set-a))
171 (unless (and (equal (item-lhs (aref set-a i))
172 (item-lhs (aref set-b i)))
173 (equal (item-rhs (aref set-a i))
174 (item-rhs (aref set-b i)))
175 (equal (item-dot (aref set-a i))
176 (item-dot (aref set-b i))))
177 (return-from body nil)))
178 t)))
181 (defun merge-la-in-sets (dst src)
182 (dotimes (i (length dst))
183 (unless (equal (item-lookahead (aref dst i))
184 (item-lookahead (aref src i)))
185 (setf (item-lookahead (aref dst i))
186 (union (item-lookahead (aref dst i))
187 (item-lookahead (aref src i)))))))
189 (defun add-to-states (set states)
191 (dotimes (i (length states))
192 (when (item-set-equal-ignoring-la set (aref states i))
193 ;; find items which are same but for LA, merge LA.
194 (merge-la-in-sets (aref states i) set)
195 (return i)))
196 (vector-push-extend set states)))
198 (defun compute-shifts ()
199 "Compute shift actions and states for the generated parser. Returns
200 a list of shift actions and the state table."
201 (let ((shift-table nil)
202 (states (make-array '(1) :adjustable t :fill-pointer 1
203 :initial-element
204 (lalr-closure
205 (make-item-set (make-start-item))))))
206 (do-until-unchanged (states shift-table)
207 (dotimes (i (length states))
208 (do-for-each-item (item (aref states i))
209 (when (and (not (dot-at-end-p item))
210 (not (eql (symbol-at-dot item) *end-symbol*)))
211 (let* ((x (symbol-at-dot item))
212 (new-set (lalr-goto (aref states i) x))
213 (j (add-to-states new-set states)))
214 (pushnew (list i x j) shift-table :test #'equalp))))))
215 (values shift-table states)))
218 (defun compute-reductions (states)
219 "Compute reduce actions for the generated parser. Depends on
220 *STATE* already being filled, and returns the reduce actions."
221 (let ((reduce-table nil))
222 (dotimes (i (length states))
223 (do-for-each-item (item (aref states i))
224 (when (dot-at-end-p item)
225 (dolist (j (item-lookahead item))
226 (pushnew (list i j item) reduce-table :test #'equalp)))))
227 reduce-table))
230 (defun add-accept-actions (parse-table states)
231 (do* ((i 0 (1+ i))
232 (n-states (length states))
233 (item (make-almost-done-item)))
234 ((>= i n-states))
235 (when (find item (aref states i) :test #'equalp)
236 (add-to-parse-table parse-table n-states i *end-symbol* `(accept)))))
239 (defun add-to-parse-table (parse-table n-states i x action)
240 "Adds ACTION to the parse table at (X,I). Applies braindead
241 conflict resolution rule to any conflicts detected."
242 (sunless (gethash x parse-table)
243 (setf it (make-array (list n-states) :initial-element nil)))
244 (aif (aref (gethash x parse-table) i)
245 ;; XXX should probably collate the number of conflicts
246 ;; somewhere.
247 ;; XXX should resolve reduce/reduce conflicts by reducing by
248 ;; the largest rule.
249 (warn "~&Conflict at ~A,~A -> last action ~A, new action ~A."
250 x i it action)
251 ;; (assert (null (aref (gethash x parse-table) i)))
252 (setf (aref (gethash x parse-table) i) action)))
255 (defun create-parse-table (shifts reductions states)
256 "Constructs a parse table usable by PARSE, from the list of shift
257 and reduce actions, and the set of parse states."
258 (let ((parse-table (make-hash-table))
259 (n-states (length states)))
260 (dolist (shift shifts)
261 (destructuring-bind (i x j) shift
262 (add-to-parse-table parse-table n-states i x
263 (list (if (non-terminal-p x) 'goto 'shift) j))))
265 (dolist (reduce reductions)
266 (destructuring-bind (i x j) reduce
267 (add-to-parse-table parse-table n-states i x
268 `(reduce ,(item-lhs j) ,(length (item-rhs j))))))
269 (add-accept-actions parse-table states)
271 parse-table))
274 ;;; XXX certainly not the most attractive way to do this, but I've
275 ;;; done worse...
276 (defun write-parser-function (table package stream fn-name)
277 (let* ((*package* (find-package "LALR-PARSER-GENERATOR"))
278 (fn-name (intern (if (stringp fn-name)
279 fn-name
280 (symbol-name fn-name)))))
281 (format stream ";; Automatically generated by LALR-PARSER-GENERATOR.")
282 (format stream "~&(in-package ~S)~%" (package-name package))
283 (pprint `(flet ((unmash (entries)
284 (let ((ht (make-hash-table)))
285 (dolist (e entries)
286 (setf (gethash (car e) ht) (cdr e)))
287 ht)))
288 (let ((table (unmash ',(let ((untable))
289 (maphash (lambda (k v)
290 (push (cons k v)
291 untable))
292 table)
293 untable))))
294 (defun ,fn-name (next-token)
295 "NEXT-TOKEN is a function which returns a cons of the next token in
296 the input (the CAR being the symbol name, the CDR being any
297 information the lexer would like to preserve), and advances the input
298 one token. Returns what might pass for a parse tree in some
299 countries."
300 (do* ((stack (list 0))
301 (token (funcall next-token))
302 (result-stack nil)
303 (row (gethash (car token) table)
304 (gethash (car token) table)))
305 (nil)
306 (unless row
307 (error "~A is not a valid token in this grammar." token))
308 (let ((action (aref row (first stack))))
309 (case (first action)
310 (shift
311 (push token result-stack)
312 (setf token (funcall next-token))
313 (push (second action) stack))
314 (reduce
315 (push (list (second action)) result-stack)
316 (dotimes (i (third action))
317 (pop stack)
318 (push (pop (cdr result-stack)) (cdar result-stack)))
319 (destructuring-bind (goto state)
320 (aref (gethash (second action) table) (first stack))
321 (assert (eql goto 'goto) (state) "Malformed parse table!")
322 (push state stack)))
323 (accept (return (car result-stack)))
324 (t (error "Parse error at ~A" token)))))))))))
327 (defun parse (table next-token)
328 "TABLE is a table generated by CREATE-PARSE-TABLE, NEXT-TOKEN is a
329 function which returns a cons of the next token in the input (the CAR
330 being the symbol name, the CDR being any information the lexer would
331 like to preserve), and advances the input one token. Returns what
332 might pass for a parse tree in some countries."
333 (declare (optimize (debug 3)))
334 (do* ((stack (list 0))
335 (token (funcall next-token))
336 (result-stack nil)
337 (row (gethash (car token) table)
338 (gethash (car token) table)))
339 (nil)
340 (unless row
341 (error "~A is not a valid token in this grammar." token))
342 (let ((action (aref row (first stack))))
343 (case (first action)
344 (shift
345 (push token result-stack)
346 (setf token (funcall next-token))
347 (push (second action) stack))
348 (reduce
349 (push (list (second action)) result-stack)
350 (dotimes (i (third action))
351 (pop stack)
352 (push (pop (cdr result-stack)) (cdar result-stack)))
353 (destructuring-bind (goto state)
354 (aref (gethash (second action) table) (first stack))
355 (assert (eql goto 'goto) (state) "Malformed parse table!")
356 (push state stack)))
357 (accept (return (car result-stack)))
358 (t (error "Parse error at ~A" token))))))
361 ;;;; External functions
363 (defun make-parser (grammar &key end-symbol start-symbol
364 (stream *standard-output*)
365 (package *package*)
366 (fn-name 'parse))
367 (awhen end-symbol (setf *end-symbol* it))
368 (awhen start-symbol (setf *start-symbol* it))
369 (let ((*grammar* (process-grammar grammar)))
370 (process-grammar grammar)
371 (multiple-value-bind (*first-set* *follow-set* *nullable-set*)
372 (compute-prediction-sets *grammar*)
373 (multiple-value-bind (shifts states) (compute-shifts)
374 (let ((table (create-parse-table shifts
375 (compute-reductions states)
376 states)))
377 (write-parser-function table package stream fn-name))))))
380 ;;;; Testing stuff.
382 (defun test-parser (grammar string)
383 (let ((*grammar* (process-grammar grammar)))
384 (multiple-value-bind (*first-set* *follow-set* *nullable-set*)
385 (compute-prediction-sets *grammar*)
386 (with-input-from-string (*standard-input* string)
387 (multiple-value-bind (shifts states) (compute-shifts)
388 (parse (create-parse-table shifts
389 (compute-reductions states)
390 states)
391 (lambda () (cons (read) nil))))))))
393 (defparameter *lr0-test-grammar*
394 '(sentence ((open list close)
395 (variable))
396 list ((sentence)
397 (list comma sentence))))
399 (defparameter *slr-test-grammar*
400 '(E ((T + E) (T))
401 T ((x))))
403 (defparameter *simple-nullable-test-grammar*
404 '(Z ((d)
405 (X Y Z))
406 Y (nil
407 (c))
408 X ((Y)
409 (a))))
411 (defparameter *simple-lalr-test-grammar*
412 '(S ((E))
413 E ((E - T) (T))
414 T ((n) (OPEN E CLOSE))))