Conversion of SLR parser generator to (very ineffecient) LALR parser generator.
[lalr-parser-generator.git] / parser.lisp
blobb5c2dee993106232a526e6881c6bdef147d0b47e
1 ;;; LALR parser generator.
2 ;;; Julian Squires / 2005
3 ;;;
4 ;;; Currently SLR, will be LALR after rewrite.
6 ;;; notes for rewrite:
7 ;;;
8 ;;; When we preprocess the grammar, give every symbol a unique
9 ;;; integer, and then use bitvectors for all set operations. Keep a
10 ;;; bitvector to track terminal/nonterminal-ness.
11 ;;;
12 ;;; Add a suite of tests using RT.
13 ;;;
14 ;;; write parse tables and functions to a file, so projects don't need
15 ;;; to even depend on this package to use their parser.
17 (in-package :lalr-parser-generator)
19 ;;;; Special variables.
21 (defparameter +start-symbol+ 'start)
22 (defparameter +end-symbol+ '$)
24 (defvar *grammar* nil
25 "The default grammar used by the LALR parser generator; set by
26 PROCESS-GRAMMAR.")
27 (defvar *states* nil
28 "A list of states seen by the parser generator. Constructed in
29 COMPUTE-SHIFTS, used in COMPUTE-REDUCTIONS.")
30 (defvar *first-set* nil)
31 (defvar *follow-set* nil)
32 (defvar *nullable-set* nil)
35 ;;;; LALR ITEMS
36 ;;; We could use a structure instead of a list here, and it would
37 ;;; probably be much more efficient. For the moment, it doesn't
38 ;;; matter.
40 (defun make-item (lhs rhs dot lookahead)
41 (list lhs rhs dot lookahead))
43 (defun item-lhs (item) (first item))
44 (defun item-rhs (item) (second item))
45 (defun item-dot (item) (third item))
46 (defun item-la (item) (fourth item))
48 (defun dot-at-end-p (item)
49 (endp (item-dot item)))
51 (defun symbol-at-dot (item)
52 (car (item-dot item)))
54 (defun advance-dot (item)
55 "Returns the item dot, advanced by one symbol. Note:
56 non-destructive."
57 (cdr (item-dot item)))
60 ;;; item sets -- see also macros.lisp.
62 (defun make-item-set (&rest items)
63 (let ((set (make-array '(0) :adjustable t :fill-pointer 0)))
64 (dolist (i items)
65 (add-to-set i set))
66 set))
68 (defun add-to-set (item set)
69 "Returns position of ITEM in SET."
70 (or (position item set :test #'equalp)
71 (vector-push-extend item set)))
74 ;;;; GRAMMAR
76 (defun process-grammar (grammar)
77 "Processes GRAMMAR, sets *GRAMMAR*. Augments the grammar with a new
78 start rule."
79 ;; split grammar into hash table of non-terminals, terminals.
81 ;; the grammar is a list of non-terminals followed by their
82 ;; productions.
84 ;; we compile the basic hash table of non-terminals by iterating
85 ;; through the grammar, storing the lists of productions.
86 (let ((grammar-hash (make-hash-table)))
87 (do ((list-> grammar (cddr list->)))
88 ((null list->))
89 (setf (gethash (car list->) grammar-hash)
90 (cadr list->)))
92 ;; augment grammar with start symbol
93 (dolist (i (list +start-symbol+ +end-symbol+))
94 (assert (null (gethash i grammar-hash)) nil
95 "~A is a reserved non-terminal, unfortunately. Try
96 calling MAKE-PARSER with a different END-SYMBOL or START-SYMBOL
97 specified." i))
98 (setf (gethash +start-symbol+ grammar-hash)
99 (list (list (car grammar) +end-symbol+)))
100 (setf *grammar* grammar-hash)))
102 (defun non-terminal-p (symbol) (gethash symbol *grammar*))
103 (defun grammar-productions (symbol) (gethash symbol *grammar*))
106 ;;;; FIRST and FOLLOW
108 (defun compute-prediction-sets ()
109 "Computes the first, follow, and nullable sets for *GRAMMAR*.
110 Sets *FIRST-SET*, *FOLLOW-SET*, and *NULLABLE-SET*."
111 (let ((nullable (make-hash-table))
112 (follow (make-hash-table))
113 (first (make-hash-table)))
114 (flet ((nullable-p (x) (gethash x nullable)))
115 (do-for-each-terminal (z *grammar*)
116 (setf (gethash z first) (list z)))
118 (do-until-unchanged (first follow nullable)
119 (do-for-each-production (x ys *grammar*)
120 (when (every #'nullable-p ys)
121 (setf (gethash x nullable) t))
123 (do ((i 0 (1+ i))
124 (k (length ys)))
125 ((>= i k))
127 (when (every #'nullable-p (and (> i 0) (subseq ys 0 (1- i))))
128 (setf (gethash x first)
129 (union (gethash x first)
130 (gethash (nth i ys) first))))
132 (when (every #'nullable-p (and (< i k) (subseq ys (1+ i) k)))
133 (setf (gethash (nth i ys) follow)
134 (union (gethash (nth i ys) follow)
135 (gethash x follow))))
137 (loop for j from (1+ i) to k
138 when (every #'nullable-p (and (> j (1+ i))
139 (subseq ys (1+ i) (1- j))))
140 do (setf (gethash (nth i ys) follow)
141 (union (gethash (nth i ys) follow)
142 (gethash (nth j ys) first)))))))
144 (setf *first-set* first *follow-set* follow *nullable-set* nullable)
145 (values first follow nullable))))
147 (defun first-sets (symbol-list)
148 (do* ((x-> symbol-list (cdr x->))
149 (s (and x-> (gethash (car x->) *first-set*))
150 (union s (and x-> (gethash (car x->) *first-set*)))))
151 ((or (null x->) (not (gethash (car x->) *nullable-set*)))
152 s)))
154 ;;; The following three functions are just for testing. Combined,
155 ;;; they perform the same functions as COMPUTE-PREDICTION-SETS
157 (defun list-nullable ()
158 (let ((nullable nil))
159 (do-until-unchanged (nullable)
160 (do-for-each-production (lhs rhs *grammar*)
161 (when (or (null rhs)
162 (every #'(lambda (x) (member x nullable)) rhs))
163 (pushnew lhs nullable))))
164 nullable))
166 (defun list-first-set (nullable)
167 (let ((first-set (make-hash-table)))
168 (do-for-each-terminal (x *grammar*)
169 (setf (gethash x first-set) (list x)))
170 (do-until-unchanged (first-set)
171 (do-for-each-production (lhs rhs *grammar*)
172 (do ((r-> rhs (cdr r->))
173 (done-p nil))
174 ((or done-p (null r->)))
175 (when (not (member (car r->) nullable))
176 (setf (gethash lhs first-set)
177 (union (gethash lhs first-set)
178 (gethash (car r->) first-set)))
179 (setf done-p t)))))
180 first-set))
182 (defun list-follow-set (nullable first-set)
183 (let ((follow-set (make-hash-table)))
184 (do-until-unchanged (follow-set)
185 (do-for-each-production (lhs rhs *grammar*)
186 (do ((r-> rhs (cdr r->))
187 (done-p nil))
188 ((or done-p (null r->)))
189 (when (every (lambda (x) (member x nullable)) (cdr r->))
190 (setf (gethash (car r->) follow-set)
191 (union (gethash (car r->) follow-set)
192 (gethash lhs follow-set))))
194 (loop for j from 1 to (length r->)
195 do (progn
196 (when (every (lambda (x) (member x nullable))
197 (and (> j 1) (subseq r-> 1 (1- j))))
198 (setf (gethash (car r->) follow-set)
199 (union (gethash (car r->) follow-set)
200 (gethash (nth j r->) first-set)))))))))
201 follow-set))
205 ;;;; PARSE TABLE CONSTRUCTION
207 (defun lalr-closure (item-set)
208 "Returns the closure of ITEM-SET."
209 (do-until-unchanged (item-set)
210 (do-for-each-item (i item-set)
211 (when (non-terminal-p (symbol-at-dot i))
212 (dolist (r (grammar-productions (symbol-at-dot i)))
213 (dolist (w (first-sets (append (advance-dot i)
214 (list (item-la i)))))
215 (add-to-set (make-item (symbol-at-dot i) r r w) item-set))))))
216 item-set)
218 (defun lalr-goto (item-set grammar-symbol)
219 "Returns the closure of ITEM-SET after having read GRAMMAR-SYMBOL."
220 (let ((j (make-item-set)))
221 (do-for-each-item (i item-set)
222 (when (eql (symbol-at-dot i) grammar-symbol)
223 (add-to-set (make-item (item-lhs i) (item-rhs i) (advance-dot i)
224 (item-la i))
225 j)))
226 (lalr-closure j)))
228 (defun make-start-item ()
229 "Makes the item S' -> .S$, as appropriate for the grammar."
230 (make-item +start-symbol+
231 (first (gethash +start-symbol+ *grammar*))
232 (first (gethash +start-symbol+ *grammar*))
233 nil))
235 (defun make-almost-done-item ()
236 "Makes the item S' -> S.$, as appropriate for the grammar."
237 (let* ((start-item (make-start-item))
238 (dot (do ((dot (advance-dot start-item) (cdr dot)))
239 ((or (null dot) (eql (car dot) +end-symbol+)) dot))))
240 (assert (not (null dot)))
241 (make-item (item-lhs start-item) (item-rhs start-item) dot
242 nil)))
244 ;;; The code gets progressively uglier as I refine the data
245 ;;; structures. Shame on me.
247 #+nil
248 (defun item-set-equal (set-a set-b)
249 (do ((a set-a (cdr a))
250 (b set-b (cdr b)))
251 ((and (endp a) (endp b)) t)
252 (unless (and (equal (item-lhs (car a)) (item-lhs (car b)))
253 (equal (item-rhs (car a)) (item-rhs (car b)))
254 (equal (item-dot (car a)) (item-dot (car b))))
255 (return nil))))
257 (defun item-set-equal (set-a set-b)
258 (block body
259 (when (= (length set-a) (length set-b))
260 (dotimes (i (length set-a))
261 (unless (and (equal (item-lhs (aref set-a i))
262 (item-lhs (aref set-b i)))
263 (equal (item-rhs (aref set-a i))
264 (item-rhs (aref set-b i)))
265 (equal (item-dot (aref set-a i))
266 (item-dot (aref set-b i))))
267 (return-from body nil)))
268 t)))
270 (defun add-to-states (set states)
271 (block body
272 (dotimes (i (length states))
273 (when (item-set-equal set (aref states i))
274 (return-from body i)))
275 (vector-push-extend set states)))
277 (defun compute-shifts ()
278 "Compute shift actions for the generated parser. Fills the *STATE*
279 variable and returns a list of shift actions."
280 (setf *states* (make-array '(1) :adjustable t :fill-pointer 1
281 :initial-element
282 (lalr-closure (make-item-set (make-start-item)))))
284 (let ((shift-table nil))
285 (do-until-unchanged (*states* shift-table)
286 (dotimes (i (length *states*))
287 (do-for-each-item (item (aref *states* i))
288 (when (and (not (dot-at-end-p item))
289 (not (eql (symbol-at-dot item) +end-symbol+)))
290 (let* ((x (symbol-at-dot item))
291 (new-set (lalr-goto (aref *states* i) x))
292 (j (add-to-states new-set *states*)))
293 (pushnew (list i x j) shift-table :test #'equalp))))))
294 shift-table))
297 (defun compute-reductions ()
298 "Compute reduce actions for the generated parser. Depends on
299 *STATE* already being filled, and returns the reduce actions."
300 (let ((reduce-table nil))
301 (do-for-each-item (item-set *states*)
302 (do-for-each-item (item item-set)
303 (when (dot-at-end-p item)
304 (pushnew (list (position item-set *states* :test #'equalp)
305 (item-la item) item)
306 reduce-table :test #'equalp))))
307 reduce-table))
310 (defun add-accept-actions (parse-table)
311 (do* ((i 0 (1+ i))
312 (item (make-almost-done-item)))
313 ((>= i (length *states*)))
314 (when (find item (aref *states* i) :test #'equalp)
315 (add-to-parse-table parse-table i +end-symbol+ (list 'accept)))))
318 (defun add-to-parse-table (parse-table i x action)
319 "Adds ACTION to the parse table at (X,I). Applies braindead
320 conflict resolution rule to any conflicts detected."
321 (anaphora:sunless (gethash x parse-table)
322 (setf anaphora:it (make-array (list (length *states*))
323 :initial-element nil)))
324 (aif (aref (gethash x parse-table) i)
325 ;; XXX should probably collate the number of conflicts somewhere.
326 (warn "~&Conflict at ~A,~A -> last action ~A, new action ~A."
327 x i it action)
328 ;; (assert (null (aref (gethash x parse-table) i)))
329 (setf (aref (gethash x parse-table) i) action)))
332 (defun create-parse-table (shifts reductions)
333 "Constructs a parse table usable by PARSE, from the list of shift
334 and reduce actions supplied as arguments, and from the set of states
335 stored in *STATES*, which COMPUTE-SHIFTS fills in."
336 (let ((parse-table (make-hash-table)))
337 (dolist (shift shifts)
338 (destructuring-bind (i x j) shift
339 (add-to-parse-table parse-table i x
340 (list (if (non-terminal-p x) 'goto 'shift) j))))
342 (dolist (reduce reductions)
343 (destructuring-bind (i x j) reduce
344 (add-to-parse-table parse-table i x
345 (list 'reduce (list (item-lhs j)
346 (length (item-rhs j)))))))
347 (add-accept-actions parse-table)
349 parse-table))
352 (defun parse (table next-token)
353 "TABLE is a table generated by CREATE-PARSE-TABLE, NEXT-TOKEN is a
354 function which returns a cons of the next token in the input (the CAR
355 being the symbol name, the CDR being any information the lexer would
356 like to preserve), and advances the input one token. Returns what
357 might pass for a parse tree in some countries."
358 (do* ((stack (list 0))
359 (token (funcall next-token))
360 (result-stack nil)
361 (row (gethash (car token) table)
362 (gethash (car token) table)))
363 (nil)
364 (unless row
365 (error "~A is not a valid token in this grammar." token))
366 (let ((action (aref row (first stack))))
367 (case (first action)
368 (shift
369 (push token result-stack)
370 (setf token (funcall next-token))
371 (push (second action) stack))
372 (reduce
373 (destructuring-bind (lhs rhs-len) (second action)
374 (push (list lhs) result-stack)
375 (dotimes (i rhs-len)
376 (pop stack)
377 (push (pop (cdr result-stack)) (cdar result-stack)))
378 (destructuring-bind (goto state)
379 (aref (gethash lhs table) (first stack))
380 (assert (eql goto 'goto) (state) "Malformed parse table!")
381 (push state stack))))
382 (accept (return (car result-stack)))
383 (t (error "Parse error at ~A" token))))))
386 ;;;; External functions
388 (defun make-parser (grammar &key end-symbol start-symbol)
389 (awhen end-symbol (setf +end-symbol+ it))
390 (awhen start-symbol (setf +start-symbol+ it))
391 (process-grammar grammar)
392 (compute-prediction-sets)
393 (let ((table (create-parse-table (compute-shifts) (compute-reductions))))
394 (lambda (&key (next-token (lambda () (list (read)))))
395 (parse table next-token))))
398 ;;;; Testing stuff.
400 (defun test-parser (grammar string)
401 (process-grammar grammar)
402 (compute-prediction-sets)
403 (with-input-from-string (*standard-input* string)
404 (parse (create-parse-table (compute-shifts) (compute-reductions))
405 (lambda () (cons (read) nil)))))
407 (defparameter *lr0-test-grammar*
408 '(sentence ((open list close)
409 (variable))
410 list ((sentence)
411 (list comma sentence))))
413 (defparameter *slr-test-grammar*
414 '(E ((T + E) (T))
415 T ((x))))
417 (defparameter *simple-nullable-test-grammar*
418 '(Z ((d)
419 (X Y Z))
420 Y (nil
421 (c))
422 X ((Y)
423 (a))))