Fix PEG grammar and add action support.
[cl-opossum.git] / pegutils.lisp
blobe9a9382bc92e833e8718669ab4f94fcd14c16e01
1 ;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; -*-
2 ;;;
3 ;;; pegutils.lisp --- Utility functions for implementing PEG parsers
5 ;; Copyright (C) 2008 Utz-Uwe Haus <lisp@uuhaus.de>
6 ;; $Id$
7 ;;
8 ;; This code is free software; you can redistribute it and/or modify
9 ;; it under the terms of the version 2.1 of the GNU Lesser General
10 ;; Public License as published by the Free Software Foundation, as
11 ;; clarified by the lisp prequel found in LICENSE.
13 ;; This code is distributed in the hope that it will be useful, but
14 ;; without any warranty; without even the implied warranty of
15 ;; merchantability or fitness for a particular purpose. See the GNU
16 ;; Lesser General Public License for more details.
18 ;; Version 2.1 of the GNU Lesser General Public License is in the file
19 ;; LICENSE that was distributed with this file. If it is not
20 ;; present, you can access it from
21 ;; http://www.gnu.org/copyleft/lgpl.txt (until superseded by a
22 ;; newer version) or write to the Free Software Foundation, Inc., 59
23 ;; Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 ;; Commentary:
27 ;; Some code here is inspired by the metapeg library of John Leuner
29 ;;; Code:
32 (eval-when (:load-toplevel :compile-toplevel :execute)
33 (declaim (optimize (speed 0) (safety 3) (debug 3))))
35 (in-package #:opossum)
37 (defparameter *trace* nil "If non-nil, do extensive tracing of the parser functions.")
40 (defclass context ()
41 (;; these slots are copied when cloning a context for recursion
42 (input :accessor input :initarg :input :initform nil
43 :type 'string
44 :documentation "The input string being parsed.")
45 (dst-package :accessor dst-package :initarg :dst-package :initform nil
46 :type 'package
47 :documentation "The package into which symbols generated during the parse are interned.")
48 ;; these slots are shared by all cloned copies of a context -- use only STORE-ACTION to guarantee consistency
49 (actions :accessor actions :initarg :actions :initform (make-list 1 :initial-element NIL)
50 :type 'list
51 :documentation "The list of actions accumulated during the parse.")
52 (action-counter :accessor action-counter :initarg :action-counter :initform '(0)
53 :type '(cons (integer 0) null)
54 :documentation "The counter of actions.")
55 ;; these slots are what make a context unique
56 (parent :accessor parent :initarg :parent :initform nil)
57 (rule :accessor rule :initarg :rule :initform nil)
58 (children :accessor children :initform nil)
59 (value :accessor value :initarg :value :initform nil)
60 (start-index :accessor start-index :initarg :start-index :initform nil)
61 (end-index :accessor end-index :initarg :end-index :initform nil)
62 (depth :accessor depth :initarg :depth :initform 0
63 :documentation "How deep in the tree is this context?"))
64 (:documentation "The parser context."))
66 (defmethod print-object ((obj context) stream)
67 (print-unreadable-object (obj stream :type T :identity NIL)
68 (format stream "rule ~A (~S) value ~S (~D:~D)"
69 (rule obj) (children obj) (value obj) (start-index obj) (end-index obj))))
71 (defmethod store-action ((ctx context) action)
72 (let ((a (actions ctx)))
73 (rplacd a (cons (car a) (cdr a)))
74 (rplaca a action)))
76 (defvar *context* nil "The current parser context.")
78 (defun clone-ctx (ctx rule)
79 "Create clone context of CTX for rule RULE."
80 (make-instance 'context
81 :input (input ctx)
82 :dst-package (dst-package ctx)
83 :actions (actions ctx)
84 :action-counter (action-counter ctx)
85 :parent ctx
86 :rule rule
87 :start-index (end-index ctx)
88 :depth (1+ (depth ctx))))
90 (defun ctx-failed-p (ctx)
91 (unless ctx
92 (break "Botched context"))
93 (null (end-index ctx)))
95 (defun succeed (ctx value start-index end-index)
96 "Mark CTX as successful: set VALUE and matched region."
97 (setf (value ctx) value)
98 (setf (start-index ctx) start-index)
99 (setf (end-index ctx) end-index)
100 ;; (when *trace*
101 ;; (format *trace-output* "Matched: ~A (~D:~D)~%"
102 ;; (rule ctx) (start-index ctx) (end-index ctx)))
103 ctx)
105 (defun fail ()
106 "Return a failure context."
107 (let ((ctx (make-instance 'context
108 :input (input *context*)
109 :rule ':fail
110 :value (rule *context*)
111 :start-index (start-index *context*)
112 :end-index (end-index *context*)
113 ;; probably some of these copies can be saved
114 :dst-package (dst-package *context*)
115 :actions (actions *context*)
116 :action-counter (action-counter *context*)
117 :depth (1+ (depth *context*)))))
118 ;; (when *trace*
119 ;; (format *trace-output* "(failed: ~A ~A ~A)~%"
120 ;; (value ctx) (start-index ctx) (end-index ctx)))
121 ctx))
125 (defun make-name (string)
126 (intern (concatenate 'string "parse-" string)
127 (dst-package *context*)))
129 (defun make-action-name (&key ctx)
130 "Return a symbol suitable to name the next action."
131 (incf (car (action-counter *context*)))
132 (let ((aname (if ctx
133 (format nil "opossum-action-~D-srcpos-~D-~D"
134 (car (action-counter *context*))
135 (start-index ctx)
136 (end-index ctx))
137 (format nil "opossum-action-~D"
138 (car (action-counter *context*)))) ))
139 (intern aname (dst-package *context*))))
141 (defun char-list-to-string (char-list)
142 (coerce char-list 'string))
144 (defmacro build-parser-function (name parser)
145 `(lambda (offset)
146 (let ((indent (depth *context*)))
147 (when *trace* (format *trace-output* "~vTTrying to parse a ~A at pos ~D~%" indent ,name offset))
148 (let* ((*context* (clone-ctx *context* ,name))
149 (result (funcall ,parser offset)))
150 (unless result
151 (error "Parser function ~A did not return a value" ,parser))
152 (if (ctx-failed-p result)
153 (progn
154 (when *trace* (format *trace-output* "~vT... no ~A at pos ~D~%" indent ,name offset))
155 (fail))
156 (progn
157 (when *trace* (format *trace-output* "~vT... found ~A at ~D:~D~%"
158 indent
159 ,name (start-index result) (end-index result)))
160 (succeed *context* (value result) (start-index result) (end-index result))))))))
164 (defun match-string (string)
165 #'(lambda (offset)
166 (let ((input (input *context*))
167 (len (length string)))
168 (if (and (>= (length input) (+ offset len))
169 (string= string input :start2 offset :end2 (+ offset len)))
170 (succeed (clone-ctx *context* 'opossum-string) string offset (+ offset (length string)))
171 (fail)))))
173 (defun match-char (char-list)
174 #'(lambda (offset)
175 (let ((input (input *context*)))
176 ;; (when *trace*
177 ;; (format *trace-output* "match-char: looking for one of `~{~A~}'~%" char-list))
178 (if (and (> (length input) offset)
179 (member (char input offset)
180 char-list :test #'char=))
181 (succeed (clone-ctx *context* 'opossum-char) (char input offset) offset (+ offset 1))
182 (fail)))))
184 (defun match-octal-char-code (i1 i2 i3)
185 "Compare the character given by i3 + 8*i2 + 64*i1 to the next input character."
186 (let ((c (+ i3 (* 8 i2) (* 64 i1))))
187 #'(lambda (offset)
188 (let ((input (input *context*)))
189 ;; (when *trace*
190 ;; (format *trace-output* "match-octal-char-code: looking for ~D~%" c))
191 (if (and (> (length input) offset)
192 (= (char-int (char input offset)) c))
193 (succeed (clone-ctx *context* 'opossum-char) (char input offset) offset (+ offset 1))
194 (fail))))))
196 (defun match-char-range (lower-char upper-char)
197 "Match characters in the range between LOWER-CHAR and UPPER-CHAR (inclusive) as decided by CL:CHAR-CODE."
198 #'(lambda (offset)
199 (let ((input (input *context*)))
200 ;; (when *trace*
201 ;; (format *trace-output* "match-char-range: looking for ~A-~A~%" lower-char upper-char))
202 (if (and (> (length input) offset)
203 (let ((x (char-code (char input offset))))
204 (and (>= x (char-code lower-char))
205 (<= x (char-code upper-char)))))
206 (succeed (clone-ctx *context* 'opossum-char-range) (char input offset) offset (+ offset 1))
207 (fail)))))
209 (defun match-any-char (&optional ignored)
210 (declare (ignore ignored))
211 #'(lambda (offset)
212 "Match any character at OFFSET, fail only on EOF."
213 (let ((input (input *context*)))
214 ;; (when *trace*
215 ;; (format *trace-output* "match-any-char~%"))
216 (if (< (1+ offset) (length input))
217 (succeed (clone-ctx *context* 'opossum-anychar) (char input offset) offset (+ offset 1))
218 (fail)))))
220 (defun match-char-class (char-class)
221 "Regexp matching of next input character against [CHAR-CLASS] using cl-ppcre:scan."
222 (declare (type string char-class))
223 ;; FIXME: could use a pre-computed scanner
224 (let ((cc (format nil "[~A]" char-class)))
225 #'(lambda (offset)
226 "Match next character at OFFSET against the characters in CHAR-CLASS."
227 (let ((input (input *context*)))
228 ;; (when *trace*
229 ;; (format *trace-output* "match-char-class on ~A~%"))
230 (if (and (< (1+ offset) (length input))
231 (let ((c (char input offset)))
232 (cl-ppcre:scan cc (make-string 1 :initial-element c))))
233 (succeed (clone-ctx *context* 'opossum-charclass)
234 (char input offset) offset (+ offset 1))
235 (fail))))))
237 (defun fix-escape-sequences (char-list)
238 "Iterate over the list CHAR-LIST, glueing adjacent #\\ #\n and #\\ #\t chars into
239 #\Newline and #\Tab."
240 (cond
241 ((null char-list) char-list)
242 ((null (cdr char-list)) char-list)
244 (loop :with drop := nil
245 :for (c1 c2) :on char-list
246 :if drop
247 :do (setf drop nil)
248 :else
249 :collect (if (char= c1 #\\)
250 (case c2
251 ((#\n) (setf drop T) #\Newline)
252 ((#\t) (setf drop T) #\Tab)
253 ((#\r) (setf drop T) #\Linefeed)
254 (otherwise c1))
256 :end))))
259 ;; parsing combinator functions cribbed from libmetapeg
261 (defun either (&rest parsers)
262 "Produce a function that tries each of the functions in PARSERS sequentially until one succeeds and
263 returns the result of that function, or a failure context if none succeeded."
264 #'(lambda (offset)
265 (let ((*context* (clone-ctx *context* 'opossum-either)))
266 ;; (when *trace*
267 ;; (format *trace-output* "either: ~A ~A~%" *context* parsers))
268 (loop :for p :in parsers
269 :as result = (funcall p offset)
270 :when (not (ctx-failed-p result))
271 :return (succeed *context* (value result) offset (end-index result))
272 :finally (return (fail))))))
275 (defun optional (parser)
276 #'(lambda (offset)
277 (let ((*context* (clone-ctx *context* 'opossum-optional)))
278 ;; (when *trace*
279 ;; (format *trace-output* "optional: ~A ~A~%" *context* parser))
280 (let ((result (funcall parser offset)))
281 (if (ctx-failed-p result)
282 (succeed *context* 'optional offset offset)
283 (succeed *context* (value result) offset (end-index result)))))))
285 (defun follow (parser)
286 #'(lambda (offset)
287 (let ((*context* (clone-ctx *context* 'opossum-follow)))
288 ;; (when *trace*
289 ;; (format *trace-output* "follow: ~A ~A~%" *context* parser))
290 (let ((result (funcall parser offset)))
291 (if (ctx-failed-p result)
292 (fail)
293 (succeed *context* (value result)
294 ;; don't consume input
295 offset offset))))))
297 (defun many (parser)
298 #'(lambda (offset)
299 (let ((*context* (clone-ctx *context* 'opossum-many))
300 (start-offset offset)
301 children)
302 ;; (when *trace*
303 ;; (format *trace-output* "many: ~A ~A~%" *context* parser))
304 (loop :as result := (funcall parser offset)
305 :while (not (ctx-failed-p result))
306 :do (progn (push (value result) children)
307 (setf offset (end-index result)))
308 :finally (return (succeed *context* (nreverse children) start-offset offset))))))
311 (defun many1 (parser)
312 #'(lambda (offset)
313 (let* ((*context* (clone-ctx *context* 'opossum-many1))
314 (result (funcall parser offset)))
315 ;; (when *trace*
316 ;; (format *trace-output* "many1: ~A ~A~%" *context* parser))
317 (if (not (ctx-failed-p result))
318 (let ((result2 (funcall (many parser) (end-index result))))
319 (if (end-index result2)
320 (succeed *context* (cons (value result) (value result2)) offset (end-index result2))
321 (succeed *context* (value result) offset (end-index result))))
322 (fail)))))
325 (defun seq (&rest parsers)
326 #'(lambda (offset)
327 (assert (> (length parsers) 0))
328 (let ((*context* (clone-ctx *context* 'opossum-seq))
329 (start-offset offset)
330 child-values
331 child-nodes)
332 ;; (when *trace*
333 ;; (format *trace-output* "seq: ~A ~A~%" *context* parsers))
334 ;; run the parsers
335 (loop :for p :in parsers
336 ;; :do (when *trace* (format *trace-output* " (seq ~A) trying ~A~%" *context* p))
337 :do (cond
338 ((consp p)
339 (push (succeed (clone-ctx *context* 'action) nil offset offset) child-nodes)
340 (push p child-values)
341 (setf (children *context*) (reverse child-nodes)))
343 (let ((result (funcall p offset)))
344 (if (end-index result)
345 (progn
346 (push result child-nodes)
347 (push (value result) child-values)
348 (setf offset (end-index result))
349 (setf (children *context*) (reverse child-nodes)))
350 (return (fail))))))
351 :finally (return (succeed *context* (reverse child-values) start-offset offset))))))
353 (defun negate (parser)
354 #'(lambda (offset)
355 "Return a successful context at OFFSET if PARSER succeeds, without advancing input position."
356 (let ((*context* (clone-ctx *context* 'opossum-negate)))
357 (let ((result (funcall parser offset)))
358 (if (ctx-failed-p result)
359 (succeed *context* 'negate offset offset)
360 (fail))))))
364 (defun read-stream (stream)
365 "Read STREAM and return a string of all its contents."
366 (let ((s ""))
367 (loop :as line := (read-line stream nil nil)
368 :while line
369 :do (setq s (concatenate 'string s line)))
372 (defun read-file (file)
373 "Read FILE and return a string of all its contents."
374 (with-open-file (f file :direction :input)
375 (let ((len (file-length f)))
376 (if len
377 (let ((s (make-string len)))
378 (read-sequence s f)
380 (read-stream f)))))
382 (defun make-default-dst-package (grammarfile)
383 (make-package (gensym "opossum-parser")
384 :documentation (format T "Opossum parser for grammar ~A" grammarfile)))
386 (defun get-iso-time ()
387 "Return a string in ISO format for the current time"
388 (multiple-value-bind (second minute hour date month year day daylight-p zone)
389 (get-decoded-time)
390 (declare (ignore day))
391 (format nil "~4D-~2,'0D-~2,'0D-~2,'0D:~2,'0D:~2,'0D (UCT~@D)"
392 year month date
393 hour minute second
394 (if daylight-p (1+ (- zone)) (- zone)))))
396 (defun cleanup-action-code (code)
397 "Remove trailing newlines so that the string CODE can be printed nicely."
398 (subseq code 0
399 (when (char= #\Newline (char code (1- (length code))))
400 (1+ (position #\Newline code :from-end T :test #'char/=)))))
402 (defmacro checking-parse (grammarfile parse-file-fun)
403 (let ((res (gensym "resultctx")))
404 `(let ((,res (funcall ,parse-file-fun ,grammarfile *package*)))
405 (cond
406 ((ctx-failed-p ,res)
407 (format *error-output* "Failed to parse PEG grammar ~A~%" ,grammarfile)
408 (error "Parsing ~A failed: ~A" ,grammarfile ,res))
409 ((< (end-index ,res) (length (input ,res)))
410 (format *error-output* "Parsed only ~D characters of grammar ~A~%" (end-index ,res) ,grammarfile)
411 ,res)
412 (T ,res)))))
414 (defun generate-parser-file (grammarfile dst-package dst-file &key start-rule (parse-file-fun #'opossum:parse-file))
415 "Create lisp code in DST-FILE that can be loaded to yield functions to parse using GRAMMARFILE in DST-PACKAGE.
416 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
417 (let* ((*package* dst-package)
418 (result (checking-parse grammarfile parse-file-fun))
419 ;; FIXME: check for complete parse
420 (*context* result) ;; routines in pegutils.lisp expect *context* to be bound properly
421 (dpkg (intern (package-name dst-package) :keyword)))
422 (let ((forms (transform (value result)))
423 (actions (actions result)))
424 (with-open-file (s dst-file :direction :output :if-exists :supersede)
425 (let ((*print-readably* T)
426 (*print-pretty* T)
427 (*print-circle* NIL))
428 (format s ";; This is a Common Lisp peg parser automatically generated by OPOSSUM -*- mode:lisp -*-~%")
429 (format s ";; generated from ~A on ~A~%" grammarfile (get-iso-time))
430 (prin1 `(eval-when (:load-toplevel :compile-toplevel :execute)
431 (declaim (optimize (speed 0) (safety 3) (debug 3))))
433 (terpri s)
434 (prin1 `(defpackage ,dpkg
435 (:use :cl :opossum)
436 (:export :parse-string :parse-file :parse-stream :*trace*))
438 (terpri s)
439 (prin1 `(in-package ,dpkg) s) (terpri s)
440 ;; First form is taken to be the start rule
441 (let ((entryrule (or (and start-rule (make-name start-rule))
442 (and forms (cadr (first forms))))))
443 (if (not entryrule)
444 (format *error-output* "Cannot find entry rule for parser")
445 (progn
446 (when *trace*
447 (format *trace-output* "Inserting definitions for parser entry points through ~A~%"
448 entryrule))
449 (terpri s)
450 (prin1 `(defun parse-string (,(intern :s dst-package) dst-package)
451 ,(format nil "Parse S using grammar ~A starting at ~A" grammarfile entryrule)
452 (let ((*context* (make-instance 'opossum:context
453 :dst-package dst-package
454 :input ,(intern :s dst-package))))
455 (funcall (,entryrule) 0)))
457 (terpri s)
458 (prin1 `(defun parse-file (,(intern :f dst-package) dst-package)
459 ,(format nil "Parse file F using grammar ~A starting at ~A" grammarfile entryrule)
460 (parse-string (opossum:read-file ,(intern :f dst-package)) dst-package))
462 (terpri s)
463 (prin1 `(defun parse-stream (,(intern :stream dst-package) dst-package)
464 ,(format nil "Parse stream F using grammar ~A starting at ~A" grammarfile entryrule)
465 (parse-string (opossum:read-stream ,(intern :stream dst-package)) dst-package))
467 (fresh-line s))))
468 (loop :for aform :in forms
469 :do (when *trace* (format *trace-output* "Inserting form ~A~%" aform))
470 :do (terpri s)
471 :do (prin1 aform s)
472 :do (fresh-line s))
473 (terpri s)
474 (prin1
475 `(defparameter ,(intern "*trace*" dst-package) nil
476 "When non-nil, the generated parser function log to cl:*trace-output*.")
478 (terpri s)
480 (loop :for (sym code) :in actions
481 :when sym ;; the final action is named NIL because we push a
482 ;; NIL ahead of us in store-actions
483 :do (when *trace* (format *trace-output* "Inserting defun for ~A~%" sym))
484 :and :do (format s "~%(defun ~S (data)~% (declare (ignorable data) (type list data))~% ~A)~%"
485 sym (cleanup-action-code code))))))))
487 (defun generate-parser-package (grammarfile &key (dst-package (make-package (gensym "opossum-parser-")))
488 start-rule (parse-file-fun #'opossum:parse-file))
489 "Create functions to parse using GRAMMARFILE in DST-PACKAGE, starting ar rule named HEAD.
490 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
491 (let* ((*package* dst-package)
492 (result (checking-parse grammarfile parse-file-fun))
493 ;; FIXME: check for complete parse
494 (*context* result)) ;; routines in pegutils.lisp expect *context* to be bound properly)
495 (let ((forms (transform (value result)))
496 (actions (actions result)))
497 (format *trace-output* "Injecting parser functions into ~A~%" dst-package)
498 (break "~A, ~A" forms actions)
499 (use-package '(:cl :opossum) dst-package)
500 (let ((entryrule (or (and start-rule (make-name start-rule))
501 (and forms (cadr (first forms))))))
502 (if (not entryrule)
503 (format *error-output* "Cannot find entry rule for parser")
504 (progn
505 (when *trace*
506 (format *trace-output* "Inserting definitions for parser entry points through ~A~%"
507 entryrule))
508 (intern (compile 'parse-string
509 `(lambda (,(intern :s dst-package))
510 ,(format nil "Parse S using grammar ~A starting at ~A" grammarfile entryrule)
511 (let ((*context* (make-instance 'opossum:context
512 :dst-package ,dst-package
513 :input ,(intern :s dst-package))))
514 (funcall (,entryrule) 0))))
515 dst-package)
516 (intern (compile 'parse-file
517 `(lambda (,(intern :f dst-package))
518 ,(format nil "Parse file F using grammar ~A starting at ~A" grammarfile entryrule)
519 (parse-string (opossum:read-file ,(intern :f dst-package)))))
520 dst-package)
522 (intern (compile 'parse-stream
523 `(lambda (,(intern :stream dst-package))
524 ,(format nil "Parse stream F using grammar ~A starting at ~A" grammarfile entryrule)
525 (parse-string (opossum:read-stream ,(intern :stream dst-package)))))
526 dst-package))))
527 (intern '*trace* dst-package)
528 (setf (documentation '*trace* 'cl:variable)
529 "When non-nil, the generated parser function log to cl:*trace-output*.")
530 (export '(:parse-string :parse-file :parse-stream :*trace*) dst-package)
532 (loop :for aform :in forms
533 :do (when *trace*
534 (format *trace-output* "Injecting form ~A~%" aform))
535 :do (destructuring-bind (defun-sym name args &rest body)
536 aform
537 (declare (ignore defun-sym))
538 (intern (compile name `(lambda ,args ,@body)) dst-package)))
539 (loop :for (sym code) :in actions
540 :when sym
541 :do (when *trace* (format *trace-output* "Injecting definition for ~A~%" sym))
542 :and :do (intern (compile sym `(lambda (data) (declare (ignorable data)) ,code)) dst-package)))))
546 (defun transform (tree &optional (depth 0))
547 (if (and tree
548 (consp tree))
549 (if (eq (first tree) ':action)
550 (progn
551 (when *trace*
552 (format *trace-output* "~AFound action ~A~%" (make-string depth :initial-element #\Space) tree))
553 tree)
554 (let ((data (mapcar #'(lambda (tr) (transform tr (1+ depth)))
555 tree)))
556 (loop :for el :in data
557 :when (and (listp el)
558 (eq (first el) ':action)
559 (symbolp (third el)))
560 :do (let ((*package* (dst-package *context*))
561 (action (third el)))
562 (when *trace*
563 (format *trace-output* "~&Applying action ~A to ~A~%" action data))
564 (handler-case
565 (return-from transform
566 (funcall (symbol-function action) data))
567 (undefined-function (x)
568 (progn
569 (format *error-output* "missing definition for ~A: ~A~%" action x)
570 ; (break "~A in ~A" action *package*)
571 tree)))))
572 data))
573 tree))