7a52b7ae06738221a56e81abde09b224c8105d00
[cl-opossum.git] / pegutils.lisp
blob7a52b7ae06738221a56e81abde09b224c8105d00
1 ;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; -*-
2 ;;;
3 ;;; pegutils.lisp --- Utility functions for implementing PEG parsers
5 ;; Copyright (C) 2008 Utz-Uwe Haus <lisp@uuhaus.de>
6 ;; $Id$
7 ;;
8 ;; This code is free software; you can redistribute it and/or modify
9 ;; it under the terms of the version 2.1 of the GNU Lesser General
10 ;; Public License as published by the Free Software Foundation, as
11 ;; clarified by the lisp prequel found in LICENSE.
13 ;; This code is distributed in the hope that it will be useful, but
14 ;; without any warranty; without even the implied warranty of
15 ;; merchantability or fitness for a particular purpose. See the GNU
16 ;; Lesser General Public License for more details.
18 ;; Version 2.1 of the GNU Lesser General Public License is in the file
19 ;; LICENSE that was distributed with this file. If it is not
20 ;; present, you can access it from
21 ;; http://www.gnu.org/copyleft/lgpl.txt (until superseded by a
22 ;; newer version) or write to the Free Software Foundation, Inc., 59
23 ;; Temple Place, Suite 330, Boston, MA 02111-1307 USA
25 ;; Commentary:
27 ;; Some code here is inspired by the metapeg library of John Leuner
29 ;;; Code:
32 (eval-when (:load-toplevel :compile-toplevel :execute)
33 (declaim (optimize (speed 0) (safety 3) (debug 3))))
35 (in-package #:opossum)
37 (defparameter *trace* nil "If non-nil, do extensive tracing of the parser functions.")
40 (defclass context ()
41 (;; these slots are copied when cloning a context for recursion
42 (input :accessor input :initarg :input :initform nil
43 :type 'string
44 :documentation "The input string being parsed.")
45 (dst-package :accessor dst-package :initarg :dst-package :initform nil
46 :type 'package
47 :documentation "The package into which symbols generated during the parse are interned.")
48 ;; these slots are shared by all cloned copies of a context -- use only STORE-ACTION to guarantee consistency
49 (actions :accessor actions :initarg :actions :initform (make-list 1 :initial-element NIL)
50 :type 'list
51 :documentation "The list of actions accumulated during the parse.")
52 (action-counter :accessor action-counter :initarg :action-counter :initform '(0)
53 :type '(cons (integer 0) null)
54 :documentation "The counter of actions.")
55 ;; these slots are what make a context unique
56 (parent :accessor parent :initarg :parent :initform nil)
57 (rule :accessor rule :initarg :rule :initform nil)
58 (children :accessor children :initform nil)
59 (value :accessor value :initarg :value :initform nil)
60 (start-index :accessor start-index :initarg :start-index :initform nil)
61 (end-index :accessor end-index :initarg :end-index :initform nil))
62 (:documentation "The parser context."))
64 (defmethod print-object ((obj context) stream)
65 (print-unreadable-object (obj stream :type T :identity NIL)
66 (format stream "rule ~A (~S) value ~S (~D:~D)"
67 (rule obj) (children obj) (value obj) (start-index obj) (end-index obj))))
69 (defmethod store-action ((ctx context) action)
70 (let ((a (actions ctx)))
71 (rplacd a (cons (car a) (cdr a)))
72 (rplaca a action)))
74 (defvar *context* nil "The current parser context.")
76 (defun clone-ctx (ctx rule)
77 "Create clone context of CTX for rule RULE."
78 (make-instance 'context
79 :input (input ctx)
80 :dst-package (dst-package ctx)
81 :actions (actions ctx)
82 :action-counter (action-counter ctx)
83 :parent ctx
84 :rule rule
85 :start-index (end-index ctx)))
87 (defun ctx-failed-p (ctx)
88 (unless ctx
89 (break "Botched context"))
90 (null (end-index ctx)))
92 (defun succeed (ctx value start-index end-index)
93 "Mark CTX as successful: set VALUE and matched region."
94 (setf (value ctx) value)
95 (setf (start-index ctx) start-index)
96 (setf (end-index ctx) end-index)
97 ; (break "generated success context ~A" ctx)
98 (when *trace*
99 (format *trace-output* "Matched: ~A (~D:~D)~%"
100 (rule ctx) (start-index ctx) (end-index ctx)))
101 ctx)
103 (defun fail ()
104 "Return a failure context."
105 (let ((ctx (make-instance 'context
106 :input (input *context*)
107 :rule ':fail
108 :value (rule *context*)
109 :start-index (start-index *context*)
110 :end-index (end-index *context*)
111 ;; probably some of these copies can be saved
112 :dst-package (dst-package *context*)
113 :actions (actions *context*)
114 :action-counter (action-counter *context*))))
115 ; (break "generated failure context ~A" ctx )
116 (when *trace*
117 (format *trace-output* "(failed: ~A ~A ~A)~%"
118 (value ctx) (start-index ctx) (end-index ctx)))
119 ctx))
123 (defun make-name (string)
124 (intern (concatenate 'string "parse-" string)
125 (dst-package *context*)))
127 (defun make-action-name (&key ctx)
128 "Return a symbol suitable to name the next action."
129 (incf (car (action-counter *context*)))
130 (let ((aname (if ctx
131 (format nil "opossum-action-~D-srcpos-~D-~D"
132 (car (action-counter *context*))
133 (start-index ctx)
134 (end-index ctx))
135 (format nil "opossum-action-~D-~D~D"
136 (car (action-counter *context*)))) ))
137 (intern aname (dst-package *context*))))
139 (defun char-list-to-string (char-list)
140 (coerce char-list 'string))
142 (defmacro build-parser-function (name parser)
143 `(lambda (offset)
144 (let* ((*context* (clone-ctx *context* ,name))
145 (result (funcall ,parser offset)))
146 (unless result
147 (break "Yow"))
148 (if (ctx-failed-p result)
149 (fail)
150 (succeed *context* (value result) (start-index result) (end-index result))))))
154 (defun match-string (string)
155 #'(lambda (offset)
156 (let ((input (input *context*))
157 (len (length string)))
158 (if (and (>= (length input) (+ offset len))
159 (string= string input :start2 offset :end2 (+ offset len)))
160 (succeed (clone-ctx *context* 'opossum-string) string offset (+ offset (length string)))
161 (fail)))))
163 (defun match-char (char-list)
164 #'(lambda (offset)
165 (let ((input (input *context*)))
166 (when *trace*
167 (format *trace-output* "match-char: looking for one of `~{~A~}'~%" char-list))
168 (if (and (> (length input) offset)
169 (member (char input offset)
170 char-list :test #'char=))
171 (succeed (clone-ctx *context* 'opossum-char) (char input offset) offset (+ offset 1))
172 (fail)))))
174 (defun match-octal-char-code (i1 i2 i3)
175 "Compare the character given by i3 + 8*i2 + 64*i1 to the next input character."
176 (let ((c (+ i3 (* 8 i2) (* 64 i1))))
177 #'(lambda (offset)
178 (let ((input (input *context*)))
179 (when *trace*
180 (format *trace-output* "match-octal-char-code: looking for ~D~%" c))
181 (if (and (> (length input) offset)
182 (= (char-int (char input offset)) c))
183 (succeed (clone-ctx *context* 'opossum-char) (char input offset) offset (+ offset 1))
184 (fail))))))
186 (defun match-char-range (lower-char upper-char)
187 "Match characters in the range between LOWER-CHAR and UPPER-CHAR (inclusive) as decided by CL:CHAR-CODE."
188 #'(lambda (offset)
189 (let ((input (input *context*)))
190 (when *trace*
191 (format *trace-output* "match-char-range: looking for ~A-~A~%" lower-char upper-char))
192 (if (and (> (length input) offset)
193 (let ((x (char-code (char input offset))))
194 (and (>= x (char-code lower-char))
195 (<= x (char-code upper-char)))))
196 (succeed (clone-ctx *context* 'opossum-char-range) (char input offset) offset (+ offset 1))
197 (fail)))))
199 (defun match-any-char (&optional ignored)
200 (declare (ignore ignored))
201 #'(lambda (offset)
202 "Match any character at OFFSET, fail only on EOF."
203 (let ((input (input *context*)))
204 (when *trace*
205 (format *trace-output* "match-any-char~%"))
206 (if (< (1+ offset) (length input))
207 (succeed (clone-ctx *context* 'opossum-anychar) (char input offset) offset (+ offset 1))
208 (fail)))))
210 (defun match-char-class (char-class)
211 "Regexp matching of next input character against [CHAR-CLASS] using cl-ppcre:scan."
212 (declare (type string char-class))
213 ;; FIXME: could use a pre-computed scanner
214 (let ((cc (format nil "[~A]" char-class)))
215 #'(lambda (offset)
216 "Match next character at OFFSET against the characters in CHAR-CLASS."
217 (let ((input (input *context*)))
218 (when *trace*
219 (format *trace-output* "match-char-class on ~A~%"))
220 (if (and (< (1+ offset) (length input))
221 (let ((c (char input offset)))
222 (cl-ppcre:scan cc (make-string 1 :initial-element c))))
223 (succeed (clone-ctx *context* 'opossum-charclass)
224 (char input offset) offset (+ offset 1))
225 (fail))))))
227 (defun fix-escape-sequences (char-list)
228 "Iterate over the list CHAR-LIST, glueing adjacent #\\ #\n and #\\ #\t chars into
229 #\Newline and #\Tab."
230 (cond
231 ((null char-list) char-list)
232 ((null (cdr char-list)) char-list)
234 (loop :with drop := nil
235 :for (c1 c2) :on char-list
236 :if drop
237 :do (setf drop nil)
238 :else
239 :collect (if (char= c1 #\\)
240 (case c2
241 ((#\n) (setf drop T) #\Newline)
242 ((#\t) (setf drop T) #\Tab)
243 ((#\r) (setf drop T) #\Linefeed)
244 (otherwise c1))
246 :end))))
249 ;; parsing combinator functions cribbed from libmetapeg
251 (defun either (&rest parsers)
252 "Produce a function that tries each of the functions in PARSERS sequentially until one succeeds and
253 returns the result of that function, or a failure context if none succeeded."
254 #'(lambda (offset)
255 (let ((*context* (clone-ctx *context* 'opossum-either)))
256 (when *trace*
257 (format *trace-output* "either: ~A ~A~%" *context* parsers))
258 (loop :for p :in parsers
259 :as result = (funcall p offset)
260 :when (not (ctx-failed-p result))
261 :return (succeed *context* (value result) offset (end-index result))
262 :finally (return (fail))))))
265 (defun optional (parser)
266 #'(lambda (offset)
267 (let ((*context* (clone-ctx *context* 'opossum-optional)))
268 (when *trace*
269 (format *trace-output* "optional: ~A ~A~%" *context* parser))
270 (let ((result (funcall parser offset)))
271 (if (ctx-failed-p result)
272 (succeed *context* 'optional offset offset)
273 (succeed *context* (value result) offset (end-index result)))))))
275 (defun follow (parser)
276 #'(lambda (offset)
277 (let ((*context* (clone-ctx *context* 'opossum-follow)))
278 (when *trace*
279 (format *trace-output* "follow: ~A ~A~%" *context* parser))
280 (let ((result (funcall parser offset)))
281 (if (ctx-failed-p result)
282 (fail)
283 (succeed *context* (value result)
284 ;; don't consume input
285 offset offset))))))
287 (defun many (parser)
288 #'(lambda (offset)
289 (let ((*context* (clone-ctx *context* 'opossum-many))
290 (start-offset offset)
291 children)
292 (when *trace*
293 (format *trace-output* "many: ~A ~A~%" *context* parser))
294 (loop :as result := (funcall parser offset)
295 :while (not (ctx-failed-p result))
296 :do (progn (push (value result) children)
297 (setf offset (end-index result)))
298 :finally (return (succeed *context* (nreverse children) start-offset offset))))))
301 (defun many1 (parser)
302 #'(lambda (offset)
303 (let* ((*context* (clone-ctx *context* 'opossum-many1))
304 (result (funcall parser offset)))
305 (when *trace*
306 (format *trace-output* "many1: ~A ~A~%" *context* parser))
307 (if (not (ctx-failed-p result))
308 (let ((result2 (funcall (many parser) (end-index result))))
309 (if (end-index result2)
310 (succeed *context* (cons (value result) (value result2)) offset (end-index result2))
311 (succeed *context* (value result) offset (end-index result))))
312 (fail)))))
315 (defun seq (&rest parsers)
316 #'(lambda (offset)
317 (assert (> (length parsers) 0))
318 (let ((*context* (clone-ctx *context* 'opossum-seq))
319 (start-offset offset)
320 child-values
321 child-nodes)
322 (when *trace*
323 (format *trace-output* "seq: ~A ~A~%" *context* parsers))
324 ;; run the parsers
325 (loop :for p :in parsers
326 :do (when *trace* (format *trace-output* " (seq ~A) trying ~A~%" *context* p))
327 :do (cond
328 ((consp p)
329 (push (succeed (clone-ctx *context* 'action) nil offset offset) child-nodes)
330 (push p child-values)
331 (setf (children *context*) (reverse child-nodes)))
333 (let ((result (funcall p offset)))
334 (if (end-index result)
335 (progn
336 (push result child-nodes)
337 (push (value result) child-values)
338 (setf offset (end-index result))
339 (setf (children *context*) (reverse child-nodes)))
340 (return (fail))))))
341 :finally (return (succeed *context* (reverse child-values) start-offset offset))))))
343 (defun negate (parser)
344 #'(lambda (offset)
345 "Return a successful context at OFFSET if PARSER succeeds, without advancing input position."
346 (let ((*context* (clone-ctx *context* 'opossum-negate)))
347 (let ((result (funcall parser offset)))
348 (if (ctx-failed-p result)
349 (succeed *context* 'negate offset offset)
350 (fail))))))
354 (defun read-stream (stream)
355 "Read STREAM and return a string of all its contents."
356 (let ((s ""))
357 (loop :as line := (read-line stream nil nil)
358 :while line
359 :do (setq s (concatenate 'string s line)))
362 (defun read-file (file)
363 "Read FILE and return a string of all its contents."
364 (with-open-file (f file :direction :input)
365 (let ((len (file-length f)))
366 (if len
367 (let ((s (make-string len)))
368 (read-sequence s f)
370 (read-stream f)))))
372 (defun make-default-dst-package (grammarfile)
373 (make-package (gensym "opossum-parser")
374 :documentation (format T "Opossum parser for grammar ~A" grammarfile)))
376 (defun get-iso-time ()
377 "Return a string in ISO format for the current time"
378 (multiple-value-bind (second minute hour date month year day daylight-p zone)
379 (get-decoded-time)
380 (declare (ignore day))
381 (format nil "~4D-~2,'0D-~2,'0D-~2,'0D:~2,'0D:~2,'0D (UCT~@D)"
382 year month date
383 hour minute second
384 (if daylight-p (1+ (- zone)) (- zone)))))
386 (defun cleanup-action-code (code)
387 "Remove trailing newlines so that the string CODE can be printed nicely."
388 (subseq code 0
389 (when (char= #\Newline (char code (1- (length code))))
390 (1+ (position #\Newline code :from-end T :test #'char/=)))))
392 (defmacro checking-parse (grammarfile parse-file-fun)
393 (let ((res (gensym "resultctx")))
394 `(let ((,res (funcall ,parse-file-fun ,grammarfile *package*)))
395 (cond
396 ((ctx-failed-p ,res)
397 (format *error-output* "Failed to parse PEG grammar ~A~%" ,grammarfile)
398 (error "Parsing ~A failed: ~A" ,grammarfile ,res))
399 ((< (end-index ,res) (length (input ,res)))
400 (format *error-output* "Parsed only ~D characters of grammar ~A~%" (end-index ,res) ,grammarfile)
401 ,res)
402 (T ,res)))))
404 (defun generate-parser-file (grammarfile dst-package dst-file &key start-rule (parse-file-fun #'opossum:parse-file))
405 "Create lisp code in DST-FILE that can be loaded to yield functions to parse using GRAMMARFILE in DST-PACKAGE.
406 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
407 (let* ((*package* dst-package)
408 (result (checking-parse grammarfile parse-file-fun))
409 ;; FIXME: check for complete parse
410 (*context* result) ;; routines in pegutils.lisp expect *context* to be bound properly
411 (dpkg (intern (package-name dst-package) :keyword)))
412 (let ((forms (transform (value result)))
413 (actions (actions result)))
414 (with-open-file (s dst-file :direction :output :if-exists :supersede)
415 (let ((*print-readably* T)
416 (*print-pretty* T)
417 (*print-circle* NIL))
418 (format s ";; This is a Common Lisp peg parser automatically generated by OPOSSUM -*- mode:lisp -*-~%")
419 (format s ";; generated from ~A on ~A~%" grammarfile (get-iso-time))
420 (prin1 `(eval-when (:load-toplevel :compile-toplevel :execute)
421 (declaim (optimize (speed 0) (safety 3) (debug 3))))
423 (terpri s)
424 (prin1 `(defpackage ,dpkg
425 (:use :cl :opossum)
426 (:export :parse-string :parse-file :parse-stream :*trace*))
428 (terpri s)
429 (prin1 `(in-package ,dpkg) s) (terpri s)
430 ;; First form is taken to be the start rule
431 (let ((entryrule (or (and start-rule (make-name start-rule))
432 (and forms (cadr (first forms))))))
433 (if (not entryrule)
434 (format *error-output* "Cannot find entry rule for parser")
435 (progn
436 (when *trace*
437 (format *trace-output* "Inserting definitions for parser entry points through ~A~%"
438 entryrule))
439 (terpri s)
440 (prin1 `(defun parse-string (,(intern :s dst-package) dst-package)
441 ,(format nil "Parse S using grammar ~A starting at ~A" grammarfile entryrule)
442 (let ((*context* (make-instance 'opossum:context
443 :dst-package dst-package
444 :input ,(intern :s dst-package))))
445 (funcall (,entryrule) 0)))
447 (terpri s)
448 (prin1 `(defun parse-file (,(intern :f dst-package) dst-package)
449 ,(format nil "Parse file F using grammar ~A starting at ~A" grammarfile entryrule)
450 (parse-string (opossum:read-file ,(intern :f dst-package)) dst-package))
452 (terpri s)
453 (prin1 `(defun parse-stream (,(intern :stream dst-package) dst-package)
454 ,(format nil "Parse stream F using grammar ~A starting at ~A" grammarfile entryrule)
455 (parse-string (opossum:read-stream ,(intern :stream dst-package)) dst-package))
457 (fresh-line s))))
458 (loop :for aform :in forms
459 :do (when *trace* (format *trace-output* "Inserting form ~A~%" aform))
460 :do (terpri s)
461 :do (prin1 aform s)
462 :do (fresh-line s))
463 (terpri s)
464 (prin1
465 `(defparameter ,(intern "*trace*" dst-package) nil
466 "When non-nil, the generated parser function log to cl:*trace-output*.")
468 (terpri s)
470 (loop :for (sym code) :in actions
471 :when sym ;; the final action is named NIL because we push a
472 ;; NIL ahead of us in store-actions
473 :do (when *trace* (format *trace-output* "Inserting defun for ~A~%" sym))
474 :and :do (format s "~%(defun ~S (data)~% (declare (ignorable data))~% ~A)~%"
475 sym (cleanup-action-code code))))))))
477 (defun generate-parser-package (grammarfile &key (dst-package (make-package (gensym "opossum-parser-")))
478 start-rule (parse-file-fun #'opossum:parse-file))
479 "Create functions to parse using GRAMMARFILE in DST-PACKAGE, starting ar rule named HEAD.
480 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
481 (let* ((*package* dst-package)
482 (result (checking-parse grammarfile parse-file-fun))
483 ;; FIXME: check for complete parse
484 (*context* result)) ;; routines in pegutils.lisp expect *context* to be bound properly)
485 (let ((forms (transform (value result)))
486 (actions (actions result)))
487 (format *trace-output* "Injecting parser functions into ~A~%" dst-package)
488 (break "~A, ~A" forms actions)
489 (use-package '(:cl :opossum) dst-package)
490 (let ((entryrule (or (and start-rule (make-name start-rule))
491 (and forms (cadr (first forms))))))
492 (if (not entryrule)
493 (format *error-output* "Cannot find entry rule for parser")
494 (progn
495 (when *trace*
496 (format *trace-output* "Inserting definitions for parser entry points through ~A~%"
497 entryrule))
498 (intern (compile 'parse-string
499 `(lambda (,(intern :s dst-package))
500 ,(format nil "Parse S using grammar ~A starting at ~A" grammarfile entryrule)
501 (let ((*context* (make-instance 'opossum:context
502 :dst-package ,dst-package
503 :input ,(intern :s dst-package))))
504 (funcall (,entryrule) 0))))
505 dst-package)
506 (intern (compile 'parse-file
507 `(lambda (,(intern :f dst-package))
508 ,(format nil "Parse file F using grammar ~A starting at ~A" grammarfile entryrule)
509 (parse-string (opossum:read-file ,(intern :f dst-package)))))
510 dst-package)
512 (intern (compile 'parse-stream
513 `(lambda (,(intern :stream dst-package))
514 ,(format nil "Parse stream F using grammar ~A starting at ~A" grammarfile entryrule)
515 (parse-string (opossum:read-stream ,(intern :stream dst-package)))))
516 dst-package))))
517 (intern '*trace* dst-package)
518 (setf (documentation '*trace* 'cl:variable)
519 "When non-nil, the generated parser function log to cl:*trace-output*.")
520 (export '(:parse-string :parse-file :parse-stream :*trace*) dst-package)
522 (loop :for aform :in forms
523 :do (when *trace*
524 (format *trace-output* "Injecting form ~A~%" aform))
525 :do (destructuring-bind (defun-sym name args &rest body)
526 aform
527 (declare (ignore defun-sym))
528 (intern (compile name `(lambda ,args ,@body)) dst-package)))
529 (loop :for (sym code) :in actions
530 :when sym
531 :do (when *trace* (format *trace-output* "Injecting definition for ~A~%" sym))
532 :and :do (intern (compile sym `(lambda (data) (declare (ignorable data)) ,code)) dst-package)))))
536 (defun transform (tree &optional (depth 0))
537 (if (and tree
538 (consp tree))
539 (if (eq (first tree) ':action)
540 (progn
541 (when *trace*
542 (format *trace-output* "~AFound action ~A~%" (make-string depth :initial-element #\Space) tree))
543 tree)
544 (let ((data (mapcar #'(lambda (tr) (transform tr (1+ depth)))
545 tree)))
546 (loop :for el :in data
547 :when (and (listp el)
548 (eq (first el) ':action)
549 (symbolp (third el)))
550 :do (let ((*package* (dst-package *context*))
551 (action (third el)))
552 (when *trace*
553 (format *trace-output* "~&Applying action ~A to ~A~%" action data))
554 (handler-case
555 (return-from transform
556 (funcall (symbol-function action) data))
557 (undefined-function (x)
558 (progn
559 (format *error-output* "missing definition for ~A: ~A~%" action x)
560 ; (break "~A in ~A" action *package*)
561 tree)))))
562 data))
563 tree))