Bootstrapping infrastructure set up.
[cl-opossum.git] / pegutils.lisp
blob11c3f48dcb228e708776235f998f3072f42d33f5
1 ;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; -*-
2 ;;;
3 ;;; pegutils.lisp --- Utility functions for implementing PEG parsers
5 ;; Copyright (C) 2008 Utz-Uwe Haus <lisp@uuhaus.de>
6 ;; $Id$
7 ;; This code is free software; you can redistribute it and/or modify
8 ;; it under the terms of the version 3 of the GNU General
9 ;; Public License as published by the Free Software Foundation, as
10 ;; clarified by the prequel found in LICENSE.Lisp-GPL-Preface.
12 ;; This code is distributed in the hope that it will be useful, but
13 ;; without any warranty; without even the implied warranty of
14 ;; merchantability or fitness for a particular purpose. See the GNU
15 ;; Lesser General Public License for more details.
17 ;; Version 3 of the GNU General Public License is in the file
18 ;; LICENSE.GPL that was distributed with this file. If it is not
19 ;; present, you can access it from
20 ;; http://www.gnu.org/copyleft/gpl.txt (until superseded by a
21 ;; newer version) or write to the Free Software Foundation, Inc., 59
22 ;; Temple Place, Suite 330, Boston, MA 02111-1307 USA
24 ;; Commentary:
26 ;; Some code here is inspired by the metapeg library of John Leuner
28 ;;; Code:
31 (eval-when (:load-toplevel :compile-toplevel :execute)
32 (declaim (optimize (speed 0) (safety 3) (debug 3))))
34 (in-package #:opossum)
36 (defparameter *trace* nil "If non-nil, do extensive tracing of the parser functions.")
39 (defclass context ()
40 (;; these slots are copied when cloning a context for recursion
41 (input :accessor input :initarg :input :initform nil
42 :type 'string
43 :documentation "The input string being parsed.")
44 (destpkg :accessor destpkg :initarg :destpkg :initform nil
45 :type 'package
46 :documentation "The package into which symbols generated during the parse are interned.")
47 ;; these slots are shared by all cloned copies of a context -- use only STORE-ACTION to guarantee consistency
48 (actions :accessor actions :initarg :actions :initform '(NIL)
49 :type 'list
50 :documentation "The list of actions accumulated during the parse.")
51 (action-counter :accessor action-counter :initarg :action-counter :initform '(0)
52 :type '(cons (integer 0) null)
53 :documentation "The counter of actions.")
54 ;; these slots are what make a context unique
55 (parent :accessor parent :initarg :parent :initform nil)
56 (rule :accessor rule :initarg :rule :initform nil)
57 (children :accessor children :initform nil)
58 (value :accessor value :initarg :value :initform nil)
59 (start-index :accessor start-index :initarg :start-index :initform nil)
60 (end-index :accessor end-index :initarg :end-index :initform nil))
61 (:documentation "The parser context."))
63 (defmethod print-object ((obj context) stream)
64 (print-unreadable-object (obj stream :type T :identity NIL)
65 (format stream "rule ~A (~S) value ~S (~D:~D)"
66 (rule obj) (children obj) (value obj) (start-index obj) (end-index obj))))
68 (defmethod store-action ((ctx context) action)
69 (let ((a (actions ctx)))
70 (rplacd a (cons (car a) (cdr a)))
71 (rplaca a action)))
73 (defvar *context* nil "The current parser context.")
75 (defun clone-ctx (ctx rule)
76 "Create clone context of CTX for rule RULE."
77 (make-instance 'context
78 :input (input ctx)
79 :destpkg (destpkg ctx)
80 :actions (actions ctx)
81 :action-counter (action-counter ctx)
82 :parent ctx
83 :rule rule
84 :start-index (end-index ctx)))
86 (defun ctx-failed-p (ctx)
87 (unless ctx
88 (break "Botched context"))
89 (null (end-index ctx)))
91 (defun succeed (ctx value start-index end-index)
92 "Mark CTX as successful: set VALUE and matched region."
93 (setf (value ctx) value)
94 (setf (start-index ctx) start-index)
95 (setf (end-index ctx) end-index)
96 ; (break "generated success context ~A" ctx)
97 (when *trace*
98 (format *trace-output* "Matched: ~A (~D:~D)~%"
99 (rule ctx) (start-index ctx) (end-index ctx)))
100 ctx)
102 (defun fail ()
103 "Return a failure context."
104 (let ((ctx (make-instance 'context
105 :input (input *context*)
106 :rule ':fail
107 :value (rule *context*)
108 :start-index (start-index *context*)
109 :end-index (end-index *context*)
110 ;; probably some of these copies can be saved
111 :destpkg (destpkg *context*)
112 :actions (actions *context*)
113 :action-counter (action-counter *context*))))
114 ; (break "generated failure context ~A" ctx )
115 (when *trace*
116 (format *trace-output* "(failed: ~A ~A ~A)~%"
117 (value ctx) (start-index ctx) (end-index ctx)))
118 ctx))
122 (defun make-name (string)
123 (intern (concatenate 'string "parse-" string)
124 (destpkg *context*)))
126 (defun make-action-name (&key ctx)
127 "Return a symbol suitable to name the next action."
128 (incf (car (action-counter *context*)))
129 (let ((aname (if ctx
130 (format nil "opossum-action-~D-srcpos-~D-~D"
131 (car (action-counter *context*))
132 (start-index ctx)
133 (end-index ctx))
134 (format nil "opossum-action-~D-~D~D"
135 (car (action-counter *context*)))) ))
136 (intern aname (destpkg *context*))))
138 (defun char-list-to-string (char-list)
139 (coerce char-list 'string))
141 (defmacro build-parser-function (name parser)
142 `(let* ((*context* (clone-ctx *context* ,name))
143 (result (funcall ,parser offset)))
144 (unless result
145 (break "Yow"))
146 (if (ctx-failed-p result)
147 (fail)
148 (succeed *context* (value result) (start-index result) (end-index result)))))
152 (defun match-string (string)
153 #'(lambda (offset)
154 (let ((input (input *context*))
155 (len (length string)))
156 (if (and (>= (length input) (+ offset len))
157 (string= string input :start2 offset :end2 (+ offset len)))
158 (succeed (clone-ctx *context* 'opossum-string) string offset (+ offset (length string)))
159 (fail)))))
161 (defun match-char (char-list)
162 #'(lambda (offset)
163 (let ((input (input *context*)))
164 (when *trace*
165 (format *trace-output* "match-char: looking for one of `~{~A~}'~%" char-list))
166 (if (and (> (length input) offset)
167 (member (char input offset)
168 char-list :test #'char=))
169 (succeed (clone-ctx *context* 'opossum-char) (char input offset) offset (+ offset 1))
170 (fail)))))
172 (defun match-octal-char-code (i1 i2 i3)
173 "Compare the character given by i3 + 8*i2 + 64*i1 to the next input character."
174 (let ((c (+ i3 (* 8 i2) (* 64 i1))))
175 #'(lambda (offset)
176 (let ((input (input *context*)))
177 (when *trace*
178 (format *trace-output* "match-octal-char-code: looking for ~D~%" c))
179 (if (and (> (length input) offset)
180 (= (char-int (char input offset)) c))
181 (succeed (clone-ctx *context* 'opossum-char) (char input offset) offset (+ offset 1))
182 (fail))))))
184 (defun match-char-range (lower-char upper-char)
185 "Match characters in the range between LOWER-CHAR and UPPER-CHAR (inclusive) as decided by CL:CHAR-CODE."
186 #'(lambda (offset)
187 (let ((input (input *context*)))
188 (when *trace*
189 (format *trace-output* "match-char-range: looking for ~A-~A~%" lower-char upper-char))
190 (if (and (> (length input) offset)
191 (let ((x (char-code (char input offset))))
192 (and (>= x (char-code lower-char))
193 (<= x (char-code upper-char)))))
194 (succeed (clone-ctx *context* 'opossum-char-range) (char input offset) offset (+ offset 1))
195 (fail)))))
197 (defun match-any-char (&optional ignored)
198 (declare (ignore ignored))
199 #'(lambda (offset)
200 "Match any character at OFFSET, fail only on EOF."
201 (let ((input (input *context*)))
202 (when *trace*
203 (format *trace-output* "match-any-char~%"))
204 (if (< (1+ offset) (length input))
205 (succeed (clone-ctx *context* 'opossum-anychar) (char input offset) offset (+ offset 1))
206 (fail)))))
208 (defun match-char-class (char-class)
209 "Regexp matching of next input character against [CHAR-CLASS] using cl-ppcre:scan."
210 (declare (type string char-class))
211 ;; FIXME: could use a pre-computed scanner
212 (let ((cc (format nil "[~A]" char-class)))
213 #'(lambda (offset)
214 "Match next character at OFFSET against the characters in CHAR-CLASS."
215 (let ((input (input *context*)))
216 (when *trace*
217 (format *trace-output* "match-char-class on ~A~%"))
218 (if (and (< (1+ offset) (length input))
219 (let ((c (char input offset)))
220 (cl-ppcre:scan cc (make-string 1 :initial-element c))))
221 (succeed (clone-ctx *context* 'opossum-charclass)
222 (char input offset) offset (+ offset 1))
223 (fail))))))
225 (defun fix-escape-sequences (char-list)
226 "Iterate over the list CHAR-LIST, glueing adjacent #\\ #\n and #\\ #\t chars into
227 #\Newline and #\Tab."
228 (cond
229 ((null char-list) char-list)
230 ((null (cdr char-list)) char-list)
232 (loop :with drop := nil
233 :for (c1 c2) :on char-list
234 :if drop
235 :do (setf drop nil)
236 :else
237 :collect (if (char= c1 #\\)
238 (case c2
239 ((#\n) (setf drop T) #\Newline)
240 ((#\t) (setf drop T) #\Tab)
241 ((#\r) (setf drop T) #\Linefeed)
242 (otherwise c1))
244 :end))))
247 ;; parsing combinator functions cribbed from libmetapeg
249 (defun either (&rest parsers)
250 "Produce a function that tries each of the functions in PARSERS sequentially until one succeeds and
251 returns the result of that function, or a failure context if none succeeded."
252 #'(lambda (offset)
253 (let ((*context* (clone-ctx *context* 'opossum-either)))
254 (when *trace*
255 (format *trace-output* "either: ~A ~A~%" *context* parsers))
256 (loop :for p :in parsers
257 :as result = (funcall p offset)
258 :when (not (ctx-failed-p result))
259 :return (succeed *context* (value result) offset (end-index result))
260 :finally (return (fail))))))
263 (defun optional (parser)
264 #'(lambda (offset)
265 (let ((*context* (clone-ctx *context* 'opossum-optional)))
266 (when *trace*
267 (format *trace-output* "optional: ~A ~A~%" *context* parser))
268 (let ((result (funcall parser offset)))
269 (if (ctx-failed-p result)
270 (succeed *context* 'optional offset offset)
271 (succeed *context* (value result) offset (end-index result)))))))
273 (defun follow (parser)
274 #'(lambda (offset)
275 (let ((*context* (clone-ctx *context* 'opossum-follow)))
276 (when *trace*
277 (format *trace-output* "follow: ~A ~A~%" *context* parser))
278 (let ((result (funcall parser offset)))
279 (if (ctx-failed-p result)
280 (fail)
281 (succeed *context* (value result)
282 ;; don't consume input
283 offset offset))))))
285 (defun many (parser)
286 #'(lambda (offset)
287 (let ((*context* (clone-ctx *context* 'opossum-many))
288 (start-offset offset)
289 children)
290 (when *trace*
291 (format *trace-output* "many: ~A ~A~%" *context* parser))
292 (loop :as result := (funcall parser offset)
293 :while (not (ctx-failed-p result))
294 :do (progn (push (value result) children)
295 (setf offset (end-index result)))
296 :finally (return (succeed *context* (nreverse children) start-offset offset))))))
299 (defun many1 (parser)
300 #'(lambda (offset)
301 (let* ((*context* (clone-ctx *context* 'opossum-many1))
302 (result (funcall parser offset)))
303 (when *trace*
304 (format *trace-output* "many1: ~A ~A~%" *context* parser))
305 (if (not (ctx-failed-p result))
306 (let ((result2 (funcall (many parser) (end-index result))))
307 (if (end-index result2)
308 (succeed *context* (cons (value result) (value result2)) offset (end-index result2))
309 (succeed *context* (value result) offset (end-index result))))
310 (fail)))))
313 (defun seq (&rest parsers)
314 #'(lambda (offset)
315 (assert (> (length parsers) 0))
316 (let ((*context* (clone-ctx *context* 'opossum-seq))
317 (start-offset offset)
318 child-values
319 child-nodes)
320 (when *trace*
321 (format *trace-output* "seq: ~A ~A~%" *context* parsers))
322 ;; run the parsers
323 (loop :for p :in parsers
324 :do (when *trace* (format *trace-output* " (seq ~A) trying ~A~%" *context* p))
325 :do (cond
326 ((consp p)
327 (push (succeed (clone-ctx *context* 'action) nil offset offset) child-nodes)
328 (push p child-values)
329 (setf (children *context*) (reverse child-nodes)))
331 (let ((result (funcall p offset)))
332 (if (end-index result)
333 (progn
334 (push result child-nodes)
335 (push (value result) child-values)
336 (setf offset (end-index result))
337 (setf (children *context*) (reverse child-nodes)))
338 (return (fail))))))
339 :finally (return (succeed *context* (reverse child-values) start-offset offset))))))
341 (defun negate (parser)
342 #'(lambda (offset)
343 "Return a successful context at OFFSET if PARSER succeeds, without advancing input position."
344 (let ((*context* (clone-ctx *context* 'opossum-negate)))
345 (let ((result (funcall parser offset)))
346 (if (ctx-failed-p result)
347 (succeed *context* 'negate offset offset)
348 (fail))))))
352 (defun read-stream (stream)
353 "Read STREAM and return a string of all its contents."
354 (let ((s ""))
355 (loop :as line := (read-line stream nil nil)
356 :while line
357 :do (setq s (concatenate 'string s line)))
360 (defun read-file (file)
361 "Read FILE and return a string of all its contents."
362 (with-open-file (f file :direction :input)
363 (let ((len (file-length f)))
364 (if len
365 (let ((s (make-string len)))
366 (read-sequence s f)
368 (read-stream f)))))
370 (defun make-default-dst-package (grammarfile)
371 (make-package (gensym "opossum-parser")
372 :documentation (format T "Opossum parser for grammar ~A" grammarfile)))
374 (defun generate-parser-package (grammarfile dst-package head)
375 "Create functions to parse using GRAMMARFILE in DST-PACKAGE, starting ar rule named HEAD.
376 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
377 (let ((*package* dst-package))
378 (multiple-value-bind (form actions)
379 (opossum:parse-file grammarfile)
380 (format *debug-io* "Injecting parser functions into ~A~%" dst-package)
381 (loop :for aform :in form
382 :do (progn
383 (format *debug-io* "Skipping aform ~A~%" aform)))
384 (loop :for (sym code) :in actions
385 :do (intern (compile sym `(lambda (data) (declare (ignorable data)) ,code)))
386 :do (format *debug-io* "Compiled ~A~%" sym))
387 (intern (compile 'parse-string
388 #'(lambda (s)
389 `,(format nil "Parse S using grammar ~A starting at ~A" grammarfile head)
390 (let ((*context* (make-instance 'opossum:context :dstpkg *package* :input s)))
391 (funcall (make-name head) 0)))))
392 (intern (compile 'parse-file
393 #'(lambda (f) (parse-string (read-file f)))))
394 (intern (compile 'parse-stream
395 #'(lambda (s) (parse-string (read-stream s)))))
396 (intern '*trace*)
397 (setf (documentation '*trace* 'cl:variable)
398 "When non-nil, the generated parser function log to cl:*trace-output*.")
399 (export '(parse-string parse-file parse-stream *trace*)))))
401 (defun get-iso-time ()
402 "Return a string in ISO format for the current time"
403 (multiple-value-bind (second minute hour date month year day daylight-p zone)
404 (get-decoded-time)
405 (declare (ignore day))
406 (format nil "~4D-~2,'0D-~2,'0D-~2,'0D:~2,'0D:~2,'0D (UCT~@D)"
407 year month date
408 hour minute second
409 (if daylight-p (1+ (- zone)) (- zone)))))
411 (defun cleanup-action-code (code)
412 "Remove trailing newlines so that the string CODE can be printed nicely."
413 (subseq code 0
414 (when (char= #\Newline (char code (1- (length code))))
415 (1+ (position #\Newline code :from-end T :test #'char/=)))))
417 (defun generate-parser-file (grammarfile dst-package dst-file &optional start-rule)
418 "Create lisp code in DST-FILE that can be loaded to yield functions to parse using GRAMMARFILE in DST-PACKAGE.
419 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
420 (let ((*package* dst-package))
421 (let ((result (opossum:parse-file grammarfile)))
422 ;; FIXME: check for complete parse
423 (let ((*context* result)) ;; routines in pegutils.lisp expect *context* to be bound properly
424 (let ((forms (transform (value result)))
425 (actions (actions result)))
426 (with-open-file (s dst-file :direction :output :if-exists :supersede)
427 (let ((*print-readably* T)
428 (*print-pretty* T)
429 (*print-circle* NIL))
430 (format s ";; This is a Common Lisp peg parser automatically generated by OPOSSUM -*- mode:lisp -*-~%")
431 (format s ";; generated from ~A on ~A~%" grammarfile (get-iso-time))
432 (format s "(eval-when (:load-toplevel :compile-toplevel :execute) (declaim (optimize (speed 0) (safety 3) (debug 3))))~%~%")
433 (format s "(defpackage #:~A~%" (package-name dst-package))
434 (format s " (:use #:cl #:opossum)~%")
435 (format s " (:export #:parse-string #:parse-file #:parse-stream))~%")
436 (format s "~%(in-package #:~A)~%" (package-name dst-package))
437 ;; First form is taken to be the start rule
438 (let ((entryrule (or (and start-rule (make-name start-rule))
439 (and forms (cadr (first forms))))))
440 (if (not entryrule)
441 (format *error-output* "Cannot find entry rule for parser")
442 (progn
443 (format *trace-output* "Inserting definitions for parser entry points through ~A~%"
444 opossum::entryrule)
445 (terpri s)
446 (prin1 `(defun parse-string (,(intern :s dst-package))
447 ,(format nil "Parse S using grammar ~A starting at ~A" grammarfile entryrule)
448 (let ((*context* (make-instance 'opossum:context
449 :dstpkg ,(package-name dst-package)
450 :input ,(intern :s dst-package))))
451 (funcall (,entryrule) 0)))
453 (fresh-line s))))
454 (loop :for aform :in forms
455 :do (format *trace-output* "Inserting form ~A~%" aform)
456 :do (terpri s)
457 :do (prin1 aform s)
458 :do (fresh-line s))
459 (terpri s)
460 (prin1
461 `(defparameter ,(intern "*trace*" dst-package) nil
462 "When non-nil, the generated parser function log to cl:*trace-output*."))
463 (terpri s)
465 (loop :for (sym code) :in actions
466 :when sym ;; the final action is named NIL because we push a
467 ;; NIL ahead of us in store-actions
468 :do (format *trace-output* "Inserting defun for ~A~%" sym)
469 :and :do (format s "~%(defun ~S (data)~% ~A)~%"
470 sym (cleanup-action-code code))))))))))
472 (defun transform (tree &optional (depth 0))
473 (if (and tree
474 (consp tree))
475 (if (eq (first tree) ':action)
476 (progn
477 (when *trace*
478 (format *trace-output* "~AFound action ~A~%" (make-string depth :initial-element #\Space) tree))
479 tree)
480 (let ((data (mapcar #'(lambda (tr) (transform tr (1+ depth)))
481 tree)))
482 (loop :for el :in data
483 :when (and (listp el)
484 (eq (first el) ':action)
485 (symbolp (third el)))
486 :do (let ((*package* (destpkg *context*))
487 (action (third el)))
488 (when *trace*
489 (format *trace-output* "~&Applying action ~A to ~A~%" action data))
490 (handler-case
491 (return-from transform
492 (funcall (symbol-function action) data))
493 (undefined-function (x)
494 (progn
495 (format *error-output* "missing definition for ~A: ~A~%" action x)
496 ; (break "~A in ~A" action *package*)
497 tree)))))
498 data))
499 tree))