1 ;;; -*- Mode: Lisp; Syntax: ANSI-Common-Lisp; Base: 10; -*-
3 ;;; pegutils.lisp --- Utility functions for implementing PEG parsers
5 ;; Copyright (C) 2008 Utz-Uwe Haus <lisp@uuhaus.de>
7 ;; This code is free software; you can redistribute it and/or modify
8 ;; it under the terms of the version 3 of the GNU General
9 ;; Public License as published by the Free Software Foundation, as
10 ;; clarified by the prequel found in LICENSE.Lisp-GPL-Preface.
12 ;; This code is distributed in the hope that it will be useful, but
13 ;; without any warranty; without even the implied warranty of
14 ;; merchantability or fitness for a particular purpose. See the GNU
15 ;; Lesser General Public License for more details.
17 ;; Version 3 of the GNU General Public License is in the file
18 ;; LICENSE.GPL that was distributed with this file. If it is not
19 ;; present, you can access it from
20 ;; http://www.gnu.org/copyleft/gpl.txt (until superseded by a
21 ;; newer version) or write to the Free Software Foundation, Inc., 59
22 ;; Temple Place, Suite 330, Boston, MA 02111-1307 USA
26 ;; Some code here is inspired by the metapeg library of John Leuner
31 (eval-when (:load-toplevel
:compile-toplevel
:execute
)
32 (declaim (optimize (speed 0) (safety 3) (debug 3))))
34 (in-package #:opossum
)
36 (defparameter *trace
* nil
"If non-nil, do extensive tracing of the parser functions.")
40 (;; these slots are copied when cloning a context for recursion
41 (input :accessor input
:initarg
:input
:initform nil
43 :documentation
"The input string being parsed.")
44 (destpkg :accessor destpkg
:initarg
:destpkg
:initform nil
46 :documentation
"The package into which symbols generated during the parse are interned.")
47 ;; these slots are shared by all cloned copies of a context -- use only STORE-ACTION to guarantee consistency
48 (actions :accessor actions
:initarg
:actions
:initform
'(NIL)
50 :documentation
"The list of actions accumulated during the parse.")
51 (action-counter :accessor action-counter
:initarg
:action-counter
:initform
'(0)
52 :type
'(cons (integer 0) null
)
53 :documentation
"The counter of actions.")
54 ;; these slots are what make a context unique
55 (parent :accessor parent
:initarg
:parent
:initform nil
)
56 (rule :accessor rule
:initarg
:rule
:initform nil
)
57 (children :accessor children
:initform nil
)
58 (value :accessor value
:initarg
:value
:initform nil
)
59 (start-index :accessor start-index
:initarg
:start-index
:initform nil
)
60 (end-index :accessor end-index
:initarg
:end-index
:initform nil
))
61 (:documentation
"The parser context."))
63 (defmethod print-object ((obj context
) stream
)
64 (print-unreadable-object (obj stream
:type T
:identity NIL
)
65 (format stream
"rule ~A (~S) value ~S (~D:~D)"
66 (rule obj
) (children obj
) (value obj
) (start-index obj
) (end-index obj
))))
68 (defmethod store-action ((ctx context
) action
)
69 (let ((a (actions ctx
)))
70 (rplacd a
(cons (car a
) (cdr a
)))
73 (defvar *context
* nil
"The current parser context.")
75 (defun clone-ctx (ctx rule
)
76 "Create clone context of CTX for rule RULE."
77 (make-instance 'context
79 :destpkg
(destpkg ctx
)
80 :actions
(actions ctx
)
81 :action-counter
(action-counter ctx
)
84 :start-index
(end-index ctx
)))
86 (defun ctx-failed-p (ctx)
88 (break "Botched context"))
89 (null (end-index ctx
)))
91 (defun succeed (ctx value start-index end-index
)
92 "Mark CTX as successful: set VALUE and matched region."
93 (setf (value ctx
) value
)
94 (setf (start-index ctx
) start-index
)
95 (setf (end-index ctx
) end-index
)
96 ; (break "generated success context ~A" ctx)
98 (format *trace-output
* "Matched: ~A (~D:~D)~%"
99 (rule ctx
) (start-index ctx
) (end-index ctx
)))
103 "Return a failure context."
104 (let ((ctx (make-instance 'context
105 :input
(input *context
*)
107 :value
(rule *context
*)
108 :start-index
(start-index *context
*)
109 :end-index
(end-index *context
*)
110 ;; probably some of these copies can be saved
111 :destpkg
(destpkg *context
*)
112 :actions
(actions *context
*)
113 :action-counter
(action-counter *context
*))))
114 ; (break "generated failure context ~A" ctx )
116 (format *trace-output
* "(failed: ~A ~A ~A)~%"
117 (value ctx
) (start-index ctx
) (end-index ctx
)))
122 (defun make-name (string)
123 (intern (concatenate 'string
"parse-" string
)
124 (destpkg *context
*)))
126 (defun make-action-name (&key ctx
)
127 "Return a symbol suitable to name the next action."
128 (incf (car (action-counter *context
*)))
130 (format nil
"opossum-action-~D-srcpos-~D-~D"
131 (car (action-counter *context
*))
134 (format nil
"opossum-action-~D-~D~D"
135 (car (action-counter *context
*)))) ))
136 (intern aname
(destpkg *context
*))))
138 (defun char-list-to-string (char-list)
139 (coerce char-list
'string
))
141 (defmacro build-parser-function
(name parser
)
142 `(let* ((*context
* (clone-ctx *context
* ,name
))
143 (result (funcall ,parser offset
)))
146 (if (ctx-failed-p result
)
148 (succeed *context
* (value result
) (start-index result
) (end-index result
)))))
152 (defun match-string (string)
154 (let ((input (input *context
*))
155 (len (length string
)))
156 (if (and (>= (length input
) (+ offset len
))
157 (string= string input
:start2 offset
:end2
(+ offset len
)))
158 (succeed (clone-ctx *context
* 'opossum-string
) string offset
(+ offset
(length string
)))
161 (defun match-char (char-list)
163 (let ((input (input *context
*)))
165 (format *trace-output
* "match-char: looking for one of `~{~A~}'~%" char-list
))
166 (if (and (> (length input
) offset
)
167 (member (char input offset
)
168 char-list
:test
#'char
=))
169 (succeed (clone-ctx *context
* 'opossum-char
) (char input offset
) offset
(+ offset
1))
172 (defun match-octal-char-code (i1 i2 i3
)
173 "Compare the character given by i3 + 8*i2 + 64*i1 to the next input character."
174 (let ((c (+ i3
(* 8 i2
) (* 64 i1
))))
176 (let ((input (input *context
*)))
178 (format *trace-output
* "match-octal-char-code: looking for ~D~%" c
))
179 (if (and (> (length input
) offset
)
180 (= (char-int (char input offset
)) c
))
181 (succeed (clone-ctx *context
* 'opossum-char
) (char input offset
) offset
(+ offset
1))
184 (defun match-char-range (lower-char upper-char
)
185 "Match characters in the range between LOWER-CHAR and UPPER-CHAR (inclusive) as decided by CL:CHAR-CODE."
187 (let ((input (input *context
*)))
189 (format *trace-output
* "match-char-range: looking for ~A-~A~%" lower-char upper-char
))
190 (if (and (> (length input
) offset
)
191 (let ((x (char-code (char input offset
))))
192 (and (>= x
(char-code lower-char
))
193 (<= x
(char-code upper-char
)))))
194 (succeed (clone-ctx *context
* 'opossum-char-range
) (char input offset
) offset
(+ offset
1))
197 (defun match-any-char (&optional ignored
)
198 (declare (ignore ignored
))
200 "Match any character at OFFSET, fail only on EOF."
201 (let ((input (input *context
*)))
203 (format *trace-output
* "match-any-char~%"))
204 (if (< (1+ offset
) (length input
))
205 (succeed (clone-ctx *context
* 'opossum-anychar
) (char input offset
) offset
(+ offset
1))
208 (defun match-char-class (char-class)
209 "Regexp matching of next input character against [CHAR-CLASS] using cl-ppcre:scan."
210 (declare (type string char-class
))
211 ;; FIXME: could use a pre-computed scanner
212 (let ((cc (format nil
"[~A]" char-class
)))
214 "Match next character at OFFSET against the characters in CHAR-CLASS."
215 (let ((input (input *context
*)))
217 (format *trace-output
* "match-char-class on ~A~%"))
218 (if (and (< (1+ offset
) (length input
))
219 (let ((c (char input offset
)))
220 (cl-ppcre:scan cc
(make-string 1 :initial-element c
))))
221 (succeed (clone-ctx *context
* 'opossum-charclass
)
222 (char input offset
) offset
(+ offset
1))
225 (defun fix-escape-sequences (char-list)
226 "Iterate over the list CHAR-LIST, glueing adjacent #\\ #\n and #\\ #\t chars into
227 #\Newline and #\Tab."
229 ((null char-list
) char-list
)
230 ((null (cdr char-list
)) char-list
)
232 (loop :with drop
:= nil
233 :for
(c1 c2
) :on char-list
237 :collect
(if (char= c1
#\\)
239 ((#\n) (setf drop T
) #\Newline
)
240 ((#\t) (setf drop T
) #\Tab
)
241 ((#\r) (setf drop T
) #\Linefeed
)
247 ;; parsing combinator functions cribbed from libmetapeg
249 (defun either (&rest parsers
)
250 "Produce a function that tries each of the functions in PARSERS sequentially until one succeeds and
251 returns the result of that function, or a failure context if none succeeded."
253 (let ((*context
* (clone-ctx *context
* 'opossum-either
)))
255 (format *trace-output
* "either: ~A ~A~%" *context
* parsers
))
256 (loop :for p
:in parsers
257 :as result
= (funcall p offset
)
258 :when
(not (ctx-failed-p result
))
259 :return
(succeed *context
* (value result
) offset
(end-index result
))
260 :finally
(return (fail))))))
263 (defun optional (parser)
265 (let ((*context
* (clone-ctx *context
* 'opossum-optional
)))
267 (format *trace-output
* "optional: ~A ~A~%" *context
* parser
))
268 (let ((result (funcall parser offset
)))
269 (if (ctx-failed-p result
)
270 (succeed *context
* 'optional offset offset
)
271 (succeed *context
* (value result
) offset
(end-index result
)))))))
273 (defun follow (parser)
275 (let ((*context
* (clone-ctx *context
* 'opossum-follow
)))
277 (format *trace-output
* "follow: ~A ~A~%" *context
* parser
))
278 (let ((result (funcall parser offset
)))
279 (if (ctx-failed-p result
)
281 (succeed *context
* (value result
)
282 ;; don't consume input
287 (let ((*context
* (clone-ctx *context
* 'opossum-many
))
288 (start-offset offset
)
291 (format *trace-output
* "many: ~A ~A~%" *context
* parser
))
292 (loop :as result
:= (funcall parser offset
)
293 :while
(not (ctx-failed-p result
))
294 :do
(progn (push (value result
) children
)
295 (setf offset
(end-index result
)))
296 :finally
(return (succeed *context
* (nreverse children
) start-offset offset
))))))
299 (defun many1 (parser)
301 (let* ((*context
* (clone-ctx *context
* 'opossum-many1
))
302 (result (funcall parser offset
)))
304 (format *trace-output
* "many1: ~A ~A~%" *context
* parser
))
305 (if (not (ctx-failed-p result
))
306 (let ((result2 (funcall (many parser
) (end-index result
))))
307 (if (end-index result2
)
308 (succeed *context
* (cons (value result
) (value result2
)) offset
(end-index result2
))
309 (succeed *context
* (value result
) offset
(end-index result
))))
313 (defun seq (&rest parsers
)
315 (assert (> (length parsers
) 0))
316 (let ((*context
* (clone-ctx *context
* 'opossum-seq
))
317 (start-offset offset
)
321 (format *trace-output
* "seq: ~A ~A~%" *context
* parsers
))
323 (loop :for p
:in parsers
324 :do
(when *trace
* (format *trace-output
* " (seq ~A) trying ~A~%" *context
* p
))
327 (push (succeed (clone-ctx *context
* 'action
) nil offset offset
) child-nodes
)
328 (push p child-values
)
329 (setf (children *context
*) (reverse child-nodes
)))
331 (let ((result (funcall p offset
)))
332 (if (end-index result
)
334 (push result child-nodes
)
335 (push (value result
) child-values
)
336 (setf offset
(end-index result
))
337 (setf (children *context
*) (reverse child-nodes
)))
339 :finally
(return (succeed *context
* (reverse child-values
) start-offset offset
))))))
341 (defun negate (parser)
343 "Return a successful context at OFFSET if PARSER succeeds, without advancing input position."
344 (let ((*context
* (clone-ctx *context
* 'opossum-negate
)))
345 (let ((result (funcall parser offset
)))
346 (if (ctx-failed-p result
)
347 (succeed *context
* 'negate offset offset
)
352 (defun read-stream (stream)
353 "Read STREAM and return a string of all its contents."
355 (loop :as line
:= (read-line stream nil nil
)
357 :do
(setq s
(concatenate 'string s line
)))
360 (defun read-file (file)
361 "Read FILE and return a string of all its contents."
362 (with-open-file (f file
:direction
:input
)
363 (let ((len (file-length f
)))
365 (let ((s (make-string len
)))
370 (defun make-default-dst-package (grammarfile)
371 (make-package (gensym "opossum-parser")
372 :documentation
(format T
"Opossum parser for grammar ~A" grammarfile
)))
374 (defun generate-parser-package (grammarfile dst-package head
)
375 "Create functions to parse using GRAMMARFILE in DST-PACKAGE, starting ar rule named HEAD.
376 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
377 (let ((*package
* dst-package
))
378 (multiple-value-bind (form actions
)
379 (opossum:parse-file grammarfile
)
380 (format *debug-io
* "Injecting parser functions into ~A~%" dst-package
)
381 (loop :for aform
:in form
383 (format *debug-io
* "Skipping aform ~A~%" aform
)))
384 (loop :for
(sym code
) :in actions
385 :do
(intern (compile sym
`(lambda (data) (declare (ignorable data
)) ,code
)))
386 :do
(format *debug-io
* "Compiled ~A~%" sym
))
387 (intern (compile 'parse-string
389 `,(format nil
"Parse S using grammar ~A starting at ~A" grammarfile head
)
390 (let ((*context
* (make-instance 'opossum
:context
:dstpkg
*package
* :input s
)))
391 (funcall (make-name head
) 0)))))
392 (intern (compile 'parse-file
393 #'(lambda (f) (parse-string (read-file f
)))))
394 (intern (compile 'parse-stream
395 #'(lambda (s) (parse-string (read-stream s
)))))
397 (setf (documentation '*trace
* 'cl
:variable
)
398 "When non-nil, the generated parser function log to cl:*trace-output*.")
399 (export '(parse-string parse-file parse-stream
*trace
*)))))
401 (defun get-iso-time ()
402 "Return a string in ISO format for the current time"
403 (multiple-value-bind (second minute hour date month year day daylight-p zone
)
405 (declare (ignore day
))
406 (format nil
"~4D-~2,'0D-~2,'0D-~2,'0D:~2,'0D:~2,'0D (UCT~@D)"
409 (if daylight-p
(1+ (- zone
)) (- zone
)))))
411 (defun cleanup-action-code (code)
412 "Remove trailing newlines so that the string CODE can be printed nicely."
414 (when (char= #\Newline
(char code
(1- (length code
))))
415 (1+ (position #\Newline code
:from-end T
:test
#'char
/=)))))
417 (defun generate-parser-file (grammarfile dst-package dst-file
&optional start-rule
)
418 "Create lisp code in DST-FILE that can be loaded to yield functions to parse using GRAMMARFILE in DST-PACKAGE.
419 DST-PACKAGE will contain the 3 functions PARSE-STRING, PARSE-FILE and PARSE-STREAM as exported entrypoints."
420 (let ((*package
* dst-package
))
421 (let ((result (opossum:parse-file grammarfile
)))
422 ;; FIXME: check for complete parse
423 (let ((*context
* result
)) ;; routines in pegutils.lisp expect *context* to be bound properly
424 (let ((forms (transform (value result
)))
425 (actions (actions result
)))
426 (with-open-file (s dst-file
:direction
:output
:if-exists
:supersede
)
427 (let ((*print-readably
* T
)
429 (*print-circle
* NIL
))
430 (format s
";; This is a Common Lisp peg parser automatically generated by OPOSSUM -*- mode:lisp -*-~%")
431 (format s
";; generated from ~A on ~A~%" grammarfile
(get-iso-time))
432 (format s
"(eval-when (:load-toplevel :compile-toplevel :execute) (declaim (optimize (speed 0) (safety 3) (debug 3))))~%~%")
433 (format s
"(defpackage #:~A~%" (package-name dst-package
))
434 (format s
" (:use #:cl #:opossum)~%")
435 (format s
" (:export #:parse-string #:parse-file #:parse-stream))~%")
436 (format s
"~%(in-package #:~A)~%" (package-name dst-package
))
437 ;; First form is taken to be the start rule
438 (let ((entryrule (or (and start-rule
(make-name start-rule
))
439 (and forms
(cadr (first forms
))))))
441 (format *error-output
* "Cannot find entry rule for parser")
443 (format *trace-output
* "Inserting definitions for parser entry points through ~A~%"
446 (prin1 `(defun parse-string (,(intern :s dst-package
))
447 ,(format nil
"Parse S using grammar ~A starting at ~A" grammarfile entryrule
)
448 (let ((*context
* (make-instance 'opossum
:context
449 :dstpkg
,(package-name dst-package
)
450 :input
,(intern :s dst-package
))))
451 (funcall (,entryrule
) 0)))
454 (loop :for aform
:in forms
455 :do
(format *trace-output
* "Inserting form ~A~%" aform
)
461 `(defparameter ,(intern "*trace*" dst-package
) nil
462 "When non-nil, the generated parser function log to cl:*trace-output*."))
465 (loop :for
(sym code
) :in actions
466 :when sym
;; the final action is named NIL because we push a
467 ;; NIL ahead of us in store-actions
468 :do
(format *trace-output
* "Inserting defun for ~A~%" sym
)
469 :and
:do
(format s
"~%(defun ~S (data)~% ~A)~%"
470 sym
(cleanup-action-code code
))))))))))
472 (defun transform (tree &optional
(depth 0))
475 (if (eq (first tree
) ':action
)
478 (format *trace-output
* "~AFound action ~A~%" (make-string depth
:initial-element
#\Space
) tree
))
480 (let ((data (mapcar #'(lambda (tr) (transform tr
(1+ depth
)))
482 (loop :for el
:in data
483 :when
(and (listp el
)
484 (eq (first el
) ':action
)
485 (symbolp (third el
)))
486 :do
(let ((*package
* (destpkg *context
*))
489 (format *trace-output
* "~&Applying action ~A to ~A~%" action data
))
491 (return-from transform
492 (funcall (symbol-function action
) data
))
493 (undefined-function (x)
495 (format *error-output
* "missing definition for ~A: ~A~%" action x
)
496 ; (break "~A in ~A" action *package*)