1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CXML; readtable: runes; Encoding: utf-8; -*-
2 ;;; ---------------------------------------------------------------------------
4 ;;; Title: (including support for canonic XML according to J.Clark)
5 ;;; Created: 1999-09-09
6 ;;; Author: Gilbert Baumann <unk6@rz.uni-karlsruhe.de>
7 ;;; Author: David Lichteblau <david@lichteblau.com>
8 ;;; License: Lisp-LGPL (See file COPYING for details).
9 ;;; ---------------------------------------------------------------------------
10 ;;; (c) copyright 1999 by Gilbert Baumann
11 ;;; (c) copyright 2004 by knowledgeTools Int. GmbH
12 ;;; (c) copyright 2004 by David Lichteblau (for headcraft.de)
14 ;;; This library is free software; you can redistribute it and/or
15 ;;; modify it under the terms of the GNU Library General Public
16 ;;; License as published by the Free Software Foundation; either
17 ;;; version 2 of the License, or (at your option) any later version.
19 ;;; This library is distributed in the hope that it will be useful,
20 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 ;;; Library General Public License for more details.
24 ;;; You should have received a copy of the GNU Library General Public
25 ;;; License along with this library; if not, write to the
26 ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 ;;; Boston, MA 02111-1307 USA.
35 ;; | This document defines a subset of XML called canonical XML. The
36 ;; | intended use of canonical XML is in testing XML processors, as a
37 ;; | representation of the result of parsing an XML document.
39 ;; | Every well-formed XML document has a unique structurally equivalent
40 ;; | canonical XML document. Two structurally equivalent XML documents have
41 ;; | a byte-for-byte identical canonical XML document. Canonicalizing an
42 ;; | XML document requires only information that an XML processor is
43 ;; | required to make available to an application.
45 ;; | A canonical XML document conforms to the following grammar:
47 ;; | CanonXML ::= Pi* element Pi*
48 ;; | element ::= Stag (Datachar | Pi | element)* Etag
49 ;; | Stag ::= '<' Name Atts '>'
50 ;; | Etag ::= '</' Name '>'
51 ;; | Pi ::= '<?' Name ' ' (((Char - S) Char*)? - (Char* '?>' Char*)) '?>'
52 ;; | Atts ::= (' ' Name '=' '"' Datachar* '"')*
53 ;; | Datachar ::= '&' | '<' | '>' | '"'
54 ;; | | '	'| ' '| ' '
55 ;; | | (Char - ('&' | '<' | '>' | '"' | #x9 | #xA | #xD))
56 ;; | Name ::= (see XML spec)
57 ;; | Char ::= (see XML spec)
58 ;; | S ::= (see XML spec)
60 ;; | Attributes are in lexicographical order (in Unicode bit order).
62 ;; | A canonical XML document is encoded in UTF-8.
64 ;; | Ignorable white space is considered significant and is treated
65 ;; | equivalently to data.
67 ;; -- James Clark (jjc@jclark.com)
70 ;;;; SINK: an xml output sink
73 ((ystream :initarg
:ystream
:accessor sink-ystream
)
74 (width :initform
79 :initarg
:width
:accessor width
)
75 (canonical :initform t
:initarg
:canonical
:accessor canonical
)
76 (indentation :initform nil
:initarg
:indentation
:accessor indentation
)
77 (current-indentation :initform
0 :accessor current-indentation
)
78 (notations :initform
(make-buffer :element-type t
) :accessor notations
)
79 (name-for-dtd :accessor name-for-dtd
)
80 (previous-notation :initform nil
:accessor previous-notation
)
81 (have-doctype :initform nil
:accessor have-doctype
)
82 (stack :initform nil
:accessor stack
)))
84 (defmethod initialize-instance :after
((instance sink
) &key
)
85 (when (eq (canonical instance
) t
)
86 (setf (canonical instance
) 1))
87 (unless (member (canonical instance
) '(nil 1 2))
88 (error "Invalid canonical form: ~A" (canonical instance
)))
89 (when (and (canonical instance
) (indentation instance
))
90 (error "Cannot indent XML in canonical mode")))
92 (defun make-buffer (&key
(element-type '(unsigned-byte 8)))
94 :element-type element-type
98 ;; total haesslich, aber die ystreams will ich im moment eigentlich nicht
100 (macrolet ((define-maker (make-sink make-ystream
&rest args
)
101 `(defun ,make-sink
(,@args
&rest initargs
)
102 (apply #'make-instance
104 :ystream
(,make-ystream
,@args
)
106 (define-maker make-octet-vector-sink make-octet-vector-ystream
)
107 (define-maker make-octet-stream-sink make-octet-stream-ystream stream
)
108 (define-maker make-rod-sink make-rod-ystream
)
111 (define-maker make-character-stream-sink make-character-stream-ystream stream
)
114 (define-maker make-string-sink
/utf8 make-string-ystream
/utf8
)
117 (define-maker make-character-stream-sink
/utf8
118 make-character-stream-ystream
/utf8
122 (defun make-string-sink (&rest args
) (apply #'make-rod-sink args
))
125 (defmethod sax:end-document
((sink sink
))
126 (close-ystream (sink-ystream sink
)))
129 ;;;; doctype and notations
131 (defmethod sax:start-document
((sink sink
))
132 (unless (canonical sink
)
133 (%write-rod
#"<?xml version=\"1.0\" encoding=\"UTF-8\"?>" sink
)
134 (%write-rune
#/U
+000A sink
)))
136 (defmethod sax:start-dtd
((sink sink
) name public-id system-id
)
137 (setf (name-for-dtd sink
) name
)
138 (unless (canonical sink
)
139 (ensure-doctype sink public-id system-id
)))
141 (defun ensure-doctype (sink &optional public-id system-id
)
142 (unless (have-doctype sink
)
143 (setf (have-doctype sink
) t
)
144 (%write-rod
#"<!DOCTYPE " sink
)
145 (%write-rod
(name-for-dtd sink
) sink
)
148 (%write-rod
#" PUBLIC \"" sink
)
149 (unparse-string public-id sink
)
150 (%write-rod
#"\" \"" sink
)
151 (unparse-string system-id sink
)
152 (%write-rod
#"\"" sink
))
154 (%write-rod
#" SYSTEM \"" sink
)
155 (unparse-string public-id sink
)
156 (%write-rod
#"\"" sink
)))))
158 (defmethod sax:start-internal-subset
((sink sink
))
159 (ensure-doctype sink
)
160 (%write-rod
#" [" sink
)
161 (%write-rune
#/U
+000A sink
))
163 (defmethod sax:end-internal-subset
((sink sink
))
164 (ensure-doctype sink
)
165 (%write-rod
#"]" sink
))
167 (defmethod sax:notation-declaration
((sink sink
) name public-id system-id
)
168 (let ((prev (previous-notation sink
)))
169 (when (and (and (canonical sink
) (>= (canonical sink
) 2))
171 (not (rod< prev name
)))
172 (error "misordered notations; cannot unparse canonically"))
173 (setf (previous-notation sink
) name
))
174 (%write-rod
#"<!NOTATION " sink
)
175 (%write-rod name sink
)
177 ((zerop (length public-id
))
178 (%write-rod
#" SYSTEM '" sink
)
179 (%write-rod system-id sink
)
180 (%write-rune
#/' sink
))
181 ((zerop (length system-id
))
182 (%write-rod
#" PUBLIC '" sink
)
183 (%write-rod public-id sink
)
184 (%write-rune
#/' sink
))
186 (%write-rod
#" PUBLIC '" sink
)
187 (%write-rod public-id sink
)
188 (%write-rod
#"' '" sink
)
189 (%write-rod system-id sink
)
190 (%write-rune
#/' sink
)))
191 (%write-rune
#/> sink
)
192 (%write-rune
#/U
+000A sink
))
194 (defmethod sax:unparsed-entity-declaration
195 ((sink sink
) name public-id system-id notation-name
)
196 (unless (and (canonical sink
) (< (canonical sink
) 3))
197 (%write-rod
#"<!ENTITY " sink
)
198 (%write-rod name sink
)
200 ((zerop (length public-id
))
201 (%write-rod
#" SYSTEM '" sink
)
202 (%write-rod system-id sink
)
203 (%write-rune
#/' sink
))
204 ((zerop (length system-id
))
205 (%write-rod
#" PUBLIC '" sink
)
206 (%write-rod public-id sink
)
207 (%write-rune
#/' sink
))
209 (%write-rod
#" PUBLIC '" sink
)
210 (%write-rod public-id sink
)
211 (%write-rod
#"' '" sink
)
212 (%write-rod system-id sink
)
213 (%write-rune
#/' sink
)))
214 (%write-rod
#" NDATA " sink
)
215 (%write-rod notation-name sink
)
216 (%write-rune
#/> sink
)
217 (%write-rune
#/U
+000A sink
)))
219 (defmethod sax:external-entity-declaration
220 ((sink sink
) kind name public-id system-id
)
221 (when (canonical sink
)
222 (error "cannot serialize parsed entities in canonical mode"))
223 (%write-rod
#"<!ENTITY " sink
)
224 (when (eq kind
:parameter
)
225 (%write-rod
#" % " sink
))
226 (%write-rod name sink
)
228 ((zerop (length public-id
))
229 (%write-rod
#" SYSTEM '" sink
)
230 (%write-rod system-id sink
)
231 (%write-rune
#/' sink
))
232 ((zerop (length system-id
))
233 (%write-rod
#" PUBLIC '" sink
)
234 (%write-rod public-id sink
)
235 (%write-rune
#/' sink
))
237 (%write-rod
#" PUBLIC '" sink
)
238 (%write-rod public-id sink
)
239 (%write-rod
#"' '" sink
)
240 (%write-rod system-id sink
)
241 (%write-rune
#/' sink
)))
242 (%write-rune
#/> sink
)
243 (%write-rune
#/U
+000A sink
))
245 (defmethod sax:internal-entity-declaration
((sink sink
) kind name value
)
246 (when (canonical sink
)
247 (error "cannot serialize parsed entities in canonical mode"))
248 (%write-rod
#"<!ENTITY " sink
)
249 (when (eq kind
:parameter
)
250 (%write-rod
#" % " sink
))
251 (%write-rod name sink
)
252 (%write-rune
#/U
+0020 sink
)
253 (%write-rune
#/\" sink
)
254 (unparse-string value sink
)
255 (%write-rune
#/\" sink
)
256 (%write-rune
#/> sink
)
257 (%write-rune
#/U
+000A sink
))
259 (defmethod sax:element-declaration
((sink sink
) name model
)
260 (when (canonical sink
)
261 (error "cannot serialize element type declarations in canonical mode"))
262 (%write-rod
#"<!ELEMENT " sink
)
263 (%write-rod name sink
)
264 (%write-rune
#/U
+0020 sink
)
268 (%write-rod
"EMPTY" sink
))
270 (%write-rod
"#PCDATA" sink
))
272 (unparse-string m sink
))
276 (%write-rune
#/\
( sink
)
277 (loop for
(n . rest
) on
(cdr m
) do
280 (%write-rune
#\
, sink
)))
281 (%write-rune
#/\
) sink
))
283 (%write-rune
#/\
( sink
)
284 (loop for
(n . rest
) on
(cdr m
) do
287 (%write-rune
#\| sink
)))
288 (%write-rune
#/\
) sink
))
291 (%write-rod
#/* sink
))
294 (%write-rod
#/+ sink
))
297 (%write-rod
#/? sink
)))))))
299 (%write-rune
#/> sink
)
300 (%write-rune
#/U
+000A sink
))
302 (defmethod sax:attribute-declaration
((sink sink
) ename aname type default
)
303 (when (canonical sink
)
304 (error "cannot serialize attribute type declarations in canonical mode"))
305 (%write-rod
#"<!ATTLIST " sink
)
306 (%write-rod ename sink
)
307 (%write-rune
#/U
+0020 sink
)
308 (%write-rod aname sink
)
309 (%write-rune
#/U
+0020 sink
)
312 (%write-rod
(rod (string-upcase (symbol-name type
))) sink
))
314 (when (eq :NOTATION
(car type
))
315 (%write-rod
#"NOTATION " sink
))
316 (%write-rune
#/\
( sink
)
317 (loop for
(n . rest
) on
(cdr type
) do
320 (%write-rune
#\| sink
)))
321 (%write-rune
#/\
) sink
)))
324 (%write-rune
#/# sink
)
325 (%write-rod
(rod (string-upcase (symbol-name default
))) sink
))
327 (when (eq :FIXED
(car default
))
328 (%write-rod
#"#FIXED " sink
))
329 (%write-rune
#/\" sink
)
330 (unparse-string (second default
) sink
)
331 (%write-rune
#/\" sink
)))
332 (%write-rune
#/> sink
)
333 (%write-rune
#/U
+000A sink
))
335 (defmethod sax:end-dtd
((sink sink
))
336 (when (have-doctype sink
)
337 (%write-rod
#">" sink
)
338 (%write-rune
#/U
+000A sink
)))
343 (defstruct (tag (:constructor make-tag
(name)))
348 (defun sink-fresh-line (sink)
349 (unless (zerop (ystream-column (sink-ystream sink
)))
350 (%write-rune
#/U
+000A sink
) ;newline
353 (defun maybe-close-tag (sink)
354 (let ((tag (car (stack sink
))))
355 (when (and (tag-p tag
) (not (tag-have-gt tag
)))
356 (setf (tag-have-gt tag
) t
)
357 (%write-rune
#/> sink
))))
359 (defmethod sax:start-element
360 ((sink sink
) namespace-uri local-name qname attributes
)
361 (declare (ignore namespace-uri local-name
))
362 (maybe-close-tag sink
)
364 (incf (tag-n-children (first (stack sink
)))))
365 (push (make-tag qname
) (stack sink
))
366 (when (indentation sink
)
367 (sink-fresh-line sink
)
368 (start-indentation-block sink
))
369 (%write-rune
#/< sink
)
370 (%write-rod qname sink
)
371 (let ((atts (sort (copy-list attributes
) #'rod
< :key
#'sax
:attribute-qname
)))
373 (%write-rune
#/space sink
)
374 (%write-rod
(sax:attribute-qname a
) sink
)
375 (%write-rune
#/= sink
)
376 (%write-rune
#/\" sink
)
377 (unparse-string (sax:attribute-value a
) sink
)
378 (%write-rune
#/\" sink
)))
379 (when (canonical sink
)
380 (maybe-close-tag sink
)))
382 (defmethod sax:end-element
383 ((sink sink
) namespace-uri local-name qname
)
384 (declare (ignore namespace-uri local-name
))
385 (let ((tag (pop (stack sink
))))
387 (error "output does not nest: not in an element"))
388 (unless (rod= (tag-name tag
) qname
)
389 (error "output does not nest: expected ~A but got ~A"
390 (rod qname
) (rod (tag-name tag
))))
391 (when (indentation sink
)
392 (end-indentation-block sink
)
393 (unless (zerop (tag-n-children tag
))
394 (sink-fresh-line sink
)))
397 (%write-rod
'#.
(string-rod "</") sink
)
398 (%write-rod qname sink
)
399 (%write-rod
'#.
(string-rod ">") sink
))
401 (%write-rod
#"/>" sink
)))))
403 (defmethod sax:processing-instruction
((sink sink
) target data
)
404 (maybe-close-tag sink
)
405 (unless (rod-equal target
'#.
(string-rod "xml"))
406 (%write-rod
'#.
(string-rod "<?") sink
)
407 (%write-rod target sink
)
409 (%write-rune
#/space sink
)
410 (%write-rod data sink
))
411 (%write-rod
'#.
(string-rod "?>") sink
)))
413 (defmethod sax:start-cdata
((sink sink
))
414 (maybe-close-tag sink
)
415 (push :cdata
(stack sink
)))
417 (defmethod sax:characters
((sink sink
) data
)
418 (maybe-close-tag sink
)
420 ((and (eq (car (stack sink
)) :cdata
)
421 (not (canonical sink
))
422 (not (search #"]]" data
)))
423 (when (indentation sink
)
424 (sink-fresh-line sink
))
425 (%write-rod
#"<![CDATA[" sink
)
426 ;; XXX signal error if body is unprintable?
427 (map nil
(lambda (c) (%write-rune c sink
)) data
)
428 (%write-rod
#"]]>" sink
))
430 (if (indentation sink
)
431 (unparse-indented-text data sink
)
432 (let ((y (sink-ystream sink
)))
434 (loop for c across data do
(unparse-datachar c y
))
435 (loop for c across data do
(unparse-datachar-readable c y
))))))))
437 (defmethod sax:end-cdata
((sink sink
))
438 (unless (eq (pop (stack sink
)) :cdata
)
439 (error "output does not nest: not in a cdata section")))
442 (dotimes (x (current-indentation sink
))
443 (%write-rune
#/U
+0020 sink
))) ; space
445 (defun start-indentation-block (sink)
446 (incf (current-indentation sink
) (indentation sink
)))
448 (defun end-indentation-block (sink)
449 (decf (current-indentation sink
) (indentation sink
)))
451 (defun unparse-indented-text (data sink
)
452 (flet ((whitespacep (x)
453 (or (rune= x
#/U
+000A
) (rune= x
#/U
+0020))))
454 (let* ((n (length data
))
455 (pos (position-if-not #'whitespacep data
))
456 (need-whitespace-p nil
))
460 (sink-fresh-line sink
)
462 (let* ((w (or (position-if #'whitespacep data
:start
(1+ pos
)) n
))
463 (next (or (position-if-not #'whitespacep data
:start w
) n
)))
464 (when need-whitespace-p
465 (if (< (+ (ystream-column (sink-ystream sink
)) w
(- pos
))
467 (%write-rune
#/U
+0020 sink
)
468 (sink-fresh-line sink
)))
470 with y
= (sink-ystream sink
)
471 for i from pos below w do
472 (unparse-datachar-readable (elt data i
) y
))
473 (setf need-whitespace-p
(< w n
))
476 (%write-rune
#/U
+0020 sink
))))))
478 (defun unparse-string (str sink
)
479 (let ((y (sink-ystream sink
)))
480 (loop for rune across str do
(unparse-datachar rune y
))))
482 (defun unparse-datachar (c ystream
)
483 (cond ((rune= c
#/&) (write-rod '#.
(string-rod "&") ystream
))
484 ((rune= c
#/<) (write-rod '#.
(string-rod "<") ystream
))
485 ((rune= c
#/>) (write-rod '#.
(string-rod ">") ystream
))
486 ((rune= c
#/\") (write-rod '#.
(string-rod """) ystream
))
487 ((rune= c
#/U
+0009) (write-rod '#.
(string-rod "	") ystream
))
488 ((rune= c
#/U
+000A
) (write-rod '#.
(string-rod " ") ystream
))
489 ((rune= c
#/U
+000D
) (write-rod '#.
(string-rod " ") ystream
))
491 (write-rune c ystream
))))
493 (defun unparse-datachar-readable (c ystream
)
494 (cond ((rune= c
#/&) (write-rod '#.
(string-rod "&") ystream
))
495 ((rune= c
#/<) (write-rod '#.
(string-rod "<") ystream
))
496 ((rune= c
#/>) (write-rod '#.
(string-rod ">") ystream
))
497 ((rune= c
#/\") (write-rod '#.
(string-rod """) ystream
))
499 (write-rune c ystream
))))
501 (defun %write-rune
(c sink
)
502 (write-rune c
(sink-ystream sink
)))
504 (defun %write-rod
(r sink
)
505 (write-rod r
(sink-ystream sink
)))
508 ;;;; convenience functions for DOMless XML serialization
510 (defvar *current-element
*)
513 (defmacro with-xml-output
(sink &body body
)
514 `(invoke-with-xml-output (lambda () ,@body
) ,sink
))
516 (defun invoke-with-xml-output (fn sink
)
518 (*current-element
* nil
))
519 (sax:start-document
*sink
*)
521 (sax:end-document
*sink
*)))
523 (defmacro with-element
(qname &body body
)
524 `(invoke-with-element (lambda () ,@body
) ,qname
))
526 (defun maybe-emit-start-tag ()
527 (when *current-element
*
528 ;; starting child node, need to emit opening tag of parent first:
529 (destructuring-bind (qname &rest attributes
) *current-element
*
530 (sax:start-element
*sink
* nil nil qname
(reverse attributes
)))
531 (setf *current-element
* nil
)))
533 (defun invoke-with-element (fn qname
)
534 (setf qname
(rod qname
))
535 (maybe-emit-start-tag)
536 (let ((*current-element
* (list qname
)))
537 (multiple-value-prog1
539 (maybe-emit-start-tag)
540 (sax:end-element
*sink
* nil nil qname
))))
542 (defun attribute (name value
)
543 (push (sax:make-attribute
:qname
(rod name
) :value
(rod value
))
544 (cdr *current-element
*))
548 (maybe-emit-start-tag)
549 (sax:start-cdata
*sink
*)
550 (sax:characters
*sink
* (rod data
))
551 (sax:end-cdata
*sink
*)
555 (maybe-emit-start-tag)
556 (sax:characters
*sink
* (rod data
))
559 (defun rod-to-utf8-string (rod)
560 (let ((out (make-buffer :element-type
'character
)))
561 (runes-to-utf8/adjustable-string out rod
(length rod
))
564 (defun utf8-string-to-rod (str)
565 (let* ((bytes (map '(vector (unsigned-byte 8)) #'char-code str
))
566 (buffer (make-array (length bytes
) :element-type
'(unsigned-byte 16)))
567 (n (decode-sequence :utf-8 bytes
0 (length bytes
) buffer
0 0 nil
))
568 (result (make-array n
:element-type
'rune
)))
569 (map-into result
#'code-rune buffer
)