Sonderzeichen raus
[cxml/s11.git] / xml / unparse.lisp
bloba09a78e217e924243a499e92880f837a9b6fd3ab
1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CXML; readtable: runes; Encoding: utf-8; -*-
2 ;;; ---------------------------------------------------------------------------
3 ;;; Title: Unparse XML
4 ;;; Title: (including support for canonic XML according to J.Clark)
5 ;;; Created: 1999-09-09
6 ;;; Author: Gilbert Baumann <unk6@rz.uni-karlsruhe.de>
7 ;;; Author: David Lichteblau <david@lichteblau.com>
8 ;;; License: Lisp-LGPL (See file COPYING for details).
9 ;;; ---------------------------------------------------------------------------
10 ;;; (c) copyright 1999 by Gilbert Baumann
11 ;;; (c) copyright 2004 by knowledgeTools Int. GmbH
12 ;;; (c) copyright 2004 by David Lichteblau (for headcraft.de)
14 ;;; This library is free software; you can redistribute it and/or
15 ;;; modify it under the terms of the GNU Library General Public
16 ;;; License as published by the Free Software Foundation; either
17 ;;; version 2 of the License, or (at your option) any later version.
18 ;;;
19 ;;; This library is distributed in the hope that it will be useful,
20 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 ;;; Library General Public License for more details.
23 ;;;
24 ;;; You should have received a copy of the GNU Library General Public
25 ;;; License along with this library; if not, write to the
26 ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
27 ;;; Boston, MA 02111-1307 USA.
29 (in-package :cxml)
31 ;;
32 ;; | Canonical XML
33 ;; | =============
34 ;; |
35 ;; | This document defines a subset of XML called canonical XML. The
36 ;; | intended use of canonical XML is in testing XML processors, as a
37 ;; | representation of the result of parsing an XML document.
38 ;; |
39 ;; | Every well-formed XML document has a unique structurally equivalent
40 ;; | canonical XML document. Two structurally equivalent XML documents have
41 ;; | a byte-for-byte identical canonical XML document. Canonicalizing an
42 ;; | XML document requires only information that an XML processor is
43 ;; | required to make available to an application.
44 ;; |
45 ;; | A canonical XML document conforms to the following grammar:
46 ;; |
47 ;; | CanonXML ::= Pi* element Pi*
48 ;; | element ::= Stag (Datachar | Pi | element)* Etag
49 ;; | Stag ::= '<' Name Atts '>'
50 ;; | Etag ::= '</' Name '>'
51 ;; | Pi ::= '<?' Name ' ' (((Char - S) Char*)? - (Char* '?>' Char*)) '?>'
52 ;; | Atts ::= (' ' Name '=' '"' Datachar* '"')*
53 ;; | Datachar ::= '&amp;' | '&lt;' | '&gt;' | '&quot;'
54 ;; | | '&#9;'| '&#10;'| '&#13;'
55 ;; | | (Char - ('&' | '<' | '>' | '"' | #x9 | #xA | #xD))
56 ;; | Name ::= (see XML spec)
57 ;; | Char ::= (see XML spec)
58 ;; | S ::= (see XML spec)
59 ;; |
60 ;; | Attributes are in lexicographical order (in Unicode bit order).
61 ;; |
62 ;; | A canonical XML document is encoded in UTF-8.
63 ;; |
64 ;; | Ignorable white space is considered significant and is treated
65 ;; | equivalently to data.
67 ;; -- James Clark (jjc@jclark.com)
70 ;;;; SINK: an xml output sink
72 (defclass sink ()
73 ((ystream :initarg :ystream :accessor sink-ystream)
74 (width :initform 79 :initarg :width :accessor width)
75 (canonical :initform t :initarg :canonical :accessor canonical)
76 (indentation :initform nil :initarg :indentation :accessor indentation)
77 (current-indentation :initform 0 :accessor current-indentation)
78 (notations :initform (make-buffer :element-type t) :accessor notations)
79 (name-for-dtd :accessor name-for-dtd)
80 (previous-notation :initform nil :accessor previous-notation)
81 (have-doctype :initform nil :accessor have-doctype)
82 (stack :initform nil :accessor stack)))
84 (defmethod initialize-instance :after ((instance sink) &key)
85 (when (eq (canonical instance) t)
86 (setf (canonical instance) 1))
87 (unless (member (canonical instance) '(nil 1 2))
88 (error "Invalid canonical form: ~A" (canonical instance)))
89 (when (and (canonical instance) (indentation instance))
90 (error "Cannot indent XML in canonical mode")))
92 (defun make-buffer (&key (element-type '(unsigned-byte 8)))
93 (make-array 1
94 :element-type element-type
95 :adjustable t
96 :fill-pointer 0))
98 ;; total haesslich, aber die ystreams will ich im moment eigentlich nicht
99 ;; dokumentieren
100 (macrolet ((define-maker (make-sink make-ystream &rest args)
101 `(defun ,make-sink (,@args &rest initargs)
102 (apply #'make-instance
103 'sink
104 :ystream (,make-ystream ,@args)
105 initargs))))
106 (define-maker make-octet-vector-sink make-octet-vector-ystream)
107 (define-maker make-octet-stream-sink make-octet-stream-ystream stream)
108 (define-maker make-rod-sink make-rod-ystream)
110 #+rune-is-character
111 (define-maker make-character-stream-sink make-character-stream-ystream stream)
113 #-rune-is-character
114 (define-maker make-string-sink/utf8 make-string-ystream/utf8)
116 #-rune-is-character
117 (define-maker make-character-stream-sink/utf8
118 make-character-stream-ystream/utf8
119 stream))
121 #+rune-is-character
122 (defun make-string-sink (&rest args) (apply #'make-rod-sink args))
125 (defmethod sax:end-document ((sink sink))
126 (close-ystream (sink-ystream sink)))
129 ;;;; doctype and notations
131 (defmethod sax:start-document ((sink sink))
132 (unless (canonical sink)
133 (%write-rod #"<?xml version=\"1.0\" encoding=\"UTF-8\"?>" sink)
134 (%write-rune #/U+000A sink)))
136 (defmethod sax:start-dtd ((sink sink) name public-id system-id)
137 (setf (name-for-dtd sink) name)
138 (unless (canonical sink)
139 (ensure-doctype sink public-id system-id)))
141 (defun ensure-doctype (sink &optional public-id system-id)
142 (unless (have-doctype sink)
143 (setf (have-doctype sink) t)
144 (%write-rod #"<!DOCTYPE " sink)
145 (%write-rod (name-for-dtd sink) sink)
146 (cond
147 (public-id
148 (%write-rod #" PUBLIC \"" sink)
149 (unparse-string public-id sink)
150 (%write-rod #"\" \"" sink)
151 (unparse-string system-id sink)
152 (%write-rod #"\"" sink))
153 (system-id
154 (%write-rod #" SYSTEM \"" sink)
155 (unparse-string public-id sink)
156 (%write-rod #"\"" sink)))))
158 (defmethod sax:start-internal-subset ((sink sink))
159 (ensure-doctype sink)
160 (%write-rod #" [" sink)
161 (%write-rune #/U+000A sink))
163 (defmethod sax:end-internal-subset ((sink sink))
164 (ensure-doctype sink)
165 (%write-rod #"]" sink))
167 (defmethod sax:notation-declaration ((sink sink) name public-id system-id)
168 (let ((prev (previous-notation sink)))
169 (when (and (and (canonical sink) (>= (canonical sink) 2))
170 prev
171 (not (rod< prev name)))
172 (error "misordered notations; cannot unparse canonically"))
173 (setf (previous-notation sink) name))
174 (%write-rod #"<!NOTATION " sink)
175 (%write-rod name sink)
176 (cond
177 ((zerop (length public-id))
178 (%write-rod #" SYSTEM '" sink)
179 (%write-rod system-id sink)
180 (%write-rune #/' sink))
181 ((zerop (length system-id))
182 (%write-rod #" PUBLIC '" sink)
183 (%write-rod public-id sink)
184 (%write-rune #/' sink))
186 (%write-rod #" PUBLIC '" sink)
187 (%write-rod public-id sink)
188 (%write-rod #"' '" sink)
189 (%write-rod system-id sink)
190 (%write-rune #/' sink)))
191 (%write-rune #/> sink)
192 (%write-rune #/U+000A sink))
194 (defmethod sax:unparsed-entity-declaration
195 ((sink sink) name public-id system-id notation-name)
196 (unless (and (canonical sink) (< (canonical sink) 3))
197 (%write-rod #"<!ENTITY " sink)
198 (%write-rod name sink)
199 (cond
200 ((zerop (length public-id))
201 (%write-rod #" SYSTEM '" sink)
202 (%write-rod system-id sink)
203 (%write-rune #/' sink))
204 ((zerop (length system-id))
205 (%write-rod #" PUBLIC '" sink)
206 (%write-rod public-id sink)
207 (%write-rune #/' sink))
209 (%write-rod #" PUBLIC '" sink)
210 (%write-rod public-id sink)
211 (%write-rod #"' '" sink)
212 (%write-rod system-id sink)
213 (%write-rune #/' sink)))
214 (%write-rod #" NDATA " sink)
215 (%write-rod notation-name sink)
216 (%write-rune #/> sink)
217 (%write-rune #/U+000A sink)))
219 (defmethod sax:external-entity-declaration
220 ((sink sink) kind name public-id system-id)
221 (when (canonical sink)
222 (error "cannot serialize parsed entities in canonical mode"))
223 (%write-rod #"<!ENTITY " sink)
224 (when (eq kind :parameter)
225 (%write-rod #" % " sink))
226 (%write-rod name sink)
227 (cond
228 ((zerop (length public-id))
229 (%write-rod #" SYSTEM '" sink)
230 (%write-rod system-id sink)
231 (%write-rune #/' sink))
232 ((zerop (length system-id))
233 (%write-rod #" PUBLIC '" sink)
234 (%write-rod public-id sink)
235 (%write-rune #/' sink))
237 (%write-rod #" PUBLIC '" sink)
238 (%write-rod public-id sink)
239 (%write-rod #"' '" sink)
240 (%write-rod system-id sink)
241 (%write-rune #/' sink)))
242 (%write-rune #/> sink)
243 (%write-rune #/U+000A sink))
245 (defmethod sax:internal-entity-declaration ((sink sink) kind name value)
246 (when (canonical sink)
247 (error "cannot serialize parsed entities in canonical mode"))
248 (%write-rod #"<!ENTITY " sink)
249 (when (eq kind :parameter)
250 (%write-rod #" % " sink))
251 (%write-rod name sink)
252 (%write-rune #/U+0020 sink)
253 (%write-rune #/\" sink)
254 (unparse-string value sink)
255 (%write-rune #/\" sink)
256 (%write-rune #/> sink)
257 (%write-rune #/U+000A sink))
259 (defmethod sax:element-declaration ((sink sink) name model)
260 (when (canonical sink)
261 (error "cannot serialize element type declarations in canonical mode"))
262 (%write-rod #"<!ELEMENT " sink)
263 (%write-rod name sink)
264 (%write-rune #/U+0020 sink)
265 (labels ((walk (m)
266 (cond
267 ((eq m :EMPTY)
268 (%write-rod "EMPTY" sink))
269 ((eq m :PCDATA)
270 (%write-rod "#PCDATA" sink))
271 ((atom m)
272 (unparse-string m sink))
274 (ecase (car m)
275 (and
276 (%write-rune #/\( sink)
277 (loop for (n . rest) on (cdr m) do
278 (walk n)
279 (when rest
280 (%write-rune #\, sink)))
281 (%write-rune #/\) sink))
283 (%write-rune #/\( sink)
284 (loop for (n . rest) on (cdr m) do
285 (walk n)
286 (when rest
287 (%write-rune #\| sink)))
288 (%write-rune #/\) sink))
290 (walk (second m))
291 (%write-rod #/* sink))
293 (walk (second m))
294 (%write-rod #/+ sink))
296 (walk (second m))
297 (%write-rod #/? sink)))))))
298 (walk model))
299 (%write-rune #/> sink)
300 (%write-rune #/U+000A sink))
302 (defmethod sax:attribute-declaration ((sink sink) ename aname type default)
303 (when (canonical sink)
304 (error "cannot serialize attribute type declarations in canonical mode"))
305 (%write-rod #"<!ATTLIST " sink)
306 (%write-rod ename sink)
307 (%write-rune #/U+0020 sink)
308 (%write-rod aname sink)
309 (%write-rune #/U+0020 sink)
310 (cond
311 ((atom type)
312 (%write-rod (rod (string-upcase (symbol-name type))) sink))
314 (when (eq :NOTATION (car type))
315 (%write-rod #"NOTATION " sink))
316 (%write-rune #/\( sink)
317 (loop for (n . rest) on (cdr type) do
318 (%write-rod n sink)
319 (when rest
320 (%write-rune #\| sink)))
321 (%write-rune #/\) sink)))
322 (cond
323 ((atom default)
324 (%write-rune #/# sink)
325 (%write-rod (rod (string-upcase (symbol-name default))) sink))
327 (when (eq :FIXED (car default))
328 (%write-rod #"#FIXED " sink))
329 (%write-rune #/\" sink)
330 (unparse-string (second default) sink)
331 (%write-rune #/\" sink)))
332 (%write-rune #/> sink)
333 (%write-rune #/U+000A sink))
335 (defmethod sax:end-dtd ((sink sink))
336 (when (have-doctype sink)
337 (%write-rod #">" sink)
338 (%write-rune #/U+000A sink)))
341 ;;;; elements
343 (defstruct (tag (:constructor make-tag (name)))
344 name
345 (n-children 0)
346 (have-gt nil))
348 (defun sink-fresh-line (sink)
349 (unless (zerop (ystream-column (sink-ystream sink)))
350 (%write-rune #/U+000A sink) ;newline
351 (indent sink)))
353 (defun maybe-close-tag (sink)
354 (let ((tag (car (stack sink))))
355 (when (and (tag-p tag) (not (tag-have-gt tag)))
356 (setf (tag-have-gt tag) t)
357 (%write-rune #/> sink))))
359 (defmethod sax:start-element
360 ((sink sink) namespace-uri local-name qname attributes)
361 (declare (ignore namespace-uri local-name))
362 (maybe-close-tag sink)
363 (when (stack sink)
364 (incf (tag-n-children (first (stack sink)))))
365 (push (make-tag qname) (stack sink))
366 (when (indentation sink)
367 (sink-fresh-line sink)
368 (start-indentation-block sink))
369 (%write-rune #/< sink)
370 (%write-rod qname sink)
371 (let ((atts (sort (copy-list attributes) #'rod< :key #'sax:attribute-qname)))
372 (dolist (a atts)
373 (%write-rune #/space sink)
374 (%write-rod (sax:attribute-qname a) sink)
375 (%write-rune #/= sink)
376 (%write-rune #/\" sink)
377 (unparse-string (sax:attribute-value a) sink)
378 (%write-rune #/\" sink)))
379 (when (canonical sink)
380 (maybe-close-tag sink)))
382 (defmethod sax:end-element
383 ((sink sink) namespace-uri local-name qname)
384 (declare (ignore namespace-uri local-name))
385 (let ((tag (pop (stack sink))))
386 (unless (tag-p tag)
387 (error "output does not nest: not in an element"))
388 (unless (rod= (tag-name tag) qname)
389 (error "output does not nest: expected ~A but got ~A"
390 (rod qname) (rod (tag-name tag))))
391 (when (indentation sink)
392 (end-indentation-block sink)
393 (unless (zerop (tag-n-children tag))
394 (sink-fresh-line sink)))
395 (cond
396 ((tag-have-gt tag)
397 (%write-rod '#.(string-rod "</") sink)
398 (%write-rod qname sink)
399 (%write-rod '#.(string-rod ">") sink))
401 (%write-rod #"/>" sink)))))
403 (defmethod sax:processing-instruction ((sink sink) target data)
404 (maybe-close-tag sink)
405 (unless (rod-equal target '#.(string-rod "xml"))
406 (%write-rod '#.(string-rod "<?") sink)
407 (%write-rod target sink)
408 (when data
409 (%write-rune #/space sink)
410 (%write-rod data sink))
411 (%write-rod '#.(string-rod "?>") sink)))
413 (defmethod sax:start-cdata ((sink sink))
414 (maybe-close-tag sink)
415 (push :cdata (stack sink)))
417 (defmethod sax:characters ((sink sink) data)
418 (maybe-close-tag sink)
419 (cond
420 ((and (eq (car (stack sink)) :cdata)
421 (not (canonical sink))
422 (not (search #"]]" data)))
423 (when (indentation sink)
424 (sink-fresh-line sink))
425 (%write-rod #"<![CDATA[" sink)
426 ;; XXX signal error if body is unprintable?
427 (map nil (lambda (c) (%write-rune c sink)) data)
428 (%write-rod #"]]>" sink))
430 (if (indentation sink)
431 (unparse-indented-text data sink)
432 (let ((y (sink-ystream sink)))
433 (if (canonical sink)
434 (loop for c across data do (unparse-datachar c y))
435 (loop for c across data do (unparse-datachar-readable c y))))))))
437 (defmethod sax:end-cdata ((sink sink))
438 (unless (eq (pop (stack sink)) :cdata)
439 (error "output does not nest: not in a cdata section")))
441 (defun indent (sink)
442 (dotimes (x (current-indentation sink))
443 (%write-rune #/U+0020 sink))) ; space
445 (defun start-indentation-block (sink)
446 (incf (current-indentation sink) (indentation sink)))
448 (defun end-indentation-block (sink)
449 (decf (current-indentation sink) (indentation sink)))
451 (defun unparse-indented-text (data sink)
452 (flet ((whitespacep (x)
453 (or (rune= x #/U+000A) (rune= x #/U+0020))))
454 (let* ((n (length data))
455 (pos (position-if-not #'whitespacep data))
456 (need-whitespace-p nil))
457 (cond
458 ((zerop n))
459 (pos
460 (sink-fresh-line sink)
461 (while (< pos n)
462 (let* ((w (or (position-if #'whitespacep data :start (1+ pos)) n))
463 (next (or (position-if-not #'whitespacep data :start w) n)))
464 (when need-whitespace-p
465 (if (< (+ (ystream-column (sink-ystream sink)) w (- pos))
466 (width sink))
467 (%write-rune #/U+0020 sink)
468 (sink-fresh-line sink)))
469 (loop
470 with y = (sink-ystream sink)
471 for i from pos below w do
472 (unparse-datachar-readable (elt data i) y))
473 (setf need-whitespace-p (< w n))
474 (setf pos next))))
476 (%write-rune #/U+0020 sink))))))
478 (defun unparse-string (str sink)
479 (let ((y (sink-ystream sink)))
480 (loop for rune across str do (unparse-datachar rune y))))
482 (defun unparse-datachar (c ystream)
483 (cond ((rune= c #/&) (write-rod '#.(string-rod "&amp;") ystream))
484 ((rune= c #/<) (write-rod '#.(string-rod "&lt;") ystream))
485 ((rune= c #/>) (write-rod '#.(string-rod "&gt;") ystream))
486 ((rune= c #/\") (write-rod '#.(string-rod "&quot;") ystream))
487 ((rune= c #/U+0009) (write-rod '#.(string-rod "&#9;") ystream))
488 ((rune= c #/U+000A) (write-rod '#.(string-rod "&#10;") ystream))
489 ((rune= c #/U+000D) (write-rod '#.(string-rod "&#13;") ystream))
491 (write-rune c ystream))))
493 (defun unparse-datachar-readable (c ystream)
494 (cond ((rune= c #/&) (write-rod '#.(string-rod "&amp;") ystream))
495 ((rune= c #/<) (write-rod '#.(string-rod "&lt;") ystream))
496 ((rune= c #/>) (write-rod '#.(string-rod "&gt;") ystream))
497 ((rune= c #/\") (write-rod '#.(string-rod "&quot;") ystream))
499 (write-rune c ystream))))
501 (defun %write-rune (c sink)
502 (write-rune c (sink-ystream sink)))
504 (defun %write-rod (r sink)
505 (write-rod r (sink-ystream sink)))
508 ;;;; convenience functions for DOMless XML serialization
510 (defvar *current-element*)
511 (defvar *sink*)
513 (defmacro with-xml-output (sink &body body)
514 `(invoke-with-xml-output (lambda () ,@body) ,sink))
516 (defun invoke-with-xml-output (fn sink)
517 (let ((*sink* sink)
518 (*current-element* nil))
519 (sax:start-document *sink*)
520 (funcall fn)
521 (sax:end-document *sink*)))
523 (defmacro with-element (qname &body body)
524 `(invoke-with-element (lambda () ,@body) ,qname))
526 (defun maybe-emit-start-tag ()
527 (when *current-element*
528 ;; starting child node, need to emit opening tag of parent first:
529 (destructuring-bind (qname &rest attributes) *current-element*
530 (sax:start-element *sink* nil nil qname (reverse attributes)))
531 (setf *current-element* nil)))
533 (defun invoke-with-element (fn qname)
534 (setf qname (rod qname))
535 (maybe-emit-start-tag)
536 (let ((*current-element* (list qname)))
537 (multiple-value-prog1
538 (funcall fn)
539 (maybe-emit-start-tag)
540 (sax:end-element *sink* nil nil qname))))
542 (defun attribute (name value)
543 (push (sax:make-attribute :qname (rod name) :value (rod value))
544 (cdr *current-element*))
545 value)
547 (defun cdata (data)
548 (maybe-emit-start-tag)
549 (sax:start-cdata *sink*)
550 (sax:characters *sink* (rod data))
551 (sax:end-cdata *sink*)
552 data)
554 (defun text (data)
555 (maybe-emit-start-tag)
556 (sax:characters *sink* (rod data))
557 data)
559 (defun rod-to-utf8-string (rod)
560 (let ((out (make-buffer :element-type 'character)))
561 (runes-to-utf8/adjustable-string out rod (length rod))
562 out))
564 (defun utf8-string-to-rod (str)
565 (let* ((bytes (map '(vector (unsigned-byte 8)) #'char-code str))
566 (buffer (make-array (length bytes) :element-type '(unsigned-byte 16)))
567 (n (decode-sequence :utf-8 bytes 0 (length bytes) buffer 0 0 nil))
568 (result (make-array n :element-type 'rune)))
569 (map-into result #'code-rune buffer)
570 result))