Fix/add documentation for klacks:list-attributes, get-attribute
[cxml.git] / xml / xml-parse.lisp
blobcfbb441ac1db179d0fd75be48aa340030a98f664
1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: CXML; readtable: runes; -*-
2 ;;; ---------------------------------------------------------------------------
3 ;;; Title: XML parser
4 ;;; Created: 1999-07-17
5 ;;; Author: Gilbert Baumann <unk6@rz.uni-karlsruhe.de>
6 ;;; Author: Henrik Motakef <hmot@henrik-motakef.de>
7 ;;; Author: David Lichteblau <david@lichteblau.com>
8 ;;; License: Lisp-LGPL (See file COPYING for details).
9 ;;; ---------------------------------------------------------------------------
10 ;;; (c) copyright 1999 by Gilbert Baumann
11 ;;; (c) copyright 2003 by Henrik Motakef
12 ;;; (c) copyright 2004 knowledgeTools Int. GmbH
13 ;;; (c) copyright 2004 David Lichteblau
14 ;;; (c) copyright 2005 David Lichteblau
16 ;;; This library is free software; you can redistribute it and/or
17 ;;; modify it under the terms of the GNU Library General Public
18 ;;; License as published by the Free Software Foundation; either
19 ;;; version 2 of the License, or (at your option) any later version.
20 ;;;
21 ;;; This library is distributed in the hope that it will be useful,
22 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
23 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 ;;; Library General Public License for more details.
25 ;;;
26 ;;; You should have received a copy of the GNU Library General Public
27 ;;; License along with this library; if not, write to the
28 ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
29 ;;; Boston, MA 02111-1307 USA.
31 ;;; Streams
33 ;;; xstreams
35 ;; For reading runes, I defined my own streams, called xstreams,
36 ;; because we want to be fast. A function call or even a method call
37 ;; per character is not acceptable, instead of that we define a
38 ;; buffered stream with and advertised buffer layout, so that we
39 ;; could use the trick stdio uses: READ-RUNE and PEEK-RUNE are macros,
40 ;; directly accessing the buffer and only calling some underflow
41 ;; handler in case of stream underflows. This will yield to quite a
42 ;; performance boost vs calling READ-BYTE per character.
44 ;; Also we need to do encoding t conversion on ; this better done at large chunks of data rather than on a character
45 ;; by character basis. This way we need a dispatch on the active
46 ;; encoding only once in a while, instead of for each character. This
47 ;; allows us to use a CLOS interface to do the underflow handling.
49 ;;; zstreams
51 ;; Now, for reading tokens, we define another kind of streams, called
52 ;; zstreams. These zstreams also maintain an input stack to implement
53 ;; inclusion of external entities. This input stack contains xstreams
54 ;; or the special marker :STOP. Such a :STOP marker indicates, that
55 ;; input should not continue there, but well stop; that is simulate an
56 ;; EOF. The user is then responsible to pop this marker off the input
57 ;; stack.
59 ;; This input stack is also used to detect circular entity inclusion.
61 ;; The zstream tokenizer recognizes the following types of tokens and
62 ;; is controlled by the *DATA-BEHAVIOUR* flag. (Which should become a
63 ;; slot of zstreams instead).
65 ;; Common
66 ;; :xml-decl (<target> . <content>) ;processing-instruction starting with "<?xml"
67 ;; :pi (<target> . <content>) ;processing-instruction
68 ;; :stag (<name> . <atts>) ;start tag
69 ;; :etag (<name> . <atts>) ;end tag
70 ;; :ztag (<name> . <atts>) ;empty tag
71 ;; :<!ELEMENT
72 ;; :<!ENTITY
73 ;; :<!ATTLIST
74 ;; :<!NOTATION
75 ;; :<!DOCTYPE
76 ;; :<![
77 ;; :comment <content>
79 ;; *data-behaviour* = :DTD
81 ;; :nmtoken <interned-rod>
82 ;; :#required
83 ;; :#implied
84 ;; :#fixed
85 ;; :#pcdata
86 ;; :s
87 ;; :\[ :\] :\( :\) :|\ :\> :\" :\' :\, :\? :\* :\+
89 ;; *data-behaviour* = :DOC
91 ;; :entity-ref <interned-rod>
92 ;; :cdata <rod>
95 ;;; TODO
97 ;; o provide for a faster DOM
99 ;; o morph zstream into a context object and thus also get rid of
100 ;; special variables. Put the current DTD there too.
101 ;; [partly done]
103 ;; o the *scratch-pad* hack should become something much more
104 ;; reentrant, we could either define a system-wide resource
105 ;; or allocate some scratch-pads per context.
106 ;; [for thread-safety reasons the array are allocated per context now,
107 ;; reentrancy is still open]
109 ;; o CR handling in utf-16 deocders
111 ;; o UCS-4 reader
113 ;; o max depth together with circle detection
114 ;; (or proof, that our circle detection is enough).
115 ;; [gemeint ist zstream-push--david]
117 ;; o better extensibility wrt character representation, one may want to
118 ;; have
119 ;; - UCS-4 in vectoren
121 ;; o xstreams auslagern, documententieren und dann auch in SGML und
122 ;; CSS parser verwenden. (halt alles was zeichen liest).
123 ;; [ausgelagert sind sie; dokumentiert "so la la"; die Reintegration
124 ;; in Closure ist ein ganz anderes Thema]
126 ;; o recording of source locations for nodes.
128 ;; o based on the DTD and xml:space attribute implement HTML white
129 ;; space rules.
131 ;; o on a parser option, do not expand external entities.
133 ;;;; Validity constraints:
134 ;;;; (00) Root Element Type like (03), c.f. MAKE-ROOT-MODEL
135 ;;;; (01) Proper Declaration/PE Nesting P/MARKUP-DECL
136 ;;;; (02) Standalone Document Declaration all over the place [*]
137 ;;;; (03) Element Valid VALIDATE-*-ELEMENT, -CHARACTERS
138 ;;;; (04) Attribute Value Type VALIDATE-ATTRIBUTE
139 ;;;; (05) Unique Element Type Declaration DEFINE-ELEMENT
140 ;;;; (06) Proper Group/PE Nesting P/CSPEC
141 ;;;; (07) No Duplicate Types LEGAL-CONTENT-MODEL-P
142 ;;;; (08) ID VALIDATE-ATTRIBUTE
143 ;;;; (09) One ID per Element Type DEFINE-ATTRIBUTE
144 ;;;; (10) ID Attribute Default DEFINE-ATTRIBUTE
145 ;;;; (11) IDREF VALIDATE-ATTRIBUTE, P/DOCUMENT
146 ;;;; (12) Entity Name VALIDATE-ATTRIBUTE
147 ;;;; (13) Name Token VALIDATE-ATTRIBUTE
148 ;;;; (14) Notation Attributes VALIDATE-ATTRIBUTE, P/ATT-TYPE
149 ;;;; (15) One Notation Per Element Type DEFINE-ATTRIBUTE
150 ;;;; (16) No Notation on Empty Element DEFINE-ELEMENT, -ATTRIBUTE
151 ;;;; (17) Enumeration VALIDATE-ATTRIBUTE
152 ;;;; (18) Required Attribute PROCESS-ATTRIBUTES
153 ;;;; (19) Attribute Default Legal DEFINE-ATTRIBUTE
154 ;;;; (20) Fixed Attribute Default VALIDATE-ATTRIBUTE
155 ;;;; (21) Proper Conditional Section/PE Nesting P/CONDITIONAL-SECT, ...
156 ;;;; (22) Entity Declared [**]
157 ;;;; (23) Notation Declared P/ENTITY-DEF, P/DOCUMENT
158 ;;;; (24) Unique Notation Name DEFINE-NOTATION
159 ;;;;
160 ;;;; [*] Perhaps we could revert the explicit checks of (02), if we did
161 ;;;; _not_ read external subsets of standalone documents when parsing in
162 ;;;; validating mode. Violations of VC (02) constraints would then appear as
163 ;;;; wellformedness violations, right?
164 ;;;;
165 ;;;; [**] Although I haven't investigated this properly yet, I believe that
166 ;;;; we check this VC together with the WFC even in non-validating mode.
168 (in-package :cxml)
170 #+allegro
171 (setf (excl:named-readtable :runes) *readtable*)
173 (eval-when (:compile-toplevel :load-toplevel :execute)
174 (defparameter *fast* '(optimize (speed 3) (safety 0)))
175 ;;(defparameter *fast* '(optimize (speed 2) (safety 3)))
178 ;;; parser context
180 (defvar *ctx* nil)
182 (defstruct (context (:conc-name nil))
183 handler
184 (dtd nil)
185 model-stack
186 ;; xml:base machen wir fuer klacks mal gleich als expliziten stack:
187 base-stack
188 (referenced-notations '())
189 (id-table (%make-rod-hash-table))
190 ;; FIXME: Wofuer ist name-hashtable da? Will man das wissen?
191 (name-hashtable (make-rod-hashtable :size 2000))
192 (standalone-p nil)
193 (entity-resolver nil)
194 (disallow-internal-subset nil)
195 main-zstream)
197 (defvar *expand-pe-p* nil)
199 (defparameter *initial-namespace-bindings*
200 '((#"" . nil)
201 (#"xmlns" . #"http://www.w3.org/2000/xmlns/")
202 (#"xml" . #"http://www.w3.org/XML/1998/namespace")))
204 (defparameter *namespace-bindings* *initial-namespace-bindings*)
206 ;;;; ---------------------------------------------------------------------------
207 ;;;; xstreams
208 ;;;;
211 (defstruct (stream-name
212 (:print-function print-stream-name))
213 entity-name
214 entity-kind
215 uri)
217 (defun print-stream-name (object stream depth)
218 (declare (ignore depth))
219 (format stream "[~A ~S ~A]"
220 (rod-string (stream-name-entity-name object))
221 (stream-name-entity-kind object)
222 (stream-name-uri object)))
224 (deftype read-element () 'rune)
226 (defun call-with-open-xstream (fn stream)
227 (unwind-protect
228 (funcall fn stream)
229 (close-xstream stream)))
231 (defmacro with-open-xstream ((var value) &body body)
232 `(call-with-open-xstream (lambda (,var) ,@body) ,value))
234 (defun call-with-open-xfile (continuation &rest open-args)
235 (let ((input (apply #'open (car open-args) :element-type '(unsigned-byte 8) (cdr open-args))))
236 (unwind-protect
237 (progn
238 (funcall continuation (make-xstream input)))
239 (close input))))
241 (defmacro with-open-xfile ((stream &rest open-args) &body body)
242 `(call-with-open-xfile (lambda (,stream) .,body) .,open-args))
244 ;;;; -------------------------------------------------------------------
245 ;;;; Rechnen mit Runen
246 ;;;;
248 ;; Let us first define fast fixnum arithmetric get rid of type
249 ;; checks. (After all we know what we do here).
251 (defmacro fx-op (op &rest xs)
252 `(the fixnum (,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs))))
253 (defmacro fx-pred (op &rest xs)
254 `(,op ,@(mapcar (lambda (x) `(the fixnum ,x)) xs)))
256 (defmacro %+ (&rest xs) `(fx-op + ,@xs))
257 (defmacro %- (&rest xs) `(fx-op - ,@xs))
258 (defmacro %* (&rest xs) `(fx-op * ,@xs))
259 (defmacro %/ (&rest xs) `(fx-op floor ,@xs))
260 (defmacro %and (&rest xs) `(fx-op logand ,@xs))
261 (defmacro %ior (&rest xs) `(fx-op logior ,@xs))
262 (defmacro %xor (&rest xs) `(fx-op logxor ,@xs))
263 (defmacro %ash (&rest xs) `(fx-op ash ,@xs))
264 (defmacro %mod (&rest xs) `(fx-op mod ,@xs))
266 (defmacro %= (&rest xs) `(fx-pred = ,@xs))
267 (defmacro %<= (&rest xs) `(fx-pred <= ,@xs))
268 (defmacro %>= (&rest xs) `(fx-pred >= ,@xs))
269 (defmacro %< (&rest xs) `(fx-pred < ,@xs))
270 (defmacro %> (&rest xs) `(fx-pred > ,@xs))
272 ;;; XXX Geschwindigkeit dieser Definitionen untersuchen!
274 (defmacro rune-op (op &rest xs)
275 `(code-rune (,op ,@(mapcar (lambda (x) `(rune-code ,x)) xs))))
276 (defmacro rune-pred (op &rest xs)
277 `(,op ,@(mapcar (lambda (x) `(rune-code ,x)) xs)))
279 (defmacro %rune+ (&rest xs) `(rune-op + ,@xs))
280 (defmacro %rune- (&rest xs) `(rune-op - ,@xs))
281 (defmacro %rune* (&rest xs) `(rune-op * ,@xs))
282 (defmacro %rune/ (&rest xs) `(rune-op floor ,@xs))
283 (defmacro %rune-and (&rest xs) `(rune-op logand ,@xs))
284 (defmacro %rune-ior (&rest xs) `(rune-op logior ,@xs))
285 (defmacro %rune-xor (&rest xs) `(rune-op logxor ,@xs))
286 (defmacro %rune-ash (a b) `(code-rune (ash (rune-code ,a) ,b)))
287 (defmacro %rune-mod (&rest xs) `(rune-op mod ,@xs))
289 (defmacro %rune= (&rest xs) `(rune-pred = ,@xs))
290 (defmacro %rune<= (&rest xs) `(rune-pred <= ,@xs))
291 (defmacro %rune>= (&rest xs) `(rune-pred >= ,@xs))
292 (defmacro %rune< (&rest xs) `(rune-pred < ,@xs))
293 (defmacro %rune> (&rest xs) `(rune-pred > ,@xs))
295 ;;;; ---------------------------------------------------------------------------
296 ;;;; rod hashtable
297 ;;;;
299 ;;; make-rod-hashtable
300 ;;; rod-hash-get hashtable rod &optional start end -> value ; successp
301 ;;; (setf (rod-hash-get hashtable rod &optional start end) new-value
304 (defstruct (rod-hashtable (:constructor make-rod-hashtable/low))
305 size ;size of table
306 table ;
309 (defun make-rod-hashtable (&key (size 200))
310 (setf size (nearest-greater-prime size))
311 (make-rod-hashtable/low
312 :size size
313 :table (make-array size :initial-element nil)))
315 (eval-when (:compile-toplevel :load-toplevel :execute)
316 (defconstant +fixnum-bits+
317 (1- (integer-length most-positive-fixnum))
318 "Pessimistic approximation of the number of bits of fixnums.")
320 (defconstant +fixnum-mask+
321 (1- (expt 2 +fixnum-bits+))
322 "Pessimistic approximation of the largest bit-mask, still being a fixnum."))
324 (definline stir (a b)
325 (%and +fixnum-mask+
326 (%xor (%ior (%ash (%and a #.(ash +fixnum-mask+ -5)) 5)
327 (%ash a #.(- 5 +fixnum-bits+)))
328 b)))
330 (definline rod-hash (rod start end)
331 "Compute a hash code out of a rod."
332 (let ((res (%- end start)))
333 (do ((i start (%+ i 1)))
334 ((%= i end))
335 (declare (type fixnum i))
336 (setf res (stir res (rune-code (%rune rod i)))))
337 res))
339 (definline rod=* (x y &key (start1 0) (end1 (length x))
340 (start2 0) (end2 (length y)))
341 (and (%= (%- end1 start1) (%- end2 start2))
342 (do ((i start1 (%+ i 1))
343 (j start2 (%+ j 1)))
344 ((%= i end1)
346 (unless (rune= (%rune x i) (%rune y j))
347 (return nil)))))
349 (definline rod=** (x y start1 end1 start2 end2)
350 (and (%= (%- end1 start1) (%- end2 start2))
351 (do ((i start1 (%+ i 1))
352 (j start2 (%+ j 1)))
353 ((%= i end1)
355 (unless (rune= (%rune x i) (%rune y j))
356 (return nil)))))
358 (defun rod-hash-get (hashtable rod &optional (start 0) (end (length rod)))
359 (declare (type (simple-array rune (*)) rod))
360 (let ((j (%mod (rod-hash rod start end)
361 (rod-hashtable-size hashtable))))
362 (dolist (q (svref (rod-hashtable-table hashtable) j)
363 (values nil nil nil))
364 (declare (type cons q))
365 (when (rod=** (car q) rod 0 (length (the (simple-array rune (*)) (car q))) start end)
366 (return (values (cdr q) t (car q)))))))
368 (defun rod-hash-set (new-value hashtable rod &optional (start 0) (end (length rod)))
369 (let ((j (%mod (rod-hash rod start end)
370 (rod-hashtable-size hashtable)))
371 (key nil))
372 (dolist (q (svref (rod-hashtable-table hashtable) j)
373 (progn
374 (setf key (rod-subseq* rod start end))
375 (push (cons key new-value)
376 (aref (rod-hashtable-table hashtable) j))))
377 (when (rod=* (car q) rod :start2 start :end2 end)
378 (setf key (car q))
379 (setf (cdr q) new-value)
380 (return)))
381 (values new-value key)))
383 #-rune-is-character
384 (defun rod-subseq* (source start &optional (end (length source)))
385 (unless (and (typep start 'fixnum) (>= start 0))
386 (error "~S is not a non-negative fixnum." start))
387 (unless (and (typep end 'fixnum) (>= end start))
388 (error "END argument, ~S, is not a fixnum no less than START, ~S." end start))
389 (when (> start (length source))
390 (error "START argument, ~S, should be no greater than length of rod." start))
391 (when (> end (length source))
392 (error "END argument, ~S, should be no greater than length of rod." end))
393 (locally
394 (declare (type fixnum start end))
395 (let ((res (make-rod (- end start))))
396 (declare (type rod res))
397 (do ((i (- (- end start) 1) (the fixnum (- i 1))))
398 ((< i 0) res)
399 (declare (type fixnum i))
400 (setf (%rune res i) (aref source (the fixnum (+ i start))))))))
402 #+rune-is-character
403 (defun rod-subseq* (source start &optional (end (length source)))
404 (subseq source start end))
406 (deftype ufixnum () `(unsigned-byte ,(integer-length most-positive-fixnum)))
408 #-rune-is-character
409 (defun rod-subseq** (source start &optional (end (length source)))
410 (declare (type (simple-array rune (*)) source)
411 (type ufixnum start)
412 (type ufixnum end)
413 (optimize (speed 3) (safety 0)))
414 (let ((res (make-array (%- end start) :element-type 'rune)))
415 (declare (type (simple-array rune (*)) res))
416 (let ((i (%- end start)))
417 (declare (type ufixnum i))
418 (loop
419 (setf i (- i 1))
420 (when (= i 0)
421 (return))
422 (setf (%rune res i) (%rune source (the ufixnum (+ i start))))))
423 res))
425 #+rune-is-character
426 (defun rod-subseq** (source start &optional (end (length source)))
427 (subseq source start end))
429 (defun (setf rod-hash-get) (new-value hashtable rod &optional (start 0) (end (length rod)))
430 (rod-hash-set new-value hashtable rod start end))
432 (defun intern-name (rod &optional (start 0) (end (length rod)))
433 (multiple-value-bind (value successp key) (rod-hash-get (name-hashtable *ctx*) rod start end)
434 (declare (ignore value))
435 (if successp
437 (nth-value 1 (rod-hash-set t (name-hashtable *ctx*) rod start end)))))
439 ;;;; ---------------------------------------------------------------------------
440 ;;;;
441 ;;;; rod collector
442 ;;;;
444 (defvar *scratch-pad*)
445 (defvar *scratch-pad-2*)
446 (defvar *scratch-pad-3*)
447 (defvar *scratch-pad-4*)
449 (declaim (type (simple-array rune (*))
450 *scratch-pad* *scratch-pad-2* *scratch-pad-3* *scratch-pad-4*))
452 (defmacro with-scratch-pads ((&optional) &body body)
453 `(let ((*scratch-pad* (make-array 1024 :element-type 'rune))
454 (*scratch-pad-2* (make-array 1024 :element-type 'rune))
455 (*scratch-pad-3* (make-array 1024 :element-type 'rune))
456 (*scratch-pad-4* (make-array 1024 :element-type 'rune)))
457 ,@body))
459 (defmacro %put-unicode-char (code-var put)
460 `(progn
461 (cond #+rune-is-utf-16
462 ((%> ,code-var #xFFFF)
463 (,put (the rune (code-rune (%+ #xD7C0 (%ash ,code-var -10)))))
464 (,put (the rune (code-rune (%ior #xDC00 (%and ,code-var #x03FF))))))
466 (,put (code-rune ,code-var))))))
468 (defun adjust-array-by-copying (old-array new-size)
469 "Adjust an array by copying and thus ensures, that result is a SIMPLE-ARRAY."
470 (let ((res (make-array new-size :element-type (array-element-type old-array))))
471 (replace res old-array
472 :start1 0 :end1 (length old-array)
473 :start2 0 :end2 (length old-array))
474 res))
476 (defmacro with-rune-collector-aux (scratch collect body mode)
477 (let ((rod (gensym))
478 (n (gensym))
479 (i (gensym))
480 (b (gensym)))
481 `(let ((,n (length ,scratch))
482 (,i 0)
483 (,b ,scratch))
484 (declare (type fixnum ,n ,i))
485 (macrolet
486 ((,collect (x)
487 `((lambda (x)
488 (locally
489 (declare #.*fast*)
490 (when (%>= ,',i ,',n)
491 (setf ,',n (* 2 ,',n))
492 (setf ,',b
493 (setf ,',scratch
494 (adjust-array-by-copying ,',scratch ,',n))))
495 (setf (aref (the (simple-array rune (*)) ,',b) ,',i) x)
496 (incf ,',i)))
497 ,x)))
498 ,@body
499 ,(ecase mode
500 (:intern
501 `(intern-name ,b 0 ,i))
502 (:copy
503 `(let ((,rod (make-rod ,i)))
504 (while (not (%= ,i 0))
505 (setf ,i (%- ,i 1))
506 (setf (%rune ,rod ,i)
507 (aref (the (simple-array rune (*)) ,b) ,i)))
508 ,rod))
509 (:raw
510 `(values ,b 0 ,i))
511 )))))
513 '(defmacro with-rune-collector-aux (scratch collect body mode)
514 (let ((rod (gensym))
515 (n (gensym))
516 (i (gensym))
517 (b (gensym)))
518 `(let ((,n (length ,scratch))
519 (,i 0))
520 (declare (type fixnum ,n ,i))
521 (macrolet
522 ((,collect (x)
523 `((lambda (x)
524 (locally
525 (declare #.*fast*)
526 (when (%>= ,',i ,',n)
527 (setf ,',n (* 2 ,',n))
528 (setf ,',scratch
529 (setf ,',scratch
530 (adjust-array-by-copying ,',scratch ,',n))))
531 (setf (aref (the (simple-array rune (*)) ,',scratch) ,',i) x)
532 (incf ,',i)))
533 ,x)))
534 ,@body
535 ,(ecase mode
536 (:intern
537 `(intern-name ,scratch 0 ,i))
538 (:copy
539 `(let ((,rod (make-rod ,i)))
540 (while (%> ,i 0)
541 (setf ,i (%- ,i 1))
542 (setf (%rune ,rod ,i)
543 (aref (the (simple-array rune (*)) ,scratch) ,i)))
544 ,rod))
545 (:raw
546 `(values ,scratch 0 ,i))
547 )))))
549 (defmacro with-rune-collector ((collect) &body body)
550 `(with-rune-collector-aux *scratch-pad* ,collect ,body :copy))
552 (defmacro with-rune-collector-2 ((collect) &body body)
553 `(with-rune-collector-aux *scratch-pad-2* ,collect ,body :copy))
555 (defmacro with-rune-collector-3 ((collect) &body body)
556 `(with-rune-collector-aux *scratch-pad-3* ,collect ,body :copy))
558 (defmacro with-rune-collector-4 ((collect) &body body)
559 `(with-rune-collector-aux *scratch-pad-4* ,collect ,body :copy))
561 (defmacro with-rune-collector/intern ((collect) &body body)
562 `(with-rune-collector-aux *scratch-pad* ,collect ,body :intern))
564 (defmacro with-rune-collector/raw ((collect) &body body)
565 `(with-rune-collector-aux *scratch-pad* ,collect ,body :raw))
568 (defmacro while-reading-runes ((reader stream-in) &rest body)
569 ;; Thou shalt not leave body via a non local exit
570 (let ((stream (make-symbol "STREAM"))
571 (rptr (make-symbol "RPTR"))
572 (fptr (make-symbol "FPTR"))
573 (buf (make-symbol "BUF")) )
574 `(let* ((,stream ,stream-in)
575 (,rptr (xstream-read-ptr ,stream))
576 (,fptr (xstream-fill-ptr ,stream))
577 (,buf (xstream-buffer ,stream)))
578 (declare (type fixnum ,rptr ,fptr)
579 (type xstream ,stream))
580 (macrolet ((,reader (res-var)
581 `(cond ((%= ,',rptr ,',fptr)
582 (setf (xstream-read-ptr ,',stream) ,',rptr)
583 (setf ,res-var (xstream-underflow ,',stream))
584 (setf ,',rptr (xstream-read-ptr ,',stream))
585 (setf ,',fptr (xstream-fill-ptr ,',stream))
586 (setf ,',buf (xstream-buffer ,',stream)))
588 (setf ,res-var
589 (aref (the (simple-array read-element (*)) ,',buf)
590 (the fixnum ,',rptr)))
591 (setf ,',rptr (%+ ,',rptr 1))))))
592 (prog1
593 (let () .,body)
594 (setf (xstream-read-ptr ,stream) ,rptr) )))))
597 ;;;; ---------------------------------------------------------------------------
598 ;;;; DTD
599 ;;;;
601 (define-condition xml-parse-error (simple-error) ()
602 (:documentation
603 "Superclass of all conditions signalled by the CXML parser."))
605 (define-condition well-formedness-violation (xml-parse-error) ()
606 (:documentation
607 "This condition is signalled for all well-formedness violations.
609 Note for validating mode: Sometimes violations of well-formedness are
610 first detected as validity errors by the parser and signalled as
611 instances of @class{validity-error} rather
612 than well-formedness-violation."))
614 (define-condition validity-error (xml-parse-error) ()
615 (:documentation
616 "Reports the violation of a validity constraint."))
618 ;; We make some effort to signal end of file as a special condition, but we
619 ;; don't actually try very hard. Not sure whether we should. Right now I
620 ;; would prefer not to document this class.
621 (define-condition end-of-xstream (well-formedness-violation) ())
623 (defun describe-xstream (x s)
624 (format s " Line ~D, column ~D in ~A~%"
625 (xstream-line-number x)
626 (xstream-column-number x)
627 (let ((name (xstream-name x)))
628 (cond
629 ((null name)
630 "<anonymous stream>")
631 ((eq :main (stream-name-entity-kind name))
632 (stream-name-uri name))
634 name)))))
636 (defun %error (class stream message)
637 (let* ((zmain (if *ctx* (main-zstream *ctx*) nil))
638 (zstream (if (zstream-p stream) stream zmain))
639 (xstream (if (xstream-p stream) stream nil))
640 (s (make-string-output-stream)))
641 (write-line message s)
642 (when xstream
643 (write-line "Location:" s)
644 (describe-xstream xstream s))
645 (when zstream
646 (let ((stack
647 (remove xstream (remove :stop (zstream-input-stack zstream)))))
648 (when stack
649 (write-line "Context:" s)
650 (dolist (x stack)
651 (describe-xstream x s)))))
652 (when (and zmain (not (eq zstream zmain)))
653 (let ((stack
654 (remove xstream (remove :stop (zstream-input-stack zmain)))))
655 (when stack
656 (write-line "Context in main document:" s)
657 (dolist (x stack)
658 (describe-xstream x s)))))
659 (error class
660 :format-control "~A"
661 :format-arguments (list (get-output-stream-string s)))))
663 (defun validity-error (fmt &rest args)
664 (%error 'validity-error
666 (format nil "Document not valid: ~?" fmt args)))
668 (defun wf-error (stream fmt &rest args)
669 (%error 'well-formedness-violation
670 stream
671 (format nil "Document not well-formed: ~?" fmt args)))
673 (defun eox (stream &optional x &rest args)
674 (%error 'end-of-xstream
675 stream
676 (format nil "End of file~@[: ~?~]" x args)))
678 (defclass cxml-parser (sax:sax-parser) ((ctx :initarg :ctx)))
680 (defun parser-xstream (parser)
681 (car (zstream-input-stack (main-zstream (slot-value parser 'ctx)))))
683 (defun parser-stream-name (parser)
684 (let ((xstream (parser-xstream parser)))
685 (if xstream
686 (xstream-name xstream)
687 nil)))
689 (defmethod sax:line-number ((parser cxml-parser))
690 (let ((x (parser-xstream parser)))
691 (if x
692 (xstream-line-number x)
693 nil)))
695 (defmethod sax:column-number ((parser cxml-parser))
696 (let ((x (parser-xstream parser)))
697 (if x
698 (xstream-column-number x)
699 nil)))
701 (defmethod sax:system-id ((parser cxml-parser))
702 (let ((name (parser-stream-name parser)))
703 (if name
704 (stream-name-uri name)
705 nil)))
707 (defmethod sax:xml-base ((parser cxml-parser))
708 (let ((uri (car (base-stack (slot-value parser 'ctx)))))
709 (if (or (null uri) (stringp uri))
711 (puri:render-uri uri nil))))
713 (defvar *validate* t)
714 (defvar *external-subset-p* nil)
716 (defun validate-start-element (ctx name)
717 (when *validate*
718 (let* ((pair (car (model-stack ctx)))
719 (newval (funcall (car pair) name)))
720 (unless newval
721 (validity-error "(03) Element Valid: ~A" (rod-string name)))
722 (setf (car pair) newval)
723 (let ((e (find-element name (dtd ctx))))
724 (unless e
725 (validity-error "(03) Element Valid: no definition for ~A"
726 (rod-string name)))
727 (maybe-compile-cspec e)
728 (push (copy-cons (elmdef-compiled-cspec e)) (model-stack ctx))))))
730 (defun copy-cons (x)
731 (cons (car x) (cdr x)))
733 (defun validate-end-element (ctx name)
734 (when *validate*
735 (let ((pair (car (model-stack ctx))))
736 (unless (eq (funcall (car pair) nil) t)
737 (validity-error "(03) Element Valid: ~A" (rod-string name)))
738 (pop (model-stack ctx)))))
740 (defun validate-characters (ctx rod)
741 (when *validate*
742 (let ((pair (car (model-stack ctx))))
743 (unless (funcall (cdr pair) rod)
744 (validity-error "(03) Element Valid: unexpected PCDATA")))))
746 (defun standalone-check-necessary-p (def)
747 (and *validate*
748 (standalone-p *ctx*)
749 (etypecase def
750 (elmdef (elmdef-external-p def))
751 (attdef (attdef-external-p def)))))
753 ;; attribute validation, defaulting, and normalization -- except for for
754 ;; uniqueness checks, which are done after namespaces have been declared
755 (defun process-attributes (ctx name attlist)
756 (let ((e (find-element name (dtd ctx))))
757 (cond
759 (dolist (ad (elmdef-attributes e)) ;handle default values
760 (unless (get-attribute (attdef-name ad) attlist)
761 (case (attdef-default ad)
762 (:IMPLIED)
763 (:REQUIRED
764 (when *validate*
765 (validity-error "(18) Required Attribute: ~S not specified"
766 (rod-string (attdef-name ad)))))
768 (when (standalone-check-necessary-p ad)
769 (validity-error "(02) Standalone Document Declaration: missing attribute value"))
770 (push (sax:make-attribute :qname (attdef-name ad)
771 :value (cadr (attdef-default ad))
772 :specified-p nil)
773 attlist)))))
774 (dolist (a attlist) ;normalize non-CDATA values
775 (let* ((qname (sax:attribute-qname a))
776 (adef (find-attribute e qname)))
777 (when adef
778 (when (and *validate*
779 sax:*namespace-processing*
780 (eq (attdef-type adef) :ID)
781 (find #/: (sax:attribute-value a)))
782 (validity-error "colon in ID attribute"))
783 (unless (eq (attdef-type adef) :CDATA)
784 (let ((canon (canon-not-cdata-attval (sax:attribute-value a))))
785 (when (and (standalone-check-necessary-p adef)
786 (not (rod= (sax:attribute-value a) canon)))
787 (validity-error "(02) Standalone Document Declaration: attribute value not normalized"))
788 (setf (sax:attribute-value a) canon))))))
789 (when *validate* ;maybe validate attribute values
790 (dolist (a attlist)
791 (validate-attribute ctx e a))))
792 ((and *validate* attlist)
793 (validity-error "(04) Attribute Value Type: no definition for element ~A"
794 (rod-string name)))))
795 attlist)
797 (defun get-attribute (name attributes)
798 (member name attributes :key #'sax:attribute-qname :test #'rod=))
800 (defun validate-attribute (ctx e a)
801 (when (sax:attribute-specified-p a) ;defaults checked by DEFINE-ATTRIBUTE
802 (let* ((qname (sax:attribute-qname a))
803 (adef
804 (or (find-attribute e qname)
805 (validity-error "(04) Attribute Value Type: not declared: ~A"
806 (rod-string qname)))))
807 (validate-attribute* ctx adef (sax:attribute-value a)))))
809 (defun validate-attribute* (ctx adef value)
810 (let ((type (attdef-type adef))
811 (default (attdef-default adef)))
812 (when (and (listp default)
813 (eq (car default) :FIXED)
814 (not (rod= value (cadr default))))
815 (validity-error "(20) Fixed Attribute Default: expected ~S but got ~S"
816 (rod-string (cadr default))
817 (rod-string value)))
818 (ecase (if (listp type) (car type) type)
819 (:ID
820 (unless (valid-name-p value)
821 (validity-error "(08) ID: not a name: ~S" (rod-string value)))
822 (when (eq (gethash value (id-table ctx)) t)
823 (validity-error "(08) ID: ~S not unique" (rod-string value)))
824 (setf (gethash value (id-table ctx)) t))
825 (:IDREF
826 (validate-idref ctx value))
827 (:IDREFS
828 (let ((names (split-names value)))
829 (unless names
830 (validity-error "(11) IDREF: malformed names"))
831 (mapc (curry #'validate-idref ctx) names)))
832 (:NMTOKEN
833 (validate-nmtoken value))
834 (:NMTOKENS
835 (let ((tokens (split-names value)))
836 (unless tokens
837 (validity-error "(13) Name Token: malformed NMTOKENS"))
838 (mapc #'validate-nmtoken tokens)))
839 (:ENUMERATION
840 (unless (member value (cdr type) :test #'rod=)
841 (validity-error "(17) Enumeration: value not declared: ~S"
842 (rod-string value))))
843 (:NOTATION
844 (unless (member value (cdr type) :test #'rod=)
845 (validity-error "(14) Notation Attributes: ~S" (rod-string value))))
846 (:ENTITY
847 (validate-entity value))
848 (:ENTITIES
849 (let ((names (split-names value)))
850 (unless names
851 (validity-error "(13) Name Token: malformed NMTOKENS"))
852 (mapc #'validate-entity names)))
853 (:CDATA))))
855 (defun validate-idref (ctx value)
856 (unless (valid-name-p value)
857 (validity-error "(11) IDREF: not a name: ~S" (rod-string value)))
858 (unless (gethash value (id-table ctx))
859 (setf (gethash value (id-table ctx)) nil)))
861 (defun validate-nmtoken (value)
862 (unless (valid-nmtoken-p value)
863 (validity-error "(13) Name Token: not a NMTOKEN: ~S"
864 (rod-string value))))
866 (defstruct (entdef (:constructor)))
868 (defstruct (internal-entdef
869 (:include entdef)
870 (:constructor make-internal-entdef (value))
871 (:conc-name #:entdef-))
872 (value (error "missing argument") :type rod)
873 (expansion nil)
874 (external-subset-p *external-subset-p*))
876 (defstruct (external-entdef
877 (:include entdef)
878 (:constructor make-external-entdef (extid ndata))
879 (:conc-name #:entdef-))
880 (extid (error "missing argument") :type extid)
881 (ndata nil :type (or rod null)))
883 (defun validate-entity (value)
884 (unless (valid-name-p value)
885 (validity-error "(12) Entity Name: not a name: ~S" (rod-string value)))
886 (let ((def (let ((*validate*
887 ;; Similarly the entity refs are internal and
888 ;; don't need normalization ... the unparsed
889 ;; entities (and entities) aren't "references"
890 ;; -- sun/valid/sa03.xml
891 nil))
892 (get-entity-definition value :general (dtd *ctx*)))))
893 (unless (and (typep def 'external-entdef) (entdef-ndata def))
894 ;; unparsed entity
895 (validity-error "(12) Entity Name: ~S" (rod-string value)))))
897 (defun split-names (rod)
898 (flet ((whitespacep (x)
899 (or (rune= x #/U+0009)
900 (rune= x #/U+000A)
901 (rune= x #/U+000D)
902 (rune= x #/U+0020))))
903 (if (let ((n (length rod)))
904 (and (not (zerop n))
905 (or (whitespacep (rune rod 0))
906 (whitespacep (rune rod (1- n))))))
908 (split-sequence-if #'whitespacep rod :remove-empty-subseqs t))))
910 (defun zstream-base-sysid (zstream)
911 (let ((base-sysid
912 (dolist (k (zstream-input-stack zstream))
913 (let ((base-sysid (stream-name-uri (xstream-name k))))
914 (when base-sysid (return base-sysid))))))
915 base-sysid))
917 (defun absolute-uri (sysid source-stream)
918 (let ((base-sysid (zstream-base-sysid source-stream)))
919 ;; XXX is the IF correct?
920 (if base-sysid
921 (puri:merge-uris sysid base-sysid)
922 sysid)))
924 (defstruct (extid (:constructor make-extid (public system)))
925 (public nil :type (or rod null))
926 (system (error "missing argument") :type (or puri:uri null)))
928 (setf (documentation 'extid 'type)
929 "Represents an External ID, consisting of a Public ID and a System ID.
931 @see-constructor{make-extiid}
932 @see-slot{exitid-system}
933 @see-slot{exitid-public}")
935 (setf (documentation #'make-extid 'function)
936 "@arg[publicid]{string or nil}
937 @arg[systemid]{@class{puri:uri} or nil}
938 @return{an instance of @class{extid}}
940 Create an object representing the External ID composed
941 of the specified Public ID and System ID.")
943 (setf (documentation #'extid-public 'function)
944 "@arg[extid]{A @class{extid}}
945 @return[publicid]{string or nil}
947 Returns the Public ID part of this External ID.")
949 (setf (documentation #'extid-system 'function)
950 "@arg[extid]{A @class{extid}}
951 @return[sytemid]{puri:uri or nil}
953 Returns the System ID part of this External ID.")
955 (defun absolute-extid (source-stream extid)
956 (let ((sysid (extid-system extid))
957 (result (copy-extid extid)))
958 (setf (extid-system result) (absolute-uri sysid source-stream))
959 result))
961 (defun define-entity (source-stream name kind def)
962 (setf name (intern-name name))
963 (when (and sax:*namespace-processing* (find #/: name))
964 (wf-error source-stream "colon in entity name"))
965 (let ((table
966 (ecase kind
967 (:general (dtd-gentities (dtd *ctx*)))
968 (:parameter (dtd-pentities (dtd *ctx*))))))
969 (unless (gethash name table)
970 (when (and source-stream (handler *ctx*))
971 (report-entity (handler *ctx*) kind name def))
972 (when (typep def 'external-entdef)
973 (setf (entdef-extid def)
974 (absolute-extid source-stream (entdef-extid def))))
975 (setf (gethash name table)
976 (cons *external-subset-p* def)))))
978 (defun get-entity-definition (entity-name kind dtd)
979 (unless dtd
980 (wf-error nil "entity not defined: ~A" (rod-string entity-name)))
981 (destructuring-bind (extp &rest def)
982 (gethash entity-name
983 (ecase kind
984 (:general (dtd-gentities dtd))
985 (:parameter (dtd-pentities dtd)))
986 '(nil))
987 (when (and *validate* (standalone-p *ctx*) extp)
988 (validity-error "(02) Standalone Document Declaration: entity reference: ~S"
989 (rod-string entity-name)))
990 def))
992 (defun entity->xstream (zstream entity-name kind &optional internalp)
993 ;; `zstream' is for error messages
994 (let ((def (get-entity-definition entity-name kind (dtd *ctx*))))
995 (unless def
996 (wf-error zstream "Entity '~A' is not defined." (rod-string entity-name)))
997 (let (r)
998 (etypecase def
999 (internal-entdef
1000 (when (and (standalone-p *ctx*)
1001 (entdef-external-subset-p def))
1002 (wf-error
1003 zstream
1004 "entity declared in external subset, but document is standalone"))
1005 (setf r (make-rod-xstream (entdef-value def)))
1006 (setf (xstream-name r)
1007 (make-stream-name :entity-name entity-name
1008 :entity-kind kind
1009 :uri nil)))
1010 (external-entdef
1011 (when internalp
1012 (wf-error zstream
1013 "entity not internal: ~A" (rod-string entity-name)))
1014 (when (entdef-ndata def)
1015 (wf-error zstream
1016 "reference to unparsed entity: ~A"
1017 (rod-string entity-name)))
1018 (setf r (xstream-open-extid (extid-using-catalog (entdef-extid def))))
1019 (setf (stream-name-entity-name (xstream-name r)) entity-name
1020 (stream-name-entity-kind (xstream-name r)) kind)))
1021 r)))
1023 (defun checked-get-entdef (name type)
1024 (let ((def (get-entity-definition name type (dtd *ctx*))))
1025 (unless def
1026 (wf-error nil "Entity '~A' is not defined." (rod-string name)))
1027 def))
1029 (defun xstream-open-extid* (entity-resolver pubid sysid)
1030 (let* ((stream
1031 (or (funcall (or entity-resolver (constantly nil)) pubid sysid)
1032 (open (uri-to-pathname sysid)
1033 :element-type '(unsigned-byte 8)
1034 :direction :input))))
1035 (make-xstream stream
1036 :name (make-stream-name :uri sysid)
1037 :initial-speed 1)))
1039 (defun xstream-open-extid (extid)
1040 (xstream-open-extid* (entity-resolver *ctx*)
1041 (extid-public extid)
1042 (extid-system extid)))
1044 (defun call-with-entity-expansion-as-stream (zstream cont name kind internalp)
1045 ;; `zstream' is for error messages
1046 (let ((in (entity->xstream zstream name kind internalp)))
1047 (push (stream-name-uri (xstream-name in)) (base-stack *ctx*))
1048 (unwind-protect
1049 (funcall cont in)
1050 (pop (base-stack *ctx*))
1051 (close-xstream in))))
1053 (defun ensure-dtd ()
1054 (unless (dtd *ctx*)
1055 (setf (dtd *ctx*) (make-dtd))
1056 (define-default-entities)))
1058 (defun define-default-entities ()
1059 (define-entity nil #"lt" :general (make-internal-entdef #"&#60;"))
1060 (define-entity nil #"gt" :general (make-internal-entdef #">"))
1061 (define-entity nil #"amp" :general (make-internal-entdef #"&#38;"))
1062 (define-entity nil #"apos" :general (make-internal-entdef #"'"))
1063 (define-entity nil #"quot" :general (make-internal-entdef #"\"")))
1065 (defstruct attdef
1066 ;; an attribute definition
1067 element ;name of element this attribute belongs to
1068 name ;name of attribute
1069 type ;type of attribute; either one of :CDATA, :ID, :IDREF, :IDREFS,
1070 ; :ENTITY, :ENTITIES, :NMTOKEN, :NMTOKENS, or
1071 ; (:NOTATION <name>*)
1072 ; (:ENUMERATION <name>*)
1073 default ;default value of attribute:
1074 ; :REQUIRED, :IMPLIED, (:FIXED content) or (:DEFAULT content)
1075 (external-p *external-subset-p*)
1078 (defstruct elmdef
1079 ;; an element definition
1080 name ;name of the element
1081 content ;content model [*]
1082 attributes ;list of defined attributes
1083 compiled-cspec ;cons of validation function for contentspec
1084 (external-p *external-subset-p*)
1087 ;; [*] in XML it is possible to define attributes before the element
1088 ;; itself is defined and since we hang attribute definitions into the
1089 ;; relevant element definitions, the `content' slot indicates whether an
1090 ;; element was actually defined. It is NIL until set to a content model
1091 ;; when the element type declaration is processed.
1093 (defun %make-rod-hash-table ()
1094 ;; XXX with portable hash tables, this is the only way to case-sensitively
1095 ;; use rods. However, EQUALP often has horrible performance! Most Lisps
1096 ;; provide extensions for user-defined equality, we should use them! There
1097 ;; is also a home-made hash table for rods defined below, written by
1098 ;; Gilbert (I think). We could also use that one, but I would prefer the
1099 ;; first method, even if it's unportable.
1100 (make-hash-table :test
1101 #+rune-is-character 'equal
1102 #-rune-is-character 'equalp))
1104 (defstruct dtd
1105 (elements (%make-rod-hash-table)) ;elmdefs
1106 (gentities (%make-rod-hash-table)) ;general entities
1107 (pentities (%make-rod-hash-table)) ;parameter entities
1108 (notations (%make-rod-hash-table)))
1110 (defun make-dtd-cache ()
1111 (puri:make-uri-space))
1113 (defvar *cache-all-dtds* nil)
1114 (defvar *dtd-cache* (make-dtd-cache))
1116 (defun remdtd (uri dtd-cache)
1117 (setf uri (puri:intern-uri uri dtd-cache))
1118 (prog1
1119 (and (getf (puri:uri-plist uri) 'dtd) t)
1120 (puri:unintern-uri uri dtd-cache)))
1122 (defun clear-dtd-cache (dtd-cache)
1123 (puri:unintern-uri t dtd-cache))
1125 (defun getdtd (uri dtd-cache)
1126 (getf (puri:uri-plist (puri:intern-uri uri dtd-cache)) 'dtd))
1128 (defun (setf getdtd) (newval uri dtd-cache)
1129 (setf (getf (puri:uri-plist (puri:intern-uri uri dtd-cache)) 'dtd) newval)
1130 newval)
1133 ;;;;
1135 (defun find-element (name dtd)
1136 (gethash name (dtd-elements dtd)))
1138 (defun define-element (dtd element-name &optional content-model)
1139 (let ((e (find-element element-name dtd)))
1140 (cond
1141 ((null e)
1142 (prog1
1143 (setf (gethash element-name (dtd-elements dtd))
1144 (make-elmdef :name element-name :content content-model))
1145 (when content-model
1146 (sax:element-declaration (handler *ctx*) element-name content-model))))
1147 ((null content-model)
1150 (when *validate*
1151 (when (elmdef-content e)
1152 (validity-error "(05) Unique Element Type Declaration"))
1153 (when (eq content-model :EMPTY)
1154 (dolist (ad (elmdef-attributes e))
1155 (let ((type (attdef-type ad)))
1156 (when (and (listp type) (eq (car type) :NOTATION))
1157 (validity-error "(16) No Notation on Empty Element: ~S"
1158 (rod-string element-name)))))))
1159 (sax:element-declaration (handler *ctx*) element-name content-model)
1160 (setf (elmdef-content e) content-model)
1161 (setf (elmdef-external-p e) *external-subset-p*)
1162 e))))
1164 (defvar *redefinition-warning* nil)
1166 (defun define-attribute (dtd element name type default)
1167 (let ((adef (make-attdef :element element
1168 :name name
1169 :type type
1170 :default default))
1171 (e (or (find-element element dtd)
1172 (define-element dtd element))))
1173 (when (and *validate* (listp default))
1174 (unless (eq (attdef-type adef) :CDATA)
1175 (setf (second default) (canon-not-cdata-attval (second default))))
1176 (validate-attribute* *ctx* adef (second default)))
1177 (cond ((find-attribute e name)
1178 (when *redefinition-warning*
1179 (warn "Attribute \"~A\" of \"~A\" not redefined."
1180 (rod-string name)
1181 (rod-string element))))
1183 (when *validate*
1184 (when (eq type :ID)
1185 (when (find :ID (elmdef-attributes e) :key #'attdef-type)
1186 (validity-error "(09) One ID per Element Type: element ~A"
1187 (rod-string element)))
1188 (unless (member default '(:REQUIRED :IMPLIED))
1189 (validity-error "(10) ID Attribute Default: ~A"
1190 (rod-string element))))
1191 (flet ((notationp (type)
1192 (and (listp type) (eq (car type) :NOTATION))))
1193 (when (notationp type)
1194 (when (find-if #'notationp (elmdef-attributes e)
1195 :key #'attdef-type)
1196 (validity-error "(15) One Notation Per Element Type: ~S"
1197 (rod-string element)))
1198 (when (eq (elmdef-content e) :EMPTY)
1199 (validity-error "(16) No Notation on Empty Element: ~S"
1200 (rod-string element))))))
1201 (sax:attribute-declaration (handler *ctx*) element name type default)
1202 (push adef (elmdef-attributes e))))))
1204 (defun find-attribute (elmdef name)
1205 (find name (elmdef-attributes elmdef) :key #'attdef-name :test #'rod=))
1207 (defun define-notation (dtd name id)
1208 (let ((ns (dtd-notations dtd)))
1209 (when (gethash name ns)
1210 (validity-error "(24) Unique Notation Name: ~S" (rod-string name)))
1211 (setf (gethash name ns) id)))
1213 (defun find-notation (name dtd)
1214 (gethash name (dtd-notations dtd)))
1216 ;;;; ---------------------------------------------------------------------------
1217 ;;;; z streams and lexer
1218 ;;;;
1220 (defstruct zstream
1221 token-category
1222 token-semantic
1223 input-stack)
1225 (defun call-with-zstream (fn zstream)
1226 (unwind-protect
1227 (funcall fn zstream)
1228 (dolist (input (zstream-input-stack zstream))
1229 (cond #-x&y-streams-are-stream
1230 ((xstream-p input)
1231 (close-xstream input))
1232 #+x&y-streams-are-stream
1233 ((streamp input)
1234 (close input))))))
1236 (defmacro with-zstream ((zstream &rest args) &body body)
1237 `(call-with-zstream (lambda (,zstream) ,@body)
1238 (make-zstream ,@args)))
1240 (defun read-token (input)
1241 (cond ((zstream-token-category input)
1242 (multiple-value-prog1
1243 (values (zstream-token-category input)
1244 (zstream-token-semantic input))
1245 (setf (zstream-token-category input) nil
1246 (zstream-token-semantic input) nil)))
1248 (read-token-2 input))))
1250 (defun peek-token (input)
1251 (cond ((zstream-token-category input)
1252 (values
1253 (zstream-token-category input)
1254 (zstream-token-semantic input)))
1256 (multiple-value-bind (c s) (read-token input)
1257 (setf (zstream-token-category input) c
1258 (zstream-token-semantic input) s))
1259 (values (zstream-token-category input)
1260 (zstream-token-semantic input)))))
1262 (defun read-token-2 (input)
1263 (cond ((null (zstream-input-stack input))
1264 (values :eof nil))
1266 (let ((c (peek-rune (car (zstream-input-stack input)))))
1267 (cond ((eq c :eof)
1268 (cond ((eq (cadr (zstream-input-stack input)) :stop)
1269 (values :eof nil))
1271 (close-xstream (pop (zstream-input-stack input)))
1272 (if (null (zstream-input-stack input))
1273 (values :eof nil)
1274 (values :S nil) ;fake #x20 after PE expansion
1275 ))))
1277 (read-token-3 input)))))))
1279 (defvar *data-behaviour*
1280 ) ;either :DTD or :DOC
1282 (defun read-token-3 (zinput)
1283 (let ((input (car (zstream-input-stack zinput))))
1284 ;; PI Comment
1285 (let ((c (read-rune input)))
1286 (cond
1287 ;; first the common tokens
1288 ((rune= #/< c)
1289 (read-token-after-|<| zinput input))
1290 ;; now dispatch
1292 (ecase *data-behaviour*
1293 (:DTD
1294 (cond ((rune= #/\[ c) :\[)
1295 ((rune= #/\] c) :\])
1296 ((rune= #/\( c) :\()
1297 ((rune= #/\) c) :\))
1298 ((rune= #/\| c) :\|)
1299 ((rune= #/\> c) :\>)
1300 ((rune= #/\" c) :\")
1301 ((rune= #/\' c) :\')
1302 ((rune= #/\, c) :\,)
1303 ((rune= #/\? c) :\?)
1304 ((rune= #/\* c) :\*)
1305 ((rune= #/\+ c) :\+)
1306 ((name-rune-p c)
1307 (unread-rune c input)
1308 (values :nmtoken (read-name-token input)))
1309 ((rune= #/# c)
1310 (let ((q (read-name-token input)))
1311 (cond ((rod= q '#.(string-rod "REQUIRED")) :|#REQUIRED|)
1312 ((rod= q '#.(string-rod "IMPLIED")) :|#IMPLIED|)
1313 ((rod= q '#.(string-rod "FIXED")) :|#FIXED|)
1314 ((rod= q '#.(string-rod "PCDATA")) :|#PCDATA|)
1316 (wf-error zinput "Unknown token: ~S." q)))))
1317 ((or (rune= c #/U+0020)
1318 (rune= c #/U+0009)
1319 (rune= c #/U+000D)
1320 (rune= c #/U+000A))
1321 (values :S nil))
1322 ((rune= #/% c)
1323 (cond ((name-start-rune-p (peek-rune input))
1324 ;; an entity reference
1325 (read-pe-reference zinput))
1327 (values :%))))
1329 (wf-error zinput "Unexpected character ~S." c))))
1330 (:DOC
1331 (cond
1332 ((rune= c #/&)
1333 (multiple-value-bind (kind data) (read-entity-like input)
1334 (cond ((eq kind :ENTITY-REFERENCE)
1335 (values :ENTITY-REF data))
1336 ((eq kind :CHARACTER-REFERENCE)
1337 (values :CDATA
1338 (with-rune-collector (collect)
1339 (%put-unicode-char data collect)))))))
1341 (unread-rune c input)
1342 (values :CDATA (read-cdata input)))))))))))
1344 (definline check-rune (input actual expected)
1345 (unless (eql actual expected)
1346 (wf-error input "expected #/~A but found #/~A"
1347 (rune-char expected)
1348 (rune-char actual))))
1350 (defun read-pe-reference (zinput)
1351 (let* ((input (car (zstream-input-stack zinput)))
1352 (nam (read-name-token input)))
1353 (check-rune input #/\; (read-rune input))
1354 (cond (*expand-pe-p*
1355 ;; no external entities here!
1356 (let ((i2 (entity->xstream zinput nam :parameter)))
1357 (zstream-push i2 zinput))
1358 (values :S nil) ;space before inserted PE expansion.
1361 (values :PE-REFERENCE nam)) )))
1363 (defun read-token-after-|<| (zinput input)
1364 (let ((d (read-rune input)))
1365 (cond ((eq d :eof)
1366 (eox input "EOF after '<'"))
1367 ((rune= #/! d)
1368 (read-token-after-|<!| input))
1369 ((rune= #/? d)
1370 (multiple-value-bind (target content) (read-pi input)
1371 (cond ((rod= target '#.(string-rod "xml"))
1372 (values :xml-decl (cons target content)))
1373 ((rod-equal target '#.(string-rod "XML"))
1374 (wf-error zinput
1375 "You lost -- no XML processing instructions."))
1376 ((and sax:*namespace-processing* (position #/: target))
1377 (wf-error zinput
1378 "Processing instruction target ~S is not a ~
1379 valid NcName."
1380 (mu target)))
1382 (values :PI (cons target content))))))
1383 ((eq *data-behaviour* :DTD)
1384 (unread-rune d input)
1385 (unless (or (rune= #// d) (name-start-rune-p d))
1386 (wf-error zinput "Expected '!' or '?' after '<' in DTD."))
1387 (values :seen-< nil))
1388 ((rune= #// d)
1389 (let ((c (peek-rune input)))
1390 (cond ((name-start-rune-p c)
1391 (read-tag-2 zinput input :etag))
1393 (wf-error zinput
1394 "Expecting name start rune after \"</\".")))))
1395 ((name-start-rune-p d)
1396 (unread-rune d input)
1397 (read-tag-2 zinput input :stag))
1399 (wf-error zinput "Expected '!' or '?' after '<' in DTD.")))))
1401 (defun read-token-after-|<!| (input)
1402 (let ((d (read-rune input)))
1403 (cond ((eq d :eof)
1404 (eox input "EOF after \"<!\"."))
1405 ((name-start-rune-p d)
1406 (unread-rune d input)
1407 (let ((name (read-name-token input)))
1408 (cond ((rod= name '#.(string-rod "ELEMENT")) :|<!ELEMENT|)
1409 ((rod= name '#.(string-rod "ENTITY")) :|<!ENTITY|)
1410 ((rod= name '#.(string-rod "ATTLIST")) :|<!ATTLIST|)
1411 ((rod= name '#.(string-rod "NOTATION")) :|<!NOTATION|)
1412 ((rod= name '#.(string-rod "DOCTYPE")) :|<!DOCTYPE|)
1414 (wf-error input"`<!~A' unknown." (rod-string name))))))
1415 ((rune= #/\[ d)
1416 (values :|<![| nil))
1417 ((rune= #/- d)
1418 (setf d (read-rune input))
1419 (cond ((rune= #/- d)
1420 (values
1421 :COMMENT
1422 (read-comment-content input)))
1424 (wf-error input"Bad character ~S after \"<!-\"" d))))
1426 (wf-error input "Bad character ~S after \"<!\"" d)))))
1428 (definline read-S? (input)
1429 (while (member (peek-rune input) '(#/U+0020 #/U+0009 #/U+000A #/U+000D)
1430 :test #'eql)
1431 (consume-rune input)))
1433 (defun read-attribute-list (zinput input imagine-space-p)
1434 (cond ((or imagine-space-p
1435 (let ((c (peek-rune input)))
1436 (and (not (eq c :eof))
1437 (space-rune-p c))))
1438 (read-S? input)
1439 (cond ((eq (peek-rune input) :eof)
1440 nil)
1441 ((name-start-rune-p (peek-rune input))
1442 (cons (read-attribute zinput input)
1443 (read-attribute-list zinput input nil)))
1445 nil)))
1447 nil)))
1449 (defun read-entity-like (input)
1450 "Read an entity reference off the xstream `input'. Returns two values:
1451 either :ENTITY-REFERENCE <interned-rod> in case of a named entity
1452 or :CHARACTER-REFERENCE <integer> in case of character references.
1453 The initial #\\& is considered to be consumed already."
1454 (let ((c (peek-rune input)))
1455 (cond ((eq c :eof)
1456 (eox input "EOF after '&'"))
1457 ((rune= c #/#)
1458 (values :CHARACTER-REFERENCE (read-character-reference input)))
1460 (unless (name-start-rune-p (peek-rune input))
1461 (wf-error input "Expecting name after &."))
1462 (let ((name (read-name-token input)))
1463 (setf c (read-rune input))
1464 (unless (rune= c #/\;)
1465 (wf-error input "Expected \";\"."))
1466 (values :ENTITY-REFERENCE name))))))
1468 (defun read-tag-2 (zinput input kind)
1469 (let ((name (read-name-token input))
1470 (atts nil))
1471 (setf atts (read-attribute-list zinput input nil))
1473 ;; check for double attributes
1474 (do ((q atts (cdr q)))
1475 ((null q))
1476 (cond ((find (caar q) (cdr q) :key #'car)
1477 (wf-error zinput "Attribute ~S has two definitions in element ~S."
1478 (rod-string (caar q))
1479 (rod-string name)))))
1481 (cond ((eq (peek-rune input) #/>)
1482 (consume-rune input)
1483 (values kind (cons name atts)))
1484 ((eq (peek-rune input) #//)
1485 (consume-rune input)
1486 (check-rune input #/> (read-rune input))
1487 (values :ztag (cons name atts)))
1489 (wf-error zinput "syntax error in read-tag-2.")) )))
1491 (defun read-attribute (zinput input)
1492 (unless (name-start-rune-p (peek-rune input))
1493 (wf-error zinput "Expected name."))
1494 ;; arg thanks to the post mortem nature of name space declarations,
1495 ;; we could only process the attribute values post mortem.
1496 (let ((name (read-name-token input)))
1497 (while (let ((c (peek-rune input)))
1498 (and (not (eq c :eof))
1499 (or (rune= c #/U+0020)
1500 (rune= c #/U+0009)
1501 (rune= c #/U+000A)
1502 (rune= c #/U+000D))))
1503 (consume-rune input))
1504 (unless (eq (read-rune input) #/=)
1505 (wf-error zinput "Expected \"=\"."))
1506 (while (let ((c (peek-rune input)))
1507 (and (not (eq c :eof))
1508 (or (rune= c #/U+0020)
1509 (rune= c #/U+0009)
1510 (rune= c #/U+000A)
1511 (rune= c #/U+000D))))
1512 (consume-rune input))
1513 (cons name (read-att-value-2 input))))
1515 (defun canon-not-cdata-attval (value)
1516 ;; | If the declared value is not CDATA, then the XML processor must
1517 ;; | further process the normalized attribute value by discarding any
1518 ;; | leading and trailing space (#x20) characters, and by replacing
1519 ;; | sequences of space (#x20) characters by a single space (#x20)
1520 ;; | character.
1521 (with-rune-collector (collect)
1522 (let ((gimme-20 nil)
1523 (anything-seen-p nil))
1524 (map nil (lambda (c)
1525 (cond ((rune= c #/u+0020)
1526 (setf gimme-20 t))
1528 (when (and anything-seen-p gimme-20)
1529 (collect #/u+0020))
1530 (setf gimme-20 nil)
1531 (setf anything-seen-p t)
1532 (collect c))))
1533 value))))
1535 (definline data-rune-p (rune)
1536 ;; Any Unicode character, excluding FFFE, and FFFF.
1537 ;; Allow surrogates if using UTF-16, else allow >= 0x10000.
1538 (let ((c (rune-code rune)))
1539 (or (= c #x9) (= c #xA) (= c #xD)
1540 (<= #x20 c #xD7FF)
1541 #+rune-is-utf-16 (<= #xD800 c #xDFFF)
1542 (<= #xE000 c #xFFFD)
1543 #-rune-is-utf-16 (<= #x10000 c #x10FFFF))))
1545 (defun read-att-value (zinput input mode &optional canon-space-p (delim nil))
1546 (with-rune-collector-2 (collect)
1547 (labels ((muffle (input delim)
1548 (let (c)
1549 (loop
1550 (setf c (read-rune input))
1551 (cond ((eql delim c)
1552 (return))
1553 ((eq c :eof)
1554 (eox input "EOF"))
1555 ((rune= c #/&)
1556 (setf c (peek-rune input))
1557 (cond ((eql c :eof)
1558 (eox input))
1559 ((rune= c #/#)
1560 (let ((c (read-character-reference input)))
1561 (%put-unicode-char c collect)))
1563 (unless (name-start-rune-p (peek-rune input))
1564 (wf-error zinput "Expecting name after &."))
1565 (let ((name (read-name-token input)))
1566 (setf c (read-rune input))
1567 (check-rune input c #/\;)
1568 (ecase mode
1569 (:ATT
1570 (recurse-on-entity
1571 zinput name :general
1572 (lambda (zinput)
1573 (muffle (car (zstream-input-stack zinput))
1574 :eof))
1576 (:ENT
1577 ;; bypass, but never the less we
1578 ;; need to check for legal
1579 ;; syntax.
1580 ;; Must it be defined?
1581 ;; allerdings: unparsed sind verboten
1582 (collect #/&)
1583 (map nil (lambda (x) (collect x)) name)
1584 (collect #/\; )))))))
1585 ((and (eq mode :ENT) (rune= c #/%))
1586 (let ((d (peek-rune input)))
1587 (when (eq d :eof)
1588 (eox input))
1589 (unless (name-start-rune-p d)
1590 (wf-error zinput "Expecting name after %.")))
1591 (let ((name (read-name-token input)))
1592 (setf c (read-rune input))
1593 (check-rune input c #/\;)
1594 (cond (*expand-pe-p*
1595 (recurse-on-entity
1596 zinput name :parameter
1597 (lambda (zinput)
1598 (muffle (car (zstream-input-stack zinput))
1599 :eof))))
1601 (wf-error zinput "No PE here.")))))
1602 ((and (eq mode :ATT) (rune= c #/<))
1603 (wf-error zinput "unexpected #\/<"))
1604 ((and canon-space-p (space-rune-p c))
1605 (collect #/space))
1606 ((not (data-rune-p c))
1607 (wf-error zinput "illegal char: ~S." c))
1609 (collect c)))))))
1610 (declare (dynamic-extent #'muffle))
1611 (muffle input (or delim
1612 (let ((delim (read-rune input)))
1613 (unless (member delim '(#/\" #/\') :test #'eql)
1614 (wf-error zinput "invalid attribute delimiter"))
1615 delim))))))
1617 (defun read-character-reference (input)
1618 ;; The #/& is already read
1619 (let ((res
1620 (let ((c (read-rune input)))
1621 (check-rune input c #/#)
1622 (setq c (read-rune input))
1623 (cond ((eql c :eof)
1624 (eox input))
1625 ((eql c #/x)
1626 ;; hexadecimal
1627 (setq c (read-rune input))
1628 (when (eql c :eof)
1629 (eox input))
1630 (unless (digit-rune-p c 16)
1631 (wf-error input "garbage in character reference"))
1632 (prog1
1633 (parse-integer
1634 (with-output-to-string (sink)
1635 (write-char (rune-char c) sink)
1636 (while (progn
1637 (setq c (read-rune input))
1638 (when (eql c :eof)
1639 (eox input))
1640 (digit-rune-p c 16))
1641 (write-char (rune-char c) sink)))
1642 :radix 16)
1643 (check-rune input c #/\;)))
1644 ((rune<= #/0 c #/9)
1645 ;; decimal
1646 (prog1
1647 (parse-integer
1648 (with-output-to-string (sink)
1649 (write-char (rune-char c) sink)
1650 (while (progn
1651 (setq c (read-rune input))
1652 (when (eql c :eof)
1653 (eox input))
1654 (rune<= #/0 c #/9))
1655 (write-char (rune-char c) sink)))
1656 :radix 10)
1657 (check-rune input c #/\;)))
1659 (wf-error input "Bad char in numeric character entity."))))))
1660 (unless (code-data-char-p res)
1661 (wf-error
1662 input
1663 "expansion of numeric character reference (#x~X) is no data char."
1664 res))
1665 res))
1667 (defun read-pi (input)
1668 ;; "<?" is already read
1669 (let (name)
1670 (let ((c (peek-rune input)))
1671 (unless (name-start-rune-p c)
1672 (wf-error input "Expecting name after '<?'"))
1673 (setf name (read-name-token input)))
1674 (cond
1675 ((member (peek-rune input) '(#/U+0020 #/U+0009 #/U+000A #/U+000D)
1676 :test #'eql)
1677 (values name (read-pi-content input)))
1679 (unless (and (eql (read-rune input) #/?)
1680 (eql (read-rune input) #/>))
1681 (wf-error input "malformed processing instruction"))
1682 (values name "")))))
1684 (defun read-pi-content (input)
1685 (read-S? input)
1686 (let (d)
1687 (with-rune-collector (collect)
1688 (block nil
1689 (tagbody
1690 state-1
1691 (setf d (read-rune input))
1692 (when (eq d :eof)
1693 (eox input))
1694 (unless (data-rune-p d)
1695 (wf-error input "Illegal char: ~S." d))
1696 (when (rune= d #/?) (go state-2))
1697 (collect d)
1698 (go state-1)
1699 state-2 ;; #/? seen
1700 (setf d (read-rune input))
1701 (when (eq d :eof)
1702 (eox input))
1703 (unless (data-rune-p d)
1704 (wf-error input "Illegal char: ~S." d))
1705 (when (rune= d #/>) (return))
1706 (when (rune= d #/?)
1707 (collect #/?)
1708 (go state-2))
1709 (collect #/?)
1710 (collect d)
1711 (go state-1))))))
1713 (defun read-comment-content (input &aux d)
1714 (with-rune-collector (collect)
1715 (block nil
1716 (tagbody
1717 state-1
1718 (setf d (read-rune input))
1719 (when (eq d :eof)
1720 (eox input))
1721 (unless (data-rune-p d)
1722 (wf-error input "Illegal char: ~S." d))
1723 (when (rune= d #/-) (go state-2))
1724 (collect d)
1725 (go state-1)
1726 state-2 ;; #/- seen
1727 (setf d (read-rune input))
1728 (when (eq d :eof)
1729 (eox input))
1730 (unless (data-rune-p d)
1731 (wf-error input "Illegal char: ~S." d))
1732 (when (rune= d #/-) (go state-3))
1733 (collect #/-)
1734 (collect d)
1735 (go state-1)
1736 state-3 ;; #/- #/- seen
1737 (setf d (read-rune input))
1738 (when (eq d :eof)
1739 (eox input))
1740 (unless (data-rune-p d)
1741 (wf-error input "Illegal char: ~S." d))
1742 (when (rune= d #/>) (return))
1743 (wf-error input "'--' not allowed in a comment")
1744 (when (rune= d #/-)
1745 (collect #/-)
1746 (go state-3))
1747 (collect #/-)
1748 (collect #/-)
1749 (collect d)
1750 (go state-1)))))
1752 (defun read-cdata-sect (input &aux d)
1753 ;; <![CDATA[ is already read
1754 ;; read anything up to ]]>
1755 (with-rune-collector (collect)
1756 (block nil
1757 (tagbody
1758 state-1
1759 (setf d (read-rune input))
1760 (when (eq d :eof)
1761 (eox input))
1762 (unless (data-rune-p d)
1763 (wf-error input "Illegal char: ~S." d))
1764 (when (rune= d #/\]) (go state-2))
1765 (collect d)
1766 (go state-1)
1767 state-2 ;; #/] seen
1768 (setf d (read-rune input))
1769 (when (eq d :eof)
1770 (eox input))
1771 (unless (data-rune-p d)
1772 (wf-error input "Illegal char: ~S." d))
1773 (when (rune= d #/\]) (go state-3))
1774 (collect #/\])
1775 (collect d)
1776 (go state-1)
1777 state-3 ;; #/\] #/\] seen
1778 (setf d (read-rune input))
1779 (when (eq d :eof)
1780 (eox input))
1781 (unless (data-rune-p d)
1782 (wf-error input "Illegal char: ~S." d))
1783 (when (rune= d #/>)
1784 (return))
1785 (when (rune= d #/\])
1786 (collect #/\])
1787 (go state-3))
1788 (collect #/\])
1789 (collect #/\])
1790 (collect d)
1791 (go state-1)))))
1793 ;; some character categories
1795 (defun space-rune-p (rune)
1796 (declare (type rune rune))
1797 (or (rune= rune #/U+0020)
1798 (rune= rune #/U+0009)
1799 (rune= rune #/U+000A)
1800 (rune= rune #/U+000D)))
1802 (defun code-data-char-p (c)
1803 ;; Any Unicode character, excluding FFFE, and FFFF.
1804 ;; Allow surrogates if using UTF-16, else allow >= 0x10000.
1805 (or (= c #x9) (= c #xA) (= c #xD)
1806 (<= #x20 c #xD7FF)
1807 #+rune-is-utf-16 (<= #xD800 c #xDFFF)
1808 (<= #xE000 c #xFFFD)
1809 #-rune-is-utf-16 (<= #x10000 c #x10FFFF)))
1811 (defun pubid-char-p (c)
1812 (or (rune= c #/u+0020) (rune= c #/u+000D) (rune= c #/u+000A)
1813 (rune<= #/a c #/z)
1814 (rune<= #/A c #/Z)
1815 (rune<= #/0 c #/9)
1816 (member c '(#/- #/' #/\( #/\) #/+ #/, #/. #//
1817 #/: #/= #/? #/\; #/! #/* #/#
1818 #/@ #/$ #/_ #/%))))
1821 (defun expect (input category)
1822 (multiple-value-bind (cat sem) (read-token input)
1823 (unless (eq cat category)
1824 (wf-error input "Expected ~S saw ~S [~S]" category cat sem))
1825 (values cat sem)))
1827 (defun consume-token (input)
1828 (read-token input))
1830 ;;;; ---------------------------------------------------------------------------
1831 ;;;; Parser
1832 ;;;;
1834 (defun p/S (input)
1835 ;; S ::= (#x20 | #x9 | #xD | #xA)+
1836 (expect input :S)
1837 (while (eq (peek-token input) :S)
1838 (consume-token input)))
1840 (defun p/S? (input)
1841 ;; S ::= (#x20 | #x9 | #xD | #xA)+
1842 (while (eq (peek-token input) :S)
1843 (consume-token input)))
1845 (defun p/nmtoken (input)
1846 (nth-value 1 (expect input :nmtoken)))
1848 (defun p/name (input)
1849 (let ((result (p/nmtoken input)))
1850 (unless (name-start-rune-p (elt result 0))
1851 (wf-error input "Expected name."))
1852 result))
1854 (defun p/attlist-decl (input)
1855 ;; [52] AttlistDecl ::= '<!ATTLIST' S Name (S AttDef)* S? '>'
1856 (let (elm-name)
1857 (expect input :|<!ATTLIST|)
1858 (p/S input)
1859 (setf elm-name (p/nmtoken input))
1860 (loop
1861 (let ((tok (read-token input)))
1862 (case tok
1864 (p/S? input)
1865 (cond ((eq (peek-token input) :>)
1866 (consume-token input)
1867 (return))
1869 (multiple-value-bind (name type default) (p/attdef input)
1870 (define-attribute (dtd *ctx*) elm-name name type default)) )))
1872 (return))
1873 (otherwise
1874 (wf-error input
1875 "Expected either another AttDef or end of \"<!ATTLIST\". -- saw ~S."
1876 tok)))))))
1878 (defun p/attdef (input)
1879 ;; [53] AttDef ::= Name S AttType S DefaultDecl
1880 (let (name type default)
1881 (setf name (p/nmtoken input))
1882 (p/S input)
1883 (setf type (p/att-type input))
1884 (p/S input)
1885 (setf default (p/default-decl input))
1886 (values name type default)))
1888 (defun p/list (input item-parser delimiter)
1889 ;; Parse something like S? <item> (S? <delimiter> <item>)* S?
1891 (declare (type function item-parser))
1892 (let (res)
1893 (p/S? input)
1894 (setf res (list (funcall item-parser input)))
1895 (loop
1896 (p/S? input)
1897 (cond ((eq (peek-token input) delimiter)
1898 (consume-token input)
1899 (p/S? input)
1900 (push (funcall item-parser input) res))
1902 (return))))
1903 (p/S? input)
1904 (reverse res)))
1906 (defun p/att-type (input)
1907 ;; [54] AttType ::= StringType | TokenizedType | EnumeratedType
1908 ;; [55] StringType ::= 'CDATA'
1909 ;; [56] TokenizedType ::= 'ID' /*VC: ID */
1910 ;; /*VC: One ID per Element Type */
1911 ;; /*VC: ID Attribute Default */
1912 ;; | 'IDREF' /*VC: IDREF */
1913 ;; | 'IDREFS' /*VC: IDREF */
1914 ;; | 'ENTITY' /*VC: Entity Name */
1915 ;; | 'ENTITIES' /*VC: Entity Name */
1916 ;; | 'NMTOKEN' /*VC: Name Token */
1917 ;; | 'NMTOKENS' /*VC: Name Token */
1918 ;; [57] EnumeratedType ::= NotationType | Enumeration
1919 ;; [58] NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
1920 ;; /* VC: Notation Attributes */
1921 ;; [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' /* VC: Enumeration */
1922 (multiple-value-bind (cat sem) (read-token input)
1923 (cond ((eq cat :nmtoken)
1924 (cond ((rod= sem '#.(string-rod "CDATA")) :CDATA)
1925 ((rod= sem '#.(string-rod "ID")) :ID)
1926 ((rod= sem '#.(string-rod "IDREF")) :IDREFS)
1927 ((rod= sem '#.(string-rod "IDREFS")) :IDREFS)
1928 ((rod= sem '#.(string-rod "ENTITY")) :ENTITY)
1929 ((rod= sem '#.(string-rod "ENTITIES")) :ENTITIES)
1930 ((rod= sem '#.(string-rod "NMTOKEN")) :NMTOKEN)
1931 ((rod= sem '#.(string-rod "NMTOKENS")) :NMTOKENS)
1932 ((rod= sem '#.(string-rod "NOTATION"))
1933 (let (names)
1934 (p/S input)
1935 (expect input :\()
1936 (setf names (p/list input #'p/nmtoken :\| ))
1937 (expect input :\))
1938 (when *validate*
1939 (setf (referenced-notations *ctx*)
1940 (append names (referenced-notations *ctx*))))
1941 (cons :NOTATION names)))
1943 (wf-error input "In p/att-type: ~S ~S." cat sem))))
1944 ((eq cat :\()
1945 ;; XXX Die Nmtoken-Syntax pruefen wir derzeit nur beim Validieren.
1946 (let (names)
1947 ;;(expect input :\()
1948 (setf names (p/list input #'p/nmtoken :\| ))
1949 (expect input :\))
1950 (cons :ENUMERATION names)))
1952 (wf-error input "In p/att-type: ~S ~S." cat sem)) )))
1954 (defun p/default-decl (input)
1955 ;; [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
1956 ;; | (('#FIXED' S)? AttValue) /* VC: Required Attribute */
1958 ;; /* VC: Attribute Default Legal */
1959 ;; /* WFC: No < in Attribute Values */
1960 ;; /* VC: Fixed Attribute Default */
1961 (multiple-value-bind (cat sem) (peek-token input)
1962 (cond ((eq cat :|#REQUIRED|)
1963 (consume-token input) :REQUIRED)
1964 ((eq cat :|#IMPLIED|)
1965 (consume-token input) :IMPLIED)
1966 ((eq cat :|#FIXED|)
1967 (consume-token input)
1968 (p/S input)
1969 (list :FIXED (p/att-value input)))
1970 ((or (eq cat :\') (eq cat :\"))
1971 (list :DEFAULT (p/att-value input)))
1973 (wf-error input "p/default-decl: ~S ~S." cat sem)) )))
1974 ;;;;
1976 ;; [70] EntityDecl ::= GEDecl | PEDecl
1977 ;; [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
1978 ;; [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
1979 ;; [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
1980 ;; [74] PEDef ::= EntityValue | ExternalID
1981 ;; [75] ExternalID ::= 'SYSTEM' S SystemLiteral
1982 ;; | 'PUBLIC' S PubidLiteral S SystemLiteral
1983 ;; [76] NDataDecl ::= S 'NDATA' S Name /* VC: Notation Declared */
1985 (defun p/entity-decl (input)
1986 (let (name def kind)
1987 (expect input :|<!ENTITY|)
1988 (p/S input)
1989 (cond ((eq (peek-token input) :%)
1990 (setf kind :parameter)
1991 (consume-token input)
1992 (p/S input))
1994 (setf kind :general)))
1995 (setf name (p/name input))
1996 (p/S input)
1997 (setf def (p/entity-def input kind))
1998 (define-entity input name kind def)
1999 (p/S? input)
2000 (expect input :\>)))
2002 (defun report-entity (h kind name def)
2003 (etypecase def
2004 (external-entdef
2005 (let ((extid (entdef-extid def))
2006 (ndata (entdef-ndata def)))
2007 (if ndata
2008 (sax:unparsed-entity-declaration h
2009 name
2010 (extid-public extid)
2011 (uri-rod (extid-system extid))
2012 ndata)
2013 (sax:external-entity-declaration h
2014 kind
2015 name
2016 (extid-public extid)
2017 (uri-rod (extid-system extid))))))
2018 (internal-entdef
2019 (sax:internal-entity-declaration h kind name (entdef-value def)))))
2021 (defun p/entity-def (input kind)
2022 (multiple-value-bind (cat sem) (peek-token input)
2023 (cond ((member cat '(:\" :\'))
2024 (make-internal-entdef (p/entity-value input)))
2025 ((and (eq cat :nmtoken)
2026 (or (rod= sem '#.(string-rod "SYSTEM"))
2027 (rod= sem '#.(string-rod "PUBLIC"))))
2028 (let (extid ndata)
2029 (setf extid (p/external-id input nil))
2030 (when (eq kind :general) ;NDATA allowed at all?
2031 (cond ((eq (peek-token input) :S)
2032 (p/S? input)
2033 (when (and (eq (peek-token input) :nmtoken)
2034 (rod= (nth-value 1 (peek-token input))
2035 '#.(string-rod "NDATA")))
2036 (consume-token input)
2037 (p/S input)
2038 (setf ndata (p/nmtoken input))
2039 (when *validate*
2040 (push ndata (referenced-notations *ctx*)))))))
2041 (make-external-entdef extid ndata)))
2043 (wf-error input "p/entity-def: ~S / ~S." cat sem)) )))
2045 (defun p/entity-value (input)
2046 (let ((delim (if (eq (read-token input) :\") #/\" #/\')))
2047 (read-att-value input
2048 (car (zstream-input-stack input))
2049 :ENT
2051 delim)))
2053 (defun p/att-value (input)
2054 (let ((delim (if (eq (read-token input) :\") #/\" #/\')))
2055 (read-att-value input
2056 (car (zstream-input-stack input))
2057 :ATT
2059 delim)))
2061 (defun p/external-id (input &optional (public-only-ok-p nil))
2062 ;; xxx public-only-ok-p
2063 (multiple-value-bind (cat sem) (read-token input)
2064 (cond ((and (eq cat :nmtoken) (rod= sem '#.(string-rod "SYSTEM")))
2065 (p/S input)
2066 (make-extid nil (p/system-literal input)))
2067 ((and (eq cat :nmtoken) (rod= sem '#.(string-rod "PUBLIC")))
2068 (let (pub sys)
2069 (p/S input)
2070 (setf pub (p/pubid-literal input))
2071 (when (eq (peek-token input) :S)
2072 (p/S input)
2073 (when (member (peek-token input) '(:\" :\'))
2074 (setf sys (p/system-literal input))))
2075 (when (and (not public-only-ok-p)
2076 (null sys))
2077 (wf-error input "System identifier needed for this PUBLIC external identifier."))
2078 (make-extid pub sys)))
2080 (wf-error input "Expected external-id: ~S / ~S." cat sem)))))
2083 ;; [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2084 ;; [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2085 ;; [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9]
2086 ;; | [-'()+,./:=?;!*#@$_%]
2088 (defun p/id (input)
2089 (multiple-value-bind (cat) (read-token input)
2090 (cond ((member cat '(:\" :\'))
2091 (let ((delim (if (eq cat :\") #/\" #/\')))
2092 (with-rune-collector (collect)
2093 (loop
2094 (let ((c (read-rune (car (zstream-input-stack input)))))
2095 (cond ((eq c :eof)
2096 (eox input "EOF in system literal."))
2097 ((rune= c delim)
2098 (return))
2100 (collect c))))))))
2102 (wf-error input "Expect either \" or \'.")))))
2104 ;; it is important to cache the orginal URI rod, since the re-serialized
2105 ;; uri-string can be different from the one parsed originally.
2106 (defun uri-rod (uri)
2107 (if uri
2108 (or (getf (puri:uri-plist uri) 'original-rod)
2109 (rod (puri:render-uri uri nil)))
2110 nil))
2112 (defun safe-parse-uri (str)
2113 ;; puri doesn't like strings starting with file:///, although that is a very
2114 ;; common is practise. Cut it away, we don't distinguish between scheme
2115 ;; :FILE and NIL anway.
2116 (when (eql (search "file://" str) 0)
2117 (setf str (subseq str (length "file://"))))
2118 (puri:parse-uri (coerce str 'simple-string)))
2120 (defun p/system-literal (input)
2121 (let* ((rod (p/id input))
2122 (result (safe-parse-uri (rod-string rod))))
2123 (setf (getf (puri:uri-plist result) 'original-rod) rod)
2124 result))
2126 (defun p/pubid-literal (input)
2127 (let ((result (p/id input)))
2128 (unless (every #'pubid-char-p result)
2129 (wf-error input "Illegal pubid: ~S." (rod-string result)))
2130 result))
2133 ;;;;
2135 (defun p/element-decl (input)
2136 (let (name content)
2137 (expect input :|<!ELEMENT|)
2138 (p/S input)
2139 (setf name (p/nmtoken input))
2140 (p/S input)
2141 (setf content (normalize-mixed-cspec (p/cspec input)))
2142 (unless (legal-content-model-p content *validate*)
2143 (wf-error input "Malformed or invalid content model: ~S." (mu content)))
2144 (p/S? input)
2145 (expect input :\>)
2146 (define-element (dtd *ctx*) name content)
2147 (list :element name content)))
2149 (defun maybe-compile-cspec (e)
2150 (or (elmdef-compiled-cspec e)
2151 (setf (elmdef-compiled-cspec e)
2152 (let ((cspec (elmdef-content e)))
2153 (unless cspec
2154 (validity-error "(03) Element Valid: no definition for ~A"
2155 (rod-string (elmdef-name e))))
2156 (multiple-value-call #'cons
2157 (compile-cspec cspec (standalone-check-necessary-p e)))))))
2159 (defun make-root-model (name)
2160 (cons (lambda (actual-name)
2161 (if (rod= actual-name name)
2162 (constantly :dummy)
2163 nil))
2164 (constantly t)))
2166 ;;; content spec validation:
2168 ;;; Given a `contentspec', COMPILE-CSPEC returns as multiple values two
2169 ;;; functions A and B of one argument to be called for every
2170 ;;; A. child element
2171 ;;; B. text child node
2173 ;;; Function A will be called with
2174 ;;; - the element name rod as its argument. If that element may appear
2175 ;;; at the current position, a new function to be called for the next
2176 ;;; child is returned. Otherwise NIL is returned.
2177 ;;; - argument NIL at the end of the element, it must then return T or NIL
2178 ;;; to indicate whether the end tag is valid.
2180 ;;; Function B will be called with the character data rod as its argument, it
2181 ;;; returns a boolean indicating whether this text node is allowed.
2183 ;;; That is, if one of the functions ever returns NIL, the node is
2184 ;;; rejected as invalid.
2186 (defun cmodel-done (actual-value)
2187 (null actual-value))
2189 (defun compile-cspec (cspec &optional standalone-check)
2190 (cond
2191 ((atom cspec)
2192 (ecase cspec
2193 (:EMPTY (values #'cmodel-done (constantly nil)))
2194 (:PCDATA (values #'cmodel-done (constantly t)))
2195 (:ANY
2196 (values (labels ((doit (name) (if name #'doit t))) #'doit)
2197 (constantly t)))))
2198 ((and (eq (car cspec) '*)
2199 (let ((subspec (second cspec)))
2200 (and (eq (car subspec) 'or) (eq (cadr subspec) :PCDATA))))
2201 (values (compile-mixed (second cspec))
2202 (constantly t)))
2204 (values (compile-content-model cspec)
2205 (lambda (rod)
2206 (when standalone-check
2207 (validity-error "(02) Standalone Document Declaration: whitespace"))
2208 (every #'white-space-rune-p rod))))))
2210 (defun compile-mixed (cspec)
2211 ;; das koennten wir theoretisch auch COMPILE-CONTENT-MODEL erledigen lassen
2212 (let ((allowed-names (cddr cspec)))
2213 (labels ((doit (actual-name)
2214 (cond
2215 ((null actual-name) t)
2216 ((member actual-name allowed-names :test #'rod=) #'doit)
2217 (t nil))))
2218 #'doit)))
2220 (defun compile-content-model (cspec &optional (continuation #'cmodel-done))
2221 (if (vectorp cspec)
2222 (lambda (actual-name)
2223 (if (and actual-name (rod= cspec actual-name))
2224 continuation
2225 nil))
2226 (ecase (car cspec)
2227 (and
2228 (labels ((traverse (seq)
2229 (compile-content-model (car seq)
2230 (if (cdr seq)
2231 (traverse (cdr seq))
2232 continuation))))
2233 (traverse (cdr cspec))))
2235 (let ((options (mapcar (rcurry #'compile-content-model continuation)
2236 (cdr cspec))))
2237 (lambda (actual-name)
2238 (some (rcurry #'funcall actual-name) options))))
2240 (let ((maybe (compile-content-model (second cspec) continuation)))
2241 (lambda (actual-name)
2242 (or (funcall maybe actual-name)
2243 (funcall continuation actual-name)))))
2245 (let (maybe-continuation)
2246 (labels ((recurse (actual-name)
2247 (if (null actual-name)
2248 (funcall continuation actual-name)
2249 (or (funcall maybe-continuation actual-name)
2250 (funcall continuation actual-name)))))
2251 (setf maybe-continuation
2252 (compile-content-model (second cspec) #'recurse))
2253 #'recurse)))
2255 (let ((it (cadr cspec)))
2256 (compile-content-model `(and ,it (* ,it)) continuation))))))
2258 (defun setp (list &key (test 'eql))
2259 (equal list (remove-duplicates list :test test)))
2261 (defun legal-content-model-p (cspec &optional validate)
2262 (or (eq cspec :PCDATA)
2263 (eq cspec :ANY)
2264 (eq cspec :EMPTY)
2265 (and (consp cspec)
2266 (eq (car cspec) '*)
2267 (consp (cadr cspec))
2268 (eq (car (cadr cspec)) 'or)
2269 (eq (cadr (cadr cspec)) :PCDATA)
2270 (every #'vectorp (cddr (cadr cspec)))
2271 (if (and validate (not (setp (cddr (cadr cspec)) :test #'rod=)))
2272 (validity-error "VC: No Duplicate Types (07)")
2274 (labels ((walk (x)
2275 (cond ((member x '(:PCDATA :ANY :EMPTY))
2276 nil)
2277 ((atom x) t)
2278 ((and (walk (car x))
2279 (walk (cdr x)))))))
2280 (walk cspec))))
2282 ;; wir fahren besser, wenn wir machen:
2284 ;; cspec ::= 'EMPTY' | 'ANY' | '#PCDATA'
2285 ;; | Name
2286 ;; | cs
2287 ;; cs ::= '(' S? cspec ( S? '|' S? cspec)* S? ')' ('?' | '*' | '+')?
2288 ;; und eine post factum analyse
2290 (defun p/cspec (input &optional recursivep)
2291 (let ((term
2292 (let ((names nil) op-cat op res stream)
2293 (multiple-value-bind (cat sem) (peek-token input)
2294 (cond ((eq cat :nmtoken)
2295 (consume-token input)
2296 (cond ((rod= sem '#.(string-rod "EMPTY"))
2297 :EMPTY)
2298 ((rod= sem '#.(string-rod "ANY"))
2299 :ANY)
2300 ((not recursivep)
2301 (wf-error input "invalid content spec"))
2303 sem)))
2304 ((eq cat :\#PCDATA)
2305 (consume-token input)
2306 :PCDATA)
2307 ((eq cat :\()
2308 (setf stream (car (zstream-input-stack input)))
2309 (consume-token input)
2310 (p/S? input)
2311 (setq names (list (p/cspec input t)))
2312 (p/S? input)
2313 (cond ((member (peek-token input) '(:\| :\,))
2314 (setf op-cat (peek-token input))
2315 (setf op (if (eq op-cat :\,) 'and 'or))
2316 (while (eq (peek-token input) op-cat)
2317 (consume-token input)
2318 (p/S? input)
2319 (push (p/cspec input t) names)
2320 (p/S? input))
2321 (setf res (cons op (reverse names))))
2323 (setf res (cons 'and names))))
2324 (p/S? input)
2325 (expect input :\))
2326 (when *validate*
2327 (unless (eq stream (car (zstream-input-stack input)))
2328 (validity-error "(06) Proper Group/PE Nesting")))
2329 res)
2331 (wf-error input "p/cspec - ~s / ~s" cat sem)))))))
2332 (cond ((eq (peek-token input) :?) (consume-token input) (list '? term))
2333 ((eq (peek-token input) :+) (consume-token input) (list '+ term))
2334 ((eq (peek-token input) :*) (consume-token input) (list '* term))
2336 term))))
2338 (defun normalize-mixed-cspec (cspec)
2339 ;; der Parser oben funktioniert huebsch fuer die children-Regel, aber
2340 ;; fuer Mixed ist das Ergebnis nicht praktisch, denn dort wollen wir
2341 ;; eigentlich auf eine Liste von Namen in einheitlichem Format hinaus.
2342 ;; Dazu normalisieren wir einfach in eine der beiden folgenden Formen:
2343 ;; (* (or :PCDATA ...rods...)) -- und zwar exakt so!
2344 ;; :PCDATA -- sonst ganz trivial
2345 (flet ((trivialp (c)
2346 (and (consp c)
2347 (and (eq (car c) 'and)
2348 (eq (cadr c) :PCDATA)
2349 (null (cddr c))))))
2350 (if (or (trivialp cspec) ;(and PCDATA)
2351 (and (consp cspec) ;(* (and PCDATA))
2352 (and (eq (car cspec) '*)
2353 (null (cddr cspec))
2354 (trivialp (cadr cspec)))))
2355 :PCDATA
2356 cspec)))
2358 ;; [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
2361 ;; [52] AttlistDecl ::= '<!ATTLIST' S Name AttDefs S? '>'
2362 ;; [52] AttlistDecl ::= '<!ATTLIST' S Name S? '>'
2363 ;; [53] AttDefs ::= S Name S AttType S DefaultDecl AttDefs
2364 ;; [53] AttDefs ::=
2366 (defun p/notation-decl (input)
2367 (let (name id)
2368 (expect input :|<!NOTATION|)
2369 (p/S input)
2370 (setf name (p/name input))
2371 (p/S input)
2372 (setf id (p/external-id input t))
2373 (p/S? input)
2374 (expect input :\>)
2375 (sax:notation-declaration (handler *ctx*)
2376 name
2377 (if (extid-public id)
2378 (normalize-public-id (extid-public id))
2379 nil)
2380 (uri-rod (extid-system id)))
2381 (when (and sax:*namespace-processing* (find #/: name))
2382 (wf-error input "colon in notation name"))
2383 (when *validate*
2384 (define-notation (dtd *ctx*) name id))
2385 (list :notation-decl name id)))
2387 (defun normalize-public-id (rod)
2388 (with-rune-collector (collect)
2389 (let ((gimme-20 nil)
2390 (anything-seen-p nil))
2391 (map nil (lambda (c)
2392 (cond
2393 ((or (rune= c #/u+0009)
2394 (rune= c #/u+000A)
2395 (rune= c #/u+000D)
2396 (rune= c #/u+0020))
2397 (setf gimme-20 t))
2399 (when (and anything-seen-p gimme-20)
2400 (collect #/u+0020))
2401 (setf gimme-20 nil)
2402 (setf anything-seen-p t)
2403 (collect c))))
2404 rod))))
2408 (defun p/conditional-sect (input)
2409 (expect input :<!\[ )
2410 (let ((stream (car (zstream-input-stack input))))
2411 (p/S? input)
2412 (multiple-value-bind (cat sem) (read-token input)
2413 (cond ((and (eq cat :nmtoken)
2414 (rod= sem '#.(string-rod "INCLUDE")))
2415 (p/include-sect input stream))
2416 ((and (eq cat :nmtoken)
2417 (rod= sem '#.(string-rod "IGNORE")))
2418 (p/ignore-sect input stream))
2420 (wf-error input "Expected INCLUDE or IGNORE after \"<![\"."))))))
2422 (defun p/cond-expect (input cat initial-stream)
2423 (expect input cat)
2424 (when *validate*
2425 (unless (eq (car (zstream-input-stack input)) initial-stream)
2426 (validity-error "(21) Proper Conditional Section/PE Nesting"))))
2428 (defun p/include-sect (input initial-stream)
2429 ;; <![INCLUDE is already read.
2430 (p/S? input)
2431 (p/cond-expect input :\[ initial-stream)
2432 (p/ext-subset-decl input)
2433 (p/cond-expect input :\] initial-stream)
2434 (p/cond-expect input :\] initial-stream)
2435 (p/cond-expect input :\> initial-stream))
2437 (defun p/ignore-sect (input initial-stream)
2438 ;; <![IGNORE is already read.
2439 ;; XXX Is VC 21 being checked for nested sections?
2440 (p/S? input)
2441 (p/cond-expect input :\[ initial-stream)
2442 (let ((input (car (zstream-input-stack input))))
2443 (let ((level 0))
2444 (do ((c1 (read-rune input) (read-rune input))
2445 (c2 #/U+0000 c1)
2446 (c3 #/U+0000 c2))
2447 ((= level -1))
2448 (declare (type fixnum level))
2449 (cond ((eq c1 :eof)
2450 (eox input "EOF in <![IGNORE ... >")))
2451 (cond ((and (rune= c3 #/<) (rune= c2 #/!) (rune= c1 #/\[))
2452 (incf level)))
2453 (cond ((and (rune= c3 #/\]) (rune= c2 #/\]) (rune= c1 #/>))
2454 (decf level))) )))
2455 (unless (eq (car (zstream-input-stack input)) initial-stream)
2456 (validity-error "(21) Proper Conditional Section/PE Nesting")))
2458 (defun p/ext-subset-decl (input)
2459 ;; ( markupdecl | conditionalSect | S )*
2460 (loop
2461 (case (let ((*expand-pe-p* nil)) (peek-token input))
2462 (:|<![| (let ((*expand-pe-p* t)) (p/conditional-sect input)))
2463 (:S (consume-token input))
2464 (:eof (return))
2465 ((:|<!ELEMENT| :|<!ATTLIST| :|<!ENTITY| :|<!NOTATION| :PI :COMMENT)
2466 (let ((*expand-pe-p* t)
2467 (*external-subset-p* t))
2468 (p/markup-decl input)))
2469 ((:PE-REFERENCE)
2470 (let ((name (nth-value 1 (read-token input))))
2471 (recurse-on-entity input name :parameter
2472 (lambda (input)
2473 (etypecase (checked-get-entdef name :parameter)
2474 (external-entdef
2475 (p/ext-subset input))
2476 (internal-entdef
2477 (p/ext-subset-decl input)))
2478 (unless (eq :eof (peek-token input))
2479 (wf-error input "Trailing garbage."))))))
2480 (otherwise (return)))) )
2482 (defun p/markup-decl (input)
2483 (peek-token input)
2484 (let ((stream (car (zstream-input-stack input))))
2485 (multiple-value-prog1
2486 (p/markup-decl-unsafe input)
2487 (when *validate*
2488 (unless (eq stream (car (zstream-input-stack input)))
2489 (validity-error "(01) Proper Declaration/PE Nesting"))))))
2491 (defun p/markup-decl-unsafe (input)
2492 ;; markupdecl ::= elementdecl | AttlistDecl /* VC: Proper Declaration/PE Nesting */
2493 ;; | EntityDecl | NotationDecl
2494 ;; | PI | Comment /* WFC: PEs in Internal Subset */
2495 (let ((token (peek-token input))
2496 (*expand-pe-p* (and *expand-pe-p* *external-subset-p*)))
2497 (case token
2498 (:|<!ELEMENT| (p/element-decl input))
2499 (:|<!ATTLIST| (p/attlist-decl input))
2500 (:|<!ENTITY| (p/entity-decl input))
2501 (:|<!NOTATION| (p/notation-decl input))
2502 (:PI
2503 (let ((sem (nth-value 1 (read-token input))))
2504 (sax:processing-instruction (handler *ctx*) (car sem) (cdr sem))))
2505 (:COMMENT (consume-token input))
2506 (otherwise
2507 (wf-error input "p/markup-decl ~S" (peek-token input))))))
2509 (defun setup-encoding (input xml-header)
2510 (when (xml-header-encoding xml-header)
2511 (let ((enc (find-encoding (xml-header-encoding xml-header))))
2512 (cond (enc
2513 (setf (xstream-encoding (car (zstream-input-stack input)))
2514 enc))
2516 (warn "There is no such encoding: ~S." (xml-header-encoding xml-header)))))))
2518 (defun set-full-speed (input)
2519 (let ((xstream (car (zstream-input-stack input))))
2520 (when xstream
2521 (set-to-full-speed xstream))))
2523 (defun p/ext-subset (input)
2524 (cond ((eq (peek-token input) :xml-decl)
2525 (let ((hd (parse-text-decl (cdr (nth-value 1 (peek-token input))))))
2526 (setup-encoding input hd))
2527 (consume-token input)))
2528 (set-full-speed input)
2529 (p/ext-subset-decl input)
2530 (unless (eq (peek-token input) :eof)
2531 (wf-error input "Trailing garbage - ~S." (peek-token input))))
2533 (defvar *catalog* nil)
2535 (defun extid-using-catalog (extid)
2536 (if *catalog*
2537 (let ((sysid
2538 (resolve-extid (extid-public extid)
2539 (extid-system extid)
2540 *catalog*)))
2541 (if sysid
2542 (make-extid nil sysid)
2543 extid))
2544 extid))
2546 (defun p/doctype-decl (input &optional dtd-extid)
2547 (let ()
2548 (let ((*expand-pe-p* nil)
2549 name extid)
2550 (expect input :|<!DOCTYPE|)
2551 (p/S input)
2552 (setq name (p/nmtoken input))
2553 (when *validate*
2554 (setf (model-stack *ctx*) (list (make-root-model name))))
2555 (when (eq (peek-token input) :S)
2556 (p/S input)
2557 (unless (or (eq (peek-token input) :\[ )
2558 (eq (peek-token input) :\> ))
2559 (setf extid (p/external-id input t))))
2560 (when dtd-extid
2561 (setf extid dtd-extid))
2562 (p/S? input)
2563 (sax:start-dtd (handler *ctx*)
2564 name
2565 (and extid (extid-public extid))
2566 (and extid (uri-rod (extid-system extid))))
2567 (when (eq (peek-token input) :\[ )
2568 (when (disallow-internal-subset *ctx*)
2569 (wf-error input "document includes an internal subset"))
2570 (ensure-dtd)
2571 (consume-token input)
2572 (sax:start-internal-subset (handler *ctx*))
2573 (while (progn (p/S? input)
2574 (not (eq (peek-token input) :\] )))
2575 (if (eq (peek-token input) :PE-REFERENCE)
2576 (let ((name (nth-value 1 (read-token input))))
2577 (recurse-on-entity input name :parameter
2578 (lambda (input)
2579 (etypecase (checked-get-entdef name :parameter)
2580 (external-entdef
2581 (p/ext-subset input))
2582 (internal-entdef
2583 (p/ext-subset-decl input)))
2584 (unless (eq :eof (peek-token input))
2585 (wf-error input "Trailing garbage.")))))
2586 (let ((*expand-pe-p* t))
2587 (p/markup-decl input))))
2588 (consume-token input)
2589 (sax:end-internal-subset (handler *ctx*))
2590 (p/S? input))
2591 (expect input :>)
2592 (when extid
2593 (let* ((effective-extid
2594 (extid-using-catalog (absolute-extid input extid)))
2595 (sysid (extid-system effective-extid))
2596 (fresh-dtd-p (null (dtd *ctx*)))
2597 (cached-dtd
2598 (and fresh-dtd-p
2599 (not (standalone-p *ctx*))
2600 (getdtd sysid *dtd-cache*))))
2601 (cond
2602 (cached-dtd
2603 (setf (dtd *ctx*) cached-dtd)
2604 (report-cached-dtd cached-dtd))
2606 (let ((xi2 (xstream-open-extid effective-extid)))
2607 (with-zstream (zi2 :input-stack (list xi2))
2608 (ensure-dtd)
2609 (p/ext-subset zi2)
2610 (when (and fresh-dtd-p
2611 *cache-all-dtds*
2612 *validate*
2613 (not (standalone-p *ctx*)))
2614 (setf (getdtd sysid *dtd-cache*) (dtd *ctx*)))))))))
2615 (sax:end-dtd (handler *ctx*))
2616 (let ((dtd (dtd *ctx*)))
2617 (sax:entity-resolver
2618 (handler *ctx*)
2619 (lambda (name handler) (resolve-entity name handler dtd)))
2620 (sax::dtd (handler *ctx*) dtd))
2621 (list :DOCTYPE name extid))))
2623 (defun report-cached-dtd (dtd)
2624 (maphash (lambda (k v)
2625 (report-entity (handler *ctx*) :general k (cdr v)))
2626 (dtd-gentities dtd))
2627 (maphash (lambda (k v)
2628 (report-entity (handler *ctx*) :parameter k (cdr v)))
2629 (dtd-pentities dtd))
2630 (maphash (lambda (k v)
2631 (sax:notation-declaration
2632 (handler *ctx*)
2634 (if (extid-public v)
2635 (normalize-public-id (extid-public v))
2636 nil)
2637 (uri-rod (extid-system v))))
2638 (dtd-notations dtd)))
2640 (defun p/misc*-2 (input)
2641 ;; Misc*
2642 (while (member (peek-token input) '(:COMMENT :PI :S))
2643 (case (peek-token input)
2644 (:COMMENT
2645 (sax:comment (handler *ctx*) (nth-value 1 (peek-token input))))
2646 (:PI
2647 (sax:processing-instruction
2648 (handler *ctx*)
2649 (car (nth-value 1 (peek-token input)))
2650 (cdr (nth-value 1 (peek-token input))))))
2651 (consume-token input)))
2653 (defun p/document
2654 (input handler
2655 &key validate dtd root entity-resolver disallow-internal-subset
2656 (recode t))
2657 ;; check types of user-supplied arguments for better error messages:
2658 (check-type validate boolean)
2659 (check-type recode boolean)
2660 (check-type dtd (or null extid))
2661 (check-type root (or null rod))
2662 (check-type entity-resolver (or null function symbol))
2663 (check-type disallow-internal-subset boolean)
2664 #+rune-is-integer
2665 (when recode
2666 (setf handler (make-recoder handler #'rod-to-utf8-string)))
2667 (let* ((xstream (car (zstream-input-stack input)))
2668 (name (xstream-name xstream))
2669 (base (when name (stream-name-uri name)))
2670 (*ctx*
2671 (make-context :handler handler
2672 :main-zstream input
2673 :entity-resolver entity-resolver
2674 :base-stack (list (or base ""))
2675 :disallow-internal-subset disallow-internal-subset))
2676 (*validate* validate)
2677 (*namespace-bindings* *initial-namespace-bindings*))
2678 (sax:register-sax-parser handler (make-instance 'cxml-parser :ctx *ctx*))
2679 (sax:start-document handler)
2680 ;; document ::= XMLDecl? Misc* (doctypedecl Misc*)? element Misc*
2681 ;; Misc ::= Comment | PI | S
2682 ;; xmldecl::='<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
2683 ;; sddecl::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
2684 (let ((*data-behaviour* :DTD))
2685 ;; optional XMLDecl?
2686 (p/xmldecl input)
2687 ;; Misc*
2688 (p/misc*-2 input)
2689 ;; (doctypedecl Misc*)?
2690 (cond
2691 ((eq (peek-token input) :<!DOCTYPE)
2692 (p/doctype-decl input dtd)
2693 (p/misc*-2 input))
2694 (dtd
2695 (synthesize-doctype dtd input))
2696 ((and validate (not dtd))
2697 (validity-error "invalid document: no doctype")))
2698 (ensure-dtd)
2699 ;; Override expected root element if asked to
2700 (when root
2701 (setf (model-stack *ctx*) (list (make-root-model root))))
2702 ;; element
2703 (let ((*data-behaviour* :DOC))
2704 (fix-seen-< input)
2705 (p/element input))
2706 ;; optional Misc*
2707 (p/misc*-2 input)
2708 (p/eof input)
2709 (sax:end-document handler))))
2711 (defun synthesize-doctype (dtd input)
2712 (let ((dummy (string->xstream "<!DOCTYPE dummy>")))
2713 (setf (xstream-name dummy)
2714 (make-stream-name
2715 :entity-name "dummy doctype"
2716 :entity-kind :main
2717 :uri (zstream-base-sysid input)))
2718 (with-zstream (zstream :input-stack (list dummy))
2719 (p/doctype-decl zstream dtd))))
2721 (defun fix-seen-< (input)
2722 (when (eq (peek-token input) :seen-<)
2723 (multiple-value-bind (c s)
2724 (read-token-after-|<| input (car (zstream-input-stack input)))
2725 (setf (zstream-token-category input) c
2726 (zstream-token-semantic input) s))))
2728 (defun p/xmldecl (input)
2729 ;; we will use the attribute-value parser for the xml decl.
2730 (prog1
2731 (when (eq (peek-token input) :xml-decl)
2732 (let ((hd (parse-xml-decl (cdr (nth-value 1 (peek-token input))))))
2733 (setf (standalone-p *ctx*) (eq (xml-header-standalone-p hd) :yes))
2734 (setup-encoding input hd)
2735 (read-token input)
2736 hd))
2737 (set-full-speed input)))
2739 (defun p/eof (input)
2740 (unless (eq (peek-token input) :eof)
2741 (wf-error input "Garbage at end of document."))
2742 (when *validate*
2743 (maphash (lambda (k v)
2744 (unless v
2745 (validity-error "(11) IDREF: ~S not defined" (rod-string k))))
2746 (id-table *ctx*))
2748 (dolist (name (referenced-notations *ctx*))
2749 (unless (find-notation name (dtd *ctx*))
2750 (validity-error "(23) Notation Declared: ~S" (rod-string name))))))
2752 (defun p/element (input)
2753 (multiple-value-bind (cat n-b new-b uri lname qname attrs) (p/sztag input)
2754 (sax:start-element (handler *ctx*) uri lname qname attrs)
2755 (when (eq cat :stag)
2756 (let ((*namespace-bindings* n-b))
2757 (p/content input))
2758 (p/etag input qname))
2759 (sax:end-element (handler *ctx*) uri lname qname)
2760 (undeclare-namespaces new-b)
2761 (pop (base-stack *ctx*))
2762 (validate-end-element *ctx* qname)))
2764 (defun p/sztag (input)
2765 (multiple-value-bind (cat sem) (read-token input)
2766 (case cat
2767 ((:stag :ztag))
2768 (:eof (eox input))
2769 (t (wf-error input "element expected")))
2770 (destructuring-bind (&optional name &rest raw-attrs) sem
2771 (validate-start-element *ctx* name)
2772 (let* ((attrs
2773 (process-attributes *ctx* name (build-attribute-list raw-attrs)))
2774 (*namespace-bindings* *namespace-bindings*)
2775 new-namespaces)
2776 (when sax:*namespace-processing*
2777 (setf new-namespaces (declare-namespaces attrs))
2778 (mapc #'set-attribute-namespace attrs))
2779 (push (compute-base attrs) (base-stack *ctx*))
2780 (multiple-value-bind (uri prefix local-name)
2781 (if sax:*namespace-processing*
2782 (decode-qname name)
2783 (values nil nil nil))
2784 (declare (ignore prefix))
2785 (check-attribute-uniqueness attrs)
2786 (unless (or sax:*include-xmlns-attributes*
2787 (null sax:*namespace-processing*))
2788 (setf attrs
2789 (remove-if (compose #'xmlns-attr-p #'sax:attribute-qname)
2790 attrs)))
2791 (values cat
2792 *namespace-bindings*
2793 new-namespaces
2794 uri local-name name attrs))))))
2796 (defun p/etag (input qname)
2797 (multiple-value-bind (cat2 sem2) (read-token input)
2798 (unless (and (eq cat2 :etag)
2799 (eq (car sem2) qname))
2800 (wf-error input "Bad nesting. ~S / ~S"
2801 (mu qname)
2802 (mu (cons cat2 sem2))))
2803 (when (cdr sem2)
2804 (wf-error input "no attributes allowed in end tag"))))
2806 ;; copy&paste from cxml-rng
2807 (defun escape-uri (string)
2808 (with-output-to-string (out)
2809 (loop for c across (cxml::rod-to-utf8-string string) do
2810 (let ((code (char-code c)))
2811 ;; http://www.w3.org/TR/xlink/#link-locators
2812 (if (or (>= code 127) (<= code 32) (find c "<>\"{}|\\^`"))
2813 (format out "%~2,'0X" code)
2814 (write-char c out))))))
2816 (defun compute-base (attrs)
2817 (let ((new (sax:find-attribute #"xml:base" attrs))
2818 (current (car (base-stack *ctx*))))
2819 (if new
2820 (puri:merge-uris (escape-uri (sax:attribute-value new)) current)
2821 current)))
2823 (defun process-characters (input sem)
2824 (consume-token input)
2825 (when (search #"]]>" sem)
2826 (wf-error input "']]>' not allowed in CharData"))
2827 (validate-characters *ctx* sem))
2829 (defun process-cdata-section (input)
2830 (consume-token input)
2831 (let ((input (car (zstream-input-stack input))))
2832 (unless (and (rune= #/C (read-rune input))
2833 (rune= #/D (read-rune input))
2834 (rune= #/A (read-rune input))
2835 (rune= #/T (read-rune input))
2836 (rune= #/A (read-rune input))
2837 (rune= #/\[ (read-rune input)))
2838 (wf-error input "After '<![', 'CDATA[' is expected."))
2839 (validate-characters *ctx* #"hack") ;anything other than whitespace
2840 (read-cdata-sect input)))
2842 (defun p/content (input)
2843 ;; [43] content ::= (element | CharData | Reference | CDSect | PI | Comment)*
2844 (loop
2845 (multiple-value-bind (cat sem) (peek-token input)
2846 (case cat
2847 ((:stag :ztag)
2848 (p/element input))
2849 ((:CDATA)
2850 (process-characters input sem)
2851 (sax:characters (handler *ctx*) sem))
2852 ((:ENTITY-REF)
2853 (let ((name sem))
2854 (consume-token input)
2855 (recurse-on-entity input name :general
2856 (lambda (input)
2857 (prog1
2858 (etypecase (checked-get-entdef name :general)
2859 (internal-entdef (p/content input))
2860 (external-entdef (p/ext-parsed-ent input)))
2861 (unless (eq (peek-token input) :eof)
2862 (wf-error input "Trailing garbage. - ~S"
2863 (peek-token input))))))))
2864 ((:<!\[)
2865 (let ((data (process-cdata-section input)))
2866 (sax:start-cdata (handler *ctx*))
2867 (sax:characters (handler *ctx*) data)
2868 (sax:end-cdata (handler *ctx*))))
2869 ((:PI)
2870 (consume-token input)
2871 (sax:processing-instruction (handler *ctx*) (car sem) (cdr sem)))
2872 ((:COMMENT)
2873 (consume-token input)
2874 (sax:comment (handler *ctx*) sem))
2875 (otherwise
2876 (return))))))
2878 ;; [78] extParsedEnt ::= TextDecl? contentw
2879 ;; [79] extPE ::= TextDecl? extSubsetDecl
2881 (defstruct xml-header
2882 version
2883 encoding
2884 (standalone-p nil))
2886 (defun p/ext-parsed-ent (input)
2887 ;; [78] extParsedEnt ::= '<?xml' VersionInfo? EncodingDecl S? '?>' content
2888 (when (eq (peek-token input) :xml-decl)
2889 (let ((hd (parse-text-decl (cdr (nth-value 1 (peek-token input))))))
2890 (setup-encoding input hd))
2891 (consume-token input))
2892 (set-full-speed input)
2893 (p/content input))
2895 (defun parse-xml-decl (content)
2896 (let* ((res (make-xml-header))
2897 (i (make-rod-xstream content)))
2898 (with-zstream (z :input-stack (list i))
2899 (let ((atts (read-attribute-list z i t)))
2900 (unless (eq (peek-rune i) :eof)
2901 (wf-error i "Garbage at end of XMLDecl."))
2902 ;; versioninfo muss da sein
2903 ;; dann ? encodingdecl
2904 ;; dann ? sddecl
2905 ;; dann ende
2906 (unless (eq (caar atts) (intern-name '#.(string-rod "version")))
2907 (wf-error i "XMLDecl needs version."))
2908 (unless (and (>= (length (cdar atts)) 1)
2909 (every (lambda (x)
2910 (or (rune<= #/a x #/z)
2911 (rune<= #/A x #/Z)
2912 (rune<= #/0 x #/9)
2913 (rune= x #/_)
2914 (rune= x #/.)
2915 (rune= x #/:)
2916 (rune= x #/-)))
2917 (cdar atts)))
2918 (wf-error i"Bad XML version number: ~S." (rod-string (cdar atts))))
2919 (setf (xml-header-version res) (rod-string (cdar atts)))
2920 (pop atts)
2921 (when (eq (caar atts) (intern-name '#.(string-rod "encoding")))
2922 (unless (and (>= (length (cdar atts)) 1)
2923 (every (lambda (x)
2924 (or (rune<= #/a x #/z)
2925 (rune<= #/A x #/Z)
2926 (rune<= #/0 x #/9)
2927 (rune= x #/_)
2928 (rune= x #/.)
2929 (rune= x #/-)))
2930 (cdar atts))
2931 ((lambda (x)
2932 (or (rune<= #/a x #/z)
2933 (rune<= #/A x #/Z)))
2934 (aref (cdar atts) 0)))
2935 (wf-error i "Bad XML encoding name: ~S." (rod-string (cdar atts))))
2936 (setf (xml-header-encoding res) (rod-string (cdar atts)))
2937 (pop atts))
2938 (when (eq (caar atts) (intern-name '#.(string-rod "standalone")))
2939 (unless (or (rod= (cdar atts) '#.(string-rod "yes"))
2940 (rod= (cdar atts) '#.(string-rod "no")))
2941 (wf-error i "XMLDecl's 'standalone' attribute must be exactly \"yes\" or \"no\" and not ~S."
2942 (rod-string (cdar atts))))
2943 (setf (xml-header-standalone-p res)
2944 (if (rod-equal '#.(string-rod "yes") (cdar atts))
2945 :yes
2946 :no))
2947 (pop atts))
2948 (when atts
2949 (wf-error i "Garbage in XMLDecl: ~A" (rod-string content)))
2950 res))))
2952 (defun parse-text-decl (content)
2953 (let* ((res (make-xml-header))
2954 (i (make-rod-xstream content)))
2955 (with-zstream (z :input-stack (list i))
2956 (let ((atts (read-attribute-list z i t)))
2957 (unless (eq (peek-rune i) :eof)
2958 (wf-error i "Garbage at end of TextDecl"))
2959 ;; versioninfo optional
2960 ;; encodingdecl muss da sein
2961 ;; dann ende
2962 (when (eq (caar atts) (intern-name '#.(string-rod "version")))
2963 (unless (and (>= (length (cdar atts)) 1)
2964 (every (lambda (x)
2965 (or (rune<= #/a x #/z)
2966 (rune<= #/A x #/Z)
2967 (rune<= #/0 x #/9)
2968 (rune= x #/_)
2969 (rune= x #/.)
2970 (rune= x #/:)
2971 (rune= x #/-)))
2972 (cdar atts)))
2973 (wf-error i "Bad XML version number: ~S." (rod-string (cdar atts))))
2974 (setf (xml-header-version res) (rod-string (cdar atts)))
2975 (pop atts))
2976 (unless (eq (caar atts) (intern-name '#.(string-rod "encoding")))
2977 (wf-error i "TextDecl needs encoding."))
2978 (unless (and (>= (length (cdar atts)) 1)
2979 (every (lambda (x)
2980 (or (rune<= #/a x #/z)
2981 (rune<= #/A x #/Z)
2982 (rune<= #/0 x #/9)
2983 (rune= x #/_)
2984 (rune= x #/.)
2985 (rune= x #/-)))
2986 (cdar atts))
2987 ((lambda (x)
2988 (or (rune<= #/a x #/z)
2989 (rune<= #/A x #/Z)
2990 (rune<= #/0 x #/9)))
2991 (aref (cdar atts) 0)))
2992 (wf-error i "Bad XML encoding name: ~S." (rod-string (cdar atts))))
2993 (setf (xml-header-encoding res) (rod-string (cdar atts)))
2994 (pop atts)
2995 (when atts
2996 (wf-error i "Garbage in TextDecl: ~A" (rod-string content)))))
2997 res))
2999 ;;;; ---------------------------------------------------------------------------
3000 ;;;; mu
3001 ;;;;
3003 (defun mu (x)
3004 (cond ((stringp x) x)
3005 ((vectorp x) (rod-string x))
3006 ((consp x)
3007 (cons (mu (car x)) (mu (cdr x))))
3008 (x)))
3010 ;;;; ---------------------------------------------------------------------------
3011 ;;;; User interface ;;;;
3013 #-cxml-system::uri-is-namestring
3014 (defun specific-or (component &optional (alternative nil))
3015 (if (eq component :unspecific)
3016 alternative
3017 component))
3019 (defun string-or (str &optional (alternative nil))
3020 (if (zerop (length str))
3021 alternative
3022 str))
3024 #-cxml-system::uri-is-namestring
3025 (defun make-uri (&rest initargs &key path query &allow-other-keys)
3026 (apply #'make-instance
3027 'puri:uri
3028 :path (and path (escape-path path))
3029 :query (and query (escape-query query))
3030 initargs))
3032 #-cxml-system::uri-is-namestring
3033 (defun escape-path (list)
3034 (puri::render-parsed-path list t))
3036 #-cxml-system::uri-is-namestring
3037 (defun escape-query (pairs)
3038 (flet ((escape (str)
3039 (puri::encode-escaped-encoding str puri::*reserved-characters* t)))
3040 (let ((first t))
3041 (with-output-to-string (s)
3042 (dolist (pair pairs)
3043 (if first
3044 (setf first nil)
3045 (write-char #\& s))
3046 (write-string (escape (car pair)) s)
3047 (write-char #\= s)
3048 (write-string (escape (cdr pair)) s))))))
3050 #-cxml-system::uri-is-namestring
3051 (defun uri-parsed-query (uri)
3052 (flet ((unescape (str)
3053 (puri::decode-escaped-encoding str t puri::*reserved-characters*)))
3054 (let ((str (puri:uri-query uri)))
3055 (cond
3056 (str
3057 (let ((pairs '()))
3058 (dolist (s (split-sequence-if (lambda (x) (eql x #\&)) str))
3059 (destructuring-bind (name value)
3060 (split-sequence-if (lambda (x) (eql x #\=)) s)
3061 (push (cons (unescape name) (unescape value)) pairs)))
3062 (reverse pairs)))
3064 nil)))))
3066 #-cxml-system::uri-is-namestring
3067 (defun query-value (name alist)
3068 (cdr (assoc name alist :test #'equal)))
3070 #-cxml-system::uri-is-namestring
3071 (defun pathname-to-uri (pathname)
3072 (let ((path
3073 ;; FIXME: should we really leave ".." in base URIs?
3074 (append (mapcar (lambda (x)
3075 (cond ((member x '(:up :back)) "..")
3076 (t x)))
3077 (pathname-directory pathname))
3078 (list
3079 (if (specific-or (pathname-type pathname))
3080 (concatenate 'string
3081 (pathname-name pathname)
3083 (pathname-type pathname))
3084 (pathname-name pathname))))))
3085 (if (eq (car path) :relative)
3086 (make-uri :path path)
3087 (make-uri :scheme :file
3088 :host (concatenate 'string
3089 (string-or (host-namestring pathname))
3091 (specific-or (pathname-device pathname)))
3092 :path path))))
3094 #+cxml-system::uri-is-namestring
3095 (defun pathname-to-uri (pathname)
3096 (puri:parse-uri (namestring pathname)))
3098 #-cxml-system::uri-is-namestring
3099 (defun parse-name.type (str)
3100 (if str
3101 (let ((i (position #\. str :from-end t)))
3102 (if i
3103 (values (subseq str 0 i) (subseq str (1+ i)))
3104 (values str nil)))
3105 (values nil nil)))
3107 #-cxml-system::uri-is-namestring
3108 (defun uri-to-pathname (uri)
3109 (let ((scheme (puri:uri-scheme uri))
3110 (path (loop for e in (puri:uri-parsed-path uri)
3111 collect (if (stringp e)
3112 (puri::decode-escaped-encoding e t nil)
3113 e))))
3114 (unless (member scheme '(nil :file))
3115 (error 'xml-parse-error
3116 :format-control "URI scheme ~S not supported"
3117 :format-arguments (list scheme)))
3118 (if (eq (car path) :relative)
3119 (multiple-value-bind (name type)
3120 (parse-name.type (car (last path)))
3121 (make-pathname :directory (butlast path)
3122 :name name
3123 :type type))
3124 (multiple-value-bind (name type)
3125 (parse-name.type (car (last (cdr path))))
3126 (destructuring-bind (host device)
3127 (split-sequence-if (lambda (x) (eql x #\+))
3128 (or (puri:uri-host uri) "+"))
3129 (make-pathname :host (string-or host)
3130 :device (string-or device)
3131 :directory (cons :absolute (butlast (cdr path)))
3132 :name name
3133 :type type))))))
3134 #+cxml-system::uri-is-namestring
3135 (defun uri-to-pathname (uri)
3136 (let ((pathname (puri:render-uri uri nil)))
3137 (when (equalp (pathname-host pathname) "+")
3138 (setf (slot-value pathname 'lisp::host) "localhost"))
3139 pathname))
3141 (defun parse
3142 (input handler &rest args
3143 &key validate dtd root entity-resolver disallow-internal-subset
3144 recode pathname)
3145 "@arg[input]{A string, pathname, octet vector, or stream.}
3146 @arg[handler]{A @class{SAX handler}}
3147 @arg[validate]{Boolean. Defaults to @code{nil}. If true, parse in
3148 validating mode, i.e. assert that the document contains a DOCTYPE
3149 declaration and conforms to the DTD declared.}
3150 @arg[dtd]{unless @code{nil}, an extid instance specifying the external
3151 subset to load. This options overrides the extid specified in the
3152 document type declaration, if any. See below for @fun{make-extid}.
3153 This option is useful for verification purposes together with the
3154 @var{root} and @var{disallow-internal-subset} arguments.}
3155 @arg[root]{The expected root element name, or @code{nil} (the default).
3156 If specified, this argument overrides the name stated in the input's
3157 DOCTYPE (if any).}
3158 @arg[entity-resolver]{@code{nil} or a function of two arguments which
3159 is invoked for every entity referenced by the document with the
3160 entity's Public ID (a rod) and System ID (an URI object) as arguments.
3161 The function may either return nil, CXML will then try to resolve the
3162 entity as usual. Alternatively it may return a Common Lisp stream
3163 specialized on @code{(unsigned-byte 8)} which will be used instead.
3164 (It may also signal an error, of course, which can be useful to prohibit
3165 parsed XML documents from including arbitrary files readable by
3166 the parser.)}
3167 @arg[disallow-internal-subset]{Boolean. If true, signal
3168 an error if the document contains an internal subset.}
3169 @arg[recode]{Boolean. (Ignored on Lisps with Unicode
3170 support.) Recode rods to UTF-8 strings. Defaults to true.
3171 Make sure to use @fun{utf8-dom:make-dom-builder} if this
3172 option is enabled and @fun{rune-dom:make-dom-builder}
3173 otherwise.}
3174 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3176 Parse an XML document from @var{input}, which can be a string, pathname,
3177 octet vector, or stream.
3179 Return values from this function depend on the SAX handler used.
3180 This is an old-style convenience wrapper around the new-style interface
3181 @fun{parse}.
3183 Parse an XML document from @var{filename}, and signal SAX events to
3184 @var{handler} while doing so.
3186 All SAX parsing functions share the same keyword arguments. Refer to
3187 @fun{parse} for details on keyword arguments."
3188 (declare (ignore validate dtd root entity-resolver disallow-internal-subset
3189 recode))
3190 (let ((args
3191 (loop
3192 for (name value) on args by #'cddr
3193 unless (eq name :pathname)
3194 append (list name value))))
3195 (etypecase input
3196 (xstream (apply #'parse-xstream input handler args))
3197 (pathname (apply #'parse-file input handler args))
3198 (rod (apply #'parse-rod input handler args))
3199 (array (apply #'parse-octets input handler args))
3200 (stream
3201 (let ((xstream (make-xstream input :speed 8192)))
3202 (setf (xstream-name xstream)
3203 (make-stream-name
3204 :entity-name "main document"
3205 :entity-kind :main
3206 :uri (if pathname
3207 (pathname-to-uri (merge-pathnames pathname))
3208 (safe-stream-sysid input))))
3209 (apply #'parse-xstream xstream handler args))))))
3211 (defun parse-xstream (xstream handler &rest args)
3212 (let ((*ctx* nil))
3213 (handler-case
3214 (with-zstream (zstream :input-stack (list xstream))
3215 (peek-rune xstream)
3216 (with-scratch-pads ()
3217 (apply #'p/document zstream handler args)))
3218 (runes-encoding:encoding-error (c)
3219 (wf-error xstream "~A" c)))))
3221 (defun parse-file (filename handler &rest args)
3222 "@arg[filename]{An pathname designator.}
3223 @arg[handler]{A @class{SAX handler}}
3224 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3226 This is an old-style convenience wrapper around the new-style interface
3227 @fun{parse}.
3229 Parse an XML document from @var{filename}, and signal SAX events to
3230 @var{handler} while doing so.
3232 All SAX parsing functions share the same keyword arguments. Refer to
3233 @fun{parse} for details on keyword arguments."
3234 (with-open-xfile (input filename)
3235 (setf (xstream-name input)
3236 (make-stream-name
3237 :entity-name "main document"
3238 :entity-kind :main
3239 :uri (pathname-to-uri (merge-pathnames filename))))
3240 (apply #'parse-xstream input handler args)))
3242 (defun resolve-synonym-stream (stream)
3243 (while (typep stream 'synonym-stream)
3244 (setf stream (symbol-value (synonym-stream-symbol stream))))
3245 stream)
3247 (defun safe-stream-sysid (stream)
3248 (if (and (typep (resolve-synonym-stream stream) 'file-stream)
3249 ;; ignore-errors, because sb-bsd-sockets creates instances of
3250 ;; FILE-STREAMs that aren't
3251 (ignore-errors (pathname stream)))
3252 (pathname-to-uri (merge-pathnames (pathname stream)))
3253 nil))
3255 (deftype |SAX HANDLER| ()
3256 'sax:abstract-handler
3257 "Historically, any object has been usable as a SAX handler with CXML,
3258 as long as it implemented all SAX events, i.e. had methods
3259 for the generic functions defined in the SAX package.
3261 While this approach still works, it is now recommended that SAX handlers
3262 should be implemented by subclassing @class{abstract-handler} or one
3263 of its subclasses. Useful subclasses are @class{content-handler}
3264 and @class{default-handler}.
3266 (In addition, the value @code{nil} is valid SAX handler, which discards
3267 all events it receives.)
3269 As a rule of thumb, write a subclass of @class{default-handler} if
3270 you want to handle only a few special SAX events and ignore the rest,
3271 because this class has no-op default methods for all events.
3273 If, however, you want to make certain that your class implements all
3274 important SAX events explicitly, a good choice is @class{content-handler},
3275 which has no-op default methods only for less important, DTD-related
3276 events, and requires subclasses to implement all events related to the
3277 content model.
3279 In some cases, it might be helpful to implement @class{abstract-handler}
3280 directly, which has no default event methods at all.")
3282 (defun parse-stream (stream handler &rest args)
3283 "@arg[stream]{An (unsigned-byte 8) stream}
3284 @arg[handler]{A @class{SAX handler}}
3285 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3287 This is an old-style convenience wrapper around the new-style interface
3288 @fun{parse}.
3290 Parse an XML document from @var{stream}, and signal SAX events to
3291 @var{handler} while doing so.
3293 All SAX parsing functions share the same keyword arguments. Refer to
3294 @fun{parse} for details on keyword arguments."
3295 (let ((xstream
3296 (make-xstream
3297 stream
3298 :name (make-stream-name
3299 :entity-name "main document"
3300 :entity-kind :main
3301 :uri (safe-stream-sysid stream))
3302 :initial-speed 1)))
3303 (apply #'parse-xstream xstream handler args)))
3305 (defun parse-empty-document
3306 (uri qname handler &key public-id system-id entity-resolver (recode t))
3307 "@arg[uri]{a string or nil}
3308 @arg[qname]{a string or nil}
3309 @arg[handler]{a @class{SAX handler}}
3310 @arg[public-id]{a string or nil}
3311 @arg[system-id]{a @type{puri:uri} or nil}
3312 @arg[entity-resolver]{@code{nil} or a function of two arguments which
3313 is invoked for every entity referenced by the document with the
3314 entity's Public ID (a rod) and System ID (an URI object) as arguments.
3315 The function may either return nil, CXML will then try to resolve the
3316 entity as usual. Alternatively it may return a Common Lisp stream
3317 specialized on @code{(unsigned-byte 8)} which will be used instead.
3318 (It may also signal an error, of course, which can be useful to prohibit
3319 parsed XML documents from including arbitrary files readable by
3320 the parser.)}
3321 @arg[recode]{Boolean. (Ignored on Lisps with Unicode
3322 support.) Recode rods to UTF-8 strings. Defaults to true.
3323 Make sure to use @fun{utf8-dom:make-dom-builder} if this
3324 option is enabled and @fun{rune-dom:make-dom-builder}
3325 otherwise.}
3326 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3328 Simulate parsing of a document with a document element @var{qname}
3329 having no attributes except for an optional namespace
3330 declaration to @var{uri}. If an external ID is specified
3331 (@var{system-id}, @var{public-id}), find, parse, and report
3332 this DTD as if with @fun{parse-file}, using the specified
3333 entity resolver."
3334 (check-type uri (or null rod))
3335 (check-type qname (or null rod))
3336 (check-type public-id (or null rod))
3337 (check-type system-id (or null puri:uri))
3338 (check-type entity-resolver (or null function symbol))
3339 (check-type recode boolean)
3340 #+rune-is-integer
3341 (when recode
3342 (setf handler (make-recoder handler #'rod-to-utf8-string)))
3343 (let ((*ctx*
3344 (make-context :handler handler :entity-resolver entity-resolver))
3345 (*validate* nil)
3346 (extid
3347 (when (or public-id system-id)
3348 (extid-using-catalog (make-extid public-id system-id)))))
3349 (sax:start-document handler)
3350 (when extid
3351 (sax:start-dtd handler
3352 qname
3353 (and public-id)
3354 (and system-id (uri-rod system-id)))
3355 (setf (dtd *ctx*) (getdtd (extid-system extid) *dtd-cache*))
3356 (unless (dtd *ctx*)
3357 (with-scratch-pads ()
3358 (let ((*data-behaviour* :DTD))
3359 (let ((xi2 (xstream-open-extid extid)))
3360 (with-zstream (zi2 :input-stack (list xi2))
3361 (ensure-dtd)
3362 (p/ext-subset zi2))))))
3363 (sax:end-dtd handler)
3364 (let ((dtd (dtd *ctx*)))
3365 (sax:entity-resolver handler (lambda (n h) (resolve-entity n h dtd)))
3366 (sax::dtd handler dtd)))
3367 (ensure-dtd)
3368 (when (or uri qname)
3369 (let* ((attrs
3370 (when uri
3371 (list (sax:make-attribute :qname #"xmlns"
3372 :value (rod uri)
3373 :specified-p t))))
3374 (*namespace-bindings* *namespace-bindings*)
3375 new-namespaces)
3376 (when sax:*namespace-processing*
3377 (setf new-namespaces (declare-namespaces attrs))
3378 (mapc #'set-attribute-namespace attrs))
3379 (multiple-value-bind (uri prefix local-name)
3380 (if sax:*namespace-processing* (decode-qname qname) nil)
3381 (declare (ignore prefix))
3382 (unless (or sax:*include-xmlns-attributes*
3383 (null sax:*namespace-processing*))
3384 (setf attrs nil))
3385 (sax:start-element (handler *ctx*) uri local-name qname attrs)
3386 (sax:end-element (handler *ctx*) uri local-name qname))
3387 (undeclare-namespaces new-namespaces)))
3388 (sax:end-document handler)))
3390 (defun parse-dtd-file (filename &optional handler)
3391 "@arg[filename]{An pathname designator.}
3392 @arg[handler]{A @class{SAX handler}}
3393 @return{A @class{dtd} instance.}
3395 Parse @a[http://www.w3.org/TR/2000/REC-xml-20001006#NT-extSubset]{declarations}
3396 from @var{filename} and return an object representing the DTD,
3397 suitable as an argument to @code{validate} with @fun{parse}."
3398 (with-open-file (s filename :element-type '(unsigned-byte 8))
3399 (parse-dtd-stream s handler)))
3401 (defun parse-dtd-stream (stream &optional handler)
3402 "@arg[stream]{An (unsigned-byte 8) stream.}
3403 @arg[handler]{A @class{SAX handler}}
3404 @return{A @class{dtd} instance.}
3406 Parse @a[http://www.w3.org/TR/2000/REC-xml-20001006#NT-extSubset]{declarations}
3407 from @var{stream} and return an object representing the DTD,
3408 suitable as an argument to @code{validate} with @fun{parse}."
3409 (let ((input (make-xstream stream)))
3410 (setf (xstream-name input)
3411 (make-stream-name
3412 :entity-name "dtd"
3413 :entity-kind :main
3414 :uri (safe-stream-sysid stream)))
3415 (let ((*ctx* (make-context :handler handler))
3416 (*validate* t)
3417 (*data-behaviour* :DTD))
3418 (with-zstream (zstream :input-stack (list input))
3419 (with-scratch-pads ()
3420 (ensure-dtd)
3421 (peek-rune input)
3422 (p/ext-subset zstream)
3423 (dtd *ctx*))))))
3425 (defun parse-rod (string handler &rest args)
3426 "@arg[string]{An string of unicode characters.}
3427 @arg[handler]{A @class{SAX handler}}
3428 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3430 This is an old-style convenience wrapper around the new-style interface
3431 @fun{parse}.
3433 Parse an XML document from @var{string}, and signal SAX events to
3434 @var{handler} while doing so.
3436 Note: This function assumes that @var{string} has already been decoded into
3437 Unicode runes and ignores the encoding specified in the XML declaration,
3438 if any.
3440 All SAX parsing functions share the same keyword arguments. Refer to
3441 @fun{parse} for details on keyword arguments."
3442 (let ((xstream (string->xstream string)))
3443 (setf (xstream-name xstream)
3444 (make-stream-name
3445 :entity-name "main document"
3446 :entity-kind :main
3447 :uri nil))
3448 (apply #'parse-xstream xstream handler args)))
3450 (defun string->xstream (string)
3451 (make-rod-xstream (string-rod string)))
3453 (defun parse-octets (octets handler &rest args)
3454 "@arg[octets]{An (unsigned-byte 8) vector.}
3455 @arg[handler]{A @class{SAX handler}}
3456 @return{The value returned by @fun{sax:end-document} on @var{handler}.}
3458 This is an old-style convenience wrapper around the new-style interface
3459 @fun{parse}.
3461 Parse an XML document from @var{octets}, and signal SAX events to
3462 @var{handler} while doing so.
3464 All SAX parsing functions share the same keyword arguments. Refer to
3465 @fun{parse} for details on keyword arguments."
3466 (apply #'parse-stream (make-octet-input-stream octets) handler args))
3468 ;;;;
3470 (defun zstream-push (new-xstream zstream)
3471 (cond ((find-if (lambda (x)
3472 (and (xstream-p x)
3473 (eql (stream-name-entity-name (xstream-name x))
3474 (stream-name-entity-name (xstream-name new-xstream)))
3475 (eql (stream-name-entity-kind (xstream-name x))
3476 (stream-name-entity-kind (xstream-name new-xstream)))))
3477 (zstream-input-stack zstream))
3478 (wf-error zstream "Infinite recursion.")))
3479 (push new-xstream (zstream-input-stack zstream))
3480 zstream)
3482 (defun recurse-on-entity (zstream name kind continuation &optional internalp)
3483 (assert (not (zstream-token-category zstream)))
3484 (call-with-entity-expansion-as-stream
3485 zstream
3486 (lambda (new-xstream)
3487 (push :stop (zstream-input-stack zstream))
3488 (zstream-push new-xstream zstream)
3489 (prog1
3490 (funcall continuation zstream)
3491 (assert (eq (peek-token zstream) :eof))
3492 (assert (eq (pop (zstream-input-stack zstream)) new-xstream))
3493 (close-xstream new-xstream)
3494 (assert (eq (pop (zstream-input-stack zstream)) :stop))
3495 (setf (zstream-token-category zstream) nil)
3496 '(consume-token zstream)) )
3497 name
3498 kind
3499 internalp))
3502 (defmacro read-data-until* ((predicate input res res-start res-end) &body body)
3503 ;; fast variant -- for now disabled for no apparent reason
3504 ;; -> res, res-start, res-end
3505 `(let* ((rptr (xstream-read-ptr ,input))
3506 (p0 rptr)
3507 (fptr (xstream-fill-ptr ,input))
3508 (buf (xstream-buffer ,input))
3509 ,res ,res-start ,res-end)
3510 (declare (type fixnum rptr fptr p0)
3511 (type (simple-array read-element (*)) buf))
3512 (loop
3513 (cond ((%= rptr fptr)
3514 ;; underflow -- hmm inject the scratch-pad with what we
3515 ;; read and continue, while using read-rune and collecting
3516 ;; d.h. besser waere hier auch while-reading zu benutzen.
3517 (setf (xstream-read-ptr ,input) rptr)
3518 (multiple-value-setq (,res ,res-start ,res-end)
3519 (with-rune-collector/raw (collect)
3520 (do ((i p0 (%+ i 1)))
3521 ((%= i rptr))
3522 (collect (%rune buf i)))
3523 (let (c)
3524 (loop
3525 (cond ((%= rptr fptr)
3526 (setf (xstream-read-ptr ,input) rptr)
3527 (setf c (peek-rune input))
3528 (cond ((eq c :eof)
3529 (return)))
3530 (setf rptr (xstream-read-ptr ,input)
3531 fptr (xstream-fill-ptr ,input)
3532 buf (xstream-buffer ,input)))
3534 (setf c (%rune buf rptr))))
3535 (cond ((,predicate c)
3536 ;; we stop
3537 (setf (xstream-read-ptr ,input) rptr)
3538 (return))
3540 ;; we continue
3541 (collect c)
3542 (setf rptr (%+ rptr 1))) )))))
3543 (return))
3544 ((,predicate (%rune buf rptr))
3545 ;; we stop
3546 (setf (xstream-read-ptr ,input) rptr)
3547 (setf ,res buf ,res-start p0 ,res-end rptr)
3548 (return) )
3550 we continue
3551 (sf rptr (%+ rptr 1))) ))
3552 ,@body ))
3555 (defmacro read-data-until* ((predicate input res res-start res-end) &body body)
3556 "Read data from `input' until `predicate' applied to the read char
3557 turns true. Then execute `body' with `res', `res-start', `res-end'
3558 bound to denote a subsequence (of RUNEs) containing the read portion.
3559 The rune upon which `predicate' turned true is neither consumed from
3560 the stream, nor included in `res'.
3562 Keep the predicate short, this it may be included more than once into
3563 the macro's expansion."
3565 (let ((input-var (gensym))
3566 (collect (gensym))
3567 (c (gensym)))
3568 `(LET ((,input-var ,input))
3569 (MULTIPLE-VALUE-BIND (,res ,res-start ,res-end)
3570 (WITH-RUNE-COLLECTOR/RAW (,collect)
3571 (LOOP
3572 (LET ((,c (PEEK-RUNE ,input-var)))
3573 (COND ((EQ ,c :EOF)
3574 ;; xxx error message
3575 (RETURN))
3576 ((FUNCALL ,predicate ,c)
3577 (RETURN))
3579 (,collect ,c)
3580 (CONSUME-RUNE ,input-var))))))
3581 (LOCALLY
3582 ,@body)))))
3584 (defun read-name-token (input)
3585 (read-data-until* ((lambda (rune)
3586 (declare (type rune rune))
3587 (not (name-rune-p rune)))
3588 input
3589 r rs re)
3590 (intern-name r rs re)))
3592 (defun read-cdata (input)
3593 (read-data-until* ((lambda (rune)
3594 (declare (type rune rune))
3595 (when (and (%rune< rune #/U+0020)
3596 (not (or (%rune= rune #/U+0009)
3597 (%rune= rune #/U+000a)
3598 (%rune= rune #/U+000d))))
3599 (wf-error input "code point invalid: ~A" rune))
3600 (or (%rune= rune #/<) (%rune= rune #/&)))
3601 input
3602 source start end)
3603 (locally
3604 (declare (type (simple-array rune (*)) source)
3605 (type ufixnum start)
3606 (type ufixnum end)
3607 (optimize (speed 3) (safety 0)))
3608 (let ((res (make-array (%- end start) :element-type 'rune)))
3609 (declare (type (simple-array rune (*)) res))
3610 (let ((i (%- end start)))
3611 (declare (type ufixnum i))
3612 (loop
3613 (setf i (- i 1))
3614 (setf (%rune res i) (%rune source (the ufixnum (+ i start))))
3615 (when (= i 0)
3616 (return))))
3617 res))))
3619 ;; used only by read-att-value-2
3620 (defun internal-entity-expansion (name)
3621 (let ((def (get-entity-definition name :general (dtd *ctx*))))
3622 (unless def
3623 (wf-error nil "Entity '~A' is not defined." (rod-string name)))
3624 (unless (typep def 'internal-entdef)
3625 (wf-error nil "Entity '~A' is not an internal entity." name))
3626 (or (entdef-expansion def)
3627 (setf (entdef-expansion def) (find-internal-entity-expansion name)))))
3629 ;; used only by read-att-value-2
3630 (defun find-internal-entity-expansion (name)
3631 (with-zstream (zinput)
3632 (with-rune-collector-3 (collect)
3633 (labels ((muffle (input)
3634 (let (c)
3635 (loop
3636 (setf c (read-rune input))
3637 (cond ((eq c :eof)
3638 (return))
3639 ((rune= c #/&)
3640 (setf c (peek-rune input))
3641 (cond ((eql c :eof)
3642 (eox input))
3643 ((rune= c #/#)
3644 (let ((c (read-character-reference input)))
3645 (%put-unicode-char c collect)))
3647 (unless (name-start-rune-p c)
3648 (wf-error zinput "Expecting name after &."))
3649 (let ((name (read-name-token input)))
3650 (setf c (read-rune input))
3651 (check-rune input c #/\;)
3652 (recurse-on-entity
3653 zinput name :general
3654 (lambda (zinput)
3655 (muffle (car (zstream-input-stack zinput)))))))))
3656 ((rune= c #/<)
3657 (wf-error zinput "unexpected #\/<"))
3658 ((space-rune-p c)
3659 (collect #/space))
3660 ((not (data-rune-p c))
3661 (wf-error zinput "illegal char: ~S." c))
3663 (collect c)))))))
3664 (declare (dynamic-extent #'muffle))
3665 (recurse-on-entity
3666 zinput name :general
3667 (lambda (zinput)
3668 (muffle (car (zstream-input-stack zinput)))))))))
3670 ;; callback for DOM
3671 (defun resolve-entity (name handler dtd)
3672 (let ((*validate* nil))
3673 (if (get-entity-definition name :general dtd)
3674 (let* ((*ctx* (make-context :handler handler :dtd dtd))
3675 (*data-behaviour* :DOC))
3676 (with-zstream (input)
3677 (with-scratch-pads ()
3678 (recurse-on-entity
3679 input name :general
3680 (lambda (input)
3681 (prog1
3682 (etypecase (checked-get-entdef name :general)
3683 (internal-entdef (p/content input))
3684 (external-entdef (p/ext-parsed-ent input)))
3685 (unless (eq (peek-token input) :eof)
3686 (wf-error input "Trailing garbage. - ~S"
3687 (peek-token input)))))))))
3688 nil)))
3690 (defun read-att-value-2 (input)
3691 (let ((delim (read-rune input)))
3692 (when (eql delim :eof)
3693 (eox input))
3694 (unless (member delim '(#/\" #/\') :test #'eql)
3695 (wf-error input
3696 "Bad attribute value delimiter ~S, must be either #\\\" or #\\\'."
3697 (rune-char delim)))
3698 (with-rune-collector-4 (collect)
3699 (loop
3700 (let ((c (read-rune input)))
3701 (cond ((eq c :eof)
3702 (eox input "EOF"))
3703 ((rune= c delim)
3704 (return))
3705 ((rune= c #/<)
3706 (wf-error input "'<' not allowed in attribute values"))
3707 ((rune= #/& c)
3708 (multiple-value-bind (kind sem) (read-entity-like input)
3709 (ecase kind
3710 (:CHARACTER-REFERENCE
3711 (%put-unicode-char sem collect))
3712 (:ENTITY-REFERENCE
3713 (let* ((exp (internal-entity-expansion sem))
3714 (n (length exp)))
3715 (declare (type (simple-array rune (*)) exp))
3716 (do ((i 0 (%+ i 1)))
3717 ((%= i n))
3718 (collect (%rune exp i))))))))
3719 ((space-rune-p c)
3720 (collect #/u+0020))
3722 (collect c))))))))
3724 ;;;;;;;;;;;;;;;;;
3726 ;;; Namespace stuff
3728 ;; We already know that name is part of a valid XML name, so all we
3729 ;; have to check is that the first rune is a name-start-rune and that
3730 ;; there is not colon in it.
3731 (defun nc-name-p (name)
3732 (and (plusp (length name))
3733 (name-start-rune-p (rune name 0))
3734 (notany #'(lambda (rune) (rune= #/: rune)) name)))
3736 (defun split-qname (qname)
3737 (declare (type runes:simple-rod qname))
3738 (let ((pos (position #/: qname)))
3739 (if pos
3740 (let ((prefix (subseq qname 0 pos))
3741 (local-name (subseq qname (1+ pos))))
3742 (when (zerop pos)
3743 (wf-error nil "empty namespace prefix"))
3744 (if (nc-name-p local-name)
3745 (values prefix local-name)
3746 (wf-error nil "~S is not a valid NcName."
3747 (rod-string local-name))))
3748 (values () qname))))
3750 (defun decode-qname (qname)
3751 "decode-qname name => namespace-uri, prefix, local-name"
3752 (declare (type runes:simple-rod qname))
3753 (multiple-value-bind (prefix local-name) (split-qname qname)
3754 (let ((uri (find-namespace-binding prefix)))
3755 (if uri
3756 (values uri prefix local-name)
3757 (values nil nil qname)))))
3760 (defun find-namespace-binding (prefix)
3761 (cdr (or (assoc (or prefix #"") *namespace-bindings* :test #'rod=)
3762 (wf-error nil "Undeclared namespace prefix: ~A" (rod-string prefix)))))
3764 ;; FIXME: Should probably be refactored by adding :start and :end to rod=/rod-equal
3765 (defun rod-starts-with (prefix rod)
3766 (and (<= (length prefix) (length rod))
3767 (dotimes (i (length prefix) t)
3768 (unless (rune= (rune prefix i) (rune rod i))
3769 (return nil)))))
3771 (defun xmlns-attr-p (attr-name)
3772 (rod-starts-with #.(string-rod "xmlns") attr-name))
3774 (defun attrname->prefix (attrname)
3775 (if (< 5 (length attrname))
3776 (subseq attrname 6)
3777 nil))
3779 (defun find-namespace-declarations (attributes)
3780 (loop
3781 for attribute in attributes
3782 for qname = (sax:attribute-qname attribute)
3783 when (xmlns-attr-p qname)
3784 collect (cons (attrname->prefix qname) (sax:attribute-value attribute))))
3786 (defun declare-namespaces (attributes)
3787 (let ((ns-decls (find-namespace-declarations attributes)))
3788 (dolist (ns-decl ns-decls)
3789 ;; check some namespace validity constraints
3790 (let ((prefix (car ns-decl))
3791 (uri (cdr ns-decl)))
3792 (cond
3793 ((and (rod= prefix #"xml")
3794 (not (rod= uri #"http://www.w3.org/XML/1998/namespace")))
3795 (wf-error nil
3796 "Attempt to rebind the prefix \"xml\" to ~S." (mu uri)))
3797 ((and (rod= uri #"http://www.w3.org/XML/1998/namespace")
3798 (not (rod= prefix #"xml")))
3799 (wf-error nil
3800 "The namespace ~
3801 URI \"http://www.w3.org/XML/1998/namespace\" may not ~
3802 be bound to the prefix ~S, only \"xml\" is legal."
3803 (mu prefix)))
3804 ((and (rod= prefix #"xmlns")
3805 (rod= uri #"http://www.w3.org/2000/xmlns/"))
3806 (wf-error nil
3807 "Attempt to bind the prefix \"xmlns\" to its predefined ~
3808 URI \"http://www.w3.org/2000/xmlns/\", which is ~
3809 forbidden for no good reason."))
3810 ((rod= prefix #"xmlns")
3811 (wf-error nil
3812 "Attempt to bind the prefix \"xmlns\" to the URI ~S, ~
3813 but it may not be declared." (mu uri)))
3814 ((rod= uri #"http://www.w3.org/2000/xmlns/")
3815 (wf-error nil
3816 "The namespace URI \"http://www.w3.org/2000/xmlns/\" may ~
3817 not be bound to prefix ~S (or any other)." (mu prefix)))
3818 ((and (rod= uri #"") prefix)
3819 (wf-error nil
3820 "Only the default namespace (the one without a prefix) ~
3821 may be bound to an empty namespace URI, thus ~
3822 undeclaring it."))
3824 (push (cons prefix (if (rod= #"" uri) nil uri))
3825 *namespace-bindings*)
3826 (sax:start-prefix-mapping (handler *ctx*)
3827 (car ns-decl)
3828 (cdr ns-decl))))))
3829 ns-decls))
3831 (defun undeclare-namespaces (ns-decls)
3832 (dolist (ns-decl ns-decls)
3833 (sax:end-prefix-mapping (handler *ctx*) (car ns-decl))))
3835 (defun build-attribute-list (attr-alist)
3836 ;; fixme: if there is a reason this function reverses attribute order,
3837 ;; it should be documented.
3838 (let (attributes)
3839 (dolist (pair attr-alist)
3840 (push (sax:make-attribute :qname (car pair)
3841 :value (cdr pair)
3842 :specified-p t)
3843 attributes))
3844 attributes))
3846 (defun check-attribute-uniqueness (attributes)
3847 ;; 5.3 Uniqueness of Attributes
3848 ;; In XML documents conforming to [the xmlns] specification, no
3849 ;; tag may contain two attributes which:
3850 ;; 1. have identical names, or
3851 ;; 2. have qualified names with the same local part and with
3852 ;; prefixes which have been bound to namespace names that are
3853 ;; identical.
3855 ;; 1. is checked by read-tag-2, so we only deal with 2 here
3856 (loop for (attr-1 . rest) on attributes do
3857 (when (and (sax:attribute-namespace-uri attr-1)
3858 (find-if (lambda (attr-2)
3859 (and (rod= (sax:attribute-namespace-uri attr-1)
3860 (sax:attribute-namespace-uri attr-2))
3861 (rod= (sax:attribute-local-name attr-1)
3862 (sax:attribute-local-name attr-2))))
3863 rest))
3864 (wf-error nil
3865 "Multiple definitions of attribute ~S in namespace ~S."
3866 (mu (sax:attribute-local-name attr-1))
3867 (mu (sax:attribute-namespace-uri attr-1))))))
3869 (defun set-attribute-namespace (attribute)
3870 (let ((qname (sax:attribute-qname attribute)))
3871 (if (and sax:*use-xmlns-namespace* (rod= qname #"xmlns"))
3872 (setf (sax:attribute-namespace-uri attribute)
3873 #"http://www.w3.org/2000/xmlns/")
3874 (multiple-value-bind (prefix local-name) (split-qname qname)
3875 (when (and prefix ;; default namespace doesn't apply to attributes
3876 (or (not (rod= #"xmlns" prefix))
3877 sax:*use-xmlns-namespace*))
3878 (setf (sax:attribute-namespace-uri attribute)
3879 (decode-qname qname)))
3880 (setf (sax:attribute-local-name attribute) local-name)))))
3882 ;;;;;;;;;;;;;;;;;
3884 ;; System Identifier Protocol
3886 ;; A system identifier is an object obeying to the system identifier
3887 ;; protocol. Often something like an URL or a pathname.
3889 ;; OPEN-SYS-ID sys-id [generic function]
3891 ;; Opens the resource associated with the system identifier `sys-id'
3892 ;; for reading and returns a stream. For now it is expected, that the
3893 ;; stream is an octet stream (one of element type (unsigned-byte 8)).
3895 ;; More precisely: The returned object only has to obey to the xstream
3896 ;; controller protocol. (That is it has to provide implementations for
3897 ;; READ-OCTETS and XSTREAM-CONTROLLER-CLOSE).
3899 ;; MERGE-SYS-ID sys-id base [generic function]
3901 ;; Merges two system identifiers. That is resolve `sys-id' relative to
3902 ;; `base' yielding an absolute system identifier suitable for
3903 ;; OPEN-SYS-ID.
3906 ;;;;;;;;;;;;;;;;;
3907 ;;; SAX validation handler
3909 (defclass validator ()
3910 ((context :initarg :context :accessor context)
3911 (cdatap :initform nil :accessor cdatap)))
3913 (defun make-validator (dtd root)
3914 "@arg[dtd]{An @class{dtd} instance.}
3915 @arg[root]{Element name, a string.}
3916 @return{A @class{SAX handler}.}
3918 Create a SAX handler which validates against a DTD instance.
3919 The document's root element must be named @code{root}.
3920 Used with @fun{dom:map-document}, this validates a document
3921 object as if by re-reading it with a validating parser, except
3922 that declarations recorded in the document instance are completely
3923 ignored.
3925 Example:
3927 @pre{(let ((d (parse-file \"~/test.xml\" (cxml-dom:make-dom-builder)))
3928 (x (parse-dtd-file \"~/test.dtd\")))
3929 (dom:map-document (cxml:make-validator x #\"foo\") d))}"
3930 (make-instance 'validator
3931 :context (make-context
3932 :handler nil
3933 :dtd dtd
3934 :model-stack (list (make-root-model root)))))
3936 (macrolet ((with-context ((validator) &body body)
3937 `(let ((*ctx* (context ,validator))
3938 (*validate* t))
3939 (with-scratch-pads () ;nicht schoen
3940 ,@body))))
3941 (defmethod sax:start-element ((handler validator) uri lname qname attributes)
3942 uri lname
3943 (with-context (handler)
3944 (validate-start-element *ctx* qname)
3945 (process-attributes *ctx* qname attributes)))
3947 (defmethod sax:start-cdata ((handler validator))
3948 (setf (cdatap handler) t))
3950 (defmethod sax:characters ((handler validator) data)
3951 (with-context (handler)
3952 (validate-characters *ctx* (if (cdatap handler) #"hack" data))))
3954 (defmethod sax:end-cdata ((handler validator))
3955 (setf (cdatap handler) nil))
3957 (defmethod sax:end-element ((handler validator) uri lname qname)
3958 uri lname
3959 (with-context (handler)
3960 (validate-end-element *ctx* qname))))