Use CXML's rune implementation and XML parser.
[closure-html.git] / src / xml / xml-canonic.lisp
blobf9e0d480a816b57b09382b471088fde2d6633aaf
1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: XML; readtable: glisp; Encoding: utf-8; -*-
2 ;;; ---------------------------------------------------------------------------
3 ;;; Title: Dump canonic XML according to J.Clark
4 ;;; Created: 1999-09-09
5 ;;; Author: Gilbert Baumann <unk6@rz.uni-karlsruhe.de>
6 ;;; License: LGPL (See file COPYING for details).
7 ;;; ---------------------------------------------------------------------------
8 ;;; © copyright 1999 by Gilbert Baumann
10 ;;; This library is free software; you can redistribute it and/or
11 ;;; modify it under the terms of the GNU Library General Public
12 ;;; License as published by the Free Software Foundation; either
13 ;;; version 2 of the License, or (at your option) any later version.
14 ;;;
15 ;;; This library is distributed in the hope that it will be useful,
16 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 ;;; Library General Public License for more details.
19 ;;;
20 ;;; You should have received a copy of the GNU Library General Public
21 ;;; License along with this library; if not, write to the
22 ;;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;;; Boston, MA 02111-1307 USA.
25 (in-package :xml)
27 ;;
28 ;; | Canonical XML
29 ;; | =============
30 ;; |
31 ;; | This document defines a subset of XML called canonical XML. The
32 ;; | intended use of canonical XML is in testing XML processors, as a
33 ;; | representation of the result of parsing an XML document.
34 ;; |
35 ;; | Every well-formed XML document has a unique structurally equivalent
36 ;; | canonical XML document. Two structurally equivalent XML documents have
37 ;; | a byte-for-byte identical canonical XML document. Canonicalizing an
38 ;; | XML document requires only information that an XML processor is
39 ;; | required to make available to an application.
40 ;; |
41 ;; | A canonical XML document conforms to the following grammar:
42 ;; |
43 ;; | CanonXML ::= Pi* element Pi*
44 ;; | element ::= Stag (Datachar | Pi | element)* Etag
45 ;; | Stag ::= '<' Name Atts '>'
46 ;; | Etag ::= '</' Name '>'
47 ;; | Pi ::= '<?' Name ' ' (((Char - S) Char*)? - (Char* '?>' Char*)) '?>'
48 ;; | Atts ::= (' ' Name '=' '"' Datachar* '"')*
49 ;; | Datachar ::= '&amp;' | '&lt;' | '&gt;' | '&quot;'
50 ;; | | '&#9;'| '&#10;'| '&#13;'
51 ;; | | (Char - ('&' | '<' | '>' | '"' | #x9 | #xA | #xD))
52 ;; | Name ::= (see XML spec)
53 ;; | Char ::= (see XML spec)
54 ;; | S ::= (see XML spec)
55 ;; |
56 ;; | Attributes are in lexicographical order (in Unicode bit order).
57 ;; |
58 ;; | A canonical XML document is encoded in UTF-8.
59 ;; |
60 ;; | Ignorable white space is considered significant and is treated
61 ;; | equivalently to data.
63 ;; -- James Clark (jjc@jclark.com)
65 (defvar *quux*) ;!!!BIG HACK!!!
67 (defun unparse-document (doc sink)
68 (mapc (rcurry #'unparse-node sink) (dom:child-nodes doc)))
70 (defun unparse-node (node sink)
71 (cond ((dom:element-p node)
72 (write-rune #/< sink)
73 (write-rod (dom:tag-name node) sink)
74 ;; atts
75 (let ((atts (sort (copy-list (dom:items (dom:attributes node)))
76 #'rod< :key #'dom:name)))
77 (dolist (a atts)
78 (write-rune #/space sink)
79 (write-rod (dom:name a) sink)
80 (write-rune #/= sink)
81 (write-rune #/\" sink)
82 (let ((*quux* nil))
83 (map nil (lambda (c) (unparse-datachar c sink)) (dom:value a)))
84 (write-rune #/\" sink)))
85 (write-rod '#.(string-rod ">") sink)
86 (dolist (k (dom:child-nodes node))
87 (unparse-node k sink))
88 (write-rod '#.(string-rod "</") sink)
89 (write-rod (dom:tag-name node) sink)
90 (write-rod '#.(string-rod ">") sink))
91 ((dom:processing-instruction-p node)
92 (unless (rod-equal (dom:target node) '#.(string-rod "xml"))
93 (write-rod '#.(string-rod "<?") sink)
94 (write-rod (dom:target node) sink)
95 (write-rune #/space sink)
96 (write-rod (dom:data node) sink)
97 (write-rod '#.(string-rod "?>") sink) ))
98 ((dom:text-node-p node)
99 (let ((*quux* nil))
100 (map nil (lambda (c) (unparse-datachar c sink))
101 (dom:data node))))
103 (error "Oops in unparse: ~S." node))))
105 (defun unparse-datachar (c sink)
106 (cond ((rune= c #/&) (write-rod '#.(string-rod "&amp;") sink))
107 ((rune= c #/<) (write-rod '#.(string-rod "&lt;") sink))
108 ((rune= c #/>) (write-rod '#.(string-rod "&gt;") sink))
109 ((rune= c #/\") (write-rod '#.(string-rod "&quot;") sink))
110 ((rune= c #/U+0009) (write-rod '#.(string-rod "&#9;") sink))
111 ((rune= c #/U+000A) (write-rod '#.(string-rod "&#10;") sink))
112 ((rune= c #/U+000D) (write-rod '#.(string-rod "&#13;") sink))
114 (write-rune c sink))))
116 (defun write-rod (rod sink)
117 (let ((*quux* nil))
118 (map nil (lambda (c) (write-rune c sink)) rod)))
120 (defun write-rune (rune sink)
121 (cond ((<= #xD800 rune #xDBFF)
122 (setf *quux* rune))
123 ((<= #xDC00 rune #xDFFF)
124 (let ((q (logior (ash (- *quux* #xD7C0) 10) (- rune #xDC00))))
125 (write-rune-0 q sink))
126 (setf *quux* nil))
128 (write-rune-0 rune sink))))
130 (defun write-rune-0 (rune sink)
131 (labels ((wr (x)
132 (write-char (code-char x) sink)))
133 (cond ((<= #x00000000 rune #x0000007F)
134 (wr rune))
135 ((<= #x00000080 rune #x000007FF)
136 (wr (logior #b11000000 (ldb (byte 5 6) rune)))
137 (wr (logior #b10000000 (ldb (byte 6 0) rune))))
138 ((<= #x00000800 rune #x0000FFFF)
139 (wr (logior #b11100000 (ldb (byte 4 12) rune)))
140 (wr (logior #b10000000 (ldb (byte 6 6) rune)))
141 (wr (logior #b10000000 (ldb (byte 6 0) rune))))
142 ((<= #x00010000 rune #x001FFFFF)
143 (wr (logior #b11110000 (ldb (byte 3 18) rune)))
144 (wr (logior #b10000000 (ldb (byte 6 12) rune)))
145 (wr (logior #b10000000 (ldb (byte 6 6) rune)))
146 (wr (logior #b10000000 (ldb (byte 6 0) rune))))
147 ((<= #x00200000 rune #x03FFFFFF)
148 (wr (logior #b11111000 (ldb (byte 2 24) rune)))
149 (wr (logior #b10000000 (ldb (byte 6 18) rune)))
150 (wr (logior #b10000000 (ldb (byte 6 12) rune)))
151 (wr (logior #b10000000 (ldb (byte 6 6) rune)))
152 (wr (logior #b10000000 (ldb (byte 6 0) rune))))
153 ((<= #x04000000 rune #x7FFFFFFF)
154 (wr (logior #b11111100 (ldb (byte 1 30) rune)))
155 (wr (logior #b10000000 (ldb (byte 6 24) rune)))
156 (wr (logior #b10000000 (ldb (byte 6 18) rune)))
157 (wr (logior #b10000000 (ldb (byte 6 12) rune)))
158 (wr (logior #b10000000 (ldb (byte 6 6) rune)))
159 (wr (logior #b10000000 (ldb (byte 6 0) rune)))))))
161 (defun rod< (rod1 rod2)
162 (do ((i 0 (+ i 1)))
163 (nil)
164 (cond ((= i (length rod1))
165 (return t))
166 ((= i (length rod2))
167 (return nil))
168 ((< (aref rod1 i) (aref rod2 i))
169 (return t))
170 ((> (aref rod1 i) (aref rod2 i))
171 (return nil)))))