Revert "Clean up attributes before passing them to SAX"
[closure-html.git] / src / parse / html-parser.lisp
blob1fdd457da0892cc601a9c126fa95afedc774a67d
1 (in-package :closure-html)
3 ;;; FIXME: I liked the old SLURP-CATALOG code better than the LOOP below.
4 ;;; (Except for the use of NETLIB and URI, which we don't have here.)
6 #||
8 (defun slurp-catalog (catalog-url)
9 ;; Really dirty implementation
10 (setf *simple-catalog* nil)
11 (multiple-value-bind (io header) (netlib::open-document-2 catalog-url)
12 (declare (ignore header))
13 (unwind-protect
14 (let ((str (glisp::gstream-as-string io)))
15 (with-input-from-string (input str)
16 (do ((x (read input nil nil) (read input nil nil)))
17 ((null x))
18 (assert (equal (symbol-name x) "PUBLIC"))
19 (let ((name (read input))
20 (file (read input)))
21 (assert (stringp name))
22 (assert (stringp file))
23 (push (cons name (url:merge-url (url:parse-url file) catalog-url))
24 *simple-catalog*)))))
25 (g/close io))))
27 (format T "~&;; Parsing DTD~% ")
28 (sgml:slurp-catalog (url:parse-url "file://closure/resources/dtd/catalog"))
29 (setf cl-user::*html-dtd* (sgml:parse-dtd '(:public "-//W3C//DTD HTML 4.0 Frameset//EN")))
30 (format T "~&;; done~%")
32 ||#
34 (defparameter sgml::*simple-catalog*
35 (let ((base
36 (make-pathname
37 :name nil
38 :type nil
39 :defaults (merge-pathnames
40 "resources/"
41 (asdf:component-relative-pathname
42 (asdf:find-system :closure-html))))))
43 (loop
44 :for (name . filename)
45 :in '(("-//W3O//DTD W3 HTML 3.0//EN" . "dtd/HTML-3.0")
46 ("NETSCAPE-Bookmark-file-1" . "dtd/NETSCAPE-Bookmark-file-1")
47 ("-//W3C//ENTITIES Special//EN//HTML" . "dtd/Entities-Special")
48 ("-//W3C//ENTITIES Symbols//EN//HTML" . "dtd/Entities-Symbols")
49 ("-//W3C//ENTITIES Latin1//EN//HTML" . "dtd/Entities-Latin1")
50 ("-//W3C//DTD HTML 4.0 Frameset//EN" . "dtd/DTD-HTML-4.0-Frameset")
51 ("-//W3C//DTD HTML 4.0//EN" . "dtd/DTD-HTML-4.0")
52 ("-//W3C//DTD HTML 4.0 Transitional//EN" . "dtd/DTD-HTML-4.0-Transitional"))
53 :collect (cons name (merge-pathnames filename base)))))
55 (defparameter *html-dtd*
56 (sgml:parse-dtd '(:public "-//W3C//DTD HTML 4.0 Frameset//EN")))
58 (defun parse-xstream (input handler)
59 (setf (sgml::a-stream-scratch input)
60 (make-array #.(* 2 4096) :element-type 'runes:rune))
61 (sgml::setup-code-vector input :utf-8)
62 (let* ((dtd *html-dtd*)
63 (sgml::*unmungle-attribute-case* t)
64 (r (sgml:sgml-parse dtd input))
65 (pt (sgml::post-mortem-heuristic dtd r)))
66 (if handler
67 (serialize-pt pt handler)
68 pt)))
70 (defun parse (input handler)
71 (etypecase input
72 (xstream
73 (parse-xstream input handler))
74 (rod
75 #-rune-is-integer (setf input (string-rod input))
76 (let ((xstream (make-rod-xstream input)))
77 ;;; (setf (xstream-name xstream)
78 ;;; (make-stream-name
79 ;;; :entity-name "main document"
80 ;;; :entity-kind :main
81 ;;; :uri nil))
82 (parse-xstream xstream handler)))
83 ((and array (not string))
84 (parse (make-octet-input-stream input) handler))
85 #+rune-is-integer
86 (string
87 (let ((bytes
88 (make-array (length input) :element-type '(unsigned-byte 8))))
89 (map-into bytes #'char-code input)
90 (parse bytes handler)))
91 (pathname
92 (with-open-file (s input :element-type '(unsigned-byte 8))
93 (parse s handler)))
94 (stream
95 (let ((xstream (make-xstream input :speed 8192)))
96 ;;; (setf (xstream-name xstream)
97 ;;; (make-stream-name
98 ;;; :entity-name "main document"
99 ;;; :entity-kind :main
100 ;;; :uri (pathname-to-uri
101 ;;; (merge-pathnames (or pathname (pathname input))))))
102 (parse-xstream xstream handler)))))
104 (defun serialize-pt-attributes (plist recode)
105 (loop
106 for (name value) on plist by #'cddr
107 unless
108 ;; better don't emit as HAX what would be bogus as SAX anyway
109 (string-equal name "xmlns")
110 collect
111 (let* ((n #+rune-is-character (coerce (symbol-name name) 'rod)
112 #-rune-is-character (symbol-name name))
113 (v (etypecase value
114 (symbol (coerce (string-downcase (symbol-name value)) 'rod))
115 (rod (funcall recode value))
116 (string (coerce value 'rod)))))
117 (hax:make-attribute n v t))))
119 (defun serialize-pt (document handler
120 &key (name "HTML") public-id system-id (documentp t))
121 (let* ((recodep (or #+rune-is-integer (hax:%want-strings-p handler)))
122 (recode
123 (if recodep
124 (lambda (rod)
125 (if (typep rod 'rod)
126 (rod-to-utf8-string rod)
127 rod))
128 #'identity)))
129 (when documentp
130 (hax:start-document handler name public-id system-id))
131 (labels ((recurse (pt)
132 (cond
133 ((eq (gi pt) :pcdata)
134 (hax:characters handler (funcall recode (pt-attrs pt))))
136 (let* ((name (symbol-name (pt-name pt)))
137 (name
138 #+rune-is-character (coerce name 'rod)
139 #-rune-is-character
140 (if recodep name (string-rod name)))
141 (attrs
142 (serialize-pt-attributes (pt-attrs pt) recode)))
143 (hax:start-element handler name attrs)
144 (mapc #'recurse (pt-children pt))
145 (hax:end-element handler name))))))
146 (recurse document))
147 (when documentp
148 (hax:end-document handler))))
150 (defclass pt-builder (hax:abstract-handler)
151 ((current :initform nil :accessor current)
152 (root :initform nil :accessor root)))
154 #-rune-is-character
155 (defmethod hax:%want-strings-p ((handler pt-builder))
156 nil)
158 (defun make-pt-builder ()
159 (make-instance 'pt-builder))
161 (defmethod hax:start-document ((handler pt-builder) name pubid sysid)
162 (declare (ignore name pubid sysid))
163 nil)
165 (defun unserialize-pt-attributes (attrs)
166 (loop
167 for a in attrs
168 collect (intern (string-upcase (hax:attribute-name a)) :keyword)
169 collect (hax:attribute-value a)))
171 (defmethod hax:start-element ((handler pt-builder) name attrs)
172 (let* ((parent (current handler))
173 (this (sgml::make-pt/low
174 :name (intern (string-upcase name) :keyword)
175 :attrs (unserialize-pt-attributes attrs)
176 :parent parent)))
177 (setf (current handler) this)
178 (if parent
179 (push this (pt-children parent))
180 (setf (root handler) this))))
182 (defmethod hax:characters ((handler pt-builder) data)
183 (push (sgml::make-pt/low
184 :name :pcdata
185 :attrs data
186 :parent (current handler))
187 (pt-children (current handler))))
189 (defmethod hax:comment ((handler pt-builder) data)
190 ;; zzz haven't found out what the representation of comments is...
191 data)
193 (defmethod hax:end-element ((handler pt-builder) name)
194 (let ((current (current handler)))
195 (setf (pt-children current) (nreverse (pt-children current)))
196 (setf (current handler) (pt-parent current))))
198 (defmethod hax:end-document ((handler pt-builder))
199 (root handler))