cxml-like PARSE function and more serialization.
[closure-html.git] / src / parse / html-parser.lisp
blobc4b8077511760d1d6ed277dd58f1df8e3e5cdb9b
1 (in-package :closure-html)
3 ;;; FIXME: I liked the old SLURP-CATALOG code better than the LOOP below.
4 ;;; (Except for the use of NETLIB and URI, which we don't have here.)
6 #||
8 (defun slurp-catalog (catalog-url)
9 ;; Really dirty implementation
10 (setf *simple-catalog* nil)
11 (multiple-value-bind (io header) (netlib::open-document-2 catalog-url)
12 (declare (ignore header))
13 (unwind-protect
14 (let ((str (glisp::gstream-as-string io)))
15 (with-input-from-string (input str)
16 (do ((x (read input nil nil) (read input nil nil)))
17 ((null x))
18 (assert (equal (symbol-name x) "PUBLIC"))
19 (let ((name (read input))
20 (file (read input)))
21 (assert (stringp name))
22 (assert (stringp file))
23 (push (cons name (url:merge-url (url:parse-url file) catalog-url))
24 *simple-catalog*)))))
25 (g/close io))))
27 (format T "~&;; Parsing DTD~% ")
28 (sgml:slurp-catalog (url:parse-url "file://closure/resources/dtd/catalog"))
29 (setf cl-user::*html-dtd* (sgml:parse-dtd '(:public "-//W3C//DTD HTML 4.0 Frameset//EN")))
30 (format T "~&;; done~%")
32 ||#
34 (defparameter sgml::*simple-catalog*
35 (let ((base
36 (merge-pathnames
37 "resources/"
38 (asdf:component-relative-pathname
39 (asdf:find-system :closure-html)))))
40 (loop
41 :for (name . filename)
42 :in '(("-//W3O//DTD W3 HTML 3.0//EN" . "dtd/HTML-3.0")
43 ("NETSCAPE-Bookmark-file-1" . "dtd/NETSCAPE-Bookmark-file-1")
44 ("-//W3C//ENTITIES Special//EN//HTML" . "dtd/Entities-Special")
45 ("-//W3C//ENTITIES Symbols//EN//HTML" . "dtd/Entities-Symbols")
46 ("-//W3C//ENTITIES Latin1//EN//HTML" . "dtd/Entities-Latin1")
47 ("-//W3C//DTD HTML 4.0 Frameset//EN" . "dtd/DTD-HTML-4.0-Frameset")
48 ("-//W3C//DTD HTML 4.0//EN" . "dtd/DTD-HTML-4.0")
49 ("-//W3C//DTD HTML 4.0 Transitional//EN" . "dtd/DTD-HTML-4.0-Transitional"))
50 :collect (cons name (merge-pathnames filename base)))))
52 (defparameter *html-dtd*
53 (sgml:parse-dtd '(:public "-//W3C//DTD HTML 4.0 Frameset//EN")))
55 (defun parse-xstream (input handler)
56 (setf (sgml::a-stream-scratch input)
57 (make-array #.(* 2 4096) :element-type 'runes:rune))
58 (sgml::setup-code-vector input :utf-8)
59 (let* ((dtd *html-dtd*)
60 (r (sgml:sgml-parse dtd input))
61 (pt (sgml::post-mortem-heuristic dtd r)))
62 (if handler
63 (serialize-pt pt handler)
64 pt)))
66 (defun parse (input handler &key pathname)
67 (etypecase input
68 (xstream
69 (parse-xstream input handler))
70 (rod
71 (let ((xstream (make-rod-xstream (string-rod input))))
72 ;;; (setf (xstream-name xstream)
73 ;;; (make-stream-name
74 ;;; :entity-name "main document"
75 ;;; :entity-kind :main
76 ;;; :uri nil))
77 (parse-xstream xstream handler)))
78 (array
79 (parse (make-octet-input-stream input) handler))
80 (pathname
81 (with-open-file (s input :element-type '(unsigned-byte 8))
82 (parse s handler :pathname input)))
83 (stream
84 (let ((xstream (make-xstream input :speed 8192)))
85 ;;; (setf (xstream-name xstream)
86 ;;; (make-stream-name
87 ;;; :entity-name "main document"
88 ;;; :entity-kind :main
89 ;;; :uri (pathname-to-uri
90 ;;; (merge-pathnames (or pathname (pathname input))))))
91 (parse-xstream xstream handler)))))
93 (defun serialize-pt (document handler &key (name "HTML") public-id system-id)
94 (hax:start-document handler name public-id system-id)
95 (labels ((recurse (pt)
96 (cond
97 ((eq (gi pt) :pcdata)
98 (hax:characters handler (pt-attrs pt)))
100 (let ((name (symbol-name (pt-name pt))))
101 (hax:start-element handler name (pt-attrs pt))
102 (mapc #'recurse (pt-children pt))
103 (hax:end-element handler name))))))
104 (recurse document))
105 (hax:end-document handler))