b51beb172f9224ec39c5d2e9323ed8bbd96eb0d5
[closure-html.git] / src / parse / sgml-parse.lisp
blobb51beb172f9224ec39c5d2e9323ed8bbd96eb0d5
1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: SGML; Readtable: GLISP; -*-
2 ;;; ---------------------------------------------------------------------------
3 ;;; Title: An SGML Parser
4 ;;; Created: 1996-10-21
5 ;;; Author: Gilbert Baumann <gilbert@base-engineering.com>
6 ;;; License: MIT style (see below)
7 ;;; ---------------------------------------------------------------------------
8 ;;; (c) copyright 1996-1999 by Gilbert Baumann
10 ;;; Permission is hereby granted, free of charge, to any person obtaining
11 ;;; a copy of this software and associated documentation files (the
12 ;;; "Software"), to deal in the Software without restriction, including
13 ;;; without limitation the rights to use, copy, modify, merge, publish,
14 ;;; distribute, sublicense, and/or sell copies of the Software, and to
15 ;;; permit persons to whom the Software is furnished to do so, subject to
16 ;;; the following conditions:
17 ;;;
18 ;;; The above copyright notice and this permission notice shall be
19 ;;; included in all copies or substantial portions of the Software.
20 ;;;
21 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 ;;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 ;; Changes
31 ;; When Who What
32 ;; ----------------------------------------------------------------------------
33 ;; 2001-05-17 GB - empty tags are threated as start tags.
34 ;; [we can do better than that].
36 ;; 2001-05-14 GB - finally all mumbling goes with PARSE-WARN.
38 ;; 1999-09-19 GB - a-streams are gone now, we use the xstreams from the
39 ;; XML parser.
41 ;; 1999-08-24 GB = new scheme to handle unsyntactic nesting of FONT tags
42 ;; thru' post mortem analysis; due to that:
43 ;; - *FONT-HEURISTIC-P*: new special variable
44 ;; - HTAG-NODE: new structure type
45 ;; - New stack machine output token type :HTAG
46 ;; - TAG-AS-MARKER-P: new predicate
47 ;; - POST-MORTEM-HEURISTIC: new function
48 ;; - PARSE-HTML: use it
49 ;; - POST-MORTEM/FIX-TOP-LEVEL-STRUCTURE: moved code
50 ;; from PARSE-HTML here
52 ;; - HTML-PARSE-FILE, HTML-PARSE-URL: new functions
54 ;; 1999-08-23 GB - EMPTY-ELEMENT-P: new function
55 ;; - SGML-PARSE, TRANSITION: new output token :open/close
56 ;; to accommodate empty elements (like <IMG>, <BR>)
58 ;; 1999-08-19 GB - nuked FAT-WHITE-SPACE-P
59 ;; - nuked U16->STRING (using ROD-STRING now).
60 ;; - nuked VECTOR/U16
61 ;; - use #/.. read syntax instead of #.(char-code) idiom
62 ;; - use RUNE=, RUNE<= et al
63 ;; - use ROD= et al
64 ;; - twixed SUBSEQ/U16 to SUBSEQ/ROD
65 ;; - prays that everything still works.
67 (in-package :SGML)
69 #+:CMU
70 (shadow "CODE-CHAR")
71 #+:CMU
72 (declaim (inline code-char))
73 #+:CMU
74 (defun code-char (code)
75 (if (< code char-code-limit)
76 (cl:code-char code)
77 nil))
79 ;; This is a high-speed implementation of an SGML parser.
81 ;; NOTES
83 ;; o We do our own buffering here, which allows us to access the buffer
84 ;; directly and thus cutting down the number of function calls per character
85 ;; drastically.
87 ;; o The DTD has to be 'compiled' before we drive a little determistic PDA
88 ;; from it.
90 ;;; TODO
92 ;; o das mit dem <body> = full speed functioniert irgentwie nicht
93 ;; richtig ;-( wenn das BODY tag fehlt. -- es kommt nur etwas spaet
94 ;; wegen token lookahead, was aber kein problem sein sollte.
96 ;; o improve error handling
97 ;; - After we have encountered an error while reading a tag nuke everything
98 ;; until '>'.
99 ;; - Maybe?
100 ;; Forgive something like <foo x="10%> and back up until #/" by patching
101 ;; the read buffer
102 ;; - emit warning if somebody writes e.g. <A href=http://foo/bar/baz>,
103 ;; which is illegal
105 ;; o in SGML parser:
106 ;; - account for the SGML inclusion '+(..)' and exclusion '-(..)' feature
107 ;; - implement the HTML idea of CDATA for SCRIPT and (uhm!) XMP.
108 ;; - thanks to the W3C: '<' and '>' are now part of CSS-2 syntax so we
109 ;; need something similar for STYLE also.
111 ;; o merge as much is possible with the XML parser in a modular fashion
113 ;; o scratch-pad hack needs to be reentrant !!!
114 ;; [make xml-parser's implementation of with-rune-collector safe
115 ;; and use that?]
117 ;; o the FORM hack should be there.
118 ;; (we also need an <INPUT> hack).
120 ;; o finally ensure that the rune abstraction is fine.
122 ;; o report errors using exactly one mechanism and include file name
123 ;; (uri) into error messages; better yet: define a error message
124 ;; structure -- in the future it should be possible to have something
125 ;; like an error message listener.
127 ;; o since we use xstreams now morph the eof representation from NIL
128 ;; to :EOF. Or better yet: give the READ-RUNE macro a standard
129 ;; signature including eof-error-p and eof-value. (Wouldn't hurt,
130 ;; since it is a macro).
132 ;; o In DTD abstraction: ELM-INCLUSION needs flag error-p
134 ;; o we need rune-max and therefore are not dependent on UCS-2
135 ;; anymore, should the need araise simply redefine rune-max.
137 ;;; IDEA
139 ;; Build the parse tree out of lazy lists. Incremental rendering would
140 ;; then become cheap to implement.
142 (defparameter *preserves-comments-elements*
143 '(:STYLE :SCRIPT)
144 "List of names of elements, which should preserve (may contain) comment tokens.")
146 (defparameter *font-heuristic-p* t
147 "Whether to handle FONT by heuristic and post mortem processing.")
149 (defparameter *anchor-heuristic-p* nil
150 "Whether to handle A tags by heuristic and post mortem processing.")
152 (defparameter *gt-ends-comment-p* nil
153 "Whether and '>' ends a comment? -- A warning is emitted still.")
155 (defvar *options/parser-silent-p* nil)
156 (defvar *line-number*)
157 (defparameter *parse-warn-level* 5)
160 ;; HTAG = heuristic tag
161 ;; These are unparsed tags, which are inserted verbatim into the parse tree
162 ;; for post mortem analysis.
163 ;; See explaination in FONT element fixer far below.
165 (defstruct (htag-node (:include pt)) ) ;heuristic tag
166 (defstruct (hstag-node (:include htag-node)) ) ;heuristic start tag
167 (defstruct (hetag-node (:include htag-node)) ) ;heuristic end tag
169 ;; The predicate TAG-AS-MARKER-P decides, whether a given tag is
170 ;; inserted as HSTAG or HETAG.
172 ;;;; --------------------------------------------------------------------------
173 ;;;; Predicates and Utilities on Runes
175 ;; Most of these should really go into util.lisp
177 (definline name-start-rune-p (char)
178 (or (rune<= #/a char #/z)
179 (rune<= #/A char #/Z)))
181 (definline name-rune-p (char)
182 (or (name-start-rune-p char)
183 (digit-rune-p char)
184 (rune= char #/.)
185 (rune= char #/-)))
187 (definline sloopy-name-rune-p (char)
188 (or (name-rune-p char)
189 (rune= char #/%)
190 (rune= char #//) ;manche schreiben ganze urls ohne Gaensefuesschen
191 (rune= char #/:)
192 (rune= char #/~)
193 (rune= char #/#) ;farben werden auch gerne genommen
194 (rune= char #/_)
195 (rune= char #/+)
196 (rune= char #/?)
197 (rune= char #/!)
198 (rune= char #/@)
199 ;; ganzer ECMA-Script Code kommt auch vor
200 (rune= char #/\()
201 (rune= char #/\))
202 (rune= char #/')
203 (rune= char #/\")
204 (rune= char #/\;)
205 (rune= char #/,)
206 (rune= char #/[)
207 (rune= char #/])
208 (rune= char #/&) ))
210 (definline sloopy-value-rune-p (char)
211 (or (sloopy-name-rune-p char)
212 (rune= char #/=)))
214 (definline alpha-rune-p (char)
215 (or (rune<= #/a char #/z)
216 (rune<= #/A char #/Z)))
218 (definline upcase-name-rune (rune)
219 (rune-upcase rune))
221 ;;;; --------------------------------------------------------------------------
222 ;;;; Rod Utilities
224 (definline subseq/rod (source start end)
225 ;; Optimized version of subseq for arrays of runes
226 (declare (type rod source)
227 (type fixnum start end))
228 (let ((res (make-rod (- end start))))
229 (declare (type rod res))
230 (do ((i (- (- end start) 1) (the fixnum (- i 1))))
231 ((< i 0) res)
232 (declare (type fixnum i))
233 (setf (%rune res i) (%rune source (the fixnum (+ i start)))))))
235 ;;; ---------------------------------------------------------------------------
236 ;;; Buffered I/O
238 (eval-when (compile load eval)
239 (defparameter *buf-size* 4096))
241 (defmacro a-read-byte (input)
242 (let ((c (gensym)))
243 `(let ((,c (runes:read-rune ,input)))
244 (if (eq ,c :eof) nil ,c))))
246 (defmacro a-peek-byte (input)
247 (let ((c (gensym)))
248 `(let ((,c (runes:peek-rune ,input)))
249 (if (eq ,c :eof) nil ,c))))
251 (defmacro a-unread-byte (byte input)
252 `(runes:unread-rune ,byte ,input))
254 (defmacro a-stream-position (input)
255 `(runes:xstream-position ,input))
257 (defun make-a-stream (&key cl-stream)
258 (runes:make-xstream cl-stream :initial-speed 1 :speed 8192))
260 (defmethod runes::read-octets (sequence (stream html-glisp:gstream) start end)
261 (html-glisp:g/read-byte-sequence sequence stream :start start :end end))
263 (defmethod runes::xstream/close ((stream html-glisp:gstream))
264 (html-glisp:g/close stream))
266 ;; a fake definition -- XXX non-reentrant!
268 (defun a-stream-scratch (input)
269 (getf (runes::xstream-plist input) 'scratch-pad))
271 (defun (setf a-stream-scratch) (new-value input)
272 (setf (getf (runes::xstream-plist input) 'scratch-pad) new-value))
274 ;;;; -------------------------------------------------------------------------
275 ;;;; Reporting Errors
276 ;;;;
278 (defun read-tag-error (input format-string &rest format-args)
279 (apply #'parse-warn input 4 format-string format-args)
280 (throw 'read-tag-error
281 (values :pcdata
282 (string-rod "##BAD TAG##"))))
284 ;;; Warning Levels
285 ;;; --------------
286 ;;; 1 - Absolutely normal mumbleing
287 ;;; 2 - Laziness introduced by the HTML standard
288 ;;; 3 - Semantic error upon parsing attributes
289 ;;; 4 - Accidents happening while parsing the structure
290 ;;; 5 - More serve errors
293 (defun parse-warn (input level fmt &rest args)
294 (let ((*print-pretty* nil)) ;disable ugly^H^H^H^Hpretty printing
295 (when (>= level *parse-warn-level*)
296 (unless *options/parser-silent-p*
297 (let ((preample (format nil ";; Parser warning: ~11@<Line ~D,~> ~11@<column ~D~>: ~5A "
298 (and input (ignore-errors (runes:xstream-line-number input)))
299 (and input (ignore-errors (runes:xstream-column-number input)))
300 (make-string level :initial-element #\*))))
301 (fresh-line *trace-output*)
302 (write-string preample *trace-output*)
303 (with-input-from-string (i (apply #'format nil fmt args))
304 (do ((x (read-line i nil nil) (read-line i nil nil))
305 (firstp t nil))
306 ((null x))
307 (unless firstp
308 (write-string ";; " *trace-output*)
309 (dotimes (i (- (length preample) 3))
310 (write-char #\space *trace-output*)))
311 (write-string x *trace-output*)
312 (terpri *trace-output*))))))))
314 ;;; ---------------------------------------------------------------------------
315 ;;; SGML lexer
317 (defun read-token (input dtd)
318 ;; Reads on token from the stream `input'
319 ;; Returns:
320 ;; :pcdata cdata
321 ;; :start-tag name atts
322 ;; :end-tag name
323 ;; :empty-tag name atts
324 ;; :comment cdata
325 ;; :define-tag cdata
326 ;; :experimental-tag cdata
327 ;; :eof
328 (let ((ch (a-read-byte input)))
329 (cond ((null ch)
330 :eof)
331 ((rune= ch #/<)
332 (read-tag input dtd))
334 (a-unread-byte ch input)
335 (read-pcdata input dtd)) )))
337 (defun push-on-scratch (input sp ch)
338 ;; Push the character ch onto the scratch pad of `input' and enlarge if neccessary
339 (setf (aref (a-stream-scratch input) sp) ch)
340 (setf sp (+ sp 1))
341 (cond ((= sp (length (a-stream-scratch input))) ;end of scratch pad reached?
342 (enlarge-scratch-pad input)))
345 (defun read-pcdata (input dtd)
346 (let* ((scratch (a-stream-scratch input)) ;scratch pad
347 (sp 0) ;pointer into scratch pad
348 (se (length scratch)) ;end of scratch pad
349 ) ;code vector
350 (declare (type (simple-array rune (*)) scratch))
351 (declare (type fixnum sp se))
352 (loop
353 (let ((ch (a-read-byte input)))
354 ;; FIXME: why was this declared as (u-b 8), not (u-b 16)?
355 ;; a-read-byte returns a rune.
356 ;;; (declare (type (or null (unsigned-byte 8)) ch))
357 (declare (type (or null rune) ch))
358 (cond ((null ch) ;eof
359 (return))
360 ((rune= ch #/<) ;end of pcdata
361 (a-unread-byte ch input)
362 (return))
363 ((rune= ch #/&)
364 (setf sp (read-entity-ref input dtd sp)))
366 (setf (aref scratch sp) ch) ;recode character read
367 (setf sp (the fixnum (+ sp 1)))
368 (cond ((= sp se) ;end of scratch pad reached?
369 (enlarge-scratch-pad input)
370 (setf scratch (a-stream-scratch input)
371 se (length scratch))))))))
372 (values :pcdata
373 (subseq/rod scratch 0 sp)) ))
375 (defun read-entity-ref (input dtd sp)
376 ;; Reads an entity reference into the stream's scratch pad from position
377 ;; `sp' upwards.
378 ;; Returns the new write pointer. The initial "&" is already read from the
379 ;; stream.
380 ;; Syntax:
381 ;; entity-ref ::= "&" "#" <digit>+ (";")?
382 ;; entity-ref ::= "&" "#" "x" <hex-digit>+ (";")?
383 ;; entity-ref ::= "&" <name-start> <name-char>* (";")?
384 (let ((ch (a-read-byte input)))
385 (cond ((null ch) ;eof
386 (parse-warn input 3 "EOF in entity")
387 (push-on-scratch input sp #/&))
388 ((rune= ch #/#) ;numeric reference?
389 (read-numeric-entity input sp))
390 ((name-start-rune-p ch) ;named entity?
391 (read-named-entity input dtd sp ch))
393 (parse-warn input 3
394 "Saw character '~A' after '&' -- bad entity reference?!"
395 (or (rune-char ch)
396 (format nil "&#x~4,'0X" ch)))
397 (a-unread-byte ch input) ;it might be something interesting
398 (push-on-scratch input sp #/&)) )))
400 (defun read-numeric-entity (input sp)
401 ;; "&#" already read
402 (let ((ch (a-read-byte input)))
403 (setf sp (push-on-scratch input sp #/&))
404 (setf sp (push-on-scratch input sp #/#))
405 (cond ((null ch) ;eof
406 (parse-warn input 3 "EOF in entity")
409 ((digit-rune-p ch)
410 (read-numeric-entity-aux input (- sp 2) sp 10 ch))
412 ((rune= ch #/x)
413 ;; Hex entity
414 (setf sp (push-on-scratch input sp #/x))
415 (setf ch (a-read-byte input))
416 (cond ((null ch)
417 (parse-warn input 3 "EOF after '&#x'.")
419 ((not (digit-rune-p ch 16))
420 (parse-warn input 3 "Bad character after '&#x'.")
421 (a-unread-byte ch input)
424 (read-numeric-entity-aux input (- sp 3) sp 16 ch))))
426 (a-unread-byte ch input)
427 (parse-warn input 3 "Bad character after '&#'")
428 sp) )))
430 (defun read-numeric-entity-aux (input s0 sp radix ch)
431 ;; Aux routine for read-numeric-entity
432 ;; at s0..sp in the scratch pad is the already read prefix ('&#' or '&#x')
433 ;; Radix is the radix to use (10 or 16)
434 ;; returns new 'sp'
435 ;; 'ch' is the first digit
436 (let ((s1 sp))
437 (setf sp (push-on-scratch input sp ch))
438 (do ((ch (a-read-byte input) (a-read-byte input)))
439 ((or (null ch) (not (digit-rune-p ch radix)))
440 ;; Ok. [s1..sp) now is the digit sequence
441 (let ((num (parse-integer (map 'string #'rune-char
442 (subseq (a-stream-scratch input) s1 sp))
443 :radix radix)))
444 (cond ((<= 0 num #xFFFF)
445 ;; Proper entity value
446 (when (and (not (null ch)) (not (rune= ch #/\;)))
447 (a-unread-byte ch input))
448 ;; Rewind scratch pad to `s0' and push character `num'
449 (setf sp (push-on-scratch input s0 (code-rune num))))
451 ;; num too large; emit warning and leave scratch pad alone
452 (when (not (null ch))
453 (a-unread-byte ch input))
454 (parse-warn input 3 "Numeric enity ~A does not fit into our 16-bit strings; -- ignored."
455 (rod-string (rod-subseq (a-stream-scratch input) s0 sp)))))))
456 (setf sp (push-on-scratch input sp ch)))
457 sp))
459 (defun read-named-entity (input dtd sp ch)
460 ;; Just in case we want to leave the entity alone
461 (let ((s0 sp))
462 (setf sp (push-on-scratch input sp #/&))
463 (let ((s1 sp))
464 (setf sp (push-on-scratch input sp ch))
465 (do ((ch (a-read-byte input) (a-read-byte input)))
466 ((or (null ch) (not (name-rune-p ch)))
467 ;; Ok. [s1..sp) now is the name, try to resolve it
468 (let ((nums (find-named-entity dtd (subseq (a-stream-scratch input) s1 sp))))
469 (cond ((not (null nums))
470 ;; Proper entity value
471 ;; Rewind scratch pad to `s0' and push characters in `nums'
472 (dotimes (i (length nums))
473 (setf sp (push-on-scratch input s0 (aref nums i))))
474 (when (and (not (null ch)) (not (rune= ch #/\;)))
475 (a-unread-byte ch input)))
477 (when (not (null ch))
478 (a-unread-byte ch input))
479 (parse-warn input 3 "[~D] There is no such entity defined: ~A -- ignored."
480 (a-stream-position input)
481 (rod-string (rod-subseq (a-stream-scratch input) s0 sp)))))) )
482 (setf sp (push-on-scratch input sp ch))))
483 sp))
485 (defun find-named-entity (dtd fat-string)
486 (let ((str (rod-string fat-string)))
487 (let ((r (cdr (assoc str (sgml::dtd-entities dtd) :test #'string=))))
488 r)))
490 (defun enlarge-scratch-pad (input)
491 (let* ((old (a-stream-scratch input))
492 (se (length old)))
493 (declare (type fixnum se)
494 (type (simple-array rune (*)) old))
495 (let ((new (make-rod (+ (length (a-stream-scratch input)) *buf-size*))))
496 (declare (type rod new))
497 (do ((i (- se 1) (the fixnum (- i 1))))
498 ((< i 0))
499 (declare (type fixnum i))
500 (setf (aref new i) (aref old i)))
501 (setf (a-stream-scratch input) new))))
503 ;;; ------------------------------------------------------------
506 ;; tag ::= <start-tag> | <end-tag> | <exp-tag> | <comment>
507 ;; end-tag ::= "<" "/" <name> WSP ">"
508 ;; empty-tag ::= "<" <name> <atts> WSP "/" ">"
509 ;; start-tag ::= "<" <name> <atts> WSP ">"
510 ;; exp-tag ::= "<" "?" <any>* ">"
511 ;; comment ::= "<" "!" "-" "-" (<any>* - ("-" "-")) "-" "-" ">"
512 ;; | "<" "!" ">"
513 ;; att ::= <value>
514 ;; | <name> WSP "=" WSP <value>
515 ;; value ::= <literal> | <name>
516 ;; literal ::= """ <char>* """
517 ;; | "'" <char>* "'"
518 ;; atts ::= ( WSP <att> )*
519 ;; WSP ::= <white-space>*
520 ;; name ::= <name-start-char> <name-char>*
521 ;; char ::= <any> | <enitity-ref>
523 (defun read-tag (input dtd)
524 ;; The "<" is already read.
525 (catch 'read-tag-error
526 (let ((ch (a-peek-byte input)))
527 (cond ((rune= ch #//)
528 (a-read-byte input)
529 (read-end-tag input))
530 ((rune= ch #/!)
531 (a-read-byte input)
532 (read-define-tag input dtd))
533 ((rune= ch #/?)
534 (a-read-byte input)
535 (read-experimental-tag input))
536 ((and (not (null ch)) (name-start-rune-p ch))
537 (read-start-tag input dtd))
539 (parse-warn input 3 "Bad character after '<': '~A' -- ignored."
540 (rune-char ch))
541 (let ((res (string-rod "<")))
542 (values :pcdata res))) ))) )
544 (defun read-start-tag (input dtd)
545 (multiple-value-bind (name atts) (read-name-and-attributes input dtd)
546 (let ((ch (a-read-byte input)))
547 (cond ((rune= ch #/>)
548 (values :start-tag name atts))
549 ((rune= ch #/<)
550 (parse-warn input 3
551 "A '<' ended this tag.")
552 (a-unread-byte ch input)
553 (values :start-tag name atts))
554 ((rune= ch #//)
555 (setf ch (a-read-byte input))
556 (cond ((rune= ch #/>)
557 (values :empty-tag name atts))
559 (read-tag-error input "Expected '>' after '<' .. '/'"))))
561 (read-tag-error input "Expected '>'")) ))))
563 (defun read-end-tag (input)
564 (let ((name (read-name input)))
565 (skip-white-space input)
566 (let ((ch (a-read-byte input)))
567 (cond ((null ch)
568 (read-tag-error input "In end tag: Expected '>' got end-of-file instead."))
569 ((rune= ch #/>)
570 (values :end-tag name))
572 (read-tag-error input "In end tag: Expected '>'")) ))))
574 (defun read-name-and-attributes (input dtd)
575 (let ((name (read-name input))
576 (atts nil))
577 (loop
578 (skip-white-space input)
579 (cond ((member (a-peek-byte input) '(#/< #/> #//) :test #'rune=)
580 (return)))
581 (push (read-attribute input dtd) atts))
582 (values name (nreverse atts)) ))
584 (defun read-name (input)
585 (let ((ch (a-peek-byte input))
586 (sp 0))
587 (cond ((and (not (null ch)) (name-start-rune-p ch))
588 (do ((ch (a-read-byte input) (a-read-byte input)))
589 ((not (and ch (name-rune-p ch)))
590 (when ch
591 (a-unread-byte ch input))
592 (subseq/rod (a-stream-scratch input) 0 sp))
593 (setf sp (push-on-scratch input sp (upcase-name-rune ch)))))
595 (read-tag-error input "Not a name")) )))
597 (defun skip-white-space (input)
598 (do ((ch (a-read-byte input) (a-read-byte input)))
599 ((not (and ch (white-space-rune-p ch)))
600 (when ch (a-unread-byte ch input)))))
602 (defun read-attribute (input dtd)
603 (skip-white-space input)
604 (let ((slot (read-sloopy-name input)))
605 ;;(print (list 'slot '= (mungle slot) (mungle (vector (a-peek-byte input)))))
606 (skip-white-space input)
607 (let ((c (a-peek-byte input)))
608 (cond ((and (not (null c)) (rune= c #/=))
609 (a-read-byte input)
610 (skip-white-space input)
611 (let ((value (read-value input dtd)))
612 (cons slot value)))
614 slot)))))
616 (defun read-value (input dtd)
617 (let ((ch (a-peek-byte input)))
618 (cond ((rune= ch #/')
619 (a-read-byte input)
620 (read-literal input dtd ch))
621 ((rune= ch #/\")
622 (a-read-byte input)
623 (read-literal input dtd ch))
624 ((and ch (sloopy-name-rune-p ch))
625 (read-sloopy-value input))
627 (read-tag-error input "Bad value '~A' seen"
628 (or (rune-char ch)
629 (format nil "U+~4,'0X" (rune-code ch))))))))
631 (defun read-literal (input dtd delim)
632 (let* ((scratch (a-stream-scratch input)) ;scratch pad
633 (sp 0) ;pointer into scratch pad
634 (se (length scratch)) ;end of scratch pad
635 ) ;code vector
636 (declare (type rod scratch))
637 (declare (type fixnum sp se))
638 (loop
639 (let ((ch (a-read-byte input)))
640 ;; FIXME: why was this declared as (u-b 8), not (u-b 16)?
641 ;; a-read-byte returns a rune.
642 ;;; (declare (type (or null (unsigned-byte 8)) ch))
643 (declare (type (or null rune) ch))
644 (cond ((null ch) ;eof
645 (read-tag-error input "Eof in literal"))
646 ((rune= ch delim)
647 (return))
648 ((rune= ch #/&)
649 (setf sp (read-entity-ref input dtd sp)))
651 (setf (aref scratch sp) ch) ;recode character read
652 (setf sp (the fixnum (+ sp 1)))
653 (cond ((= sp se) ;end of scratch pad reached?
654 (enlarge-scratch-pad input)
655 (setf scratch (a-stream-scratch input)
656 se (length scratch))))))))
657 (subseq/rod scratch 0 sp) ))
659 (defun read-sloopy-name (input)
660 (let ((ch (a-peek-byte input))
661 (sp 0))
662 (cond ((and (not (null ch)) (sloopy-name-rune-p ch))
663 (do ((ch (a-read-byte input) (a-read-byte input)))
664 ((not (and ch (sloopy-name-rune-p ch)))
665 (when ch
666 (a-unread-byte ch input))
667 (subseq/rod (a-stream-scratch input) 0 sp))
668 (setf sp (push-on-scratch input sp ch))))
670 (read-tag-error input
671 "Expected sloopy name, got ~A"
672 (or (rune-char ch) (format nil "U+~4,'0X" ch)) )) )))
674 (defun read-sloopy-value (input)
675 (let ((ch (a-peek-byte input))
676 (sp 0))
677 (cond ((and (not (null ch)) (sloopy-value-rune-p ch))
678 (do ((ch (a-read-byte input) (a-read-byte input)))
679 ((not (and ch (sloopy-value-rune-p ch)))
680 (when ch
681 (a-unread-byte ch input))
682 (subseq/rod (a-stream-scratch input) 0 sp))
683 (setf sp (push-on-scratch input sp ch))))
685 (read-tag-error input "Expected sloopy value, got ~A"
686 (or (rune-char ch) (format nil "U+~4,'0X" ch)) )) )))
688 (defun read-define-tag (input dtd)
689 (let ((ch (a-peek-byte input)))
690 (cond ((null ch)
691 (read-tag-error input "unexpected EOF"))
692 ((rune= ch #/>)
693 ;; empty define tag -- to be ignored
694 (a-read-byte input)
695 (read-token input dtd))
696 ((rune= ch #/-)
697 ;; comment?
698 (a-read-byte input)
699 (let ((ch (a-peek-byte input)))
700 (cond ((and (not (null ch)) (rune= ch #/-))
701 (read-comment input))
703 (read-tag-error input "Expected '-' after \"<!-\"")))))
705 (read-define-tag-2 input)) )))
707 (defun read-define-tag-2 (input)
708 ;; TODO: Comments
709 ;; we simply slurp until '>'
710 (let ((sp 0))
711 (do ((ch (a-read-byte input) (a-read-byte input)))
712 ((and ch (rune= ch #/>))
713 (values :define-tag (subseq/rod (a-stream-scratch input) 0 sp)))
714 (setf sp (push-on-scratch input sp ch))) ))
716 (defun read-comment (input)
717 (a-read-byte input) ;consume the '-'
718 (let ((c0 0)
719 (c1 (or (a-read-byte input) (read-tag-error input "Unexpected EOF")))
720 (c2 (or (a-read-byte input) (read-tag-error input "Unexpected EOF")))
721 (sp 0)
722 (warned-p nil))
723 (loop
724 (psetq c0 c1
725 c1 c2
726 c2 (a-read-byte input))
727 (cond ((null c2)
728 (read-tag-error input "EOF within comment."))
729 ((and (rune= c0 #/-)
730 (rune= c1 #/-)
731 (rune= c2 #/>))
732 (return))
733 ((and *gt-ends-comment-p*
734 (rune= c2 #/>))
735 (parse-warn input 3 "A '>' ends this comment.")
736 (return)))
737 (cond ((and (rune= c0 #/-) (rune= c1 #/-))
738 (unless warned-p
739 (parse-warn input 4 "\"--\" seen within comment; This is strongly depreciated.")
740 (setf warned-p t))))
741 (setf sp (push-on-scratch input sp c0)))
742 (values :comment (subseq/rod (a-stream-scratch input) 0 sp)) ))
744 ;;;; ------------------------------------------------------------------------------------------
746 (defun name-start-char-p (ch)
747 (alpha-char-p ch))
749 (defun name-char-p (ch)
750 (or (alphanumericp ch) (char= ch #\.) (char= ch #\-)) )
752 (defun valid-name-string-p (string)
753 "Is the string `string' a valid name string according to the SGML
754 conventions?"
755 (and (> (length string) 0)
756 (name-start-char-p (char string 0))
757 (every #'name-char-p string)) )
759 ;;;; ------------------------------------------------------------------------------------------
760 ;;;; Resolving Entities
761 ;;;;
763 ;;;; TODO: Check that numeric entities are within 0..#xFFFF;
765 (defun resolve-numeric-entity (string start end) ; --> string ; new start
766 (let ((j (or (position-if-not #'digit-rune-p string :start start :end end) end)))
767 (values
768 (let ((n (parse-integer (rod-string (rod-subseq string start j)) :radix 10)))
769 (rod n))
770 (if (and (< j end) (rune= (rune string j) #/\;))
771 (+ j 1)
772 j))))
774 (defun resolve-hex-entity (string start end) ; --> string ; new start
775 ;; Resolves a hexadecimal entity like "&#x2A;", start should point
776 ;; to the character directy after the '&#x'.
777 (let ((j (or (position-if-not (rcurry #'digit-rune-p 16) string :start start :end end) end)))
778 (values
779 (let ((n (parse-integer (rod-string (rod-subseq string start j)) :radix 16)))
780 (rod n))
781 (if (and (< j end) (rune= (rune string j) #/\;))
782 (+ j 1)
783 j))))
785 (defun resolve-named-entity (string entities start end &optional input)
786 ;; --> string ; new start
787 (let ((j (or (position-if-not #'name-start-rune-p string :start start :end end) end)))
788 (let ((res
789 (or (dolist (k entities)
790 (when (and (= (length (car k)) (- j start))
791 ;; XXX this compare conses far too much!
792 (rod= (string-rod (subseq (car k) 0 (- j start)))
793 (rod-subseq string start j)))
794 (return (string-rod (cdr k))) ) )
795 (dolist (k entities nil)
796 (when (and (>= (length (car k)) (- j start))
797 ;; XXX dito
798 (rod= (string-rod (subseq (car k) 0 (- j start)))
799 (rod-subseq string start j)))
800 (return (string-rod (cdr k))) )))))
801 (cond ((not (null res))
802 (values
803 (resolve-entities-in-string res entities 0 (length res) input) ;right?
804 (if (and (< j end) (rune= (rune string j) #/\;))
805 (+ j 1)
806 j)))
808 (parse-warn input 3 "Entity &~a; is not defined." (subseq string start j))
809 (values (subseq string (1- start) j) j))))))
811 (defun resolve-entities-in-string (string entities
812 &optional (start 0) (end (length string)) input)
813 ;; Resolve all entity references introduced by "&" in the string
814 ;; `string'. `start' and `end' specify a substring to operate on.
815 ;; For error messages `input' may be the input stream the data is
816 ;; coming from originally.
817 (let ((i (position #/& string :start start :end end :test #'rune=)))
818 (cond ((null i)
819 ;; no further entities in string -- all done
820 (rod-subseq string start end))
821 ((and (< (+ i 2) end)
822 (rune= #/# (rune string (+ i 1)))
823 (digit-rune-p (rune string (+ i 2))))
824 ;; numeric entity seen
825 (multiple-value-bind (res j) (resolve-numeric-entity string (+ i 2) end)
826 (concatenate 'rod (subseq string start i)
828 (resolve-entities-in-string string entities j end input))))
829 ((and (< (+ i 2) end)
830 (rune= #/# (rune string (+ i 1)))
831 (rune= #/x (rune string (+ i 2))))
832 ;; hexadecimal entity seen
833 (multiple-value-bind (res j) (resolve-hex-entity string (+ i 3) end)
834 (concatenate 'rod (subseq string start i)
836 (resolve-entities-in-string string entities j end input))))
837 ((and (< (+ i 1) end)
838 (alpha-rune-p (aref string (+ i 1))))
839 ;; this must be a named entity
840 (multiple-value-bind (res j)
841 (resolve-named-entity string entities (+ i 1) end input)
842 (concatenate 'rod (subseq string start i)
844 (resolve-entities-in-string string entities j end input))))
846 ;; no entity reference.
847 (concatenate 'rod (subseq string start i)
848 (rod #/&)
849 (resolve-entities-in-string string entities (+ i 1) end input))))))
852 ;;;; ------------------------------------------------------------------------------------------
853 ;;;; Mungling of Attribute values
854 ;;;;
856 (let ((kw-pkg (find-package :keyword)))
857 (defun kintern (x)
858 (intern x kw-pkg)))
860 (defun canon-value (input dtd tag slot value)
861 (let* ((attlist (find-element-attlist dtd tag))
862 (looked (assoc slot attlist)))
863 (cond ((and looked (listp (cadr looked)))
864 (or (find value (cadr looked)
865 :test #'(lambda (x y)
866 (string-equal (string x) (string y))))
867 (progn
868 ;; Oh yeah! monster format strings are fun!
869 (parse-warn input 3
870 "~S is a bad value for the '~A' slot of '<~A>', which could ~
871 ~{~#[not take any value~;only take '~A'~:;take one of ~@{'~A'~#[~; or ~:;, ~]~}~]~:}."
872 value slot tag (cadr looked)))))
873 ((member (cadr looked) '(:number))
874 (or (maybe-parse-integer value)
875 (progn
876 (parse-warn input 3 "~S is not NUMBER (attribute '~A' of '<~A>')."
877 value slot tag)
878 nil)))
879 ((member (cadr looked) '(:name :id))
880 (cond ((valid-name-string-p value)
881 (kintern (string-upcase value)))
883 (parse-warn input 3 "~S is not NAME (attribute '~A' of '<~A>')."
884 value slot tag)
885 nil)))
886 (looked
887 value)
889 (parse-warn input 3 "The '<~A>' element has no '~A' slot." tag slot)
890 nil) )))
892 (defun find-slot-value-pair (input dtd tag value)
893 (let* ((attlist (find-element-attlist dtd tag))
894 (looked nil))
895 (dolist (att attlist)
896 (cond ((and (listp (cadr att))
897 (setq looked (find value (cadr att)
898 :test #'(lambda (x y)
899 (string-equal (string x) (string y))))))
900 (return-from find-slot-value-pair (values (car att) looked)))))
901 ;;fall thru'
902 (parse-warn input 3
903 "The '<~A>' tag has no slot which could take the '~A' keyword.~%~
904 ~1{~#[There are no possible slots at all.~;~
905 Only possible slot is:~:;~
906 Possible slots are:~]~
907 ~@{~& ~1{Slot '~A'~20T could ~{~#[not take any value~;~
908 only take '~A'~:;~
909 take one of ~@{'~A'~#[~; or ~:;, ~]~}~].~:}~}~}~:}"
910 tag value (remove-if-not #'(lambda (x) (consp (cadr x))) attlist)) ))
913 ;;;; ------------------------------------------------------------
915 (defun read-token* (input dtd)
916 ;;(skip-white-space input)
917 (multiple-value-bind (kind a b) (read-token input dtd)
918 (ecase kind
919 (:pcdata (make-start-tag :name :pcdata :atts a))
920 (:start-tag
921 (let ((name (kintern (rod-string a))))
922 (if (tag-exists? dtd name)
923 (make-start-tag :name name :atts (mungle-attlist dtd name b))
924 (progn
925 (parse-warn input 4 "There is no such thing as <~A> -- ignored." name)
926 (read-token* input dtd)))))
927 (:end-tag
928 (let ((name (kintern (rod-string a))))
929 (if (tag-exists? dtd name)
930 (make-end-tag :name name)
931 (progn
932 (parse-warn input 4 "There is no such thing as </~A> -- ignored." name)
933 (read-token* input dtd)))) )
934 (:empty-tag
935 (parse-warn input 2 "Oops -- there is an empty tag; but this is only HTML?!")
936 (let ((name (kintern (rod-string a))))
937 (if (tag-exists? dtd name)
938 (make-start-tag :name name :atts (mungle-attlist dtd name b))
939 (progn
940 (parse-warn input 4 "There is no such thing as <~A> -- ignored." name)
941 (read-token* input dtd))))
942 ;;(read-token* input dtd)
944 (:define-tag (read-token* input dtd))
945 (:experimental-tag
946 (parse-warn input 2 "Ignoreing processing instruction tag: '~A'" (mungle a))
947 (read-token* input dtd))
948 (:comment
949 (make-comment-token :data a))
950 (:eof
951 (make-end-tag :name :%top)) )))
953 (defun tag-exists? (dtd name)
954 (and
955 ;;(not (eq name :font)) ;xxx
956 ;;(not (eq name :center)) ;xxx
957 ;;(not (eq name :div)) ;xxx
958 ;;(not (eq name :img))
959 ;;(not (eq name :form))
960 (not (eq name :noscript)) ;needed for www.sgi.com
961 (sgml::find-element dtd name nil nil)))
963 (defun foofoo (r)
964 (cond ((integerp r) (string-rod (prin1-to-string r)))
965 ((symbolp r) (string-rod (princ-to-string r)))
966 ((stringp r) (string-rod r))
968 (error "foofoo: Hmm ~S ?!" r))))
970 ;;; The renderer might depend on upper-case attribute values, so let's leave
971 ;;; this off by default. For the benefit of html <-> xml conversions we
972 ;;; don't want to check the DTD every time we convert an attribute though,
973 ;;; so we need this mode for lower-case attribute values.
974 (defvar *unmungle-attribute-case* nil)
976 (defun mungle-attlist (dtd tag atts)
977 (mapcan (lambda (x)
978 (cond ((atom x)
979 ;; this clause isn't unicode-safe
980 (multiple-value-bind (slot value)
981 (sgml::find-slot-value-pair nil dtd tag (mungle x))
982 (when value
983 (setf value (foofoo value))
984 (when *unmungle-attribute-case*
985 (setf value (rod-downcase value))))
986 (and slot
987 (list slot value))))
989 (let ((slot (kintern (string-upcase (mungle (car x))))))
990 (list slot (cdr x))))))
991 atts))
993 (defun read-experimental-tag (input)
994 ;; TODO: Comments
995 ;; we simply slurp until '>'
996 (let ((sp 0))
997 (do ((ch (a-read-byte input) (a-read-byte input)))
998 ((and ch (rune= ch #/>))
999 (values :experimental-tag (subseq/rod (a-stream-scratch input) 0 sp)))
1000 (setf sp (push-on-scratch input sp ch))) ))
1003 ;;; ---------------------------------------------------------------------------
1004 ;;; The PDA
1007 (defun handle-meta-tag-in-parser (input attrs)
1008 (when (rod-equal (string-rod "content-type") (getf attrs :http-equiv))
1009 (let ((content-type (getf attrs :content)))
1010 (and content-type
1011 (multiple-value-bind (type subtype parameters)
1012 (closure-mime-types:parse-mime-content-type
1013 (rod-string content-type))
1014 (declare (ignore type subtype))
1015 (let ((cs (assoc :charset parameters :test #'string-equal)))
1016 (when cs
1017 (setup-code-vector input (cdr cs)))))))))
1019 (defun sgml-parse (dtd input)
1020 (let* ((stack (list (make-start-tag :name :%top :atts nil)))
1021 (s (sgml::make-pt/low :name 'top))
1022 (r s)
1023 (eof? nil)
1024 (eingabe nil)
1025 ausgabe)
1026 (loop
1027 (do ()
1028 ((or eof? (not (null (cdr eingabe)))))
1029 (let ((tok (read-token* input dtd)))
1030 (when (and (end-tag-p tok) (eq (tag-name tok) :%top))
1031 (setf eof? t))
1032 (setf eingabe (nconc eingabe (list tok)))))
1033 (multiple-value-setq (stack eingabe ausgabe) (transition input dtd stack eingabe))
1034 (cond ((eq ausgabe :accept)
1035 (return))
1037 ((eq ausgabe :error)
1038 (return))
1040 ((eq ausgabe :close)
1041 (setf s (sgml:pt-parent s)))
1043 ((eq (car ausgabe) :comment)
1044 (setf (sgml:pt-children s)
1045 (nconc (sgml:pt-children s) (list (sgml::make-pt/low
1046 :name (cadr ausgabe)
1047 :attrs (caddr ausgabe)
1048 :children nil
1049 :parent s)))))
1051 ((eq (car ausgabe) :open)
1052 ;; Hack here to support <meta http-equiv="Content-Type" ...>
1053 (cond ((and (eq (cadr ausgabe) :meta))
1054 (handle-meta-tag-in-parser input (caddr ausgabe))))
1055 ;; when the BODY tag is openend, switch the streams speed to full speed.
1056 (cond ((and (eq (cadr ausgabe) :body))
1057 (setf (runes::xstream-speed input)
1058 (length (runes::xstream-os-buffer input)))))
1059 (let ((n (sgml::make-pt/low
1060 :name (cadr ausgabe)
1061 :attrs (caddr ausgabe)
1062 :children nil
1063 :parent s)))
1064 (setf (sgml:pt-children s) (nconc (sgml:pt-children s) (list n))
1065 s n) ))
1067 ((eq (car ausgabe) :open/close)
1068 ;; code duplication alert!
1069 ;; Hack here to support <meta http-equiv="Content-Type" ...>
1070 (cond ((and (eq (cadr ausgabe) :meta))
1071 (handle-meta-tag-in-parser input (caddr ausgabe))))
1072 (let ((n (sgml::make-pt/low
1073 :name (cadr ausgabe)
1074 :attrs (caddr ausgabe)
1075 :children nil
1076 :parent s)))
1077 (setf (sgml:pt-children s) (nconc (sgml:pt-children s) (list n))
1078 s n) )
1079 (setf s (sgml:pt-parent s)))
1081 ((eq (car ausgabe) :htag)
1082 ;; code duplication alert!
1083 (let* ((v (cadr ausgabe))
1084 (n (cond ((start-tag-p v)
1085 (make-hstag-node
1086 :name (tag-name v)
1087 :attrs (start-tag-atts v)
1088 :children nil
1089 :parent s))
1090 ((end-tag-p v)
1091 (make-hetag-node
1092 :name (tag-name v)
1093 :attrs nil
1094 :children nil
1095 :parent s))
1097 (error "fix your code.")))))
1098 (setf (sgml:pt-children s) (nconc (sgml:pt-children s) (list n))) ))
1100 r) )
1102 (defun empty-element-p (dtd gi)
1103 (null (elm-inclusion dtd gi)))
1105 (defun transition (input dtd stack eingabe) ;; --> stack', eingabe', ausgabe
1106 (cond ((and stack (eq (tag-name (car stack)) :pcdata))
1107 (values (cdr stack)
1108 eingabe
1109 :close))
1110 ((null eingabe)
1111 (cond ((null stack)
1112 (values nil nil :accept))
1114 (values stack eingabe :error))))
1115 ;; (aS, </a>W) -> (S, W, </a>)
1116 ((comment-token-p (car eingabe))
1117 (values stack
1118 (cdr eingabe)
1119 (if (member (tag-name (car stack)) *preserves-comments-elements*)
1120 (progn
1121 (list :comment :pcdata (comment-token-data (car eingabe))))
1122 nil)))
1124 ((and (tag-p (car eingabe))
1125 (tag-as-marker-p (tag-name (car eingabe))))
1126 (values stack
1127 (cdr eingabe)
1128 (list :htag (car eingabe))))
1130 ((and (end-tag-p (car eingabe))
1131 stack
1132 (eq (tag-name (car stack))
1133 (tag-name (car eingabe))))
1134 (values (cdr stack)
1135 (cdr eingabe)
1136 :close))
1138 ((and (start-tag-p (car eingabe))
1139 stack
1140 (member (tag-name (car eingabe)) (elm-inclusion dtd (tag-name (car stack)))) )
1141 (cond
1142 ((empty-element-p dtd (tag-name (car eingabe)))
1143 (values stack
1144 (cdr eingabe)
1145 (list :open/close (tag-name (car eingabe)) (start-tag-atts (car eingabe)))))
1147 (values (cons (car eingabe) stack)
1148 (cdr eingabe)
1149 (list :open (tag-name (car eingabe)) (start-tag-atts (car eingabe)))))))
1151 ((and (white-space-token-p (car eingabe))
1152 stack
1153 (not (member :pcdata (elm-inclusion dtd (tag-name (car stack))))))
1154 ;; ignorieren
1155 (values stack (cdr eingabe) nil))
1156 ((null stack)
1157 (error "Oops empty stack in TRANSITION on ~S." eingabe))
1159 (let ((x (resolve dtd (tag-name (car stack)) (car eingabe))))
1160 (if x
1161 (values stack
1162 (cons x eingabe)
1163 nil)
1164 (values stack
1165 (heuristic input dtd (car stack) eingabe)
1166 nil)))) ))
1168 (defun tag-as-marker-p (gi)
1169 (and *font-heuristic-p*
1170 (eq gi :font))
1171 (and *anchor-heuristic-p*
1172 (eq gi :a)) )
1174 (defun white-space-token-p (x)
1175 (and (start-tag-p x)
1176 (eq (tag-name x) :pcdata)
1177 (every #'white-space-rune-p (start-tag-atts x))))
1179 ;;; Heuristic conflict resolution
1181 (defun shortest-different-beginning (x y)
1182 (do ((q x (cdr q))
1183 (i 0 (+ i 1)))
1184 ((null q))
1185 (do ((p y (cdr p))
1186 (j 0 (+ j 1)))
1187 ((null p))
1188 (if (eq p q)
1189 (return-from shortest-different-beginning
1190 (values (subseq x 0 i)
1191 (subseq y 0 j)))))))
1193 (defun document-action (input context alte-eingabe neue-eingabe is-default-p)
1194 (let ((see (car alte-eingabe)))
1195 (parse-warn input 4 "[~A] Saw ~A in ~A ~A"
1196 (if is-default-p "-" "H") see context
1197 (multiple-value-bind (a b) (shortest-different-beginning alte-eingabe neue-eingabe)
1198 (cond ((and (null a) (null b))
1199 (format nil "-- ??? patched ~S -> ~S" alte-eingabe neue-eingabe))
1200 ((null b)
1201 (format nil "-- nuked~{ ~A~}." a))
1202 ((null a)
1203 (format nil "-- inserted~{ ~A~}." b))
1205 (format nil "-- patched~{ ~A~} ->~{ ~A~}." a b)) )) )))
1207 (defun heuristic (input dtd context eingabe)
1208 (let ((see (car eingabe))
1209 (is-default-p nil))
1210 (labels ((is (tag state)
1211 (and (elms-eqv dtd state (tag-name context))
1212 (if (char= (char (symbol-name tag) 0) #\/)
1213 (and (end-tag-p (car eingabe))
1214 (elms-eqv dtd
1215 (tag-name (car eingabe))
1216 (kintern (subseq (symbol-name tag) 1))))
1217 (and (start-tag-p (car eingabe))
1218 (elms-eqv dtd (tag-name (car eingabe)) tag))))))
1219 (let ((neu
1220 (cond ((and (member :HTML (find-dtd-top-elements dtd))
1221 (cond ((and (end-tag-p see) (eq (tag-name see) :%top))
1222 (cons (elm-etag (tag-name context)) eingabe))
1224 ((and (start-tag-p see)
1225 (eq (tag-name see) :style))
1226 (cons (make-start-tag :name :SPAN
1227 :atts (list :class (rod "illegalstyle")))
1228 (cdr eingabe)))
1230 ((is :center #|in|# :h1)
1231 (list* (elm-etag (tag-name context))
1232 (car eingabe) context (cdr eingabe)))
1234 #+(OR)
1235 ;; this one for KMP
1236 ((is :h2 #|in|# :a)
1237 (list* (elm-etag (tag-name context)) (car eingabe) context (cdr eingabe)))
1239 ((or (is :center #|in|# :font)
1240 (is :p #|in|# :font))
1241 ;; Uff -- the attributes of FONT are lost here.
1242 ;; we have to extend `context' to include these.
1243 (list* (elm-etag (tag-name context)) (car eingabe) context (cdr eingabe)))
1244 ((is :hr #|in|# :i)
1245 (list* (elm-etag (tag-name context)) (car eingabe) context (cdr eingabe)))
1246 ((elms-eqv dtd (tag-name context) ':font)
1247 (cons (elm-etag (tag-name context)) eingabe))
1248 ((is :tr #|in|# :center)
1249 (cons (elm-etag (tag-name context)) eingabe))
1250 ((is :/h2 #|in|# :h1)
1251 (cons (elm-etag (tag-name context)) eingabe))
1252 ((is :ul #|in|# :h1)
1253 (cons (elm-etag (tag-name context)) eingabe))
1254 ((is :/center #|in|# :h3)
1255 (cons (elm-etag (tag-name context)) eingabe))
1256 ((is :/td #|in|# :div)
1257 (cons (elm-etag (tag-name context)) eingabe))
1258 ((is :p #|in|# :ul)
1259 (cons (make-start-tag :name :li) eingabe))
1260 ((is :a #|in|# :ul)
1261 (cons (make-start-tag :name :li) eingabe))
1262 ((is :img #|in|# :ul)
1263 (cons (make-start-tag :name :li) eingabe))
1265 ((is :pcdata #|in|# :ul)
1266 (cons (make-start-tag :name :li) eingabe))
1268 ((is :td #|in|# :table)
1269 (list* (make-start-tag :name :tr)
1270 eingabe))
1271 ((is :pcdata #|in|# :table)
1272 (list* (elm-etag (tag-name context)) (car eingabe) context
1273 (cdr eingabe)))
1275 ((is :frameset #|in|# :noscript)
1276 (list* (elm-etag (tag-name context)) eingabe))
1278 ((is :form #|in|# :table)
1279 (list* (elm-etag (tag-name context))
1280 (car eingabe)
1281 context
1282 (cdr eingabe)))
1284 ((is :/form #|in|# :tbody)
1285 ;; we should better check here wether FORM is open at all.
1286 (list* (cadr eingabe)
1287 (car eingabe)
1288 (cddr eingabe)))
1290 ;; new as of 1999-08-31
1291 ((is :td #|in|# :li)
1292 (list* (elm-etag (tag-name context))
1293 eingabe))
1294 ((is :/td #|in|# :li)
1295 (list* (elm-etag (tag-name context))
1296 eingabe))
1297 ((is :td #|in|# :ul)
1298 (list* (elm-etag (tag-name context))
1299 eingabe))
1300 ((is :/td #|in|# :ul)
1301 (list* (elm-etag (tag-name context))
1302 eingabe))
1306 (setq is-default-p t)
1307 (cdr eingabe)) )))
1308 (document-action input (tag-name context) eingabe neu is-default-p)
1309 neu))))
1311 (defun parse-html (input &optional (charset :iso-8859-1))
1312 (let ((dtd closure-html:*html-dtd*))
1313 (let ((input (runes:make-xstream input :initial-speed 1 :speed 128)))
1314 (setf (a-stream-scratch input)
1315 (make-array #.(* 2 *buf-size*) :element-type 'rune))
1316 (setup-code-vector input charset)
1317 (let ((r (sgml-parse dtd input)))
1318 (post-mortem-heuristic dtd r)))) )
1320 (defun post-mortem-heuristic (dtd parse-tree)
1321 "Do possible post mortem heuristic on a parse tree."
1322 (when *font-heuristic-p*
1323 (setf parse-tree (post-mortem/fix-font dtd parse-tree)))
1324 (setf parse-tree (post-mortem/fix-top-level-structure parse-tree))
1325 parse-tree)
1327 (defun post-mortem/fix-top-level-structure (parse-tree)
1328 ;; The hacking below is needed because of buggy input. Something like this
1329 ;; <base ..> <html> <head> .. <body> ... </html>
1330 ;; are two documents in one. Here we merge then into one HTML document.
1331 ;; Note that this defeats later incremental rendering pretty well.
1332 ;; Also this is HTML specific.
1333 ;; Das muss dennoch alles noch anders werden hier.
1334 (let ((r parse-tree)
1335 (head-elts nil)
1336 (body-elts nil)
1337 (frameset-elts nil)
1338 (head-warn-flag nil)
1339 (body-warn-flag nil)
1340 (frameset nil)
1341 (body nil))
1342 (when (> (length (pt-children r)) 1)
1343 (parse-warn nil 4 "Multiple HTML elements in document."))
1344 (dolist (k (pt-children r))
1345 (ecase (gi k)
1346 (:html
1347 (dolist (k (pt-children k))
1348 (ecase (gi k)
1349 (:head
1350 (when head-elts
1351 (setf head-warn-flag t))
1352 (setf head-elts (nconc head-elts (pt-children k))))
1353 ((:body)
1354 (setq body k)
1355 (when body-elts
1356 (setf body-warn-flag t))
1357 (setf body-elts (nconc body-elts (pt-children k))))
1358 ((:frameset)
1359 (setq frameset k)
1360 (setf frameset-elts (nconc frameset-elts (pt-children k))) ))))))
1361 (when head-warn-flag
1362 (parse-warn nil 4 "Multiple HEAD elements."))
1363 (when body-warn-flag
1364 (parse-warn nil 4 "Multiple BODY elements."))
1365 (let* ((html (make-pt/low :name :html :parent nil))
1366 (head (make-pt/low :name :head :parent html :children head-elts)))
1367 (cond (frameset
1368 (cond (body
1369 (parse-warn nil 4 "Body present while FRAMSET is present -- nuked!")
1370 (setf body nil body-elts nil))
1372 (setf (pt-parent frameset) html
1373 (pt-children frameset) frameset-elts)
1376 (cond (body
1377 (setf (pt-parent body) html
1378 (pt-children body) body-elts))
1380 (setf body (make-pt/low :name :body
1381 :parent html
1382 :children body-elts))))))
1383 (dolist (k head-elts) (setf (pt-parent k) head))
1384 (dolist (k body-elts) (setf (pt-parent k) body))
1385 (dolist (k frameset-elts) (setf (pt-parent k) frameset))
1386 (setf (pt-children html)
1387 (append (and head (list head))
1388 (and body (list body))
1389 (and frameset (list frameset))))
1390 html) ))
1392 ;;; FONT post mortem heuristic
1393 ;;; ==========================
1395 ;; Since FONT is by far the most often misused element, we handle these by a
1396 ;; post mortem analysis. If *font-heuristic-p* is true, parser does not care
1397 ;; for FONT start or end tags, but inserts HSTAG-NODEs and HETAG-NODEs into
1398 ;; the parse tree blindly. (Thus emulating the Mosaic approach to rendering).
1400 ;; Example
1401 ;; -------
1403 ;; <p> <font> foo <b> bar </font> baz </b>
1405 ;; is then parsed as:
1407 ;; (P #<HSTAG font> "foo" (B "bar" #<HETAG font> "baz"))
1410 ;; We now always want to return a parse tree, which conforms to the DTD, and
1411 ;; thus have to mungle this somehow into a sane HTML parse tree by wraping
1412 ;; FONT nodes around the right parts.
1414 ;; The first thing we do is to find all pairs of HSTAG, HETAG
1415 ;; nodes. We then use the following algorithm:
1417 ;; Algorithm
1418 ;; ---------
1420 ;; S = HSTAG node
1421 ;; E = HETAG node
1423 ;; if S and E are on the same level then // [*] that is E, S have the same parent
1424 ;; p <- S.parent
1425 ;; ;; partitionate p.children as:
1426 ;; p.children = (,@sb S ,@si E ,@se)
1427 ;; if si = () then
1428 ;; ;; FONT element spans nothing, so forget it
1429 ;; else
1430 ;; if p may contain FONT and
1431 ;; for all x in si: FONT may contain x
1432 ;; then
1433 ;; p.children <- (,@sb (font ,@si) ,@se)
1434 ;; all done
1436 ;; if S is higher in tree than E then
1437 ;; V <- ancestor of E, with V.parent = S.parent
1438 ;; insert a copy of E directly before E
1439 ;; insert a copy of S directly before first child of V
1440 ;; apply the algorithm recursively
1442 ;; if E is higher in tree than S then
1443 ;; ;; this analog to the case above
1444 ;; V <- ancestor of S, with V.parent = E.parent // this ancestor need not to exist!
1445 ;; insert a copy of S directly after V
1446 ;; insert a copy of E direclly after last child of V
1447 ;; apply the algorithm recursively
1450 ;; [*] This is not right. This was an thinko: Two nodes on the same
1451 ;; level do not always have the parent nodes.
1453 ;; NOTE: My first formulation of the algorithm used access pathen and the
1454 ;; implementation below does also -- this is inefficient and should be
1455 ;; changed.
1457 ;; NOTE: It would be nice, if we could prove that switching on
1458 ;; *FONT-HEURISTIC-P* does not hurd any conforming document.
1460 ;; When the need araises, we could also use the same method to deal
1461 ;; malicious B, I, et al tags; But bad nesting isn't that popular any more
1462 ;; these days.
1464 ;; Expirience however showed, that we would need something similar for FORM,
1465 ;; since people have the habit of spitting a FORM anywhere they see
1466 ;; fit. [Most popular is right between TR and TD]. Visit www.deja.com and you
1467 ;; see what I mean. FORM elements are extremely important, since one may be
1468 ;; able to cope with gliberish on the screen, but not with non-functional
1469 ;; forms. Another thing here is <INPUT> elements in inlegal spots.
1471 ;;; TODO
1473 ;; We still want to emit parser warnings, if FONT elements are misused.
1475 ;;;(defun post-mortem/fix-font (dtd parse-tree)
1476 ;;; (declare (special q))
1477 ;;; (map-htag-pairs (lambda (stag etag)
1478 ;;; (let (s e)
1479 ;;; (setf s (pt-path parse-tree stag))
1480 ;;; (remove-pt stag)
1481 ;;; (setf e (pt-path parse-tree etag))
1482 ;;; (remove-pt etag)
1483 ;;; (mungle-font-pair dtd parse-tree stag s e)))
1484 ;;; parse-tree :font)
1485 ;;; parse-tree)
1487 (defun mungle-font-pair (dtd root tag s e)
1488 ;; NOTE: ignore-errors is needed, since ELM-INCLUSION checks for existing
1489 ;; element names (this is introduced by our pseudo TOP element).
1490 (cond ((equal s e))
1491 ((and (= (length s) (length e))
1492 (equal (butlast s) (butlast e)))
1493 ;; implicit assert was (butlast s) equal (butlast e), which is not always true.
1494 (let ((p (pt-access root (butlast s))))
1495 (cond ((and (member :font (ignore-errors (elm-inclusion dtd (gi p))))
1496 (do ((j (car (last s)) (+ j 1)))
1497 ((>= j (car (last e))) t)
1498 (when (not (null (elt (pt-children p) j)))
1499 ;; this should always be true!
1500 (unless (member (gi (elt (pt-children p) j))
1501 (ignore-errors (elm-inclusion dtd :font)))
1502 (return nil)))))
1503 (let* ((before (loop for i from 0 to (1- (car (last s)))
1504 collect (elt (pt-children p) i)))
1505 (between (loop for i from (car (last s)) to (1- (car (last e)))
1506 collect (elt (pt-children p) i)))
1507 (after (loop for i from (car (last e)) to (1- (length (pt-children p)))
1508 collect (elt (pt-children p) i)))
1509 (new (make-pt/low :name (gi tag)
1510 :attrs (pt-attrs tag)
1511 :children between
1512 :parent p)))
1513 (setf (pt-children p) (append before (list new) after))
1514 (dolist (k between)
1515 (setf (pt-parent k) new)) ))
1517 (loop for i from (car (last s)) to (1- (car (last e))) do
1518 (mungle-font-pair
1519 dtd root tag
1520 (append (butlast s) (list i 0))
1521 (append (butlast s)
1522 (list i (length (pt-children (elt (pt-children p) i)))))))))))
1523 ((and (< (length s) (length e))
1524 '(equal (subseq e (length s)) s))
1525 (let ((v (subseq e 0 (length s))))
1526 (mungle-font-pair dtd root tag (append v (list 0)) e)
1527 (mungle-font-pair dtd root tag s v)))
1528 ((and (> (length s) (length e))
1529 '(equal (subseq s (length e)) e))
1530 (let* ((v (subseq s 0 (length e)))
1531 (m ))
1532 (mungle-font-pair dtd root tag
1533 (append (butlast v) (list (+ 1 (car (last v))))) e)
1534 (setf m (length (pt-children (pt-access root v))))
1535 (mungle-font-pair dtd root tag
1536 s (append v (list m)))))
1538 (let (p)
1539 ;; search to first common parent
1540 (do ((i 0 (+ i 1)))
1541 ((or (= i (length s))
1542 (= i (length e))
1543 (not (eq (elt s i) (elt e i))))
1544 (setf p (subseq s 0 i))))
1545 (let* ((v1 (append (subseq s 0 (length p))
1546 (list (+ 1 (elt s (length p))))))
1547 (v2 v1))
1548 (mungle-font-pair dtd root tag s v1)
1549 (mungle-font-pair dtd root tag v2 e))))
1552 (defun map-htag-pairs (continuation parse-tree gi)
1553 ;; this function maps the inner htags first.
1554 (let ((stack nil))
1555 (labels ((walk (x)
1556 (cond ((and (hstag-node-p x) (eq (gi x) gi))
1557 (push x stack))
1558 ((and (hetag-node-p x) (eq (gi x) gi))
1559 (cond ((null stack)
1560 (remove-pt x)
1561 (parse-warn nil 4 "Superfluous ~A end tag." gi))
1563 (funcall continuation (pop stack) x))))
1565 (mapc #'walk (pt-children x))))))
1566 (walk parse-tree)
1567 ;; take care for non-closed start tags
1568 (unless (null stack)
1569 (parse-warn nil 4 "Unclosed ~A elements." gi)
1570 (let ((new (mapcar (lambda (x)
1571 (declare (ignore x))
1572 (make-hetag-node :name gi
1573 :attrs nil
1574 :parent parse-tree
1575 :children nil))
1576 stack)))
1577 (setf (pt-children parse-tree)
1578 (append (pt-children parse-tree) new))
1579 (mapc continuation stack new))) )))
1581 ;;; FORM post mortem heuristic
1582 ;;; ==========================
1584 ;; FORM is another element, which some authors get incredible
1585 ;; wrong. We deal with this in a similar way;
1587 ;; while we parse HTAG are put into the parse tree, we sort it out
1588 ;; after we are done parsing; There are two possibilities:
1590 ;; a. We are able to insert FORM in a sane way.
1591 ;; b. We are not able to do this
1593 ;; How is b. now handled? I see two solutions:
1595 ;; - basically leave HTAGs in the tree.
1596 ;; - permitt multiple FORM elements and connect them via some special
1597 ;; attribute.
1601 (defun setup-code-vector (input charset)
1602 (let ((enc (runes-encoding:find-encoding charset)))
1603 (cond ((not (null enc))
1604 (setf (runes:xstream-encoding input) enc))
1606 (parse-warn input 4 "There is no such encoding: ~S." charset)))))
1608 (defun sgml-parse-file (filename dtd)
1609 (with-open-file (input filename :direction :input :element-type '(unsigned-byte 8))
1610 (sgml-parse dtd (make-a-stream :cl-stream (cl-byte-stream->gstream input)))))
1612 (defun html-parse-file (filename)
1613 (with-open-file (input filename :direction :input :element-type '(unsigned-byte 8))
1614 (parse-html (cl-byte-stream->gstream input))))
1617 (defun html-parse-url (url)
1618 (unless (url:url-p url) (setf url (url:parse-url url)))
1619 (netlib:with-open-document ((input mime-type) url)
1620 mime-type
1621 (parse-html input)))
1624 (defun check-saneness (pt &optional (dtd closure-html:*html-dtd*))
1625 (dolist (k (pt-children pt))
1626 (unless (member (gi k) (elm-inclusion dtd (gi pt)))
1627 (warn "Unallowed ~A element within ~A." (gi k) (gi pt)))
1628 (unless (eq (pt-parent k) pt)
1629 (warn "Parent/child linkage broken."))
1630 (cond ((htag-node-p k)
1631 (warn "HTAG node (~S) left in parse tree. (parent ~S)" (gi k) (gi pt))))
1632 (check-saneness k)))
1634 ;;;(defun post-mortem/fix-font (dtd parse-tree)
1635 ;;; ;; das hatten wir schon, so geht das nicht -- oder nur sehr schwer.
1636 ;;; dtd
1637 ;;; (let ((font-stack nil)) ;stack of open HTAG FONT elements
1638 ;;; (labels ((walk (pt)
1639 ;;; (cond ((hstag-node-p pt)
1640 ;;; (push pt font-stack)
1641 ;;; nil)
1642 ;;; ((hetag-node-p pt)
1643 ;;; (pop font-stack)
1644 ;;; nil)
1645 ;;; (t
1646 ;;; (cond ((not (null font-stack))
1647 ;;; ;; some fonts are open
1648 ;;; ;; warp them around this node
1649 ;;;
1650 ;;; )
1651 ;;; (t
1652 ;;; (let ((new-children (mapcan #'walk (pt-children pt))))
1653 ;;; (dolist (k new-children)
1654 ;;; (setf (pt-parent k) pt))
1655 ;;; (setf (pt-children pt) new-children)
1656 ;;; (list pt))) )) )))
1657 ;;; (car (walk parse-tree)) )))
1659 (defun post-mortem/fix-font (dtd parse-tree)
1660 (declare (special q))
1661 (let ((pairs nil))
1662 ;; erstmal alle font tag paare suchen
1663 (map-htag-pairs (lambda (stag etag)
1664 (push (cons stag etag) pairs))
1665 parse-tree :font)
1666 (setf pairs (reverse pairs))
1667 ;; dann alle raus nehmen
1668 '(dolist (k pairs)
1669 (remove-pt (car k))
1670 (remove-pt (cdr k)))
1671 ;; dann alle munglen
1672 (dolist (pair pairs)
1673 (let ((stag (car pair)) (etag (cdr pair)) s e)
1674 (setf s (pt-path parse-tree stag))
1675 (assert (eq (pt-access parse-tree s) stag))
1676 (remove-pt stag)
1677 (setf e (pt-path parse-tree etag))
1678 (assert (eq (pt-access parse-tree e) etag))
1679 (remove-pt etag)
1680 (cond ((and s e)
1681 (mungle-font-pair dtd parse-tree stag s e))
1683 (warn "Hmm..."))))))
1684 parse-tree)
1686 ;;; ===========================================================================
1687 ;;; 'A' Heuristic
1690 ;; We want to match Netscape's behaviour and thus by experimentation,
1691 ;; we conclude:
1693 (defun shortest-path-to (dtd pathen goal &optional (max-depth 10))
1694 (cond ((<= max-depth 0)
1695 nil)
1696 ((find-if (lambda (p)
1697 (eql (car p) goal))
1698 pathen))
1700 (shortest-path-to dtd
1701 (mapcan (lambda (p)
1702 (mapcar (lambda (s)
1703 (cons s p))
1704 (elm-surclusion dtd (first p))))
1705 pathen)
1706 goal
1707 (1- max-depth))) ))
1712 ;; Start Tag anaylis
1714 (defun blah (offending)
1715 (with-open-file (sink "/tmp/t.html"
1716 :direction :output
1717 :if-exists :new-version)
1718 (let ((dtd closure-html:*html-dtd*))
1719 (let ((p (shortest-path dtd :BODY offending)))
1720 (let ((p2 (shortest-path dtd offending :PCDATA)))
1721 (format sink "~A<BR>~%" offending)
1722 (when (and p p2)
1723 (pop p) ;forget BODY
1724 (pop p2)
1725 (setq p2 (butlast p2))
1726 ;; pre-material
1727 (dolist (k (butlast p)) (format sink "<~A>~%" k))
1728 (format sink "<A href='xxx'>~%")
1729 (format sink "<img src='/tmp/a.gif'>A")
1730 (format sink "<~A>~%" (car (last p)))
1731 (dolist (k p2) (format sink "<~A>~%" k))
1732 (format sink "<img src='/tmp/b.gif'>B")
1733 (dolist (k (reverse p2)) (format sink "</~A>~%" k))
1734 (format sink "</~A>~%" (car (last p)))
1735 (format sink "<img src='/tmp/c.gif'>C")
1736 (dolist (k (reverse (butlast p))) (format sink "</~A>~%" k))
1737 ))))))
1739 (defun open-in-netscape (url)
1740 (glisp:run-unix-shell-command
1741 (format nil "my-netscape -remote 'openURL(~A)'" url)))
1743 (defun bluu ()
1744 (let ((i 0))
1745 (dolist (off (all-elms closure-html:*html-dtd*))
1746 (cond (t '(or (member :B (elm-inclusion closure-html:*html-dtd* off))
1747 (member :P (elm-inclusion closure-html:*html-dtd* off)))
1748 (blah off)
1749 (format T "~&;; ~A" off)
1750 (open-in-netscape "file:/tmp/t.html")
1751 (sleep 1)
1752 (let ((nam (format nil "/tmp/~A.gif" off)))
1753 (glisp:run-unix-shell-command (format nil "(my-xwd -id 0x3c003bf | xwdtopnm | ppmtogif) 2>/dev/null > ~A &" nam)))
1754 (sleep 1))
1756 (format T "~&;; Skipping ~A, because inclusion is ~A."
1757 off (elm-inclusion closure-html:*html-dtd* off)))))))
1760 (defun equivalence-classes (prediate set)
1761 (let ((classes nil))
1762 (dolist (item set)
1763 (do ((cp classes (cdr cp)))
1764 ((null cp)
1765 (push (list item) classes))
1766 (cond ((funcall prediate (caar cp) item)
1767 (push item (car cp))
1768 (return)))))
1769 classes))
1771 (defun shortest-path (dtd from to &optional (max-depth 10))
1772 (reverse (shortest-path/aux dtd (list (list from)) to max-depth)))
1774 (defun shortest-path/aux (dtd pathen to &optional (max-depth 10))
1775 (cond ((<= max-depth 0)
1776 nil)
1777 ((null pathen)
1778 nil)
1779 ((find-if (lambda (p) (eq (car p) to)) pathen))
1780 ((shortest-path/aux
1782 (remove-duplicates
1783 (mapcan (lambda (path)
1784 (mapcar (lambda (child)
1785 (cons child path))
1786 (elm-inclusion dtd (car path))))
1787 pathen)
1788 :test (lambda (x y) (eql (car x) (car y))))
1790 (1- max-depth)))))
1793 (defun bloo ()
1794 (let ((dtd closure-html:*html-dtd*))
1795 (equivalence-classes (lambda (x y)
1796 (and (set-equal (elm-inclusion dtd x) (elm-inclusion dtd y))
1797 (set-equal (elm-surclusion dtd x) (elm-surclusion dtd y))))
1798 (all-elms dtd))))
1801 ;;; ===========================================================================
1803 (defun mungle (x)
1804 (cond ((and (vectorp x) (not (stringp x))) (rod-string x))
1805 ((atom x) x)
1806 ((cons (mungle (car x))
1807 (mungle (cdr x))))))
1810 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1813 (format T "~&;; Parse error (line ~D column ~D): [~A] Saw ~A in ~A."
1814 (runes:xstream-line-number input)
1815 (runes:xstream-column-number input))