1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: SGML; Readtable: GLISP; -*-
2 ;;; ---------------------------------------------------------------------------
3 ;;; Title: An SGML Parser
4 ;;; Created: 1996-10-21
5 ;;; Author: Gilbert Baumann <gilbert@base-engineering.com>
6 ;;; License: MIT style (see below)
7 ;;; ---------------------------------------------------------------------------
8 ;;; (c) copyright 1996-1999 by Gilbert Baumann
10 ;;; Permission is hereby granted, free of charge, to any person obtaining
11 ;;; a copy of this software and associated documentation files (the
12 ;;; "Software"), to deal in the Software without restriction, including
13 ;;; without limitation the rights to use, copy, modify, merge, publish,
14 ;;; distribute, sublicense, and/or sell copies of the Software, and to
15 ;;; permit persons to whom the Software is furnished to do so, subject to
16 ;;; the following conditions:
18 ;;; The above copyright notice and this permission notice shall be
19 ;;; included in all copies or substantial portions of the Software.
21 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 ;;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32 ;; ----------------------------------------------------------------------------
33 ;; 2001-05-17 GB - empty tags are threated as start tags.
34 ;; [we can do better than that].
36 ;; 2001-05-14 GB - finally all mumbling goes with PARSE-WARN.
38 ;; 1999-09-19 GB - a-streams are gone now, we use the xstreams from the
41 ;; 1999-08-24 GB = new scheme to handle unsyntactic nesting of FONT tags
42 ;; thru' post mortem analysis; due to that:
43 ;; - *FONT-HEURISTIC-P*: new special variable
44 ;; - HTAG-NODE: new structure type
45 ;; - New stack machine output token type :HTAG
46 ;; - TAG-AS-MARKER-P: new predicate
47 ;; - POST-MORTEM-HEURISTIC: new function
48 ;; - PARSE-HTML: use it
49 ;; - POST-MORTEM/FIX-TOP-LEVEL-STRUCTURE: moved code
50 ;; from PARSE-HTML here
52 ;; - HTML-PARSE-FILE, HTML-PARSE-URL: new functions
54 ;; 1999-08-23 GB - EMPTY-ELEMENT-P: new function
55 ;; - SGML-PARSE, TRANSITION: new output token :open/close
56 ;; to accommodate empty elements (like <IMG>, <BR>)
58 ;; 1999-08-19 GB - nuked FAT-WHITE-SPACE-P
59 ;; - nuked U16->STRING (using ROD-STRING now).
61 ;; - use #/.. read syntax instead of #.(char-code) idiom
62 ;; - use RUNE=, RUNE<= et al
64 ;; - twixed SUBSEQ/U16 to SUBSEQ/ROD
65 ;; - prays that everything still works.
72 (declaim (inline code-char
))
74 (defun code-char (code)
75 (if (< code char-code-limit
)
79 ;; This is a high-speed implementation of an SGML parser.
83 ;; o We do our own buffering here, which allows us to access the buffer
84 ;; directly and thus cutting down the number of function calls per character
87 ;; o The DTD has to be 'compiled' before we drive a little determistic PDA
92 ;; o das mit dem <body> = full speed functioniert irgentwie nicht
93 ;; richtig ;-( wenn das BODY tag fehlt. -- es kommt nur etwas spaet
94 ;; wegen token lookahead, was aber kein problem sein sollte.
96 ;; o improve error handling
97 ;; - After we have encountered an error while reading a tag nuke everything
100 ;; Forgive something like <foo x="10%> and back up until #/" by patching
102 ;; - emit warning if somebody writes e.g. <A href=http://foo/bar/baz>,
106 ;; - account for the SGML inclusion '+(..)' and exclusion '-(..)' feature
107 ;; - implement the HTML idea of CDATA for SCRIPT and (uhm!) XMP.
108 ;; - thanks to the W3C: '<' and '>' are now part of CSS-2 syntax so we
109 ;; need something similar for STYLE also.
111 ;; o merge as much is possible with the XML parser in a modular fashion
113 ;; o scratch-pad hack needs to be reentrant !!!
114 ;; [make xml-parser's implementation of with-rune-collector safe
117 ;; o the FORM hack should be there.
118 ;; (we also need an <INPUT> hack).
120 ;; o finally ensure that the rune abstraction is fine.
122 ;; o report errors using exactly one mechanism and include file name
123 ;; (uri) into error messages; better yet: define a error message
124 ;; structure -- in the future it should be possible to have something
125 ;; like an error message listener.
127 ;; o since we use xstreams now morph the eof representation from NIL
128 ;; to :EOF. Or better yet: give the READ-RUNE macro a standard
129 ;; signature including eof-error-p and eof-value. (Wouldn't hurt,
130 ;; since it is a macro).
132 ;; o In DTD abstraction: ELM-INCLUSION needs flag error-p
134 ;; o we need rune-max and therefore are not dependent on UCS-2
135 ;; anymore, should the need araise simply redefine rune-max.
139 ;; Build the parse tree out of lazy lists. Incremental rendering would
140 ;; then become cheap to implement.
142 (defparameter *preserves-comments-elements
*
144 "List of names of elements, which should preserve (may contain) comment tokens.")
146 (defparameter *font-heuristic-p
* t
147 "Whether to handle FONT by heuristic and post mortem processing.")
149 (defparameter *anchor-heuristic-p
* nil
150 "Whether to handle A tags by heuristic and post mortem processing.")
152 (defparameter *gt-ends-comment-p
* nil
153 "Whether and '>' ends a comment? -- A warning is emitted still.")
155 (defvar *options
/parser-silent-p
* nil
)
156 (defvar *line-number
*)
157 (defparameter *parse-warn-level
* 5)
160 ;; HTAG = heuristic tag
161 ;; These are unparsed tags, which are inserted verbatim into the parse tree
162 ;; for post mortem analysis.
163 ;; See explaination in FONT element fixer far below.
165 (defstruct (htag-node (:include pt
)) ) ;heuristic tag
166 (defstruct (hstag-node (:include htag-node
)) ) ;heuristic start tag
167 (defstruct (hetag-node (:include htag-node
)) ) ;heuristic end tag
169 ;; The predicate TAG-AS-MARKER-P decides, whether a given tag is
170 ;; inserted as HSTAG or HETAG.
172 ;;;; --------------------------------------------------------------------------
173 ;;;; Predicates and Utilities on Runes
175 ;; Most of these should really go into util.lisp
177 (definline name-start-rune-p
(char)
178 (or (rune<= #/a char
#/z
)
179 (rune<= #/A char
#/Z
)))
181 (definline name-rune-p
(char)
182 (or (name-start-rune-p char
)
187 (definline sloopy-name-rune-p
(char)
188 (or (name-rune-p char
)
190 (rune= char
#//) ;manche schreiben ganze urls ohne Gaensefuesschen
193 (rune= char
#/#) ;farben werden auch gerne genommen
199 ;; ganzer ECMA-Script Code kommt auch vor
210 (definline sloopy-value-rune-p
(char)
211 (or (sloopy-name-rune-p char
)
214 (definline alpha-rune-p
(char)
215 (or (rune<= #/a char
#/z
)
216 (rune<= #/A char
#/Z
)))
218 (definline upcase-name-rune
(rune)
221 ;;;; --------------------------------------------------------------------------
224 (definline subseq
/rod
(source start end
)
225 ;; Optimized version of subseq for arrays of runes
226 (declare (type rod source
)
227 (type fixnum start end
))
228 (let ((res (make-rod (- end start
))))
229 (declare (type rod res
))
230 (do ((i (- (- end start
) 1) (the fixnum
(- i
1))))
232 (declare (type fixnum i
))
233 (setf (%rune res i
) (%rune source
(the fixnum
(+ i start
)))))))
235 ;;; ---------------------------------------------------------------------------
238 (eval-when (compile load eval
)
239 (defparameter *buf-size
* 4096))
241 (defmacro a-read-byte
(input)
243 `(let ((,c
(runes:read-rune
,input
)))
244 (if (eq ,c
:eof
) nil
,c
))))
246 (defmacro a-peek-byte
(input)
248 `(let ((,c
(runes:peek-rune
,input
)))
249 (if (eq ,c
:eof
) nil
,c
))))
251 (defmacro a-unread-byte
(byte input
)
252 `(runes:unread-rune
,byte
,input
))
254 (defmacro a-stream-position
(input)
255 `(runes:xstream-position
,input
))
257 (defun make-a-stream (&key cl-stream
)
258 (runes:make-xstream cl-stream
:initial-speed
1 :speed
8192))
260 (defmethod runes::read-octets
(sequence (stream html-glisp
:gstream
) start end
)
261 (html-glisp:g
/read-byte-sequence sequence stream
:start start
:end end
))
263 (defmethod runes::xstream
/close
((stream html-glisp
:gstream
))
264 (html-glisp:g
/close stream
))
266 ;; a fake definition -- XXX non-reentrant!
268 (defun a-stream-scratch (input)
269 (getf (runes::xstream-plist input
) 'scratch-pad
))
271 (defun (setf a-stream-scratch
) (new-value input
)
272 (setf (getf (runes::xstream-plist input
) 'scratch-pad
) new-value
))
274 ;;;; -------------------------------------------------------------------------
275 ;;;; Reporting Errors
278 (defun read-tag-error (input format-string
&rest format-args
)
279 (apply #'parse-warn input
4 format-string format-args
)
280 (throw 'read-tag-error
282 (string-rod "##BAD TAG##"))))
286 ;;; 1 - Absolutely normal mumbleing
287 ;;; 2 - Laziness introduced by the HTML standard
288 ;;; 3 - Semantic error upon parsing attributes
289 ;;; 4 - Accidents happening while parsing the structure
290 ;;; 5 - More serve errors
293 (defun parse-warn (input level fmt
&rest args
)
294 (let ((*print-pretty
* nil
)) ;disable ugly^H^H^H^Hpretty printing
295 (when (>= level
*parse-warn-level
*)
296 (unless *options
/parser-silent-p
*
297 (let ((preample (format nil
";; Parser warning: ~11@<Line ~D,~> ~11@<column ~D~>: ~5A "
298 (and input
(ignore-errors (runes:xstream-line-number input
)))
299 (and input
(ignore-errors (runes:xstream-column-number input
)))
300 (make-string level
:initial-element
#\
*))))
301 (fresh-line *trace-output
*)
302 (write-string preample
*trace-output
*)
303 (with-input-from-string (i (apply #'format nil fmt args
))
304 (do ((x (read-line i nil nil
) (read-line i nil nil
))
308 (write-string ";; " *trace-output
*)
309 (dotimes (i (- (length preample
) 3))
310 (write-char #\space
*trace-output
*)))
311 (write-string x
*trace-output
*)
312 (terpri *trace-output
*))))))))
314 ;;; ---------------------------------------------------------------------------
317 (defun read-token (input dtd
)
318 ;; Reads on token from the stream `input'
321 ;; :start-tag name atts
323 ;; :empty-tag name atts
326 ;; :experimental-tag cdata
328 (let ((ch (a-read-byte input
)))
332 (read-tag input dtd
))
334 (a-unread-byte ch input
)
335 (read-pcdata input dtd
)) )))
337 (defun push-on-scratch (input sp ch
)
338 ;; Push the character ch onto the scratch pad of `input' and enlarge if neccessary
339 (setf (aref (a-stream-scratch input
) sp
) ch
)
341 (cond ((= sp
(length (a-stream-scratch input
))) ;end of scratch pad reached?
342 (enlarge-scratch-pad input
)))
345 (defun read-pcdata (input dtd
)
346 (let* ((scratch (a-stream-scratch input
)) ;scratch pad
347 (sp 0) ;pointer into scratch pad
348 (se (length scratch
)) ;end of scratch pad
350 (declare (type (simple-array rune
(*)) scratch
))
351 (declare (type fixnum sp se
))
353 (let ((ch (a-read-byte input
)))
354 ;; FIXME: why was this declared as (u-b 8), not (u-b 16)?
355 ;; a-read-byte returns a rune.
356 ;;; (declare (type (or null (unsigned-byte 8)) ch))
357 (declare (type (or null rune
) ch
))
358 (cond ((null ch
) ;eof
360 ((rune= ch
#/<) ;end of pcdata
361 (a-unread-byte ch input
)
364 (setf sp
(read-entity-ref input dtd sp
)))
366 (setf (aref scratch sp
) ch
) ;recode character read
367 (setf sp
(the fixnum
(+ sp
1)))
368 (cond ((= sp se
) ;end of scratch pad reached?
369 (enlarge-scratch-pad input
)
370 (setf scratch
(a-stream-scratch input
)
371 se
(length scratch
))))))))
373 (subseq/rod scratch
0 sp
)) ))
375 (defun read-entity-ref (input dtd sp
)
376 ;; Reads an entity reference into the stream's scratch pad from position
378 ;; Returns the new write pointer. The initial "&" is already read from the
381 ;; entity-ref ::= "&" "#" <digit>+ (";")?
382 ;; entity-ref ::= "&" "#" "x" <hex-digit>+ (";")?
383 ;; entity-ref ::= "&" <name-start> <name-char>* (";")?
384 (let ((ch (a-read-byte input
)))
385 (cond ((null ch
) ;eof
386 (parse-warn input
3 "EOF in entity")
387 (push-on-scratch input sp
#/&))
388 ((rune= ch
#/#) ;numeric reference?
389 (read-numeric-entity input sp
))
390 ((name-start-rune-p ch
) ;named entity?
391 (read-named-entity input dtd sp ch
))
394 "Saw character '~A' after '&' -- bad entity reference?!"
396 (format nil
"&#x~4,'0X" ch
)))
397 (a-unread-byte ch input
) ;it might be something interesting
398 (push-on-scratch input sp
#/&)) )))
400 (defun read-numeric-entity (input sp
)
402 (let ((ch (a-read-byte input
)))
403 (setf sp
(push-on-scratch input sp
#/&))
404 (setf sp
(push-on-scratch input sp
#/#))
405 (cond ((null ch
) ;eof
406 (parse-warn input
3 "EOF in entity")
410 (read-numeric-entity-aux input
(- sp
2) sp
10 ch
))
414 (setf sp
(push-on-scratch input sp
#/x
))
415 (setf ch
(a-read-byte input
))
417 (parse-warn input
3 "EOF after '&#x'.")
419 ((not (digit-rune-p ch
16))
420 (parse-warn input
3 "Bad character after '&#x'.")
421 (a-unread-byte ch input
)
424 (read-numeric-entity-aux input
(- sp
3) sp
16 ch
))))
426 (a-unread-byte ch input
)
427 (parse-warn input
3 "Bad character after '&#'")
430 (defun read-numeric-entity-aux (input s0 sp radix ch
)
431 ;; Aux routine for read-numeric-entity
432 ;; at s0..sp in the scratch pad is the already read prefix ('&#' or '&#x')
433 ;; Radix is the radix to use (10 or 16)
435 ;; 'ch' is the first digit
437 (setf sp
(push-on-scratch input sp ch
))
438 (do ((ch (a-read-byte input
) (a-read-byte input
)))
439 ((or (null ch
) (not (digit-rune-p ch radix
)))
440 ;; Ok. [s1..sp) now is the digit sequence
441 (let ((num (parse-integer (map 'string
#'rune-char
442 (subseq (a-stream-scratch input
) s1 sp
))
444 (cond ((<= 0 num
#xFFFF
)
445 ;; Proper entity value
446 (when (and (not (null ch
)) (not (rune= ch
#/\
;)))
447 (a-unread-byte ch input
))
448 ;; Rewind scratch pad to `s0' and push character `num'
449 (setf sp
(push-on-scratch input s0
(code-rune num
))))
451 ;; num too large; emit warning and leave scratch pad alone
452 (when (not (null ch
))
453 (a-unread-byte ch input
))
454 (parse-warn input
3 "Numeric enity ~A does not fit into our 16-bit strings; -- ignored."
455 (rod-string (rod-subseq (a-stream-scratch input
) s0 sp
)))))))
456 (setf sp
(push-on-scratch input sp ch
)))
459 (defun read-named-entity (input dtd sp ch
)
460 ;; Just in case we want to leave the entity alone
462 (setf sp
(push-on-scratch input sp
#/&))
464 (setf sp
(push-on-scratch input sp ch
))
465 (do ((ch (a-read-byte input
) (a-read-byte input
)))
466 ((or (null ch
) (not (name-rune-p ch
)))
467 ;; Ok. [s1..sp) now is the name, try to resolve it
468 (let ((nums (find-named-entity dtd
(subseq (a-stream-scratch input
) s1 sp
))))
469 (cond ((not (null nums
))
470 ;; Proper entity value
471 ;; Rewind scratch pad to `s0' and push characters in `nums'
472 (dotimes (i (length nums
))
473 (setf sp
(push-on-scratch input s0
(aref nums i
))))
474 (when (and (not (null ch
)) (not (rune= ch
#/\
;)))
475 (a-unread-byte ch input
)))
477 (when (not (null ch
))
478 (a-unread-byte ch input
))
479 (parse-warn input
3 "[~D] There is no such entity defined: ~A -- ignored."
480 (a-stream-position input
)
481 (rod-string (rod-subseq (a-stream-scratch input
) s0 sp
)))))) )
482 (setf sp
(push-on-scratch input sp ch
))))
485 (defun find-named-entity (dtd fat-string
)
486 (let ((str (rod-string fat-string
)))
487 (let ((r (cdr (assoc str
(sgml::dtd-entities dtd
) :test
#'string
=))))
490 (defun enlarge-scratch-pad (input)
491 (let* ((old (a-stream-scratch input
))
493 (declare (type fixnum se
)
494 (type (simple-array rune
(*)) old
))
495 (let ((new (make-rod (+ (length (a-stream-scratch input
)) *buf-size
*))))
496 (declare (type rod new
))
497 (do ((i (- se
1) (the fixnum
(- i
1))))
499 (declare (type fixnum i
))
500 (setf (aref new i
) (aref old i
)))
501 (setf (a-stream-scratch input
) new
))))
503 ;;; ------------------------------------------------------------
506 ;; tag ::= <start-tag> | <end-tag> | <exp-tag> | <comment>
507 ;; end-tag ::= "<" "/" <name> WSP ">"
508 ;; empty-tag ::= "<" <name> <atts> WSP "/" ">"
509 ;; start-tag ::= "<" <name> <atts> WSP ">"
510 ;; exp-tag ::= "<" "?" <any>* ">"
511 ;; comment ::= "<" "!" "-" "-" (<any>* - ("-" "-")) "-" "-" ">"
514 ;; | <name> WSP "=" WSP <value>
515 ;; value ::= <literal> | <name>
516 ;; literal ::= """ <char>* """
518 ;; atts ::= ( WSP <att> )*
519 ;; WSP ::= <white-space>*
520 ;; name ::= <name-start-char> <name-char>*
521 ;; char ::= <any> | <enitity-ref>
523 (defun read-tag (input dtd
)
524 ;; The "<" is already read.
525 (catch 'read-tag-error
526 (let ((ch (a-peek-byte input
)))
527 (cond ((rune= ch
#//)
529 (read-end-tag input
))
532 (read-define-tag input dtd
))
535 (read-experimental-tag input
))
536 ((and (not (null ch
)) (name-start-rune-p ch
))
537 (read-start-tag input dtd
))
539 (parse-warn input
3 "Bad character after '<': '~A' -- ignored."
541 (let ((res (string-rod "<")))
542 (values :pcdata res
))) ))) )
544 (defun read-start-tag (input dtd
)
545 (multiple-value-bind (name atts
) (read-name-and-attributes input dtd
)
546 (let ((ch (a-read-byte input
)))
547 (cond ((rune= ch
#/>)
548 (values :start-tag name atts
))
551 "A '<' ended this tag.")
552 (a-unread-byte ch input
)
553 (values :start-tag name atts
))
555 (setf ch
(a-read-byte input
))
556 (cond ((rune= ch
#/>)
557 (values :empty-tag name atts
))
559 (read-tag-error input
"Expected '>' after '<' .. '/'"))))
561 (read-tag-error input
"Expected '>'")) ))))
563 (defun read-end-tag (input)
564 (let ((name (read-name input
)))
565 (skip-white-space input
)
566 (let ((ch (a-read-byte input
)))
568 (read-tag-error input
"In end tag: Expected '>' got end-of-file instead."))
570 (values :end-tag name
))
572 (read-tag-error input
"In end tag: Expected '>'")) ))))
574 (defun read-name-and-attributes (input dtd
)
575 (let ((name (read-name input
))
578 (skip-white-space input
)
579 (cond ((member (a-peek-byte input
) '(#/< #/> #//) :test
#'rune
=)
581 (push (read-attribute input dtd
) atts
))
582 (values name
(nreverse atts
)) ))
584 (defun read-name (input)
585 (let ((ch (a-peek-byte input
))
587 (cond ((and (not (null ch
)) (name-start-rune-p ch
))
588 (do ((ch (a-read-byte input
) (a-read-byte input
)))
589 ((not (and ch
(name-rune-p ch
)))
591 (a-unread-byte ch input
))
592 (subseq/rod
(a-stream-scratch input
) 0 sp
))
593 (setf sp
(push-on-scratch input sp
(upcase-name-rune ch
)))))
595 (read-tag-error input
"Not a name")) )))
597 (defun skip-white-space (input)
598 (do ((ch (a-read-byte input
) (a-read-byte input
)))
599 ((not (and ch
(white-space-rune-p ch
)))
600 (when ch
(a-unread-byte ch input
)))))
602 (defun read-attribute (input dtd
)
603 (skip-white-space input
)
604 (let ((slot (read-sloopy-name input
)))
605 ;;(print (list 'slot '= (mungle slot) (mungle (vector (a-peek-byte input)))))
606 (skip-white-space input
)
607 (let ((c (a-peek-byte input
)))
608 (cond ((and (not (null c
)) (rune= c
#/=))
610 (skip-white-space input
)
611 (let ((value (read-value input dtd
)))
616 (defun read-value (input dtd
)
617 (let ((ch (a-peek-byte input
)))
618 (cond ((rune= ch
#/')
620 (read-literal input dtd ch
))
623 (read-literal input dtd ch
))
624 ((and ch
(sloopy-name-rune-p ch
))
625 (read-sloopy-value input
))
627 (read-tag-error input
"Bad value '~A' seen"
629 (format nil
"U+~4,'0X" (rune-code ch
))))))))
631 (defun read-literal (input dtd delim
)
632 (let* ((scratch (a-stream-scratch input
)) ;scratch pad
633 (sp 0) ;pointer into scratch pad
634 (se (length scratch
)) ;end of scratch pad
636 (declare (type rod scratch
))
637 (declare (type fixnum sp se
))
639 (let ((ch (a-read-byte input
)))
640 ;; FIXME: why was this declared as (u-b 8), not (u-b 16)?
641 ;; a-read-byte returns a rune.
642 ;;; (declare (type (or null (unsigned-byte 8)) ch))
643 (declare (type (or null rune
) ch
))
644 (cond ((null ch
) ;eof
645 (read-tag-error input
"Eof in literal"))
649 (setf sp
(read-entity-ref input dtd sp
)))
651 (setf (aref scratch sp
) ch
) ;recode character read
652 (setf sp
(the fixnum
(+ sp
1)))
653 (cond ((= sp se
) ;end of scratch pad reached?
654 (enlarge-scratch-pad input
)
655 (setf scratch
(a-stream-scratch input
)
656 se
(length scratch
))))))))
657 (subseq/rod scratch
0 sp
) ))
659 (defun read-sloopy-name (input)
660 (let ((ch (a-peek-byte input
))
662 (cond ((and (not (null ch
)) (sloopy-name-rune-p ch
))
663 (do ((ch (a-read-byte input
) (a-read-byte input
)))
664 ((not (and ch
(sloopy-name-rune-p ch
)))
666 (a-unread-byte ch input
))
667 (subseq/rod
(a-stream-scratch input
) 0 sp
))
668 (setf sp
(push-on-scratch input sp ch
))))
670 (read-tag-error input
671 "Expected sloopy name, got ~A"
672 (or (rune-char ch
) (format nil
"U+~4,'0X" ch
)) )) )))
674 (defun read-sloopy-value (input)
675 (let ((ch (a-peek-byte input
))
677 (cond ((and (not (null ch
)) (sloopy-value-rune-p ch
))
678 (do ((ch (a-read-byte input
) (a-read-byte input
)))
679 ((not (and ch
(sloopy-value-rune-p ch
)))
681 (a-unread-byte ch input
))
682 (subseq/rod
(a-stream-scratch input
) 0 sp
))
683 (setf sp
(push-on-scratch input sp ch
))))
685 (read-tag-error input
"Expected sloopy value, got ~A"
686 (or (rune-char ch
) (format nil
"U+~4,'0X" ch
)) )) )))
688 (defun read-define-tag (input dtd
)
689 (let ((ch (a-peek-byte input
)))
691 (read-tag-error input
"unexpected EOF"))
693 ;; empty define tag -- to be ignored
695 (read-token input dtd
))
699 (let ((ch (a-peek-byte input
)))
700 (cond ((and (not (null ch
)) (rune= ch
#/-
))
701 (read-comment input
))
703 (read-tag-error input
"Expected '-' after \"<!-\"")))))
705 (read-define-tag-2 input
)) )))
707 (defun read-define-tag-2 (input)
709 ;; we simply slurp until '>'
711 (do ((ch (a-read-byte input
) (a-read-byte input
)))
712 ((and ch
(rune= ch
#/>))
713 (values :define-tag
(subseq/rod
(a-stream-scratch input
) 0 sp
)))
714 (setf sp
(push-on-scratch input sp ch
))) ))
716 (defun read-comment (input)
717 (a-read-byte input
) ;consume the '-'
719 (c1 (or (a-read-byte input
) (read-tag-error input
"Unexpected EOF")))
720 (c2 (or (a-read-byte input
) (read-tag-error input
"Unexpected EOF")))
726 c2
(a-read-byte input
))
728 (read-tag-error input
"EOF within comment."))
733 ((and *gt-ends-comment-p
*
735 (parse-warn input
3 "A '>' ends this comment.")
737 (cond ((and (rune= c0
#/-
) (rune= c1
#/-
))
739 (parse-warn input
4 "\"--\" seen within comment; This is strongly depreciated.")
741 (setf sp
(push-on-scratch input sp c0
)))
742 (values :comment
(subseq/rod
(a-stream-scratch input
) 0 sp
)) ))
744 ;;;; ------------------------------------------------------------------------------------------
746 (defun name-start-char-p (ch)
749 (defun name-char-p (ch)
750 (or (alphanumericp ch
) (char= ch
#\.
) (char= ch
#\-
)) )
752 (defun valid-name-string-p (string)
753 "Is the string `string' a valid name string according to the SGML
755 (and (> (length string
) 0)
756 (name-start-char-p (char string
0))
757 (every #'name-char-p string
)) )
759 ;;;; ------------------------------------------------------------------------------------------
760 ;;;; Resolving Entities
763 ;;;; TODO: Check that numeric entities are within 0..#xFFFF;
765 (defun resolve-numeric-entity (string start end
) ; --> string ; new start
766 (let ((j (or (position-if-not #'digit-rune-p string
:start start
:end end
) end
)))
768 (let ((n (parse-integer (rod-string (rod-subseq string start j
)) :radix
10)))
770 (if (and (< j end
) (rune= (rune string j
) #/\
;))
774 (defun resolve-hex-entity (string start end
) ; --> string ; new start
775 ;; Resolves a hexadecimal entity like "*", start should point
776 ;; to the character directy after the '&#x'.
777 (let ((j (or (position-if-not (rcurry #'digit-rune-p
16) string
:start start
:end end
) end
)))
779 (let ((n (parse-integer (rod-string (rod-subseq string start j
)) :radix
16)))
781 (if (and (< j end
) (rune= (rune string j
) #/\
;))
785 (defun resolve-named-entity (string entities start end
&optional input
)
786 ;; --> string ; new start
787 (let ((j (or (position-if-not #'name-start-rune-p string
:start start
:end end
) end
)))
789 (or (dolist (k entities
)
790 (when (and (= (length (car k
)) (- j start
))
791 ;; XXX this compare conses far too much!
792 (rod= (string-rod (subseq (car k
) 0 (- j start
)))
793 (rod-subseq string start j
)))
794 (return (string-rod (cdr k
))) ) )
795 (dolist (k entities nil
)
796 (when (and (>= (length (car k
)) (- j start
))
798 (rod= (string-rod (subseq (car k
) 0 (- j start
)))
799 (rod-subseq string start j
)))
800 (return (string-rod (cdr k
))) )))))
801 (cond ((not (null res
))
803 (resolve-entities-in-string res entities
0 (length res
) input
) ;right?
804 (if (and (< j end
) (rune= (rune string j
) #/\
;))
808 (parse-warn input
3 "Entity &~a; is not defined." (subseq string start j
))
809 (values (subseq string
(1- start
) j
) j
))))))
811 (defun resolve-entities-in-string (string entities
812 &optional
(start 0) (end (length string
)) input
)
813 ;; Resolve all entity references introduced by "&" in the string
814 ;; `string'. `start' and `end' specify a substring to operate on.
815 ;; For error messages `input' may be the input stream the data is
816 ;; coming from originally.
817 (let ((i (position #/& string
:start start
:end end
:test
#'rune
=)))
819 ;; no further entities in string -- all done
820 (rod-subseq string start end
))
821 ((and (< (+ i
2) end
)
822 (rune= #/# (rune string
(+ i
1)))
823 (digit-rune-p (rune string
(+ i
2))))
824 ;; numeric entity seen
825 (multiple-value-bind (res j
) (resolve-numeric-entity string
(+ i
2) end
)
826 (concatenate 'rod
(subseq string start i
)
828 (resolve-entities-in-string string entities j end input
))))
829 ((and (< (+ i
2) end
)
830 (rune= #/# (rune string
(+ i
1)))
831 (rune= #/x
(rune string
(+ i
2))))
832 ;; hexadecimal entity seen
833 (multiple-value-bind (res j
) (resolve-hex-entity string
(+ i
3) end
)
834 (concatenate 'rod
(subseq string start i
)
836 (resolve-entities-in-string string entities j end input
))))
837 ((and (< (+ i
1) end
)
838 (alpha-rune-p (aref string
(+ i
1))))
839 ;; this must be a named entity
840 (multiple-value-bind (res j
)
841 (resolve-named-entity string entities
(+ i
1) end input
)
842 (concatenate 'rod
(subseq string start i
)
844 (resolve-entities-in-string string entities j end input
))))
846 ;; no entity reference.
847 (concatenate 'rod
(subseq string start i
)
849 (resolve-entities-in-string string entities
(+ i
1) end input
))))))
852 ;;;; ------------------------------------------------------------------------------------------
853 ;;;; Mungling of Attribute values
856 (let ((kw-pkg (find-package :keyword
)))
860 (defun canon-value (input dtd tag slot value
)
861 (let* ((attlist (find-element-attlist dtd tag
))
862 (looked (assoc slot attlist
)))
863 (cond ((and looked
(listp (cadr looked
)))
864 (or (find value
(cadr looked
)
865 :test
#'(lambda (x y
)
866 (string-equal (string x
) (string y
))))
868 ;; Oh yeah! monster format strings are fun!
870 "~S is a bad value for the '~A' slot of '<~A>', which could ~
871 ~{~#[not take any value~;only take '~A'~:;take one of ~@{'~A'~#[~; or ~:;, ~]~}~]~:}."
872 value slot tag
(cadr looked
)))))
873 ((member (cadr looked
) '(:number
))
874 (or (maybe-parse-integer value
)
876 (parse-warn input
3 "~S is not NUMBER (attribute '~A' of '<~A>')."
879 ((member (cadr looked
) '(:name
:id
))
880 (cond ((valid-name-string-p value
)
881 (kintern (string-upcase value
)))
883 (parse-warn input
3 "~S is not NAME (attribute '~A' of '<~A>')."
889 (parse-warn input
3 "The '<~A>' element has no '~A' slot." tag slot
)
892 (defun find-slot-value-pair (input dtd tag value
)
893 (let* ((attlist (find-element-attlist dtd tag
))
895 (dolist (att attlist
)
896 (cond ((and (listp (cadr att
))
897 (setq looked
(find value
(cadr att
)
898 :test
#'(lambda (x y
)
899 (string-equal (string x
) (string y
))))))
900 (return-from find-slot-value-pair
(values (car att
) looked
)))))
903 "The '<~A>' tag has no slot which could take the '~A' keyword.~%~
904 ~1{~#[There are no possible slots at all.~;~
905 Only possible slot is:~:;~
906 Possible slots are:~]~
907 ~@{~& ~1{Slot '~A'~20T could ~{~#[not take any value~;~
909 take one of ~@{'~A'~#[~; or ~:;, ~]~}~].~:}~}~}~:}"
910 tag value
(remove-if-not #'(lambda (x) (consp (cadr x
))) attlist
)) ))
913 ;;;; ------------------------------------------------------------
915 (defun read-token* (input dtd
)
916 ;;(skip-white-space input)
917 (multiple-value-bind (kind a b
) (read-token input dtd
)
919 (:pcdata
(make-start-tag :name
:pcdata
:atts a
))
921 (let ((name (kintern (rod-string a
))))
922 (if (tag-exists? dtd name
)
923 (make-start-tag :name name
:atts
(mungle-attlist dtd name b
))
925 (parse-warn input
4 "There is no such thing as <~A> -- ignored." name
)
926 (read-token* input dtd
)))))
928 (let ((name (kintern (rod-string a
))))
929 (if (tag-exists? dtd name
)
930 (make-end-tag :name name
)
932 (parse-warn input
4 "There is no such thing as </~A> -- ignored." name
)
933 (read-token* input dtd
)))) )
935 (parse-warn input
2 "Oops -- there is an empty tag; but this is only HTML?!")
936 (let ((name (kintern (rod-string a
))))
937 (if (tag-exists? dtd name
)
938 (make-start-tag :name name
:atts
(mungle-attlist dtd name b
))
940 (parse-warn input
4 "There is no such thing as <~A> -- ignored." name
)
941 (read-token* input dtd
))))
942 ;;(read-token* input dtd)
944 (:define-tag
(read-token* input dtd
))
946 (parse-warn input
2 "Ignoreing processing instruction tag: '~A'" (mungle a
))
947 (read-token* input dtd
))
949 (make-comment-token :data a
))
951 (make-end-tag :name
:%top
)) )))
953 (defun tag-exists?
(dtd name
)
955 ;;(not (eq name :font)) ;xxx
956 ;;(not (eq name :center)) ;xxx
957 ;;(not (eq name :div)) ;xxx
958 ;;(not (eq name :img))
959 ;;(not (eq name :form))
960 (not (eq name
:noscript
)) ;needed for www.sgi.com
961 (sgml::find-element dtd name nil nil
)))
964 (cond ((integerp r
) (string-rod (prin1-to-string r
)))
965 ((symbolp r
) (string-rod (princ-to-string r
)))
966 ((stringp r
) (string-rod r
))
968 (error "foofoo: Hmm ~S ?!" r
))))
970 ;;; The renderer might depend on upper-case attribute values, so let's leave
971 ;;; this off by default. For the benefit of html <-> xml conversions we
972 ;;; don't want to check the DTD every time we convert an attribute though,
973 ;;; so we need this mode for lower-case attribute values.
974 (defvar *unmungle-attribute-case
* nil
)
976 (defun mungle-attlist (dtd tag atts
)
979 ;; this clause isn't unicode-safe
980 (multiple-value-bind (slot value
)
981 (sgml::find-slot-value-pair nil dtd tag
(mungle x
))
983 (setf value
(foofoo value
))
984 (when *unmungle-attribute-case
*
985 (setf value
(rod-downcase value
))))
989 (let ((slot (kintern (string-upcase (mungle (car x
))))))
990 (list slot
(cdr x
))))))
993 (defun read-experimental-tag (input)
995 ;; we simply slurp until '>'
997 (do ((ch (a-read-byte input
) (a-read-byte input
)))
998 ((and ch
(rune= ch
#/>))
999 (values :experimental-tag
(subseq/rod
(a-stream-scratch input
) 0 sp
)))
1000 (setf sp
(push-on-scratch input sp ch
))) ))
1003 ;;; ---------------------------------------------------------------------------
1007 (defun handle-meta-tag-in-parser (input attrs
)
1008 (when (rod-equal (string-rod "content-type") (getf attrs
:http-equiv
))
1009 (let ((content-type (getf attrs
:content
)))
1011 (multiple-value-bind (type subtype parameters
)
1012 (closure-mime-types:parse-mime-content-type
1013 (rod-string content-type
))
1014 (declare (ignore type subtype
))
1015 (let ((cs (assoc :charset parameters
:test
#'string-equal
)))
1017 (setup-code-vector input
(cdr cs
)))))))))
1019 (defun sgml-parse (dtd input
)
1020 (let* ((stack (list (make-start-tag :name
:%top
:atts nil
)))
1021 (s (sgml::make-pt
/low
:name
'top
))
1028 ((or eof?
(not (null (cdr eingabe
)))))
1029 (let ((tok (read-token* input dtd
)))
1030 (when (and (end-tag-p tok
) (eq (tag-name tok
) :%top
))
1032 (setf eingabe
(nconc eingabe
(list tok
)))))
1033 (multiple-value-setq (stack eingabe ausgabe
) (transition input dtd stack eingabe
))
1034 (cond ((eq ausgabe
:accept
)
1037 ((eq ausgabe
:error
)
1040 ((eq ausgabe
:close
)
1041 (setf s
(sgml:pt-parent s
)))
1043 ((eq (car ausgabe
) :comment
)
1044 (setf (sgml:pt-children s
)
1045 (nconc (sgml:pt-children s
) (list (sgml::make-pt
/low
1046 :name
(cadr ausgabe
)
1047 :attrs
(caddr ausgabe
)
1051 ((eq (car ausgabe
) :open
)
1052 ;; Hack here to support <meta http-equiv="Content-Type" ...>
1053 (cond ((and (eq (cadr ausgabe
) :meta
))
1054 (handle-meta-tag-in-parser input
(caddr ausgabe
))))
1055 ;; when the BODY tag is openend, switch the streams speed to full speed.
1056 (cond ((and (eq (cadr ausgabe
) :body
))
1057 (setf (runes::xstream-speed input
)
1058 (length (runes::xstream-os-buffer input
)))))
1059 (let ((n (sgml::make-pt
/low
1060 :name
(cadr ausgabe
)
1061 :attrs
(caddr ausgabe
)
1064 (setf (sgml:pt-children s
) (nconc (sgml:pt-children s
) (list n
))
1067 ((eq (car ausgabe
) :open
/close
)
1068 ;; code duplication alert!
1069 ;; Hack here to support <meta http-equiv="Content-Type" ...>
1070 (cond ((and (eq (cadr ausgabe
) :meta
))
1071 (handle-meta-tag-in-parser input
(caddr ausgabe
))))
1072 (let ((n (sgml::make-pt
/low
1073 :name
(cadr ausgabe
)
1074 :attrs
(caddr ausgabe
)
1077 (setf (sgml:pt-children s
) (nconc (sgml:pt-children s
) (list n
))
1079 (setf s
(sgml:pt-parent s
)))
1081 ((eq (car ausgabe
) :htag
)
1082 ;; code duplication alert!
1083 (let* ((v (cadr ausgabe
))
1084 (n (cond ((start-tag-p v
)
1087 :attrs
(start-tag-atts v
)
1097 (error "fix your code.")))))
1098 (setf (sgml:pt-children s
) (nconc (sgml:pt-children s
) (list n
))) ))
1102 (defun empty-element-p (dtd gi
)
1103 (null (elm-inclusion dtd gi
)))
1105 (defun transition (input dtd stack eingabe
) ;; --> stack', eingabe', ausgabe
1106 (cond ((and stack
(eq (tag-name (car stack
)) :pcdata
))
1112 (values nil nil
:accept
))
1114 (values stack eingabe
:error
))))
1115 ;; (aS, </a>W) -> (S, W, </a>)
1116 ((comment-token-p (car eingabe
))
1119 (if (member (tag-name (car stack
)) *preserves-comments-elements
*)
1121 (list :comment
:pcdata
(comment-token-data (car eingabe
))))
1124 ((and (tag-p (car eingabe
))
1125 (tag-as-marker-p (tag-name (car eingabe
))))
1128 (list :htag
(car eingabe
))))
1130 ((and (end-tag-p (car eingabe
))
1132 (eq (tag-name (car stack
))
1133 (tag-name (car eingabe
))))
1138 ((and (start-tag-p (car eingabe
))
1140 (member (tag-name (car eingabe
)) (elm-inclusion dtd
(tag-name (car stack
)))) )
1142 ((empty-element-p dtd
(tag-name (car eingabe
)))
1145 (list :open
/close
(tag-name (car eingabe
)) (start-tag-atts (car eingabe
)))))
1147 (values (cons (car eingabe
) stack
)
1149 (list :open
(tag-name (car eingabe
)) (start-tag-atts (car eingabe
)))))))
1151 ((and (white-space-token-p (car eingabe
))
1153 (not (member :pcdata
(elm-inclusion dtd
(tag-name (car stack
))))))
1155 (values stack
(cdr eingabe
) nil
))
1157 (error "Oops empty stack in TRANSITION on ~S." eingabe
))
1159 (let ((x (resolve dtd
(tag-name (car stack
)) (car eingabe
))))
1165 (heuristic input dtd
(car stack
) eingabe
)
1168 (defun tag-as-marker-p (gi)
1169 (and *font-heuristic-p
*
1171 (and *anchor-heuristic-p
*
1174 (defun white-space-token-p (x)
1175 (and (start-tag-p x
)
1176 (eq (tag-name x
) :pcdata
)
1177 (every #'white-space-rune-p
(start-tag-atts x
))))
1179 ;;; Heuristic conflict resolution
1181 (defun shortest-different-beginning (x y
)
1189 (return-from shortest-different-beginning
1190 (values (subseq x
0 i
)
1191 (subseq y
0 j
)))))))
1193 (defun document-action (input context alte-eingabe neue-eingabe is-default-p
)
1194 (let ((see (car alte-eingabe
)))
1195 (parse-warn input
4 "[~A] Saw ~A in ~A ~A"
1196 (if is-default-p
"-" "H") see context
1197 (multiple-value-bind (a b
) (shortest-different-beginning alte-eingabe neue-eingabe
)
1198 (cond ((and (null a
) (null b
))
1199 (format nil
"-- ??? patched ~S -> ~S" alte-eingabe neue-eingabe
))
1201 (format nil
"-- nuked~{ ~A~}." a
))
1203 (format nil
"-- inserted~{ ~A~}." b
))
1205 (format nil
"-- patched~{ ~A~} ->~{ ~A~}." a b
)) )) )))
1207 (defun heuristic (input dtd context eingabe
)
1208 (let ((see (car eingabe
))
1210 (labels ((is (tag state
)
1211 (and (elms-eqv dtd state
(tag-name context
))
1212 (if (char= (char (symbol-name tag
) 0) #\
/)
1213 (and (end-tag-p (car eingabe
))
1215 (tag-name (car eingabe
))
1216 (kintern (subseq (symbol-name tag
) 1))))
1217 (and (start-tag-p (car eingabe
))
1218 (elms-eqv dtd
(tag-name (car eingabe
)) tag
))))))
1220 (cond ((and (member :HTML
(find-dtd-top-elements dtd
))
1221 (cond ((and (end-tag-p see
) (eq (tag-name see
) :%top
))
1222 (cons (elm-etag (tag-name context
)) eingabe
))
1224 ((and (start-tag-p see
)
1225 (eq (tag-name see
) :style
))
1226 (cons (make-start-tag :name
:SPAN
1227 :atts
(list :class
(rod "illegalstyle")))
1230 ((is :center
#|in|
# :h1
)
1231 (list* (elm-etag (tag-name context
))
1232 (car eingabe
) context
(cdr eingabe
)))
1237 (list* (elm-etag (tag-name context
)) (car eingabe
) context
(cdr eingabe
)))
1239 ((or (is :center
#|in|
# :font
)
1240 (is :p
#|in|
# :font
))
1241 ;; Uff -- the attributes of FONT are lost here.
1242 ;; we have to extend `context' to include these.
1243 (list* (elm-etag (tag-name context
)) (car eingabe
) context
(cdr eingabe
)))
1245 (list* (elm-etag (tag-name context
)) (car eingabe
) context
(cdr eingabe
)))
1246 ((elms-eqv dtd
(tag-name context
) ':font
)
1247 (cons (elm-etag (tag-name context
)) eingabe
))
1248 ((is :tr
#|in|
# :center
)
1249 (cons (elm-etag (tag-name context
)) eingabe
))
1250 ((is :/h2
#|in|
# :h1
)
1251 (cons (elm-etag (tag-name context
)) eingabe
))
1252 ((is :ul
#|in|
# :h1
)
1253 (cons (elm-etag (tag-name context
)) eingabe
))
1254 ((is :/center
#|in|
# :h3
)
1255 (cons (elm-etag (tag-name context
)) eingabe
))
1256 ((is :/td
#|in|
# :div
)
1257 (cons (elm-etag (tag-name context
)) eingabe
))
1259 (cons (make-start-tag :name
:li
) eingabe
))
1261 (cons (make-start-tag :name
:li
) eingabe
))
1262 ((is :img
#|in|
# :ul
)
1263 (cons (make-start-tag :name
:li
) eingabe
))
1265 ((is :pcdata
#|in|
# :ul
)
1266 (cons (make-start-tag :name
:li
) eingabe
))
1268 ((is :td
#|in|
# :table
)
1269 (list* (make-start-tag :name
:tr
)
1271 ((is :pcdata
#|in|
# :table
)
1272 (list* (elm-etag (tag-name context
)) (car eingabe
) context
1275 ((is :frameset
#|in|
# :noscript
)
1276 (list* (elm-etag (tag-name context
)) eingabe
))
1278 ((is :form
#|in|
# :table
)
1279 (list* (elm-etag (tag-name context
))
1284 ((is :/form
#|in|
# :tbody
)
1285 ;; we should better check here wether FORM is open at all.
1286 (list* (cadr eingabe
)
1290 ;; new as of 1999-08-31
1291 ((is :td
#|in|
# :li
)
1292 (list* (elm-etag (tag-name context
))
1294 ((is :/td
#|in|
# :li
)
1295 (list* (elm-etag (tag-name context
))
1297 ((is :td
#|in|
# :ul
)
1298 (list* (elm-etag (tag-name context
))
1300 ((is :/td
#|in|
# :ul
)
1301 (list* (elm-etag (tag-name context
))
1306 (setq is-default-p t
)
1308 (document-action input
(tag-name context
) eingabe neu is-default-p
)
1311 (defun parse-html (input &optional
(charset :iso-8859-1
))
1312 (let ((dtd closure-html
:*html-dtd
*))
1313 (let ((input (runes:make-xstream input
:initial-speed
1 :speed
128)))
1314 (setf (a-stream-scratch input
)
1315 (make-array #.
(* 2 *buf-size
*) :element-type
'rune
))
1316 (setup-code-vector input charset
)
1317 (let ((r (sgml-parse dtd input
)))
1318 (post-mortem-heuristic dtd r
)))) )
1320 (defun post-mortem-heuristic (dtd parse-tree
)
1321 "Do possible post mortem heuristic on a parse tree."
1322 (when *font-heuristic-p
*
1323 (setf parse-tree
(post-mortem/fix-font dtd parse-tree
)))
1324 (setf parse-tree
(post-mortem/fix-top-level-structure parse-tree
))
1327 (defun post-mortem/fix-top-level-structure
(parse-tree)
1328 ;; The hacking below is needed because of buggy input. Something like this
1329 ;; <base ..> <html> <head> .. <body> ... </html>
1330 ;; are two documents in one. Here we merge then into one HTML document.
1331 ;; Note that this defeats later incremental rendering pretty well.
1332 ;; Also this is HTML specific.
1333 ;; Das muss dennoch alles noch anders werden hier.
1334 (let ((r parse-tree
)
1338 (head-warn-flag nil
)
1339 (body-warn-flag nil
)
1342 (when (> (length (pt-children r
)) 1)
1343 (parse-warn nil
4 "Multiple HTML elements in document."))
1344 (dolist (k (pt-children r
))
1347 (dolist (k (pt-children k
))
1351 (setf head-warn-flag t
))
1352 (setf head-elts
(nconc head-elts
(pt-children k
))))
1356 (setf body-warn-flag t
))
1357 (setf body-elts
(nconc body-elts
(pt-children k
))))
1360 (setf frameset-elts
(nconc frameset-elts
(pt-children k
))) ))))))
1361 (when head-warn-flag
1362 (parse-warn nil
4 "Multiple HEAD elements."))
1363 (when body-warn-flag
1364 (parse-warn nil
4 "Multiple BODY elements."))
1365 (let* ((html (make-pt/low
:name
:html
:parent nil
))
1366 (head (make-pt/low
:name
:head
:parent html
:children head-elts
)))
1369 (parse-warn nil
4 "Body present while FRAMSET is present -- nuked!")
1370 (setf body nil body-elts nil
))
1372 (setf (pt-parent frameset
) html
1373 (pt-children frameset
) frameset-elts
)
1377 (setf (pt-parent body
) html
1378 (pt-children body
) body-elts
))
1380 (setf body
(make-pt/low
:name
:body
1382 :children body-elts
))))))
1383 (dolist (k head-elts
) (setf (pt-parent k
) head
))
1384 (dolist (k body-elts
) (setf (pt-parent k
) body
))
1385 (dolist (k frameset-elts
) (setf (pt-parent k
) frameset
))
1386 (setf (pt-children html
)
1387 (append (and head
(list head
))
1388 (and body
(list body
))
1389 (and frameset
(list frameset
))))
1392 ;;; FONT post mortem heuristic
1393 ;;; ==========================
1395 ;; Since FONT is by far the most often misused element, we handle these by a
1396 ;; post mortem analysis. If *font-heuristic-p* is true, parser does not care
1397 ;; for FONT start or end tags, but inserts HSTAG-NODEs and HETAG-NODEs into
1398 ;; the parse tree blindly. (Thus emulating the Mosaic approach to rendering).
1403 ;; <p> <font> foo <b> bar </font> baz </b>
1405 ;; is then parsed as:
1407 ;; (P #<HSTAG font> "foo" (B "bar" #<HETAG font> "baz"))
1410 ;; We now always want to return a parse tree, which conforms to the DTD, and
1411 ;; thus have to mungle this somehow into a sane HTML parse tree by wraping
1412 ;; FONT nodes around the right parts.
1414 ;; The first thing we do is to find all pairs of HSTAG, HETAG
1415 ;; nodes. We then use the following algorithm:
1423 ;; if S and E are on the same level then // [*] that is E, S have the same parent
1425 ;; ;; partitionate p.children as:
1426 ;; p.children = (,@sb S ,@si E ,@se)
1428 ;; ;; FONT element spans nothing, so forget it
1430 ;; if p may contain FONT and
1431 ;; for all x in si: FONT may contain x
1433 ;; p.children <- (,@sb (font ,@si) ,@se)
1436 ;; if S is higher in tree than E then
1437 ;; V <- ancestor of E, with V.parent = S.parent
1438 ;; insert a copy of E directly before E
1439 ;; insert a copy of S directly before first child of V
1440 ;; apply the algorithm recursively
1442 ;; if E is higher in tree than S then
1443 ;; ;; this analog to the case above
1444 ;; V <- ancestor of S, with V.parent = E.parent // this ancestor need not to exist!
1445 ;; insert a copy of S directly after V
1446 ;; insert a copy of E direclly after last child of V
1447 ;; apply the algorithm recursively
1450 ;; [*] This is not right. This was an thinko: Two nodes on the same
1451 ;; level do not always have the parent nodes.
1453 ;; NOTE: My first formulation of the algorithm used access pathen and the
1454 ;; implementation below does also -- this is inefficient and should be
1457 ;; NOTE: It would be nice, if we could prove that switching on
1458 ;; *FONT-HEURISTIC-P* does not hurd any conforming document.
1460 ;; When the need araises, we could also use the same method to deal
1461 ;; malicious B, I, et al tags; But bad nesting isn't that popular any more
1464 ;; Expirience however showed, that we would need something similar for FORM,
1465 ;; since people have the habit of spitting a FORM anywhere they see
1466 ;; fit. [Most popular is right between TR and TD]. Visit www.deja.com and you
1467 ;; see what I mean. FORM elements are extremely important, since one may be
1468 ;; able to cope with gliberish on the screen, but not with non-functional
1469 ;; forms. Another thing here is <INPUT> elements in inlegal spots.
1473 ;; We still want to emit parser warnings, if FONT elements are misused.
1475 ;;;(defun post-mortem/fix-font (dtd parse-tree)
1476 ;;; (declare (special q))
1477 ;;; (map-htag-pairs (lambda (stag etag)
1479 ;;; (setf s (pt-path parse-tree stag))
1480 ;;; (remove-pt stag)
1481 ;;; (setf e (pt-path parse-tree etag))
1482 ;;; (remove-pt etag)
1483 ;;; (mungle-font-pair dtd parse-tree stag s e)))
1484 ;;; parse-tree :font)
1487 (defun mungle-font-pair (dtd root tag s e
)
1488 ;; NOTE: ignore-errors is needed, since ELM-INCLUSION checks for existing
1489 ;; element names (this is introduced by our pseudo TOP element).
1491 ((and (= (length s
) (length e
))
1492 (equal (butlast s
) (butlast e
)))
1493 ;; implicit assert was (butlast s) equal (butlast e), which is not always true.
1494 (let ((p (pt-access root
(butlast s
))))
1495 (cond ((and (member :font
(ignore-errors (elm-inclusion dtd
(gi p
))))
1496 (do ((j (car (last s
)) (+ j
1)))
1497 ((>= j
(car (last e
))) t
)
1498 (when (not (null (elt (pt-children p
) j
)))
1499 ;; this should always be true!
1500 (unless (member (gi (elt (pt-children p
) j
))
1501 (ignore-errors (elm-inclusion dtd
:font
)))
1503 (let* ((before (loop for i from
0 to
(1- (car (last s
)))
1504 collect
(elt (pt-children p
) i
)))
1505 (between (loop for i from
(car (last s
)) to
(1- (car (last e
)))
1506 collect
(elt (pt-children p
) i
)))
1507 (after (loop for i from
(car (last e
)) to
(1- (length (pt-children p
)))
1508 collect
(elt (pt-children p
) i
)))
1509 (new (make-pt/low
:name
(gi tag
)
1510 :attrs
(pt-attrs tag
)
1513 (setf (pt-children p
) (append before
(list new
) after
))
1515 (setf (pt-parent k
) new
)) ))
1517 (loop for i from
(car (last s
)) to
(1- (car (last e
))) do
1520 (append (butlast s
) (list i
0))
1522 (list i
(length (pt-children (elt (pt-children p
) i
)))))))))))
1523 ((and (< (length s
) (length e
))
1524 '(equal (subseq e
(length s
)) s
))
1525 (let ((v (subseq e
0 (length s
))))
1526 (mungle-font-pair dtd root tag
(append v
(list 0)) e
)
1527 (mungle-font-pair dtd root tag s v
)))
1528 ((and (> (length s
) (length e
))
1529 '(equal (subseq s
(length e
)) e
))
1530 (let* ((v (subseq s
0 (length e
)))
1532 (mungle-font-pair dtd root tag
1533 (append (butlast v
) (list (+ 1 (car (last v
))))) e
)
1534 (setf m
(length (pt-children (pt-access root v
))))
1535 (mungle-font-pair dtd root tag
1536 s
(append v
(list m
)))))
1539 ;; search to first common parent
1541 ((or (= i
(length s
))
1543 (not (eq (elt s i
) (elt e i
))))
1544 (setf p
(subseq s
0 i
))))
1545 (let* ((v1 (append (subseq s
0 (length p
))
1546 (list (+ 1 (elt s
(length p
))))))
1548 (mungle-font-pair dtd root tag s v1
)
1549 (mungle-font-pair dtd root tag v2 e
))))
1552 (defun map-htag-pairs (continuation parse-tree gi
)
1553 ;; this function maps the inner htags first.
1556 (cond ((and (hstag-node-p x
) (eq (gi x
) gi
))
1558 ((and (hetag-node-p x
) (eq (gi x
) gi
))
1561 (parse-warn nil
4 "Superfluous ~A end tag." gi
))
1563 (funcall continuation
(pop stack
) x
))))
1565 (mapc #'walk
(pt-children x
))))))
1567 ;; take care for non-closed start tags
1568 (unless (null stack
)
1569 (parse-warn nil
4 "Unclosed ~A elements." gi
)
1570 (let ((new (mapcar (lambda (x)
1571 (declare (ignore x
))
1572 (make-hetag-node :name gi
1577 (setf (pt-children parse-tree
)
1578 (append (pt-children parse-tree
) new
))
1579 (mapc continuation stack new
))) )))
1581 ;;; FORM post mortem heuristic
1582 ;;; ==========================
1584 ;; FORM is another element, which some authors get incredible
1585 ;; wrong. We deal with this in a similar way;
1587 ;; while we parse HTAG are put into the parse tree, we sort it out
1588 ;; after we are done parsing; There are two possibilities:
1590 ;; a. We are able to insert FORM in a sane way.
1591 ;; b. We are not able to do this
1593 ;; How is b. now handled? I see two solutions:
1595 ;; - basically leave HTAGs in the tree.
1596 ;; - permitt multiple FORM elements and connect them via some special
1601 (defun setup-code-vector (input charset
)
1602 (let ((enc (runes-encoding:find-encoding charset
)))
1603 (cond ((not (null enc
))
1604 (setf (runes:xstream-encoding input
) enc
))
1606 (parse-warn input
4 "There is no such encoding: ~S." charset
)))))
1608 (defun sgml-parse-file (filename dtd
)
1609 (with-open-file (input filename
:direction
:input
:element-type
'(unsigned-byte 8))
1610 (sgml-parse dtd
(make-a-stream :cl-stream
(cl-byte-stream->gstream input
)))))
1612 (defun html-parse-file (filename)
1613 (with-open-file (input filename
:direction
:input
:element-type
'(unsigned-byte 8))
1614 (parse-html (cl-byte-stream->gstream input
))))
1617 (defun html-parse-url (url)
1618 (unless (url:url-p url
) (setf url
(url:parse-url url
)))
1619 (netlib:with-open-document
((input mime-type
) url
)
1621 (parse-html input
)))
1624 (defun check-saneness (pt &optional
(dtd closure-html
:*html-dtd
*))
1625 (dolist (k (pt-children pt
))
1626 (unless (member (gi k
) (elm-inclusion dtd
(gi pt
)))
1627 (warn "Unallowed ~A element within ~A." (gi k
) (gi pt
)))
1628 (unless (eq (pt-parent k
) pt
)
1629 (warn "Parent/child linkage broken."))
1630 (cond ((htag-node-p k
)
1631 (warn "HTAG node (~S) left in parse tree. (parent ~S)" (gi k
) (gi pt
))))
1632 (check-saneness k
)))
1634 ;;;(defun post-mortem/fix-font (dtd parse-tree)
1635 ;;; ;; das hatten wir schon, so geht das nicht -- oder nur sehr schwer.
1637 ;;; (let ((font-stack nil)) ;stack of open HTAG FONT elements
1638 ;;; (labels ((walk (pt)
1639 ;;; (cond ((hstag-node-p pt)
1640 ;;; (push pt font-stack)
1642 ;;; ((hetag-node-p pt)
1643 ;;; (pop font-stack)
1646 ;;; (cond ((not (null font-stack))
1647 ;;; ;; some fonts are open
1648 ;;; ;; warp them around this node
1652 ;;; (let ((new-children (mapcan #'walk (pt-children pt))))
1653 ;;; (dolist (k new-children)
1654 ;;; (setf (pt-parent k) pt))
1655 ;;; (setf (pt-children pt) new-children)
1656 ;;; (list pt))) )) )))
1657 ;;; (car (walk parse-tree)) )))
1659 (defun post-mortem/fix-font
(dtd parse-tree
)
1660 (declare (special q
))
1662 ;; erstmal alle font tag paare suchen
1663 (map-htag-pairs (lambda (stag etag
)
1664 (push (cons stag etag
) pairs
))
1666 (setf pairs
(reverse pairs
))
1667 ;; dann alle raus nehmen
1670 (remove-pt (cdr k
)))
1671 ;; dann alle munglen
1672 (dolist (pair pairs
)
1673 (let ((stag (car pair
)) (etag (cdr pair
)) s e
)
1674 (setf s
(pt-path parse-tree stag
))
1675 (assert (eq (pt-access parse-tree s
) stag
))
1677 (setf e
(pt-path parse-tree etag
))
1678 (assert (eq (pt-access parse-tree e
) etag
))
1681 (mungle-font-pair dtd parse-tree stag s e
))
1683 (warn "Hmm..."))))))
1686 ;;; ===========================================================================
1690 ;; We want to match Netscape's behaviour and thus by experimentation,
1693 (defun shortest-path-to (dtd pathen goal
&optional
(max-depth 10))
1694 (cond ((<= max-depth
0)
1696 ((find-if (lambda (p)
1700 (shortest-path-to dtd
1704 (elm-surclusion dtd
(first p
))))
1712 ;; Start Tag anaylis
1714 (defun blah (offending)
1715 (with-open-file (sink "/tmp/t.html"
1717 :if-exists
:new-version
)
1718 (let ((dtd closure-html
:*html-dtd
*))
1719 (let ((p (shortest-path dtd
:BODY offending
)))
1720 (let ((p2 (shortest-path dtd offending
:PCDATA
)))
1721 (format sink
"~A<BR>~%" offending
)
1723 (pop p
) ;forget BODY
1725 (setq p2
(butlast p2
))
1727 (dolist (k (butlast p
)) (format sink
"<~A>~%" k
))
1728 (format sink
"<A href='xxx'>~%")
1729 (format sink
"<img src='/tmp/a.gif'>A")
1730 (format sink
"<~A>~%" (car (last p
)))
1731 (dolist (k p2
) (format sink
"<~A>~%" k
))
1732 (format sink
"<img src='/tmp/b.gif'>B")
1733 (dolist (k (reverse p2
)) (format sink
"</~A>~%" k
))
1734 (format sink
"</~A>~%" (car (last p
)))
1735 (format sink
"<img src='/tmp/c.gif'>C")
1736 (dolist (k (reverse (butlast p
))) (format sink
"</~A>~%" k
))
1739 (defun open-in-netscape (url)
1740 (glisp:run-unix-shell-command
1741 (format nil
"my-netscape -remote 'openURL(~A)'" url
)))
1745 (dolist (off (all-elms closure-html
:*html-dtd
*))
1746 (cond (t '(or (member :B
(elm-inclusion closure-html
:*html-dtd
* off
))
1747 (member :P
(elm-inclusion closure-html
:*html-dtd
* off
)))
1749 (format T
"~&;; ~A" off
)
1750 (open-in-netscape "file:/tmp/t.html")
1752 (let ((nam (format nil
"/tmp/~A.gif" off
)))
1753 (glisp:run-unix-shell-command
(format nil
"(my-xwd -id 0x3c003bf | xwdtopnm | ppmtogif) 2>/dev/null > ~A &" nam
)))
1756 (format T
"~&;; Skipping ~A, because inclusion is ~A."
1757 off
(elm-inclusion closure-html
:*html-dtd
* off
)))))))
1760 (defun equivalence-classes (prediate set
)
1761 (let ((classes nil
))
1763 (do ((cp classes
(cdr cp
)))
1765 (push (list item
) classes
))
1766 (cond ((funcall prediate
(caar cp
) item
)
1767 (push item
(car cp
))
1771 (defun shortest-path (dtd from to
&optional
(max-depth 10))
1772 (reverse (shortest-path/aux dtd
(list (list from
)) to max-depth
)))
1774 (defun shortest-path/aux
(dtd pathen to
&optional
(max-depth 10))
1775 (cond ((<= max-depth
0)
1779 ((find-if (lambda (p) (eq (car p
) to
)) pathen
))
1783 (mapcan (lambda (path)
1784 (mapcar (lambda (child)
1786 (elm-inclusion dtd
(car path
))))
1788 :test
(lambda (x y
) (eql (car x
) (car y
))))
1794 (let ((dtd closure-html
:*html-dtd
*))
1795 (equivalence-classes (lambda (x y
)
1796 (and (set-equal (elm-inclusion dtd x
) (elm-inclusion dtd y
))
1797 (set-equal (elm-surclusion dtd x
) (elm-surclusion dtd y
))))
1801 ;;; ===========================================================================
1804 (cond ((and (vectorp x
) (not (stringp x
))) (rod-string x
))
1806 ((cons (mungle (car x
))
1807 (mungle (cdr x
))))))
1810 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1813 (format T
"~&;; Parse error (line ~D column ~D): [~A] Saw ~A in ~A."
1814 (runes:xstream-line-number input
)
1815 (runes:xstream-column-number input
))