1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: SGML; Readtable: GLISP; -*-
2 ;;; ---------------------------------------------------------------------------
3 ;;; Title: An SGML Parser
4 ;;; Created: 1996-10-21
5 ;;; Author: Gilbert Baumann <gilbert@base-engineering.com>
6 ;;; License: MIT style (see below)
7 ;;; ---------------------------------------------------------------------------
8 ;;; (c) copyright 1996-1999 by Gilbert Baumann
10 ;;; Permission is hereby granted, free of charge, to any person obtaining
11 ;;; a copy of this software and associated documentation files (the
12 ;;; "Software"), to deal in the Software without restriction, including
13 ;;; without limitation the rights to use, copy, modify, merge, publish,
14 ;;; distribute, sublicense, and/or sell copies of the Software, and to
15 ;;; permit persons to whom the Software is furnished to do so, subject to
16 ;;; the following conditions:
18 ;;; The above copyright notice and this permission notice shall be
19 ;;; included in all copies or substantial portions of the Software.
21 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 ;;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32 ;; ----------------------------------------------------------------------------
33 ;; 2001-05-17 GB - empty tags are threated as start tags.
34 ;; [we can do better than that].
36 ;; 2001-05-14 GB - finally all mumbling goes with PARSE-WARN.
38 ;; 1999-09-19 GB - a-streams are gone now, we use the xstreams from the
41 ;; 1999-08-24 GB = new scheme to handle unsyntactic nesting of FONT tags
42 ;; thru' post mortem analysis; due to that:
43 ;; - *FONT-HEURISTIC-P*: new special variable
44 ;; - HTAG-NODE: new structure type
45 ;; - New stack machine output token type :HTAG
46 ;; - TAG-AS-MARKER-P: new predicate
47 ;; - POST-MORTEM-HEURISTIC: new function
48 ;; - PARSE-HTML: use it
49 ;; - POST-MORTEM/FIX-TOP-LEVEL-STRUCTURE: moved code
50 ;; from PARSE-HTML here
52 ;; - HTML-PARSE-FILE, HTML-PARSE-URL: new functions
54 ;; 1999-08-23 GB - EMPTY-ELEMENT-P: new function
55 ;; - SGML-PARSE, TRANSITION: new output token :open/close
56 ;; to accommodate empty elements (like <IMG>, <BR>)
58 ;; 1999-08-19 GB - nuked FAT-WHITE-SPACE-P
59 ;; - nuked U16->STRING (using ROD-STRING now).
61 ;; - use #/.. read syntax instead of #.(char-code) idiom
62 ;; - use RUNE=, RUNE<= et al
64 ;; - twixed SUBSEQ/U16 to SUBSEQ/ROD
65 ;; - prays that everything still works.
72 (declaim (inline code-char
))
74 (defun code-char (code)
75 (if (< code char-code-limit
)
79 ;; This is a high-speed implementation of an SGML parser.
83 ;; o We do our own buffering here, which allows us to access the buffer
84 ;; directly and thus cutting down the number of function calls per character
87 ;; o The DTD has to be 'compiled' before we drive a little determistic PDA
92 ;; o das mit dem <body> = full speed functioniert irgentwie nicht
93 ;; richtig ;-( wenn das BODY tag fehlt. -- es kommt nur etwas spaet
94 ;; wegen token lookahead, was aber kein problem sein sollte.
96 ;; o improve error handling
97 ;; - After we have encountered an error while reading a tag nuke everything
100 ;; Forgive something like <foo x="10%> and back up until #/" by patching
102 ;; - emit warning if somebody writes e.g. <A href=http://foo/bar/baz>,
106 ;; - account for the SGML inclusion '+(..)' and exclusion '-(..)' feature
107 ;; - implement the HTML idea of CDATA for SCRIPT and (uhm!) XMP.
108 ;; - thanks to the W3C: '<' and '>' are now part of CSS-2 syntax so we
109 ;; need something similar for STYLE also.
111 ;; o merge as much is possible with the XML parser in a modular fashion
113 ;; o scratch-pad hack needs to be reentrant !!!
114 ;; [make xml-parser's implementation of with-rune-collector safe
117 ;; o the FORM hack should be there.
118 ;; (we also need an <INPUT> hack).
120 ;; o finally ensure that the rune abstraction is fine.
122 ;; o report errors using exactly one mechanism and include file name
123 ;; (uri) into error messages; better yet: define a error message
124 ;; structure -- in the future it should be possible to have something
125 ;; like an error message listener.
127 ;; o since we use xstreams now morph the eof representation from NIL
128 ;; to :EOF. Or better yet: give the READ-RUNE macro a standard
129 ;; signature including eof-error-p and eof-value. (Wouldn't hurt,
130 ;; since it is a macro).
132 ;; o In DTD abstraction: ELM-INCLUSION needs flag error-p
134 ;; o we need rune-max and therefore are not dependent on UCS-2
135 ;; anymore, should the need araise simply redefine rune-max.
139 ;; Build the parse tree out of lazy lists. Incremental rendering would
140 ;; then become cheap to implement.
142 (defparameter *preserves-comments-elements
*
144 "List of names of elements, which should preserve (may contain) comment tokens.")
146 (defparameter *font-heuristic-p
* t
147 "Whether to handle FONT by heuristic and post mortem processing.")
149 (defparameter *anchor-heuristic-p
* nil
150 "Whether to handle A tags by heuristic and post mortem processing.")
152 (defparameter *gt-ends-comment-p
* nil
153 "Whether and '>' ends a comment? -- A warning is emitted still.")
155 (defvar *options
/parser-silent-p
* nil
)
156 (defvar *line-number
*)
157 (defparameter *parse-warn-level
* 5)
160 ;; HTAG = heuristic tag
161 ;; These are unparsed tags, which are inserted verbatim into the parse tree
162 ;; for post mortem analysis.
163 ;; See explaination in FONT element fixer far below.
165 (defstruct (htag-node (:include pt
)) ) ;heuristic tag
166 (defstruct (hstag-node (:include htag-node
)) ) ;heuristic start tag
167 (defstruct (hetag-node (:include htag-node
)) ) ;heuristic end tag
169 ;; The predicate TAG-AS-MARKER-P decides, whether a given tag is
170 ;; inserted as HSTAG or HETAG.
172 ;;;; --------------------------------------------------------------------------
173 ;;;; Predicates and Utilities on Runes
175 ;; Most of these should really go into util.lisp
177 (definline name-start-rune-p
(char)
178 (or (rune<= #/a char
#/z
)
179 (rune<= #/A char
#/Z
)))
181 (definline name-rune-p
(char)
182 (or (name-start-rune-p char
)
187 (definline sloopy-name-rune-p
(char)
188 (or (name-rune-p char
)
190 (rune= char
#//) ;manche schreiben ganze urls ohne Gaensefuesschen
193 (rune= char
#/#) ;farben werden auch gerne genommen
199 ;; ganzer ECMA-Script Code kommt auch vor
210 (definline sloopy-value-rune-p
(char)
211 (or (sloopy-name-rune-p char
)
214 (definline alpha-rune-p
(char)
215 (or (rune<= #/a char
#/z
)
216 (rune<= #/A char
#/Z
)))
218 (definline upcase-name-rune
(rune)
221 ;;;; --------------------------------------------------------------------------
224 (definline subseq
/rod
(source start end
)
225 ;; Optimized version of subseq for arrays of runes
226 (declare (type rod source
)
227 (type fixnum start end
))
228 (let ((res (make-rod (- end start
))))
229 (declare (type rod res
))
230 (do ((i (- (- end start
) 1) (the fixnum
(- i
1))))
232 (declare (type fixnum i
))
233 (setf (%rune res i
) (%rune source
(the fixnum
(+ i start
)))))))
235 ;;; ---------------------------------------------------------------------------
238 (eval-when (compile load eval
)
239 (defparameter *buf-size
* 4096))
241 (defmacro a-read-byte
(input)
243 `(let ((,c
(runes:read-rune
,input
)))
244 (if (eq ,c
:eof
) nil
,c
))))
246 (defmacro a-peek-byte
(input)
248 `(let ((,c
(runes:peek-rune
,input
)))
249 (if (eq ,c
:eof
) nil
,c
))))
251 (defmacro a-unread-byte
(byte input
)
252 `(runes:unread-rune
,byte
,input
))
254 (defmacro a-stream-position
(input)
255 `(runes:xstream-position
,input
))
257 (defun make-a-stream (&key cl-stream
)
258 (runes:make-xstream cl-stream
:initial-speed
1 :speed
8192))
260 (defmethod runes::read-octets
(sequence (stream html-glisp
:gstream
) start end
)
261 (html-glisp:g
/read-byte-sequence sequence stream
:start start
:end end
))
263 (defmethod runes::xstream
/close
((stream html-glisp
:gstream
))
264 (html-glisp:g
/close stream
))
266 ;; a fake definition -- XXX non-reentrant!
268 (defun a-stream-scratch (input)
269 (getf (runes::xstream-plist input
) 'scratch-pad
))
271 (defun (setf a-stream-scratch
) (new-value input
)
272 (setf (getf (runes::xstream-plist input
) 'scratch-pad
) new-value
))
274 ;;;; -------------------------------------------------------------------------
275 ;;;; Reporting Errors
278 (defun read-tag-error (input format-string
&rest format-args
)
279 (apply #'parse-warn input
4 format-string format-args
)
280 (throw 'read-tag-error
282 (string-rod "##BAD TAG##"))))
286 ;;; 1 - Absolutely normal mumbleing
287 ;;; 2 - Laziness introduced by the HTML standard
288 ;;; 3 - Semantic error upon parsing attributes
289 ;;; 4 - Accidents happening while parsing the structure
290 ;;; 5 - More serve errors
293 (defun parse-warn (input level fmt
&rest args
)
294 (let ((*print-pretty
* nil
)) ;disable ugly^H^H^H^Hpretty printing
295 (when (>= level
*parse-warn-level
*)
296 (unless *options
/parser-silent-p
*
297 (let ((preample (format nil
";; Parser warning: ~11@<Line ~D,~> ~11@<column ~D~>: ~5A "
298 (and input
(ignore-errors (runes:xstream-line-number input
)))
299 (and input
(ignore-errors (runes:xstream-column-number input
)))
300 (make-string level
:initial-element
#\
*))))
301 (fresh-line *trace-output
*)
302 (write-string preample
*trace-output
*)
303 (with-input-from-string (i (apply #'format nil fmt args
))
304 (do ((x (read-line i nil nil
) (read-line i nil nil
))
308 (write-string ";; " *trace-output
*)
309 (dotimes (i (- (length preample
) 3))
310 (write-char #\space
*trace-output
*)))
311 (write-string x
*trace-output
*)
312 (terpri *trace-output
*))))))))
314 ;;; ---------------------------------------------------------------------------
317 (defun read-token (input dtd
)
318 ;; Reads on token from the stream `input'
321 ;; :start-tag name atts
323 ;; :empty-tag name atts
326 ;; :experimental-tag cdata
328 (let ((ch (a-read-byte input
)))
332 (read-tag input dtd
))
334 (a-unread-byte ch input
)
335 (read-pcdata input dtd
)) )))
337 (defun push-on-scratch (input sp ch
)
338 ;; Push the character ch onto the scratch pad of `input' and enlarge if neccessary
339 (setf (aref (a-stream-scratch input
) sp
) ch
)
341 (cond ((= sp
(length (a-stream-scratch input
))) ;end of scratch pad reached?
342 (enlarge-scratch-pad input
)))
345 (defun read-pcdata (input dtd
)
346 (let* ((scratch (a-stream-scratch input
)) ;scratch pad
347 (sp 0) ;pointer into scratch pad
348 (se (length scratch
)) ;end of scratch pad
350 (declare (type (simple-array rune
(*)) scratch
))
351 (declare (type fixnum sp se
))
353 (let ((ch (a-read-byte input
)))
354 ;; FIXME: why was this declared as (u-b 8), not (u-b 16)?
355 ;; a-read-byte returns a rune.
356 ;;; (declare (type (or null (unsigned-byte 8)) ch))
357 (declare (type (or null rune
) ch
))
358 (cond ((null ch
) ;eof
360 ((rune= ch
#/<) ;end of pcdata
361 (a-unread-byte ch input
)
364 (setf sp
(read-entity-ref input dtd sp
))
365 (if (>= sp se
) ;read-entity-ref may enlarge scratchpad
366 (setf scratch
(a-stream-scratch input
)
367 se
(length scratch
))))
369 (setf (aref scratch sp
) ch
) ;recode character read
370 (setf sp
(the fixnum
(+ sp
1)))
371 (cond ((= sp se
) ;end of scratch pad reached?
372 (enlarge-scratch-pad input
)
373 (setf scratch
(a-stream-scratch input
)
374 se
(length scratch
))))))))
376 (subseq/rod scratch
0 sp
)) ))
378 (defun read-entity-ref (input dtd sp
)
379 ;; Reads an entity reference into the stream's scratch pad from position
381 ;; Returns the new write pointer. The initial "&" is already read from the
384 ;; entity-ref ::= "&" "#" <digit>+ (";")?
385 ;; entity-ref ::= "&" "#" "x" <hex-digit>+ (";")?
386 ;; entity-ref ::= "&" <name-start> <name-char>* (";")?
387 (let ((ch (a-read-byte input
)))
388 (cond ((null ch
) ;eof
389 (parse-warn input
3 "EOF in entity")
390 (push-on-scratch input sp
#/&))
391 ((rune= ch
#/#) ;numeric reference?
392 (read-numeric-entity input sp
))
393 ((name-start-rune-p ch
) ;named entity?
394 (read-named-entity input dtd sp ch
))
397 "Saw character '~A' after '&' -- bad entity reference?!"
399 (format nil
"&#x~4,'0X" ch
)))
400 (a-unread-byte ch input
) ;it might be something interesting
401 (push-on-scratch input sp
#/&)) )))
403 (defun read-numeric-entity (input sp
)
405 (let ((ch (a-read-byte input
)))
406 (setf sp
(push-on-scratch input sp
#/&))
407 (setf sp
(push-on-scratch input sp
#/#))
408 (cond ((null ch
) ;eof
409 (parse-warn input
3 "EOF in entity")
413 (read-numeric-entity-aux input
(- sp
2) sp
10 ch
))
417 (setf sp
(push-on-scratch input sp
#/x
))
418 (setf ch
(a-read-byte input
))
420 (parse-warn input
3 "EOF after '&#x'.")
422 ((not (digit-rune-p ch
16))
423 (parse-warn input
3 "Bad character after '&#x'.")
424 (a-unread-byte ch input
)
427 (read-numeric-entity-aux input
(- sp
3) sp
16 ch
))))
429 (a-unread-byte ch input
)
430 (parse-warn input
3 "Bad character after '&#'")
433 (defun read-numeric-entity-aux (input s0 sp radix ch
)
434 ;; Aux routine for read-numeric-entity
435 ;; at s0..sp in the scratch pad is the already read prefix ('&#' or '&#x')
436 ;; Radix is the radix to use (10 or 16)
438 ;; 'ch' is the first digit
440 (setf sp
(push-on-scratch input sp ch
))
441 (do ((ch (a-read-byte input
) (a-read-byte input
)))
442 ((or (null ch
) (not (digit-rune-p ch radix
)))
443 ;; Ok. [s1..sp) now is the digit sequence
444 (let ((num (parse-integer (map 'string
#'rune-char
445 (subseq (a-stream-scratch input
) s1 sp
))
447 (cond ((<= 0 num
#xFFFF
)
448 ;; Proper entity value
449 (when (and (not (null ch
)) (not (rune= ch
#/\
;)))
450 (a-unread-byte ch input
))
451 ;; Rewind scratch pad to `s0' and push character `num'
452 (setf sp
(push-on-scratch input s0
(code-rune num
))))
454 ;; num too large; emit warning and leave scratch pad alone
455 (when (not (null ch
))
456 (a-unread-byte ch input
))
457 (parse-warn input
3 "Numeric enity ~A does not fit into our 16-bit strings; -- ignored."
458 (rod-string (rod-subseq (a-stream-scratch input
) s0 sp
)))))))
459 (setf sp
(push-on-scratch input sp ch
)))
462 (defun read-named-entity (input dtd sp ch
)
463 ;; Just in case we want to leave the entity alone
465 (setf sp
(push-on-scratch input sp
#/&))
467 (setf sp
(push-on-scratch input sp ch
))
468 (do ((ch (a-read-byte input
) (a-read-byte input
)))
469 ((or (null ch
) (not (name-rune-p ch
)))
470 ;; Ok. [s1..sp) now is the name, try to resolve it
471 (let ((nums (find-named-entity dtd
(subseq (a-stream-scratch input
) s1 sp
))))
472 (cond ((not (null nums
))
473 ;; Proper entity value
474 ;; Rewind scratch pad to `s0' and push characters in `nums'
475 (dotimes (i (length nums
))
476 (setf sp
(push-on-scratch input s0
(aref nums i
))))
477 (when (and (not (null ch
)) (not (rune= ch
#/\
;)))
478 (a-unread-byte ch input
)))
480 (when (not (null ch
))
481 (a-unread-byte ch input
))
482 (parse-warn input
3 "[~D] There is no such entity defined: ~A -- ignored."
483 (a-stream-position input
)
484 (rod-string (rod-subseq (a-stream-scratch input
) s0 sp
)))))) )
485 (setf sp
(push-on-scratch input sp ch
))))
488 (defun find-named-entity (dtd fat-string
)
489 (let ((str (rod-string fat-string
)))
490 (let ((r (cdr (assoc str
(sgml::dtd-entities dtd
) :test
#'string
=))))
493 (defun enlarge-scratch-pad (input)
494 (let* ((old (a-stream-scratch input
))
496 (declare (type fixnum se
)
497 (type (simple-array rune
(*)) old
))
498 (let ((new (make-rod (+ (length (a-stream-scratch input
)) *buf-size
*))))
499 (declare (type rod new
))
500 (do ((i (- se
1) (the fixnum
(- i
1))))
502 (declare (type fixnum i
))
503 (setf (aref new i
) (aref old i
)))
504 (setf (a-stream-scratch input
) new
))))
506 ;;; ------------------------------------------------------------
509 ;; tag ::= <start-tag> | <end-tag> | <exp-tag> | <comment>
510 ;; end-tag ::= "<" "/" <name> WSP ">"
511 ;; empty-tag ::= "<" <name> <atts> WSP "/" ">"
512 ;; start-tag ::= "<" <name> <atts> WSP ">"
513 ;; exp-tag ::= "<" "?" <any>* ">"
514 ;; comment ::= "<" "!" "-" "-" (<any>* - ("-" "-")) "-" "-" ">"
517 ;; | <name> WSP "=" WSP <value>
518 ;; value ::= <literal> | <name>
519 ;; literal ::= """ <char>* """
521 ;; atts ::= ( WSP <att> )*
522 ;; WSP ::= <white-space>*
523 ;; name ::= <name-start-char> <name-char>*
524 ;; char ::= <any> | <enitity-ref>
526 (defun read-tag (input dtd
)
527 ;; The "<" is already read.
528 (catch 'read-tag-error
529 (let ((ch (a-peek-byte input
)))
530 (cond ((rune= ch
#//)
532 (read-end-tag input
))
535 (read-define-tag input dtd
))
538 (read-experimental-tag input
))
539 ((and (not (null ch
)) (name-start-rune-p ch
))
540 (read-start-tag input dtd
))
542 (parse-warn input
3 "Bad character after '<': '~A' -- ignored."
544 (let ((res (string-rod "<")))
545 (values :pcdata res
))) ))) )
547 (defun read-start-tag (input dtd
)
548 (multiple-value-bind (name atts
) (read-name-and-attributes input dtd
)
549 (let ((ch (a-read-byte input
)))
551 (read-tag-error input
"EOF inside tag"))
553 (values :start-tag name atts
))
556 "A '<' ended this tag.")
557 (a-unread-byte ch input
)
558 (values :start-tag name atts
))
560 (setf ch
(a-read-byte input
))
561 (cond ((rune= ch
#/>)
562 (values :empty-tag name atts
))
564 (read-tag-error input
"Expected '>' after '<' .. '/'"))))
566 (read-tag-error input
"Expected '>'")) ))))
568 (defun read-end-tag (input)
569 (let ((name (read-name input
)))
570 (skip-white-space input
)
571 (let ((ch (a-read-byte input
)))
573 (read-tag-error input
"In end tag: Expected '>' got end-of-file instead."))
575 (values :end-tag name
))
577 (read-tag-error input
"In end tag: Expected '>'")) ))))
579 (defun read-name-and-attributes (input dtd
)
580 (let ((name (read-name input
))
583 (skip-white-space input
)
584 (cond ((member (a-peek-byte input
) '(#/< #/> #//) :test
#'eql
)
586 (push (read-attribute input dtd
) atts
))
587 (values name
(nreverse atts
)) ))
589 (defun read-name (input)
590 (let ((ch (a-peek-byte input
))
592 (cond ((and (not (null ch
)) (name-start-rune-p ch
))
593 (do ((ch (a-read-byte input
) (a-read-byte input
)))
594 ((not (and ch
(name-rune-p ch
)))
596 (a-unread-byte ch input
))
597 (subseq/rod
(a-stream-scratch input
) 0 sp
))
598 (setf sp
(push-on-scratch input sp
(upcase-name-rune ch
)))))
600 (read-tag-error input
"Not a name")) )))
602 (defun skip-white-space (input)
603 (do ((ch (a-read-byte input
) (a-read-byte input
)))
604 ((not (and ch
(white-space-rune-p ch
)))
605 (when ch
(a-unread-byte ch input
)))))
607 (defun read-attribute (input dtd
)
608 (skip-white-space input
)
609 (let ((slot (read-sloopy-name input
)))
610 ;;(print (list 'slot '= (mungle slot) (mungle (vector (a-peek-byte input)))))
611 (skip-white-space input
)
612 (let ((c (a-peek-byte input
)))
613 (cond ((and (not (null c
)) (rune= c
#/=))
615 (skip-white-space input
)
616 (let ((value (read-value input dtd
)))
621 (defun read-value (input dtd
)
622 (let ((ch (a-peek-byte input
)))
623 (cond ((rune= ch
#/')
625 (read-literal input dtd ch
))
628 (read-literal input dtd ch
))
629 ((and ch
(sloopy-name-rune-p ch
))
630 (read-sloopy-value input
))
632 (read-tag-error input
"Bad value '~A' seen"
634 (format nil
"U+~4,'0X" (rune-code ch
))))))))
636 (defun read-literal (input dtd delim
)
637 (let* ((scratch (a-stream-scratch input
)) ;scratch pad
638 (sp 0) ;pointer into scratch pad
639 (se (length scratch
)) ;end of scratch pad
641 (declare (type rod scratch
))
642 (declare (type fixnum sp se
))
644 (let ((ch (a-read-byte input
)))
645 ;; FIXME: why was this declared as (u-b 8), not (u-b 16)?
646 ;; a-read-byte returns a rune.
647 ;;; (declare (type (or null (unsigned-byte 8)) ch))
648 (declare (type (or null rune
) ch
))
649 (cond ((null ch
) ;eof
650 (read-tag-error input
"Eof in literal"))
654 (setf sp
(read-entity-ref input dtd sp
)))
656 (setf (aref scratch sp
) ch
) ;recode character read
657 (setf sp
(the fixnum
(+ sp
1)))
658 (cond ((= sp se
) ;end of scratch pad reached?
659 (enlarge-scratch-pad input
)
660 (setf scratch
(a-stream-scratch input
)
661 se
(length scratch
))))))))
662 (subseq/rod scratch
0 sp
) ))
664 (defun read-sloopy-name (input)
665 (let ((ch (a-peek-byte input
))
667 (cond ((and (not (null ch
)) (sloopy-name-rune-p ch
))
668 (do ((ch (a-read-byte input
) (a-read-byte input
)))
669 ((not (and ch
(sloopy-name-rune-p ch
)))
671 (a-unread-byte ch input
))
672 (subseq/rod
(a-stream-scratch input
) 0 sp
))
673 (setf sp
(push-on-scratch input sp ch
))))
675 (read-tag-error input
676 "Expected sloopy name, got ~A"
677 (or (rune-char ch
) (format nil
"U+~4,'0X" ch
)) )) )))
679 (defun read-sloopy-value (input)
680 (let ((ch (a-peek-byte input
))
682 (cond ((and (not (null ch
)) (sloopy-value-rune-p ch
))
683 (do ((ch (a-read-byte input
) (a-read-byte input
)))
684 ((not (and ch
(sloopy-value-rune-p ch
)))
686 (a-unread-byte ch input
))
687 (subseq/rod
(a-stream-scratch input
) 0 sp
))
688 (setf sp
(push-on-scratch input sp ch
))))
690 (read-tag-error input
"Expected sloopy value, got ~A"
691 (or (rune-char ch
) (format nil
"U+~4,'0X" ch
)) )) )))
693 (defun read-define-tag (input dtd
)
694 (let ((ch (a-peek-byte input
)))
696 (read-tag-error input
"unexpected EOF"))
698 ;; empty define tag -- to be ignored
700 (read-token input dtd
))
704 (let ((ch (a-peek-byte input
)))
705 (cond ((and (not (null ch
)) (rune= ch
#/-
))
706 (read-comment input
))
708 (read-tag-error input
"Expected '-' after \"<!-\"")))))
710 (read-define-tag-2 input
)) )))
712 (defun read-define-tag-2 (input)
714 ;; we simply slurp until '>'
716 (do ((ch (a-read-byte input
) (a-read-byte input
)))
717 ((and ch
(rune= ch
#/>))
718 (values :define-tag
(subseq/rod
(a-stream-scratch input
) 0 sp
)))
719 (setf sp
(push-on-scratch input sp ch
))) ))
721 (defun read-comment (input)
722 (a-read-byte input
) ;consume the '-'
724 (c1 (or (a-read-byte input
) (read-tag-error input
"Unexpected EOF")))
725 (c2 (or (a-read-byte input
) (read-tag-error input
"Unexpected EOF")))
731 c2
(a-read-byte input
))
733 (read-tag-error input
"EOF within comment."))
738 ((and *gt-ends-comment-p
*
740 (parse-warn input
3 "A '>' ends this comment.")
742 (cond ((and (rune= c0
#/-
) (rune= c1
#/-
))
744 (parse-warn input
4 "\"--\" seen within comment; This is strongly depreciated.")
746 (setf sp
(push-on-scratch input sp c0
)))
747 (values :comment
(subseq/rod
(a-stream-scratch input
) 0 sp
)) ))
749 ;;;; ------------------------------------------------------------------------------------------
751 (defun name-start-char-p (ch)
754 (defun name-char-p (ch)
755 (or (alphanumericp ch
) (char= ch
#\.
) (char= ch
#\-
)) )
757 (defun valid-name-string-p (string)
758 "Is the string `string' a valid name string according to the SGML
760 (and (> (length string
) 0)
761 (name-start-char-p (char string
0))
762 (every #'name-char-p string
)) )
764 ;;;; ------------------------------------------------------------------------------------------
765 ;;;; Resolving Entities
768 ;;;; TODO: Check that numeric entities are within 0..#xFFFF;
770 (defun resolve-numeric-entity (string start end
) ; --> string ; new start
771 (let ((j (or (position-if-not #'digit-rune-p string
:start start
:end end
) end
)))
773 (let ((n (parse-integer (rod-string (rod-subseq string start j
)) :radix
10)))
775 (if (and (< j end
) (rune= (rune string j
) #/\
;))
779 (defun resolve-hex-entity (string start end
) ; --> string ; new start
780 ;; Resolves a hexadecimal entity like "*", start should point
781 ;; to the character directy after the '&#x'.
782 (let ((j (or (position-if-not (rcurry #'digit-rune-p
16) string
:start start
:end end
) end
)))
784 (let ((n (parse-integer (rod-string (rod-subseq string start j
)) :radix
16)))
786 (if (and (< j end
) (rune= (rune string j
) #/\
;))
790 (defun resolve-named-entity (string entities start end
&optional input
)
791 ;; --> string ; new start
792 (let ((j (or (position-if-not #'name-start-rune-p string
:start start
:end end
) end
)))
794 (or (dolist (k entities
)
795 (when (and (= (length (car k
)) (- j start
))
796 ;; XXX this compare conses far too much!
797 (rod= (string-rod (subseq (car k
) 0 (- j start
)))
798 (rod-subseq string start j
)))
799 (return (string-rod (cdr k
))) ) )
800 (dolist (k entities nil
)
801 (when (and (>= (length (car k
)) (- j start
))
803 (rod= (string-rod (subseq (car k
) 0 (- j start
)))
804 (rod-subseq string start j
)))
805 (return (string-rod (cdr k
))) )))))
806 (cond ((not (null res
))
808 (resolve-entities-in-string res entities
0 (length res
) input
) ;right?
809 (if (and (< j end
) (rune= (rune string j
) #/\
;))
813 (parse-warn input
3 "Entity &~a; is not defined." (subseq string start j
))
814 (values (subseq string
(1- start
) j
) j
))))))
816 (defun resolve-entities-in-string (string entities
817 &optional
(start 0) (end (length string
)) input
)
818 ;; Resolve all entity references introduced by "&" in the string
819 ;; `string'. `start' and `end' specify a substring to operate on.
820 ;; For error messages `input' may be the input stream the data is
821 ;; coming from originally.
822 (let ((i (position #/& string
:start start
:end end
:test
#'rune
=)))
824 ;; no further entities in string -- all done
825 (rod-subseq string start end
))
826 ((and (< (+ i
2) end
)
827 (rune= #/# (rune string
(+ i
1)))
828 (digit-rune-p (rune string
(+ i
2))))
829 ;; numeric entity seen
830 (multiple-value-bind (res j
) (resolve-numeric-entity string
(+ i
2) end
)
831 (concatenate 'rod
(subseq string start i
)
833 (resolve-entities-in-string string entities j end input
))))
834 ((and (< (+ i
2) end
)
835 (rune= #/# (rune string
(+ i
1)))
836 (rune= #/x
(rune string
(+ i
2))))
837 ;; hexadecimal entity seen
838 (multiple-value-bind (res j
) (resolve-hex-entity string
(+ i
3) end
)
839 (concatenate 'rod
(subseq string start i
)
841 (resolve-entities-in-string string entities j end input
))))
842 ((and (< (+ i
1) end
)
843 (alpha-rune-p (aref string
(+ i
1))))
844 ;; this must be a named entity
845 (multiple-value-bind (res j
)
846 (resolve-named-entity string entities
(+ i
1) end input
)
847 (concatenate 'rod
(subseq string start i
)
849 (resolve-entities-in-string string entities j end input
))))
851 ;; no entity reference.
852 (concatenate 'rod
(subseq string start i
)
854 (resolve-entities-in-string string entities
(+ i
1) end input
))))))
857 ;;;; ------------------------------------------------------------------------------------------
858 ;;;; Mungling of Attribute values
861 (let ((kw-pkg (find-package :keyword
)))
865 (defun canon-value (input dtd tag slot value
)
866 (let* ((attlist (find-element-attlist dtd tag
))
867 (looked (assoc slot attlist
)))
868 (cond ((and looked
(listp (cadr looked
)))
869 (or (find value
(cadr looked
)
870 :test
#'(lambda (x y
)
871 (string-equal (string x
) (string y
))))
873 ;; Oh yeah! monster format strings are fun!
875 "~S is a bad value for the '~A' slot of '<~A>', which could ~
876 ~{~#[not take any value~;only take '~A'~:;take one of ~@{'~A'~#[~; or ~:;, ~]~}~]~:}."
877 value slot tag
(cadr looked
)))))
878 ((member (cadr looked
) '(:number
))
879 (or (maybe-parse-integer value
)
881 (parse-warn input
3 "~S is not NUMBER (attribute '~A' of '<~A>')."
884 ((member (cadr looked
) '(:name
:id
))
885 (cond ((valid-name-string-p value
)
886 (kintern (string-upcase value
)))
888 (parse-warn input
3 "~S is not NAME (attribute '~A' of '<~A>')."
894 (parse-warn input
3 "The '<~A>' element has no '~A' slot." tag slot
)
897 (defun find-slot-value-pair (input dtd tag value
)
898 (let* ((attlist (find-element-attlist dtd tag
))
900 (dolist (att attlist
)
901 (cond ((and (listp (cadr att
))
902 (setq looked
(find value
(cadr att
)
903 :test
#'(lambda (x y
)
904 (string-equal (string x
) (string y
))))))
905 (return-from find-slot-value-pair
(values (car att
) looked
)))))
908 "The '<~A>' tag has no slot which could take the '~A' keyword.~%~
909 ~1{~#[There are no possible slots at all.~;~
910 Only possible slot is:~:;~
911 Possible slots are:~]~
912 ~@{~& ~1{Slot '~A'~20T could ~{~#[not take any value~;~
914 take one of ~@{'~A'~#[~; or ~:;, ~]~}~].~:}~}~}~:}"
915 tag value
(remove-if-not #'(lambda (x) (consp (cadr x
))) attlist
)) ))
918 ;;;; ------------------------------------------------------------
920 (defun read-token* (input dtd
)
921 ;;(skip-white-space input)
922 (multiple-value-bind (kind a b
) (read-token input dtd
)
924 (:pcdata
(make-start-tag :name
:pcdata
:atts a
))
926 (let ((name (kintern (rod-string a
))))
927 (if (tag-exists? dtd name
)
928 (make-start-tag :name name
:atts
(mungle-attlist dtd name b
))
930 (parse-warn input
4 "There is no such thing as <~A> -- ignored." name
)
931 (read-token* input dtd
)))))
933 (let ((name (kintern (rod-string a
))))
934 (if (tag-exists? dtd name
)
935 (make-end-tag :name name
)
937 (parse-warn input
4 "There is no such thing as </~A> -- ignored." name
)
938 (read-token* input dtd
)))) )
940 (parse-warn input
2 "Oops -- there is an empty tag; but this is only HTML?!")
941 (let ((name (kintern (rod-string a
))))
942 (if (tag-exists? dtd name
)
943 (make-start-tag :name name
:atts
(mungle-attlist dtd name b
))
945 (parse-warn input
4 "There is no such thing as <~A> -- ignored." name
)
946 (read-token* input dtd
))))
947 ;;(read-token* input dtd)
949 (:define-tag
(read-token* input dtd
))
951 (parse-warn input
2 "Ignoreing processing instruction tag: '~A'" (mungle a
))
952 (read-token* input dtd
))
954 (make-comment-token :data a
))
956 (make-end-tag :name
:%top
)) )))
958 (defun tag-exists?
(dtd name
)
960 ;;(not (eq name :font)) ;xxx
961 ;;(not (eq name :center)) ;xxx
962 ;;(not (eq name :div)) ;xxx
963 ;;(not (eq name :img))
964 ;;(not (eq name :form))
965 (not (eq name
:noscript
)) ;needed for www.sgi.com
966 (sgml::find-element dtd name nil nil
)))
969 (cond ((integerp r
) (string-rod (prin1-to-string r
)))
970 ((symbolp r
) (string-rod (princ-to-string r
)))
971 ((stringp r
) (string-rod r
))
973 (error "foofoo: Hmm ~S ?!" r
))))
975 ;;; The renderer might depend on upper-case attribute values, so let's leave
976 ;;; this off by default. For the benefit of html <-> xml conversions we
977 ;;; don't want to check the DTD every time we convert an attribute though,
978 ;;; so we need this mode for lower-case attribute values.
979 (defvar *unmungle-attribute-case
* nil
)
981 (defun mungle-attlist (dtd tag atts
)
984 ;; this clause isn't unicode-safe
985 (multiple-value-bind (slot value
)
986 (sgml::find-slot-value-pair nil dtd tag
(mungle x
))
988 (setf value
(foofoo value
))
989 (when *unmungle-attribute-case
*
990 (setf value
(rod-downcase value
))))
994 (let ((slot (kintern (string-upcase (mungle (car x
))))))
995 (list slot
(cdr x
))))))
998 (defun read-experimental-tag (input)
1000 ;; we simply slurp until '>'
1002 (do ((ch (a-read-byte input
) (a-read-byte input
)))
1003 ((and ch
(rune= ch
#/>))
1004 (values :experimental-tag
(subseq/rod
(a-stream-scratch input
) 0 sp
)))
1005 (setf sp
(push-on-scratch input sp ch
))) ))
1008 ;;; ---------------------------------------------------------------------------
1012 (defun handle-meta-tag-in-parser (input attrs
)
1013 (when (rod-equal (string-rod "content-type") (getf attrs
:http-equiv
))
1014 (let ((content-type (getf attrs
:content
)))
1016 (multiple-value-bind (type subtype parameters
)
1017 (closure-mime-types:parse-mime-content-type
1018 (rod-string content-type
))
1019 (declare (ignore type subtype
))
1020 (let ((cs (assoc :charset parameters
:test
#'string-equal
)))
1022 (setup-code-vector input
(cdr cs
)))))))))
1024 (defun sgml-parse (dtd input
)
1025 (let* ((stack (list (make-start-tag :name
:%top
:atts nil
)))
1026 (s (sgml::make-pt
/low
:name
'top
))
1033 ((or eof?
(not (null (cdr eingabe
)))))
1034 (let ((tok (read-token* input dtd
)))
1035 (when (and (end-tag-p tok
) (eq (tag-name tok
) :%top
))
1037 (setf eingabe
(nconc eingabe
(list tok
)))))
1038 (multiple-value-setq (stack eingabe ausgabe
) (transition input dtd stack eingabe
))
1039 (cond ((eq ausgabe
:accept
)
1042 ((eq ausgabe
:error
)
1045 ((eq ausgabe
:close
)
1046 (setf s
(sgml:pt-parent s
)))
1048 ((eq (car ausgabe
) :comment
)
1049 (setf (sgml:pt-children s
)
1050 (nconc (sgml:pt-children s
) (list (sgml::make-pt
/low
1051 :name
(cadr ausgabe
)
1052 :attrs
(caddr ausgabe
)
1056 ((eq (car ausgabe
) :open
)
1057 ;; Hack here to support <meta http-equiv="Content-Type" ...>
1058 (cond ((and (eq (cadr ausgabe
) :meta
))
1059 (handle-meta-tag-in-parser input
(caddr ausgabe
))))
1060 ;; when the BODY tag is openend, switch the streams speed to full speed.
1061 (cond ((and (eq (cadr ausgabe
) :body
))
1062 (setf (runes::xstream-speed input
)
1063 (length (runes::xstream-os-buffer input
)))))
1064 (let ((n (sgml::make-pt
/low
1065 :name
(cadr ausgabe
)
1066 :attrs
(caddr ausgabe
)
1069 (setf (sgml:pt-children s
) (nconc (sgml:pt-children s
) (list n
))
1072 ((eq (car ausgabe
) :open
/close
)
1073 ;; code duplication alert!
1074 ;; Hack here to support <meta http-equiv="Content-Type" ...>
1075 (cond ((and (eq (cadr ausgabe
) :meta
))
1076 (handle-meta-tag-in-parser input
(caddr ausgabe
))))
1077 (let ((n (sgml::make-pt
/low
1078 :name
(cadr ausgabe
)
1079 :attrs
(caddr ausgabe
)
1082 (setf (sgml:pt-children s
) (nconc (sgml:pt-children s
) (list n
))
1084 (setf s
(sgml:pt-parent s
)))
1086 ((eq (car ausgabe
) :htag
)
1087 ;; code duplication alert!
1088 (let* ((v (cadr ausgabe
))
1089 (n (cond ((start-tag-p v
)
1092 :attrs
(start-tag-atts v
)
1102 (error "fix your code.")))))
1103 (setf (sgml:pt-children s
) (nconc (sgml:pt-children s
) (list n
))) ))
1107 (defun empty-element-p (dtd gi
)
1108 (null (elm-inclusion dtd gi
)))
1110 (defun transition (input dtd stack eingabe
) ;; --> stack', eingabe', ausgabe
1111 (cond ((and stack
(eq (tag-name (car stack
)) :pcdata
))
1117 (values nil nil
:accept
))
1119 (values stack eingabe
:error
))))
1120 ;; (aS, </a>W) -> (S, W, </a>)
1121 ((comment-token-p (car eingabe
))
1124 (if (member (tag-name (car stack
)) *preserves-comments-elements
*)
1126 (list :comment
:pcdata
(comment-token-data (car eingabe
))))
1129 ((and (tag-p (car eingabe
))
1130 (tag-as-marker-p (tag-name (car eingabe
))))
1133 (list :htag
(car eingabe
))))
1135 ((and (end-tag-p (car eingabe
))
1137 (eq (tag-name (car stack
))
1138 (tag-name (car eingabe
))))
1143 ((and (start-tag-p (car eingabe
))
1145 (member (tag-name (car eingabe
)) (elm-inclusion dtd
(tag-name (car stack
)))) )
1147 ((empty-element-p dtd
(tag-name (car eingabe
)))
1150 (list :open
/close
(tag-name (car eingabe
)) (start-tag-atts (car eingabe
)))))
1152 (values (cons (car eingabe
) stack
)
1154 (list :open
(tag-name (car eingabe
)) (start-tag-atts (car eingabe
)))))))
1156 ((and (white-space-token-p (car eingabe
))
1158 (not (member :pcdata
(elm-inclusion dtd
(tag-name (car stack
))))))
1160 (values stack
(cdr eingabe
) nil
))
1162 (error "Oops empty stack in TRANSITION on ~S." eingabe
))
1164 (let ((x (resolve dtd
(tag-name (car stack
)) (car eingabe
))))
1170 (heuristic input dtd
(car stack
) eingabe
)
1173 (defun tag-as-marker-p (gi)
1174 (and *font-heuristic-p
*
1176 (and *anchor-heuristic-p
*
1179 (defun white-space-token-p (x)
1180 (and (start-tag-p x
)
1181 (eq (tag-name x
) :pcdata
)
1182 (every #'white-space-rune-p
(start-tag-atts x
))))
1184 ;;; Heuristic conflict resolution
1186 (defun shortest-different-beginning (x y
)
1194 (return-from shortest-different-beginning
1195 (values (subseq x
0 i
)
1196 (subseq y
0 j
)))))))
1198 (defun document-action (input context alte-eingabe neue-eingabe is-default-p
)
1199 (let ((see (car alte-eingabe
)))
1200 (parse-warn input
4 "[~A] Saw ~A in ~A ~A"
1201 (if is-default-p
"-" "H") see context
1202 (multiple-value-bind (a b
) (shortest-different-beginning alte-eingabe neue-eingabe
)
1203 (cond ((and (null a
) (null b
))
1204 (format nil
"-- ??? patched ~S -> ~S" alte-eingabe neue-eingabe
))
1206 (format nil
"-- nuked~{ ~A~}." a
))
1208 (format nil
"-- inserted~{ ~A~}." b
))
1210 (format nil
"-- patched~{ ~A~} ->~{ ~A~}." a b
)) )) )))
1212 (defun heuristic (input dtd context eingabe
)
1213 (let ((see (car eingabe
))
1215 (labels ((is (tag state
)
1216 (and (elms-eqv dtd state
(tag-name context
))
1217 (if (char= (char (symbol-name tag
) 0) #\
/)
1218 (and (end-tag-p (car eingabe
))
1220 (tag-name (car eingabe
))
1221 (kintern (subseq (symbol-name tag
) 1))))
1222 (and (start-tag-p (car eingabe
))
1223 (elms-eqv dtd
(tag-name (car eingabe
)) tag
))))))
1225 (cond ((and (member :HTML
(find-dtd-top-elements dtd
))
1226 (cond ((and (end-tag-p see
) (eq (tag-name see
) :%top
))
1227 (cons (elm-etag (tag-name context
)) eingabe
))
1229 ((and (start-tag-p see
)
1230 (eq (tag-name see
) :style
))
1231 (cons (make-start-tag :name
:SPAN
1232 :atts
(list :class
(rod "illegalstyle")))
1235 ((is :center
#|in|
# :h1
)
1236 (list* (elm-etag (tag-name context
))
1237 (car eingabe
) context
(cdr eingabe
)))
1242 (list* (elm-etag (tag-name context
)) (car eingabe
) context
(cdr eingabe
)))
1244 ((or (is :center
#|in|
# :font
)
1245 (is :p
#|in|
# :font
))
1246 ;; Uff -- the attributes of FONT are lost here.
1247 ;; we have to extend `context' to include these.
1248 (list* (elm-etag (tag-name context
)) (car eingabe
) context
(cdr eingabe
)))
1250 (list* (elm-etag (tag-name context
)) (car eingabe
) context
(cdr eingabe
)))
1251 ((elms-eqv dtd
(tag-name context
) ':font
)
1252 (cons (elm-etag (tag-name context
)) eingabe
))
1253 ((is :tr
#|in|
# :center
)
1254 (cons (elm-etag (tag-name context
)) eingabe
))
1255 ((is :/h2
#|in|
# :h1
)
1256 (cons (elm-etag (tag-name context
)) eingabe
))
1257 ((is :ul
#|in|
# :h1
)
1258 (cons (elm-etag (tag-name context
)) eingabe
))
1259 ((is :/center
#|in|
# :h3
)
1260 (cons (elm-etag (tag-name context
)) eingabe
))
1261 ((is :/td
#|in|
# :div
)
1262 (cons (elm-etag (tag-name context
)) eingabe
))
1264 (cons (make-start-tag :name
:li
) eingabe
))
1266 (cons (make-start-tag :name
:li
) eingabe
))
1267 ((is :img
#|in|
# :ul
)
1268 (cons (make-start-tag :name
:li
) eingabe
))
1270 ((is :pcdata
#|in|
# :ul
)
1271 (cons (make-start-tag :name
:li
) eingabe
))
1273 ((is :td
#|in|
# :table
)
1274 (list* (make-start-tag :name
:tr
)
1276 ((is :pcdata
#|in|
# :table
)
1277 (list* (elm-etag (tag-name context
)) (car eingabe
) context
1280 ((is :frameset
#|in|
# :noscript
)
1281 (list* (elm-etag (tag-name context
)) eingabe
))
1283 ((is :form
#|in|
# :table
)
1284 (list* (elm-etag (tag-name context
))
1289 ((is :/form
#|in|
# :tbody
)
1290 ;; we should better check here wether FORM is open at all.
1291 (list* (cadr eingabe
)
1295 ;; new as of 1999-08-31
1296 ((is :td
#|in|
# :li
)
1297 (list* (elm-etag (tag-name context
))
1299 ((is :/td
#|in|
# :li
)
1300 (list* (elm-etag (tag-name context
))
1302 ((is :td
#|in|
# :ul
)
1303 (list* (elm-etag (tag-name context
))
1305 ((is :/td
#|in|
# :ul
)
1306 (list* (elm-etag (tag-name context
))
1311 (setq is-default-p t
)
1313 (document-action input
(tag-name context
) eingabe neu is-default-p
)
1316 (defun parse-html (input &optional
(charset :iso-8859-1
))
1317 (let ((dtd closure-html
:*html-dtd
*))
1318 (let ((input (runes:make-xstream input
:initial-speed
1 :speed
128)))
1319 (setf (a-stream-scratch input
)
1320 (make-array #.
(* 2 *buf-size
*) :element-type
'rune
))
1321 (setup-code-vector input charset
)
1322 (let ((r (sgml-parse dtd input
)))
1323 (post-mortem-heuristic dtd r
)))) )
1325 (defun post-mortem-heuristic (dtd parse-tree
)
1326 "Do possible post mortem heuristic on a parse tree."
1327 (when *font-heuristic-p
*
1328 (setf parse-tree
(post-mortem/fix-font dtd parse-tree
)))
1329 (setf parse-tree
(post-mortem/fix-top-level-structure parse-tree
))
1332 (defun post-mortem/fix-top-level-structure
(parse-tree)
1333 ;; The hacking below is needed because of buggy input. Something like this
1334 ;; <base ..> <html> <head> .. <body> ... </html>
1335 ;; are two documents in one. Here we merge then into one HTML document.
1336 ;; Note that this defeats later incremental rendering pretty well.
1337 ;; Also this is HTML specific.
1338 ;; Das muss dennoch alles noch anders werden hier.
1339 (let ((r parse-tree
)
1343 (head-warn-flag nil
)
1344 (body-warn-flag nil
)
1347 (when (> (length (pt-children r
)) 1)
1348 (parse-warn nil
4 "Multiple HTML elements in document."))
1349 (dolist (k (pt-children r
))
1352 (dolist (k (pt-children k
))
1356 (setf head-warn-flag t
))
1357 (setf head-elts
(nconc head-elts
(pt-children k
))))
1361 (setf body-warn-flag t
))
1362 (setf body-elts
(nconc body-elts
(pt-children k
))))
1365 (setf frameset-elts
(nconc frameset-elts
(pt-children k
))) ))))))
1366 (when head-warn-flag
1367 (parse-warn nil
4 "Multiple HEAD elements."))
1368 (when body-warn-flag
1369 (parse-warn nil
4 "Multiple BODY elements."))
1370 (let* ((html (make-pt/low
:name
:html
:parent nil
))
1371 (head (make-pt/low
:name
:head
:parent html
:children head-elts
)))
1374 (parse-warn nil
4 "Body present while FRAMSET is present -- nuked!")
1375 (setf body nil body-elts nil
))
1377 (setf (pt-parent frameset
) html
1378 (pt-children frameset
) frameset-elts
)
1382 (setf (pt-parent body
) html
1383 (pt-children body
) body-elts
))
1385 (setf body
(make-pt/low
:name
:body
1387 :children body-elts
))))))
1388 (dolist (k head-elts
) (setf (pt-parent k
) head
))
1389 (dolist (k body-elts
) (setf (pt-parent k
) body
))
1390 (dolist (k frameset-elts
) (setf (pt-parent k
) frameset
))
1391 (setf (pt-children html
)
1392 (append (and head
(list head
))
1393 (and body
(list body
))
1394 (and frameset
(list frameset
))))
1397 ;;; FONT post mortem heuristic
1398 ;;; ==========================
1400 ;; Since FONT is by far the most often misused element, we handle these by a
1401 ;; post mortem analysis. If *font-heuristic-p* is true, parser does not care
1402 ;; for FONT start or end tags, but inserts HSTAG-NODEs and HETAG-NODEs into
1403 ;; the parse tree blindly. (Thus emulating the Mosaic approach to rendering).
1408 ;; <p> <font> foo <b> bar </font> baz </b>
1410 ;; is then parsed as:
1412 ;; (P #<HSTAG font> "foo" (B "bar" #<HETAG font> "baz"))
1415 ;; We now always want to return a parse tree, which conforms to the DTD, and
1416 ;; thus have to mungle this somehow into a sane HTML parse tree by wraping
1417 ;; FONT nodes around the right parts.
1419 ;; The first thing we do is to find all pairs of HSTAG, HETAG
1420 ;; nodes. We then use the following algorithm:
1428 ;; if S and E are on the same level then // [*] that is E, S have the same parent
1430 ;; ;; partitionate p.children as:
1431 ;; p.children = (,@sb S ,@si E ,@se)
1433 ;; ;; FONT element spans nothing, so forget it
1435 ;; if p may contain FONT and
1436 ;; for all x in si: FONT may contain x
1438 ;; p.children <- (,@sb (font ,@si) ,@se)
1441 ;; if S is higher in tree than E then
1442 ;; V <- ancestor of E, with V.parent = S.parent
1443 ;; insert a copy of E directly before E
1444 ;; insert a copy of S directly before first child of V
1445 ;; apply the algorithm recursively
1447 ;; if E is higher in tree than S then
1448 ;; ;; this analog to the case above
1449 ;; V <- ancestor of S, with V.parent = E.parent // this ancestor need not to exist!
1450 ;; insert a copy of S directly after V
1451 ;; insert a copy of E direclly after last child of V
1452 ;; apply the algorithm recursively
1455 ;; [*] This is not right. This was an thinko: Two nodes on the same
1456 ;; level do not always have the parent nodes.
1458 ;; NOTE: My first formulation of the algorithm used access pathen and the
1459 ;; implementation below does also -- this is inefficient and should be
1462 ;; NOTE: It would be nice, if we could prove that switching on
1463 ;; *FONT-HEURISTIC-P* does not hurd any conforming document.
1465 ;; When the need araises, we could also use the same method to deal
1466 ;; malicious B, I, et al tags; But bad nesting isn't that popular any more
1469 ;; Expirience however showed, that we would need something similar for FORM,
1470 ;; since people have the habit of spitting a FORM anywhere they see
1471 ;; fit. [Most popular is right between TR and TD]. Visit www.deja.com and you
1472 ;; see what I mean. FORM elements are extremely important, since one may be
1473 ;; able to cope with gliberish on the screen, but not with non-functional
1474 ;; forms. Another thing here is <INPUT> elements in inlegal spots.
1478 ;; We still want to emit parser warnings, if FONT elements are misused.
1480 ;;;(defun post-mortem/fix-font (dtd parse-tree)
1481 ;;; (declare (special q))
1482 ;;; (map-htag-pairs (lambda (stag etag)
1484 ;;; (setf s (pt-path parse-tree stag))
1485 ;;; (remove-pt stag)
1486 ;;; (setf e (pt-path parse-tree etag))
1487 ;;; (remove-pt etag)
1488 ;;; (mungle-font-pair dtd parse-tree stag s e)))
1489 ;;; parse-tree :font)
1492 (defun mungle-font-pair (dtd root tag s e
)
1493 ;; NOTE: ignore-errors is needed, since ELM-INCLUSION checks for existing
1494 ;; element names (this is introduced by our pseudo TOP element).
1496 ((and (= (length s
) (length e
))
1497 (equal (butlast s
) (butlast e
)))
1498 ;; implicit assert was (butlast s) equal (butlast e), which is not always true.
1499 (let ((p (pt-access root
(butlast s
))))
1500 (cond ((and (member :font
(ignore-errors (elm-inclusion dtd
(gi p
))))
1501 (do ((j (car (last s
)) (+ j
1)))
1502 ((>= j
(car (last e
))) t
)
1503 (when (not (null (elt (pt-children p
) j
)))
1504 ;; this should always be true!
1505 (unless (member (gi (elt (pt-children p
) j
))
1506 (ignore-errors (elm-inclusion dtd
:font
)))
1508 (let* ((before (loop for i from
0 to
(1- (car (last s
)))
1509 collect
(elt (pt-children p
) i
)))
1510 (between (loop for i from
(car (last s
)) to
(1- (car (last e
)))
1511 collect
(elt (pt-children p
) i
)))
1512 (after (loop for i from
(car (last e
)) to
(1- (length (pt-children p
)))
1513 collect
(elt (pt-children p
) i
)))
1514 (new (make-pt/low
:name
(gi tag
)
1515 :attrs
(pt-attrs tag
)
1518 (setf (pt-children p
) (append before
(list new
) after
))
1520 (setf (pt-parent k
) new
)) ))
1522 (loop for i from
(car (last s
)) to
(1- (car (last e
))) do
1525 (append (butlast s
) (list i
0))
1527 (list i
(length (pt-children (elt (pt-children p
) i
)))))))))))
1528 ((and (< (length s
) (length e
))
1529 '(equal (subseq e
(length s
)) s
))
1530 (let ((v (subseq e
0 (length s
))))
1531 (mungle-font-pair dtd root tag
(append v
(list 0)) e
)
1532 (mungle-font-pair dtd root tag s v
)))
1533 ((and (> (length s
) (length e
))
1534 '(equal (subseq s
(length e
)) e
))
1535 (let* ((v (subseq s
0 (length e
)))
1537 (mungle-font-pair dtd root tag
1538 (append (butlast v
) (list (+ 1 (car (last v
))))) e
)
1539 (setf m
(length (pt-children (pt-access root v
))))
1540 (mungle-font-pair dtd root tag
1541 s
(append v
(list m
)))))
1544 ;; search to first common parent
1546 ((or (= i
(length s
))
1548 (not (eq (elt s i
) (elt e i
))))
1549 (setf p
(subseq s
0 i
))))
1550 (let* ((v1 (append (subseq s
0 (length p
))
1551 (list (+ 1 (elt s
(length p
))))))
1553 (mungle-font-pair dtd root tag s v1
)
1554 (mungle-font-pair dtd root tag v2 e
))))
1557 (defun map-htag-pairs (continuation parse-tree gi
)
1558 ;; this function maps the inner htags first.
1561 (cond ((and (hstag-node-p x
) (eq (gi x
) gi
))
1563 ((and (hetag-node-p x
) (eq (gi x
) gi
))
1566 (parse-warn nil
4 "Superfluous ~A end tag." gi
))
1568 (funcall continuation
(pop stack
) x
))))
1570 (mapc #'walk
(pt-children x
))))))
1572 ;; take care for non-closed start tags
1573 (unless (null stack
)
1574 (parse-warn nil
4 "Unclosed ~A elements." gi
)
1575 (let ((new (mapcar (lambda (x)
1576 (declare (ignore x
))
1577 (make-hetag-node :name gi
1582 (setf (pt-children parse-tree
)
1583 (append (pt-children parse-tree
) new
))
1584 (mapc continuation stack new
))) )))
1586 ;;; FORM post mortem heuristic
1587 ;;; ==========================
1589 ;; FORM is another element, which some authors get incredible
1590 ;; wrong. We deal with this in a similar way;
1592 ;; while we parse HTAG are put into the parse tree, we sort it out
1593 ;; after we are done parsing; There are two possibilities:
1595 ;; a. We are able to insert FORM in a sane way.
1596 ;; b. We are not able to do this
1598 ;; How is b. now handled? I see two solutions:
1600 ;; - basically leave HTAGs in the tree.
1601 ;; - permitt multiple FORM elements and connect them via some special
1606 (defun setup-code-vector (input charset
)
1607 (let ((enc (runes-encoding:find-encoding charset
)))
1608 (cond ((not (null enc
))
1609 (setf (runes:xstream-encoding input
) enc
))
1611 (parse-warn input
4 "There is no such encoding: ~S." charset
)))))
1613 (defun sgml-parse-file (filename dtd
)
1614 (with-open-file (input filename
:direction
:input
:element-type
'(unsigned-byte 8))
1615 (sgml-parse dtd
(make-a-stream :cl-stream
(cl-byte-stream->gstream input
)))))
1617 (defun html-parse-file (filename)
1618 (with-open-file (input filename
:direction
:input
:element-type
'(unsigned-byte 8))
1619 (parse-html (cl-byte-stream->gstream input
))))
1622 (defun html-parse-url (url)
1623 (unless (url:url-p url
) (setf url
(url:parse-url url
)))
1624 (netlib:with-open-document
((input mime-type
) url
)
1626 (parse-html input
)))
1629 (defun check-saneness (pt &optional
(dtd closure-html
:*html-dtd
*))
1630 (dolist (k (pt-children pt
))
1631 (unless (member (gi k
) (elm-inclusion dtd
(gi pt
)))
1632 (warn "Unallowed ~A element within ~A." (gi k
) (gi pt
)))
1633 (unless (eq (pt-parent k
) pt
)
1634 (warn "Parent/child linkage broken."))
1635 (cond ((htag-node-p k
)
1636 (warn "HTAG node (~S) left in parse tree. (parent ~S)" (gi k
) (gi pt
))))
1637 (check-saneness k
)))
1639 ;;;(defun post-mortem/fix-font (dtd parse-tree)
1640 ;;; ;; das hatten wir schon, so geht das nicht -- oder nur sehr schwer.
1642 ;;; (let ((font-stack nil)) ;stack of open HTAG FONT elements
1643 ;;; (labels ((walk (pt)
1644 ;;; (cond ((hstag-node-p pt)
1645 ;;; (push pt font-stack)
1647 ;;; ((hetag-node-p pt)
1648 ;;; (pop font-stack)
1651 ;;; (cond ((not (null font-stack))
1652 ;;; ;; some fonts are open
1653 ;;; ;; warp them around this node
1657 ;;; (let ((new-children (mapcan #'walk (pt-children pt))))
1658 ;;; (dolist (k new-children)
1659 ;;; (setf (pt-parent k) pt))
1660 ;;; (setf (pt-children pt) new-children)
1661 ;;; (list pt))) )) )))
1662 ;;; (car (walk parse-tree)) )))
1664 (defun post-mortem/fix-font
(dtd parse-tree
)
1665 (declare (special q
))
1667 ;; erstmal alle font tag paare suchen
1668 (map-htag-pairs (lambda (stag etag
)
1669 (push (cons stag etag
) pairs
))
1671 (setf pairs
(reverse pairs
))
1672 ;; dann alle raus nehmen
1675 (remove-pt (cdr k
)))
1676 ;; dann alle munglen
1677 (dolist (pair pairs
)
1678 (let ((stag (car pair
)) (etag (cdr pair
)) s e
)
1679 (setf s
(pt-path parse-tree stag
))
1680 (assert (eq (pt-access parse-tree s
) stag
))
1682 (setf e
(pt-path parse-tree etag
))
1683 (assert (eq (pt-access parse-tree e
) etag
))
1686 (mungle-font-pair dtd parse-tree stag s e
))
1688 (warn "Hmm..."))))))
1691 ;;; ===========================================================================
1695 ;; We want to match Netscape's behaviour and thus by experimentation,
1698 (defun shortest-path-to (dtd pathen goal
&optional
(max-depth 10))
1699 (cond ((<= max-depth
0)
1701 ((find-if (lambda (p)
1705 (shortest-path-to dtd
1709 (elm-surclusion dtd
(first p
))))
1717 ;; Start Tag anaylis
1719 (defun blah (offending)
1720 (with-open-file (sink "/tmp/t.html"
1722 :if-exists
:new-version
)
1723 (let ((dtd closure-html
:*html-dtd
*))
1724 (let ((p (shortest-path dtd
:BODY offending
)))
1725 (let ((p2 (shortest-path dtd offending
:PCDATA
)))
1726 (format sink
"~A<BR>~%" offending
)
1728 (pop p
) ;forget BODY
1730 (setq p2
(butlast p2
))
1732 (dolist (k (butlast p
)) (format sink
"<~A>~%" k
))
1733 (format sink
"<A href='xxx'>~%")
1734 (format sink
"<img src='/tmp/a.gif'>A")
1735 (format sink
"<~A>~%" (car (last p
)))
1736 (dolist (k p2
) (format sink
"<~A>~%" k
))
1737 (format sink
"<img src='/tmp/b.gif'>B")
1738 (dolist (k (reverse p2
)) (format sink
"</~A>~%" k
))
1739 (format sink
"</~A>~%" (car (last p
)))
1740 (format sink
"<img src='/tmp/c.gif'>C")
1741 (dolist (k (reverse (butlast p
))) (format sink
"</~A>~%" k
))
1744 (defun open-in-netscape (url)
1745 (glisp:run-unix-shell-command
1746 (format nil
"my-netscape -remote 'openURL(~A)'" url
)))
1750 (dolist (off (all-elms closure-html
:*html-dtd
*))
1751 (cond (t '(or (member :B
(elm-inclusion closure-html
:*html-dtd
* off
))
1752 (member :P
(elm-inclusion closure-html
:*html-dtd
* off
)))
1754 (format T
"~&;; ~A" off
)
1755 (open-in-netscape "file:/tmp/t.html")
1757 (let ((nam (format nil
"/tmp/~A.gif" off
)))
1758 (glisp:run-unix-shell-command
(format nil
"(my-xwd -id 0x3c003bf | xwdtopnm | ppmtogif) 2>/dev/null > ~A &" nam
)))
1761 (format T
"~&;; Skipping ~A, because inclusion is ~A."
1762 off
(elm-inclusion closure-html
:*html-dtd
* off
)))))))
1765 (defun equivalence-classes (prediate set
)
1766 (let ((classes nil
))
1768 (do ((cp classes
(cdr cp
)))
1770 (push (list item
) classes
))
1771 (cond ((funcall prediate
(caar cp
) item
)
1772 (push item
(car cp
))
1776 (defun shortest-path (dtd from to
&optional
(max-depth 10))
1777 (reverse (shortest-path/aux dtd
(list (list from
)) to max-depth
)))
1779 (defun shortest-path/aux
(dtd pathen to
&optional
(max-depth 10))
1780 (cond ((<= max-depth
0)
1784 ((find-if (lambda (p) (eq (car p
) to
)) pathen
))
1788 (mapcan (lambda (path)
1789 (mapcar (lambda (child)
1791 (elm-inclusion dtd
(car path
))))
1793 :test
(lambda (x y
) (eql (car x
) (car y
))))
1799 (let ((dtd closure-html
:*html-dtd
*))
1800 (equivalence-classes (lambda (x y
)
1801 (and (set-equal (elm-inclusion dtd x
) (elm-inclusion dtd y
))
1802 (set-equal (elm-surclusion dtd x
) (elm-surclusion dtd y
))))
1806 ;;; ===========================================================================
1809 (cond ((and (vectorp x
) (not (stringp x
))) (rod-string x
))
1811 ((cons (mungle (car x
))
1812 (mungle (cdr x
))))))
1815 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1818 (format T
"~&;; Parse error (line ~D column ~D): [~A] Saw ~A in ~A."
1819 (runes:xstream-line-number input
)
1820 (runes:xstream-column-number input
))