Fix license header to reflect the actual license of two files
[closure-html.git] / src / parse / sgml-parse.lisp
blobfaa9029bea864037fdc0f7bea475796d752558ab
1 ;;; -*- Mode: Lisp; Syntax: Common-Lisp; Package: SGML; Readtable: GLISP; -*-
2 ;;; ---------------------------------------------------------------------------
3 ;;; Title: An SGML Parser
4 ;;; Created: 1996-10-21
5 ;;; Author: Gilbert Baumann <gilbert@base-engineering.com>
6 ;;; License: MIT style (see below)
7 ;;; ---------------------------------------------------------------------------
8 ;;; (c) copyright 1996-1999 by Gilbert Baumann
10 ;;; Permission is hereby granted, free of charge, to any person obtaining
11 ;;; a copy of this software and associated documentation files (the
12 ;;; "Software"), to deal in the Software without restriction, including
13 ;;; without limitation the rights to use, copy, modify, merge, publish,
14 ;;; distribute, sublicense, and/or sell copies of the Software, and to
15 ;;; permit persons to whom the Software is furnished to do so, subject to
16 ;;; the following conditions:
17 ;;;
18 ;;; The above copyright notice and this permission notice shall be
19 ;;; included in all copies or substantial portions of the Software.
20 ;;;
21 ;;; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 ;;; EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 ;;; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24 ;;; IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25 ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26 ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27 ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 ;; Changes
31 ;; When Who What
32 ;; ----------------------------------------------------------------------------
33 ;; 2001-05-17 GB - empty tags are threated as start tags.
34 ;; [we can do better than that].
36 ;; 2001-05-14 GB - finally all mumbling goes with PARSE-WARN.
38 ;; 1999-09-19 GB - a-streams are gone now, we use the xstreams from the
39 ;; XML parser.
41 ;; 1999-08-24 GB = new scheme to handle unsyntactic nesting of FONT tags
42 ;; thru' post mortem analysis; due to that:
43 ;; - *FONT-HEURISTIC-P*: new special variable
44 ;; - HTAG-NODE: new structure type
45 ;; - New stack machine output token type :HTAG
46 ;; - TAG-AS-MARKER-P: new predicate
47 ;; - POST-MORTEM-HEURISTIC: new function
48 ;; - PARSE-HTML: use it
49 ;; - POST-MORTEM/FIX-TOP-LEVEL-STRUCTURE: moved code
50 ;; from PARSE-HTML here
52 ;; - HTML-PARSE-FILE, HTML-PARSE-URL: new functions
54 ;; 1999-08-23 GB - EMPTY-ELEMENT-P: new function
55 ;; - SGML-PARSE, TRANSITION: new output token :open/close
56 ;; to accommodate empty elements (like <IMG>, <BR>)
58 ;; 1999-08-19 GB - nuked FAT-WHITE-SPACE-P
59 ;; - nuked U16->STRING (using ROD-STRING now).
60 ;; - nuked VECTOR/U16
61 ;; - use #/.. read syntax instead of #.(char-code) idiom
62 ;; - use RUNE=, RUNE<= et al
63 ;; - use ROD= et al
64 ;; - twixed SUBSEQ/U16 to SUBSEQ/ROD
65 ;; - prays that everything still works.
67 (in-package :SGML)
69 #+:CMU
70 (shadow "CODE-CHAR")
71 #+:CMU
72 (declaim (inline code-char))
73 #+:CMU
74 (defun code-char (code)
75 (if (< code char-code-limit)
76 (cl:code-char code)
77 nil))
79 ;; This is a high-speed implementation of an SGML parser.
81 ;; NOTES
83 ;; o We do our own buffering here, which allows us to access the buffer
84 ;; directly and thus cutting down the number of function calls per character
85 ;; drastically.
87 ;; o The DTD has to be 'compiled' before we drive a little determistic PDA
88 ;; from it.
90 ;;; TODO
92 ;; o das mit dem <body> = full speed functioniert irgentwie nicht
93 ;; richtig ;-( wenn das BODY tag fehlt. -- es kommt nur etwas spaet
94 ;; wegen token lookahead, was aber kein problem sein sollte.
96 ;; o improve error handling
97 ;; - After we have encountered an error while reading a tag nuke everything
98 ;; until '>'.
99 ;; - Maybe?
100 ;; Forgive something like <foo x="10%> and back up until #/" by patching
101 ;; the read buffer
102 ;; - emit warning if somebody writes e.g. <A href=http://foo/bar/baz>,
103 ;; which is illegal
105 ;; o in SGML parser:
106 ;; - account for the SGML inclusion '+(..)' and exclusion '-(..)' feature
107 ;; - implement the HTML idea of CDATA for SCRIPT and (uhm!) XMP.
108 ;; - thanks to the W3C: '<' and '>' are now part of CSS-2 syntax so we
109 ;; need something similar for STYLE also.
111 ;; o merge as much is possible with the XML parser in a modular fashion
113 ;; o scratch-pad hack needs to be reentrant !!!
114 ;; [make xml-parser's implementation of with-rune-collector safe
115 ;; and use that?]
117 ;; o the FORM hack should be there.
118 ;; (we also need an <INPUT> hack).
120 ;; o finally ensure that the rune abstraction is fine.
122 ;; o report errors using exactly one mechanism and include file name
123 ;; (uri) into error messages; better yet: define a error message
124 ;; structure -- in the future it should be possible to have something
125 ;; like an error message listener.
127 ;; o since we use xstreams now morph the eof representation from NIL
128 ;; to :EOF. Or better yet: give the READ-RUNE macro a standard
129 ;; signature including eof-error-p and eof-value. (Wouldn't hurt,
130 ;; since it is a macro).
132 ;; o In DTD abstraction: ELM-INCLUSION needs flag error-p
134 ;; o we need rune-max and therefore are not dependent on UCS-2
135 ;; anymore, should the need araise simply redefine rune-max.
137 ;;; IDEA
139 ;; Build the parse tree out of lazy lists. Incremental rendering would
140 ;; then become cheap to implement.
142 (defparameter *preserves-comments-elements*
143 '(:STYLE :SCRIPT)
144 "List of names of elements, which should preserve (may contain) comment tokens.")
146 (defparameter *font-heuristic-p* t
147 "Whether to handle FONT by heuristic and post mortem processing.")
149 (defparameter *anchor-heuristic-p* nil
150 "Whether to handle A tags by heuristic and post mortem processing.")
152 (defparameter *gt-ends-comment-p* nil
153 "Whether and '>' ends a comment? -- A warning is emitted still.")
155 (defvar *options/parser-silent-p* nil)
156 (defvar *line-number*)
157 (defparameter *parse-warn-level* 5)
160 ;; HTAG = heuristic tag
161 ;; These are unparsed tags, which are inserted verbatim into the parse tree
162 ;; for post mortem analysis.
163 ;; See explaination in FONT element fixer far below.
165 (defstruct (htag-node (:include pt)) ) ;heuristic tag
166 (defstruct (hstag-node (:include htag-node)) ) ;heuristic start tag
167 (defstruct (hetag-node (:include htag-node)) ) ;heuristic end tag
169 ;; The predicate TAG-AS-MARKER-P decides, whether a given tag is
170 ;; inserted as HSTAG or HETAG.
172 ;;;; --------------------------------------------------------------------------
173 ;;;; Predicates and Utilities on Runes
175 ;; Most of these should really go into util.lisp
177 (definline name-start-rune-p (char)
178 (or (rune<= #/a char #/z)
179 (rune<= #/A char #/Z)))
181 (definline name-rune-p (char)
182 (or (name-start-rune-p char)
183 (digit-rune-p char)
184 (rune= char #/.)
185 (rune= char #/-)))
187 (definline sloopy-name-rune-p (char)
188 (or (name-rune-p char)
189 (rune= char #/%)
190 (rune= char #//) ;manche schreiben ganze urls ohne Gaensefuesschen
191 (rune= char #/:)
192 (rune= char #/~)
193 (rune= char #/#) ;farben werden auch gerne genommen
194 (rune= char #/_)
195 (rune= char #/+)
196 (rune= char #/?)
197 (rune= char #/!)
198 (rune= char #/@)
199 ;; ganzer ECMA-Script Code kommt auch vor
200 (rune= char #/\()
201 (rune= char #/\))
202 (rune= char #/')
203 (rune= char #/\")
204 (rune= char #/\;)
205 (rune= char #/,)
206 (rune= char #/[)
207 (rune= char #/])
208 (rune= char #/&) ))
210 (definline sloopy-value-rune-p (char)
211 (or (sloopy-name-rune-p char)
212 (rune= char #/=)))
214 (definline alpha-rune-p (char)
215 (or (rune<= #/a char #/z)
216 (rune<= #/A char #/Z)))
218 (definline upcase-name-rune (rune)
219 (rune-upcase rune))
221 ;;;; --------------------------------------------------------------------------
222 ;;;; Rod Utilities
224 (definline subseq/rod (source start end)
225 ;; Optimized version of subseq for arrays of runes
226 (declare (type rod source)
227 (type fixnum start end))
228 (let ((res (make-rod (- end start))))
229 (declare (type rod res))
230 (do ((i (- (- end start) 1) (the fixnum (- i 1))))
231 ((< i 0) res)
232 (declare (type fixnum i))
233 (setf (%rune res i) (%rune source (the fixnum (+ i start)))))))
235 ;;; ---------------------------------------------------------------------------
236 ;;; Buffered I/O
238 (eval-when (compile load eval)
239 (defparameter *buf-size* 4096))
241 (defmacro a-read-byte (input)
242 (let ((c (gensym)))
243 `(let ((,c (runes:read-rune ,input)))
244 (if (eq ,c :eof) nil ,c))))
246 (defmacro a-peek-byte (input)
247 (let ((c (gensym)))
248 `(let ((,c (runes:peek-rune ,input)))
249 (if (eq ,c :eof) nil ,c))))
251 (defmacro a-unread-byte (byte input)
252 `(runes:unread-rune ,byte ,input))
254 (defmacro a-stream-position (input)
255 `(runes:xstream-position ,input))
257 (defun make-a-stream (&key cl-stream)
258 (runes:make-xstream cl-stream :initial-speed 1 :speed 8192))
260 (defmethod runes::read-octets (sequence (stream html-glisp:gstream) start end)
261 (html-glisp:g/read-byte-sequence sequence stream :start start :end end))
263 (defmethod runes::xstream/close ((stream html-glisp:gstream))
264 (html-glisp:g/close stream))
266 ;; a fake definition -- XXX non-reentrant!
268 (defun a-stream-scratch (input)
269 (getf (runes::xstream-plist input) 'scratch-pad))
271 (defun (setf a-stream-scratch) (new-value input)
272 (setf (getf (runes::xstream-plist input) 'scratch-pad) new-value))
274 ;;;; -------------------------------------------------------------------------
275 ;;;; Reporting Errors
276 ;;;;
278 (defun read-tag-error (input format-string &rest format-args)
279 (apply #'parse-warn input 4 format-string format-args)
280 (throw 'read-tag-error
281 (values :pcdata
282 (string-rod "##BAD TAG##"))))
284 ;;; Warning Levels
285 ;;; --------------
286 ;;; 1 - Absolutely normal mumbleing
287 ;;; 2 - Laziness introduced by the HTML standard
288 ;;; 3 - Semantic error upon parsing attributes
289 ;;; 4 - Accidents happening while parsing the structure
290 ;;; 5 - More serve errors
293 (defun parse-warn (input level fmt &rest args)
294 (let ((*print-pretty* nil)) ;disable ugly^H^H^H^Hpretty printing
295 (when (>= level *parse-warn-level*)
296 (unless *options/parser-silent-p*
297 (let ((preample (format nil ";; Parser warning: ~11@<Line ~D,~> ~11@<column ~D~>: ~5A "
298 (and input (ignore-errors (runes:xstream-line-number input)))
299 (and input (ignore-errors (runes:xstream-column-number input)))
300 (make-string level :initial-element #\*))))
301 (fresh-line *trace-output*)
302 (write-string preample *trace-output*)
303 (with-input-from-string (i (apply #'format nil fmt args))
304 (do ((x (read-line i nil nil) (read-line i nil nil))
305 (firstp t nil))
306 ((null x))
307 (unless firstp
308 (write-string ";; " *trace-output*)
309 (dotimes (i (- (length preample) 3))
310 (write-char #\space *trace-output*)))
311 (write-string x *trace-output*)
312 (terpri *trace-output*))))))))
314 ;;; ---------------------------------------------------------------------------
315 ;;; SGML lexer
317 (defun read-token (input dtd)
318 ;; Reads on token from the stream `input'
319 ;; Returns:
320 ;; :pcdata cdata
321 ;; :start-tag name atts
322 ;; :end-tag name
323 ;; :empty-tag name atts
324 ;; :comment cdata
325 ;; :define-tag cdata
326 ;; :experimental-tag cdata
327 ;; :eof
328 (let ((ch (a-read-byte input)))
329 (cond ((null ch)
330 :eof)
331 ((rune= ch #/<)
332 (read-tag input dtd))
334 (a-unread-byte ch input)
335 (read-pcdata input dtd)) )))
337 (defun push-on-scratch (input sp ch)
338 ;; Push the character ch onto the scratch pad of `input' and enlarge if neccessary
339 (setf (aref (a-stream-scratch input) sp) ch)
340 (setf sp (+ sp 1))
341 (cond ((= sp (length (a-stream-scratch input))) ;end of scratch pad reached?
342 (enlarge-scratch-pad input)))
345 (defun read-pcdata (input dtd)
346 (let* ((scratch (a-stream-scratch input)) ;scratch pad
347 (sp 0) ;pointer into scratch pad
348 (se (length scratch)) ;end of scratch pad
349 ) ;code vector
350 (declare (type (simple-array rune (*)) scratch))
351 (declare (type fixnum sp se))
352 (loop
353 (let ((ch (a-read-byte input)))
354 ;; FIXME: why was this declared as (u-b 8), not (u-b 16)?
355 ;; a-read-byte returns a rune.
356 ;;; (declare (type (or null (unsigned-byte 8)) ch))
357 (declare (type (or null rune) ch))
358 (cond ((null ch) ;eof
359 (return))
360 ((rune= ch #/<) ;end of pcdata
361 (a-unread-byte ch input)
362 (return))
363 ((rune= ch #/&)
364 (setf sp (read-entity-ref input dtd sp))
365 (if (>= sp se) ;read-entity-ref may enlarge scratchpad
366 (setf scratch (a-stream-scratch input)
367 se (length scratch))))
369 (setf (aref scratch sp) ch) ;recode character read
370 (setf sp (the fixnum (+ sp 1)))
371 (cond ((= sp se) ;end of scratch pad reached?
372 (enlarge-scratch-pad input)
373 (setf scratch (a-stream-scratch input)
374 se (length scratch))))))))
375 (values :pcdata
376 (subseq/rod scratch 0 sp)) ))
378 (defun read-entity-ref (input dtd sp)
379 ;; Reads an entity reference into the stream's scratch pad from position
380 ;; `sp' upwards.
381 ;; Returns the new write pointer. The initial "&" is already read from the
382 ;; stream.
383 ;; Syntax:
384 ;; entity-ref ::= "&" "#" <digit>+ (";")?
385 ;; entity-ref ::= "&" "#" "x" <hex-digit>+ (";")?
386 ;; entity-ref ::= "&" <name-start> <name-char>* (";")?
387 (let ((ch (a-read-byte input)))
388 (cond ((null ch) ;eof
389 (parse-warn input 3 "EOF in entity")
390 (push-on-scratch input sp #/&))
391 ((rune= ch #/#) ;numeric reference?
392 (read-numeric-entity input sp))
393 ((name-start-rune-p ch) ;named entity?
394 (read-named-entity input dtd sp ch))
396 (parse-warn input 3
397 "Saw character '~A' after '&' -- bad entity reference?!"
398 (or (rune-char ch)
399 (format nil "&#x~4,'0X" ch)))
400 (a-unread-byte ch input) ;it might be something interesting
401 (push-on-scratch input sp #/&)) )))
403 (defun read-numeric-entity (input sp)
404 ;; "&#" already read
405 (let ((ch (a-read-byte input)))
406 (setf sp (push-on-scratch input sp #/&))
407 (setf sp (push-on-scratch input sp #/#))
408 (cond ((null ch) ;eof
409 (parse-warn input 3 "EOF in entity")
412 ((digit-rune-p ch)
413 (read-numeric-entity-aux input (- sp 2) sp 10 ch))
415 ((rune= ch #/x)
416 ;; Hex entity
417 (setf sp (push-on-scratch input sp #/x))
418 (setf ch (a-read-byte input))
419 (cond ((null ch)
420 (parse-warn input 3 "EOF after '&#x'.")
422 ((not (digit-rune-p ch 16))
423 (parse-warn input 3 "Bad character after '&#x'.")
424 (a-unread-byte ch input)
427 (read-numeric-entity-aux input (- sp 3) sp 16 ch))))
429 (a-unread-byte ch input)
430 (parse-warn input 3 "Bad character after '&#'")
431 sp) )))
433 (defun read-numeric-entity-aux (input s0 sp radix ch)
434 ;; Aux routine for read-numeric-entity
435 ;; at s0..sp in the scratch pad is the already read prefix ('&#' or '&#x')
436 ;; Radix is the radix to use (10 or 16)
437 ;; returns new 'sp'
438 ;; 'ch' is the first digit
439 (let ((s1 sp))
440 (setf sp (push-on-scratch input sp ch))
441 (do ((ch (a-read-byte input) (a-read-byte input)))
442 ((or (null ch) (not (digit-rune-p ch radix)))
443 ;; Ok. [s1..sp) now is the digit sequence
444 (let ((num (parse-integer (map 'string #'rune-char
445 (subseq (a-stream-scratch input) s1 sp))
446 :radix radix)))
447 (cond ((<= 0 num #xFFFF)
448 ;; Proper entity value
449 (when (and (not (null ch)) (not (rune= ch #/\;)))
450 (a-unread-byte ch input))
451 ;; Rewind scratch pad to `s0' and push character `num'
452 (setf sp (push-on-scratch input s0 (code-rune num))))
454 ;; num too large; emit warning and leave scratch pad alone
455 (when (not (null ch))
456 (a-unread-byte ch input))
457 (parse-warn input 3 "Numeric enity ~A does not fit into our 16-bit strings; -- ignored."
458 (rod-string (rod-subseq (a-stream-scratch input) s0 sp)))))))
459 (setf sp (push-on-scratch input sp ch)))
460 sp))
462 (defun read-named-entity (input dtd sp ch)
463 ;; Just in case we want to leave the entity alone
464 (let ((s0 sp))
465 (setf sp (push-on-scratch input sp #/&))
466 (let ((s1 sp))
467 (setf sp (push-on-scratch input sp ch))
468 (do ((ch (a-read-byte input) (a-read-byte input)))
469 ((or (null ch) (not (name-rune-p ch)))
470 ;; Ok. [s1..sp) now is the name, try to resolve it
471 (let ((nums (find-named-entity dtd (subseq (a-stream-scratch input) s1 sp))))
472 (cond ((not (null nums))
473 ;; Proper entity value
474 ;; Rewind scratch pad to `s0' and push characters in `nums'
475 (dotimes (i (length nums))
476 (setf sp (push-on-scratch input s0 (aref nums i))))
477 (when (and (not (null ch)) (not (rune= ch #/\;)))
478 (a-unread-byte ch input)))
480 (when (not (null ch))
481 (a-unread-byte ch input))
482 (parse-warn input 3 "[~D] There is no such entity defined: ~A -- ignored."
483 (a-stream-position input)
484 (rod-string (rod-subseq (a-stream-scratch input) s0 sp)))))) )
485 (setf sp (push-on-scratch input sp ch))))
486 sp))
488 (defun find-named-entity (dtd fat-string)
489 (let ((str (rod-string fat-string)))
490 (let ((r (cdr (assoc str (sgml::dtd-entities dtd) :test #'string=))))
491 r)))
493 (defun enlarge-scratch-pad (input)
494 (let* ((old (a-stream-scratch input))
495 (se (length old)))
496 (declare (type fixnum se)
497 (type (simple-array rune (*)) old))
498 (let ((new (make-rod (+ (length (a-stream-scratch input)) *buf-size*))))
499 (declare (type rod new))
500 (do ((i (- se 1) (the fixnum (- i 1))))
501 ((< i 0))
502 (declare (type fixnum i))
503 (setf (aref new i) (aref old i)))
504 (setf (a-stream-scratch input) new))))
506 ;;; ------------------------------------------------------------
509 ;; tag ::= <start-tag> | <end-tag> | <exp-tag> | <comment>
510 ;; end-tag ::= "<" "/" <name> WSP ">"
511 ;; empty-tag ::= "<" <name> <atts> WSP "/" ">"
512 ;; start-tag ::= "<" <name> <atts> WSP ">"
513 ;; exp-tag ::= "<" "?" <any>* ">"
514 ;; comment ::= "<" "!" "-" "-" (<any>* - ("-" "-")) "-" "-" ">"
515 ;; | "<" "!" ">"
516 ;; att ::= <value>
517 ;; | <name> WSP "=" WSP <value>
518 ;; value ::= <literal> | <name>
519 ;; literal ::= """ <char>* """
520 ;; | "'" <char>* "'"
521 ;; atts ::= ( WSP <att> )*
522 ;; WSP ::= <white-space>*
523 ;; name ::= <name-start-char> <name-char>*
524 ;; char ::= <any> | <enitity-ref>
526 (defun read-tag (input dtd)
527 ;; The "<" is already read.
528 (catch 'read-tag-error
529 (let ((ch (a-peek-byte input)))
530 (cond ((rune= ch #//)
531 (a-read-byte input)
532 (read-end-tag input))
533 ((rune= ch #/!)
534 (a-read-byte input)
535 (read-define-tag input dtd))
536 ((rune= ch #/?)
537 (a-read-byte input)
538 (read-experimental-tag input))
539 ((and (not (null ch)) (name-start-rune-p ch))
540 (read-start-tag input dtd))
542 (parse-warn input 3 "Bad character after '<': '~A' -- ignored."
543 (rune-char ch))
544 (let ((res (string-rod "<")))
545 (values :pcdata res))) ))) )
547 (defun read-start-tag (input dtd)
548 (multiple-value-bind (name atts) (read-name-and-attributes input dtd)
549 (let ((ch (a-read-byte input)))
550 (cond ((null ch)
551 (read-tag-error input "EOF inside tag"))
552 ((rune= ch #/>)
553 (values :start-tag name atts))
554 ((rune= ch #/<)
555 (parse-warn input 3
556 "A '<' ended this tag.")
557 (a-unread-byte ch input)
558 (values :start-tag name atts))
559 ((rune= ch #//)
560 (setf ch (a-read-byte input))
561 (cond ((rune= ch #/>)
562 (values :empty-tag name atts))
564 (read-tag-error input "Expected '>' after '<' .. '/'"))))
566 (read-tag-error input "Expected '>'")) ))))
568 (defun read-end-tag (input)
569 (let ((name (read-name input)))
570 (skip-white-space input)
571 (let ((ch (a-read-byte input)))
572 (cond ((null ch)
573 (read-tag-error input "In end tag: Expected '>' got end-of-file instead."))
574 ((rune= ch #/>)
575 (values :end-tag name))
577 (read-tag-error input "In end tag: Expected '>'")) ))))
579 (defun read-name-and-attributes (input dtd)
580 (let ((name (read-name input))
581 (atts nil))
582 (loop
583 (skip-white-space input)
584 (cond ((member (a-peek-byte input) '(#/< #/> #//) :test #'eql)
585 (return)))
586 (push (read-attribute input dtd) atts))
587 (values name (nreverse atts)) ))
589 (defun read-name (input)
590 (let ((ch (a-peek-byte input))
591 (sp 0))
592 (cond ((and (not (null ch)) (name-start-rune-p ch))
593 (do ((ch (a-read-byte input) (a-read-byte input)))
594 ((not (and ch (name-rune-p ch)))
595 (when ch
596 (a-unread-byte ch input))
597 (subseq/rod (a-stream-scratch input) 0 sp))
598 (setf sp (push-on-scratch input sp (upcase-name-rune ch)))))
600 (read-tag-error input "Not a name")) )))
602 (defun skip-white-space (input)
603 (do ((ch (a-read-byte input) (a-read-byte input)))
604 ((not (and ch (white-space-rune-p ch)))
605 (when ch (a-unread-byte ch input)))))
607 (defun read-attribute (input dtd)
608 (skip-white-space input)
609 (let ((slot (read-sloopy-name input)))
610 ;;(print (list 'slot '= (mungle slot) (mungle (vector (a-peek-byte input)))))
611 (skip-white-space input)
612 (let ((c (a-peek-byte input)))
613 (cond ((and (not (null c)) (rune= c #/=))
614 (a-read-byte input)
615 (skip-white-space input)
616 (let ((value (read-value input dtd)))
617 (cons slot value)))
619 slot)))))
621 (defun read-value (input dtd)
622 (let ((ch (a-peek-byte input)))
623 (cond ((rune= ch #/')
624 (a-read-byte input)
625 (read-literal input dtd ch))
626 ((rune= ch #/\")
627 (a-read-byte input)
628 (read-literal input dtd ch))
629 ((and ch (sloopy-name-rune-p ch))
630 (read-sloopy-value input))
632 (read-tag-error input "Bad value '~A' seen"
633 (or (rune-char ch)
634 (format nil "U+~4,'0X" (rune-code ch))))))))
636 (defun read-literal (input dtd delim)
637 (let* ((scratch (a-stream-scratch input)) ;scratch pad
638 (sp 0) ;pointer into scratch pad
639 (se (length scratch)) ;end of scratch pad
640 ) ;code vector
641 (declare (type rod scratch))
642 (declare (type fixnum sp se))
643 (loop
644 (let ((ch (a-read-byte input)))
645 ;; FIXME: why was this declared as (u-b 8), not (u-b 16)?
646 ;; a-read-byte returns a rune.
647 ;;; (declare (type (or null (unsigned-byte 8)) ch))
648 (declare (type (or null rune) ch))
649 (cond ((null ch) ;eof
650 (read-tag-error input "Eof in literal"))
651 ((rune= ch delim)
652 (return))
653 ((rune= ch #/&)
654 (setf sp (read-entity-ref input dtd sp)))
656 (setf (aref scratch sp) ch) ;recode character read
657 (setf sp (the fixnum (+ sp 1)))
658 (cond ((= sp se) ;end of scratch pad reached?
659 (enlarge-scratch-pad input)
660 (setf scratch (a-stream-scratch input)
661 se (length scratch))))))))
662 (subseq/rod scratch 0 sp) ))
664 (defun read-sloopy-name (input)
665 (let ((ch (a-peek-byte input))
666 (sp 0))
667 (cond ((and (not (null ch)) (sloopy-name-rune-p ch))
668 (do ((ch (a-read-byte input) (a-read-byte input)))
669 ((not (and ch (sloopy-name-rune-p ch)))
670 (when ch
671 (a-unread-byte ch input))
672 (subseq/rod (a-stream-scratch input) 0 sp))
673 (setf sp (push-on-scratch input sp ch))))
675 (read-tag-error input
676 "Expected sloopy name, got ~A"
677 (or (rune-char ch) (format nil "U+~4,'0X" ch)) )) )))
679 (defun read-sloopy-value (input)
680 (let ((ch (a-peek-byte input))
681 (sp 0))
682 (cond ((and (not (null ch)) (sloopy-value-rune-p ch))
683 (do ((ch (a-read-byte input) (a-read-byte input)))
684 ((not (and ch (sloopy-value-rune-p ch)))
685 (when ch
686 (a-unread-byte ch input))
687 (subseq/rod (a-stream-scratch input) 0 sp))
688 (setf sp (push-on-scratch input sp ch))))
690 (read-tag-error input "Expected sloopy value, got ~A"
691 (or (rune-char ch) (format nil "U+~4,'0X" ch)) )) )))
693 (defun read-define-tag (input dtd)
694 (let ((ch (a-peek-byte input)))
695 (cond ((null ch)
696 (read-tag-error input "unexpected EOF"))
697 ((rune= ch #/>)
698 ;; empty define tag -- to be ignored
699 (a-read-byte input)
700 (read-token input dtd))
701 ((rune= ch #/-)
702 ;; comment?
703 (a-read-byte input)
704 (let ((ch (a-peek-byte input)))
705 (cond ((and (not (null ch)) (rune= ch #/-))
706 (read-comment input))
708 (read-tag-error input "Expected '-' after \"<!-\"")))))
710 (read-define-tag-2 input)) )))
712 (defun read-define-tag-2 (input)
713 ;; TODO: Comments
714 ;; we simply slurp until '>'
715 (let ((sp 0))
716 (do ((ch (a-read-byte input) (a-read-byte input)))
717 ((and ch (rune= ch #/>))
718 (values :define-tag (subseq/rod (a-stream-scratch input) 0 sp)))
719 (setf sp (push-on-scratch input sp ch))) ))
721 (defun read-comment (input)
722 (a-read-byte input) ;consume the '-'
723 (let ((c0 0)
724 (c1 (or (a-read-byte input) (read-tag-error input "Unexpected EOF")))
725 (c2 (or (a-read-byte input) (read-tag-error input "Unexpected EOF")))
726 (sp 0)
727 (warned-p nil))
728 (loop
729 (psetq c0 c1
730 c1 c2
731 c2 (a-read-byte input))
732 (cond ((null c2)
733 (read-tag-error input "EOF within comment."))
734 ((and (rune= c0 #/-)
735 (rune= c1 #/-)
736 (rune= c2 #/>))
737 (return))
738 ((and *gt-ends-comment-p*
739 (rune= c2 #/>))
740 (parse-warn input 3 "A '>' ends this comment.")
741 (return)))
742 (cond ((and (rune= c0 #/-) (rune= c1 #/-))
743 (unless warned-p
744 (parse-warn input 4 "\"--\" seen within comment; This is strongly depreciated.")
745 (setf warned-p t))))
746 (setf sp (push-on-scratch input sp c0)))
747 (values :comment (subseq/rod (a-stream-scratch input) 0 sp)) ))
749 ;;;; ------------------------------------------------------------------------------------------
751 (defun name-start-char-p (ch)
752 (alpha-char-p ch))
754 (defun name-char-p (ch)
755 (or (alphanumericp ch) (char= ch #\.) (char= ch #\-)) )
757 (defun valid-name-string-p (string)
758 "Is the string `string' a valid name string according to the SGML
759 conventions?"
760 (and (> (length string) 0)
761 (name-start-char-p (char string 0))
762 (every #'name-char-p string)) )
764 ;;;; ------------------------------------------------------------------------------------------
765 ;;;; Resolving Entities
766 ;;;;
768 ;;;; TODO: Check that numeric entities are within 0..#xFFFF;
770 (defun resolve-numeric-entity (string start end) ; --> string ; new start
771 (let ((j (or (position-if-not #'digit-rune-p string :start start :end end) end)))
772 (values
773 (let ((n (parse-integer (rod-string (rod-subseq string start j)) :radix 10)))
774 (rod n))
775 (if (and (< j end) (rune= (rune string j) #/\;))
776 (+ j 1)
777 j))))
779 (defun resolve-hex-entity (string start end) ; --> string ; new start
780 ;; Resolves a hexadecimal entity like "&#x2A;", start should point
781 ;; to the character directy after the '&#x'.
782 (let ((j (or (position-if-not (rcurry #'digit-rune-p 16) string :start start :end end) end)))
783 (values
784 (let ((n (parse-integer (rod-string (rod-subseq string start j)) :radix 16)))
785 (rod n))
786 (if (and (< j end) (rune= (rune string j) #/\;))
787 (+ j 1)
788 j))))
790 (defun resolve-named-entity (string entities start end &optional input)
791 ;; --> string ; new start
792 (let ((j (or (position-if-not #'name-start-rune-p string :start start :end end) end)))
793 (let ((res
794 (or (dolist (k entities)
795 (when (and (= (length (car k)) (- j start))
796 ;; XXX this compare conses far too much!
797 (rod= (string-rod (subseq (car k) 0 (- j start)))
798 (rod-subseq string start j)))
799 (return (string-rod (cdr k))) ) )
800 (dolist (k entities nil)
801 (when (and (>= (length (car k)) (- j start))
802 ;; XXX dito
803 (rod= (string-rod (subseq (car k) 0 (- j start)))
804 (rod-subseq string start j)))
805 (return (string-rod (cdr k))) )))))
806 (cond ((not (null res))
807 (values
808 (resolve-entities-in-string res entities 0 (length res) input) ;right?
809 (if (and (< j end) (rune= (rune string j) #/\;))
810 (+ j 1)
811 j)))
813 (parse-warn input 3 "Entity &~a; is not defined." (subseq string start j))
814 (values (subseq string (1- start) j) j))))))
816 (defun resolve-entities-in-string (string entities
817 &optional (start 0) (end (length string)) input)
818 ;; Resolve all entity references introduced by "&" in the string
819 ;; `string'. `start' and `end' specify a substring to operate on.
820 ;; For error messages `input' may be the input stream the data is
821 ;; coming from originally.
822 (let ((i (position #/& string :start start :end end :test #'rune=)))
823 (cond ((null i)
824 ;; no further entities in string -- all done
825 (rod-subseq string start end))
826 ((and (< (+ i 2) end)
827 (rune= #/# (rune string (+ i 1)))
828 (digit-rune-p (rune string (+ i 2))))
829 ;; numeric entity seen
830 (multiple-value-bind (res j) (resolve-numeric-entity string (+ i 2) end)
831 (concatenate 'rod (subseq string start i)
833 (resolve-entities-in-string string entities j end input))))
834 ((and (< (+ i 2) end)
835 (rune= #/# (rune string (+ i 1)))
836 (rune= #/x (rune string (+ i 2))))
837 ;; hexadecimal entity seen
838 (multiple-value-bind (res j) (resolve-hex-entity string (+ i 3) end)
839 (concatenate 'rod (subseq string start i)
841 (resolve-entities-in-string string entities j end input))))
842 ((and (< (+ i 1) end)
843 (alpha-rune-p (aref string (+ i 1))))
844 ;; this must be a named entity
845 (multiple-value-bind (res j)
846 (resolve-named-entity string entities (+ i 1) end input)
847 (concatenate 'rod (subseq string start i)
849 (resolve-entities-in-string string entities j end input))))
851 ;; no entity reference.
852 (concatenate 'rod (subseq string start i)
853 (rod #/&)
854 (resolve-entities-in-string string entities (+ i 1) end input))))))
857 ;;;; ------------------------------------------------------------------------------------------
858 ;;;; Mungling of Attribute values
859 ;;;;
861 (let ((kw-pkg (find-package :keyword)))
862 (defun kintern (x)
863 (intern x kw-pkg)))
865 (defun canon-value (input dtd tag slot value)
866 (let* ((attlist (find-element-attlist dtd tag))
867 (looked (assoc slot attlist)))
868 (cond ((and looked (listp (cadr looked)))
869 (or (find value (cadr looked)
870 :test #'(lambda (x y)
871 (string-equal (string x) (string y))))
872 (progn
873 ;; Oh yeah! monster format strings are fun!
874 (parse-warn input 3
875 "~S is a bad value for the '~A' slot of '<~A>', which could ~
876 ~{~#[not take any value~;only take '~A'~:;take one of ~@{'~A'~#[~; or ~:;, ~]~}~]~:}."
877 value slot tag (cadr looked)))))
878 ((member (cadr looked) '(:number))
879 (or (maybe-parse-integer value)
880 (progn
881 (parse-warn input 3 "~S is not NUMBER (attribute '~A' of '<~A>')."
882 value slot tag)
883 nil)))
884 ((member (cadr looked) '(:name :id))
885 (cond ((valid-name-string-p value)
886 (kintern (string-upcase value)))
888 (parse-warn input 3 "~S is not NAME (attribute '~A' of '<~A>')."
889 value slot tag)
890 nil)))
891 (looked
892 value)
894 (parse-warn input 3 "The '<~A>' element has no '~A' slot." tag slot)
895 nil) )))
897 (defun find-slot-value-pair (input dtd tag value)
898 (let* ((attlist (find-element-attlist dtd tag))
899 (looked nil))
900 (dolist (att attlist)
901 (cond ((and (listp (cadr att))
902 (setq looked (find value (cadr att)
903 :test #'(lambda (x y)
904 (string-equal (string x) (string y))))))
905 (return-from find-slot-value-pair (values (car att) looked)))))
906 ;;fall thru'
907 (parse-warn input 3
908 "The '<~A>' tag has no slot which could take the '~A' keyword.~%~
909 ~1{~#[There are no possible slots at all.~;~
910 Only possible slot is:~:;~
911 Possible slots are:~]~
912 ~@{~& ~1{Slot '~A'~20T could ~{~#[not take any value~;~
913 only take '~A'~:;~
914 take one of ~@{'~A'~#[~; or ~:;, ~]~}~].~:}~}~}~:}"
915 tag value (remove-if-not #'(lambda (x) (consp (cadr x))) attlist)) ))
918 ;;;; ------------------------------------------------------------
920 (defun read-token* (input dtd)
921 ;;(skip-white-space input)
922 (multiple-value-bind (kind a b) (read-token input dtd)
923 (ecase kind
924 (:pcdata (make-start-tag :name :pcdata :atts a))
925 (:start-tag
926 (let ((name (kintern (rod-string a))))
927 (if (tag-exists? dtd name)
928 (make-start-tag :name name :atts (mungle-attlist dtd name b))
929 (progn
930 (parse-warn input 4 "There is no such thing as <~A> -- ignored." name)
931 (read-token* input dtd)))))
932 (:end-tag
933 (let ((name (kintern (rod-string a))))
934 (if (tag-exists? dtd name)
935 (make-end-tag :name name)
936 (progn
937 (parse-warn input 4 "There is no such thing as </~A> -- ignored." name)
938 (read-token* input dtd)))) )
939 (:empty-tag
940 (parse-warn input 2 "Oops -- there is an empty tag; but this is only HTML?!")
941 (let ((name (kintern (rod-string a))))
942 (if (tag-exists? dtd name)
943 (make-start-tag :name name :atts (mungle-attlist dtd name b))
944 (progn
945 (parse-warn input 4 "There is no such thing as <~A> -- ignored." name)
946 (read-token* input dtd))))
947 ;;(read-token* input dtd)
949 (:define-tag (read-token* input dtd))
950 (:experimental-tag
951 (parse-warn input 2 "Ignoreing processing instruction tag: '~A'" (mungle a))
952 (read-token* input dtd))
953 (:comment
954 (make-comment-token :data a))
955 (:eof
956 (make-end-tag :name :%top)) )))
958 (defun tag-exists? (dtd name)
959 (and
960 ;;(not (eq name :font)) ;xxx
961 ;;(not (eq name :center)) ;xxx
962 ;;(not (eq name :div)) ;xxx
963 ;;(not (eq name :img))
964 ;;(not (eq name :form))
965 (not (eq name :noscript)) ;needed for www.sgi.com
966 (sgml::find-element dtd name nil nil)))
968 (defun foofoo (r)
969 (cond ((integerp r) (string-rod (prin1-to-string r)))
970 ((symbolp r) (string-rod (princ-to-string r)))
971 ((stringp r) (string-rod r))
973 (error "foofoo: Hmm ~S ?!" r))))
975 ;;; The renderer might depend on upper-case attribute values, so let's leave
976 ;;; this off by default. For the benefit of html <-> xml conversions we
977 ;;; don't want to check the DTD every time we convert an attribute though,
978 ;;; so we need this mode for lower-case attribute values.
979 (defvar *unmungle-attribute-case* nil)
981 (defun mungle-attlist (dtd tag atts)
982 (mapcan (lambda (x)
983 (cond ((atom x)
984 ;; this clause isn't unicode-safe
985 (multiple-value-bind (slot value)
986 (sgml::find-slot-value-pair nil dtd tag (mungle x))
987 (when value
988 (setf value (foofoo value))
989 (when *unmungle-attribute-case*
990 (setf value (rod-downcase value))))
991 (and slot
992 (list slot value))))
994 (let ((slot (kintern (string-upcase (mungle (car x))))))
995 (list slot (cdr x))))))
996 atts))
998 (defun read-experimental-tag (input)
999 ;; TODO: Comments
1000 ;; we simply slurp until '>'
1001 (let ((sp 0))
1002 (do ((ch (a-read-byte input) (a-read-byte input)))
1003 ((and ch (rune= ch #/>))
1004 (values :experimental-tag (subseq/rod (a-stream-scratch input) 0 sp)))
1005 (setf sp (push-on-scratch input sp ch))) ))
1008 ;;; ---------------------------------------------------------------------------
1009 ;;; The PDA
1012 (defun handle-meta-tag-in-parser (input attrs)
1013 (when (rod-equal (string-rod "content-type") (getf attrs :http-equiv))
1014 (let ((content-type (getf attrs :content)))
1015 (and content-type
1016 (multiple-value-bind (type subtype parameters)
1017 (closure-mime-types:parse-mime-content-type
1018 (rod-string content-type))
1019 (declare (ignore type subtype))
1020 (let ((cs (assoc :charset parameters :test #'string-equal)))
1021 (when cs
1022 (setup-code-vector input (cdr cs)))))))))
1024 (defun sgml-parse (dtd input)
1025 (let* ((stack (list (make-start-tag :name :%top :atts nil)))
1026 (s (sgml::make-pt/low :name 'top))
1027 (r s)
1028 (eof? nil)
1029 (eingabe nil)
1030 ausgabe)
1031 (loop
1032 (do ()
1033 ((or eof? (not (null (cdr eingabe)))))
1034 (let ((tok (read-token* input dtd)))
1035 (when (and (end-tag-p tok) (eq (tag-name tok) :%top))
1036 (setf eof? t))
1037 (setf eingabe (nconc eingabe (list tok)))))
1038 (multiple-value-setq (stack eingabe ausgabe) (transition input dtd stack eingabe))
1039 (cond ((eq ausgabe :accept)
1040 (return))
1042 ((eq ausgabe :error)
1043 (return))
1045 ((eq ausgabe :close)
1046 (setf s (sgml:pt-parent s)))
1048 ((eq (car ausgabe) :comment)
1049 (setf (sgml:pt-children s)
1050 (nconc (sgml:pt-children s) (list (sgml::make-pt/low
1051 :name (cadr ausgabe)
1052 :attrs (caddr ausgabe)
1053 :children nil
1054 :parent s)))))
1056 ((eq (car ausgabe) :open)
1057 ;; Hack here to support <meta http-equiv="Content-Type" ...>
1058 (cond ((and (eq (cadr ausgabe) :meta))
1059 (handle-meta-tag-in-parser input (caddr ausgabe))))
1060 ;; when the BODY tag is openend, switch the streams speed to full speed.
1061 (cond ((and (eq (cadr ausgabe) :body))
1062 (setf (runes::xstream-speed input)
1063 (length (runes::xstream-os-buffer input)))))
1064 (let ((n (sgml::make-pt/low
1065 :name (cadr ausgabe)
1066 :attrs (caddr ausgabe)
1067 :children nil
1068 :parent s)))
1069 (setf (sgml:pt-children s) (nconc (sgml:pt-children s) (list n))
1070 s n) ))
1072 ((eq (car ausgabe) :open/close)
1073 ;; code duplication alert!
1074 ;; Hack here to support <meta http-equiv="Content-Type" ...>
1075 (cond ((and (eq (cadr ausgabe) :meta))
1076 (handle-meta-tag-in-parser input (caddr ausgabe))))
1077 (let ((n (sgml::make-pt/low
1078 :name (cadr ausgabe)
1079 :attrs (caddr ausgabe)
1080 :children nil
1081 :parent s)))
1082 (setf (sgml:pt-children s) (nconc (sgml:pt-children s) (list n))
1083 s n) )
1084 (setf s (sgml:pt-parent s)))
1086 ((eq (car ausgabe) :htag)
1087 ;; code duplication alert!
1088 (let* ((v (cadr ausgabe))
1089 (n (cond ((start-tag-p v)
1090 (make-hstag-node
1091 :name (tag-name v)
1092 :attrs (start-tag-atts v)
1093 :children nil
1094 :parent s))
1095 ((end-tag-p v)
1096 (make-hetag-node
1097 :name (tag-name v)
1098 :attrs nil
1099 :children nil
1100 :parent s))
1102 (error "fix your code.")))))
1103 (setf (sgml:pt-children s) (nconc (sgml:pt-children s) (list n))) ))
1105 r) )
1107 (defun empty-element-p (dtd gi)
1108 (null (elm-inclusion dtd gi)))
1110 (defun transition (input dtd stack eingabe) ;; --> stack', eingabe', ausgabe
1111 (cond ((and stack (eq (tag-name (car stack)) :pcdata))
1112 (values (cdr stack)
1113 eingabe
1114 :close))
1115 ((null eingabe)
1116 (cond ((null stack)
1117 (values nil nil :accept))
1119 (values stack eingabe :error))))
1120 ;; (aS, </a>W) -> (S, W, </a>)
1121 ((comment-token-p (car eingabe))
1122 (values stack
1123 (cdr eingabe)
1124 (if (member (tag-name (car stack)) *preserves-comments-elements*)
1125 (progn
1126 (list :comment :pcdata (comment-token-data (car eingabe))))
1127 nil)))
1129 ((and (tag-p (car eingabe))
1130 (tag-as-marker-p (tag-name (car eingabe))))
1131 (values stack
1132 (cdr eingabe)
1133 (list :htag (car eingabe))))
1135 ((and (end-tag-p (car eingabe))
1136 stack
1137 (eq (tag-name (car stack))
1138 (tag-name (car eingabe))))
1139 (values (cdr stack)
1140 (cdr eingabe)
1141 :close))
1143 ((and (start-tag-p (car eingabe))
1144 stack
1145 (member (tag-name (car eingabe)) (elm-inclusion dtd (tag-name (car stack)))) )
1146 (cond
1147 ((empty-element-p dtd (tag-name (car eingabe)))
1148 (values stack
1149 (cdr eingabe)
1150 (list :open/close (tag-name (car eingabe)) (start-tag-atts (car eingabe)))))
1152 (values (cons (car eingabe) stack)
1153 (cdr eingabe)
1154 (list :open (tag-name (car eingabe)) (start-tag-atts (car eingabe)))))))
1156 ((and (white-space-token-p (car eingabe))
1157 stack
1158 (not (member :pcdata (elm-inclusion dtd (tag-name (car stack))))))
1159 ;; ignorieren
1160 (values stack (cdr eingabe) nil))
1161 ((null stack)
1162 (error "Oops empty stack in TRANSITION on ~S." eingabe))
1164 (let ((x (resolve dtd (tag-name (car stack)) (car eingabe))))
1165 (if x
1166 (values stack
1167 (cons x eingabe)
1168 nil)
1169 (values stack
1170 (heuristic input dtd (car stack) eingabe)
1171 nil)))) ))
1173 (defun tag-as-marker-p (gi)
1174 (and *font-heuristic-p*
1175 (eq gi :font))
1176 (and *anchor-heuristic-p*
1177 (eq gi :a)) )
1179 (defun white-space-token-p (x)
1180 (and (start-tag-p x)
1181 (eq (tag-name x) :pcdata)
1182 (every #'white-space-rune-p (start-tag-atts x))))
1184 ;;; Heuristic conflict resolution
1186 (defun shortest-different-beginning (x y)
1187 (do ((q x (cdr q))
1188 (i 0 (+ i 1)))
1189 ((null q))
1190 (do ((p y (cdr p))
1191 (j 0 (+ j 1)))
1192 ((null p))
1193 (if (eq p q)
1194 (return-from shortest-different-beginning
1195 (values (subseq x 0 i)
1196 (subseq y 0 j)))))))
1198 (defun document-action (input context alte-eingabe neue-eingabe is-default-p)
1199 (let ((see (car alte-eingabe)))
1200 (parse-warn input 4 "[~A] Saw ~A in ~A ~A"
1201 (if is-default-p "-" "H") see context
1202 (multiple-value-bind (a b) (shortest-different-beginning alte-eingabe neue-eingabe)
1203 (cond ((and (null a) (null b))
1204 (format nil "-- ??? patched ~S -> ~S" alte-eingabe neue-eingabe))
1205 ((null b)
1206 (format nil "-- nuked~{ ~A~}." a))
1207 ((null a)
1208 (format nil "-- inserted~{ ~A~}." b))
1210 (format nil "-- patched~{ ~A~} ->~{ ~A~}." a b)) )) )))
1212 (defun heuristic (input dtd context eingabe)
1213 (let ((see (car eingabe))
1214 (is-default-p nil))
1215 (labels ((is (tag state)
1216 (and (elms-eqv dtd state (tag-name context))
1217 (if (char= (char (symbol-name tag) 0) #\/)
1218 (and (end-tag-p (car eingabe))
1219 (elms-eqv dtd
1220 (tag-name (car eingabe))
1221 (kintern (subseq (symbol-name tag) 1))))
1222 (and (start-tag-p (car eingabe))
1223 (elms-eqv dtd (tag-name (car eingabe)) tag))))))
1224 (let ((neu
1225 (cond ((and (member :HTML (find-dtd-top-elements dtd))
1226 (cond ((and (end-tag-p see) (eq (tag-name see) :%top))
1227 (cons (elm-etag (tag-name context)) eingabe))
1229 ((and (start-tag-p see)
1230 (eq (tag-name see) :style))
1231 (cons (make-start-tag :name :SPAN
1232 :atts (list :class (rod "illegalstyle")))
1233 (cdr eingabe)))
1235 ((is :center #|in|# :h1)
1236 (list* (elm-etag (tag-name context))
1237 (car eingabe) context (cdr eingabe)))
1239 #+(OR)
1240 ;; this one for KMP
1241 ((is :h2 #|in|# :a)
1242 (list* (elm-etag (tag-name context)) (car eingabe) context (cdr eingabe)))
1244 ((or (is :center #|in|# :font)
1245 (is :p #|in|# :font))
1246 ;; Uff -- the attributes of FONT are lost here.
1247 ;; we have to extend `context' to include these.
1248 (list* (elm-etag (tag-name context)) (car eingabe) context (cdr eingabe)))
1249 ((is :hr #|in|# :i)
1250 (list* (elm-etag (tag-name context)) (car eingabe) context (cdr eingabe)))
1251 ((elms-eqv dtd (tag-name context) ':font)
1252 (cons (elm-etag (tag-name context)) eingabe))
1253 ((is :tr #|in|# :center)
1254 (cons (elm-etag (tag-name context)) eingabe))
1255 ((is :/h2 #|in|# :h1)
1256 (cons (elm-etag (tag-name context)) eingabe))
1257 ((is :ul #|in|# :h1)
1258 (cons (elm-etag (tag-name context)) eingabe))
1259 ((is :/center #|in|# :h3)
1260 (cons (elm-etag (tag-name context)) eingabe))
1261 ((is :/td #|in|# :div)
1262 (cons (elm-etag (tag-name context)) eingabe))
1263 ((is :p #|in|# :ul)
1264 (cons (make-start-tag :name :li) eingabe))
1265 ((is :a #|in|# :ul)
1266 (cons (make-start-tag :name :li) eingabe))
1267 ((is :img #|in|# :ul)
1268 (cons (make-start-tag :name :li) eingabe))
1270 ((is :pcdata #|in|# :ul)
1271 (cons (make-start-tag :name :li) eingabe))
1273 ((is :td #|in|# :table)
1274 (list* (make-start-tag :name :tr)
1275 eingabe))
1276 ((is :pcdata #|in|# :table)
1277 (list* (elm-etag (tag-name context)) (car eingabe) context
1278 (cdr eingabe)))
1280 ((is :frameset #|in|# :noscript)
1281 (list* (elm-etag (tag-name context)) eingabe))
1283 ((is :form #|in|# :table)
1284 (list* (elm-etag (tag-name context))
1285 (car eingabe)
1286 context
1287 (cdr eingabe)))
1289 ((is :/form #|in|# :tbody)
1290 ;; we should better check here wether FORM is open at all.
1291 (list* (cadr eingabe)
1292 (car eingabe)
1293 (cddr eingabe)))
1295 ;; new as of 1999-08-31
1296 ((is :td #|in|# :li)
1297 (list* (elm-etag (tag-name context))
1298 eingabe))
1299 ((is :/td #|in|# :li)
1300 (list* (elm-etag (tag-name context))
1301 eingabe))
1302 ((is :td #|in|# :ul)
1303 (list* (elm-etag (tag-name context))
1304 eingabe))
1305 ((is :/td #|in|# :ul)
1306 (list* (elm-etag (tag-name context))
1307 eingabe))
1311 (setq is-default-p t)
1312 (cdr eingabe)) )))
1313 (document-action input (tag-name context) eingabe neu is-default-p)
1314 neu))))
1316 (defun parse-html (input &optional (charset :iso-8859-1))
1317 (let ((dtd closure-html:*html-dtd*))
1318 (let ((input (runes:make-xstream input :initial-speed 1 :speed 128)))
1319 (setf (a-stream-scratch input)
1320 (make-array #.(* 2 *buf-size*) :element-type 'rune))
1321 (setup-code-vector input charset)
1322 (let ((r (sgml-parse dtd input)))
1323 (post-mortem-heuristic dtd r)))) )
1325 (defun post-mortem-heuristic (dtd parse-tree)
1326 "Do possible post mortem heuristic on a parse tree."
1327 (when *font-heuristic-p*
1328 (setf parse-tree (post-mortem/fix-font dtd parse-tree)))
1329 (setf parse-tree (post-mortem/fix-top-level-structure parse-tree))
1330 parse-tree)
1332 (defun post-mortem/fix-top-level-structure (parse-tree)
1333 ;; The hacking below is needed because of buggy input. Something like this
1334 ;; <base ..> <html> <head> .. <body> ... </html>
1335 ;; are two documents in one. Here we merge then into one HTML document.
1336 ;; Note that this defeats later incremental rendering pretty well.
1337 ;; Also this is HTML specific.
1338 ;; Das muss dennoch alles noch anders werden hier.
1339 (let ((r parse-tree)
1340 (head-elts nil)
1341 (body-elts nil)
1342 (frameset-elts nil)
1343 (head-warn-flag nil)
1344 (body-warn-flag nil)
1345 (frameset nil)
1346 (body nil))
1347 (when (> (length (pt-children r)) 1)
1348 (parse-warn nil 4 "Multiple HTML elements in document."))
1349 (dolist (k (pt-children r))
1350 (ecase (gi k)
1351 (:html
1352 (dolist (k (pt-children k))
1353 (ecase (gi k)
1354 (:head
1355 (when head-elts
1356 (setf head-warn-flag t))
1357 (setf head-elts (nconc head-elts (pt-children k))))
1358 ((:body)
1359 (setq body k)
1360 (when body-elts
1361 (setf body-warn-flag t))
1362 (setf body-elts (nconc body-elts (pt-children k))))
1363 ((:frameset)
1364 (setq frameset k)
1365 (setf frameset-elts (nconc frameset-elts (pt-children k))) ))))))
1366 (when head-warn-flag
1367 (parse-warn nil 4 "Multiple HEAD elements."))
1368 (when body-warn-flag
1369 (parse-warn nil 4 "Multiple BODY elements."))
1370 (let* ((html (make-pt/low :name :html :parent nil))
1371 (head (make-pt/low :name :head :parent html :children head-elts)))
1372 (cond (frameset
1373 (cond (body
1374 (parse-warn nil 4 "Body present while FRAMSET is present -- nuked!")
1375 (setf body nil body-elts nil))
1377 (setf (pt-parent frameset) html
1378 (pt-children frameset) frameset-elts)
1381 (cond (body
1382 (setf (pt-parent body) html
1383 (pt-children body) body-elts))
1385 (setf body (make-pt/low :name :body
1386 :parent html
1387 :children body-elts))))))
1388 (dolist (k head-elts) (setf (pt-parent k) head))
1389 (dolist (k body-elts) (setf (pt-parent k) body))
1390 (dolist (k frameset-elts) (setf (pt-parent k) frameset))
1391 (setf (pt-children html)
1392 (append (and head (list head))
1393 (and body (list body))
1394 (and frameset (list frameset))))
1395 html) ))
1397 ;;; FONT post mortem heuristic
1398 ;;; ==========================
1400 ;; Since FONT is by far the most often misused element, we handle these by a
1401 ;; post mortem analysis. If *font-heuristic-p* is true, parser does not care
1402 ;; for FONT start or end tags, but inserts HSTAG-NODEs and HETAG-NODEs into
1403 ;; the parse tree blindly. (Thus emulating the Mosaic approach to rendering).
1405 ;; Example
1406 ;; -------
1408 ;; <p> <font> foo <b> bar </font> baz </b>
1410 ;; is then parsed as:
1412 ;; (P #<HSTAG font> "foo" (B "bar" #<HETAG font> "baz"))
1415 ;; We now always want to return a parse tree, which conforms to the DTD, and
1416 ;; thus have to mungle this somehow into a sane HTML parse tree by wraping
1417 ;; FONT nodes around the right parts.
1419 ;; The first thing we do is to find all pairs of HSTAG, HETAG
1420 ;; nodes. We then use the following algorithm:
1422 ;; Algorithm
1423 ;; ---------
1425 ;; S = HSTAG node
1426 ;; E = HETAG node
1428 ;; if S and E are on the same level then // [*] that is E, S have the same parent
1429 ;; p <- S.parent
1430 ;; ;; partitionate p.children as:
1431 ;; p.children = (,@sb S ,@si E ,@se)
1432 ;; if si = () then
1433 ;; ;; FONT element spans nothing, so forget it
1434 ;; else
1435 ;; if p may contain FONT and
1436 ;; for all x in si: FONT may contain x
1437 ;; then
1438 ;; p.children <- (,@sb (font ,@si) ,@se)
1439 ;; all done
1441 ;; if S is higher in tree than E then
1442 ;; V <- ancestor of E, with V.parent = S.parent
1443 ;; insert a copy of E directly before E
1444 ;; insert a copy of S directly before first child of V
1445 ;; apply the algorithm recursively
1447 ;; if E is higher in tree than S then
1448 ;; ;; this analog to the case above
1449 ;; V <- ancestor of S, with V.parent = E.parent // this ancestor need not to exist!
1450 ;; insert a copy of S directly after V
1451 ;; insert a copy of E direclly after last child of V
1452 ;; apply the algorithm recursively
1455 ;; [*] This is not right. This was an thinko: Two nodes on the same
1456 ;; level do not always have the parent nodes.
1458 ;; NOTE: My first formulation of the algorithm used access pathen and the
1459 ;; implementation below does also -- this is inefficient and should be
1460 ;; changed.
1462 ;; NOTE: It would be nice, if we could prove that switching on
1463 ;; *FONT-HEURISTIC-P* does not hurd any conforming document.
1465 ;; When the need araises, we could also use the same method to deal
1466 ;; malicious B, I, et al tags; But bad nesting isn't that popular any more
1467 ;; these days.
1469 ;; Expirience however showed, that we would need something similar for FORM,
1470 ;; since people have the habit of spitting a FORM anywhere they see
1471 ;; fit. [Most popular is right between TR and TD]. Visit www.deja.com and you
1472 ;; see what I mean. FORM elements are extremely important, since one may be
1473 ;; able to cope with gliberish on the screen, but not with non-functional
1474 ;; forms. Another thing here is <INPUT> elements in inlegal spots.
1476 ;;; TODO
1478 ;; We still want to emit parser warnings, if FONT elements are misused.
1480 ;;;(defun post-mortem/fix-font (dtd parse-tree)
1481 ;;; (declare (special q))
1482 ;;; (map-htag-pairs (lambda (stag etag)
1483 ;;; (let (s e)
1484 ;;; (setf s (pt-path parse-tree stag))
1485 ;;; (remove-pt stag)
1486 ;;; (setf e (pt-path parse-tree etag))
1487 ;;; (remove-pt etag)
1488 ;;; (mungle-font-pair dtd parse-tree stag s e)))
1489 ;;; parse-tree :font)
1490 ;;; parse-tree)
1492 (defun mungle-font-pair (dtd root tag s e)
1493 ;; NOTE: ignore-errors is needed, since ELM-INCLUSION checks for existing
1494 ;; element names (this is introduced by our pseudo TOP element).
1495 (cond ((equal s e))
1496 ((and (= (length s) (length e))
1497 (equal (butlast s) (butlast e)))
1498 ;; implicit assert was (butlast s) equal (butlast e), which is not always true.
1499 (let ((p (pt-access root (butlast s))))
1500 (cond ((and (member :font (ignore-errors (elm-inclusion dtd (gi p))))
1501 (do ((j (car (last s)) (+ j 1)))
1502 ((>= j (car (last e))) t)
1503 (when (not (null (elt (pt-children p) j)))
1504 ;; this should always be true!
1505 (unless (member (gi (elt (pt-children p) j))
1506 (ignore-errors (elm-inclusion dtd :font)))
1507 (return nil)))))
1508 (let* ((before (loop for i from 0 to (1- (car (last s)))
1509 collect (elt (pt-children p) i)))
1510 (between (loop for i from (car (last s)) to (1- (car (last e)))
1511 collect (elt (pt-children p) i)))
1512 (after (loop for i from (car (last e)) to (1- (length (pt-children p)))
1513 collect (elt (pt-children p) i)))
1514 (new (make-pt/low :name (gi tag)
1515 :attrs (pt-attrs tag)
1516 :children between
1517 :parent p)))
1518 (setf (pt-children p) (append before (list new) after))
1519 (dolist (k between)
1520 (setf (pt-parent k) new)) ))
1522 (loop for i from (car (last s)) to (1- (car (last e))) do
1523 (mungle-font-pair
1524 dtd root tag
1525 (append (butlast s) (list i 0))
1526 (append (butlast s)
1527 (list i (length (pt-children (elt (pt-children p) i)))))))))))
1528 ((and (< (length s) (length e))
1529 '(equal (subseq e (length s)) s))
1530 (let ((v (subseq e 0 (length s))))
1531 (mungle-font-pair dtd root tag (append v (list 0)) e)
1532 (mungle-font-pair dtd root tag s v)))
1533 ((and (> (length s) (length e))
1534 '(equal (subseq s (length e)) e))
1535 (let* ((v (subseq s 0 (length e)))
1536 (m ))
1537 (mungle-font-pair dtd root tag
1538 (append (butlast v) (list (+ 1 (car (last v))))) e)
1539 (setf m (length (pt-children (pt-access root v))))
1540 (mungle-font-pair dtd root tag
1541 s (append v (list m)))))
1543 (let (p)
1544 ;; search to first common parent
1545 (do ((i 0 (+ i 1)))
1546 ((or (= i (length s))
1547 (= i (length e))
1548 (not (eq (elt s i) (elt e i))))
1549 (setf p (subseq s 0 i))))
1550 (let* ((v1 (append (subseq s 0 (length p))
1551 (list (+ 1 (elt s (length p))))))
1552 (v2 v1))
1553 (mungle-font-pair dtd root tag s v1)
1554 (mungle-font-pair dtd root tag v2 e))))
1557 (defun map-htag-pairs (continuation parse-tree gi)
1558 ;; this function maps the inner htags first.
1559 (let ((stack nil))
1560 (labels ((walk (x)
1561 (cond ((and (hstag-node-p x) (eq (gi x) gi))
1562 (push x stack))
1563 ((and (hetag-node-p x) (eq (gi x) gi))
1564 (cond ((null stack)
1565 (remove-pt x)
1566 (parse-warn nil 4 "Superfluous ~A end tag." gi))
1568 (funcall continuation (pop stack) x))))
1570 (mapc #'walk (pt-children x))))))
1571 (walk parse-tree)
1572 ;; take care for non-closed start tags
1573 (unless (null stack)
1574 (parse-warn nil 4 "Unclosed ~A elements." gi)
1575 (let ((new (mapcar (lambda (x)
1576 (declare (ignore x))
1577 (make-hetag-node :name gi
1578 :attrs nil
1579 :parent parse-tree
1580 :children nil))
1581 stack)))
1582 (setf (pt-children parse-tree)
1583 (append (pt-children parse-tree) new))
1584 (mapc continuation stack new))) )))
1586 ;;; FORM post mortem heuristic
1587 ;;; ==========================
1589 ;; FORM is another element, which some authors get incredible
1590 ;; wrong. We deal with this in a similar way;
1592 ;; while we parse HTAG are put into the parse tree, we sort it out
1593 ;; after we are done parsing; There are two possibilities:
1595 ;; a. We are able to insert FORM in a sane way.
1596 ;; b. We are not able to do this
1598 ;; How is b. now handled? I see two solutions:
1600 ;; - basically leave HTAGs in the tree.
1601 ;; - permitt multiple FORM elements and connect them via some special
1602 ;; attribute.
1606 (defun setup-code-vector (input charset)
1607 (let ((enc (runes-encoding:find-encoding charset)))
1608 (cond ((not (null enc))
1609 (setf (runes:xstream-encoding input) enc))
1611 (parse-warn input 4 "There is no such encoding: ~S." charset)))))
1613 (defun sgml-parse-file (filename dtd)
1614 (with-open-file (input filename :direction :input :element-type '(unsigned-byte 8))
1615 (sgml-parse dtd (make-a-stream :cl-stream (cl-byte-stream->gstream input)))))
1617 (defun html-parse-file (filename)
1618 (with-open-file (input filename :direction :input :element-type '(unsigned-byte 8))
1619 (parse-html (cl-byte-stream->gstream input))))
1622 (defun html-parse-url (url)
1623 (unless (url:url-p url) (setf url (url:parse-url url)))
1624 (netlib:with-open-document ((input mime-type) url)
1625 mime-type
1626 (parse-html input)))
1629 (defun check-saneness (pt &optional (dtd closure-html:*html-dtd*))
1630 (dolist (k (pt-children pt))
1631 (unless (member (gi k) (elm-inclusion dtd (gi pt)))
1632 (warn "Unallowed ~A element within ~A." (gi k) (gi pt)))
1633 (unless (eq (pt-parent k) pt)
1634 (warn "Parent/child linkage broken."))
1635 (cond ((htag-node-p k)
1636 (warn "HTAG node (~S) left in parse tree. (parent ~S)" (gi k) (gi pt))))
1637 (check-saneness k)))
1639 ;;;(defun post-mortem/fix-font (dtd parse-tree)
1640 ;;; ;; das hatten wir schon, so geht das nicht -- oder nur sehr schwer.
1641 ;;; dtd
1642 ;;; (let ((font-stack nil)) ;stack of open HTAG FONT elements
1643 ;;; (labels ((walk (pt)
1644 ;;; (cond ((hstag-node-p pt)
1645 ;;; (push pt font-stack)
1646 ;;; nil)
1647 ;;; ((hetag-node-p pt)
1648 ;;; (pop font-stack)
1649 ;;; nil)
1650 ;;; (t
1651 ;;; (cond ((not (null font-stack))
1652 ;;; ;; some fonts are open
1653 ;;; ;; warp them around this node
1654 ;;;
1655 ;;; )
1656 ;;; (t
1657 ;;; (let ((new-children (mapcan #'walk (pt-children pt))))
1658 ;;; (dolist (k new-children)
1659 ;;; (setf (pt-parent k) pt))
1660 ;;; (setf (pt-children pt) new-children)
1661 ;;; (list pt))) )) )))
1662 ;;; (car (walk parse-tree)) )))
1664 (defun post-mortem/fix-font (dtd parse-tree)
1665 (declare (special q))
1666 (let ((pairs nil))
1667 ;; erstmal alle font tag paare suchen
1668 (map-htag-pairs (lambda (stag etag)
1669 (push (cons stag etag) pairs))
1670 parse-tree :font)
1671 (setf pairs (reverse pairs))
1672 ;; dann alle raus nehmen
1673 '(dolist (k pairs)
1674 (remove-pt (car k))
1675 (remove-pt (cdr k)))
1676 ;; dann alle munglen
1677 (dolist (pair pairs)
1678 (let ((stag (car pair)) (etag (cdr pair)) s e)
1679 (setf s (pt-path parse-tree stag))
1680 (assert (eq (pt-access parse-tree s) stag))
1681 (remove-pt stag)
1682 (setf e (pt-path parse-tree etag))
1683 (assert (eq (pt-access parse-tree e) etag))
1684 (remove-pt etag)
1685 (cond ((and s e)
1686 (mungle-font-pair dtd parse-tree stag s e))
1688 (warn "Hmm..."))))))
1689 parse-tree)
1691 ;;; ===========================================================================
1692 ;;; 'A' Heuristic
1695 ;; We want to match Netscape's behaviour and thus by experimentation,
1696 ;; we conclude:
1698 (defun shortest-path-to (dtd pathen goal &optional (max-depth 10))
1699 (cond ((<= max-depth 0)
1700 nil)
1701 ((find-if (lambda (p)
1702 (eql (car p) goal))
1703 pathen))
1705 (shortest-path-to dtd
1706 (mapcan (lambda (p)
1707 (mapcar (lambda (s)
1708 (cons s p))
1709 (elm-surclusion dtd (first p))))
1710 pathen)
1711 goal
1712 (1- max-depth))) ))
1717 ;; Start Tag anaylis
1719 (defun blah (offending)
1720 (with-open-file (sink "/tmp/t.html"
1721 :direction :output
1722 :if-exists :new-version)
1723 (let ((dtd closure-html:*html-dtd*))
1724 (let ((p (shortest-path dtd :BODY offending)))
1725 (let ((p2 (shortest-path dtd offending :PCDATA)))
1726 (format sink "~A<BR>~%" offending)
1727 (when (and p p2)
1728 (pop p) ;forget BODY
1729 (pop p2)
1730 (setq p2 (butlast p2))
1731 ;; pre-material
1732 (dolist (k (butlast p)) (format sink "<~A>~%" k))
1733 (format sink "<A href='xxx'>~%")
1734 (format sink "<img src='/tmp/a.gif'>A")
1735 (format sink "<~A>~%" (car (last p)))
1736 (dolist (k p2) (format sink "<~A>~%" k))
1737 (format sink "<img src='/tmp/b.gif'>B")
1738 (dolist (k (reverse p2)) (format sink "</~A>~%" k))
1739 (format sink "</~A>~%" (car (last p)))
1740 (format sink "<img src='/tmp/c.gif'>C")
1741 (dolist (k (reverse (butlast p))) (format sink "</~A>~%" k))
1742 ))))))
1744 (defun open-in-netscape (url)
1745 (glisp:run-unix-shell-command
1746 (format nil "my-netscape -remote 'openURL(~A)'" url)))
1748 (defun bluu ()
1749 (let ((i 0))
1750 (dolist (off (all-elms closure-html:*html-dtd*))
1751 (cond (t '(or (member :B (elm-inclusion closure-html:*html-dtd* off))
1752 (member :P (elm-inclusion closure-html:*html-dtd* off)))
1753 (blah off)
1754 (format T "~&;; ~A" off)
1755 (open-in-netscape "file:/tmp/t.html")
1756 (sleep 1)
1757 (let ((nam (format nil "/tmp/~A.gif" off)))
1758 (glisp:run-unix-shell-command (format nil "(my-xwd -id 0x3c003bf | xwdtopnm | ppmtogif) 2>/dev/null > ~A &" nam)))
1759 (sleep 1))
1761 (format T "~&;; Skipping ~A, because inclusion is ~A."
1762 off (elm-inclusion closure-html:*html-dtd* off)))))))
1765 (defun equivalence-classes (prediate set)
1766 (let ((classes nil))
1767 (dolist (item set)
1768 (do ((cp classes (cdr cp)))
1769 ((null cp)
1770 (push (list item) classes))
1771 (cond ((funcall prediate (caar cp) item)
1772 (push item (car cp))
1773 (return)))))
1774 classes))
1776 (defun shortest-path (dtd from to &optional (max-depth 10))
1777 (reverse (shortest-path/aux dtd (list (list from)) to max-depth)))
1779 (defun shortest-path/aux (dtd pathen to &optional (max-depth 10))
1780 (cond ((<= max-depth 0)
1781 nil)
1782 ((null pathen)
1783 nil)
1784 ((find-if (lambda (p) (eq (car p) to)) pathen))
1785 ((shortest-path/aux
1787 (remove-duplicates
1788 (mapcan (lambda (path)
1789 (mapcar (lambda (child)
1790 (cons child path))
1791 (elm-inclusion dtd (car path))))
1792 pathen)
1793 :test (lambda (x y) (eql (car x) (car y))))
1795 (1- max-depth)))))
1798 (defun bloo ()
1799 (let ((dtd closure-html:*html-dtd*))
1800 (equivalence-classes (lambda (x y)
1801 (and (set-equal (elm-inclusion dtd x) (elm-inclusion dtd y))
1802 (set-equal (elm-surclusion dtd x) (elm-surclusion dtd y))))
1803 (all-elms dtd))))
1806 ;;; ===========================================================================
1808 (defun mungle (x)
1809 (cond ((and (vectorp x) (not (stringp x))) (rod-string x))
1810 ((atom x) x)
1811 ((cons (mungle (car x))
1812 (mungle (cdr x))))))
1815 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1818 (format T "~&;; Parse error (line ~D column ~D): [~A] Saw ~A in ~A."
1819 (runes:xstream-line-number input)
1820 (runes:xstream-column-number input))