libjava/ChangeLog:
[official-gcc.git] / libjava / classpath / gnu / javax / swing / text / html / parser / support / Parser.java
blobb087c3c003ced5e62bc972084a91496cbc1ac2f7
1 /* Parser.java -- HTML parser.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
39 package gnu.javax.swing.text.html.parser.support;
41 import gnu.java.lang.CPStringBuilder;
43 import gnu.javax.swing.text.html.parser.htmlAttributeSet;
44 import gnu.javax.swing.text.html.parser.htmlValidator;
45 import gnu.javax.swing.text.html.parser.support.low.Constants;
46 import gnu.javax.swing.text.html.parser.support.low.ParseException;
47 import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer;
48 import gnu.javax.swing.text.html.parser.support.low.Token;
49 import gnu.javax.swing.text.html.parser.support.low.node;
50 import gnu.javax.swing.text.html.parser.support.low.pattern;
52 import java.io.IOException;
53 import java.io.Reader;
55 import java.util.Comparator;
56 import java.util.Set;
57 import java.util.TreeSet;
58 import java.util.Vector;
60 import javax.swing.text.ChangedCharSetException;
61 import javax.swing.text.SimpleAttributeSet;
62 import javax.swing.text.html.HTML;
63 import javax.swing.text.html.parser.AttributeList;
64 import javax.swing.text.html.parser.DTD;
65 import javax.swing.text.html.parser.DTDConstants;
66 import javax.swing.text.html.parser.Element;
67 import javax.swing.text.html.parser.Entity;
68 import javax.swing.text.html.parser.TagElement;
70 /**
71 * <p>A simple error-tolerant HTML parser that uses a DTD document
72 * to access data on the possible tokens, arguments and syntax.</p>
73 * <p> The parser reads an HTML content from a Reader and calls various
74 * notifying methods (which should be overridden in a subclass)
75 * when tags or data are encountered.</p>
76 * <p>Some HTML elements need no opening or closing tags. The
77 * task of this parser is to invoke the tag handling methods also when
78 * the tags are not explicitly specified and must be supposed using
79 * information, stored in the DTD.
80 * For example, parsing the document
81 * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
82 * will invoke the handling methods in exactly the same order
83 * (and with the same parameters) as if parsing the document: <br>
84 * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
85 * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
86 * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
87 * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
88 * (supposed tags are given in italics). The parser also supports
89 * obsolete elements of HTML syntax.<p>
90 * </p>
91 * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
93 public class Parser
94 extends ReaderTokenizer
95 implements DTDConstants
97 /**
98 * The current html tag.
100 public Token hTag = new Token();
103 * The document template description that will be used to parse the documents.
105 protected DTD dtd;
108 * The value of this field determines whether or not the Parser will be
109 * strict in enforcing SGML compatibility. The default value is false,
110 * stating that the parser should do everything to parse and get at least
111 * some information even from the incorrectly written HTML input.
113 protected boolean strict;
116 * This fields has positive values in preformatted tags.
118 protected int preformatted = 0;
121 * The set of the document tags. This field is used for supporting
122 * markFirstTime().
124 private Set documentTags =
125 new TreeSet(new Comparator()
127 public int compare(Object a, Object b)
129 return ((String) a).compareToIgnoreCase((String) b);
135 * The buffer to collect the incremental output like text or coment.
137 private final StringBuffer buffer = new StringBuffer();
140 * The buffer to store the document title.
142 private final StringBuffer title = new StringBuffer();
145 * The current token.
147 private Token t;
150 * True means that the 'title' tag of this document has
151 * already been handled.
153 private boolean titleHandled;
156 * True means that the 'title' tag is currently open and all
157 * text is also added to the title buffer.
159 private boolean titleOpen;
162 * The attributes of the current HTML element.
163 * Package-private to avoid an accessor method.
165 htmlAttributeSet attributes =
166 htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
169 * The validator, controlling the forcible closing of the tags that
170 * (in accordance to dtd) are not allowed in the current context.
172 private htmlValidator validator;
175 * Provides the default values for parameters in the case when these
176 * values are defined in the DTD.
178 private parameterDefaulter defaulter;
181 * The text pre-processor for handling line ends and tabs.
183 private textPreProcessor textProcessor = new textPreProcessor();
186 * Creates a new Parser that uses the given
187 * {@link javax.swing.text.html.parser.DTD }. The only standard way
188 * to get an instance of DTD is to construct it manually, filling in
189 * all required fields.
190 * @param a_dtd The DTD to use. The parser behaviour after passing null
191 * as an argument is not documented and may vary between implementations.
193 public Parser(DTD a_dtd)
195 if (a_dtd == null)
196 dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance();
197 else
198 dtd = a_dtd;
200 defaulter = new parameterDefaulter(dtd);
202 validator =
203 new htmlValidator(dtd)
206 * Handles the error message. This method must be overridden to pass
207 * the message where required.
208 * @param msg The message text.
210 protected void s_error(String msg)
212 error(msg);
216 * The method is called when the tag validator decides to close the
217 * tag on its own initiative. After reaching the end of stream,
218 * The tag validator closes all unclosed elements that are required
219 * to have the end (closing) tag.
221 * @param tElement The tag being fictionally (forcibly) closed.
223 protected void handleSupposedEndTag(Element tElement)
225 // The tag is cloned as the original tElement is the
226 // element from the starting tag - may be accidently used
227 // somewhere else.
228 TagElement tag = makeTag(tElement, true);
229 _handleEndTag_remaining(tag);
233 * The method is called when the the tag validator decides to open
234 * the new tag on its own initiative. The tags, opened in this
235 * way, are HTML, HEAD and BODY. The attribute set is temporary
236 * assigned to the empty one, the previous value is
237 * restored before return.
239 * @param tElement The tag being fictionally (forcibly) closed.
241 protected void handleSupposedStartTag(Element tElement)
243 TagElement tag = makeTag(tElement, true);
244 htmlAttributeSet were = attributes;
245 attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
246 _handleStartTag(tag);
247 attributes = were;
253 * Get the attributes of the current tag.
254 * @return The attribute set, representing the attributes of the current tag.
256 public SimpleAttributeSet getAttributes()
258 return new SimpleAttributeSet(attributes);
262 * Invokes the error handler. The default method in this implementation
263 * delegates the call to handleError, also providing the current line.
265 public void error(String msg)
267 error(msg, getTokenAhead());
270 public void error(String msg, Token atToken)
272 if (atToken != null)
273 handleError(atToken.where.beginLine,
274 msg + ": line " + atToken.where.beginLine +
275 ", absolute pos " + atToken.where.startPosition
277 else
278 handleError(0, msg);
282 * Invokes the error handler. The default method in this implementation
283 * delegates the call to error (parm1+": '"+parm2+"'").
285 public void error(String msg, String invalid)
287 error(msg + ": '" + invalid + "'");
291 * Invokes the error handler. The default method in this implementation
292 * delegates the call to error (parm1+" "+ parm2+" "+ parm3).
294 public void error(String parm1, String parm2, String parm3)
296 error(parm1 + " " + parm2 + " " + parm3);
300 * Invokes the error handler. The default method in this implementation
301 * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
303 public void error(String parm1, String parm2, String parm3, String parm4)
305 error(parm1 + " " + parm2 + " " + parm3 + " " + parm4);
308 public void flushAttributes()
313 * Parse the HTML text, calling various methods in response to the
314 * occurence of the corresponding HTML constructions.
315 * @param reader The reader to read the source HTML from.
316 * @throws IOException If the reader throws one.
318 public synchronized void parse(Reader reader)
319 throws IOException
321 reset(reader);
322 restart();
325 parseDocument();
326 validator.closeAll();
328 catch (ParseException ex)
330 if (ex != null)
332 error("Unable to continue parsing the document", ex.getMessage());
334 Throwable cause = ex.getCause();
335 if (cause instanceof IOException)
336 throw (IOException) cause;
342 * Parses DTD markup declaration. Currently returns null without action.
343 * @return null.
344 * @throws IOException
346 public String parseDTDMarkup()
347 throws IOException
349 return null;
353 * Parse SGML insertion ( &lt;! ... &gt; ). When the
354 * the SGML insertion is found, this method is called, passing
355 * SGML in the string buffer as a parameter. The default method
356 * returns false without action and can be overridden to
357 * implement user - defined SGML support.
358 * <p>
359 * If you need more information about SGML insertions in HTML documents,
360 * the author suggests to read SGML tutorial on
361 * {@link http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html}.
362 * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>,
363 * Oxford University Press, 688 p, ISBN: 0198537379.
364 * </p>
365 * @param strBuff
366 * @return true if this is a valid DTD markup declaration.
367 * @throws IOException
369 public boolean parseMarkupDeclarations(StringBuffer strBuff)
370 throws IOException
372 return false;
376 * Get the first line of the last parsed token.
378 protected int getCurrentLine()
380 return hTag.where.beginLine;
384 * Read parseable character data, add to buffer.
385 * @param clearBuffer If true, buffer if filled by CDATA section,
386 * otherwise the section is appended to the existing content of the
387 * buffer.
389 * @throws ParseException
391 protected void CDATA(boolean clearBuffer)
392 throws ParseException
394 Token start = hTag = getTokenAhead();
396 if (clearBuffer)
397 buffer.setLength(0);
399 // Handle expected EOF.
400 if (start.kind == EOF)
401 return;
403 read:
404 while (true)
406 t = getTokenAhead();
407 if (t.kind == EOF)
409 error("unexpected eof", t);
410 break read;
412 else if (t.kind == BEGIN)
413 break read;
414 else if (t.kind == Constants.ENTITY)
416 resolveAndAppendEntity(t);
417 getNextToken();
419 else
421 append(t);
422 getNextToken();
425 hTag = new Token(start, getTokenAhead(0));
426 if (buffer.length() != 0)
427 _handleText();
431 * Process Comment. This method skips till --> without
432 * taking SGML constructs into consideration. The supported SGML
433 * constructs are handled separately.
435 protected void Comment()
436 throws ParseException
438 buffer.setLength(0);
440 Token start = hTag = mustBe(BEGIN);
441 optional(WS);
442 mustBe(EXCLAMATION);
443 optional(WS);
444 mustBe(DOUBLE_DASH);
446 Token t;
447 Token last;
449 comment:
450 while (true)
452 t = getTokenAhead();
453 if (t.kind == EOF)
455 handleEOFInComment();
456 last = t;
457 break comment;
459 else if (COMMENT_END.matches(this))
461 mustBe(DOUBLE_DASH);
462 optional(WS);
463 last = mustBe(END);
464 break comment;
466 else if (COMMENT_TRIPLEDASH_END.matches(this))
468 mustBe(DOUBLE_DASH);
469 t = mustBe(NUMTOKEN);
470 if (t.getImage().equals("-"))
472 append(t);
473 last = mustBe(END);
474 break comment;
476 else
478 buffer.append("--");
479 append(t);
480 t = getTokenAhead();
483 else
484 /* The lllll-- can match as NUMTOKEN */
485 if ((t.getImage().endsWith("--")) &&
487 getTokenAhead(1).kind == END ||
488 (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END)
492 buffer.append(t.getImage().substring(0, t.getImage().length() - 2));
494 /* Skip the closing > that we have already checked. */
495 last = mustBe(t.kind);
496 break comment;
498 else
499 append(t);
500 mustBe(t.kind);
502 hTag = new Token(start, last);
504 // Consume any whitespace immediately following a comment.
505 optional(WS);
506 handleComment();
510 * Read a script. The text, returned without any changes,
511 * is terminated only by the closing tag SCRIPT.
513 protected void Script()
514 throws ParseException
516 Token name;
518 Token start = hTag = mustBe(BEGIN);
519 optional(WS);
521 name = mustBe(SCRIPT);
523 optional(WS);
525 restOfTag(false, name, start);
527 buffer.setLength(0);
529 while (!SCRIPT_CLOSE.matches(this))
531 append(getNextToken());
534 consume(SCRIPT_CLOSE);
536 _handleText();
538 endTag(false);
539 _handleEndTag(makeTagElement(name.getImage(), false));
543 * Process SGML insertion that is not a comment.
545 protected void Sgml()
546 throws ParseException
548 if (COMMENT_OPEN.matches(this))
549 Comment();
550 else // skip till ">"
552 Token start = hTag = mustBe(BEGIN);
553 optional(WS);
554 mustBe(EXCLAMATION);
556 buffer.setLength(0);
557 read:
558 while (true)
560 t = getNextToken();
561 if (t.kind == Constants.ENTITY)
563 resolveAndAppendEntity(t);
565 else if (t.kind == EOF)
567 error("unexpected eof", t);
568 break read;
570 else if (t.kind == END)
571 break read;
572 else
573 append(t);
578 parseMarkupDeclarations(buffer);
580 catch (IOException ex)
582 error("Unable to parse SGML insertion: '" + buffer + "'",
583 new Token(start, t)
587 // Consume any whitespace that follows the Sgml insertion.
588 optional(WS);
592 * Read a style definition. The text, returned without any changes,
593 * is terminated only by the closing tag STYLE.
595 protected void Style()
596 throws ParseException
598 Token name;
600 Token start = hTag = mustBe(BEGIN);
601 optional(WS);
603 name = mustBe(STYLE);
605 optional(WS);
607 restOfTag(false, name, start);
609 buffer.setLength(0);
611 while (!STYLE_CLOSE.matches(this))
613 append(getNextToken());
616 consume(STYLE_CLOSE);
618 _handleText();
620 endTag(false);
621 _handleEndTag(makeTagElement(name.getImage(), false));
625 * Read a html tag.
627 protected void Tag()
628 throws ParseException
630 mark(true);
632 boolean closing = false;
633 Token name;
634 Token start = hTag = mustBe(BEGIN);
636 optional(WS);
637 name = getNextToken();
638 optional(WS);
640 if (name.kind == SLASH)
642 closing = true;
643 name = getNextToken();
646 restOfTag(closing, name, start);
650 * A hook, for operations, preceeding call to handleText.
651 * Handle text in a string buffer.
652 * In non - preformatted mode, all line breaks immediately following the
653 * start tag and immediately before an end tag is discarded,
654 * \r, \n and \t are replaced by spaces, multiple space are replaced
655 * by the single one and the result is moved into array,
656 * passing it to handleText().
658 protected void _handleText()
660 char[] text;
662 if (preformatted > 0)
663 text = textProcessor.preprocessPreformatted(buffer);
664 else
665 text = textProcessor.preprocess(buffer);
667 if (text != null && text.length > 0
668 // According to the specs we need to discard whitespace immediately
669 // before a closing tag.
670 && (text.length > 1 || text[0] != ' ' || ! TAG_CLOSE.matches(this)))
672 TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
673 attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
674 _handleEmptyTag(pcdata);
676 handleText(text);
677 if (titleOpen)
678 title.append(text);
683 * Add the image of this token to the buffer.
684 * @param t A token to append.
686 protected final void append(Token t)
688 if (t.kind != EOF)
689 t.appendTo(buffer);
693 * Consume pattern that must match.
694 * @param p A pattern to consume.
696 protected final void consume(pattern p)
698 node n;
699 for (int i = 0; i < p.nodes.length; i++)
701 n = p.nodes [ i ];
702 if (n.optional)
703 optional(n.kind);
704 else
705 mustBe(n.kind);
710 * The method is called when the HTML end (closing) tag is found or if
711 * the parser concludes that the one should be present in the
712 * current position. The method is called immediatly
713 * before calling the handleEndTag().
714 * @param omitted True if the tag is no actually present in the document,
715 * but is supposed by the parser (like &lt;/html&gt; at the end of the
716 * document).
718 protected void endTag(boolean omitted)
723 * Handle HTML comment. The default method returns without action.
724 * @param comment
726 protected void handleComment(char[] comment)
731 * This is additionally called in when the HTML content terminates
732 * without closing the HTML comment. This can only happen if the
733 * HTML document contains errors (for example, the closing --;gt is
734 * missing.
736 protected void handleEOFInComment()
738 error("Unclosed comment");
742 * Handle the tag with no content, like &lt;br&gt;. The method is
743 * called for the elements that, in accordance with the current DTD,
744 * has an empty content.
745 * @param tag The tag being handled.
746 * @throws javax.swing.text.ChangedCharSetException
748 protected void handleEmptyTag(TagElement tag)
749 throws javax.swing.text.ChangedCharSetException
754 * The method is called when the HTML closing tag ((like &lt;/table&gt;)
755 * is found or if the parser concludes that the one should be present
756 * in the current position.
757 * @param tag The tag
759 protected void handleEndTag(TagElement tag)
763 /* Handle error that has occured in the given line. */
764 protected void handleError(int line, String message)
769 * The method is called when the HTML opening tag ((like &lt;table&gt;)
770 * is found or if the parser concludes that the one should be present
771 * in the current position.
772 * @param tag The tag
774 protected void handleStartTag(TagElement tag)
779 * Handle the text section.
780 * <p> For non-preformatted section, the parser replaces
781 * \t, \r and \n by spaces and then multiple spaces
782 * by a single space. Additionaly, all whitespace around
783 * tags is discarded.
784 * </p>
785 * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves
786 * all tabs and spaces, but removes <b>one</b> bounding \r, \n or \r\n,
787 * if it is present. Additionally, it replaces each occurence of \r or \r\n
788 * by a single \n.</p>
790 * @param text A section text.
792 protected void handleText(char[] text)
797 * Handle HTML &lt;title&gt; tag. This method is invoked when
798 * both title starting and closing tags are already behind.
799 * The passed argument contains the concatenation of all
800 * title text sections.
801 * @param title The title text.
803 protected void handleTitle(char[] title)
808 * Constructs the tag from the given element. In this implementation,
809 * this is defined, but never called.
810 * @return the tag
812 protected TagElement makeTag(Element element)
814 return makeTag(element, false);
818 * Constructs the tag from the given element.
819 * @param the tag base {@link javax.swing.text.html.parser.Element}
820 * @param isSupposed true if the tag is not actually present in the
821 * html input, but the parser supposes that it should to occur in
822 * the current location.
823 * @return the tag
825 protected TagElement makeTag(Element element, boolean isSupposed)
827 return new TagElement(element, isSupposed);
831 * This is called when the tag, representing the given element,
832 * occurs first time in the document.
833 * @param element
835 protected void markFirstTime(Element element)
840 * Consume the token that was checked before and hence MUST be present.
841 * @param kind The kind of token to consume.
843 protected Token mustBe(int kind)
845 if (getTokenAhead().kind == kind)
846 return getNextToken();
847 else
849 String ei = "";
850 if (kind < 1000)
851 ei = " ('" + (char) kind + "') ";
852 throw new AssertionError("The token of kind " + kind + ei +
853 " MUST be here,"
859 * Handle attribute without value. The default method uses
860 * the only allowed attribute value from DTD.
861 * If the attribute is unknown or allows several values,
862 * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with
863 * this value is added to the attribute set.
864 * @param element The name of element.
865 * @param attribute The name of attribute without value.
867 protected void noValueAttribute(String element, String attribute)
869 Object value = HTML.NULL_ATTRIBUTE_VALUE;
871 Element e = dtd.elementHash.get(element.toLowerCase());
872 if (e != null)
874 AttributeList attr = e.getAttribute(attribute);
875 if (attr != null)
877 Vector values = attr.values;
878 if (values != null && values.size() == 1)
879 value = values.get(0);
882 attributes.addAttribute(attribute, value);
886 * Consume the optional token, if present.
887 * @param kind The kind of token to consume.
889 protected Token optional(int kind)
891 if (getTokenAhead().kind == kind)
892 return getNextToken();
893 else
894 return null;
897 /** Parse the html document. */
898 protected void parseDocument()
899 throws ParseException
901 // Read up any initial whitespace.
902 optional(WS);
903 while (getTokenAhead().kind != EOF)
905 advanced = false;
906 if (TAG.matches(this))
907 Tag();
908 else if (COMMENT_OPEN.matches(this))
909 Comment();
910 else if (STYLE_OPEN.matches(this))
911 Style();
912 else if (SCRIPT_OPEN.matches(this))
913 Script();
914 else if (SGML.matches(this))
915 Sgml();
916 else
917 CDATA(true);
919 // Surely HTML error, treat as a text.
920 if (!advanced)
922 Token wrong = getNextToken();
923 error("unexpected '" + wrong.getImage() + "'", wrong);
924 buffer.setLength(0);
925 buffer.append(wrong.getImage());
926 _handleText();
932 * Read the element attributes, adding them into attribute set.
933 * @param element The element name (needed to access attribute
934 * information in dtd).
936 protected void readAttributes(String element)
938 Token name;
939 Token value;
940 Token next;
941 String attrValue;
943 attributes = new htmlAttributeSet();
945 optional(WS);
947 attributeReading:
948 while (getTokenAhead().kind == NUMTOKEN)
950 name = getNextToken();
951 optional(WS);
953 next = getTokenAhead();
954 if (next.kind == EQ)
956 mustBe(EQ);
957 optional(WS);
959 next = getNextToken();
961 switch (next.kind)
963 case QUOT:
965 // read "quoted" attribute.
966 buffer.setLength(0);
967 readTillTokenE(QUOT);
968 attrValue = buffer.toString();
969 break;
971 case AP:
973 // read 'quoted' attribute.
974 buffer.setLength(0);
975 readTillTokenE(AP);
976 attrValue = buffer.toString();
977 break;
979 // read unquoted attribute.
980 case NUMTOKEN:
981 value = next;
982 optional(WS);
984 // Check maybe the opening quote is missing.
985 next = getTokenAhead();
986 if (bQUOTING.get(next.kind))
988 hTag = next;
989 error("The value without opening quote is closed with '"
990 + next.getImage() + "'");
991 attrValue = value.getImage();
993 else if (next.kind == SLASH || next.kind == OTHER)
994 // The slash and other characters (like %) in this context is
995 // treated as the ordinary
996 // character, not as a token. The character may be part of
997 // the unquoted URL.
999 CPStringBuilder image = new CPStringBuilder(value.getImage());
1000 while (next.kind == NUMTOKEN || next.kind == SLASH
1001 || next.kind == OTHER)
1003 image.append(getNextToken().getImage());
1004 next = getTokenAhead();
1006 attrValue = image.toString();
1008 else
1009 attrValue = value.getImage();
1010 break;
1012 case SLASH:
1013 value = next;
1014 optional(WS);
1016 // Check maybe the opening quote is missing.
1017 next = getTokenAhead();
1018 if (bQUOTING.get(next.kind))
1020 hTag = next;
1021 error("The value without opening quote is closed with '"
1022 + next.getImage() + "'");
1023 attrValue = value.getImage();
1025 else if (next.kind == NUMTOKEN || next.kind == SLASH)
1026 // The slash in this context is treated as the ordinary
1027 // character, not as a token. The slash may be part of
1028 // the unquoted URL.
1030 CPStringBuilder image = new CPStringBuilder(value.getImage());
1031 while (next.kind == NUMTOKEN || next.kind == SLASH)
1033 image.append(getNextToken().getImage());
1034 next = getTokenAhead();
1036 attrValue = image.toString();
1038 else
1039 attrValue = value.getImage();
1040 break;
1041 default:
1042 break attributeReading;
1044 attributes.addAttribute(name.getImage(), attrValue);
1045 optional(WS);
1047 else
1048 // The '=' is missing: attribute without value.
1050 noValueAttribute(element, name.getImage());
1056 * Return string, corresponding the given named entity. The name is passed
1057 * with the preceeding &, but without the ending semicolon.
1059 protected String resolveNamedEntity(final String a_tag)
1061 // Discard &
1062 if (!a_tag.startsWith("&"))
1063 throw new AssertionError("Named entity " + a_tag +
1064 " must start witn '&'."
1067 String tag = a_tag.substring(1);
1071 Entity entity = dtd.getEntity(tag);
1072 if (entity != null)
1073 return entity.getString();
1075 entity = dtd.getEntity(tag.toLowerCase());
1077 if (entity != null)
1079 error("The name of this entity should be in lowercase", a_tag);
1080 return entity.getString();
1083 catch (IndexOutOfBoundsException ibx)
1085 /* The error will be reported. */
1088 error("Unknown named entity", a_tag);
1089 return a_tag;
1093 * Return char, corresponding the given numeric entity.
1094 * The name is passed with the preceeding &#, but without
1095 * the ending semicolon.
1097 protected char resolveNumericEntity(final String a_tag)
1099 // Discard &#
1100 if (!a_tag.startsWith("&#"))
1101 throw new AssertionError("Numeric entity " + a_tag +
1102 " must start witn '&#'."
1105 String tag = a_tag.substring(2);
1109 // Determine the encoding type:
1110 char cx = tag.charAt(0);
1111 if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn;
1113 return (char) Integer.parseInt(tag.substring(1), 16);
1115 return (char) Integer.parseInt(tag);
1118 /* The error will be reported. */
1119 catch (NumberFormatException nex)
1122 catch (IndexOutOfBoundsException ix)
1126 error("Invalid numeric entity", a_tag);
1127 return '?';
1131 * Reset all fields into the intial default state, preparing the
1132 * parset for parsing the next document.
1134 protected void restart()
1136 documentTags.clear();
1137 titleHandled = false;
1138 titleOpen = false;
1139 buffer.setLength(0);
1140 title.setLength(0);
1141 validator.restart();
1145 * The method is called when the HTML opening tag ((like &lt;table&gt;)
1146 * is found or if the parser concludes that the one should be present
1147 * in the current position. The method is called immediately before
1148 * calling the handleStartTag.
1149 * @param tag The tag
1151 protected void startTag(TagElement tag)
1152 throws ChangedCharSetException
1157 * Handle a complete element, when the tag content is already present in the
1158 * buffer and both starting and heading tags behind. This is called
1159 * in the case when the tag text must not be parsed for the nested
1160 * elements (elements STYLE and SCRIPT).
1162 private void _handleCompleteElement(TagElement tag)
1164 _handleStartTag(tag);
1166 // Suppress inclusion of the SCRIPT ans STYLE texts into the title.
1167 HTML.Tag h = tag.getHTMLTag();
1168 if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE)
1170 boolean tmp = titleOpen;
1171 titleOpen = false;
1172 _handleText();
1173 titleOpen = tmp;
1175 else
1176 _handleText();
1178 _handleEndTag(tag);
1182 * A hooks for operations, preceeding call to handleEmptyTag().
1183 * Handle the tag with no content, like &lt;br&gt;. As no any
1184 * nested tags are expected, the tag validator is not involved.
1185 * @param tag The tag being handled.
1187 private void _handleEmptyTag(TagElement tag)
1191 validator.validateTag(tag, attributes);
1192 handleEmptyTag(tag);
1193 HTML.Tag h = tag.getHTMLTag();
1194 // When a block tag is closed, consume whitespace that follows after
1195 // it.
1196 // For some unknown reason a FRAME tag is not treated as block element.
1197 // However in this case it should be treated as such.
1198 if (isBlock(h))
1199 optional(WS);
1201 catch (ChangedCharSetException ex)
1203 error("Changed charset exception:", ex.getMessage());
1208 * A hooks for operations, preceeding call to handleEndTag().
1209 * The method is called when the HTML closing tag
1210 * is found. Calls handleTitle after closing the 'title' tag.
1211 * @param tag The tag
1213 private void _handleEndTag(TagElement tag)
1215 if (validator.closeTag(tag))
1216 _handleEndTag_remaining(tag);
1220 * Actions that are also required if the closing action was
1221 * initiated by the tag validator.
1222 * Package-private to avoid an accessor method.
1224 void _handleEndTag_remaining(TagElement tag)
1226 HTML.Tag h = tag.getHTMLTag();
1228 handleEndTag(tag);
1229 endTag(tag.fictional());
1231 if (h.isPreformatted())
1232 preformatted--;
1233 if (preformatted < 0)
1234 preformatted = 0;
1236 // When a block tag is closed, consume whitespace that follows after
1237 // it.
1238 if (isBlock(h))
1239 optional(WS);
1241 if (h == HTML.Tag.TITLE)
1243 titleOpen = false;
1244 titleHandled = true;
1246 char[] a = new char[ title.length() ];
1247 title.getChars(0, a.length, a, 0);
1248 handleTitle(a);
1253 * A hooks for operations, preceeding call to handleStartTag().
1254 * The method is called when the HTML opening tag ((like &lt;table&gt;)
1255 * is found.
1256 * Package-private to avoid an accessor method.
1257 * @param tag The tag
1259 void _handleStartTag(TagElement tag)
1261 validator.openTag(tag, attributes);
1262 startingTag(tag);
1263 handleStartTag(tag);
1265 HTML.Tag h = tag.getHTMLTag();
1267 if (isBlock(h))
1268 optional(WS);
1270 if (h.isPreformatted())
1271 preformatted++;
1273 if (h == HTML.Tag.TITLE)
1275 if (titleHandled)
1276 error("Repetetive <TITLE> tag");
1277 titleOpen = true;
1278 titleHandled = false;
1283 * Resume parsing after heavy errors in HTML tag structure.
1284 * @throws ParseException
1286 private void forciblyCloseTheTag()
1287 throws ParseException
1289 int closeAt = 0;
1290 buffer.setLength(0);
1292 ahead:
1293 for (int i = 1; i < 100; i++)
1295 t = getTokenAhead(i - 1);
1296 if (t.kind == EOF || t.kind == BEGIN)
1297 break ahead;
1298 if (t.kind == END)
1300 /* Closing '>' found. */
1301 closeAt = i;
1302 break ahead;
1305 if (closeAt > 0)
1307 buffer.append("Ignoring '");
1308 for (int i = 1; i <= closeAt; i++)
1310 t = getNextToken();
1311 append(t);
1313 buffer.append('\'');
1314 error(buffer.toString());
1319 * Handle comment in string buffer. You can avoid allocating a char
1320 * array each time by processing your comment directly here.
1322 private void handleComment()
1324 char[] a = new char[ buffer.length() ];
1325 buffer.getChars(0, a.length, a, 0);
1326 handleComment(a);
1329 private TagElement makeTagElement(String name, boolean isSupposed)
1331 Element e = dtd.elementHash.get(name.toLowerCase());
1332 if (e == null)
1334 error("Unknown tag <" + name + ">");
1335 e = dtd.getElement(name);
1336 e.name = name.toUpperCase();
1337 e.index = -1;
1340 if (!documentTags.contains(e.name))
1342 markFirstTime(e);
1343 documentTags.add(e.name);
1346 return makeTag(e, isSupposed);
1350 * Read till the given token, resolving entities. Consume the given
1351 * token without adding it to buffer.
1352 * @param till The token to read till
1353 * @throws ParseException
1355 private void readTillTokenE(int till)
1356 throws ParseException
1358 buffer.setLength(0);
1359 read:
1360 while (true)
1362 t = getNextToken();
1363 if (t.kind == Constants.ENTITY)
1365 resolveAndAppendEntity(t);
1367 else if (t.kind == EOF)
1369 error("unexpected eof", t);
1370 break read;
1372 else if (t.kind == till)
1373 break read;
1374 else if (t.kind == WS)
1376 // Processing whitespace in accordance with CDATA rules:
1377 String s = t.getImage();
1378 char c;
1379 for (int i = 0; i < s.length(); i++)
1381 c = s.charAt(i);
1382 if (c == '\r')
1383 buffer.append(' '); // CR replaced by space
1384 else if (c == '\n')
1385 { /* LF ignored */ }
1386 else if (c == '\t')
1387 buffer.append(' '); // Tab replaced by space
1388 else
1389 buffer.append(c);
1392 else
1393 append(t);
1398 * Resolve the entity and append it to the end of buffer.
1399 * @param entity
1401 private void resolveAndAppendEntity(Token entity)
1403 switch (entity.category)
1405 case ENTITY_NAMED :
1406 buffer.append(resolveNamedEntity(entity.getImage()));
1407 break;
1409 case ENTITY_NUMERIC :
1410 buffer.append(resolveNumericEntity(entity.getImage()));
1411 break;
1413 default :
1414 throw new AssertionError("Invalid entity category " +
1415 entity.category
1421 * Handle the remaining of HTML tags. This is a common end for
1422 * TAG, SCRIPT and STYLE.
1423 * @param closing True for closing tags ( &lt;/TAG&gt; ).
1424 * @param name Name of element
1425 * @param start Token where element has started
1426 * @throws ParseException
1428 private void restOfTag(boolean closing, Token name, Token start)
1429 throws ParseException
1431 boolean end = false;
1432 Token next;
1434 optional(WS);
1436 readAttributes(name.getImage());
1438 optional(WS);
1440 next = getTokenAhead();
1441 if (next.kind == END)
1443 mustBe(END);
1444 end = true;
1447 hTag = new Token(start, next);
1449 if (!end)
1451 // The tag body contains errors. If additionally the tag
1452 // name is not valid, this construction is treated as text.
1453 if (dtd.elementHash.get(name.getImage().toLowerCase()) == null &&
1454 backupMode
1457 error("Errors in tag body and unknown tag name. " +
1458 "Treating the tag as a text."
1460 reset();
1462 hTag = mustBe(BEGIN);
1463 buffer.setLength(0);
1464 buffer.append(hTag.getImage());
1465 CDATA(false);
1466 return;
1468 else
1470 error("Forcibly closing invalid parameter list");
1471 forciblyCloseTheTag();
1475 if (closing)
1477 endTag(false);
1478 _handleEndTag(makeTagElement(name.getImage(), false));
1480 else
1482 TagElement te = makeTagElement(name.getImage(), false);
1483 if (te.getElement().type == DTDConstants.EMPTY)
1484 _handleEmptyTag(te);
1485 else
1487 // According to the specs we need to consume whitespace following
1488 // immediately after a opening tag.
1489 optional(WS);
1490 _handleStartTag(te);
1496 * This should fire additional actions in response to the
1497 * ChangedCharSetException. The current implementation
1498 * does nothing.
1499 * @param tag
1501 private void startingTag(TagElement tag)
1505 startTag(tag);
1507 catch (ChangedCharSetException cax)
1509 error("Invalid change of charset");
1513 private void ws_error()
1515 error("Whitespace here is not permitted");
1519 * Returns true when the specified tag should be considered a block tag
1520 * wrt whitespace handling. We need this special handling, since there
1521 * are a couple of tags that we must treat as block tags but which aren't
1522 * officially block tags.
1524 * @param tag the tag to check
1525 * @return true when the specified tag should be considered a block tag
1526 * wrt whitespace handling
1528 private boolean isBlock(HTML.Tag tag)
1530 return tag.isBlock() || tag == HTML.Tag.STYLE || tag == HTML.Tag.FRAME;