libjava/classpath/gnu/javax/swing/text/html/parser/support/Parser.java

   1 /* Parser.java -- HTML parser.
   2    Copyright (C) 2005 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19 02110-1301 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38
  39 package gnu.javax.swing.text.html.parser.support;
  40
  41 import gnu.java.lang.CPStringBuilder;
  42
  43 import gnu.javax.swing.text.html.parser.htmlAttributeSet;
  44 import gnu.javax.swing.text.html.parser.htmlValidator;
  45 import gnu.javax.swing.text.html.parser.support.low.Constants;
  46 import gnu.javax.swing.text.html.parser.support.low.ParseException;
  47 import gnu.javax.swing.text.html.parser.support.low.ReaderTokenizer;
  48 import gnu.javax.swing.text.html.parser.support.low.Token;
  49 import gnu.javax.swing.text.html.parser.support.low.node;
  50 import gnu.javax.swing.text.html.parser.support.low.pattern;
  51
  52 import java.io.IOException;
  53 import java.io.Reader;
  54
  55 import java.util.Comparator;
  56 import java.util.Set;
  57 import java.util.TreeSet;
  58 import java.util.Vector;
  59
  60 import javax.swing.text.ChangedCharSetException;
  61 import javax.swing.text.SimpleAttributeSet;
  62 import javax.swing.text.html.HTML;
  63 import javax.swing.text.html.parser.AttributeList;
  64 import javax.swing.text.html.parser.DTD;
  65 import javax.swing.text.html.parser.DTDConstants;
  66 import javax.swing.text.html.parser.Element;
  67 import javax.swing.text.html.parser.Entity;
  68 import javax.swing.text.html.parser.TagElement;
  69
  70 /**
  71  * <p>A simple error-tolerant HTML parser that uses a DTD document
  72  * to access data on the possible tokens, arguments and syntax.</p>
  73  * <p> The parser reads an HTML content from a Reader and calls various
  74  * notifying methods (which should be overridden in a subclass)
  75  * when tags or data are encountered.</p>
  76  * <p>Some HTML elements need no opening or closing tags. The
  77  * task of this parser is to invoke the tag handling methods also when
  78  * the tags are not explicitly specified and must be supposed using
  79  * information, stored in the DTD.
  80  * For  example, parsing the document
  81  * <p>&lt;table&gt;&lt;tr&gt;&lt;td&gt;a&lt;td&gt;b&lt;td&gt;c&lt;/tr&gt; <br>
  82  * will invoke exactly the handling methods exactly in the same order
  83  * (and with the same parameters) as if parsing the document: <br>
  84  * <em>&lt;html&gt;&lt;head&gt;&lt;/head&gt;&lt;body&gt;&lt;table&gt;&lt;
  85  * tbody&gt;</em>&lt;tr&gt;&lt;td&gt;a<em>&lt;/td&gt;</em>&lt;td&gt;b<em>
  86  * &lt;/td&gt;</em>&lt;td&gt;c<em>&lt;/td&gt;&lt;/tr&gt;</em>&lt;
  87  * <em>/tbody&gt;&lt;/table&gt;&lt;/body&gt;&lt;/html&gt;</em></p>
  88  * (supposed tags are given in italics). The parser also supports
  89  * obsolete elements of HTML syntax.<p>
  90  * </p>
  91  * @author Audrius Meskauskas, Lithuania (AudriusA@Bioinformatics.org)
  92  */
  93 public class Parser
  94   extends ReaderTokenizer
  95   implements DTDConstants
  96 {
  97   /**
  98    * The current html tag.
  99    */
  public Token hTag = new Token();

  /**
   * The document type definition (DTD) that is used to parse the documents.
   */
  protected DTD dtd;

  /**
   * The value of this field determines whether or not the Parser will be
   * strict in enforcing SGML compatibility. The default value is false,
   * stating that the parser should do everything to parse and get at least
   * some information even from the incorrectly written HTML input.
   */
  protected boolean strict;

  /**
   * This field has a positive value inside preformatted tags. While it is
   * positive, the collected text is passed through the preformatted
   * variant of the text pre-processing.
   */
  protected int preformatted = 0;
 119
 120   /**
 121    * The set of the document tags. This field is used for supporting
 122    * markFirstTime().
 123    */
 124   private Set documentTags =
 125     new TreeSet(new Comparator()
 126       {
 127         public int compare(Object a, Object b)
 128         {
 129           return ((String) a).compareToIgnoreCase((String) b);
 130         }
 131       }
 132                );
 133
  /**
   * The buffer to collect the incremental output like text or comment.
   */
  private final StringBuffer buffer = new StringBuffer();

  /**
   * The buffer to store the document title.
   */
  private final StringBuffer title = new StringBuffer();

  /**
   * The current token.
   */
  private Token t;

  /**
   * True means that the 'title' tag of this document has
   * already been handled.
   */
  private boolean titleHandled;

  /**
   * True means that the 'title' tag is currently open and all
   * text is also added to the title buffer.
   */
  private boolean titleOpen;

  /**
   * The attributes of the current HTML element.
   * Package-private so that the anonymous validator subclass, created in
   * the constructor, can read and replace it without an accessor method.
   */
  htmlAttributeSet attributes =
    htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;

  /**
   * The validator, controlling the forcible closing of the tags that
   * (in accordance with the dtd) are not allowed in the current context.
   */
  private htmlValidator validator;

  /**
   * Provides the default values for parameters in the case when these
   * values are defined in the DTD.
   */
  private parameterDefaulter defaulter;

  /**
   * The text pre-processor for handling line ends and tabs.
   */
  private textPreProcessor textProcessor = new textPreProcessor();
 184
 185   /**
 186    * Creates a new Parser that uses the given
 187    * {@link javax.swing.text.html.parser.DTD }. The only standard way
 188    * to get an instance of DTD is to construct it manually, filling in
 189    * all required fields.
 190    * @param a_dtd The DTD to use. The parser behaviour after passing null
 191    * as an argument is not documented and may vary between implementations.
 192    */
 193   public Parser(DTD a_dtd)
 194   {
 195     if (a_dtd == null)
 196       dtd = gnu.javax.swing.text.html.parser.HTML_401F.getInstance();
 197     else
 198       dtd = a_dtd;
 199
 200     defaulter = new parameterDefaulter(dtd);
 201
 202     validator =
 203       new htmlValidator(dtd)
 204         {
 205           /**
 206            * Handles the error message. This method must be overridden to pass
 207            * the message where required.
 208            * @param msg The message text.
 209            */
 210           protected void s_error(String msg)
 211           {
 212             error(msg);
 213           }
 214
 215           /**
 216            * The method is called when the tag validator decides to close the
 217            * tag on its own initiative. After reaching the end of stream,
 218            * The tag validator closes all unclosed elements that are required
 219            * to have the end (closing) tag.
 220            *
 221            * @param tElement The tag being fictionally (forcibly) closed.
 222            */
 223           protected void handleSupposedEndTag(Element tElement)
 224           {
 225             // The tag is cloned as the original tElement is the
 226             // element from the starting tag - may be accidently used
 227             // somewhere else.
 228             TagElement tag = makeTag(tElement, true);
 229             _handleEndTag_remaining(tag);
 230           }
 231
 232           /**
 233            * The method is called when the the tag validator decides to open
 234            * the new tag on its own initiative. The tags, opened in this
 235            * way, are HTML, HEAD and BODY. The attribute set is temporary
 236            * assigned to the empty one, the previous value is
 237            * restored before return.
 238            *
 239            * @param tElement The tag being fictionally (forcibly) closed.
 240            */
 241           protected void handleSupposedStartTag(Element tElement)
 242           {
 243             TagElement tag = makeTag(tElement, true);
 244             htmlAttributeSet were = attributes;
 245             attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
 246             _handleStartTag(tag);
 247             attributes = were;
 248           }
 249         };
 250   }
 251
 252   /**
 253    * Get the attributes of the current tag.
 254    * @return The attribute set, representing the attributes of the current tag.
 255    */
 256   public SimpleAttributeSet getAttributes()
 257   {
 258     return new SimpleAttributeSet(attributes);
 259   }
 260
 261   /**
 262    * Invokes the error handler. The default method in this implementation
 263    * delegates the call to handleError, also providing the current line.
 264    */
 265   public void error(String msg)
 266   {
 267     error(msg, getTokenAhead());
 268   }
 269
 270   public void error(String msg, Token atToken)
 271   {
 272     if (atToken != null)
 273       handleError(atToken.where.beginLine,
 274                   msg + ": line " + atToken.where.beginLine +
 275                   ", absolute pos " + atToken.where.startPosition
 276                  );
 277     else
 278       handleError(0, msg);
 279   }
 280
 281   /**
 282    * Invokes the error handler. The default method in this implementation
 283    * delegates the call to error (parm1+": '"+parm2+"'").
 284    */
 285   public void error(String msg, String invalid)
 286   {
 287     error(msg + ": '" + invalid + "'");
 288   }
 289
 290   /**
 291    * Invokes the error handler. The default method in this implementation
 292    * delegates the call to error (parm1+" "+ parm2+" "+ parm3).
 293    */
 294   public void error(String parm1, String parm2, String parm3)
 295   {
 296     error(parm1 + " " + parm2 + " " + parm3);
 297   }
 298
 299   /**
 300    * Invokes the error handler. The default method in this implementation
 301    * delegates the call to error (parm1+" "+ parm2+" "+ parm3+" "+ parm4).
 302    */
 303   public void error(String parm1, String parm2, String parm3, String parm4)
 304   {
 305     error(parm1 + " " + parm2 + " " + parm3 + " " + parm4);
 306   }
 307
 308   public void flushAttributes()
 309   {
 310   }
 311
 312   /**
 313    * Parse the HTML text, calling various methods in response to the
 314    * occurence of the corresponding HTML constructions.
 315    * @param reader The reader to read the source HTML from.
 316    * @throws IOException If the reader throws one.
 317    */
 318   public synchronized void parse(Reader reader)
 319                           throws IOException
 320   {
 321     reset(reader);
 322     restart();
 323     try
 324       {
 325         parseDocument();
 326         validator.closeAll();
 327       }
 328     catch (ParseException ex)
 329       {
 330         if (ex != null)
 331           {
 332             error("Unable to continue parsing the document", ex.getMessage());
 333
 334             Throwable cause = ex.getCause();
 335             if (cause instanceof IOException)
 336               throw (IOException) cause;
 337           }
 338       }
 339   }
 340
 341   /**
 342    * Parses DTD markup declaration. Currently returns null without action.
 343    * @return null.
 344    * @throws IOException
 345    */
 346   public String parseDTDMarkup()
 347                         throws IOException
 348   {
 349     return null;
 350   }
 351
 352   /**
 353    * Parse SGML insertion ( &lt;! ... &gt; ). When the
 354    * the SGML insertion is found, this method is called, passing
 355    * SGML in the string buffer as a parameter. The default method
 356    * returns false without action and can be overridden to
 357    * implement user - defined SGML support.
 358    * <p>
 359    * If you need more information about SGML insertions in HTML documents,
 360    * the author suggests to read SGML tutorial on
 361    * {@link http://www.w3.org/TR/WD-html40-970708/intro/sgmltut.html}.
 362    * We also recommend Goldfarb C.F (1991) <i>The SGML Handbook</i>,
 363    * Oxford University Press, 688 p, ISBN: 0198537379.
 364    * </p>
 365    * @param strBuff
 366    * @return true if this is a valid DTD markup declaration.
 367    * @throws IOException
 368    */
 369   public boolean parseMarkupDeclarations(StringBuffer strBuff)
 370                                   throws IOException
 371   {
 372     return false;
 373   }
 374
 375   /**
 376    * Get the first line of the last parsed token.
 377    */
 378   protected int getCurrentLine()
 379   {
 380     return hTag.where.beginLine;
 381   }
 382
 383   /**
 384    * Read parseable character data, add to buffer.
 385    * @param clearBuffer If true, buffer if filled by CDATA section,
 386    * otherwise the section is appended to the existing content of the
 387    * buffer.
 388    *
 389    * @throws ParseException
 390    */
 391   protected void CDATA(boolean clearBuffer)
 392                 throws ParseException
 393   {
 394     Token start = hTag = getTokenAhead();
 395
 396     if (clearBuffer)
 397       buffer.setLength(0);
 398
 399     // Handle expected EOF.
 400     if (start.kind == EOF)
 401       return;
 402
 403     read:
 404     while (true)
 405       {
 406         t = getTokenAhead();
 407         if (t.kind == EOF)
 408           {
 409             error("unexpected eof", t);
 410             break read;
 411           }
 412         else if (t.kind == BEGIN)
 413           break read;
 414         else if (t.kind == Constants.ENTITY)
 415           {
 416             resolveAndAppendEntity(t);
 417             getNextToken();
 418           }
 419         else
 420           {
 421             append(t);
 422             getNextToken();
 423           }
 424       }
 425     hTag = new Token(start, getTokenAhead(0));
 426     if (buffer.length() != 0)
 427       _handleText();
 428   }
 429
  /**
   * Process Comment. This method skips till --&gt; without
   * taking SGML constructs into consideration.  The supported SGML
   * constructs are handled separately.
   * <p>
   * The text of the comment is collected in the buffer. When the end of
   * the comment (or the end of input) is reached, {@link #hTag} is set to
   * the token that spans from the opening token till the last token of
   * the comment, the whitespace after the comment is consumed and the
   * comment is handled.
   * </p>
   *
   * @throws ParseException
   */
  protected void Comment()
                  throws ParseException
  {
    buffer.setLength(0);

    // The opening sequence: BEGIN, EXCLAMATION and DOUBLE_DASH tokens,
    // optionally separated by whitespace.
    Token start = hTag = mustBe(BEGIN);
    optional(WS);
    mustBe(EXCLAMATION);
    optional(WS);
    mustBe(DOUBLE_DASH);

    // This local variable hides the field t inside this method.
    Token t;
    Token last;

    comment:
    while (true)
      {
        t = getTokenAhead();
        if (t.kind == EOF)
          {
            // The comment is not closed till the end of input.
            handleEOFInComment();
            last = t;
            break comment;
          }
        else if (COMMENT_END.matches(this))
          {
            // The ordinary end of the comment.
            mustBe(DOUBLE_DASH);
            optional(WS);
            last = mustBe(END);
            break comment;
          }
        else if (COMMENT_TRIPLEDASH_END.matches(this))
          {
            mustBe(DOUBLE_DASH);
            t = mustBe(NUMTOKEN);
            if (t.getImage().equals("-"))
              {
                // Three dashes before the END token: the third dash
                // is kept as a part of the comment text.
                append(t);
                last = mustBe(END);
                break comment;
              }
            else
              {
                // Not the end of the comment: the dashes and the
                // following token belong to the comment text.
                buffer.append("--");
                append(t);
                // NOTE(review): the token that is ahead now is consumed
                // by mustBe(t.kind) at the end of the loop body without
                // being appended to the buffer - confirm this is intended.
                t = getTokenAhead();
              }
          }
        else
        /* The lllll-- can match as NUMTOKEN */
        if ((t.getImage().endsWith("--")) &&
            (
              getTokenAhead(1).kind == END ||
              (getTokenAhead(1).kind == WS && getTokenAhead(2).kind == END)
            )
           )
          {
            // Append the token image without the two trailing dashes.
            buffer.append(t.getImage().substring(0, t.getImage().length() - 2));

            /* Skip the closing > that we have already checked. */
            // NOTE(review): mustBe(t.kind) consumes the token t itself;
            // the END token seems to stay unconsumed here - confirm.
            last = mustBe(t.kind);
            break comment;
          }
        else
          append(t);
        // Consume the token that was only looked at so far.
        mustBe(t.kind);
      }
    hTag = new Token(start, last);

    // Consume any whitespace immediately following a comment.
    optional(WS);
    handleComment();
  }
 508
 509   /**
 510   * Read a script. The text, returned without any changes,
 511   * is terminated only by the closing tag SCRIPT.
 512   */
 513   protected void Script()
 514                  throws ParseException
 515   {
 516     Token name;
 517
 518     Token start = hTag = mustBe(BEGIN);
 519     optional(WS);
 520
 521     name = mustBe(SCRIPT);
 522
 523     optional(WS);
 524
 525     restOfTag(false, name, start);
 526
 527     buffer.setLength(0);
 528
 529     while (!SCRIPT_CLOSE.matches(this))
 530       {
 531         append(getNextToken());
 532       }
 533
 534     consume(SCRIPT_CLOSE);
 535
 536     _handleText();
 537
 538     endTag(false);
 539     _handleEndTag(makeTagElement(name.getImage(), false));
 540   }
 541
 542   /**
 543   * Process SGML insertion that is not a comment.
 544   */
 545   protected void Sgml()
 546                throws ParseException
 547   {
 548     if (COMMENT_OPEN.matches(this))
 549       Comment();
 550     else // skip till ">"
 551       {
 552         Token start = hTag = mustBe(BEGIN);
 553         optional(WS);
 554         mustBe(EXCLAMATION);
 555
 556         buffer.setLength(0);
 557         read:
 558         while (true)
 559           {
 560             t = getNextToken();
 561             if (t.kind == Constants.ENTITY)
 562               {
 563                 resolveAndAppendEntity(t);
 564               }
 565             else if (t.kind == EOF)
 566               {
 567                 error("unexpected eof", t);
 568                 break read;
 569               }
 570             else if (t.kind == END)
 571               break read;
 572             else
 573               append(t);
 574           }
 575
 576         try
 577           {
 578             parseMarkupDeclarations(buffer);
 579           }
 580         catch (IOException ex)
 581           {
 582             error("Unable to parse SGML insertion: '" + buffer + "'",
 583                   new Token(start, t)
 584                  );
 585           }
 586       }
 587     // Consume any whitespace that follows the Sgml insertion.
 588     optional(WS);
 589   }
 590
 591   /**
 592   * Read a style definition. The text, returned without any changes,
 593   * is terminated only by the closing tag STYLE.
 594   */
 595   protected void Style()
 596                 throws ParseException
 597   {
 598     Token name;
 599
 600     Token start = hTag = mustBe(BEGIN);
 601     optional(WS);
 602
 603     name = mustBe(STYLE);
 604
 605     optional(WS);
 606
 607     restOfTag(false, name, start);
 608
 609     buffer.setLength(0);
 610
 611     while (!STYLE_CLOSE.matches(this))
 612       {
 613         append(getNextToken());
 614       }
 615
 616     consume(STYLE_CLOSE);
 617
 618     _handleText();
 619
 620     endTag(false);
 621     _handleEndTag(makeTagElement(name.getImage(), false));
 622   }
 623
 624   /**
 625    * Read a html tag.
 626    */
 627   protected void Tag()
 628               throws ParseException
 629   {
 630     mark(true);
 631
 632     boolean closing = false;
 633     Token name;
 634     Token start = hTag = mustBe(BEGIN);
 635
 636     optional(WS);
 637     name = getNextToken();
 638     optional(WS);
 639
 640     if (name.kind == SLASH)
 641       {
 642         closing = true;
 643         name = getNextToken();
 644       }
 645
 646     restOfTag(closing, name, start);
 647   }
 648
 649   /**
 650    * A hook, for operations, preceeding call to handleText.
 651    * Handle text in a string buffer.
 652    * In non - preformatted mode, all line breaks immediately following the
 653    * start tag and immediately before an end tag is discarded,
 654    * \r, \n and \t are replaced by spaces, multiple space are replaced
 655    * by the single one and the result is  moved into array,
 656    * passing it  to handleText().
 657    */
 658   protected void _handleText()
 659   {
 660     char[] text;
 661
 662     if (preformatted > 0)
 663       text = textProcessor.preprocessPreformatted(buffer);
 664     else
 665       text = textProcessor.preprocess(buffer);
 666
 667     if (text != null && text.length > 0
 668         // According to the specs we need to discard whitespace immediately
 669         // before a closing tag.
 670         && (text.length > 1 || text[0] != ' ' || ! TAG_CLOSE.matches(this)))
 671       {
 672         TagElement pcdata = new TagElement(dtd.getElement("#pcdata"));
 673         attributes = htmlAttributeSet.EMPTY_HTML_ATTRIBUTE_SET;
 674         _handleEmptyTag(pcdata);
 675
 676         handleText(text);
 677         if (titleOpen)
 678           title.append(text);
 679       }
 680   }
 681
 682   /**
 683    * Add the image of this token to the buffer.
 684    * @param t A token to append.
 685    */
 686   protected final void append(Token t)
 687   {
 688     if (t.kind != EOF)
 689       t.appendTo(buffer);
 690   }
 691
 692   /**
 693    * Consume pattern that must match.
 694    * @param p A pattern to consume.
 695    */
 696   protected final void consume(pattern p)
 697   {
 698     node n;
 699     for (int i = 0; i < p.nodes.length; i++)
 700       {
 701         n = p.nodes [ i ];
 702         if (n.optional)
 703           optional(n.kind);
 704         else
 705           mustBe(n.kind);
 706       }
 707   }
 708
 709   /**
 710    * The method is called when the HTML end (closing) tag is found or if
 711    * the parser concludes that the one should be present in the
 712    * current position. The method is called immediatly
 713    * before calling the handleEndTag().
 714    * @param omitted True if the tag is no actually present in the document,
 715    * but is supposed by the parser (like &lt;/html&gt; at the end of the
 716    * document).
 717    */
 718   protected void endTag(boolean omitted)
 719   {
 720   }
 721
 722   /**
 723    * Handle HTML comment. The default method returns without action.
 724    * @param comment
 725    */
 726   protected void handleComment(char[] comment)
 727   {
 728   }
 729
 730   /**
 731    * This is additionally called in when the HTML content terminates
 732    * without closing the HTML comment. This can only happen if the
 733    * HTML document contains errors (for example, the closing --;gt is
 734    * missing.
 735    */
 736   protected void handleEOFInComment()
 737   {
 738     error("Unclosed comment");
 739   }
 740
 741   /**
 742    * Handle the tag with no content, like &lt;br&gt;. The method is
 743    * called for the elements that, in accordance with the current DTD,
 744    * has an empty content.
 745    * @param tag The tag being handled.
 746    * @throws javax.swing.text.ChangedCharSetException
 747    */
 748   protected void handleEmptyTag(TagElement tag)
 749                          throws javax.swing.text.ChangedCharSetException
 750   {
 751   }
 752
 753   /**
 754    * The method is called when the HTML closing tag ((like &lt;/table&gt;)
 755    * is found or if the parser concludes that the one should be present
 756    * in the current position.
 757    * @param tag The tag
 758    */
 759   protected void handleEndTag(TagElement tag)
 760   {
 761   }
 762
 763   /* Handle error that has occured in the given line. */
 764   protected void handleError(int line, String message)
 765   {
 766   }
 767
 768   /**
 769    * The method is called when the HTML opening tag ((like &lt;table&gt;)
 770    * is found or if the parser concludes that the one should be present
 771    * in the current position.
 772    * @param tag The tag
 773    */
 774   protected void handleStartTag(TagElement tag)
 775   {
 776   }
 777
 778   /**
 779    * Handle the text section.
 780    * <p> For non-preformatted section, the parser replaces
 781    * \t, \r and \n by spaces and then multiple spaces
 782    * by a single space. Additionaly, all whitespace around
 783    * tags is discarded.
 784    * </p>
 785    * <p> For pre-formatted text (inside TEXAREA and PRE), the parser preserves
 786    * all tabs and spaces, but removes <b>one</b>  bounding \r, \n or \r\n,
 787    * if it is present. Additionally, it replaces each occurence of \r or \r\n
 788    * by a single \n.</p>
 789    *
 790    * @param text A section text.
 791    */
 792   protected void handleText(char[] text)
 793   {
 794   }
 795
 796   /**
 797    * Handle HTML &lt;title&gt; tag. This method is invoked when
 798    * both title starting and closing tags are already behind.
 799    * The passed argument contains the concatenation of all
 800    * title text sections.
 801    * @param title The title text.
 802    */
 803   protected void handleTitle(char[] title)
 804   {
 805   }
 806
 807   /**
 808    * Constructs the tag from the given element. In this implementation,
 809    * this is defined, but never called.
 810    * @return the tag
 811    */
 812   protected TagElement makeTag(Element element)
 813   {
 814     return makeTag(element, false);
 815   }
 816
 817   /**
 818    * Constructs the tag from the given element.
 819    * @param the tag base {@link javax.swing.text.html.parser.Element}
 820    * @param isSupposed true if the tag is not actually present in the
 821    * html input, but the parser supposes that it should to occur in
 822    * the current location.
 823    * @return the tag
 824    */
 825   protected TagElement makeTag(Element element, boolean isSupposed)
 826   {
 827     return new TagElement(element, isSupposed);
 828   }
 829
 830   /**
 831    * This is called when the tag, representing the given element,
 832    * occurs first time in the document.
 833    * @param element
 834    */
 835   protected void markFirstTime(Element element)
 836   {
 837   }
 838
 839   /**
 840    * Consume the token that was checked before and hence MUST be present.
 841    * @param kind The kind of token to consume.
 842    */
 843   protected Token mustBe(int kind)
 844   {
 845     if (getTokenAhead().kind == kind)
 846       return getNextToken();
 847     else
 848       {
 849         String ei = "";
 850         if (kind < 1000)
 851           ei = " ('" + (char) kind + "') ";
 852         throw new AssertionError("The token of kind " + kind + ei +
 853                                  " MUST be here,"
 854                                 );
 855       }
 856   }
 857
 858   /**
 859    * Handle attribute without value. The default method uses
 860    * the only allowed attribute value from DTD.
 861    * If the attribute is unknown or allows several values,
 862    * the HTML.NULL_ATTRIBUTE_VALUE is used. The attribute with
 863    * this value is added to the attribute set.
 864    * @param element The name of element.
 865    * @param attribute The name of attribute without value.
 866    */
 867   protected void noValueAttribute(String element, String attribute)
 868   {
 869     Object value = HTML.NULL_ATTRIBUTE_VALUE;
 870
 871     Element e = dtd.elementHash.get(element.toLowerCase());
 872     if (e != null)
 873       {
 874         AttributeList attr = e.getAttribute(attribute);
 875         if (attr != null)
 876           {
 877             Vector values = attr.values;
 878             if (values != null && values.size() == 1)
 879               value = values.get(0);
 880           }
 881       }
 882     attributes.addAttribute(attribute, value);
 883   }
 884
 885   /**
 886    * Consume the optional token, if present.
 887    * @param kind The kind of token to consume.
 888    */
 889   protected Token optional(int kind)
 890   {
 891     if (getTokenAhead().kind == kind)
 892       return getNextToken();
 893     else
 894       return null;
 895   }
 896
 897   /** Parse the html document. */
 898   protected void parseDocument()
 899                         throws ParseException
 900   {
 901     // Read up any initial whitespace.
 902     optional(WS);
 903     while (getTokenAhead().kind != EOF)
 904       {
 905         advanced = false;
 906         if (TAG.matches(this))
 907           Tag();
 908         else if (COMMENT_OPEN.matches(this))
 909           Comment();
 910         else if (STYLE_OPEN.matches(this))
 911           Style();
 912         else if (SCRIPT_OPEN.matches(this))
 913           Script();
 914         else if (SGML.matches(this))
 915           Sgml();
 916         else
 917           CDATA(true);
 918
 919         // Surely HTML error, treat as a text.
 920         if (!advanced)
 921           {
 922             Token wrong = getNextToken();
 923             error("unexpected '" + wrong.getImage() + "'", wrong);
 924             buffer.setLength(0);
 925             buffer.append(wrong.getImage());
 926             _handleText();
 927           }
 928       }
 929   }
 930
 931   /**
 932    * Read the element attributes, adding them into attribute set.
 933    * @param element The element name (needed to access attribute
 934    * information in dtd).
 935    */
 936   protected void readAttributes(String element)
 937   {
 938     Token name;
 939     Token value;
 940     Token next;
 941     String attrValue;
 942
 943     attributes = new htmlAttributeSet();
 944
 945     optional(WS);
 946
 947     attributeReading:
 948       while (getTokenAhead().kind == NUMTOKEN)
 949       {
 950         name = getNextToken();
 951         optional(WS);
 952
 953         next = getTokenAhead();
 954         if (next.kind == EQ)
 955           {
 956             mustBe(EQ);
 957             optional(WS);
 958
 959             next = getNextToken();
 960
 961             switch (next.kind)
 962               {
 963               case QUOT:
 964
 965                 // read "quoted" attribute.
 966                 buffer.setLength(0);
 967                 readTillTokenE(QUOT);
 968                 attrValue = buffer.toString();
 969                 break;
 970
 971               case AP:
 972
 973                 // read 'quoted' attribute.
 974                 buffer.setLength(0);
 975                 readTillTokenE(AP);
 976                 attrValue = buffer.toString();
 977                 break;
 978
 979               // read unquoted attribute.
 980               case NUMTOKEN:
 981                 value = next;
 982                 optional(WS);
 983
 984                 // Check maybe the opening quote is missing.
 985                 next = getTokenAhead();
 986                 if (bQUOTING.get(next.kind))
 987                   {
 988                     hTag = next;
 989                     error("The value without opening quote is closed with '"
 990                           + next.getImage() + "'");
 991                     attrValue = value.getImage();
 992                   }
 993                 else if (next.kind == SLASH || next.kind == OTHER)
 994                 // The slash and other characters (like %) in this context is
 995                 // treated as the ordinary
 996                 // character, not as a token. The character may be part of
 997                 // the unquoted URL.
 998                   {
 999                     CPStringBuilder image = new CPStringBuilder(value.getImage());
1000                     while (next.kind == NUMTOKEN || next.kind == SLASH
1001                            || next.kind == OTHER)
1002                       {
1003                         image.append(getNextToken().getImage());
1004                         next = getTokenAhead();
1005                       }
1006                     attrValue = image.toString();
1007                   }
1008                 else
1009                   attrValue = value.getImage();
1010                 break;
1011
1012               case SLASH:
1013                 value = next;
1014                 optional(WS);
1015
1016                 // Check maybe the opening quote is missing.
1017                 next = getTokenAhead();
1018                 if (bQUOTING.get(next.kind))
1019                   {
1020                     hTag = next;
1021                     error("The value without opening quote is closed with '"
1022                           + next.getImage() + "'");
1023                     attrValue = value.getImage();
1024                   }
1025                 else if (next.kind == NUMTOKEN || next.kind == SLASH)
1026                 // The slash in this context is treated as the ordinary
1027                 // character, not as a token. The slash may be part of
1028                 // the unquoted URL.
1029                   {
1030                     CPStringBuilder image = new CPStringBuilder(value.getImage());
1031                     while (next.kind == NUMTOKEN || next.kind == SLASH)
1032                       {
1033                         image.append(getNextToken().getImage());
1034                         next = getTokenAhead();
1035                       }
1036                     attrValue = image.toString();
1037                   }
1038                 else
1039                   attrValue = value.getImage();
1040                 break;
1041               default:
1042                 break attributeReading;
1043               }
1044             attributes.addAttribute(name.getImage(), attrValue);
1045             optional(WS);
1046           }
1047         else
1048           // The '=' is missing: attribute without value.
1049           {
1050             noValueAttribute(element, name.getImage());
1051           }
1052       }
1053   }
1054
1055   /**
1056    * Return string, corresponding the given named entity. The name is passed
1057    * with the preceeding &, but without the ending semicolon.
1058    */
1059   protected String resolveNamedEntity(final String a_tag)
1060   {
1061     // Discard &
1062     if (!a_tag.startsWith("&"))
1063       throw new AssertionError("Named entity " + a_tag +
1064                                " must start witn '&'."
1065                               );
1066
1067     String tag = a_tag.substring(1);
1068
1069     try
1070       {
1071         Entity entity = dtd.getEntity(tag);
1072         if (entity != null)
1073           return entity.getString();
1074
1075         entity = dtd.getEntity(tag.toLowerCase());
1076
1077         if (entity != null)
1078           {
1079             error("The name of this entity should be in lowercase", a_tag);
1080             return entity.getString();
1081           }
1082       }
1083     catch (IndexOutOfBoundsException ibx)
1084       {
1085         /* The error will be reported. */
1086       }
1087
1088     error("Unknown named entity", a_tag);
1089     return a_tag;
1090   }
1091
1092   /**
1093    * Return char, corresponding the given numeric entity.
1094    * The name is passed with the preceeding &#, but without
1095    * the ending semicolon.
1096    */
1097   protected char resolveNumericEntity(final String a_tag)
1098   {
1099     // Discard &#
1100     if (!a_tag.startsWith("&#"))
1101       throw new AssertionError("Numeric entity " + a_tag +
1102                                " must start witn '&#'."
1103                               );
1104
1105     String tag = a_tag.substring(2);
1106
1107     try
1108       {
1109         // Determine the encoding type:
1110         char cx = tag.charAt(0);
1111         if (cx == 'x' || cx == 'X') // Hexadecimal &#Xnnn;
1112
1113           return (char) Integer.parseInt(tag.substring(1), 16);
1114
1115         return (char) Integer.parseInt(tag);
1116       }
1117
1118     /* The error will be reported. */
1119     catch (NumberFormatException nex)
1120       {
1121       }
1122     catch (IndexOutOfBoundsException ix)
1123       {
1124       }
1125
1126     error("Invalid numeric entity", a_tag);
1127     return '?';
1128   }
1129
1130   /**
1131    * Reset all fields into the intial default state, preparing the
1132    * parset for parsing the next document.
1133    */
1134   protected void restart()
1135   {
1136     documentTags.clear();
1137     titleHandled = false;
1138     titleOpen = false;
1139     buffer.setLength(0);
1140     title.setLength(0);
1141     validator.restart();
1142   }
1143
1144   /**
1145    * The method is called when the HTML opening tag ((like &lt;table&gt;)
1146    * is found or if the parser concludes that the one should be present
1147    * in the current position. The method is called immediately before
1148    * calling the handleStartTag.
1149    * @param tag The tag
1150    */
1151   protected void startTag(TagElement tag)
1152                    throws ChangedCharSetException
1153   {
1154   }
1155
1156   /**
1157    * Handle a complete element, when the tag content is already present in the
1158    * buffer and both starting and heading tags behind. This is called
1159    * in the case when the tag text must not be parsed for the nested
1160    * elements (elements STYLE and SCRIPT).
1161    */
1162   private void _handleCompleteElement(TagElement tag)
1163   {
1164     _handleStartTag(tag);
1165
1166     // Suppress inclusion of the SCRIPT ans STYLE texts into the title.
1167     HTML.Tag h = tag.getHTMLTag();
1168     if (h == HTML.Tag.SCRIPT || h == HTML.Tag.STYLE)
1169       {
1170         boolean tmp = titleOpen;
1171         titleOpen = false;
1172         _handleText();
1173         titleOpen = tmp;
1174       }
1175     else
1176       _handleText();
1177
1178     _handleEndTag(tag);
1179   }
1180
1181   /**
1182    * A hooks for operations, preceeding call to handleEmptyTag().
1183    * Handle the tag with no content, like &lt;br&gt;. As no any
1184    * nested tags are expected, the tag validator is not involved.
1185    * @param tag The tag being handled.
1186    */
1187   private void _handleEmptyTag(TagElement tag)
1188   {
1189     try
1190       {
1191         validator.validateTag(tag, attributes);
1192         handleEmptyTag(tag);
1193         HTML.Tag h = tag.getHTMLTag();
1194         // When a block tag is closed, consume whitespace that follows after
1195         // it.
1196         // For some unknown reason a FRAME tag is not treated as block element.
1197         // However in this case it should be treated as such.
1198         if (isBlock(h))
1199           optional(WS);
1200       }
1201     catch (ChangedCharSetException ex)
1202       {
1203         error("Changed charset exception:", ex.getMessage());
1204       }
1205   }
1206
1207   /**
1208    * A hooks for operations, preceeding call to handleEndTag().
1209    * The method is called when the HTML closing tag
1210    * is found. Calls handleTitle after closing the 'title' tag.
1211    * @param tag The tag
1212    */
1213   private void _handleEndTag(TagElement tag)
1214   {
1215     if (validator.closeTag(tag))
1216        _handleEndTag_remaining(tag);
1217   }
1218
1219   /**
1220    * Actions that are also required if the closing action was
1221    * initiated by the tag validator.
1222    * Package-private to avoid an accessor method.
1223    */
1224   void _handleEndTag_remaining(TagElement tag)
1225   {
1226     HTML.Tag h = tag.getHTMLTag();
1227
1228     handleEndTag(tag);
1229     endTag(tag.fictional());
1230
1231     if (h.isPreformatted())
1232       preformatted--;
1233     if (preformatted < 0)
1234       preformatted = 0;
1235
1236     // When a block tag is closed, consume whitespace that follows after
1237     // it.
1238     if (isBlock(h))
1239       optional(WS);
1240
1241     if (h == HTML.Tag.TITLE)
1242       {
1243         titleOpen = false;
1244         titleHandled = true;
1245
1246         char[] a = new char[ title.length() ];
1247         title.getChars(0, a.length, a, 0);
1248         handleTitle(a);
1249       }
1250   }
1251
1252   /**
1253    * A hooks for operations, preceeding call to handleStartTag().
1254    * The method is called when the HTML opening tag ((like &lt;table&gt;)
1255    * is found.
1256    * Package-private to avoid an accessor method.
1257    * @param tag The tag
1258    */
1259   void _handleStartTag(TagElement tag)
1260   {
1261     validator.openTag(tag, attributes);
1262     startingTag(tag);
1263     handleStartTag(tag);
1264
1265     HTML.Tag h = tag.getHTMLTag();
1266
1267     if (isBlock(h))
1268       optional(WS);
1269
1270     if (h.isPreformatted())
1271       preformatted++;
1272
1273     if (h == HTML.Tag.TITLE)
1274       {
1275         if (titleHandled)
1276           error("Repetetive <TITLE> tag");
1277         titleOpen = true;
1278         titleHandled = false;
1279       }
1280   }
1281
1282   /**
1283    * Resume parsing after heavy errors in HTML tag structure.
1284    * @throws ParseException
1285    */
1286   private void forciblyCloseTheTag()
1287                             throws ParseException
1288   {
1289     int closeAt = 0;
1290     buffer.setLength(0);
1291
1292     ahead:
1293     for (int i = 1; i < 100; i++)
1294       {
1295         t = getTokenAhead(i - 1);
1296         if (t.kind == EOF || t.kind == BEGIN)
1297           break ahead;
1298         if (t.kind == END)
1299           {
1300             /* Closing '>' found. */
1301             closeAt = i;
1302             break ahead;
1303           }
1304       }
1305     if (closeAt > 0)
1306       {
1307         buffer.append("Ignoring '");
1308         for (int i = 1; i <= closeAt; i++)
1309           {
1310             t = getNextToken();
1311             append(t);
1312           }
1313         buffer.append('\'');
1314         error(buffer.toString());
1315       }
1316   }
1317
1318   /**
1319    * Handle comment in string buffer. You can avoid allocating a char
1320    * array each time by processing your comment directly here.
1321    */
1322   private void handleComment()
1323   {
1324     char[] a = new char[ buffer.length() ];
1325     buffer.getChars(0, a.length, a, 0);
1326     handleComment(a);
1327   }
1328
1329   private TagElement makeTagElement(String name, boolean isSupposed)
1330   {
1331     Element e = dtd.elementHash.get(name.toLowerCase());
1332     if (e == null)
1333       {
1334         error("Unknown tag <" + name + ">");
1335         e = dtd.getElement(name);
1336         e.name = name.toUpperCase();
1337         e.index = -1;
1338       }
1339
1340     if (!documentTags.contains(e.name))
1341       {
1342         markFirstTime(e);
1343         documentTags.add(e.name);
1344       }
1345
1346     return makeTag(e, isSupposed);
1347   }
1348
1349   /**
1350    * Read till the given token, resolving entities. Consume the given
1351    * token without adding it to buffer.
1352    * @param till The token to read till
1353    * @throws ParseException
1354    */
1355   private void readTillTokenE(int till)
1356                        throws ParseException
1357   {
1358     buffer.setLength(0);
1359     read:
1360     while (true)
1361       {
1362         t = getNextToken();
1363         if (t.kind == Constants.ENTITY)
1364           {
1365             resolveAndAppendEntity(t);
1366           }
1367         else if (t.kind == EOF)
1368           {
1369             error("unexpected eof", t);
1370             break read;
1371           }
1372         else if (t.kind == till)
1373           break read;
1374         else if (t.kind == WS)
1375           {
1376             // Processing whitespace in accordance with CDATA rules:
1377             String s = t.getImage();
1378             char c;
1379             for (int i = 0; i < s.length(); i++)
1380               {
1381                 c = s.charAt(i);
1382                 if (c == '\r')
1383                   buffer.append(' '); // CR replaced by space
1384                 else if (c == '\n')
1385                   { /* LF ignored */ }
1386                 else if (c == '\t')
1387                   buffer.append(' '); // Tab replaced by space
1388                 else
1389                   buffer.append(c);
1390               }
1391           }
1392         else
1393           append(t);
1394       }
1395   }
1396
1397   /**
1398    * Resolve the entity and append it to the end of buffer.
1399    * @param entity
1400    */
1401   private void resolveAndAppendEntity(Token entity)
1402   {
1403     switch (entity.category)
1404       {
1405         case ENTITY_NAMED :
1406           buffer.append(resolveNamedEntity(entity.getImage()));
1407           break;
1408
1409         case ENTITY_NUMERIC :
1410           buffer.append(resolveNumericEntity(entity.getImage()));
1411           break;
1412
1413         default :
1414           throw new AssertionError("Invalid entity category " +
1415                                    entity.category
1416                                   );
1417       }
1418   }
1419
1420   /**
1421    * Handle the remaining of HTML tags. This is a common end for
1422    * TAG, SCRIPT and STYLE.
1423    * @param closing True for closing tags ( &lt;/TAG&gt; ).
1424    * @param name Name of element
1425    * @param start Token where element has started
1426    * @throws ParseException
1427    */
1428   private void restOfTag(boolean closing, Token name, Token start)
1429                   throws ParseException
1430   {
1431     boolean end = false;
1432     Token next;
1433
1434     optional(WS);
1435
1436     readAttributes(name.getImage());
1437
1438     optional(WS);
1439
1440     next = getTokenAhead();
1441     if (next.kind == END)
1442       {
1443         mustBe(END);
1444         end = true;
1445       }
1446
1447     hTag = new Token(start, next);
1448
1449     if (!end)
1450       {
1451         // The tag body contains errors. If additionally the tag
1452         // name is not valid, this construction is treated as text.
1453         if (dtd.elementHash.get(name.getImage().toLowerCase()) == null &&
1454             backupMode
1455            )
1456           {
1457             error("Errors in tag body and unknown tag name. " +
1458                   "Treating the tag as a text."
1459                  );
1460             reset();
1461
1462             hTag = mustBe(BEGIN);
1463             buffer.setLength(0);
1464             buffer.append(hTag.getImage());
1465             CDATA(false);
1466             return;
1467           }
1468         else
1469           {
1470             error("Forcibly closing invalid parameter list");
1471             forciblyCloseTheTag();
1472           }
1473       }
1474
1475     if (closing)
1476       {
1477         endTag(false);
1478         _handleEndTag(makeTagElement(name.getImage(), false));
1479       }
1480     else
1481       {
1482         TagElement te = makeTagElement(name.getImage(), false);
1483         if (te.getElement().type == DTDConstants.EMPTY)
1484           _handleEmptyTag(te);
1485         else
1486           {
1487             // According to the specs we need to consume whitespace following
1488             // immediately after a opening tag.
1489             optional(WS);
1490             _handleStartTag(te);
1491           }
1492       }
1493   }
1494
1495   /**
1496    * This should fire additional actions in response to the
1497    * ChangedCharSetException.  The current implementation
1498    * does nothing.
1499    * @param tag
1500    */
1501   private void startingTag(TagElement tag)
1502   {
1503     try
1504       {
1505         startTag(tag);
1506       }
1507     catch (ChangedCharSetException cax)
1508       {
1509         error("Invalid change of charset");
1510       }
1511   }
1512
1513   private void ws_error()
1514   {
1515     error("Whitespace here is not permitted");
1516   }
1517
1518   /**
1519    * Returns true when the specified tag should be considered a block tag
1520    * wrt whitespace handling. We need this special handling, since there
1521    * are a couple of tags that we must treat as block tags but which aren't
1522    * officially block tags.
1523    *
1524    * @param tag the tag to check
1525    * @return true when the specified tag should be considered a block tag
1526    *         wrt whitespace handling
1527    */
1528   private boolean isBlock(HTML.Tag tag)
1529   {
1530     return tag.isBlock() || tag == HTML.Tag.STYLE || tag == HTML.Tag.FRAME;
1531   }
1532 }