gomp-20050608-branch/libjava/classpath/gnu/xml/aelfred2/XmlParser.java

   1 /* XmlParser.java --
   2    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19 02110-1301 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version.
  37
  38 Partly derived from code which carried the following notice:
  39
  40   Copyright (c) 1997, 1998 by Microstar Software Ltd.
  41
  42   AElfred is free for both commercial and non-commercial use and
  43   redistribution, provided that Microstar's copyright and disclaimer are
  44   retained intact.  You are free to modify AElfred for your own use and
  45   to redistribute AElfred with your modifications, provided that the
  46   modifications are clearly documented.
  47
  48   This program is distributed in the hope that it will be useful, but
  49   WITHOUT ANY WARRANTY; without even the implied warranty of
  50   merchantability or fitness for a particular purpose.  Please use it AT
  51   YOUR OWN RISK.
  52 */
  53
  54 package gnu.xml.aelfred2;
  55
  56 import gnu.java.security.action.GetPropertyAction;
  57
  58 import java.io.BufferedInputStream;
  59 import java.io.CharConversionException;
  60 import java.io.EOFException;
  61 import java.io.InputStream;
  62 import java.io.InputStreamReader;
  63 import java.io.IOException;
  64 import java.io.Reader;
  65 import java.io.UnsupportedEncodingException;
  66 import java.net.URL;
  67 import java.net.URLConnection;
  68 import java.security.AccessController;
  69
  70 import java.util.Iterator;
  71 import java.util.HashMap;
  72 import java.util.LinkedList;
  73
  74 import org.xml.sax.InputSource;
  75 import org.xml.sax.SAXException;
  76
  77
  78 /**
  79  * Parse XML documents and return parse events through call-backs.
  80  * Use the <code>SAXDriver</code> class as your entry point, as all
  81  * internal parser interfaces are subject to change.
  82  *
  83  * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
  84  *      (version 1.2a with bugfixes)
  85  * @author Updated by David Brownell &lt;dbrownell@users.sourceforge.net&gt;
  86  * @see SAXDriver
  87  */
  88 final class XmlParser
  89 {
  90
  91   // avoid slow per-character readCh()
  92   private final static boolean USE_CHEATS = true;
  93
  94   ////////////////////////////////////////////////////////////////////////
  95   // Constants.
  96   ////////////////////////////////////////////////////////////////////////
  97
  98   //
  99   // Constants for element content type.
 100   //
 101
 102   /**
 103    * Constant: an element has not been declared.
 104    * @see #getElementContentType
 105    */
 106   public final static int CONTENT_UNDECLARED = 0;
 107
 108   /**
 109    * Constant: the element has a content model of ANY.
 110    * @see #getElementContentType
 111    */
 112   public final static int CONTENT_ANY = 1;
 113
 114   /**
 115    * Constant: the element has declared content of EMPTY.
 116    * @see #getElementContentType
 117    */
 118   public final static int CONTENT_EMPTY = 2;
 119
 120   /**
 121    * Constant: the element has mixed content.
 122    * @see #getElementContentType
 123    */
 124   public final static int CONTENT_MIXED = 3;
 125
 126   /**
 127    * Constant: the element has element content.
 128    * @see #getElementContentType
 129    */
 130   public final static int CONTENT_ELEMENTS = 4;
 131
 132
 133   //
 134   // Constants for the entity type.
 135   //
 136
 137   /**
 138    * Constant: the entity has not been declared.
 139    * @see #getEntityType
 140    */
 141   public final static int ENTITY_UNDECLARED = 0;
 142
 143   /**
 144    * Constant: the entity is internal.
 145    * @see #getEntityType
 146    */
 147   public final static int ENTITY_INTERNAL = 1;
 148
 149   /**
 150    * Constant: the entity is external, non-parsable data.
 151    * @see #getEntityType
 152    */
 153   public final static int ENTITY_NDATA = 2;
 154
 155   /**
 156    * Constant: the entity is external XML data.
 157    * @see #getEntityType
 158    */
 159   public final static int ENTITY_TEXT = 3;
 160
 161   //
 162   // Attribute type constants are interned literal strings.
 163   //
 164
 165   //
 166   // Constants for supported encodings.  "external" is just a flag.
 167   //
 168   private final static int ENCODING_EXTERNAL = 0;
 169   private final static int ENCODING_UTF_8 = 1;
 170   private final static int ENCODING_ISO_8859_1 = 2;
 171   private final static int ENCODING_UCS_2_12 = 3;
 172   private final static int ENCODING_UCS_2_21 = 4;
 173   private final static int ENCODING_UCS_4_1234 = 5;
 174   private final static int ENCODING_UCS_4_4321 = 6;
 175   private final static int ENCODING_UCS_4_2143 = 7;
 176   private final static int ENCODING_UCS_4_3412 = 8;
 177   private final static int ENCODING_ASCII = 9;
 178
 179   //
 180   // Constants for attribute default value.
 181   //
 182
 183   /**
 184    * Constant: the attribute is not declared.
 185    * @see #getAttributeDefaultValueType
 186    */
 187   public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
 188
 189   /**
 190    * Constant: the attribute has a literal default value specified.
 191    * @see #getAttributeDefaultValueType
 192    * @see #getAttributeDefaultValue
 193    */
 194   public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
 195
 196   /**
 197    * Constant: the attribute was declared #IMPLIED.
 198    * @see #getAttributeDefaultValueType
 199    */
 200   public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
 201
 202   /**
 203    * Constant: the attribute was declared #REQUIRED.
 204    * @see #getAttributeDefaultValueType
 205    */
 206   public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
 207
 208   /**
 209    * Constant: the attribute was declared #FIXED.
 210    * @see #getAttributeDefaultValueType
 211    * @see #getAttributeDefaultValue
 212    */
 213   public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
 214
 215   //
 216   // Constants for input.
 217   //
 218   private final static int INPUT_NONE = 0;
 219   private final static int INPUT_INTERNAL = 1;
 220   private final static int INPUT_STREAM = 3;
 221   private final static int INPUT_READER = 5;
 222
 223   //
 224   // Flags for reading literals.
 225   //
 226   // expand general entity refs (attribute values in dtd and content)
 227   private final static int LIT_ENTITY_REF = 2;
 228   // normalize this value (space chars) (attributes, public ids)
 229   private final static int LIT_NORMALIZE = 4;
 230   // literal is an attribute value
 231   private final static int LIT_ATTRIBUTE = 8;
 232   // don't expand parameter entities
 233   private final static int LIT_DISABLE_PE = 16;
 234   // don't expand [or parse] character refs
 235   private final static int LIT_DISABLE_CREF = 32;
 236   // don't parse general entity refs
 237   private final static int LIT_DISABLE_EREF = 64;
 238   // literal is a public ID value
 239   private final static int LIT_PUBID = 256;
 240
 241   //
 242   // Flags affecting PE handling in DTDs (if expandPE is true).
 243   // PEs expand with space padding, except inside literals.
 244   //
 245   private final static int CONTEXT_NORMAL = 0;
 246   private final static int CONTEXT_LITERAL = 1;
 247
 248   // Emit warnings for relative URIs with no base URI.
 249   static boolean uriWarnings;
 250   static
 251   {
 252     String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
 253     GetPropertyAction a = new GetPropertyAction(key);
 254     uriWarnings = "true".equals(AccessController.doPrivileged(a));
 255   }
 256
 257   //
 258   // The current XML handler interface.
 259   //
 260   private SAXDriver handler;
 261
 262   //
 263   // I/O information.
 264   //
 265   private Reader reader;   // current reader
 266   private InputStream is;     // current input stream
 267   private int line;     // current line number
 268   private int column;   // current column number
 269   private int sourceType;   // type of input source
 270   private LinkedList inputStack;   // stack of input soruces
 271   private URLConnection externalEntity; // current external entity
 272   private int encoding;   // current character encoding
 273   private int currentByteCount; // bytes read from current source
 274   private InputSource scratch;  // temporary
 275
 276   //
 277   // Buffers for decoded but unparsed character input.
 278   //
 279   private char[] readBuffer;
 280   private int readBufferPos;
 281   private int readBufferLength;
 282   private int readBufferOverflow;  // overflow from last data chunk.
 283
 284   //
 285   // Buffer for undecoded raw byte input.
 286   //
 287   private final static int READ_BUFFER_MAX = 16384;
 288   private byte[] rawReadBuffer;
 289
 290
 291   //
 292   // Buffer for attribute values, char refs, DTD stuff.
 293   //
 294   private static int DATA_BUFFER_INITIAL = 4096;
 295   private char[] dataBuffer;
 296   private int dataBufferPos;
 297
 298   //
 299   // Buffer for parsed names.
 300   //
 301   private static int NAME_BUFFER_INITIAL = 1024;
 302   private char[] nameBuffer;
 303   private int nameBufferPos;
 304
 305   //
 306   // Save any standalone flag
 307   //
 308   private boolean docIsStandalone;
 309
 310   //
 311   // Hashtables for DTD information on elements, entities, and notations.
 312   // Populated until we start ignoring decls (because of skipping a PE)
 313   //
 314   private HashMap elementInfo;
 315   private HashMap entityInfo;
 316   private HashMap notationInfo;
 317   private boolean skippedPE;
 318
 319   //
 320   // Element type currently in force.
 321   //
 322   private String currentElement;
 323   private int currentElementContent;
 324
 325   //
 326   // Stack of entity names, to detect recursion.
 327   //
 328   private LinkedList entityStack;
 329
 330   //
 331   // PE expansion is enabled in most chunks of the DTD, not all.
 332   // When it's enabled, literals are treated differently.
 333   //
 334   private boolean inLiteral;
 335   private boolean expandPE;
 336   private boolean peIsError;
 337
 338   //
 339   // can't report entity expansion inside two constructs:
 340   // - attribute expansions (internal entities only)
 341   // - markup declarations (parameter entities only)
 342   //
 343   private boolean doReport;
 344
 345   //
 346   // Symbol table, for caching interned names.
 347   //
 348   // These show up wherever XML names or nmtokens are used:  naming elements,
 349   // attributes, PIs, notations, entities, and enumerated attribute values.
 350   //
 351   // NOTE:  This hashtable doesn't grow.  The default size is intended to be
 352   // rather large for most documents.  Example:  one snapshot of the DocBook
 353   // XML 4.1 DTD used only about 350 such names.  As a rule, only pathological
 354   // documents (ones that don't reuse names) should ever see much collision.
 355   //
 356   // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
 357   // "2039" keeps the hash table size at about two memory pages on typical
 358   // 32 bit hardware.
 359   //
 360   private final static int SYMBOL_TABLE_LENGTH = 2039;
 361
 362   private Object[][] symbolTable;
 363
 364   //
 365   // Hash table of attributes found in current start tag.
 366   //
 367   private String[] tagAttributes;
 368   private int tagAttributePos;
 369
 370   //
 371   // Utility flag: have we noticed a CR while reading the last
 372   // data chunk?  If so, we will have to go back and normalise
 373   // CR or CR/LF line ends.
 374   //
 375   private boolean sawCR;
 376
 377   //
 378   // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
 379   //
 380   private boolean inCDATA;
 381
 382   //
 383   // Xml version.
 384   //
 385   private static final int XML_10 = 0;
 386   private static final int XML_11 = 1;
 387   private int xmlVersion = XML_10;
 388
 389   //////////////////////////////////////////////////////////////////////
 390   // Constructors.
 391   ////////////////////////////////////////////////////////////////////////
 392
 /**
  * Creates a parser that has no callback handler yet.  A handler must be
  * installed before parsing, since {@link #doParse} rejects a null handler.
  * @see #setHandler
  * @see #doParse
  */
 // package private
 XmlParser()
 {
 }
 402
 403   /**
 404    * Set the handler that will receive parsing events.
 405    * @param handler The handler to receive callback events.
 406    * @see #parse
 407    */
 408   // package private
 409   void setHandler(SAXDriver handler)
 410   {
 411     this.handler = handler;
 412   }
 413
 414   /**
 415    * Parse an XML document from the character stream, byte stream, or URI
 416    * that you provide (in that order of preference).  Any URI that you
 417    * supply will become the base URI for resolving relative URI, and may
 418    * be used to acquire a reader or byte stream.
 419    *
 420    * <p> Only one thread at a time may use this parser; since it is
 421    * private to this package, post-parse cleanup is done by the caller,
 422    * which MUST NOT REUSE the parser (just null it).
 423    *
 424    * @param systemId Absolute URI of the document; should never be null,
 425    *    but may be so iff a reader <em>or</em> a stream is provided.
 426    * @param publicId The public identifier of the document, or null.
 427    * @param reader A character stream; must be null if stream isn't.
 428    * @param stream A byte input stream; must be null if reader isn't.
 429    * @param encoding The suggested encoding, or null if unknown.
 430    * @exception java.lang.Exception Basically SAXException or IOException
 431    */
 432   // package private
 433   void doParse(String systemId, String publicId, Reader reader,
 434                InputStream stream, String encoding)
 435     throws Exception
 436   {
 437     if (handler == null)
 438       {
 439         throw new IllegalStateException("no callback handler");
 440       }
 441
 442     initializeVariables();
 443
 444     // predeclare the built-in entities here (replacement texts)
 445     // we don't need to intern(), since we're guaranteed literals
 446     // are always (globally) interned.
 447     setInternalEntity("amp", "&#38;");
 448     setInternalEntity("lt", "&#60;");
 449     setInternalEntity("gt", "&#62;");
 450     setInternalEntity("apos", "&#39;");
 451     setInternalEntity("quot", "&#34;");
 452
 453     try
 454       {
 455         // pushURL first to ensure locator is correct in startDocument
 456         // ... it might report an IO or encoding exception.
 457         handler.startDocument();
 458         pushURL(false, "[document]",
 459                 // default baseURI: null
 460                 new ExternalIdentifiers(publicId, systemId, null),
 461                 reader, stream, encoding, false);
 462
 463         parseDocument();
 464       }
 465     catch (EOFException e)
 466       {
 467         //empty input
 468         error("empty document, with no root element.");
 469       }
 470     finally
 471       {
 472         if (reader != null)
 473           {
 474             try
 475               {
 476                 reader.close();
 477               }
 478             catch (IOException e)
 479               {
 480                 /* ignore */
 481               }
 482           }
 483         if (stream != null)
 484           {
 485             try
 486               {
 487                 stream.close();
 488               }
 489             catch (IOException e)
 490               {
 491                 /* ignore */
 492               }
 493           }
 494         if (is != null)
 495           {
 496             try
 497               {
 498                 is.close();
 499               }
 500             catch (IOException e)
 501               {
 502                 /* ignore */
 503               }
 504           }
 505         scratch = null;
 506       }
 507   }
 508
 509   //////////////////////////////////////////////////////////////////////
 510   // Error reporting.
 511   //////////////////////////////////////////////////////////////////////
 512
 513   /**
 514    * Report an error.
 515    * @param message The error message.
 516    * @param textFound The text that caused the error (or null).
 517    * @see SAXDriver#error
 518    * @see #line
 519    */
 520   private void error(String message, String textFound, String textExpected)
 521     throws SAXException
 522   {
 523     if (textFound != null)
 524       {
 525         message = message + " (found \"" + textFound + "\")";
 526       }
 527     if (textExpected != null)
 528       {
 529         message = message + " (expected \"" + textExpected + "\")";
 530       }
 531     handler.fatal(message);
 532
 533     // "can't happen"
 534     throw new SAXException(message);
 535   }
 536
 537   /**
 538    * Report a serious error.
 539    * @param message The error message.
 540    * @param textFound The text that caused the error (or null).
 541    */
 542   private void error(String message, char textFound, String textExpected)
 543     throws SAXException
 544   {
 545     error(message, new Character(textFound).toString(), textExpected);
 546   }
 547
 548   /**
 549    * Report typical case fatal errors.
 550    */
 551   private void error(String message)
 552     throws SAXException
 553   {
 554     handler.fatal(message);
 555   }
 556
 557   //////////////////////////////////////////////////////////////////////
 558   // Major syntactic productions.
 559   //////////////////////////////////////////////////////////////////////
 560
 561   /**
 562    * Parse an XML document.
 563    * <pre>
 564    * [1] document ::= prolog element Misc*
 565    * </pre>
 566    * <p>This is the top-level parsing function for a single XML
 567    * document.  As a minimum, a well-formed document must have
 568    * a document element, and a valid document must have a prolog
 569    * (one with doctype) as well.
 570    */
 571   private void parseDocument()
 572     throws Exception
 573   {
 574     try
 575       {                                       // added by MHK
 576         boolean sawDTD = parseProlog();
 577         require('<');
 578         parseElement(!sawDTD);
 579       }
 580     catch (EOFException ee)
 581       {                 // added by MHK
 582         error("premature end of file", "[EOF]", null);
 583       }
 584
 585     try
 586       {
 587         parseMisc();   //skip all white, PIs, and comments
 588         char c = readCh();    //if this doesn't throw an exception...
 589         error("unexpected characters after document end", c, null);
 590       }
 591     catch (EOFException e)
 592       {
 593         return;
 594       }
 595   }
 596
 597   static final char[] startDelimComment = { '<', '!', '-', '-' };
 598   static final char[] endDelimComment = { '-', '-' };
 599
 600   /**
 601    * Skip a comment.
 602    * <pre>
 603    * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
 604    * </pre>
 605    * <p> (The <code>&lt;!--</code> has already been read.)
 606    */
 607   private void parseComment()
 608     throws Exception
 609   {
 610     char c;
 611     boolean saved = expandPE;
 612
 613     expandPE = false;
 614     parseUntil(endDelimComment);
 615     require('>');
 616     expandPE = saved;
 617     handler.comment(dataBuffer, 0, dataBufferPos);
 618     dataBufferPos = 0;
 619   }
 620
 621   static final char[] startDelimPI = { '<', '?' };
 622   static final char[] endDelimPI = { '?', '>' };
 623
 624   /**
 625    * Parse a processing instruction and do a call-back.
 626    * <pre>
 627    * [16] PI ::= '&lt;?' PITarget
 628    *    (S (Char* - (Char* '?&gt;' Char*)))?
 629    *    '?&gt;'
 630    * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
 631    * </pre>
 632    * <p> (The <code>&lt;?</code> has already been read.)
 633    */
 634   private void parsePI()
 635     throws SAXException, IOException
 636   {
 637     String name;
 638     boolean saved = expandPE;
 639
 640     expandPE = false;
 641     name = readNmtoken(true);
 642     //NE08
 643     if (name.indexOf(':') >= 0)
 644       {
 645         error("Illegal character(':') in processing instruction name ",
 646               name, null);
 647       }
 648     if ("xml".equalsIgnoreCase(name))
 649       {
 650         error("Illegal processing instruction target", name, null);
 651       }
 652     if (!tryRead(endDelimPI))
 653       {
 654         requireWhitespace();
 655         parseUntil(endDelimPI);
 656       }
 657     expandPE = saved;
 658     handler.processingInstruction(name, dataBufferToString());
 659   }
 660
 661   static final char[] endDelimCDATA = { ']', ']', '>' };
 662
 663   private boolean isDirtyCurrentElement;
 664
 665   /**
 666    * Parse a CDATA section.
 667    * <pre>
 668    * [18] CDSect ::= CDStart CData CDEnd
 669    * [19] CDStart ::= '&lt;![CDATA['
 670    * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
 671    * [21] CDEnd ::= ']]&gt;'
 672    * </pre>
 673    * <p> (The '&lt;![CDATA[' has already been read.)
 674    */
 675   private void parseCDSect()
 676     throws Exception
 677   {
 678     parseUntil(endDelimCDATA);
 679     dataBufferFlush();
 680   }
 681
 682   /**
 683    * Parse the prolog of an XML document.
 684    * <pre>
 685    * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
 686    * </pre>
 687    * <p>We do not look for the XML declaration here, because it was
 688    * handled by pushURL ().
 689    * @see pushURL
 690    * @return true if a DTD was read.
 691    */
 692   private boolean parseProlog()
 693     throws Exception
 694   {
 695     parseMisc();
 696
 697     if (tryRead("<!DOCTYPE"))
 698       {
 699         parseDoctypedecl();
 700         parseMisc();
 701         return true;
 702       }
 703     return false;
 704   }
 705
 706   private void checkLegalVersion(String version)
 707     throws SAXException
 708   {
 709     int len = version.length();
 710     for (int i = 0; i < len; i++)
 711       {
 712         char c = version.charAt(i);
 713         if ('0' <= c && c <= '9')
 714           {
 715             continue;
 716           }
 717         if (c == '_' || c == '.' || c == ':' || c == '-')
 718           {
 719             continue;
 720           }
 721         if ('a' <= c && c <= 'z')
 722           {
 723             continue;
 724           }
 725         if ('A' <= c && c <= 'Z')
 726           {
 727             continue;
 728           }
 729         error ("illegal character in version", version, "1.0");
 730       }
 731   }
 732
 733   /**
 734    * Parse the XML declaration.
 735    * <pre>
 736    * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
 737    * [24] VersionInfo ::= S 'version' Eq
 738    *    ("'" VersionNum "'" | '"' VersionNum '"' )
 739    * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
 740    * [32] SDDecl ::= S 'standalone' Eq
 741    *    ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
 742    * [80] EncodingDecl ::= S 'encoding' Eq
 743    *    ( "'" EncName "'" | "'" EncName "'" )
 744    * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
 745    * </pre>
 746    * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
 747    * @return the encoding in the declaration, uppercased; or null
 748    * @see #parseTextDecl
 749    * @see #setupDecoding
 750    */
 751   private String parseXMLDecl(boolean ignoreEncoding)
 752     throws SAXException, IOException
 753   {
 754     String version;
 755     String encodingName = null;
 756     String standalone = null;
 757     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
 758     String inputEncoding = null;
 759
 760     switch (this.encoding)
 761       {
 762       case ENCODING_EXTERNAL:
 763       case ENCODING_UTF_8:
 764         inputEncoding = "UTF-8";
 765         break;
 766       case ENCODING_ISO_8859_1:
 767         inputEncoding = "ISO-8859-1";
 768         break;
 769       case ENCODING_UCS_2_12:
 770         inputEncoding = "UTF-16BE";
 771         break;
 772       case ENCODING_UCS_2_21:
 773         inputEncoding = "UTF-16LE";
 774         break;
 775       }
 776
 777     // Read the version.
 778     require("version");
 779     parseEq();
 780     checkLegalVersion(version = readLiteral(flags));
 781     if (!version.equals("1.0"))
 782       {
 783         if (version.equals("1.1"))
 784           {
 785             handler.warn("expected XML version 1.0, not: " + version);
 786             xmlVersion = XML_11;
 787           }
 788         else
 789           {
 790             error("illegal XML version", version, "1.0 or 1.1");
 791           }
 792       }
 793     else
 794       {
 795         xmlVersion = XML_10;
 796       }
 797     // Try reading an encoding declaration.
 798     boolean white = tryWhitespace();
 799
 800     if (tryRead("encoding"))
 801       {
 802         if (!white)
 803           {
 804             error("whitespace required before 'encoding='");
 805           }
 806         parseEq();
 807         encodingName = readLiteral(flags);
 808         if (!ignoreEncoding)
 809           {
 810             setupDecoding(encodingName);
 811           }
 812       }
 813
 814     // Try reading a standalone declaration
 815     if (encodingName != null)
 816       {
 817         white = tryWhitespace();
 818       }
 819     if (tryRead("standalone"))
 820       {
 821         if (!white)
 822           {
 823             error("whitespace required before 'standalone='");
 824           }
 825         parseEq();
 826         standalone = readLiteral(flags);
 827         if ("yes".equals(standalone))
 828           {
 829             docIsStandalone = true;
 830           }
 831         else if (!"no".equals(standalone))
 832           {
 833             error("standalone flag must be 'yes' or 'no'");
 834           }
 835       }
 836
 837     skipWhitespace();
 838     require("?>");
 839
 840     if (inputEncoding == null)
 841       {
 842         inputEncoding = encodingName;
 843       }
 844     return encodingName;
 845   }
 846
 847   /**
 848    * Parse a text declaration.
 849    * <pre>
 850    * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
 851    * [80] EncodingDecl ::= S 'encoding' Eq
 852    *    ( '"' EncName '"' | "'" EncName "'" )
 853    * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
 854    * </pre>
 855    * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
 856    * @return the encoding in the declaration, uppercased; or null
 857    * @see #parseXMLDecl
 858    * @see #setupDecoding
 859    */
 860   private String parseTextDecl(boolean ignoreEncoding)
 861     throws SAXException, IOException
 862   {
 863     String encodingName = null;
 864     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
 865
 866     // Read an optional version.
 867     if (tryRead ("version"))
 868       {
 869         String version;
 870         parseEq();
 871         checkLegalVersion(version = readLiteral(flags));
 872
 873         if (version.equals("1.1"))
 874           {
 875             if (xmlVersion == XML_10)
 876               {
 877                 error("external subset has later version number.", "1.0",
 878                       version);
 879               }
 880             handler.warn("expected XML version 1.0, not: " + version);
 881             xmlVersion = XML_11;
 882           }
 883         else if (!version.equals("1.0"))
 884           {
 885             error("illegal XML version", version, "1.0 or 1.1");
 886           }
 887         requireWhitespace();
 888       }
 889
 890     // Read the encoding.
 891     require("encoding");
 892     parseEq();
 893     encodingName = readLiteral(flags);
 894     if (!ignoreEncoding)
 895       {
 896         setupDecoding(encodingName);
 897       }
 898     skipWhitespace();
 899     require("?>");
 900
 901     return encodingName;
 902   }
 903
 904   /**
 905    * Sets up internal state so that we can decode an entity using the
 906    * specified encoding.  This is used when we start to read an entity
 907    * and we have been given knowledge of its encoding before we start to
 908    * read any data (e.g. from a SAX input source or from a MIME type).
 909    *
 910    * <p> It is also used after autodetection, at which point only very
 911    * limited adjustments to the encoding may be used (switching between
 912    * related builtin decoders).
 913    *
 914    * @param encodingName The name of the encoding specified by the user.
 915    * @exception IOException if the encoding isn't supported either
 916    *  internally to this parser, or by the hosting JVM.
 917    * @see #parseXMLDecl
 918    * @see #parseTextDecl
 919      */
 920   private void setupDecoding(String encodingName)
 921     throws SAXException, IOException
 922   {
 923     encodingName = encodingName.toUpperCase();
 924
 925     // ENCODING_EXTERNAL indicates an encoding that wasn't
 926     // autodetected ... we can use builtin decoders, or
 927     // ones from the JVM (InputStreamReader).
 928
 929     // Otherwise we can only tweak what was autodetected, and
 930     // only for single byte (ASCII derived) builtin encodings.
 931
 932     // ASCII-derived encodings
 933     if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL)
 934       {
 935         if (encodingName.equals("ISO-8859-1")
 936             || encodingName.equals("8859_1")
 937             || encodingName.equals("ISO8859_1"))
 938           {
 939             encoding = ENCODING_ISO_8859_1;
 940             return;
 941           }
 942         else if (encodingName.equals("US-ASCII")
 943                  || encodingName.equals("ASCII"))
 944           {
 945             encoding = ENCODING_ASCII;
 946             return;
 947           }
 948         else if (encodingName.equals("UTF-8")
 949                  || encodingName.equals("UTF8"))
 950           {
 951             encoding = ENCODING_UTF_8;
 952             return;
 953           }
 954         else if (encoding != ENCODING_EXTERNAL)
 955           {
 956             // used to start with a new reader ...
 957             throw new UnsupportedEncodingException(encodingName);
 958           }
 959         // else fallthrough ...
 960         // it's ASCII-ish and something other than a builtin
 961       }
 962
 963     // Unicode and such
 964     if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21)
 965       {
 966         if (!(encodingName.equals("ISO-10646-UCS-2")
 967               || encodingName.equals("UTF-16")
 968               || encodingName.equals("UTF-16BE")
 969               || encodingName.equals("UTF-16LE")))
 970           {
 971             error("unsupported Unicode encoding", encodingName, "UTF-16");
 972           }
 973         return;
 974       }
 975
 976     // four byte encodings
 977     if (encoding == ENCODING_UCS_4_1234
 978         || encoding == ENCODING_UCS_4_4321
 979         || encoding == ENCODING_UCS_4_2143
 980         || encoding == ENCODING_UCS_4_3412)
 981       {
 982         // Strictly:  "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists
 983         if (!encodingName.equals("ISO-10646-UCS-4"))
 984           {
 985             error("unsupported 32-bit encoding", encodingName,
 986                   "ISO-10646-UCS-4");
 987           }
 988         return;
 989       }
 990
 991     // assert encoding == ENCODING_EXTERNAL
 992     // if (encoding != ENCODING_EXTERNAL)
 993     //     throw new RuntimeException ("encoding = " + encoding);
 994
 995     if (encodingName.equals("UTF-16BE"))
 996       {
 997         encoding = ENCODING_UCS_2_12;
 998         return;
 999       }
1000     if (encodingName.equals("UTF-16LE"))
1001       {
1002         encoding = ENCODING_UCS_2_21;
1003         return;
1004       }
1005
1006     // We couldn't use the builtin decoders at all.  But we can try to
1007     // create a reader, since we haven't messed up buffering.  Tweak
1008     // the encoding name if necessary.
1009
1010     if (encodingName.equals("UTF-16")
1011         || encodingName.equals("ISO-10646-UCS-2"))
1012       {
1013         encodingName = "Unicode";
1014       }
1015     // Ignoring all the EBCDIC aliases here
1016
1017     reader = new InputStreamReader(is, encodingName);
1018     sourceType = INPUT_READER;
1019   }
1020
1021   /**
1022    * Parse miscellaneous markup outside the document element and DOCTYPE
1023    * declaration.
1024    * <pre>
1025    * [27] Misc ::= Comment | PI | S
1026    * </pre>
1027    */
1028   private void parseMisc()
1029     throws Exception
1030   {
1031     while (true)
1032       {
1033         skipWhitespace();
1034         if (tryRead(startDelimPI))
1035           {
1036             parsePI();
1037           }
1038         else if (tryRead(startDelimComment))
1039           {
1040             parseComment();
1041           }
1042         else
1043           {
1044             return;
1045           }
1046       }
1047   }
1048
1049   /**
1050    * Parse a document type declaration.
1051    * <pre>
1052    * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
1053    *    ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
1054    * </pre>
1055    * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
1056    */
1057   private void parseDoctypedecl()
1058     throws Exception
1059   {
1060     String rootName;
1061     ExternalIdentifiers ids;
1062
1063     // Read the document type name.
1064     requireWhitespace();
1065     rootName = readNmtoken(true);
1066
1067     // Read the External subset's IDs
1068     skipWhitespace();
1069     ids = readExternalIds(false, true);
1070
1071     // report (a) declaration of name, (b) lexical info (ids)
1072     handler.doctypeDecl(rootName, ids.publicId, ids.systemId);
1073
1074     // Internal subset is parsed first, if present
1075     skipWhitespace();
1076     if (tryRead('['))
1077       {
1078
1079         // loop until the subset ends
1080         while (true)
1081           {
1082             doReport = expandPE = true;
1083             skipWhitespace();
1084             doReport = expandPE = false;
1085             if (tryRead(']'))
1086               {
1087                 break;     // end of subset
1088               }
1089             else
1090               {
1091                 // WFC, PEs in internal subset (only between decls)
1092                 peIsError = expandPE = true;
1093                 parseMarkupdecl();
1094                 peIsError = expandPE = false;
1095               }
1096           }
1097       }
1098     skipWhitespace();
1099     require('>');
1100
1101     // Read the external subset, if any
1102     InputSource subset;
1103
1104     if (ids.systemId == null)
1105       {
1106         subset = handler.getExternalSubset(rootName,
1107                                            handler.getSystemId());
1108       }
1109     else
1110       {
1111         subset = null;
1112       }
1113     if (ids.systemId != null || subset != null)
1114       {
1115         pushString(null, ">");
1116
1117         // NOTE:  [dtd] is so we say what SAX2 expects,
1118         // though it's misleading (subset, not entire dtd)
1119         if (ids.systemId != null)
1120           {
1121             pushURL(true, "[dtd]", ids, null, null, null, true);
1122           }
1123         else
1124           {
1125             handler.warn("modifying document by adding external subset");
1126             pushURL(true, "[dtd]",
1127                     new ExternalIdentifiers(subset.getPublicId(),
1128                                             subset.getSystemId(),
1129                                             null),
1130                     subset.getCharacterStream(),
1131                     subset.getByteStream(),
1132                     subset.getEncoding(),
1133                     false);
1134           }
1135
1136         // Loop until we end up back at '>'
1137         while (true)
1138           {
1139             doReport = expandPE = true;
1140             skipWhitespace();
1141             doReport = expandPE = false;
1142             if (tryRead('>'))
1143               {
1144                 break;
1145               }
1146             else
1147               {
1148                 expandPE = true;
1149                 parseMarkupdecl();
1150                 expandPE = false;
1151               }
1152           }
1153
1154         // the ">" string isn't popped yet
1155         if (inputStack.size() != 1)
1156           {
1157             error("external subset has unmatched '>'");
1158           }
1159       }
1160
1161     // done dtd
1162     handler.endDoctype();
1163     expandPE = false;
1164     doReport = true;
1165   }
1166
1167   /**
1168    * Parse a markup declaration in the internal or external DTD subset.
1169    * <pre>
1170    * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
1171    *    | NotationDecl | PI | Comment
1172    * [30] extSubsetDecl ::= (markupdecl | conditionalSect
1173    *    | PEReference | S) *
1174    * </pre>
1175    * <p> Reading toplevel PE references is handled as a lexical issue
1176    * by the caller, as is whitespace.
1177    */
1178   private void parseMarkupdecl()
1179     throws Exception
1180   {
1181     char[] saved = null;
1182     boolean savedPE = expandPE;
1183
1184     // prevent "<%foo;" and ensures saved entity is right
1185     require('<');
1186     unread('<');
1187     expandPE = false;
1188
1189     if (tryRead("<!ELEMENT"))
1190       {
1191         saved = readBuffer;
1192         expandPE = savedPE;
1193         parseElementDecl();
1194       }
1195     else if (tryRead("<!ATTLIST"))
1196       {
1197         saved = readBuffer;
1198         expandPE = savedPE;
1199         parseAttlistDecl();
1200       }
1201     else if (tryRead("<!ENTITY"))
1202       {
1203         saved = readBuffer;
1204         expandPE = savedPE;
1205         parseEntityDecl();
1206       }
1207     else if (tryRead("<!NOTATION"))
1208       {
1209         saved = readBuffer;
1210         expandPE = savedPE;
1211         parseNotationDecl();
1212       }
1213     else if (tryRead(startDelimPI))
1214       {
1215         saved = readBuffer;
1216         expandPE = savedPE;
1217         parsePI();
1218       }
1219     else if (tryRead(startDelimComment))
1220       {
1221         saved = readBuffer;
1222         expandPE = savedPE;
1223         parseComment();
1224       }
1225     else if (tryRead("<!["))
1226       {
1227         saved = readBuffer;
1228         expandPE = savedPE;
1229         if (inputStack.size() > 0)
1230           {
1231             parseConditionalSect(saved);
1232           }
1233         else
1234           {
1235             error("conditional sections illegal in internal subset");
1236           }
1237       }
1238     else
1239       {
1240         error("expected markup declaration");
1241       }
1242
1243     // VC: Proper Decl/PE Nesting
1244     if (readBuffer != saved)
1245       {
1246         handler.verror("Illegal Declaration/PE nesting");
1247       }
1248   }
1249
1250   /**
1251    * Parse an element, with its tags.
1252    * <pre>
1253    * [39] element ::= EmptyElementTag | STag content ETag
1254    * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
1255    * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
1256    * </pre>
1257    * <p> (The '&lt;' has already been read.)
1258    * <p>NOTE: this method actually chains onto parseContent (), if necessary,
1259    * and parseContent () will take care of calling parseETag ().
1260    */
1261   private void parseElement(boolean maybeGetSubset)
1262     throws Exception
1263   {
1264     String gi;
1265     char c;
1266     int oldElementContent = currentElementContent;
1267     String oldElement = currentElement;
1268     ElementDecl element;
1269
1270     // This is the (global) counter for the
1271     // array of specified attributes.
1272     tagAttributePos = 0;
1273
1274     // Read the element type name.
1275     gi = readNmtoken(true);
1276
1277     // If we saw no DTD, and this is the document root element,
1278     // let the application modify the input stream by providing one.
1279     if (maybeGetSubset)
1280       {
1281         InputSource subset = handler.getExternalSubset(gi,
1282                                                        handler.getSystemId());
1283         if (subset != null)
1284           {
1285             String publicId = subset.getPublicId();
1286             String systemId = subset.getSystemId();
1287
1288             handler.warn("modifying document by adding DTD");
1289             handler.doctypeDecl(gi, publicId, systemId);
1290             pushString(null, ">");
1291
1292             // NOTE:  [dtd] is so we say what SAX2 expects,
1293             // though it's misleading (subset, not entire dtd)
1294             pushURL(true, "[dtd]",
1295                     new ExternalIdentifiers(publicId, systemId, null),
1296                     subset.getCharacterStream(),
1297                     subset.getByteStream(),
1298                     subset.getEncoding(),
1299                     false);
1300
1301             // Loop until we end up back at '>'
1302             while (true)
1303               {
1304                 doReport = expandPE = true;
1305                 skipWhitespace();
1306                 doReport = expandPE = false;
1307                 if (tryRead('>'))
1308                   {
1309                     break;
1310                   }
1311                 else
1312                   {
1313                     expandPE = true;
1314                     parseMarkupdecl();
1315                     expandPE = false;
1316                   }
1317               }
1318
1319             // the ">" string isn't popped yet
1320             if (inputStack.size() != 1)
1321               {
1322                 error("external subset has unmatched '>'");
1323               }
1324
1325             handler.endDoctype();
1326           }
1327       }
1328
1329     // Determine the current content type.
1330     currentElement = gi;
1331     element = (ElementDecl) elementInfo.get(gi);
1332     currentElementContent = getContentType(element, CONTENT_ANY);
1333
1334     // Read the attributes, if any.
1335     // After this loop, "c" is the closing delimiter.
1336     boolean white = tryWhitespace();
1337     c = readCh();
1338     while (c != '/' && c != '>')
1339       {
1340         unread(c);
1341         if (!white)
1342           {
1343             error("need whitespace between attributes");
1344           }
1345         parseAttribute(gi);
1346         white = tryWhitespace();
1347         c = readCh();
1348       }
1349
1350     // Supply any defaulted attributes.
1351     Iterator atts = declaredAttributes(element);
1352     if (atts != null)
1353       {
1354         String aname;
1355 loop:
1356         while (atts.hasNext())
1357           {
1358             aname = (String) atts.next();
1359             // See if it was specified.
1360             for (int i = 0; i < tagAttributePos; i++)
1361               {
1362                 if (tagAttributes[i] == aname)
1363                   {
1364                     continue loop;
1365                   }
1366               }
1367             // ... or has a default
1368             String value = getAttributeDefaultValue(gi, aname);
1369
1370             if (value == null)
1371               {
1372                 continue;
1373               }
1374             handler.attribute(aname, value, false);
1375           }
1376       }
1377
1378     // Figure out if this is a start tag
1379     // or an empty element, and dispatch an
1380     // event accordingly.
1381     switch (c)
1382       {
1383       case '>':
1384         handler.startElement(gi);
1385         parseContent();
1386         break;
1387       case '/':
1388         require('>');
1389         handler.startElement(gi);
1390         handler.endElement(gi);
1391         break;
1392       }
1393
1394     // Restore the previous state.
1395     currentElement = oldElement;
1396     currentElementContent = oldElementContent;
1397   }
1398
1399   /**
1400    * Parse an attribute assignment.
1401    * <pre>
1402    * [41] Attribute ::= Name Eq AttValue
1403    * </pre>
1404    * @param name The name of the attribute's element.
1405    * @see SAXDriver#attribute
1406    */
1407   private void parseAttribute(String name)
1408     throws Exception
1409   {
1410     String aname;
1411     String type;
1412     String value;
1413     int flags = LIT_ATTRIBUTE |  LIT_ENTITY_REF;
1414
1415     // Read the attribute name.
1416     aname = readNmtoken(true);
1417     type = getAttributeType(name, aname);
1418
1419     // Parse '='
1420     parseEq();
1421
1422     // Read the value, normalizing whitespace
1423     // unless it is CDATA.
1424     if (handler.stringInterning)
1425       {
1426         if (type == "CDATA" || type == null)
1427           {
1428             value = readLiteral(flags);
1429           }
1430         else
1431           {
1432             value = readLiteral(flags | LIT_NORMALIZE);
1433           }
1434       }
1435     else
1436       {
1437         if (type == null || type.equals("CDATA"))
1438           {
1439             value = readLiteral(flags);
1440           }
1441         else
1442           {
1443             value = readLiteral(flags | LIT_NORMALIZE);
1444           }
1445       }
1446
1447     // WFC: no duplicate attributes
1448     for (int i = 0; i < tagAttributePos; i++)
1449       {
1450         if (aname.equals(tagAttributes [i]))
1451           {
1452             error("duplicate attribute", aname, null);
1453           }
1454       }
1455
1456     // Inform the handler about the
1457     // attribute.
1458     handler.attribute(aname, value, true);
1459     dataBufferPos = 0;
1460
1461     // Note that the attribute has been
1462     // specified.
1463     if (tagAttributePos == tagAttributes.length)
1464       {
1465         String newAttrib[] = new String[tagAttributes.length * 2];
1466         System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1467         tagAttributes = newAttrib;
1468       }
1469     tagAttributes[tagAttributePos++] = aname;
1470   }
1471
1472   /**
1473    * Parse an equals sign surrounded by optional whitespace.
1474    * <pre>
1475    * [25] Eq ::= S? '=' S?
1476    * </pre>
1477    */
1478   private void parseEq()
1479     throws SAXException, IOException
1480   {
1481     skipWhitespace();
1482     require('=');
1483     skipWhitespace();
1484   }
1485
1486   /**
1487    * Parse an end tag.
1488    * <pre>
1489    * [42] ETag ::= '</' Name S? '>'
1490    * </pre>
1491    * <p>NOTE: parseContent () chains to here, we already read the
1492    * "&lt;/".
1493    */
1494   private void parseETag()
1495     throws Exception
1496   {
1497     require(currentElement);
1498     skipWhitespace();
1499     require('>');
1500     handler.endElement(currentElement);
1501     // not re-reporting any SAXException re bogus end tags,
1502     // even though that diagnostic might be clearer ...
1503   }
1504
1505   /**
1506    * Parse the content of an element.
1507    * <pre>
1508    * [43] content ::= (element | CharData | Reference
1509    *    | CDSect | PI | Comment)*
1510    * [67] Reference ::= EntityRef | CharRef
1511    * </pre>
1512    * <p> NOTE: consumes ETtag.
1513    */
1514   private void parseContent()
1515     throws Exception
1516   {
1517     char c;
1518
1519     while (true)
1520       {
1521         // consume characters (or ignorable whitspace) until delimiter
1522         parseCharData();
1523
1524         // Handle delimiters
1525         c = readCh();
1526         switch (c)
1527           {
1528           case '&':       // Found "&"
1529             c = readCh();
1530             if (c == '#')
1531               {
1532                 parseCharRef();
1533               }
1534             else
1535               {
1536                 unread(c);
1537                 parseEntityRef(true);
1538               }
1539             isDirtyCurrentElement = true;
1540             break;
1541
1542           case '<':       // Found "<"
1543             dataBufferFlush();
1544             c = readCh();
1545             switch (c)
1546               {
1547               case '!':       // Found "<!"
1548                 c = readCh();
1549                 switch (c)
1550                   {
1551                   case '-':     // Found "<!-"
1552                     require('-');
1553                     isDirtyCurrentElement = false;
1554                     parseComment();
1555                     break;
1556                   case '[':     // Found "<!["
1557                     isDirtyCurrentElement = false;
1558                     require("CDATA[");
1559                     handler.startCDATA();
1560                     inCDATA = true;
1561                     parseCDSect();
1562                     inCDATA = false;
1563                     handler.endCDATA();
1564                     break;
1565                   default:
1566                     error("expected comment or CDATA section", c, null);
1567                     break;
1568                   }
1569                 break;
1570
1571               case '?':     // Found "<?"
1572                 isDirtyCurrentElement = false;
1573                 parsePI();
1574                 break;
1575
1576               case '/':     // Found "</"
1577                 isDirtyCurrentElement = false;
1578                 parseETag();
1579                 return;
1580
1581               default:     // Found "<" followed by something else
1582                 isDirtyCurrentElement = false;
1583                 unread(c);
1584                 parseElement(false);
1585                 break;
1586               }
1587           }
1588       }
1589   }
1590
1591   /**
1592    * Parse an element type declaration.
1593    * <pre>
1594    * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1595    * </pre>
1596    * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1597    */
1598   private void parseElementDecl()
1599     throws Exception
1600   {
1601     String name;
1602
1603     requireWhitespace();
1604     // Read the element type name.
1605     name = readNmtoken(true);
1606
1607     requireWhitespace();
1608     // Read the content model.
1609     parseContentspec(name);
1610
1611     skipWhitespace();
1612     require('>');
1613   }
1614
1615   /**
1616    * Content specification.
1617    * <pre>
1618    * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1619    * </pre>
1620    */
1621   private void parseContentspec(String name)
1622     throws Exception
1623   {
1624     // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ...
1625     if (tryRead("EMPTY"))
1626       {
1627         setElement(name, CONTENT_EMPTY, null, null);
1628         if (!skippedPE)
1629           {
1630             handler.getDeclHandler().elementDecl(name, "EMPTY");
1631           }
1632         return;
1633       }
1634     else if (tryRead("ANY"))
1635       {
1636         setElement(name, CONTENT_ANY, null, null);
1637         if (!skippedPE)
1638           {
1639             handler.getDeclHandler().elementDecl(name, "ANY");
1640           }
1641         return;
1642       }
1643     else
1644       {
1645         String model;
1646         char[] saved;
1647
1648         require('(');
1649         saved = readBuffer;
1650         dataBufferAppend('(');
1651         skipWhitespace();
1652         if (tryRead("#PCDATA"))
1653           {
1654             dataBufferAppend("#PCDATA");
1655             parseMixed(saved);
1656             model = dataBufferToString();
1657             setElement(name, CONTENT_MIXED, model, null);
1658           }
1659         else
1660           {
1661             parseElements(saved);
1662             model = dataBufferToString();
1663             setElement(name, CONTENT_ELEMENTS, model, null);
1664           }
1665         if (!skippedPE)
1666           {
1667             handler.getDeclHandler().elementDecl(name, model);
1668           }
1669       }
1670   }
1671
1672   /**
1673    * Parse an element-content model.
1674    * <pre>
1675    * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1676    * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1677    * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1678    * </pre>
1679    *
1680    * <p> NOTE: the opening '(' and S have already been read.
1681    *
1682    * @param saved Buffer for entity that should have the terminal ')'
1683    */
1684   private void parseElements(char[] saved)
1685     throws Exception
1686   {
1687     char c;
1688     char sep;
1689
1690     // Parse the first content particle
1691     skipWhitespace();
1692     parseCp();
1693
1694     // Check for end or for a separator.
1695     skipWhitespace();
1696     c = readCh();
1697     switch (c)
1698       {
1699       case ')':
1700         // VC: Proper Group/PE Nesting
1701         if (readBuffer != saved)
1702           {
1703             handler.verror("Illegal Group/PE nesting");
1704           }
1705
1706         dataBufferAppend(')');
1707         c = readCh();
1708         switch (c)
1709           {
1710           case '*':
1711           case '+':
1712           case '?':
1713             dataBufferAppend(c);
1714             break;
1715           default:
1716             unread(c);
1717           }
1718         return;
1719       case ',':       // Register the separator.
1720       case '|':
1721         sep = c;
1722         dataBufferAppend(c);
1723         break;
1724       default:
1725         error("bad separator in content model", c, null);
1726         return;
1727       }
1728
1729     // Parse the rest of the content model.
1730     while (true)
1731       {
1732         skipWhitespace();
1733         parseCp();
1734         skipWhitespace();
1735         c = readCh();
1736         if (c == ')')
1737           {
1738             // VC: Proper Group/PE Nesting
1739             if (readBuffer != saved)
1740               {
1741                 handler.verror("Illegal Group/PE nesting");
1742               }
1743
1744             dataBufferAppend(')');
1745             break;
1746           }
1747         else if (c != sep)
1748           {
1749             error("bad separator in content model", c, null);
1750             return;
1751           }
1752         else
1753           {
1754             dataBufferAppend(c);
1755           }
1756       }
1757
1758     // Check for the occurrence indicator.
1759     c = readCh();
1760     switch (c)
1761       {
1762       case '?':
1763       case '*':
1764       case '+':
1765         dataBufferAppend(c);
1766         return;
1767       default:
1768         unread(c);
1769         return;
1770       }
1771   }
1772
1773   /**
1774    * Parse a content particle.
1775    * <pre>
1776    * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1777    * </pre>
1778    */
1779   private void parseCp()
1780     throws Exception
1781   {
1782     if (tryRead('('))
1783       {
1784         dataBufferAppend('(');
1785         parseElements(readBuffer);
1786       }
1787     else
1788       {
1789         dataBufferAppend(readNmtoken(true));
1790         char c = readCh();
1791         switch (c)
1792           {
1793           case '?':
1794           case '*':
1795           case '+':
1796             dataBufferAppend(c);
1797             break;
1798           default:
1799             unread(c);
1800             break;
1801           }
1802       }
1803   }
1804
1805   /**
1806    * Parse mixed content.
1807    * <pre>
1808    * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1809    *        | '(' S? ('#PCDATA') S? ')'
1810    * </pre>
1811    *
1812    * @param saved Buffer for entity that should have the terminal ')'
1813    */
1814   private void parseMixed(char[] saved)
1815     throws Exception
1816   {
1817     // Check for PCDATA alone.
1818     skipWhitespace();
1819     if (tryRead(')'))
1820       {
1821         // VC: Proper Group/PE Nesting
1822         if (readBuffer != saved)
1823           {
1824             handler.verror("Illegal Group/PE nesting");
1825           }
1826
1827         dataBufferAppend(")*");
1828         tryRead('*');
1829         return;
1830       }
1831
1832     // Parse mixed content.
1833     skipWhitespace();
1834     while (!tryRead(")"))
1835       {
1836         require('|');
1837         dataBufferAppend('|');
1838         skipWhitespace();
1839         dataBufferAppend(readNmtoken(true));
1840         skipWhitespace();
1841       }
1842
1843     // VC: Proper Group/PE Nesting
1844     if (readBuffer != saved)
1845       {
1846         handler.verror("Illegal Group/PE nesting");
1847       }
1848
1849     require('*');
1850     dataBufferAppend(")*");
1851   }
1852
1853   /**
1854    * Parse an attribute list declaration.
1855    * <pre>
1856    * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1857    * </pre>
1858    * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1859    */
1860   private void parseAttlistDecl()
1861     throws Exception
1862   {
1863     String elementName;
1864
1865     requireWhitespace();
1866     elementName = readNmtoken(true);
1867     boolean white = tryWhitespace();
1868     while (!tryRead('>'))
1869       {
1870         if (!white)
1871           {
1872             error("whitespace required before attribute definition");
1873           }
1874         parseAttDef(elementName);
1875         white = tryWhitespace();
1876       }
1877   }
1878
1879   /**
1880    * Parse a single attribute definition.
1881    * <pre>
1882    * [53] AttDef ::= S Name S AttType S DefaultDecl
1883    * </pre>
1884    */
1885   private void parseAttDef(String elementName)
1886     throws Exception
1887   {
1888     String name;
1889     String type;
1890     String enumer = null;
1891
1892     // Read the attribute name.
1893     name = readNmtoken(true);
1894
1895     // Read the attribute type.
1896     requireWhitespace();
1897     type = readAttType();
1898
1899     // Get the string of enumerated values if necessary.
1900     if (handler.stringInterning)
1901       {
1902         if ("ENUMERATION" == type || "NOTATION" == type)
1903           {
1904             enumer = dataBufferToString();
1905           }
1906       }
1907     else
1908       {
1909         if ("ENUMERATION".equals(type) || "NOTATION".equals(type))
1910           {
1911             enumer = dataBufferToString();
1912           }
1913       }
1914
1915     // Read the default value.
1916     requireWhitespace();
1917     parseDefault(elementName, name, type, enumer);
1918   }
1919
1920   /**
1921    * Parse the attribute type.
1922    * <pre>
1923    * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1924    * [55] StringType ::= 'CDATA'
1925    * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1926    *    | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1927    * [57] EnumeratedType ::= NotationType | Enumeration
1928    * </pre>
1929    */
1930   private String readAttType()
1931     throws Exception
1932   {
1933     if (tryRead('('))
1934       {
1935         parseEnumeration(false);
1936         return "ENUMERATION";
1937       }
1938     else
1939       {
1940         String typeString = readNmtoken(true);
1941         if (handler.stringInterning)
1942           {
1943             if ("NOTATION" == typeString)
1944               {
1945                 parseNotationType();
1946                 return typeString;
1947               }
1948             else if ("CDATA" == typeString
1949                      || "ID" == typeString
1950                      || "IDREF" == typeString
1951                      || "IDREFS" == typeString
1952                      || "ENTITY" == typeString
1953                      || "ENTITIES" == typeString
1954                      || "NMTOKEN" == typeString
1955                      || "NMTOKENS" == typeString)
1956               {
1957                 return typeString;
1958               }
1959           }
1960         else
1961           {
1962             if ("NOTATION".equals(typeString))
1963               {
1964                 parseNotationType();
1965                 return typeString;
1966               }
1967             else if ("CDATA".equals(typeString)
1968                      || "ID".equals(typeString)
1969                      || "IDREF".equals(typeString)
1970                      || "IDREFS".equals(typeString)
1971                      || "ENTITY".equals(typeString)
1972                      || "ENTITIES".equals(typeString)
1973                      || "NMTOKEN".equals(typeString)
1974                      || "NMTOKENS".equals(typeString))
1975               {
1976                 return typeString;
1977               }
1978           }
1979         error("illegal attribute type", typeString, null);
1980         return null;
1981       }
1982   }
1983
1984   /**
1985    * Parse an enumeration.
1986    * <pre>
1987    * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1988    * </pre>
1989    * <p>NOTE: the '(' has already been read.
1990    */
1991   private void parseEnumeration(boolean isNames)
1992     throws Exception
1993   {
1994     dataBufferAppend('(');
1995
1996     // Read the first token.
1997     skipWhitespace();
1998     dataBufferAppend(readNmtoken(isNames));
1999     // Read the remaining tokens.
2000     skipWhitespace();
2001     while (!tryRead(')'))
2002       {
2003         require('|');
2004         dataBufferAppend('|');
2005         skipWhitespace();
2006         dataBufferAppend(readNmtoken (isNames));
2007         skipWhitespace();
2008       }
2009     dataBufferAppend(')');
2010   }
2011
2012   /**
2013    * Parse a notation type for an attribute.
2014    * <pre>
2015    * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
2016    *    (S? '|' S? name)* S? ')'
2017    * </pre>
2018    * <p>NOTE: the 'NOTATION' has already been read
2019    */
2020   private void parseNotationType()
2021     throws Exception
2022   {
2023     requireWhitespace();
2024     require('(');
2025
2026     parseEnumeration(true);
2027   }
2028
2029   /**
2030    * Parse the default value for an attribute.
2031    * <pre>
2032    * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
2033    *    | (('#FIXED' S)? AttValue)
2034    * </pre>
2035    */
2036   private void parseDefault(String elementName, String name,
2037                             String type, String enumer)
2038     throws Exception
2039   {
2040     int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
2041     String value = null;
2042     int flags = LIT_ATTRIBUTE;
2043     boolean saved = expandPE;
2044     String defaultType = null;
2045
2046     // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
2047     // chars to spaces (doesn't matter when that's done if it doesn't
2048     // interfere with char refs expanding to whitespace).
2049
2050     if (!skippedPE)
2051       {
2052         flags |= LIT_ENTITY_REF;
2053         if (handler.stringInterning)
2054           {
2055             if ("CDATA" != type)
2056               {
2057                 flags |= LIT_NORMALIZE;
2058               }
2059           }
2060         else
2061           {
2062             if (!"CDATA".equals(type))
2063               {
2064                 flags |= LIT_NORMALIZE;
2065               }
2066           }
2067       }
2068
2069     expandPE = false;
2070     if (tryRead('#'))
2071       {
2072         if (tryRead("FIXED"))
2073           {
2074             defaultType = "#FIXED";
2075             valueType = ATTRIBUTE_DEFAULT_FIXED;
2076             requireWhitespace();
2077             value = readLiteral(flags);
2078           }
2079         else if (tryRead("REQUIRED"))
2080           {
2081             defaultType = "#REQUIRED";
2082             valueType = ATTRIBUTE_DEFAULT_REQUIRED;
2083           }
2084         else if (tryRead("IMPLIED"))
2085           {
2086             defaultType = "#IMPLIED";
2087             valueType = ATTRIBUTE_DEFAULT_IMPLIED;
2088           }
2089         else
2090           {
2091             error("illegal keyword for attribute default value");
2092           }
2093       }
2094     else
2095       {
2096         value = readLiteral(flags);
2097       }
2098     expandPE = saved;
2099     setAttribute(elementName, name, type, enumer, value, valueType);
2100     if (handler.stringInterning)
2101       {
2102         if ("ENUMERATION" == type)
2103           {
2104             type = enumer;
2105           }
2106         else if ("NOTATION" == type)
2107           {
2108             type = "NOTATION " + enumer;
2109           }
2110       }
2111     else
2112       {
2113         if ("ENUMERATION".equals(type))
2114           {
2115             type = enumer;
2116           }
2117         else if ("NOTATION".equals(type))
2118           {
2119             type = "NOTATION " + enumer;
2120           }
2121       }
2122     if (!skippedPE)
2123       {
2124         handler.getDeclHandler().attributeDecl(elementName, name, type,
2125                                                defaultType, value);
2126       }
2127   }
2128
2129   /**
2130    * Parse a conditional section.
2131    * <pre>
2132    * [61] conditionalSect ::= includeSect || ignoreSect
2133    * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
2134    *    extSubsetDecl ']]&gt;'
2135    * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
2136    *    ignoreSectContents* ']]&gt;'
2137    * [64] ignoreSectContents ::= Ignore
2138    *    ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
2139    * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
2140    * </pre>
2141    * <p> NOTE: the '&gt;![' has already been read.
2142    */
2143   private void parseConditionalSect(char[] saved)
2144     throws Exception
2145   {
2146     skipWhitespace();
2147     if (tryRead("INCLUDE"))
2148       {
2149         skipWhitespace();
2150         require('[');
2151         // VC: Proper Conditional Section/PE Nesting
2152         if (readBuffer != saved)
2153           {
2154             handler.verror("Illegal Conditional Section/PE nesting");
2155           }
2156         skipWhitespace();
2157         while (!tryRead("]]>"))
2158           {
2159             parseMarkupdecl();
2160             skipWhitespace();
2161           }
2162       }
2163     else if (tryRead("IGNORE"))
2164       {
2165         skipWhitespace();
2166         require('[');
2167         // VC: Proper Conditional Section/PE Nesting
2168         if (readBuffer != saved)
2169           {
2170             handler.verror("Illegal Conditional Section/PE nesting");
2171           }
2172         int nesting = 1;
2173         char c;
2174         expandPE = false;
2175         for (int nest = 1; nest > 0; )
2176           {
2177             c = readCh();
2178             switch (c)
2179               {
2180               case '<':
2181                 if (tryRead("!["))
2182                   {
2183                     nest++;
2184                   }
2185               case ']':
2186                 if (tryRead("]>"))
2187                   {
2188                     nest--;
2189                   }
2190               }
2191           }
2192         expandPE = true;
2193       }
2194     else
2195       {
2196         error("conditional section must begin with INCLUDE or IGNORE");
2197       }
2198   }
2199
2200   private void parseCharRef()
2201     throws SAXException, IOException
2202   {
2203     parseCharRef(true /* do flushDataBuffer by default */);
2204   }
2205
2206   /**
2207    * Try to read a character reference without consuming data from buffer.
2208    * <pre>
2209    * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2210    * </pre>
2211    * <p>NOTE: the '&#' has already been read.
2212    */
2213   private void tryReadCharRef()
2214     throws SAXException, IOException
2215   {
2216     int value = 0;
2217     char c;
2218
2219     if (tryRead('x'))
2220       {
2221 loop1:
2222         while (true)
2223           {
2224             c = readCh();
2225             if (c == ';')
2226               {
2227                 break loop1;
2228               }
2229             else
2230               {
2231                 int n = Character.digit(c, 16);
2232                 if (n == -1)
2233                   {
2234                     error("illegal character in character reference", c, null);
2235                     break loop1;
2236                   }
2237                 value *= 16;
2238                 value += n;
2239               }
2240           }
2241       }
2242     else
2243       {
2244 loop2:
2245         while (true)
2246           {
2247             c = readCh();
2248             if (c == ';')
2249               {
2250                 break loop2;
2251               }
2252             else
2253               {
2254                 int n = Character.digit(c, 10);
2255                 if (n == -1)
2256                   {
2257                     error("illegal character in character reference", c, null);
2258                     break loop2;
2259                   }
2260                 value *= 10;
2261                 value += n;
2262               }
2263           }
2264       }
2265
2266     // check for character refs being legal XML
2267     if ((value < 0x0020
2268          && ! (value == '\n' || value == '\t' || value == '\r'))
2269         || (value >= 0xD800 && value <= 0xDFFF)
2270         || value == 0xFFFE || value == 0xFFFF
2271         || value > 0x0010ffff)
2272       {
2273         error("illegal XML character reference U+"
2274               + Integer.toHexString(value));
2275       }
2276
2277     // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2278     //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2279     if (value > 0x0010ffff)
2280       {
2281         // too big for surrogate
2282         error("character reference " + value + " is too large for UTF-16",
2283               new Integer(value).toString(), null);
2284       }
2285
2286   }
2287
2288   /**
2289    * Read and interpret a character reference.
2290    * <pre>
2291    * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2292    * </pre>
2293    * <p>NOTE: the '&#' has already been read.
2294    */
2295   private void parseCharRef(boolean doFlush)
2296     throws SAXException, IOException
2297   {
2298     int value = 0;
2299     char c;
2300
2301     if (tryRead('x'))
2302       {
2303 loop1:
2304         while (true)
2305           {
2306             c = readCh();
2307             if (c == ';')
2308               {
2309                 break loop1;
2310               }
2311             else
2312               {
2313                 int n = Character.digit(c, 16);
2314                 if (n == -1)
2315                   {
2316                     error("illegal character in character reference", c, null);
2317                     break loop1;
2318                   }
2319                 value *= 16;
2320                 value += n;
2321               }
2322           }
2323       }
2324     else
2325       {
2326 loop2:
2327         while (true)
2328           {
2329             c = readCh();
2330             if (c == ';')
2331               {
2332                 break loop2;
2333               }
2334             else
2335               {
2336                 int n = Character.digit(c, 10);
2337                 if (n == -1)
2338                   {
2339                     error("illegal character in character reference", c, null);
2340                     break loop2;
2341                   }
2342                 value *= 10;
2343                 value += c - '0';
2344               }
2345           }
2346       }
2347
2348     // check for character refs being legal XML
2349     if ((value < 0x0020
2350          && ! (value == '\n' || value == '\t' || value == '\r'))
2351         || (value >= 0xD800 && value <= 0xDFFF)
2352         || value == 0xFFFE || value == 0xFFFF
2353         || value > 0x0010ffff)
2354       {
2355         error("illegal XML character reference U+"
2356               + Integer.toHexString(value));
2357       }
2358
2359     // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2360     //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2361     if (value <= 0x0000ffff)
2362       {
2363         // no surrogates needed
2364         dataBufferAppend((char) value);
2365       }
2366     else if (value <= 0x0010ffff)
2367       {
2368         value -= 0x10000;
2369         // > 16 bits, surrogate needed
2370         dataBufferAppend((char) (0xd800 | (value >> 10)));
2371         dataBufferAppend((char) (0xdc00 | (value & 0x0003ff)));
2372       }
2373     else
2374       {
2375         // too big for surrogate
2376         error("character reference " + value + " is too large for UTF-16",
2377               new Integer(value).toString(), null);
2378       }
2379     if (doFlush)
2380       {
2381         dataBufferFlush();
2382       }
2383   }
2384
2385   /**
2386    * Parse and expand an entity reference.
2387    * <pre>
2388    * [68] EntityRef ::= '&' Name ';'
2389    * </pre>
2390    * <p>NOTE: the '&amp;' has already been read.
2391    * @param externalAllowed External entities are allowed here.
2392    */
2393   private void parseEntityRef(boolean externalAllowed)
2394     throws SAXException, IOException
2395   {
2396     String name;
2397
2398     name = readNmtoken(true);
2399     require(';');
2400     switch (getEntityType(name))
2401       {
2402       case ENTITY_UNDECLARED:
2403         // NOTE:  XML REC describes amazingly convoluted handling for
2404         // this case.  Nothing as meaningful as being a WFness error
2405         // unless the processor might _legitimately_ not have seen a
2406         // declaration ... which is what this implements.
2407         String message;
2408
2409         message = "reference to undeclared general entity " + name;
2410         if (skippedPE && !docIsStandalone)
2411           {
2412             handler.verror(message);
2413             // we don't know this entity, and it might be external...
2414             if (externalAllowed)
2415               {
2416                 handler.skippedEntity(name);
2417               }
2418           }
2419         else
2420           {
2421             error(message);
2422           }
2423         break;
2424       case ENTITY_INTERNAL:
2425           pushString(name, getEntityValue(name));
2426
2427           //workaround for possible input pop before marking
2428           //the buffer reading position
2429           char t = readCh();
2430           unread(t);
2431           int bufferPosMark = readBufferPos;
2432
2433           int end = readBufferPos + getEntityValue(name).length();
2434           for (int k = readBufferPos; k < end; k++)
2435             {
2436               t = readCh();
2437               if (t == '&')
2438                 {
2439                   t = readCh();
2440                   if (t  == '#')
2441                     {
2442                       //try to match a character ref
2443                       tryReadCharRef();
2444
2445                       //everything has been read
2446                       if (readBufferPos >= end)
2447                         {
2448                           break;
2449                         }
2450                       k = readBufferPos;
2451                       continue;
2452                     }
2453                   else if (Character.isLetter(t))
2454                     {
2455                       //looks like an entity ref
2456                       unread(t);
2457                       readNmtoken(true);
2458                       require(';');
2459
2460                       //everything has been read
2461                       if (readBufferPos >= end)
2462                         {
2463                           break;
2464                         }
2465                       k = readBufferPos;
2466                       continue;
2467                     }
2468                   error(" malformed entity reference");
2469                 }
2470
2471             }
2472           readBufferPos = bufferPosMark;
2473           break;
2474       case ENTITY_TEXT:
2475           if (externalAllowed)
2476             {
2477               pushURL(false, name, getEntityIds(name),
2478                       null, null, null, true);
2479             }
2480           else
2481             {
2482               error("reference to external entity in attribute value.",
2483                     name, null);
2484             }
2485           break;
2486       case ENTITY_NDATA:
2487           if (externalAllowed)
2488             {
2489               error("unparsed entity reference in content", name, null);
2490             }
2491           else
2492             {
2493               error("reference to external entity in attribute value.",
2494                     name, null);
2495             }
2496           break;
2497       default:
2498           throw new RuntimeException();
2499       }
2500   }
2501
2502   /**
2503    * Parse and expand a parameter entity reference.
2504    * <pre>
2505    * [69] PEReference ::= '%' Name ';'
2506    * </pre>
2507    * <p>NOTE: the '%' has already been read.
2508    */
2509   private void parsePEReference()
2510     throws SAXException, IOException
2511   {
2512     String name;
2513
2514     name = "%" + readNmtoken(true);
2515     require(';');
2516     switch (getEntityType(name))
2517       {
2518       case ENTITY_UNDECLARED:
2519         // VC: Entity Declared
2520         handler.verror("reference to undeclared parameter entity " + name);
2521
2522         // we should disable handling of all subsequent declarations
2523         // unless this is a standalone document (info discarded)
2524         break;
2525       case ENTITY_INTERNAL:
2526         if (inLiteral)
2527           {
2528             pushString(name, getEntityValue(name));
2529           }
2530         else
2531           {
2532             pushString(name, ' ' + getEntityValue(name) + ' ');
2533           }
2534         break;
2535       case ENTITY_TEXT:
2536         if (!inLiteral)
2537           {
2538             pushString(null, " ");
2539           }
2540         pushURL(true, name, getEntityIds(name), null, null, null, true);
2541         if (!inLiteral)
2542           {
2543             pushString(null, " ");
2544           }
2545         break;
2546       }
2547   }
2548
2549   /**
2550    * Parse an entity declaration.
2551    * <pre>
2552    * [70] EntityDecl ::= GEDecl | PEDecl
2553    * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
2554    * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
2555    * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
2556    * [74] PEDef ::= EntityValue | ExternalID
2557    * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2558    *       | 'PUBLIC' S PubidLiteral S SystemLiteral
2559    * [76] NDataDecl ::= S 'NDATA' S Name
2560    * </pre>
2561    * <p>NOTE: the '&lt;!ENTITY' has already been read.
2562    */
2563   private void parseEntityDecl()
2564     throws Exception
2565   {
2566     boolean peFlag = false;
2567     int flags = 0;
2568
2569     // Check for a parameter entity.
2570     expandPE = false;
2571     requireWhitespace();
2572     if (tryRead('%'))
2573       {
2574         peFlag = true;
2575         requireWhitespace();
2576       }
2577     expandPE = true;
2578
2579     // Read the entity name, and prepend
2580     // '%' if necessary.
2581     String name = readNmtoken(true);
2582     //NE08
2583     if (name.indexOf(':') >= 0)
2584       {
2585         error("Illegal character(':') in entity name ", name, null);
2586       }
2587     if (peFlag)
2588       {
2589         name = "%" + name;
2590       }
2591
2592     // Read the entity value.
2593     requireWhitespace();
2594     char c = readCh();
2595     unread (c);
2596     if (c == '"' || c == '\'')
2597       {
2598         // Internal entity ... replacement text has expanded refs
2599         // to characters and PEs, but not to general entities
2600         String value = readLiteral(flags);
2601         setInternalEntity(name, value);
2602       }
2603     else
2604       {
2605         // Read the external IDs
2606         ExternalIdentifiers ids = readExternalIds(false, false);
2607
2608         // Check for NDATA declaration.
2609         boolean white = tryWhitespace();
2610         if (!peFlag && tryRead("NDATA"))
2611           {
2612             if (!white)
2613               {
2614                 error("whitespace required before NDATA");
2615               }
2616             requireWhitespace();
2617             String notationName = readNmtoken(true);
2618             if (!skippedPE)
2619               {
2620                 setExternalEntity(name, ENTITY_NDATA, ids, notationName);
2621                 handler.unparsedEntityDecl(name, ids.publicId, ids.systemId,
2622                                            ids.baseUri, notationName);
2623               }
2624           }
2625         else if (!skippedPE)
2626           {
2627             setExternalEntity(name, ENTITY_TEXT, ids, null);
2628             handler.getDeclHandler()
2629               .externalEntityDecl(name, ids.publicId,
2630                                    handler.resolveURIs()
2631                                    // FIXME: ASSUMES not skipped
2632                                    // "false" forces error on bad URI
2633                                    ? handler.absolutize(ids.baseUri,
2634                                                         ids.systemId,
2635                                                         false)
2636                                    : ids.systemId);
2637           }
2638       }
2639
2640     // Finish the declaration.
2641     skipWhitespace();
2642     require('>');
2643   }
2644
2645   /**
2646    * Parse a notation declaration.
2647    * <pre>
2648    * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
2649    *    (ExternalID | PublicID) S? '&gt;'
2650    * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2651    * </pre>
2652    * <P>NOTE: the '&lt;!NOTATION' has already been read.
2653    */
2654   private void parseNotationDecl()
2655     throws Exception
2656   {
2657     String nname;
2658     ExternalIdentifiers ids;
2659
2660     requireWhitespace();
2661     nname = readNmtoken(true);
2662     //NE08
2663     if (nname.indexOf(':') >= 0)
2664       {
2665         error("Illegal character(':') in notation name ", nname, null);
2666       }
2667     requireWhitespace();
2668
2669     // Read the external identifiers.
2670     ids = readExternalIds(true, false);
2671
2672     // Register the notation.
2673     setNotation(nname, ids);
2674
2675     skipWhitespace();
2676     require('>');
2677   }
2678
2679   /**
2680    * Parse character data.
2681    * <pre>
2682    * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
2683    * </pre>
2684    */
2685   private void parseCharData()
2686     throws Exception
2687   {
2688     char c;
2689     int state = 0;
2690     boolean pureWhite = false;
2691
2692     // assert (dataBufferPos == 0);
2693
2694     // are we expecting pure whitespace?  it might be dirty...
2695     if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
2696       {
2697         pureWhite = true;
2698       }
2699
2700     // always report right out of readBuffer
2701     // to minimize (pointless) buffer copies
2702     while (true)
2703       {
2704         int lineAugment = 0;
2705         int columnAugment = 0;
2706         int i;
2707
2708 loop:
2709         for (i = readBufferPos; i < readBufferLength; i++)
2710           {
2711             switch (c = readBuffer[i])
2712               {
2713               case '\n':
2714                 lineAugment++;
2715                 columnAugment = 0;
2716                 // pureWhite unmodified
2717                 break;
2718               case '\r':  // should not happen!!
2719               case '\t':
2720               case ' ':
2721                 // pureWhite unmodified
2722                 columnAugment++;
2723                 break;
2724               case '&':
2725               case '<':
2726                 columnAugment++;
2727                 // pureWhite unmodified
2728                 // CLEAN end of text sequence
2729                 state = 1;
2730                 break loop;
2731               case ']':
2732                 // that's not a whitespace char, and
2733                 // can not terminate pure whitespace either
2734                 pureWhite = false;
2735                 if ((i + 2) < readBufferLength)
2736                   {
2737                     if (readBuffer [i + 1] == ']'
2738                         && readBuffer [i + 2] == '>')
2739                       {
2740                         // ERROR end of text sequence
2741                         state = 2;
2742                         break loop;
2743                       }
2744                   }
2745                 else
2746                   {
2747                     // FIXME missing two end-of-buffer cases
2748                   }
2749                 columnAugment++;
2750                 break;
2751               default:
2752                 if ((c < 0x0020 || c > 0xFFFD)
2753                     || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
2754                         && xmlVersion == XML_11))
2755                   {
2756                     error("illegal XML character U+"
2757                           + Integer.toHexString(c));
2758                   }
2759                 // that's not a whitespace char
2760                 pureWhite = false;
2761                 columnAugment++;
2762               }
2763           }
2764
2765         // report text thus far
2766         if (lineAugment > 0)
2767           {
2768             line += lineAugment;
2769             column = columnAugment;
2770           }
2771         else
2772           {
2773             column += columnAugment;
2774           }
2775
2776         // report characters/whitspace
2777         int length = i - readBufferPos;
2778
2779         if (length != 0)
2780           {
2781             if (pureWhite)
2782               {
2783                 handler.ignorableWhitespace(readBuffer,
2784                                             readBufferPos, length);
2785               }
2786             else
2787               {
2788                 handler.charData(readBuffer, readBufferPos, length);
2789               }
2790             readBufferPos = i;
2791           }
2792
2793         if (state != 0)
2794           {
2795             break;
2796           }
2797
2798         // fill next buffer from this entity, or
2799         // pop stack and continue with previous entity
2800         unread(readCh());
2801       }
2802     if (!pureWhite)
2803       {
2804         isDirtyCurrentElement = true;
2805       }
2806     // finish, maybe with error
2807     if (state != 1)  // finish, no error
2808       {
2809         error("character data may not contain ']]>'");
2810       }
2811   }
2812
2813   //////////////////////////////////////////////////////////////////////
2814   // High-level reading and scanning methods.
2815   //////////////////////////////////////////////////////////////////////
2816
2817   /**
2818    * Require whitespace characters.
2819    */
2820   private void requireWhitespace()
2821     throws SAXException, IOException
2822   {
2823     char c = readCh();
2824     if (isWhitespace(c))
2825       {
2826         skipWhitespace();
2827       }
2828     else
2829       {
2830         error("whitespace required", c, null);
2831       }
2832   }
2833
2834   /**
2835    * Skip whitespace characters.
2836    * <pre>
2837    * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2838    * </pre>
2839    */
2840   private void skipWhitespace()
2841     throws SAXException, IOException
2842   {
2843     // Start with a little cheat.  Most of
2844     // the time, the white space will fall
2845     // within the current read buffer; if
2846     // not, then fall through.
2847     if (USE_CHEATS)
2848       {
2849         int lineAugment = 0;
2850         int columnAugment = 0;
2851
2852 loop:
2853         for (int i = readBufferPos; i < readBufferLength; i++)
2854           {
2855             switch (readBuffer[i])
2856               {
2857               case ' ':
2858               case '\t':
2859               case '\r':
2860                 columnAugment++;
2861                 break;
2862               case '\n':
2863                 lineAugment++;
2864                 columnAugment = 0;
2865                 break;
2866               case '%':
2867                 if (expandPE)
2868                   {
2869                     break loop;
2870                   }
2871                 // else fall through...
2872               default:
2873                 readBufferPos = i;
2874                 if (lineAugment > 0)
2875                   {
2876                     line += lineAugment;
2877                     column = columnAugment;
2878                   }
2879                 else
2880                   {
2881                     column += columnAugment;
2882                   }
2883                 return;
2884               }
2885           }
2886       }
2887
2888     // OK, do it the slow way.
2889     char c = readCh ();
2890     while (isWhitespace(c))
2891       {
2892         c = readCh();
2893       }
2894     unread(c);
2895   }
2896
2897   /**
2898    * Read a name or (when parsing an enumeration) name token.
2899    * <pre>
2900    * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2901    * [7] Nmtoken ::= (NameChar)+
2902    * </pre>
2903    */
2904   private String readNmtoken(boolean isName)
2905     throws SAXException, IOException
2906   {
2907     char c;
2908
2909     if (USE_CHEATS)
2910       {
2911 loop:
2912         for (int i = readBufferPos; i < readBufferLength; i++)
2913           {
2914             c = readBuffer[i];
2915             switch (c)
2916               {
2917               case '%':
2918                 if (expandPE)
2919                   {
2920                     break loop;
2921                   }
2922                 // else fall through...
2923
2924                 // What may legitimately come AFTER a name/nmtoken?
2925               case '<': case '>': case '&':
2926               case ',': case '|': case '*': case '+': case '?':
2927               case ')':
2928               case '=':
2929               case '\'': case '"':
2930               case '[':
2931               case ' ': case '\t': case '\r': case '\n':
2932               case ';':
2933               case '/':
2934                 int start = readBufferPos;
2935                 if (i == start)
2936                   {
2937                     error("name expected", readBuffer[i], null);
2938                   }
2939                 readBufferPos = i;
2940                 return intern(readBuffer, start, i - start);
2941
2942               default:
2943                 // FIXME ... per IBM's OASIS test submission, these:
2944                 //   ?    U+06dd
2945                 //   Combining  U+309B
2946                 //these switches are kind of ugly but at least we won't
2947                 //have to go over the whole lits for each char
2948                 if (isName && i == readBufferPos)
2949                   {
2950                     char c2 = (char) (c & 0x00f0);
2951                     switch (c & 0xff00)
2952                       {
2953                         //starting with 01
2954                       case 0x0100:
2955                         switch (c2)
2956                           {
2957                           case 0x0030:
2958                             if (c == 0x0132 || c == 0x0133 || c == 0x013f)
2959                               {
2960                                 error("Not a name start character, U+"
2961                                       + Integer.toHexString(c));
2962                               }
2963                             break;
2964                           case 0x0040:
2965                             if (c == 0x0140 || c == 0x0149)
2966                               {
2967                                 error("Not a name start character, U+"
2968                                       + Integer.toHexString(c));
2969                               }
2970                             break;
2971                           case 0x00c0:
2972                             if (c == 0x01c4 || c == 0x01cc)
2973                               {
2974                                 error("Not a name start character, U+"
2975                                       + Integer.toHexString(c));
2976                               }
2977                             break;
2978                           case 0x00f0:
2979                             if (c == 0x01f1 || c == 0x01f3)
2980                               {
2981                                 error("Not a name start character, U+"
2982                                       + Integer.toHexString(c));
2983                               }
2984                             break;
2985                           case 0x00b0:
2986                             if (c == 0x01f1 || c == 0x01f3)
2987                               {
2988                                 error("Not a name start character, U+"
2989                                       + Integer.toHexString(c));
2990                               }
2991                             break;
2992                           default:
2993                             if (c == 0x017f)
2994                               {
2995                                 error("Not a name start character, U+"
2996                                       + Integer.toHexString(c));
2997                               }
2998                           }
2999
3000                         break;
3001                         //starting with 11
3002                       case 0x1100:
3003                         switch (c2)
3004                           {
3005                           case 0x0000:
3006                             if (c == 0x1104 || c == 0x1108 ||
3007                                 c == 0x110a || c == 0x110d)
3008                               {
3009                                 error("Not a name start character, U+"
3010                                       + Integer.toHexString(c));
3011                               }
3012                             break;
3013                           case 0x0030:
3014                             if (c == 0x113b || c == 0x113f)
3015                               {
3016                                 error("Not a name start character, U+"
3017                                       + Integer.toHexString(c));
3018                               }
3019                             break;
3020                           case 0x0040:
3021                             if (c == 0x1141 || c == 0x114d
3022                                 || c == 0x114f )
3023                               {
3024                                 error("Not a name start character, U+"
3025                                       + Integer.toHexString(c));
3026                               }
3027                             break;
3028                           case 0x0050:
3029                             if (c == 0x1151 || c == 0x1156)
3030                               {
3031                                 error("Not a name start character, U+"
3032                                       + Integer.toHexString(c));
3033                               }
3034                             break;
3035                           case 0x0060:
3036                             if (c == 0x1162 || c == 0x1164
3037                                 || c == 0x1166 || c == 0x116b
3038                                 || c == 0x116f)
3039                               {
3040                                 error("Not a name start character, U+"
3041                                       + Integer.toHexString(c));
3042                               }
3043                             break;
3044                           case 0x00b0:
3045                             if (c == 0x11b6 || c == 0x11b9
3046                                 || c == 0x11bb || c == 0x116f)
3047                               {
3048                                 error("Not a name start character, U+"
3049                                       + Integer.toHexString(c));
3050                               }
3051                             break;
3052                           default:
3053                             if (c == 0x1174 || c == 0x119f
3054                                 || c == 0x11ac || c == 0x11c3
3055                                 || c == 0x11f1)
3056                               {
3057                                 error("Not a name start character, U+"
3058                                       + Integer.toHexString(c));
3059                               }
3060                           }
3061                         break;
3062                       default:
3063                         if (c == 0x0e46 || c == 0x1011
3064                             || c == 0x212f || c == 0x0587
3065                             || c == 0x0230 )
3066                           {
3067                             error("Not a name start character, U+"
3068                                   + Integer.toHexString(c));
3069                           }
3070                       }
3071                   }
3072                 // punt on exact tests from Appendix A; approximate
3073                 // them using the Unicode ID start/part rules
3074                 if (i == readBufferPos && isName)
3075                   {
3076                     if (!Character.isUnicodeIdentifierStart(c)
3077                         && c != ':' && c != '_')
3078                       {
3079                         error("Not a name start character, U+"
3080                               + Integer.toHexString(c));
3081                       }
3082                   }
3083                 else if (!Character.isUnicodeIdentifierPart(c)
3084                          && c != '-' && c != ':' && c != '_' && c != '.'
3085                          && !isExtender(c))
3086                   {
3087                     error("Not a name character, U+"
3088                           + Integer.toHexString(c));
3089                   }
3090               }
3091           }
3092       }
3093
3094     nameBufferPos = 0;
3095
3096     // Read the first character.
3097 loop:
3098     while (true)
3099       {
3100         c = readCh();
3101         switch (c)
3102           {
3103           case '%':
3104           case '<': case '>': case '&':
3105           case ',': case '|': case '*': case '+': case '?':
3106           case ')':
3107           case '=':
3108           case '\'': case '"':
3109           case '[':
3110           case ' ': case '\t': case '\n': case '\r':
3111           case ';':
3112           case '/':
3113             unread(c);
3114             if (nameBufferPos == 0)
3115               {
3116                 error ("name expected");
3117               }
3118             // punt on exact tests from Appendix A, but approximate them
3119             if (isName
3120                 && !Character.isUnicodeIdentifierStart(nameBuffer[0])
3121                 && ":_".indexOf(nameBuffer[0]) == -1)
3122               {
3123                 error("Not a name start character, U+"
3124                       + Integer.toHexString(nameBuffer[0]));
3125               }
3126             String s = intern(nameBuffer, 0, nameBufferPos);
3127             nameBufferPos = 0;
3128             return s;
3129           default:
3130             // punt on exact tests from Appendix A, but approximate them
3131
3132             if ((nameBufferPos != 0 || !isName)
3133                 && !Character.isUnicodeIdentifierPart(c)
3134                 && ":-_.".indexOf(c) == -1
3135                 && !isExtender(c))
3136               {
3137                 error("Not a name character, U+"
3138                       + Integer.toHexString(c));
3139               }
3140             if (nameBufferPos >= nameBuffer.length)
3141               {
3142                 nameBuffer =
3143                   (char[]) extendArray(nameBuffer,
3144                                        nameBuffer.length, nameBufferPos);
3145               }
3146             nameBuffer[nameBufferPos++] = c;
3147           }
3148       }
3149   }
3150
3151   private static boolean isExtender(char c)
3152   {
3153     // [88] Extender ::= ...
3154     return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
3155       || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
3156       || (c >= 0x3031 && c <= 0x3035)
3157       || (c >= 0x309d && c <= 0x309e)
3158       || (c >= 0x30fc && c <= 0x30fe);
3159   }
3160
  /**
   * Read a literal.  With matching single or double quotes as
   * delimiters (and not embedded!) this is used to parse:
   * <pre>
   *  [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
   *  [10] AttValue ::= ... ([^&lt;&amp;] | Reference)* ...
   *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
   *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
   * </pre>
   * as well as the quoted strings in XML and text declarations
   * (for version, encoding, and standalone) which have their
   * own constraints.
   *
   * <p>The literal's characters are accumulated in the data buffer and
   * returned through <code>dataBufferToString()</code>.  While reading,
   * <code>inLiteral</code> is set and <code>doReport</code> is cleared;
   * <code>expandPE</code> and <code>doReport</code> are put back to their
   * previous values (and <code>inLiteral</code> cleared) on the normal
   * exit path and after an EOFException has been reported.
   *
   * @param flags A bit mask of LIT_* constants, as tested below:
   *   LIT_DISABLE_PE turns off <code>expandPE</code> while reading;
   *   LIT_ATTRIBUTE and LIT_PUBID map CR and LF to a space;
   *   LIT_ATTRIBUTE also maps TAB to a space and reports '&lt;' as an error;
   *   LIT_DISABLE_CREF copies "&amp;#" through instead of calling
   *   parseCharRef; LIT_ENTITY_REF expands entity references through
   *   parseEntityRef; LIT_DISABLE_EREF copies the '&amp;' through and lets
   *   the rest be read as ordinary characters; LIT_NORMALIZE runs
   *   dataBufferNormalize() on the result.
   * @return The literal's text, or null on the path taken after
   *   reporting a missing opening delimiter.
   */
  private String readLiteral(int flags)
    throws SAXException, IOException
  {
    char delim, c;
    // Remembered only for the end-of-input error message.
    int startLine = line;
    // Parser state that is overridden while inside the literal.
    boolean saved = expandPE;
    boolean savedReport = doReport;

    // Find the first delimiter.
    delim = readCh();
    if (delim != '"' && delim != '\'')
      {
        error("expected '\"' or \"'\"", delim, null);
        return null;
      }
    inLiteral = true;
    if ((flags & LIT_DISABLE_PE) != 0)
      {
        expandPE = false;
      }
    doReport = false;

    // Each level of input source has its own buffer; remember
    // ours, so we won't read the ending delimiter from any
    // other input source, regardless of entity processing.
    char[] ourBuf = readBuffer;

    // Read the literal.
    try
      {
        c = readCh();
        // NOTE(review): ampRead is assigned below but never read in
        // this method -- looks like dead state; confirm before removing.
        boolean ampRead = false;
loop:
        // The literal ends only at the matching delimiter seen while
        // reading from the same buffer the opening delimiter came from.
        while (! (c == delim && readBuffer == ourBuf))
          {
            switch (c)
              {
                // attributes and public ids are normalized
                // in almost the same ways
              case '\n':
              case '\r':
                if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
                  {
                    c = ' ';
                  }
                break;
              case '\t':
                if ((flags & LIT_ATTRIBUTE) != 0)
                  {
                    c = ' ';
                  }
                break;
              case '&':
                c = readCh();
                // Char refs are expanded immediately, except for
                // all the cases where it's deferred.
                if (c == '#')
                  {
                    if ((flags & LIT_DISABLE_CREF) != 0)
                      {
                        // Deferred: keep the '&' here; leaving the switch
                        // appends the '#' (still in c) as ordinary data.
                        dataBufferAppend('&');
                        break;
                      }
                    parseCharRef(false /* Do not do flushDataBuffer */);

                    // exotic WFness risk: this is an entity literal,
                    // dataBuffer [dataBufferPos - 1] == '&', and
                    // following chars are a _partial_ entity/char ref

                    // It looks like an entity ref ...
                  }
                else
                  {
                    // Not a char ref: push the character back so the
                    // reference name can be read from its first character.
                    unread(c);
                    // Expand it?
                    if ((flags & LIT_ENTITY_REF) > 0)
                      {
                        parseEntityRef(false);
                        if (String.valueOf(readBuffer).equals("&#38;"))
                          {
                            ampRead = true;
                          }
                        //Is it just data?
                      }
                    else if ((flags & LIT_DISABLE_EREF) != 0)
                      {
                        dataBufferAppend('&');

                        // OK, it will be an entity ref -- expanded later.
                      }
                    else
                      {
                        // Copy the reference through verbatim as
                        // "&name;" after checking its syntax.
                        String name = readNmtoken(true);
                        require(';');
                        dataBufferAppend('&');
                        dataBufferAppend(name);
                        dataBufferAppend(';');
                      }
                  }
                // The reference has been dealt with; fetch the next
                // character and skip the append at the loop bottom.
                c = readCh();
                continue loop;

              case '<':
                // and why?  Perhaps so "&foo;" expands the same
                // inside and outside an attribute?
                if ((flags & LIT_ATTRIBUTE) != 0)
                  {
                    error("attribute values may not contain '<'");
                  }
                break;

                // We don't worry about case '%' and PE refs, readCh does.

              default:
                break;
              }
            // Ordinary (possibly normalized) character: keep it.
            dataBufferAppend(c);
            c = readCh();
          }
      }
    catch (EOFException e)
      {
        error("end of input while looking for delimiter (started on line "
              + startLine + ')', null, new Character(delim).toString());
      }
    // Restore the state saved on entry.  NOTE(review): this is skipped
    // if any exception other than EOFException propagates from the loop.
    inLiteral = false;
    expandPE = saved;
    doReport = savedReport;

    // Normalise whitespace if necessary.
    if ((flags & LIT_NORMALIZE) > 0)
      {
        dataBufferNormalize();
      }

    // Return the value.
    return dataBufferToString();
  }
3312
3313   /**
3314    * Try reading external identifiers.
3315    * A system identifier is not required for notations.
3316    * @param inNotation Are we parsing a notation decl?
3317    * @param isSubset Parsing external subset decl (may be omitted)?
3318    * @return A three-member String array containing the identifiers,
3319    *  or nulls. Order: public, system, baseURI.
3320    */
3321   private ExternalIdentifiers readExternalIds(boolean inNotation,
3322                                               boolean isSubset)
3323     throws Exception
3324   {
3325     char c;
3326     ExternalIdentifiers ids = new ExternalIdentifiers();
3327     int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
3328
3329     if (tryRead("PUBLIC"))
3330       {
3331         requireWhitespace();
3332         ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags);
3333         if (inNotation)
3334           {
3335             skipWhitespace();
3336             c = readCh();
3337             unread(c);
3338             if (c == '"' || c == '\'')
3339               {
3340                 ids.systemId = readLiteral(flags);
3341               }
3342           }
3343         else
3344           {
3345             requireWhitespace();
3346             ids.systemId = readLiteral(flags);
3347           }
3348
3349         for (int i = 0; i < ids.publicId.length(); i++)
3350           {
3351             c = ids.publicId.charAt(i);
3352             if (c >= 'a' && c <= 'z')
3353               {
3354                 continue;
3355               }
3356             if (c >= 'A' && c <= 'Z')
3357               {
3358                 continue;
3359               }
3360             if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1)
3361               {
3362                 continue;
3363               }
3364             error("illegal PUBLIC id character U+"
3365                   + Integer.toHexString(c));
3366           }
3367       }
3368     else if (tryRead("SYSTEM"))
3369       {
3370         requireWhitespace();
3371         ids.systemId = readLiteral(flags);
3372       }
3373     else if (!isSubset)
3374       {
3375         error("missing SYSTEM or PUBLIC keyword");
3376       }
3377
3378     if (ids.systemId != null)
3379       {
3380         if (ids.systemId.indexOf('#') != -1)
3381           {
3382             handler.verror("SYSTEM id has a URI fragment: " + ids.systemId);
3383           }
3384         ids.baseUri = handler.getSystemId();
3385         if (ids.baseUri == null && uriWarnings)
3386           {
3387             handler.warn("No base URI; hope URI is absolute: "
3388                          + ids.systemId);
3389           }
3390       }
3391
3392     return ids;
3393   }
3394
3395   /**
3396    * Test if a character is whitespace.
3397    * <pre>
3398    * [3] S ::= (#x20 | #x9 | #xd | #xa)+
3399    * </pre>
3400    * @param c The character to test.
3401    * @return true if the character is whitespace.
3402    */
3403   private final boolean isWhitespace(char c)
3404   {
3405     if (c > 0x20)
3406       {
3407         return false;
3408       }
3409     if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
3410       {
3411         return true;
3412       }
3413     return false;  // illegal ...
3414   }
3415
3416   //////////////////////////////////////////////////////////////////////
3417   // Utility routines.
3418   //////////////////////////////////////////////////////////////////////
3419
3420   /**
3421    * Add a character to the data buffer.
3422    */
3423   private void dataBufferAppend(char c)
3424   {
3425     // Expand buffer if necessary.
3426     if (dataBufferPos >= dataBuffer.length)
3427       {
3428         dataBuffer = (char[]) extendArray(dataBuffer,
3429                                           dataBuffer.length, dataBufferPos);
3430       }
3431     dataBuffer[dataBufferPos++] = c;
3432   }
3433
3434   /**
3435    * Add a string to the data buffer.
3436    */
3437   private void dataBufferAppend(String s)
3438   {
3439     dataBufferAppend(s.toCharArray(), 0, s.length());
3440   }
3441
3442   /**
3443    * Append (part of) a character array to the data buffer.
3444    */
3445   private void dataBufferAppend(char[] ch, int start, int length)
3446   {
3447     dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3448                                       dataBufferPos + length);
3449
3450     System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
3451     dataBufferPos += length;
3452   }
3453
3454   /**
3455    * Normalise space characters in the data buffer.
3456    */
3457   private void dataBufferNormalize()
3458   {
3459     int i = 0;
3460     int j = 0;
3461     int end = dataBufferPos;
3462
3463     // Skip spaces at the start.
3464     while (j < end && dataBuffer[j] == ' ')
3465       {
3466         j++;
3467       }
3468
3469     // Skip whitespace at the end.
3470     while (end > j && dataBuffer[end - 1] == ' ')
3471       {
3472         end --;
3473       }
3474
3475     // Start copying to the left.
3476     while (j < end)
3477       {
3478
3479         char c = dataBuffer[j++];
3480
3481         // Normalise all other spaces to
3482         // a single space.
3483         if (c == ' ')
3484           {
3485             while (j < end && dataBuffer[j++] == ' ')
3486               {
3487                 continue;
3488               }
3489             dataBuffer[i++] = ' ';
3490             dataBuffer[i++] = dataBuffer[j - 1];
3491           }
3492         else
3493           {
3494             dataBuffer[i++] = c;
3495           }
3496       }
3497
3498     // The new length is <= the old one.
3499     dataBufferPos = i;
3500   }
3501
3502   /**
3503    * Convert the data buffer to a string.
3504    */
3505   private String dataBufferToString()
3506   {
3507     String s = new String(dataBuffer, 0, dataBufferPos);
3508     dataBufferPos = 0;
3509     return s;
3510   }
3511
3512   /**
3513    * Flush the contents of the data buffer to the handler, as
3514    * appropriate, and reset the buffer for new input.
3515    */
3516   private void dataBufferFlush()
3517     throws SAXException
3518   {
3519     if (currentElementContent == CONTENT_ELEMENTS
3520         && dataBufferPos > 0
3521         && !inCDATA)
3522       {
3523         // We can't just trust the buffer to be whitespace, there
3524         // are (error) cases when it isn't
3525         for (int i = 0; i < dataBufferPos; i++)
3526           {
3527             if (!isWhitespace(dataBuffer[i]))
3528               {
3529                 handler.charData(dataBuffer, 0, dataBufferPos);
3530                 dataBufferPos = 0;
3531               }
3532           }
3533         if (dataBufferPos > 0)
3534           {
3535             handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
3536             dataBufferPos = 0;
3537           }
3538       }
3539     else if (dataBufferPos > 0)
3540       {
3541         handler.charData(dataBuffer, 0, dataBufferPos);
3542         dataBufferPos = 0;
3543       }
3544   }
3545
3546   /**
3547    * Require a string to appear, or throw an exception.
3548    * <p><em>Precondition:</em> Entity expansion is not required.
3549    * <p><em>Precondition:</em> data buffer has no characters that
3550    * will get sent to the application.
3551    */
3552   private void require(String delim)
3553     throws SAXException, IOException
3554   {
3555     int length = delim.length();
3556     char[] ch;
3557
3558     if (length < dataBuffer.length)
3559       {
3560         ch = dataBuffer;
3561         delim.getChars(0, length, ch, 0);
3562       }
3563     else
3564       {
3565         ch = delim.toCharArray();
3566       }
3567
3568     if (USE_CHEATS && length <= (readBufferLength - readBufferPos))
3569       {
3570         int offset = readBufferPos;
3571
3572         for (int i = 0; i < length; i++, offset++)
3573           {
3574             if (ch[i] != readBuffer[offset])
3575               {
3576                 error ("required string", null, delim);
3577               }
3578           }
3579         readBufferPos = offset;
3580
3581       }
3582     else
3583       {
3584         for (int i = 0; i < length; i++)
3585           {
3586             require(ch[i]);
3587           }
3588       }
3589   }
3590
3591   /**
3592    * Require a character to appear, or throw an exception.
3593    */
3594   private void require(char delim)
3595     throws SAXException, IOException
3596   {
3597     char c = readCh();
3598
3599     if (c != delim)
3600       {
3601         error("required character", c, new Character(delim).toString());
3602       }
3603   }
3604
3605   /**
3606    * Create an interned string from a character array.
3607    * &AElig;lfred uses this method to create an interned version
3608    * of all names and name tokens, so that it can test equality
3609    * with <code>==</code> instead of <code>String.equals ()</code>.
3610    *
3611    * <p>This is much more efficient than constructing a non-interned
3612    * string first, and then interning it.
3613    *
3614    * @param ch an array of characters for building the string.
3615    * @param start the starting position in the array.
3616    * @param length the number of characters to place in the string.
3617    * @return an interned string.
3618    * @see #intern (String)
3619    * @see java.lang.String#intern
3620    */
3621   public String intern(char[] ch, int start, int length)
3622   {
3623     int index = 0;
3624     int hash = 0;
3625     Object[] bucket;
3626
3627     // Generate a hash code.  This is a widely used string hash,
3628     // often attributed to Brian Kernighan.
3629     for (int i = start; i < start + length; i++)
3630       {
3631         hash = 31 * hash + ch[i];
3632       }
3633     hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
3634
3635     // Get the bucket -- consists of {array,String} pairs
3636     if ((bucket = symbolTable[hash]) == null)
3637       {
3638         // first string in this bucket
3639         bucket = new Object[8];
3640
3641         // Search for a matching tuple, and
3642         // return the string if we find one.
3643       }
3644     else
3645       {
3646         while (index < bucket.length)
3647           {
3648             char[] chFound = (char[]) bucket[index];
3649
3650             // Stop when we hit an empty entry.
3651             if (chFound == null)
3652               {
3653                 break;
3654               }
3655
3656             // If they're the same length, check for a match.
3657             if (chFound.length == length)
3658               {
3659                 for (int i = 0; i < chFound.length; i++)
3660                   {
3661                     // continue search on failure
3662                     if (ch[start + i] != chFound[i])
3663                       {
3664                         break;
3665                       }
3666                     else if (i == length - 1)
3667                       {
3668                         // That's it, we have a match!
3669                         return (String) bucket[index + 1];
3670                       }
3671                   }
3672               }
3673             index += 2;
3674           }
3675         // Not found -- we'll have to add it.
3676
3677         // Do we have to grow the bucket?
3678         bucket = (Object[]) extendArray(bucket, bucket.length, index);
3679       }
3680     symbolTable[hash] = bucket;
3681
3682     // OK, add it to the end of the bucket -- "local" interning.
3683     // Intern "globally" to let applications share interning benefits.
3684     // That is, "!=" and "==" work on our strings, not just equals().
3685     String s = new String(ch, start, length).intern();
3686     bucket[index] = s.toCharArray();
3687     bucket[index + 1] = s;
3688     return s;
3689   }
3690
3691   /**
3692    * Ensure the capacity of an array, allocating a new one if
3693    * necessary.  Usually extends only for name hash collisions.
3694    */
3695   private Object extendArray(Object array, int currentSize, int requiredSize)
3696   {
3697     if (requiredSize < currentSize)
3698       {
3699         return array;
3700       }
3701     else
3702       {
3703         Object newArray = null;
3704         int newSize = currentSize * 2;
3705
3706         if (newSize <= requiredSize)
3707           {
3708             newSize = requiredSize + 1;
3709           }
3710
3711         if (array instanceof char[])
3712           {
3713             newArray = new char[newSize];
3714           }
3715         else if (array instanceof Object[])
3716           {
3717             newArray = new Object[newSize];
3718           }
3719         else
3720           {
3721             throw new RuntimeException();
3722           }
3723
3724         System.arraycopy(array, 0, newArray, 0, currentSize);
3725         return newArray;
3726       }
3727   }
3728
3729   //////////////////////////////////////////////////////////////////////
3730   // XML query routines.
3731   //////////////////////////////////////////////////////////////////////
3732
3733   boolean isStandalone()
3734   {
3735     return docIsStandalone;
3736   }
3737
3738   //
3739   // Elements
3740   //
3741
3742   private int getContentType(ElementDecl element, int defaultType)
3743   {
3744     int retval;
3745
3746     if (element == null)
3747       {
3748         return defaultType;
3749       }
3750     retval = element.contentType;
3751     if (retval == CONTENT_UNDECLARED)
3752       {
3753         retval = defaultType;
3754       }
3755     return retval;
3756   }
3757
3758   /**
3759    * Look up the content type of an element.
3760    * @param name The element type name.
3761    * @return An integer constant representing the content type.
3762    * @see #CONTENT_UNDECLARED
3763    * @see #CONTENT_ANY
3764    * @see #CONTENT_EMPTY
3765    * @see #CONTENT_MIXED
3766    * @see #CONTENT_ELEMENTS
3767    */
3768   public int getElementContentType(String name)
3769   {
3770     ElementDecl element = (ElementDecl) elementInfo.get(name);
3771     return getContentType(element, CONTENT_UNDECLARED);
3772   }
3773
3774   /**
3775    * Register an element.
3776    * Array format:
3777    *  [0] element type name
3778    *  [1] content model (mixed, elements only)
3779    *  [2] attribute hash table
3780    */
3781   private void setElement(String name, int contentType,
3782                           String contentModel, HashMap attributes)
3783     throws SAXException
3784   {
3785     if (skippedPE)
3786       {
3787         return;
3788       }
3789
3790     ElementDecl element = (ElementDecl) elementInfo.get(name);
3791
3792     // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
3793     if (element == null)
3794       {
3795         element = new ElementDecl();
3796         element.contentType = contentType;
3797         element.contentModel = contentModel;
3798         element.attributes = attributes;
3799         elementInfo.put(name, element);
3800         return;
3801       }
3802
3803     // <!ELEMENT ...> declaration?
3804     if (contentType != CONTENT_UNDECLARED)
3805       {
3806         // ... following an associated <!ATTLIST ...>
3807         if (element.contentType == CONTENT_UNDECLARED)
3808           {
3809             element.contentType = contentType;
3810             element.contentModel = contentModel;
3811           }
3812         else
3813           {
3814             // VC: Unique Element Type Declaration
3815             handler.verror("multiple declarations for element type: "
3816                            + name);
3817           }
3818       }
3819
3820     // first <!ATTLIST ...>, before <!ELEMENT ...> ?
3821     else if (attributes != null)
3822       {
3823         element.attributes = attributes;
3824       }
3825   }
3826
3827   /**
3828    * Look up the attribute hash table for an element.
3829    * The hash table is the second item in the element array.
3830    */
3831   private HashMap getElementAttributes(String name)
3832   {
3833     ElementDecl element = (ElementDecl) elementInfo.get(name);
3834     return (element == null) ? null : element.attributes;
3835   }
3836
3837   //
3838   // Attributes
3839   //
3840
3841   /**
3842    * Get the declared attributes for an element type.
3843    * @param elname The name of the element type.
3844    * @return An iterator over all the attributes declared for
3845    *   a specific element type.  The results will be valid only
3846    *   after the DTD (if any) has been parsed.
3847    * @see #getAttributeType
3848    * @see #getAttributeEnumeration
3849    * @see #getAttributeDefaultValueType
3850    * @see #getAttributeDefaultValue
3851    * @see #getAttributeExpandedValue
3852    */
3853   private Iterator declaredAttributes(ElementDecl element)
3854   {
3855     HashMap attlist;
3856
3857     if (element == null)
3858       {
3859         return null;
3860       }
3861     if ((attlist = element.attributes) == null)
3862       {
3863         return null;
3864       }
3865     return attlist.keySet().iterator();
3866   }
3867
3868   /**
3869    * Get the declared attributes for an element type.
3870    * @param elname The name of the element type.
3871    * @return An iterator over all the attributes declared for
3872    *   a specific element type.  The results will be valid only
3873    *   after the DTD (if any) has been parsed.
3874    * @see #getAttributeType
3875    * @see #getAttributeEnumeration
3876    * @see #getAttributeDefaultValueType
3877    * @see #getAttributeDefaultValue
3878    * @see #getAttributeExpandedValue
3879    */
3880   public Iterator declaredAttributes(String elname)
3881   {
3882     return declaredAttributes((ElementDecl) elementInfo.get(elname));
3883   }
3884
3885   /**
3886    * Retrieve the declared type of an attribute.
3887    * @param name The name of the associated element.
3888    * @param aname The name of the attribute.
3889    * @return An interend string denoting the type, or null
3890    *  indicating an undeclared attribute.
3891    */
3892   public String getAttributeType(String name, String aname)
3893   {
3894     AttributeDecl attribute = getAttribute(name, aname);
3895     return (attribute == null) ? null : attribute.type;
3896   }
3897
3898   /**
3899    * Retrieve the allowed values for an enumerated attribute type.
3900    * @param name The name of the associated element.
3901    * @param aname The name of the attribute.
3902    * @return A string containing the token list.
3903    */
3904   public String getAttributeEnumeration(String name, String aname)
3905   {
3906     AttributeDecl attribute = getAttribute(name, aname);
3907     // assert:  attribute.enumeration is "ENUMERATION" or "NOTATION"
3908     return (attribute == null) ? null : attribute.enumeration;
3909   }
3910
3911   /**
3912    * Retrieve the default value of a declared attribute.
3913    * @param name The name of the associated element.
3914    * @param aname The name of the attribute.
3915    * @return The default value, or null if the attribute was
3916    *   #IMPLIED or simply undeclared and unspecified.
3917    * @see #getAttributeExpandedValue
3918    */
3919   public String getAttributeDefaultValue(String name, String aname)
3920   {
3921     AttributeDecl attribute = getAttribute(name, aname);
3922     return (attribute == null) ? null : attribute.value;
3923   }
3924
3925     /*
3926
3927 // FIXME:  Leaving this in, until W3C finally resolves the confusion
// between parts of the XML 2nd REC about when entity declarations
3929 // are guaranteed to be known.  Current code matches what section 5.1
3930 // (conformance) describes, but some readings of the self-contradicting
3931 // text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that
3932 // attribute expansion/normalization must be deferred in some cases
3933 // (just TRY to identify them!).
3934
3935      * Retrieve the expanded value of a declared attribute.
3936      * <p>General entities (and char refs) will be expanded (once).
3937      * @param name The name of the associated element.
3938      * @param aname The name of the attribute.
3939      * @return The expanded default value, or null if the attribute was
3940      *   #IMPLIED or simply undeclared
3941      * @see #getAttributeDefaultValue
3942     public String getAttributeExpandedValue (String name, String aname)
3943     throws Exception
3944     {
3945   AttributeDecl attribute = getAttribute (name, aname);
3946
3947   if (attribute == null) {
3948       return null;
3949   } else if (attribute.defaultValue == null && attribute.value != null) {
3950       // we MUST use the same buf for both quotes else the literal
3951       // can't be properly terminated
3952       char buf [] = new char [1];
3953       int  flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
3954       String type = getAttributeType (name, aname);
3955
3956       if (type != "CDATA" && type != null)
3957     flags |= LIT_NORMALIZE;
3958       buf [0] = '"';
3959       pushCharArray (null, buf, 0, 1);
3960       pushString (null, attribute.value);
3961       pushCharArray (null, buf, 0, 1);
3962       attribute.defaultValue = readLiteral (flags);
3963   }
3964   return attribute.defaultValue;
3965     }
3966      */
3967
3968   /**
3969    * Retrieve the default value mode of a declared attribute.
3970    * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3971    * @see #ATTRIBUTE_DEFAULT_IMPLIED
3972    * @see #ATTRIBUTE_DEFAULT_REQUIRED
3973    * @see #ATTRIBUTE_DEFAULT_FIXED
3974    */
3975   public int getAttributeDefaultValueType(String name, String aname)
3976   {
3977     AttributeDecl attribute = getAttribute(name, aname);
3978     return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED :
3979       attribute.valueType;
3980   }
3981
3982   /**
3983    * Register an attribute declaration for later retrieval.
3984    * Format:
3985    * - String type
3986    * - String default value
3987    * - int value type
3988    * - enumeration
3989    * - processed default value
3990    */
3991   private void setAttribute(String elName, String name, String type,
3992                             String enumeration, String value, int valueType)
3993     throws Exception
3994   {
3995     HashMap attlist;
3996
3997     if (skippedPE)
3998       {
3999         return;
4000       }
4001
4002     // Create a new hashtable if necessary.
4003     attlist = getElementAttributes(elName);
4004     if (attlist == null)
4005       {
4006         attlist = new HashMap();
4007       }
4008
4009     // ignore multiple attribute declarations!
4010     if (attlist.get(name) != null)
4011       {
4012         // warn ...
4013         return;
4014       }
4015     else
4016       {
4017         AttributeDecl attribute = new AttributeDecl();
4018         attribute.type = type;
4019         attribute.value = value;
4020         attribute.valueType = valueType;
4021         attribute.enumeration = enumeration;
4022         attlist.put(name, attribute);
4023
4024         // save; but don't overwrite any existing <!ELEMENT ...>
4025         setElement(elName, CONTENT_UNDECLARED, null, attlist);
4026       }
4027   }
4028
4029   /**
4030    * Retrieve the attribute declaration for the given element name and name.
4031    */
4032   private AttributeDecl getAttribute(String elName, String name)
4033   {
4034     HashMap attlist = getElementAttributes(elName);
4035     return (attlist == null) ? null : (AttributeDecl) attlist.get(name);
4036   }
4037
4038   //
4039   // Entities
4040   //
4041
4042   /**
4043    * Find the type of an entity.
4044    * @returns An integer constant representing the entity type.
4045    * @see #ENTITY_UNDECLARED
4046    * @see #ENTITY_INTERNAL
4047    * @see #ENTITY_NDATA
4048    * @see #ENTITY_TEXT
4049    */
4050   public int getEntityType(String ename)
4051   {
4052     EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4053     return (entity == null) ?  ENTITY_UNDECLARED : entity.type;
4054   }
4055
4056   /**
4057    * Return an external entity's identifiers.
4058    * @param ename The name of the external entity.
4059    * @return The entity's public identifier, system identifier, and base URI.
4060    *  Null if the entity was not declared as an external entity.
4061    * @see #getEntityType
4062    */
4063   public ExternalIdentifiers getEntityIds(String ename)
4064   {
4065     EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4066     return (entity == null) ? null : entity.ids;
4067   }
4068
4069   /**
4070    * Return an internal entity's replacement text.
4071    * @param ename The name of the internal entity.
4072    * @return The entity's replacement text, or null if
4073    *   the entity was not declared as an internal entity.
4074    * @see #getEntityType
4075    */
4076   public String getEntityValue(String ename)
4077   {
4078     EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4079     return (entity == null) ? null : entity.value;
4080   }
4081
4082   /**
4083    * Register an entity declaration for later retrieval.
4084    */
4085   private void setInternalEntity(String eName, String value)
4086     throws SAXException
4087   {
4088     if (skippedPE)
4089       {
4090         return;
4091       }
4092
4093     if (entityInfo.get(eName) == null)
4094       {
4095         EntityInfo entity = new EntityInfo();
4096         entity.type = ENTITY_INTERNAL;
4097         entity.value = value;
4098         entityInfo.put(eName, entity);
4099       }
4100     if (handler.stringInterning)
4101       {
4102         if ("lt" == eName || "gt" == eName || "quot" == eName
4103             || "apos" == eName || "amp" == eName)
4104           {
4105             return;
4106           }
4107       }
4108     else
4109       {
4110         if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName)
4111             || "apos".equals(eName) || "amp".equals(eName))
4112           {
4113             return;
4114           }
4115       }
4116     handler.getDeclHandler().internalEntityDecl(eName, value);
4117   }
4118
4119   /**
4120    * Register an external entity declaration for later retrieval.
4121    */
4122   private void setExternalEntity(String eName, int eClass,
4123                                  ExternalIdentifiers ids, String nName)
4124   {
4125     if (entityInfo.get(eName) == null)
4126       {
4127         EntityInfo entity = new EntityInfo();
4128         entity.type = eClass;
4129         entity.ids = ids;
4130         entity.notationName = nName;
4131         entityInfo.put(eName, entity);
4132       }
4133   }
4134
4135   //
4136   // Notations.
4137   //
4138
4139   /**
4140    * Report a notation declaration, checking for duplicates.
4141    */
4142   private void setNotation(String nname, ExternalIdentifiers ids)
4143     throws SAXException
4144   {
4145     if (skippedPE)
4146       {
4147         return;
4148       }
4149
4150     handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri);
4151     if (notationInfo.get(nname) == null)
4152       {
4153         notationInfo.put(nname, nname);
4154       }
4155     else
4156       {
4157         // VC: Unique Notation Name
4158         handler.verror("Duplicate notation name decl: " + nname);
4159       }
4160   }
4161
4162   //
4163   // Location.
4164   //
4165
4166   /**
4167    * Return the current line number.
4168    */
4169   public int getLineNumber()
4170   {
4171     return line;
4172   }
4173
4174   /**
4175    * Return the current column number.
4176    */
4177   public int getColumnNumber()
4178   {
4179     return column;
4180   }
4181
4182   //////////////////////////////////////////////////////////////////////
4183   // High-level I/O.
4184   //////////////////////////////////////////////////////////////////////
4185
4186   /**
4187    * Read a single character from the readBuffer.
4188    * <p>The readDataChunk () method maintains the buffer.
4189    * <p>If we hit the end of an entity, try to pop the stack and
4190    * keep going.
4191    * <p> (This approach doesn't really enforce XML's rules about
4192    * entity boundaries, but this is not currently a validating
4193    * parser).
4194    * <p>This routine also attempts to keep track of the current
4195    * position in external entities, but it's not entirely accurate.
4196    * @return The next available input character.
4197    * @see #unread (char)
4198    * @see #readDataChunk
4199    * @see #readBuffer
4200    * @see #line
4201    * @return The next character from the current input source.
4202    */
4203   private char readCh()
4204     throws SAXException, IOException
4205   {
4206     // As long as there's nothing in the
4207     // read buffer, try reading more data
4208     // (for an external entity) or popping
4209     // the entity stack (for either).
4210     while (readBufferPos >= readBufferLength)
4211       {
4212         switch (sourceType)
4213           {
4214           case INPUT_READER:
4215           case INPUT_STREAM:
4216             readDataChunk();
4217             while (readBufferLength < 1)
4218               {
4219                 popInput();
4220                 if (readBufferLength < 1)
4221                   {
4222                     readDataChunk();
4223                   }
4224               }
4225             break;
4226
4227           default:
4228
4229             popInput();
4230             break;
4231           }
4232       }
4233
4234     char c = readBuffer[readBufferPos++];
4235
4236     if (c == '\n')
4237       {
4238         line++;
4239         column = 0;
4240       }
4241     else
4242       {
4243         if (c == '<')
4244           {
4245             /* the most common return to parseContent () ... NOP */
4246           }
4247         else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
4248                  || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
4249                      && xmlVersion == XML_11))
4250           {
4251             error("illegal XML character U+" + Integer.toHexString(c));
4252           }
4253
4254         // If we're in the DTD and in a context where PEs get expanded,
4255         // do so ... 1/14/2000 errata identify those contexts.  There
4256         // are also spots in the internal subset where PE refs are fatal
4257         // errors, hence yet another flag.
4258         else if (c == '%' && expandPE)
4259           {
4260             if (peIsError)
4261               {
4262                 error("PE reference within decl in internal subset.");
4263               }
4264             parsePEReference();
4265             return readCh();
4266           }
4267         column++;
4268       }
4269
4270     return c;
4271   }
4272
4273   /**
4274    * Push a single character back onto the current input stream.
4275    * <p>This method usually pushes the character back onto
4276    * the readBuffer.
4277    * <p>I don't think that this would ever be called with
4278    * readBufferPos = 0, because the methods always reads a character
4279    * before unreading it, but just in case, I've added a boundary
4280    * condition.
4281    * @param c The character to push back.
4282    * @see #readCh
4283    * @see #unread (char[])
4284    * @see #readBuffer
4285    */
4286   private void unread(char c)
4287     throws SAXException
4288   {
4289     // Normal condition.
4290     if (c == '\n')
4291       {
4292         line--;
4293         column = -1;
4294       }
4295     if (readBufferPos > 0)
4296       {
4297         readBuffer[--readBufferPos] = c;
4298       }
4299     else
4300       {
4301         pushString(null, new Character(c).toString());
4302       }
4303   }
4304
4305   /**
4306    * Push a char array back onto the current input stream.
4307    * <p>NOTE: you must <em>never</em> push back characters that you
4308    * haven't actually read: use pushString () instead.
4309    * @see #readCh
4310    * @see #unread (char)
4311    * @see #readBuffer
4312    * @see #pushString
4313    */
4314   private void unread(char[] ch, int length)
4315     throws SAXException
4316   {
4317     for (int i = 0; i < length; i++)
4318       {
4319         if (ch[i] == '\n')
4320           {
4321             line--;
4322             column = -1;
4323           }
4324       }
4325     if (length < readBufferPos)
4326       {
4327         readBufferPos -= length;
4328       }
4329     else
4330       {
4331         pushCharArray(null, ch, 0, length);
4332       }
4333   }
4334
4335   /**
4336    * Push, or skip, a new external input source.
4337    * The source will be some kind of parsed entity, such as a PE
4338    * (including the external DTD subset) or content for the body.
4339    *
4340    * @param url The java.net.URL object for the entity.
4341    * @see SAXDriver#resolveEntity
4342    * @see #pushString
4343    * @see #sourceType
4344    * @see #pushInput
4345    * @see #detectEncoding
4346    * @see #sourceType
4347    * @see #readBuffer
4348    */
4349   private void pushURL(boolean isPE,
4350                        String ename,
4351                        ExternalIdentifiers ids,
4352                        Reader reader,
4353                        InputStream stream,
4354                        String encoding,
4355                        boolean doResolve)
4356     throws SAXException, IOException
4357   {
4358     boolean ignoreEncoding;
4359     String systemId;
4360     InputSource source;
4361
4362     if (!isPE)
4363       {
4364         dataBufferFlush();
4365       }
4366
4367     scratch.setPublicId(ids.publicId);
4368     scratch.setSystemId(ids.systemId);
4369
4370     // See if we should skip or substitute the entity.
4371     // If we're not skipping, resolving reports startEntity()
4372     // and updates the (handler's) stack of URIs.
4373     if (doResolve)
4374       {
4375         // assert (stream == null && reader == null && encoding == null)
4376         source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri);
4377         if (source == null)
4378           {
4379             handler.warn("skipping entity: " + ename);
4380             handler.skippedEntity(ename);
4381             if (isPE)
4382               {
4383                 skippedPE = true;
4384               }
4385             return;
4386           }
4387
4388         // we might be using alternate IDs/encoding
4389         systemId = source.getSystemId();
4390         // The following warning and setting systemId was deleted bcause
4391         // the application has the option of not setting systemId
4392         // provided that it has set the characte/byte stream.
4393         /*
4394            if (systemId == null) {
4395            handler.warn ("missing system ID, using " + ids.systemId);
4396            systemId = ids.systemId;
4397            }
4398          */
4399       }
4400     else
4401       {
4402         // "[document]", or "[dtd]" via getExternalSubset()
4403         scratch.setCharacterStream(reader);
4404         scratch.setByteStream(stream);
4405         scratch.setEncoding(encoding);
4406         source = scratch;
4407         systemId = ids.systemId;
4408         if (handler.stringInterning)
4409           {
4410             handler.startExternalEntity(ename, systemId,
4411                                         "[document]" == ename);
4412           }
4413         else
4414           {
4415             handler.startExternalEntity(ename, systemId,
4416                                         "[document]".equals(ename));
4417           }
4418       }
4419
4420     // we may have been given I/O streams directly
4421     if (source.getCharacterStream() != null)
4422       {
4423         if (source.getByteStream() != null)
4424           error("InputSource has two streams!");
4425         reader = source.getCharacterStream();
4426       }
4427     else if (source.getByteStream() != null)
4428       {
4429         encoding = source.getEncoding();
4430         if (encoding == null)
4431           {
4432             stream = source.getByteStream();
4433           }
4434         else
4435           {
4436             try
4437               {
4438                 reader = new InputStreamReader(source.getByteStream(),
4439                                                encoding);
4440               }
4441             catch (IOException e)
4442               {
4443                 stream = source.getByteStream();
4444               }
4445           }
4446       }
4447     else if (systemId == null)
4448       {
4449         error("InputSource has no URI!");
4450       }
4451     scratch.setCharacterStream(null);
4452     scratch.setByteStream(null);
4453     scratch.setEncoding(null);
4454
4455     // Push the existing status.
4456     pushInput(ename);
4457
4458     // Create a new read buffer.
4459     // (Note the four-character margin)
4460     readBuffer = new char[READ_BUFFER_MAX + 4];
4461     readBufferPos = 0;
4462     readBufferLength = 0;
4463     readBufferOverflow = -1;
4464     is = null;
4465     line = 1;
4466     column = 0;
4467     currentByteCount = 0;
4468
4469     // If there's an explicit character stream, just
4470     // ignore encoding declarations.
4471     if (reader != null)
4472       {
4473         sourceType = INPUT_READER;
4474         this.reader = reader;
4475         tryEncodingDecl(true);
4476         return;
4477       }
4478
4479     // Else we handle the conversion, and need to ensure
4480     // it's done right.
4481     sourceType = INPUT_STREAM;
4482     if (stream != null)
4483       {
4484         is = stream;
4485       }
4486     else
4487       {
4488         // We have to open our own stream to the URL.
4489         URL url = new URL(systemId);
4490
4491         externalEntity = url.openConnection();
4492         externalEntity.connect();
4493         is = externalEntity.getInputStream();
4494       }
4495
4496     // If we get to here, there must be
4497     // an InputStream available.
4498     if (!is.markSupported())
4499       {
4500         is = new BufferedInputStream(is);
4501       }
4502
4503     // Get any external encoding label.
4504     if (encoding == null && externalEntity != null)
4505       {
4506         // External labels can be untrustworthy; filesystems in
4507         // particular often have the wrong default for content
4508         // that wasn't locally originated.  Those we autodetect.
4509         if (!"file".equals(externalEntity.getURL().getProtocol()))
4510           {
4511             int temp;
4512
4513             // application/xml;charset=something;otherAttr=...
4514             // ... with many variants on 'something'
4515             encoding = externalEntity.getContentType();
4516
4517             // MHK code (fix for Saxon 5.5.1/007):
4518             // protect against encoding==null
4519             if (encoding == null)
4520               {
4521                 temp = -1;
4522               }
4523             else
4524               {
4525                 temp = encoding.indexOf("charset");
4526               }
4527
4528             // RFC 2376 sez MIME text defaults to ASCII, but since the
4529             // JDK will create a MIME type out of thin air, we always
4530             // autodetect when there's no explicit charset attribute.
4531             if (temp < 0)
4532               {
4533                 encoding = null;  // autodetect
4534               }
4535             else
4536               {
4537                 // only this one attribute
4538                 if ((temp = encoding.indexOf(';')) > 0)
4539                   {
4540                     encoding = encoding.substring(0, temp);
4541                   }
4542
4543                 if ((temp = encoding.indexOf('=', temp + 7)) > 0)
4544                   {
4545                     encoding = encoding.substring(temp + 1);
4546
4547                     // attributes can have comment fields (RFC 822)
4548                     if ((temp = encoding.indexOf('(')) > 0)
4549                       {
4550                         encoding = encoding.substring(0, temp);
4551                       }
4552                     // ... and values may be quoted
4553                     if ((temp = encoding.indexOf('"')) > 0)
4554                       {
4555                         encoding =
4556                           encoding.substring(temp + 1,
4557                                              encoding.indexOf('"', temp + 2));
4558                       }
4559                     encoding = encoding.trim();
4560                   }
4561                 else
4562                   {
4563                     handler.warn("ignoring illegal MIME attribute: "
4564                                  + encoding);
4565                     encoding = null;
4566                   }
4567               }
4568           }
4569       }
4570
4571     // if we got an external encoding label, use it ...
4572     if (encoding != null)
4573       {
4574         this.encoding = ENCODING_EXTERNAL;
4575         setupDecoding(encoding);
4576         ignoreEncoding = true;
4577
4578         // ... else autodetect from first bytes.
4579       }
4580     else
4581       {
4582         detectEncoding();
4583         ignoreEncoding = false;
4584       }
4585
4586     // Read any XML or text declaration.
4587     // If we autodetected, it may tell us the "real" encoding.
4588     try
4589       {
4590         tryEncodingDecl(ignoreEncoding);
4591       }
4592     catch (UnsupportedEncodingException x)
4593       {
4594         encoding = x.getMessage();
4595
4596         // if we don't handle the declared encoding,
4597         // try letting a JVM InputStreamReader do it
4598         try
4599           {
4600             if (sourceType != INPUT_STREAM)
4601               {
4602                 throw x;
4603               }
4604
4605             is.reset();
4606             readBufferPos = 0;
4607             readBufferLength = 0;
4608             readBufferOverflow = -1;
4609             line = 1;
4610             currentByteCount = column = 0;
4611
4612             sourceType = INPUT_READER;
4613             this.reader = new InputStreamReader(is, encoding);
4614             is = null;
4615
4616             tryEncodingDecl(true);
4617
4618           }
4619         catch (IOException e)
4620           {
4621             error("unsupported text encoding",
4622                   encoding,
4623                   null);
4624           }
4625       }
4626   }
4627
4628   /**
4629    * Check for an encoding declaration.  This is the second part of the
4630    * XML encoding autodetection algorithm, relying on detectEncoding to
4631    * get to the point that this part can read any encoding declaration
4632    * in the document (using only US-ASCII characters).
4633    *
4634    * <p> Because this part starts to fill parser buffers with this data,
4635    * it's tricky to setup a reader so that Java's built-in decoders can be
4636    * used for the character encodings that aren't built in to this parser
4637    * (such as EUC-JP, KOI8-R, Big5, etc).
4638    *
4639    * @return any encoding in the declaration, uppercased; or null
4640    * @see detectEncoding
4641    */
4642   private String tryEncodingDecl(boolean ignoreEncoding)
4643     throws SAXException, IOException
4644   {
4645     // Read the XML/text declaration.
4646     if (tryRead("<?xml"))
4647       {
4648         if (tryWhitespace())
4649           {
4650             if (inputStack.size() > 0)
4651               {
4652                 return parseTextDecl(ignoreEncoding);
4653               }
4654             else
4655               {
4656                 return parseXMLDecl(ignoreEncoding);
4657               }
4658           }
4659         else
4660           {
4661             // <?xml-stylesheet ...?> or similar
4662             unread('l');
4663             unread('m');
4664             unread('x');
4665             unread('?');
4666             unread('<');
4667           }
4668       }
4669     return null;
4670   }
4671
4672   /**
4673    * Attempt to detect the encoding of an entity.
4674    * <p>The trick here (as suggested in the XML standard) is that
4675    * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
4676    * <b>must</b> begin with an XML declaration or an encoding
4677    * declaration; we simply have to look for "&lt;?xml" in various
4678    * encodings.
4679    * <p>This method has no way to distinguish among 8-bit encodings.
4680    * Instead, it sets up for UTF-8, then (possibly) revises its assumption
4681    * later in setupDecoding ().  Any ASCII-derived 8-bit encoding
4682    * should work, but most will be rejected later by setupDecoding ().
4683    * @see #tryEncoding (byte[], byte, byte, byte, byte)
4684    * @see #tryEncoding (byte[], byte, byte)
4685    * @see #setupDecoding
4686    */
4687   private void detectEncoding()
4688     throws SAXException, IOException
4689   {
4690     byte[] signature = new byte[4];
4691
4692     // Read the first four bytes for
4693     // autodetection.
4694     is.mark(4);
4695     is.read(signature);
4696     is.reset();
4697
4698     //
4699     // FIRST:  four byte encodings (who uses these?)
4700     //
4701     if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4702                     (byte) 0x00, (byte) 0x3c))
4703       {
4704         // UCS-4 must begin with "<?xml"
4705         // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
4706         // "UTF-32BE"
4707         encoding = ENCODING_UCS_4_1234;
4708       }
4709     else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4710                          (byte) 0x00, (byte) 0x00))
4711       {
4712         // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
4713         // "UTF-32LE"
4714         encoding = ENCODING_UCS_4_4321;
4715       }
4716     else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4717                          (byte) 0x3c, (byte) 0x00))
4718       {
4719         // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
4720         encoding = ENCODING_UCS_4_2143;
4721       }
4722     else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4723                          (byte) 0x00, (byte) 0x00))
4724       {
4725         // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
4726         encoding = ENCODING_UCS_4_3412;
4727
4728         // 00 00 fe ff UCS_4_1234 (with BOM)
4729         // ff fe 00 00 UCS_4_4321 (with BOM)
4730       }
4731
4732     //
4733     // SECOND:  two byte encodings
4734     // note ... with 1/14/2000 errata the XML spec identifies some
4735     // more "broken UTF-16" autodetection cases, with no XML decl,
4736     // which we don't handle here (that's legal too).
4737     //
4738     else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff))
4739       {
4740         // UCS-2 with a byte-order marker. (UTF-16)
4741         // 0xfe 0xff: UCS-2, big-endian (12)
4742         encoding = ENCODING_UCS_2_12;
4743         is.read(); is.read();
4744       }
4745     else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe))
4746       {
4747         // UCS-2 with a byte-order marker. (UTF-16)
4748         // 0xff 0xfe: UCS-2, little-endian (21)
4749         encoding = ENCODING_UCS_2_21;
4750         is.read(); is.read();
4751       }
4752     else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4753                          (byte) 0x00, (byte) 0x3f))
4754       {
4755         // UTF-16BE (otherwise, malformed UTF-16)
4756         // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
4757         encoding = ENCODING_UCS_2_12;
4758         error("no byte-order mark for UCS-2 entity");
4759       }
4760     else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4761                          (byte) 0x3f, (byte) 0x00))
4762       {
4763         // UTF-16LE (otherwise, malformed UTF-16)
4764         // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
4765         encoding = ENCODING_UCS_2_21;
4766         error("no byte-order mark for UCS-2 entity");
4767       }
4768
4769     //
4770     // THIRD:  ASCII-derived encodings, fixed and variable lengths
4771     //
4772     else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f,
4773                          (byte) 0x78, (byte) 0x6d))
4774       {
4775         // ASCII derived
4776         // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
4777         encoding = ENCODING_UTF_8;
4778         prefetchASCIIEncodingDecl();
4779       }
4780     else if (signature[0] == (byte) 0xef
4781              && signature[1] == (byte) 0xbb
4782              && signature[2] == (byte) 0xbf)
4783       {
4784         // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text)
4785         // this un-needed notion slipped into XML 2nd ed through a
4786         // "non-normative" erratum; now required by MSFT and UDDI,
4787         // and E22 made it normative.
4788         encoding = ENCODING_UTF_8;
4789         is.read(); is.read(); is.read();
4790       }
4791     else
4792       {
4793         // 4c 6f a7 94 ... we don't understand EBCDIC flavors
4794         // ... but we COULD at least kick in some fixed code page
4795
4796         // (default) UTF-8 without encoding/XML declaration
4797         encoding = ENCODING_UTF_8;
4798       }
4799   }
4800
4801   /**
4802    * Check for a four-byte signature.
4803    * <p>Utility routine for detectEncoding ().
4804    * <p>Always looks for some part of "<?XML" in a specific encoding.
4805    * @param sig The first four bytes read.
4806    * @param b1 The first byte of the signature
4807    * @param b2 The second byte of the signature
4808    * @param b3 The third byte of the signature
4809    * @param b4 The fourth byte of the signature
4810    * @see #detectEncoding
4811    */
4812   private static boolean tryEncoding(byte[] sig, byte b1, byte b2,
4813                                      byte b3, byte b4)
4814   {
4815     return (sig[0] == b1 && sig[1] == b2
4816             && sig[2] == b3 && sig[3] == b4);
4817   }
4818
4819   /**
4820    * Check for a two-byte signature.
4821    * <p>Looks for a UCS-2 byte-order mark.
4822    * <p>Utility routine for detectEncoding ().
4823    * @param sig The first four bytes read.
4824    * @param b1 The first byte of the signature
4825    * @param b2 The second byte of the signature
4826    * @see #detectEncoding
4827    */
4828   private static boolean tryEncoding(byte[] sig, byte b1, byte b2)
4829   {
4830     return ((sig[0] == b1) && (sig[1] == b2));
4831   }
4832
4833   /**
4834    * This method pushes a string back onto input.
4835    * <p>It is useful either as the expansion of an internal entity,
4836    * or for backtracking during the parse.
4837    * <p>Call pushCharArray () to do the actual work.
4838    * @param s The string to push back onto input.
4839    * @see #pushCharArray
4840    */
4841   private void pushString(String ename, String s)
4842     throws SAXException
4843   {
4844     char[] ch = s.toCharArray();
4845     pushCharArray(ename, ch, 0, ch.length);
4846   }
4847
4848   /**
4849    * Push a new internal input source.
4850    * <p>This method is useful for expanding an internal entity,
4851    * or for unreading a string of characters.  It creates a new
4852    * readBuffer containing the characters in the array, instead
4853    * of characters converted from an input byte stream.
4854    * @param ch The char array to push.
4855    * @see #pushString
4856    * @see #pushURL
4857    * @see #readBuffer
4858    * @see #sourceType
4859    * @see #pushInput
4860    */
4861   private void pushCharArray(String ename, char[] ch, int start, int length)
4862     throws SAXException
4863   {
4864     // Push the existing status
4865     pushInput(ename);
4866     if (ename != null && doReport)
4867       {
4868         dataBufferFlush();
4869         handler.startInternalEntity(ename);
4870       }
4871     sourceType = INPUT_INTERNAL;
4872     readBuffer = ch;
4873     readBufferPos = start;
4874     readBufferLength = length;
4875     readBufferOverflow = -1;
4876   }
4877
4878   /**
4879    * Save the current input source onto the stack.
4880    * <p>This method saves all of the global variables associated with
4881    * the current input source, so that they can be restored when a new
4882    * input source has finished.  It also tests for entity recursion.
4883    * <p>The method saves the following global variables onto a stack
4884    * using a fixed-length array:
4885    * <ol>
4886    * <li>sourceType
4887    * <li>externalEntity
4888    * <li>readBuffer
4889    * <li>readBufferPos
4890    * <li>readBufferLength
4891    * <li>line
4892    * <li>encoding
4893    * </ol>
4894    * @param ename The name of the entity (if any) causing the new input.
4895    * @see #popInput
4896    * @see #sourceType
4897    * @see #externalEntity
4898    * @see #readBuffer
4899    * @see #readBufferPos
4900    * @see #readBufferLength
4901    * @see #line
4902    * @see #encoding
4903    */
4904   private void pushInput(String ename)
4905     throws SAXException
4906   {
4907     // Check for entity recursion.
4908     if (ename != null)
4909       {
4910         Iterator entities = entityStack.iterator();
4911         while (entities.hasNext())
4912           {
4913             String e = (String) entities.next();
4914             if (e != null && e == ename)
4915               {
4916                 error("recursive reference to entity", ename, null);
4917               }
4918           }
4919       }
4920     entityStack.addLast(ename);
4921
4922     // Don't bother if there is no current input.
4923     if (sourceType == INPUT_NONE)
4924       {
4925         return;
4926       }
4927
4928     // Set up a snapshot of the current
4929     // input source.
4930     Input input = new Input();
4931
4932     input.sourceType = sourceType;
4933     input.externalEntity = externalEntity;
4934     input.readBuffer = readBuffer;
4935     input.readBufferPos = readBufferPos;
4936     input.readBufferLength = readBufferLength;
4937     input.line = line;
4938     input.encoding = encoding;
4939     input.readBufferOverflow = readBufferOverflow;
4940     input.is = is;
4941     input.currentByteCount = currentByteCount;
4942     input.column = column;
4943     input.reader = reader;
4944
4945     // Push it onto the stack.
4946     inputStack.addLast(input);
4947   }
4948
4949   /**
4950    * Restore a previous input source.
4951    * <p>This method restores all of the global variables associated with
4952    * the current input source.
4953    * @exception java.io.EOFException
4954    *    If there are no more entries on the input stack.
4955    * @see #pushInput
4956    * @see #sourceType
4957    * @see #externalEntity
4958    * @see #readBuffer
4959    * @see #readBufferPos
4960    * @see #readBufferLength
4961    * @see #line
4962    * @see #encoding
4963    */
4964   private void popInput()
4965     throws SAXException, IOException
4966   {
4967     String ename = (String) entityStack.removeLast();
4968
4969     if (ename != null && doReport)
4970       {
4971         dataBufferFlush();
4972       }
4973     switch (sourceType)
4974       {
4975       case INPUT_STREAM:
4976         handler.endExternalEntity(ename);
4977         is.close();
4978         break;
4979       case INPUT_READER:
4980         handler.endExternalEntity(ename);
4981         reader.close();
4982         break;
4983       case INPUT_INTERNAL:
4984         if (ename != null && doReport)
4985           {
4986             handler.endInternalEntity(ename);
4987           }
4988         break;
4989       }
4990
4991     // Throw an EOFException if there
4992     // is nothing else to pop.
4993     if (inputStack.isEmpty())
4994       {
4995         throw new EOFException("no more input");
4996       }
4997
4998     Input input = (Input) inputStack.removeLast();
4999
5000     sourceType = input.sourceType;
5001     externalEntity = input.externalEntity;
5002     readBuffer = input.readBuffer;
5003     readBufferPos = input.readBufferPos;
5004     readBufferLength = input.readBufferLength;
5005     line = input.line;
5006     encoding = input.encoding;
5007     readBufferOverflow = input.readBufferOverflow;
5008     is = input.is;
5009     currentByteCount = input.currentByteCount;
5010     column = input.column;
5011     reader = input.reader;
5012   }
5013
5014   /**
5015    * Return true if we can read the expected character.
5016    * <p>Note that the character will be removed from the input stream
5017    * on success, but will be put back on failure.  Do not attempt to
5018    * read the character again if the method succeeds.
5019    * @param delim The character that should appear next.  For a
5020    *        insensitive match, you must supply this in upper-case.
5021    * @return true if the character was successfully read, or false if
5022    *   it was not.
5023    * @see #tryRead (String)
5024    */
5025   private boolean tryRead(char delim)
5026     throws SAXException, IOException
5027   {
5028     char c;
5029
5030     // Read the character
5031     c = readCh();
5032
5033     // Test for a match, and push the character
5034     // back if the match fails.
5035     if (c == delim)
5036       {
5037         return true;
5038       }
5039     else
5040       {
5041         unread(c);
5042         return false;
5043       }
5044   }
5045
5046   /**
5047    * Return true if we can read the expected string.
5048    * <p>This is simply a convenience method.
5049    * <p>Note that the string will be removed from the input stream
5050    * on success, but will be put back on failure.  Do not attempt to
5051    * read the string again if the method succeeds.
5052    * <p>This method will push back a character rather than an
5053    * array whenever possible (probably the majority of cases).
5054    * @param delim The string that should appear next.
5055    * @return true if the string was successfully read, or false if
5056    *   it was not.
5057    * @see #tryRead (char)
5058    */
5059   private boolean tryRead(String delim)
5060     throws SAXException, IOException
5061   {
5062     return tryRead(delim.toCharArray());
5063   }
5064
5065   private boolean tryRead(char[] ch)
5066     throws SAXException, IOException
5067   {
5068     char c;
5069
5070     // Compare the input, character-
5071     // by character.
5072
5073     for (int i = 0; i < ch.length; i++)
5074       {
5075         c = readCh();
5076         if (c != ch[i])
5077           {
5078             unread(c);
5079             if (i != 0)
5080               {
5081                 unread(ch, i);
5082               }
5083             return false;
5084           }
5085       }
5086     return true;
5087   }
5088
5089   /**
5090    * Return true if we can read some whitespace.
5091    * <p>This is simply a convenience method.
5092    * <p>This method will push back a character rather than an
5093    * array whenever possible (probably the majority of cases).
5094    * @return true if whitespace was found.
5095    */
5096   private boolean tryWhitespace()
5097     throws SAXException, IOException
5098   {
5099     char c;
5100     c = readCh();
5101     if (isWhitespace(c))
5102       {
5103         skipWhitespace();
5104         return true;
5105       }
5106     else
5107       {
5108         unread(c);
5109         return false;
5110       }
5111   }
5112
5113   /**
5114    * Read all data until we find the specified string.
5115    * This is useful for scanning CDATA sections and PIs.
5116    * <p>This is inefficient right now, since it calls tryRead ()
5117    * for every character.
5118    * @param delim The string delimiter
5119    * @see #tryRead (String, boolean)
5120    * @see #readCh
5121    */
5122   private void parseUntil(String delim)
5123     throws SAXException, IOException
5124   {
5125     parseUntil(delim.toCharArray());
5126   }
5127
5128   private void parseUntil(char[] delim)
5129     throws SAXException, IOException
5130   {
5131     char c;
5132     int startLine = line;
5133
5134     try
5135       {
5136         while (!tryRead(delim))
5137           {
5138             c = readCh();
5139             dataBufferAppend(c);
5140           }
5141       }
5142     catch (EOFException e)
5143       {
5144         error("end of input while looking for delimiter "
5145               + "(started on line " + startLine
5146               + ')', null, new String(delim));
5147       }
5148   }
5149
5150   //////////////////////////////////////////////////////////////////////
5151   // Low-level I/O.
5152   //////////////////////////////////////////////////////////////////////
5153
5154   /**
5155    * Prefetch US-ASCII XML/text decl from input stream into read buffer.
5156    * Doesn't buffer more than absolutely needed, so that when an encoding
5157    * decl says we need to create an InputStreamReader, we can discard our
5158    * buffer and reset().  Caller knows the first chars of the decl exist
5159    * in the input stream.
5160    */
5161   private void prefetchASCIIEncodingDecl()
5162     throws SAXException, IOException
5163   {
5164     int ch;
5165     readBufferPos = readBufferLength = 0;
5166
5167     is.mark(readBuffer.length);
5168     while (true)
5169       {
5170         ch = is.read();
5171         readBuffer[readBufferLength++] = (char) ch;
5172         switch (ch)
5173           {
5174           case (int) '>':
5175             return;
5176           case -1:
5177             error("file ends before end of XML or encoding declaration.",
5178                   null, "?>");
5179           }
5180         if (readBuffer.length == readBufferLength)
5181           {
5182             error("unfinished XML or encoding declaration");
5183           }
5184       }
5185   }
5186
  /**
   * Read a chunk of data from an external input source.
   * <p>For a character stream (INPUT_READER) the characters are read
   * straight into readBuffer.  Otherwise this is a front-end that fills
   * the rawReadBuffer with bytes, then calls the appropriate encoding
   * handler to convert them into readBuffer.  In both cases carriage
   * returns are then normalized by filterCR ().
   * <p>On return readBufferPos and readBufferLength delimit the new
   * characters; an empty range (readBufferLength == readBufferPos)
   * means nothing more could be read.
   * @see #encoding
   * @see #rawReadBuffer
   * @see #readBuffer
   * @see #filterCR
   * @see #copyUtf8ReadBuffer
   * @see #copyIso8859_1ReadBuffer
   * @see #copyUcs2ReadBuffer
   * @see #copyUcs4ReadBuffer
   */
  private void readDataChunk()
    throws SAXException, IOException
  {
    int count;

    // See if we have any overflow (filterCR sets for CR at end).
    // A held-back CR becomes the first character of this chunk, and
    // new data is placed after it.
    if (readBufferOverflow > -1)
      {
        readBuffer[0] = (char) readBufferOverflow;
        readBufferOverflow = -1;
        readBufferPos = 1;
        sawCR = true;
      }
    else
      {
        readBufferPos = 0;
        sawCR = false;
      }

    // input from a character stream.
    if (sourceType == INPUT_READER)
      {
        count = reader.read(readBuffer,
                            readBufferPos, READ_BUFFER_MAX - readBufferPos);
        // A negative count means end of stream: only the carried-over
        // CR (if any) is available.
        if (count < 0)
          {
            readBufferLength = readBufferPos;
          }
        else
          {
            readBufferLength = readBufferPos + count;
          }
        // NOTE(review): filterCR starts at readBufferPos, so a CR
        // carried over into readBuffer[0] is not itself examined on
        // this path -- confirm that this is intended.
        if (readBufferLength > 0)
          {
            filterCR(count >= 0);
          }
        sawCR = false;
        return;
      }

    // Read as many bytes as possible into the raw buffer.
    count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);

    // Dispatch to an encoding-specific reader method to populate
    // the readBuffer.  In most parser speed profiles, these routines
    // show up at the top of the CPU usage chart.
    if (count > 0)
      {
        switch (encoding)
          {
            // one byte builtins
          case ENCODING_ASCII:
            // mask 0x0080: any byte with the high bit set is rejected
            copyIso8859_1ReadBuffer(count, (char) 0x0080);
            break;
          case ENCODING_UTF_8:
            copyUtf8ReadBuffer(count);
            break;
          case ENCODING_ISO_8859_1:
            // mask 0: every byte value is accepted
            copyIso8859_1ReadBuffer(count, (char) 0);
            break;

            // two byte builtins
          case ENCODING_UCS_2_12:
            copyUcs2ReadBuffer(count, 8, 0);
            break;
          case ENCODING_UCS_2_21:
            copyUcs2ReadBuffer(count, 0, 8);
            break;

            // four byte builtins (the arguments are the left shifts
            // applied to the first through fourth byte of each unit)
          case ENCODING_UCS_4_1234:
            copyUcs4ReadBuffer(count, 24, 16, 8, 0);
            break;
          case ENCODING_UCS_4_4321:
            copyUcs4ReadBuffer(count, 0, 8, 16, 24);
            break;
          case ENCODING_UCS_4_2143:
            copyUcs4ReadBuffer(count, 16, 24, 0, 8);
            break;
          case ENCODING_UCS_4_3412:
            copyUcs4ReadBuffer(count, 8, 0, 24, 16);
            break;
          }
      }
    else
      {
        // Nothing was read: the buffer holds only the carried-over
        // CR, if there was one.
        readBufferLength = readBufferPos;
      }

    // Rewind so that a carried-over CR in readBuffer[0] is part of
    // the data that gets filtered and read.
    readBufferPos = 0;

    // Filter out all carriage returns if we've seen any
    // (including any saved from a previous read)
    if (sawCR)
      {
        filterCR(count >= 0);
        sawCR = false;

        // must actively report EOF, lest some CRs get lost.
        // (filterCR may have held back the only character as
        // overflow, leaving this chunk empty although the stream
        // has not ended; read again in that case.)
        if (readBufferLength == 0 && count >= 0)
          {
            readDataChunk();
          }
      }

    if (count > 0)
      {
        currentByteCount += count;
      }
  }
5310
  /**
   * Filter carriage returns in the read buffer.
   * CRLF becomes LF; CR becomes LF.
   * <p>The characters between readBufferPos and readBufferLength are
   * compacted in place, and readBufferLength is updated to the new end.
   * A CR that is the very last character cannot be classified yet when
   * more data may follow (it could be the first half of a CRLF), so it
   * is removed from the buffer and stored in readBufferOverflow for the
   * next readDataChunk () call.
   * @param moreData true iff more data might come from the same source
   * @see #readDataChunk
   * @see #readBuffer
   * @see #readBufferOverflow
   */
  private void filterCR(boolean moreData)
  {
    // i is the write index, j the read index; i never runs ahead of j.
    int i, j;

    readBufferOverflow = -1;

loop:
    for (i = j = readBufferPos; j < readBufferLength; i++, j++)
      {
        switch (readBuffer[j])
          {
          case '\r':
            if (j == readBufferLength - 1)
              {
                if (moreData)
                  {
                    // Hold the CR back until the next chunk shows
                    // whether an LF follows it.  (The decrement is
                    // superseded by the assignment after the loop.)
                    readBufferOverflow = '\r';
                    readBufferLength--;
                  }
                else   // CR at end of buffer
                  {
                    readBuffer[i++] = '\n';
                  }
                break loop;
              }
            else if (readBuffer[j + 1] == '\n')
              {
                // CRLF: skip the CR, the single LF is written below
                j++;
              }
            readBuffer[i] = '\n';
            break;

          case '\n':
          default:
            // ordinary character: move it down to the write position
            readBuffer[i] = readBuffer[j];
            break;
          }
      }
    // i is now one past the last character kept
    readBufferLength = i;
  }
5359
  /**
   * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
   * <p>When readDataChunk () calls this method, the raw bytes are in
   * rawReadBuffer, and the final characters will appear in
   * readBuffer, starting at readBufferPos; readBufferLength is set to
   * the index after the last character written.
   * <p>Overlong encodings, encoded surrogates, values beyond 0x10ffff
   * and five or six byte sequences are reported through
   * encodingError ().  sawCR is set whenever a carriage return is
   * stored, so that the caller knows filterCR () is needed.
   * <p>Note that as of Unicode 3.1, good practice became a requirement,
   * so that each Unicode character has exactly one UTF-8 representation.
   * @param count The number of bytes to convert.
   * @see #readDataChunk
   * @see #rawReadBuffer
   * @see #readBuffer
   * @see #getNextUtf8Byte
   */
  private void copyUtf8ReadBuffer(int count)
    throws SAXException, IOException
  {
    // i indexes rawReadBuffer (input), j indexes readBuffer (output)
    int i = 0;
    int j = readBufferPos;
    int b1;
    char c = 0;

    /*
    // check once, so the runtime won't (if it's smart enough)
    if (count < 0 || count > rawReadBuffer.length)
    throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
     */

    while (i < count)
      {
        b1 = rawReadBuffer[i++];

        // Determine whether we are dealing
        // with a one-, two-, three-, or four-
        // byte sequence.
        // (bytes are signed, so a lead byte with the high bit set
        // shows up as a negative value)
        if (b1 < 0)
          {
            if ((b1 & 0xe0) == 0xc0)
              {
                // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
                c = (char) (((b1 & 0x1f) << 6)
                            | getNextUtf8Byte(i++, count));
                // anything below 0x80 should have used one byte
                if (c < 0x0080)
                  {
                    encodingError("Illegal two byte UTF-8 sequence",
                                  c, 0);
                  }

                //Sec 2.11
                // [1] the two-character sequence #xD #xA
                // [2] the two-character sequence #xD #x85
                // NOTE(review): sawCR is true if any CR has been seen
                // in this chunk, not only immediately before this
                // character -- confirm that dropping the character
                // here is intended in that case.
                if ((c == 0x0085 || c == 0x000a) && sawCR)
                  {
                    continue;
                  }

                // Sec 2.11
                // [3] the single character #x85

                // NOTE(review): the '\r' stored here is followed by
                // #x85 itself (the common store at the bottom of the
                // loop still runs) and sawCR is not set -- confirm.
                if (c == 0x0085 && xmlVersion == XML_11)
                  {
                    readBuffer[j++] = '\r';
                  }
              }
            else if ((b1 & 0xf0) == 0xe0)
              {
                // 3-byte sequence:
                // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
                // most CJKV characters
                c = (char) (((b1 & 0x0f) << 12) |
                            (getNextUtf8Byte(i++, count) << 6) |
                            getNextUtf8Byte(i++, count));
                //sec 2.11
                //[4] the single character #x2028
                // (stored as a CR so that filterCR turns it into LF)
                if (c == 0x2028 && xmlVersion == XML_11)
                  {
                    readBuffer[j++] = '\r';
                    sawCR = true;
                    continue;
                  }
                // reject overlong forms and encoded surrogates
                if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff))
                  {
                    encodingError("Illegal three byte UTF-8 sequence",
                                  c, 0);
                  }
              }
            else if ((b1 & 0xf8) == 0xf0)
              {
                // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
                //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
                // (uuuuu = wwww + 1)
                // "Surrogate Pairs" ... from the "Astral Planes"
                // Unicode 3.1 assigned the first characters there
                // (07 is an octal literal: the low three bits of b1)
                int iso646 = b1 & 07;
                iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
                iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
                iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);

                // anything up to 0xffff should have used fewer bytes
                if (iso646 <= 0xffff)
                  {
                    encodingError("Illegal four byte UTF-8 sequence",
                                  iso646, 0);
                  }
                else
                  {
                    if (iso646 > 0x0010ffff)
                      {
                        encodingError("UTF-8 value out of range for Unicode",
                                      iso646, 0);
                      }
                    // store as a high/low surrogate pair
                    iso646 -= 0x010000;
                    readBuffer[j++] = (char) (0xd800 | (iso646 >> 10));
                    readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff));
                    continue;
                  }
              }
            else
              {
                // The five and six byte encodings aren't supported;
                // they exceed the Unicode (and XML) range.
                encodingError("unsupported five or six byte UTF-8 sequence",
                              0xff & b1, i);
                // NOTREACHED
                c = 0;
              }
          }
        else
          {
            // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
            // (US-ASCII character, "common" case, one branch to here)
            c = (char) b1;
          }
        // common store for every path that did not "continue" above
        readBuffer[j++] = c;
        if (c == '\r')
          {
            sawCR = true;
          }
      }
    // How many characters have we read?
    readBufferLength = j;
  }
5500
5501   /**
5502    * Return the next byte value in a UTF-8 sequence.
5503    * If it is not possible to get a byte from the current
5504    * entity, throw an exception.
5505    * @param pos The current position in the rawReadBuffer.
5506    * @param count The number of bytes in the rawReadBuffer
5507    * @return The significant six bits of a non-initial byte in
5508    *   a UTF-8 sequence.
5509    * @exception EOFException If the sequence is incomplete.
5510    */
5511   private int getNextUtf8Byte(int pos, int count)
5512     throws SAXException, IOException
5513   {
5514     int val;
5515
5516     // Take a character from the buffer
5517     // or from the actual input stream.
5518     if (pos < count)
5519       {
5520         val = rawReadBuffer[pos];
5521       }
5522     else
5523       {
5524         val = is.read();
5525         if (val == -1)
5526           {
5527             encodingError("unfinished multi-byte UTF-8 sequence at EOF",
5528                           -1, pos);
5529           }
5530       }
5531
5532     // Check for the correct bits at the start.
5533     if ((val & 0xc0) != 0x80)
5534       {
5535         encodingError("bad continuation of multi-byte UTF-8 sequence",
5536                       val, pos + 1);
5537       }
5538
5539     // Return the significant bits.
5540     return (val & 0x3f);
5541   }
5542
5543   /**
5544    * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
5545    * UTF-16 characters.
5546    *
5547    * <p>When readDataChunk () calls this method, the raw bytes are in
5548    * rawReadBuffer, and the final characters will appear in
5549    * readBuffer.
5550    *
5551    * @param count The number of bytes to convert.
5552    * @param mask For ASCII conversion, 0x7f; else, 0xff.
5553    * @see #readDataChunk
5554    * @see #rawReadBuffer
5555    * @see #readBuffer
5556    */
5557   private void copyIso8859_1ReadBuffer(int count, char mask)
5558     throws IOException
5559   {
5560     int i, j;
5561     for (i = 0, j = readBufferPos; i < count; i++, j++)
5562       {
5563         char c = (char) (rawReadBuffer[i] & 0xff);
5564         if ((c & mask) != 0)
5565           {
5566             throw new CharConversionException("non-ASCII character U+"
5567                                               + Integer.toHexString(c));
5568           }
5569         if (c == 0x0085 && xmlVersion == XML_11)
5570           {
5571             c = '\r';
5572           }
5573         readBuffer[j] = c;
5574         if (c == '\r')
5575           {
5576             sawCR = true;
5577           }
5578       }
5579     readBufferLength = j;
5580   }
5581
5582   /**
5583    * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
5584    * (as used in Java string manipulation).
5585    *
5586    * <p>When readDataChunk () calls this method, the raw bytes are in
5587    * rawReadBuffer, and the final characters will appear in
5588    * readBuffer.
5589    * @param count The number of bytes to convert.
5590    * @param shift1 The number of bits to shift byte 1.
5591    * @param shift2 The number of bits to shift byte 2
5592    * @see #readDataChunk
5593    * @see #rawReadBuffer
5594    * @see #readBuffer
5595    */
5596   private void copyUcs2ReadBuffer(int count, int shift1, int shift2)
5597     throws SAXException
5598   {
5599     int j = readBufferPos;
5600
5601     if (count > 0 && (count % 2) != 0)
5602       {
5603         encodingError("odd number of bytes in UCS-2 encoding", -1, count);
5604       }
5605     // The loops are faster with less internal brancing; hence two
5606     if (shift1 == 0)
5607       {  // "UTF-16-LE"
5608         for (int i = 0; i < count; i += 2)
5609           {
5610             char c = (char) (rawReadBuffer[i + 1] << 8);
5611             c |= 0xff & rawReadBuffer[i];
5612             readBuffer[j++] = c;
5613             if (c == '\r')
5614               {
5615                 sawCR = true;
5616               }
5617           }
5618       }
5619     else
5620       {  // "UTF-16-BE"
5621         for (int i = 0; i < count; i += 2)
5622           {
5623             char c = (char) (rawReadBuffer[i] << 8);
5624             c |= 0xff & rawReadBuffer[i + 1];
5625             readBuffer[j++] = c;
5626             if (c == '\r')
5627               {
5628                 sawCR = true;
5629               }
5630           }
5631       }
5632     readBufferLength = j;
5633   }
5634
5635   /**
5636    * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
5637    *
5638    * <p>When readDataChunk () calls this method, the raw bytes are in
5639    * rawReadBuffer, and the final characters will appear in
5640    * readBuffer.
5641    * <p>Java has Unicode chars, and this routine uses surrogate pairs
5642    * for ISO-10646 values between 0x00010000 and 0x000fffff.  An
5643    * exception is thrown if the ISO-10646 character has no Unicode
5644    * representation.
5645    *
5646    * @param count The number of bytes to convert.
5647    * @param shift1 The number of bits to shift byte 1.
5648    * @param shift2 The number of bits to shift byte 2
5649    * @param shift3 The number of bits to shift byte 2
5650    * @param shift4 The number of bits to shift byte 2
5651    * @see #readDataChunk
5652    * @see #rawReadBuffer
5653    * @see #readBuffer
5654    */
5655   private void copyUcs4ReadBuffer(int count, int shift1, int shift2,
5656                                   int shift3, int shift4)
5657     throws SAXException
5658   {
5659     int j = readBufferPos;
5660
5661     if (count > 0 && (count % 4) != 0)
5662       {
5663         encodingError("number of bytes in UCS-4 encoding " +
5664                       "not divisible by 4",
5665                       -1, count);
5666       }
5667     for (int i = 0; i < count; i += 4)
5668       {
5669         int value = (((rawReadBuffer [i] & 0xff) << shift1) |
5670                      ((rawReadBuffer [i + 1] & 0xff) << shift2) |
5671                      ((rawReadBuffer [i + 2] & 0xff) << shift3) |
5672                      ((rawReadBuffer [i + 3] & 0xff) << shift4));
5673         if (value < 0x0000ffff)
5674           {
5675             readBuffer [j++] = (char) value;
5676             if (value == (int) '\r')
5677               {
5678                 sawCR = true;
5679               }
5680           }
5681         else if (value < 0x0010ffff)
5682           {
5683             value -= 0x010000;
5684             readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
5685             readBuffer[j++] = (char) (0xdc | (value & 0x03ff));
5686           }
5687         else
5688           {
5689             encodingError("UCS-4 value out of range for Unicode",
5690                           value, i);
5691           }
5692       }
5693     readBufferLength = j;
5694   }
5695
5696   /**
5697    * Report a character encoding error.
5698    */
5699   private void encodingError(String message, int value, int offset)
5700     throws SAXException
5701   {
5702     if (value != -1)
5703       {
5704         message = message + " (character code: 0x" +
5705           Integer.toHexString(value) + ')';
5706         error(message);
5707       }
5708   }
5709
5710   //////////////////////////////////////////////////////////////////////
5711   // Local Variables.
5712   //////////////////////////////////////////////////////////////////////
5713
5714   /**
5715    * Re-initialize the variables for each parse.
5716    */
5717   private void initializeVariables()
5718   {
5719     // First line
5720     line = 1;
5721     column = 0;
5722
5723     // Set up the buffers for data and names
5724     dataBufferPos = 0;
5725     dataBuffer = new char[DATA_BUFFER_INITIAL];
5726     nameBufferPos = 0;
5727     nameBuffer = new char[NAME_BUFFER_INITIAL];
5728
5729     // Set up the DTD hash tables
5730     elementInfo = new HashMap();
5731     entityInfo = new HashMap();
5732     notationInfo = new HashMap();
5733     skippedPE = false;
5734
5735     // Set up the variables for the current
5736     // element context.
5737     currentElement = null;
5738     currentElementContent = CONTENT_UNDECLARED;
5739
5740     // Set up the input variables
5741     sourceType = INPUT_NONE;
5742     inputStack = new LinkedList();
5743     entityStack = new LinkedList();
5744     externalEntity = null;
5745     tagAttributePos = 0;
5746     tagAttributes = new String[100];
5747     rawReadBuffer = new byte[READ_BUFFER_MAX];
5748     readBufferOverflow = -1;
5749
5750     scratch = new InputSource();
5751
5752     inLiteral = false;
5753     expandPE = false;
5754     peIsError = false;
5755
5756     doReport = false;
5757
5758     inCDATA = false;
5759
5760     symbolTable = new Object[SYMBOL_TABLE_LENGTH][];
5761   }
5762
5763   static class ExternalIdentifiers
5764   {
5765
5766     String publicId;
5767     String systemId;
5768     String baseUri;
5769
5770     ExternalIdentifiers()
5771     {
5772     }
5773
5774     ExternalIdentifiers(String publicId, String systemId, String baseUri)
5775     {
5776       this.publicId = publicId;
5777       this.systemId = systemId;
5778       this.baseUri = baseUri;
5779     }
5780
5781   }
5782
5783   static class EntityInfo
5784   {
5785
5786     int type;
5787     ExternalIdentifiers ids;
5788     String value;
5789     String notationName;
5790
5791   }
5792
5793   static class AttributeDecl
5794   {
5795
5796     String type;
5797     String value;
5798     int valueType;
5799     String enumeration;
5800     String defaultValue;
5801
5802   }
5803
5804   static class ElementDecl
5805   {
5806
5807     int contentType;
5808     String contentModel;
5809     HashMap attributes;
5810
5811   }
5812
5813   static class Input
5814   {
5815
5816     int sourceType;
5817     URLConnection externalEntity;
5818     char[] readBuffer;
5819     int readBufferPos;
5820     int readBufferLength;
5821     int line;
5822     int encoding;
5823     int readBufferOverflow;
5824     InputStream is;
5825     int currentByteCount;
5826     int column;
5827     Reader reader;
5828
5829   }
5830
5831 }
5832