Dead
[official-gcc.git] / gomp-20050608-branch / libjava / classpath / gnu / xml / aelfred2 / XmlParser.java
blobab2ed16f94688de93776e20cdbc1a1a41b3a4dd7
1 /* XmlParser.java --
2 Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version.
38 Partly derived from code which carried the following notice:
40 Copyright (c) 1997, 1998 by Microstar Software Ltd.
42 AElfred is free for both commercial and non-commercial use and
43 redistribution, provided that Microstar's copyright and disclaimer are
44 retained intact. You are free to modify AElfred for your own use and
45 to redistribute AElfred with your modifications, provided that the
46 modifications are clearly documented.
48 This program is distributed in the hope that it will be useful, but
49 WITHOUT ANY WARRANTY; without even the implied warranty of
50 merchantability or fitness for a particular purpose. Please use it AT
51 YOUR OWN RISK.
54 package gnu.xml.aelfred2;
56 import gnu.java.security.action.GetPropertyAction;
58 import java.io.BufferedInputStream;
59 import java.io.CharConversionException;
60 import java.io.EOFException;
61 import java.io.InputStream;
62 import java.io.InputStreamReader;
63 import java.io.IOException;
64 import java.io.Reader;
65 import java.io.UnsupportedEncodingException;
66 import java.net.URL;
67 import java.net.URLConnection;
68 import java.security.AccessController;
70 import java.util.Iterator;
71 import java.util.HashMap;
72 import java.util.LinkedList;
74 import org.xml.sax.InputSource;
75 import org.xml.sax.SAXException;
/**
 * Parse XML documents and return parse events through call-backs.
 * Use the <code>SAXDriver</code> class as your entry point, as all
 * internal parser interfaces are subject to change.
 *
 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
 *  (version 1.2a with bugfixes)
 * @author Updated by David Brownell &lt;dbrownell@users.sourceforge.net&gt;
 * @see SAXDriver
 */
88 final class XmlParser
91 // avoid slow per-character readCh()
92 private final static boolean USE_CHEATS = true;
94 ////////////////////////////////////////////////////////////////////////
95 // Constants.
96 ////////////////////////////////////////////////////////////////////////
99 // Constants for element content type.
103 * Constant: an element has not been declared.
104 * @see #getElementContentType
106 public final static int CONTENT_UNDECLARED = 0;
109 * Constant: the element has a content model of ANY.
110 * @see #getElementContentType
112 public final static int CONTENT_ANY = 1;
115 * Constant: the element has declared content of EMPTY.
116 * @see #getElementContentType
118 public final static int CONTENT_EMPTY = 2;
121 * Constant: the element has mixed content.
122 * @see #getElementContentType
124 public final static int CONTENT_MIXED = 3;
127 * Constant: the element has element content.
128 * @see #getElementContentType
130 public final static int CONTENT_ELEMENTS = 4;
134 // Constants for the entity type.
138 * Constant: the entity has not been declared.
139 * @see #getEntityType
141 public final static int ENTITY_UNDECLARED = 0;
144 * Constant: the entity is internal.
145 * @see #getEntityType
147 public final static int ENTITY_INTERNAL = 1;
150 * Constant: the entity is external, non-parsable data.
151 * @see #getEntityType
153 public final static int ENTITY_NDATA = 2;
156 * Constant: the entity is external XML data.
157 * @see #getEntityType
159 public final static int ENTITY_TEXT = 3;
162 // Attribute type constants are interned literal strings.
166 // Constants for supported encodings. "external" is just a flag.
168 private final static int ENCODING_EXTERNAL = 0;
169 private final static int ENCODING_UTF_8 = 1;
170 private final static int ENCODING_ISO_8859_1 = 2;
171 private final static int ENCODING_UCS_2_12 = 3;
172 private final static int ENCODING_UCS_2_21 = 4;
173 private final static int ENCODING_UCS_4_1234 = 5;
174 private final static int ENCODING_UCS_4_4321 = 6;
175 private final static int ENCODING_UCS_4_2143 = 7;
176 private final static int ENCODING_UCS_4_3412 = 8;
177 private final static int ENCODING_ASCII = 9;
180 // Constants for attribute default value.
184 * Constant: the attribute is not declared.
185 * @see #getAttributeDefaultValueType
187 public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
190 * Constant: the attribute has a literal default value specified.
191 * @see #getAttributeDefaultValueType
192 * @see #getAttributeDefaultValue
194 public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
197 * Constant: the attribute was declared #IMPLIED.
198 * @see #getAttributeDefaultValueType
200 public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
203 * Constant: the attribute was declared #REQUIRED.
204 * @see #getAttributeDefaultValueType
206 public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
209 * Constant: the attribute was declared #FIXED.
210 * @see #getAttributeDefaultValueType
211 * @see #getAttributeDefaultValue
213 public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
216 // Constants for input.
218 private final static int INPUT_NONE = 0;
219 private final static int INPUT_INTERNAL = 1;
220 private final static int INPUT_STREAM = 3;
221 private final static int INPUT_READER = 5;
224 // Flags for reading literals.
226 // expand general entity refs (attribute values in dtd and content)
227 private final static int LIT_ENTITY_REF = 2;
228 // normalize this value (space chars) (attributes, public ids)
229 private final static int LIT_NORMALIZE = 4;
230 // literal is an attribute value
231 private final static int LIT_ATTRIBUTE = 8;
232 // don't expand parameter entities
233 private final static int LIT_DISABLE_PE = 16;
234 // don't expand [or parse] character refs
235 private final static int LIT_DISABLE_CREF = 32;
236 // don't parse general entity refs
237 private final static int LIT_DISABLE_EREF = 64;
238 // literal is a public ID value
239 private final static int LIT_PUBID = 256;
242 // Flags affecting PE handling in DTDs (if expandPE is true).
243 // PEs expand with space padding, except inside literals.
245 private final static int CONTEXT_NORMAL = 0;
246 private final static int CONTEXT_LITERAL = 1;
248 // Emit warnings for relative URIs with no base URI.
249 static boolean uriWarnings;
250 static
252 String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
253 GetPropertyAction a = new GetPropertyAction(key);
254 uriWarnings = "true".equals(AccessController.doPrivileged(a));
258 // The current XML handler interface.
260 private SAXDriver handler;
263 // I/O information.
265 private Reader reader; // current reader
266 private InputStream is; // current input stream
267 private int line; // current line number
268 private int column; // current column number
269 private int sourceType; // type of input source
270 private LinkedList inputStack; // stack of input soruces
271 private URLConnection externalEntity; // current external entity
272 private int encoding; // current character encoding
273 private int currentByteCount; // bytes read from current source
274 private InputSource scratch; // temporary
277 // Buffers for decoded but unparsed character input.
279 private char[] readBuffer;
280 private int readBufferPos;
281 private int readBufferLength;
282 private int readBufferOverflow; // overflow from last data chunk.
285 // Buffer for undecoded raw byte input.
287 private final static int READ_BUFFER_MAX = 16384;
288 private byte[] rawReadBuffer;
292 // Buffer for attribute values, char refs, DTD stuff.
294 private static int DATA_BUFFER_INITIAL = 4096;
295 private char[] dataBuffer;
296 private int dataBufferPos;
299 // Buffer for parsed names.
301 private static int NAME_BUFFER_INITIAL = 1024;
302 private char[] nameBuffer;
303 private int nameBufferPos;
306 // Save any standalone flag
308 private boolean docIsStandalone;
311 // Hashtables for DTD information on elements, entities, and notations.
312 // Populated until we start ignoring decls (because of skipping a PE)
314 private HashMap elementInfo;
315 private HashMap entityInfo;
316 private HashMap notationInfo;
317 private boolean skippedPE;
320 // Element type currently in force.
322 private String currentElement;
323 private int currentElementContent;
326 // Stack of entity names, to detect recursion.
328 private LinkedList entityStack;
331 // PE expansion is enabled in most chunks of the DTD, not all.
332 // When it's enabled, literals are treated differently.
334 private boolean inLiteral;
335 private boolean expandPE;
336 private boolean peIsError;
339 // can't report entity expansion inside two constructs:
340 // - attribute expansions (internal entities only)
341 // - markup declarations (parameter entities only)
343 private boolean doReport;
346 // Symbol table, for caching interned names.
348 // These show up wherever XML names or nmtokens are used: naming elements,
349 // attributes, PIs, notations, entities, and enumerated attribute values.
351 // NOTE: This hashtable doesn't grow. The default size is intended to be
352 // rather large for most documents. Example: one snapshot of the DocBook
353 // XML 4.1 DTD used only about 350 such names. As a rule, only pathological
354 // documents (ones that don't reuse names) should ever see much collision.
356 // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
357 // "2039" keeps the hash table size at about two memory pages on typical
358 // 32 bit hardware.
360 private final static int SYMBOL_TABLE_LENGTH = 2039;
362 private Object[][] symbolTable;
365 // Hash table of attributes found in current start tag.
367 private String[] tagAttributes;
368 private int tagAttributePos;
371 // Utility flag: have we noticed a CR while reading the last
372 // data chunk? If so, we will have to go back and normalise
373 // CR or CR/LF line ends.
375 private boolean sawCR;
378 // Utility flag: are we in CDATA? If so, whitespace isn't ignorable.
380 private boolean inCDATA;
383 // Xml version.
385 private static final int XML_10 = 0;
386 private static final int XML_11 = 1;
387 private int xmlVersion = XML_10;
389 //////////////////////////////////////////////////////////////////////
390 // Constructors.
391 ////////////////////////////////////////////////////////////////////////
394 * Construct a new parser with no associated handler.
395 * @see #setHandler
396 * @see #parse
398 // package private
399 XmlParser()
404 * Set the handler that will receive parsing events.
405 * @param handler The handler to receive callback events.
406 * @see #parse
408 // package private
409 void setHandler(SAXDriver handler)
411 this.handler = handler;
415 * Parse an XML document from the character stream, byte stream, or URI
416 * that you provide (in that order of preference). Any URI that you
417 * supply will become the base URI for resolving relative URI, and may
418 * be used to acquire a reader or byte stream.
420 * <p> Only one thread at a time may use this parser; since it is
421 * private to this package, post-parse cleanup is done by the caller,
422 * which MUST NOT REUSE the parser (just null it).
424 * @param systemId Absolute URI of the document; should never be null,
425 * but may be so iff a reader <em>or</em> a stream is provided.
426 * @param publicId The public identifier of the document, or null.
427 * @param reader A character stream; must be null if stream isn't.
428 * @param stream A byte input stream; must be null if reader isn't.
429 * @param encoding The suggested encoding, or null if unknown.
430 * @exception java.lang.Exception Basically SAXException or IOException
432 // package private
433 void doParse(String systemId, String publicId, Reader reader,
434 InputStream stream, String encoding)
435 throws Exception
437 if (handler == null)
439 throw new IllegalStateException("no callback handler");
442 initializeVariables();
444 // predeclare the built-in entities here (replacement texts)
445 // we don't need to intern(), since we're guaranteed literals
446 // are always (globally) interned.
447 setInternalEntity("amp", "&#38;");
448 setInternalEntity("lt", "&#60;");
449 setInternalEntity("gt", "&#62;");
450 setInternalEntity("apos", "&#39;");
451 setInternalEntity("quot", "&#34;");
455 // pushURL first to ensure locator is correct in startDocument
456 // ... it might report an IO or encoding exception.
457 handler.startDocument();
458 pushURL(false, "[document]",
459 // default baseURI: null
460 new ExternalIdentifiers(publicId, systemId, null),
461 reader, stream, encoding, false);
463 parseDocument();
465 catch (EOFException e)
467 //empty input
468 error("empty document, with no root element.");
470 finally
472 if (reader != null)
476 reader.close();
478 catch (IOException e)
480 /* ignore */
483 if (stream != null)
487 stream.close();
489 catch (IOException e)
491 /* ignore */
494 if (is != null)
498 is.close();
500 catch (IOException e)
502 /* ignore */
505 scratch = null;
509 //////////////////////////////////////////////////////////////////////
510 // Error reporting.
511 //////////////////////////////////////////////////////////////////////
514 * Report an error.
515 * @param message The error message.
516 * @param textFound The text that caused the error (or null).
517 * @see SAXDriver#error
518 * @see #line
520 private void error(String message, String textFound, String textExpected)
521 throws SAXException
523 if (textFound != null)
525 message = message + " (found \"" + textFound + "\")";
527 if (textExpected != null)
529 message = message + " (expected \"" + textExpected + "\")";
531 handler.fatal(message);
533 // "can't happen"
534 throw new SAXException(message);
538 * Report a serious error.
539 * @param message The error message.
540 * @param textFound The text that caused the error (or null).
542 private void error(String message, char textFound, String textExpected)
543 throws SAXException
545 error(message, new Character(textFound).toString(), textExpected);
549 * Report typical case fatal errors.
551 private void error(String message)
552 throws SAXException
554 handler.fatal(message);
557 //////////////////////////////////////////////////////////////////////
558 // Major syntactic productions.
559 //////////////////////////////////////////////////////////////////////
562 * Parse an XML document.
563 * <pre>
564 * [1] document ::= prolog element Misc*
565 * </pre>
566 * <p>This is the top-level parsing function for a single XML
567 * document. As a minimum, a well-formed document must have
568 * a document element, and a valid document must have a prolog
569 * (one with doctype) as well.
571 private void parseDocument()
572 throws Exception
575 { // added by MHK
576 boolean sawDTD = parseProlog();
577 require('<');
578 parseElement(!sawDTD);
580 catch (EOFException ee)
581 { // added by MHK
582 error("premature end of file", "[EOF]", null);
587 parseMisc(); //skip all white, PIs, and comments
588 char c = readCh(); //if this doesn't throw an exception...
589 error("unexpected characters after document end", c, null);
591 catch (EOFException e)
593 return;
597 static final char[] startDelimComment = { '<', '!', '-', '-' };
598 static final char[] endDelimComment = { '-', '-' };
601 * Skip a comment.
602 * <pre>
603 * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
604 * </pre>
605 * <p> (The <code>&lt;!--</code> has already been read.)
607 private void parseComment()
608 throws Exception
610 char c;
611 boolean saved = expandPE;
613 expandPE = false;
614 parseUntil(endDelimComment);
615 require('>');
616 expandPE = saved;
617 handler.comment(dataBuffer, 0, dataBufferPos);
618 dataBufferPos = 0;
621 static final char[] startDelimPI = { '<', '?' };
622 static final char[] endDelimPI = { '?', '>' };
625 * Parse a processing instruction and do a call-back.
626 * <pre>
627 * [16] PI ::= '&lt;?' PITarget
628 * (S (Char* - (Char* '?&gt;' Char*)))?
629 * '?&gt;'
630 * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
631 * </pre>
632 * <p> (The <code>&lt;?</code> has already been read.)
634 private void parsePI()
635 throws SAXException, IOException
637 String name;
638 boolean saved = expandPE;
640 expandPE = false;
641 name = readNmtoken(true);
642 //NE08
643 if (name.indexOf(':') >= 0)
645 error("Illegal character(':') in processing instruction name ",
646 name, null);
648 if ("xml".equalsIgnoreCase(name))
650 error("Illegal processing instruction target", name, null);
652 if (!tryRead(endDelimPI))
654 requireWhitespace();
655 parseUntil(endDelimPI);
657 expandPE = saved;
658 handler.processingInstruction(name, dataBufferToString());
661 static final char[] endDelimCDATA = { ']', ']', '>' };
663 private boolean isDirtyCurrentElement;
666 * Parse a CDATA section.
667 * <pre>
668 * [18] CDSect ::= CDStart CData CDEnd
669 * [19] CDStart ::= '&lt;![CDATA['
670 * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
671 * [21] CDEnd ::= ']]&gt;'
672 * </pre>
673 * <p> (The '&lt;![CDATA[' has already been read.)
675 private void parseCDSect()
676 throws Exception
678 parseUntil(endDelimCDATA);
679 dataBufferFlush();
683 * Parse the prolog of an XML document.
684 * <pre>
685 * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
686 * </pre>
687 * <p>We do not look for the XML declaration here, because it was
688 * handled by pushURL ().
689 * @see pushURL
690 * @return true if a DTD was read.
692 private boolean parseProlog()
693 throws Exception
695 parseMisc();
697 if (tryRead("<!DOCTYPE"))
699 parseDoctypedecl();
700 parseMisc();
701 return true;
703 return false;
706 private void checkLegalVersion(String version)
707 throws SAXException
709 int len = version.length();
710 for (int i = 0; i < len; i++)
712 char c = version.charAt(i);
713 if ('0' <= c && c <= '9')
715 continue;
717 if (c == '_' || c == '.' || c == ':' || c == '-')
719 continue;
721 if ('a' <= c && c <= 'z')
723 continue;
725 if ('A' <= c && c <= 'Z')
727 continue;
729 error ("illegal character in version", version, "1.0");
734 * Parse the XML declaration.
735 * <pre>
736 * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
737 * [24] VersionInfo ::= S 'version' Eq
738 * ("'" VersionNum "'" | '"' VersionNum '"' )
739 * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
740 * [32] SDDecl ::= S 'standalone' Eq
741 * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
742 * [80] EncodingDecl ::= S 'encoding' Eq
743 * ( "'" EncName "'" | "'" EncName "'" )
744 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
745 * </pre>
746 * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
747 * @return the encoding in the declaration, uppercased; or null
748 * @see #parseTextDecl
749 * @see #setupDecoding
751 private String parseXMLDecl(boolean ignoreEncoding)
752 throws SAXException, IOException
754 String version;
755 String encodingName = null;
756 String standalone = null;
757 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
758 String inputEncoding = null;
760 switch (this.encoding)
762 case ENCODING_EXTERNAL:
763 case ENCODING_UTF_8:
764 inputEncoding = "UTF-8";
765 break;
766 case ENCODING_ISO_8859_1:
767 inputEncoding = "ISO-8859-1";
768 break;
769 case ENCODING_UCS_2_12:
770 inputEncoding = "UTF-16BE";
771 break;
772 case ENCODING_UCS_2_21:
773 inputEncoding = "UTF-16LE";
774 break;
777 // Read the version.
778 require("version");
779 parseEq();
780 checkLegalVersion(version = readLiteral(flags));
781 if (!version.equals("1.0"))
783 if (version.equals("1.1"))
785 handler.warn("expected XML version 1.0, not: " + version);
786 xmlVersion = XML_11;
788 else
790 error("illegal XML version", version, "1.0 or 1.1");
793 else
795 xmlVersion = XML_10;
797 // Try reading an encoding declaration.
798 boolean white = tryWhitespace();
800 if (tryRead("encoding"))
802 if (!white)
804 error("whitespace required before 'encoding='");
806 parseEq();
807 encodingName = readLiteral(flags);
808 if (!ignoreEncoding)
810 setupDecoding(encodingName);
814 // Try reading a standalone declaration
815 if (encodingName != null)
817 white = tryWhitespace();
819 if (tryRead("standalone"))
821 if (!white)
823 error("whitespace required before 'standalone='");
825 parseEq();
826 standalone = readLiteral(flags);
827 if ("yes".equals(standalone))
829 docIsStandalone = true;
831 else if (!"no".equals(standalone))
833 error("standalone flag must be 'yes' or 'no'");
837 skipWhitespace();
838 require("?>");
840 if (inputEncoding == null)
842 inputEncoding = encodingName;
844 return encodingName;
848 * Parse a text declaration.
849 * <pre>
850 * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
851 * [80] EncodingDecl ::= S 'encoding' Eq
852 * ( '"' EncName '"' | "'" EncName "'" )
853 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
854 * </pre>
855 * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
856 * @return the encoding in the declaration, uppercased; or null
857 * @see #parseXMLDecl
858 * @see #setupDecoding
860 private String parseTextDecl(boolean ignoreEncoding)
861 throws SAXException, IOException
863 String encodingName = null;
864 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
866 // Read an optional version.
867 if (tryRead ("version"))
869 String version;
870 parseEq();
871 checkLegalVersion(version = readLiteral(flags));
873 if (version.equals("1.1"))
875 if (xmlVersion == XML_10)
877 error("external subset has later version number.", "1.0",
878 version);
880 handler.warn("expected XML version 1.0, not: " + version);
881 xmlVersion = XML_11;
883 else if (!version.equals("1.0"))
885 error("illegal XML version", version, "1.0 or 1.1");
887 requireWhitespace();
890 // Read the encoding.
891 require("encoding");
892 parseEq();
893 encodingName = readLiteral(flags);
894 if (!ignoreEncoding)
896 setupDecoding(encodingName);
898 skipWhitespace();
899 require("?>");
901 return encodingName;
905 * Sets up internal state so that we can decode an entity using the
906 * specified encoding. This is used when we start to read an entity
907 * and we have been given knowledge of its encoding before we start to
908 * read any data (e.g. from a SAX input source or from a MIME type).
910 * <p> It is also used after autodetection, at which point only very
911 * limited adjustments to the encoding may be used (switching between
912 * related builtin decoders).
914 * @param encodingName The name of the encoding specified by the user.
915 * @exception IOException if the encoding isn't supported either
916 * internally to this parser, or by the hosting JVM.
917 * @see #parseXMLDecl
918 * @see #parseTextDecl
920 private void setupDecoding(String encodingName)
921 throws SAXException, IOException
923 encodingName = encodingName.toUpperCase();
925 // ENCODING_EXTERNAL indicates an encoding that wasn't
926 // autodetected ... we can use builtin decoders, or
927 // ones from the JVM (InputStreamReader).
929 // Otherwise we can only tweak what was autodetected, and
930 // only for single byte (ASCII derived) builtin encodings.
932 // ASCII-derived encodings
933 if (encoding == ENCODING_UTF_8 || encoding == ENCODING_EXTERNAL)
935 if (encodingName.equals("ISO-8859-1")
936 || encodingName.equals("8859_1")
937 || encodingName.equals("ISO8859_1"))
939 encoding = ENCODING_ISO_8859_1;
940 return;
942 else if (encodingName.equals("US-ASCII")
943 || encodingName.equals("ASCII"))
945 encoding = ENCODING_ASCII;
946 return;
948 else if (encodingName.equals("UTF-8")
949 || encodingName.equals("UTF8"))
951 encoding = ENCODING_UTF_8;
952 return;
954 else if (encoding != ENCODING_EXTERNAL)
956 // used to start with a new reader ...
957 throw new UnsupportedEncodingException(encodingName);
959 // else fallthrough ...
960 // it's ASCII-ish and something other than a builtin
963 // Unicode and such
964 if (encoding == ENCODING_UCS_2_12 || encoding == ENCODING_UCS_2_21)
966 if (!(encodingName.equals("ISO-10646-UCS-2")
967 || encodingName.equals("UTF-16")
968 || encodingName.equals("UTF-16BE")
969 || encodingName.equals("UTF-16LE")))
971 error("unsupported Unicode encoding", encodingName, "UTF-16");
973 return;
976 // four byte encodings
977 if (encoding == ENCODING_UCS_4_1234
978 || encoding == ENCODING_UCS_4_4321
979 || encoding == ENCODING_UCS_4_2143
980 || encoding == ENCODING_UCS_4_3412)
982 // Strictly: "UCS-4" == "UTF-32BE"; also, "UTF-32LE" exists
983 if (!encodingName.equals("ISO-10646-UCS-4"))
985 error("unsupported 32-bit encoding", encodingName,
986 "ISO-10646-UCS-4");
988 return;
991 // assert encoding == ENCODING_EXTERNAL
992 // if (encoding != ENCODING_EXTERNAL)
993 // throw new RuntimeException ("encoding = " + encoding);
995 if (encodingName.equals("UTF-16BE"))
997 encoding = ENCODING_UCS_2_12;
998 return;
1000 if (encodingName.equals("UTF-16LE"))
1002 encoding = ENCODING_UCS_2_21;
1003 return;
1006 // We couldn't use the builtin decoders at all. But we can try to
1007 // create a reader, since we haven't messed up buffering. Tweak
1008 // the encoding name if necessary.
1010 if (encodingName.equals("UTF-16")
1011 || encodingName.equals("ISO-10646-UCS-2"))
1013 encodingName = "Unicode";
1015 // Ignoring all the EBCDIC aliases here
1017 reader = new InputStreamReader(is, encodingName);
1018 sourceType = INPUT_READER;
1022 * Parse miscellaneous markup outside the document element and DOCTYPE
1023 * declaration.
1024 * <pre>
1025 * [27] Misc ::= Comment | PI | S
1026 * </pre>
1028 private void parseMisc()
1029 throws Exception
1031 while (true)
1033 skipWhitespace();
1034 if (tryRead(startDelimPI))
1036 parsePI();
1038 else if (tryRead(startDelimComment))
1040 parseComment();
1042 else
1044 return;
1050 * Parse a document type declaration.
1051 * <pre>
1052 * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
1053 * ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
1054 * </pre>
1055 * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
1057 private void parseDoctypedecl()
1058 throws Exception
1060 String rootName;
1061 ExternalIdentifiers ids;
1063 // Read the document type name.
1064 requireWhitespace();
1065 rootName = readNmtoken(true);
1067 // Read the External subset's IDs
1068 skipWhitespace();
1069 ids = readExternalIds(false, true);
1071 // report (a) declaration of name, (b) lexical info (ids)
1072 handler.doctypeDecl(rootName, ids.publicId, ids.systemId);
1074 // Internal subset is parsed first, if present
1075 skipWhitespace();
1076 if (tryRead('['))
1079 // loop until the subset ends
1080 while (true)
1082 doReport = expandPE = true;
1083 skipWhitespace();
1084 doReport = expandPE = false;
1085 if (tryRead(']'))
1087 break; // end of subset
1089 else
1091 // WFC, PEs in internal subset (only between decls)
1092 peIsError = expandPE = true;
1093 parseMarkupdecl();
1094 peIsError = expandPE = false;
1098 skipWhitespace();
1099 require('>');
1101 // Read the external subset, if any
1102 InputSource subset;
1104 if (ids.systemId == null)
1106 subset = handler.getExternalSubset(rootName,
1107 handler.getSystemId());
1109 else
1111 subset = null;
1113 if (ids.systemId != null || subset != null)
1115 pushString(null, ">");
1117 // NOTE: [dtd] is so we say what SAX2 expects,
1118 // though it's misleading (subset, not entire dtd)
1119 if (ids.systemId != null)
1121 pushURL(true, "[dtd]", ids, null, null, null, true);
1123 else
1125 handler.warn("modifying document by adding external subset");
1126 pushURL(true, "[dtd]",
1127 new ExternalIdentifiers(subset.getPublicId(),
1128 subset.getSystemId(),
1129 null),
1130 subset.getCharacterStream(),
1131 subset.getByteStream(),
1132 subset.getEncoding(),
1133 false);
1136 // Loop until we end up back at '>'
1137 while (true)
1139 doReport = expandPE = true;
1140 skipWhitespace();
1141 doReport = expandPE = false;
1142 if (tryRead('>'))
1144 break;
1146 else
1148 expandPE = true;
1149 parseMarkupdecl();
1150 expandPE = false;
1154 // the ">" string isn't popped yet
1155 if (inputStack.size() != 1)
1157 error("external subset has unmatched '>'");
1161 // done dtd
1162 handler.endDoctype();
1163 expandPE = false;
1164 doReport = true;
1168 * Parse a markup declaration in the internal or external DTD subset.
1169 * <pre>
1170 * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
1171 * | NotationDecl | PI | Comment
1172 * [30] extSubsetDecl ::= (markupdecl | conditionalSect
1173 * | PEReference | S) *
1174 * </pre>
1175 * <p> Reading toplevel PE references is handled as a lexical issue
1176 * by the caller, as is whitespace.
1178 private void parseMarkupdecl()
1179 throws Exception
1181 char[] saved = null;
1182 boolean savedPE = expandPE;
1184 // prevent "<%foo;" and ensures saved entity is right
1185 require('<');
1186 unread('<');
1187 expandPE = false;
1189 if (tryRead("<!ELEMENT"))
1191 saved = readBuffer;
1192 expandPE = savedPE;
1193 parseElementDecl();
1195 else if (tryRead("<!ATTLIST"))
1197 saved = readBuffer;
1198 expandPE = savedPE;
1199 parseAttlistDecl();
1201 else if (tryRead("<!ENTITY"))
1203 saved = readBuffer;
1204 expandPE = savedPE;
1205 parseEntityDecl();
1207 else if (tryRead("<!NOTATION"))
1209 saved = readBuffer;
1210 expandPE = savedPE;
1211 parseNotationDecl();
1213 else if (tryRead(startDelimPI))
1215 saved = readBuffer;
1216 expandPE = savedPE;
1217 parsePI();
1219 else if (tryRead(startDelimComment))
1221 saved = readBuffer;
1222 expandPE = savedPE;
1223 parseComment();
1225 else if (tryRead("<!["))
1227 saved = readBuffer;
1228 expandPE = savedPE;
1229 if (inputStack.size() > 0)
1231 parseConditionalSect(saved);
1233 else
1235 error("conditional sections illegal in internal subset");
1238 else
1240 error("expected markup declaration");
1243 // VC: Proper Decl/PE Nesting
1244 if (readBuffer != saved)
1246 handler.verror("Illegal Declaration/PE nesting");
1251 * Parse an element, with its tags.
1252 * <pre>
1253 * [39] element ::= EmptyElementTag | STag content ETag
1254 * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
1255 * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
1256 * </pre>
1257 * <p> (The '&lt;' has already been read.)
1258 * <p>NOTE: this method actually chains onto parseContent (), if necessary,
1259 * and parseContent () will take care of calling parseETag ().
1261 private void parseElement(boolean maybeGetSubset)
1262 throws Exception
1264 String gi;
1265 char c;
1266 int oldElementContent = currentElementContent;
1267 String oldElement = currentElement;
1268 ElementDecl element;
1270 // This is the (global) counter for the
1271 // array of specified attributes.
1272 tagAttributePos = 0;
1274 // Read the element type name.
1275 gi = readNmtoken(true);
1277 // If we saw no DTD, and this is the document root element,
1278 // let the application modify the input stream by providing one.
1279 if (maybeGetSubset)
1281 InputSource subset = handler.getExternalSubset(gi,
1282 handler.getSystemId());
1283 if (subset != null)
1285 String publicId = subset.getPublicId();
1286 String systemId = subset.getSystemId();
1288 handler.warn("modifying document by adding DTD");
1289 handler.doctypeDecl(gi, publicId, systemId);
1290 pushString(null, ">");
1292 // NOTE: [dtd] is so we say what SAX2 expects,
1293 // though it's misleading (subset, not entire dtd)
1294 pushURL(true, "[dtd]",
1295 new ExternalIdentifiers(publicId, systemId, null),
1296 subset.getCharacterStream(),
1297 subset.getByteStream(),
1298 subset.getEncoding(),
1299 false);
1301 // Loop until we end up back at '>'
1302 while (true)
1304 doReport = expandPE = true;
1305 skipWhitespace();
1306 doReport = expandPE = false;
1307 if (tryRead('>'))
1309 break;
1311 else
1313 expandPE = true;
1314 parseMarkupdecl();
1315 expandPE = false;
1319 // the ">" string isn't popped yet
1320 if (inputStack.size() != 1)
1322 error("external subset has unmatched '>'");
1325 handler.endDoctype();
1329 // Determine the current content type.
1330 currentElement = gi;
1331 element = (ElementDecl) elementInfo.get(gi);
1332 currentElementContent = getContentType(element, CONTENT_ANY);
1334 // Read the attributes, if any.
1335 // After this loop, "c" is the closing delimiter.
1336 boolean white = tryWhitespace();
1337 c = readCh();
1338 while (c != '/' && c != '>')
1340 unread(c);
1341 if (!white)
1343 error("need whitespace between attributes");
1345 parseAttribute(gi);
1346 white = tryWhitespace();
1347 c = readCh();
1350 // Supply any defaulted attributes.
1351 Iterator atts = declaredAttributes(element);
1352 if (atts != null)
1354 String aname;
1355 loop:
1356 while (atts.hasNext())
1358 aname = (String) atts.next();
1359 // See if it was specified.
1360 for (int i = 0; i < tagAttributePos; i++)
1362 if (tagAttributes[i] == aname)
1364 continue loop;
1367 // ... or has a default
1368 String value = getAttributeDefaultValue(gi, aname);
1370 if (value == null)
1372 continue;
1374 handler.attribute(aname, value, false);
1378 // Figure out if this is a start tag
1379 // or an empty element, and dispatch an
1380 // event accordingly.
1381 switch (c)
1383 case '>':
1384 handler.startElement(gi);
1385 parseContent();
1386 break;
1387 case '/':
1388 require('>');
1389 handler.startElement(gi);
1390 handler.endElement(gi);
1391 break;
1394 // Restore the previous state.
1395 currentElement = oldElement;
1396 currentElementContent = oldElementContent;
1400 * Parse an attribute assignment.
1401 * <pre>
1402 * [41] Attribute ::= Name Eq AttValue
1403 * </pre>
1404 * @param name The name of the attribute's element.
1405 * @see SAXDriver#attribute
1407 private void parseAttribute(String name)
1408 throws Exception
1410 String aname;
1411 String type;
1412 String value;
1413 int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF;
1415 // Read the attribute name.
1416 aname = readNmtoken(true);
1417 type = getAttributeType(name, aname);
1419 // Parse '='
1420 parseEq();
1422 // Read the value, normalizing whitespace
1423 // unless it is CDATA.
1424 if (handler.stringInterning)
1426 if (type == "CDATA" || type == null)
1428 value = readLiteral(flags);
1430 else
1432 value = readLiteral(flags | LIT_NORMALIZE);
1435 else
1437 if (type == null || type.equals("CDATA"))
1439 value = readLiteral(flags);
1441 else
1443 value = readLiteral(flags | LIT_NORMALIZE);
1447 // WFC: no duplicate attributes
1448 for (int i = 0; i < tagAttributePos; i++)
1450 if (aname.equals(tagAttributes [i]))
1452 error("duplicate attribute", aname, null);
1456 // Inform the handler about the
1457 // attribute.
1458 handler.attribute(aname, value, true);
1459 dataBufferPos = 0;
1461 // Note that the attribute has been
1462 // specified.
1463 if (tagAttributePos == tagAttributes.length)
1465 String newAttrib[] = new String[tagAttributes.length * 2];
1466 System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1467 tagAttributes = newAttrib;
1469 tagAttributes[tagAttributePos++] = aname;
1473 * Parse an equals sign surrounded by optional whitespace.
1474 * <pre>
1475 * [25] Eq ::= S? '=' S?
1476 * </pre>
1478 private void parseEq()
1479 throws SAXException, IOException
1481 skipWhitespace();
1482 require('=');
1483 skipWhitespace();
1487 * Parse an end tag.
1488 * <pre>
1489 * [42] ETag ::= '</' Name S? '>'
1490 * </pre>
1491 * <p>NOTE: parseContent () chains to here, we already read the
1492 * "&lt;/".
1494 private void parseETag()
1495 throws Exception
1497 require(currentElement);
1498 skipWhitespace();
1499 require('>');
1500 handler.endElement(currentElement);
1501 // not re-reporting any SAXException re bogus end tags,
1502 // even though that diagnostic might be clearer ...
1506 * Parse the content of an element.
1507 * <pre>
1508 * [43] content ::= (element | CharData | Reference
1509 * | CDSect | PI | Comment)*
1510 * [67] Reference ::= EntityRef | CharRef
1511 * </pre>
1512 * <p> NOTE: consumes ETtag.
1514 private void parseContent()
1515 throws Exception
1517 char c;
1519 while (true)
1521 // consume characters (or ignorable whitspace) until delimiter
1522 parseCharData();
1524 // Handle delimiters
1525 c = readCh();
1526 switch (c)
1528 case '&': // Found "&"
1529 c = readCh();
1530 if (c == '#')
1532 parseCharRef();
1534 else
1536 unread(c);
1537 parseEntityRef(true);
1539 isDirtyCurrentElement = true;
1540 break;
1542 case '<': // Found "<"
1543 dataBufferFlush();
1544 c = readCh();
1545 switch (c)
1547 case '!': // Found "<!"
1548 c = readCh();
1549 switch (c)
1551 case '-': // Found "<!-"
1552 require('-');
1553 isDirtyCurrentElement = false;
1554 parseComment();
1555 break;
1556 case '[': // Found "<!["
1557 isDirtyCurrentElement = false;
1558 require("CDATA[");
1559 handler.startCDATA();
1560 inCDATA = true;
1561 parseCDSect();
1562 inCDATA = false;
1563 handler.endCDATA();
1564 break;
1565 default:
1566 error("expected comment or CDATA section", c, null);
1567 break;
1569 break;
1571 case '?': // Found "<?"
1572 isDirtyCurrentElement = false;
1573 parsePI();
1574 break;
1576 case '/': // Found "</"
1577 isDirtyCurrentElement = false;
1578 parseETag();
1579 return;
1581 default: // Found "<" followed by something else
1582 isDirtyCurrentElement = false;
1583 unread(c);
1584 parseElement(false);
1585 break;
1592 * Parse an element type declaration.
1593 * <pre>
1594 * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1595 * </pre>
1596 * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1598 private void parseElementDecl()
1599 throws Exception
1601 String name;
1603 requireWhitespace();
1604 // Read the element type name.
1605 name = readNmtoken(true);
1607 requireWhitespace();
1608 // Read the content model.
1609 parseContentspec(name);
1611 skipWhitespace();
1612 require('>');
1616 * Content specification.
1617 * <pre>
1618 * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1619 * </pre>
1621 private void parseContentspec(String name)
1622 throws Exception
1624 // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ...
1625 if (tryRead("EMPTY"))
1627 setElement(name, CONTENT_EMPTY, null, null);
1628 if (!skippedPE)
1630 handler.getDeclHandler().elementDecl(name, "EMPTY");
1632 return;
1634 else if (tryRead("ANY"))
1636 setElement(name, CONTENT_ANY, null, null);
1637 if (!skippedPE)
1639 handler.getDeclHandler().elementDecl(name, "ANY");
1641 return;
1643 else
1645 String model;
1646 char[] saved;
1648 require('(');
1649 saved = readBuffer;
1650 dataBufferAppend('(');
1651 skipWhitespace();
1652 if (tryRead("#PCDATA"))
1654 dataBufferAppend("#PCDATA");
1655 parseMixed(saved);
1656 model = dataBufferToString();
1657 setElement(name, CONTENT_MIXED, model, null);
1659 else
1661 parseElements(saved);
1662 model = dataBufferToString();
1663 setElement(name, CONTENT_ELEMENTS, model, null);
1665 if (!skippedPE)
1667 handler.getDeclHandler().elementDecl(name, model);
1673 * Parse an element-content model.
1674 * <pre>
1675 * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1676 * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1677 * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1678 * </pre>
1680 * <p> NOTE: the opening '(' and S have already been read.
1682 * @param saved Buffer for entity that should have the terminal ')'
1684 private void parseElements(char[] saved)
1685 throws Exception
1687 char c;
1688 char sep;
1690 // Parse the first content particle
1691 skipWhitespace();
1692 parseCp();
1694 // Check for end or for a separator.
1695 skipWhitespace();
1696 c = readCh();
1697 switch (c)
1699 case ')':
1700 // VC: Proper Group/PE Nesting
1701 if (readBuffer != saved)
1703 handler.verror("Illegal Group/PE nesting");
1706 dataBufferAppend(')');
1707 c = readCh();
1708 switch (c)
1710 case '*':
1711 case '+':
1712 case '?':
1713 dataBufferAppend(c);
1714 break;
1715 default:
1716 unread(c);
1718 return;
1719 case ',': // Register the separator.
1720 case '|':
1721 sep = c;
1722 dataBufferAppend(c);
1723 break;
1724 default:
1725 error("bad separator in content model", c, null);
1726 return;
1729 // Parse the rest of the content model.
1730 while (true)
1732 skipWhitespace();
1733 parseCp();
1734 skipWhitespace();
1735 c = readCh();
1736 if (c == ')')
1738 // VC: Proper Group/PE Nesting
1739 if (readBuffer != saved)
1741 handler.verror("Illegal Group/PE nesting");
1744 dataBufferAppend(')');
1745 break;
1747 else if (c != sep)
1749 error("bad separator in content model", c, null);
1750 return;
1752 else
1754 dataBufferAppend(c);
1758 // Check for the occurrence indicator.
1759 c = readCh();
1760 switch (c)
1762 case '?':
1763 case '*':
1764 case '+':
1765 dataBufferAppend(c);
1766 return;
1767 default:
1768 unread(c);
1769 return;
1774 * Parse a content particle.
1775 * <pre>
1776 * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1777 * </pre>
1779 private void parseCp()
1780 throws Exception
1782 if (tryRead('('))
1784 dataBufferAppend('(');
1785 parseElements(readBuffer);
1787 else
1789 dataBufferAppend(readNmtoken(true));
1790 char c = readCh();
1791 switch (c)
1793 case '?':
1794 case '*':
1795 case '+':
1796 dataBufferAppend(c);
1797 break;
1798 default:
1799 unread(c);
1800 break;
1806 * Parse mixed content.
1807 * <pre>
1808 * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1809 * | '(' S? ('#PCDATA') S? ')'
1810 * </pre>
1812 * @param saved Buffer for entity that should have the terminal ')'
1814 private void parseMixed(char[] saved)
1815 throws Exception
1817 // Check for PCDATA alone.
1818 skipWhitespace();
1819 if (tryRead(')'))
1821 // VC: Proper Group/PE Nesting
1822 if (readBuffer != saved)
1824 handler.verror("Illegal Group/PE nesting");
1827 dataBufferAppend(")*");
1828 tryRead('*');
1829 return;
1832 // Parse mixed content.
1833 skipWhitespace();
1834 while (!tryRead(")"))
1836 require('|');
1837 dataBufferAppend('|');
1838 skipWhitespace();
1839 dataBufferAppend(readNmtoken(true));
1840 skipWhitespace();
1843 // VC: Proper Group/PE Nesting
1844 if (readBuffer != saved)
1846 handler.verror("Illegal Group/PE nesting");
1849 require('*');
1850 dataBufferAppend(")*");
1854 * Parse an attribute list declaration.
1855 * <pre>
1856 * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1857 * </pre>
1858 * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1860 private void parseAttlistDecl()
1861 throws Exception
1863 String elementName;
1865 requireWhitespace();
1866 elementName = readNmtoken(true);
1867 boolean white = tryWhitespace();
1868 while (!tryRead('>'))
1870 if (!white)
1872 error("whitespace required before attribute definition");
1874 parseAttDef(elementName);
1875 white = tryWhitespace();
1880 * Parse a single attribute definition.
1881 * <pre>
1882 * [53] AttDef ::= S Name S AttType S DefaultDecl
1883 * </pre>
1885 private void parseAttDef(String elementName)
1886 throws Exception
1888 String name;
1889 String type;
1890 String enumer = null;
1892 // Read the attribute name.
1893 name = readNmtoken(true);
1895 // Read the attribute type.
1896 requireWhitespace();
1897 type = readAttType();
1899 // Get the string of enumerated values if necessary.
1900 if (handler.stringInterning)
1902 if ("ENUMERATION" == type || "NOTATION" == type)
1904 enumer = dataBufferToString();
1907 else
1909 if ("ENUMERATION".equals(type) || "NOTATION".equals(type))
1911 enumer = dataBufferToString();
1915 // Read the default value.
1916 requireWhitespace();
1917 parseDefault(elementName, name, type, enumer);
1921 * Parse the attribute type.
1922 * <pre>
1923 * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1924 * [55] StringType ::= 'CDATA'
1925 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1926 * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1927 * [57] EnumeratedType ::= NotationType | Enumeration
1928 * </pre>
1930 private String readAttType()
1931 throws Exception
1933 if (tryRead('('))
1935 parseEnumeration(false);
1936 return "ENUMERATION";
1938 else
1940 String typeString = readNmtoken(true);
1941 if (handler.stringInterning)
1943 if ("NOTATION" == typeString)
1945 parseNotationType();
1946 return typeString;
1948 else if ("CDATA" == typeString
1949 || "ID" == typeString
1950 || "IDREF" == typeString
1951 || "IDREFS" == typeString
1952 || "ENTITY" == typeString
1953 || "ENTITIES" == typeString
1954 || "NMTOKEN" == typeString
1955 || "NMTOKENS" == typeString)
1957 return typeString;
1960 else
1962 if ("NOTATION".equals(typeString))
1964 parseNotationType();
1965 return typeString;
1967 else if ("CDATA".equals(typeString)
1968 || "ID".equals(typeString)
1969 || "IDREF".equals(typeString)
1970 || "IDREFS".equals(typeString)
1971 || "ENTITY".equals(typeString)
1972 || "ENTITIES".equals(typeString)
1973 || "NMTOKEN".equals(typeString)
1974 || "NMTOKENS".equals(typeString))
1976 return typeString;
1979 error("illegal attribute type", typeString, null);
1980 return null;
1985 * Parse an enumeration.
1986 * <pre>
1987 * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1988 * </pre>
1989 * <p>NOTE: the '(' has already been read.
1991 private void parseEnumeration(boolean isNames)
1992 throws Exception
1994 dataBufferAppend('(');
1996 // Read the first token.
1997 skipWhitespace();
1998 dataBufferAppend(readNmtoken(isNames));
1999 // Read the remaining tokens.
2000 skipWhitespace();
2001 while (!tryRead(')'))
2003 require('|');
2004 dataBufferAppend('|');
2005 skipWhitespace();
2006 dataBufferAppend(readNmtoken (isNames));
2007 skipWhitespace();
2009 dataBufferAppend(')');
2013 * Parse a notation type for an attribute.
2014 * <pre>
2015 * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
2016 * (S? '|' S? name)* S? ')'
2017 * </pre>
2018 * <p>NOTE: the 'NOTATION' has already been read
2020 private void parseNotationType()
2021 throws Exception
2023 requireWhitespace();
2024 require('(');
2026 parseEnumeration(true);
2030 * Parse the default value for an attribute.
2031 * <pre>
2032 * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
2033 * | (('#FIXED' S)? AttValue)
2034 * </pre>
2036 private void parseDefault(String elementName, String name,
2037 String type, String enumer)
2038 throws Exception
2040 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
2041 String value = null;
2042 int flags = LIT_ATTRIBUTE;
2043 boolean saved = expandPE;
2044 String defaultType = null;
2046 // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
2047 // chars to spaces (doesn't matter when that's done if it doesn't
2048 // interfere with char refs expanding to whitespace).
2050 if (!skippedPE)
2052 flags |= LIT_ENTITY_REF;
2053 if (handler.stringInterning)
2055 if ("CDATA" != type)
2057 flags |= LIT_NORMALIZE;
2060 else
2062 if (!"CDATA".equals(type))
2064 flags |= LIT_NORMALIZE;
2069 expandPE = false;
2070 if (tryRead('#'))
2072 if (tryRead("FIXED"))
2074 defaultType = "#FIXED";
2075 valueType = ATTRIBUTE_DEFAULT_FIXED;
2076 requireWhitespace();
2077 value = readLiteral(flags);
2079 else if (tryRead("REQUIRED"))
2081 defaultType = "#REQUIRED";
2082 valueType = ATTRIBUTE_DEFAULT_REQUIRED;
2084 else if (tryRead("IMPLIED"))
2086 defaultType = "#IMPLIED";
2087 valueType = ATTRIBUTE_DEFAULT_IMPLIED;
2089 else
2091 error("illegal keyword for attribute default value");
2094 else
2096 value = readLiteral(flags);
2098 expandPE = saved;
2099 setAttribute(elementName, name, type, enumer, value, valueType);
2100 if (handler.stringInterning)
2102 if ("ENUMERATION" == type)
2104 type = enumer;
2106 else if ("NOTATION" == type)
2108 type = "NOTATION " + enumer;
2111 else
2113 if ("ENUMERATION".equals(type))
2115 type = enumer;
2117 else if ("NOTATION".equals(type))
2119 type = "NOTATION " + enumer;
2122 if (!skippedPE)
2124 handler.getDeclHandler().attributeDecl(elementName, name, type,
2125 defaultType, value);
2130 * Parse a conditional section.
2131 * <pre>
2132 * [61] conditionalSect ::= includeSect || ignoreSect
2133 * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
2134 * extSubsetDecl ']]&gt;'
2135 * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
2136 * ignoreSectContents* ']]&gt;'
2137 * [64] ignoreSectContents ::= Ignore
2138 * ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
2139 * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
2140 * </pre>
2141 * <p> NOTE: the '&gt;![' has already been read.
2143 private void parseConditionalSect(char[] saved)
2144 throws Exception
2146 skipWhitespace();
2147 if (tryRead("INCLUDE"))
2149 skipWhitespace();
2150 require('[');
2151 // VC: Proper Conditional Section/PE Nesting
2152 if (readBuffer != saved)
2154 handler.verror("Illegal Conditional Section/PE nesting");
2156 skipWhitespace();
2157 while (!tryRead("]]>"))
2159 parseMarkupdecl();
2160 skipWhitespace();
2163 else if (tryRead("IGNORE"))
2165 skipWhitespace();
2166 require('[');
2167 // VC: Proper Conditional Section/PE Nesting
2168 if (readBuffer != saved)
2170 handler.verror("Illegal Conditional Section/PE nesting");
2172 int nesting = 1;
2173 char c;
2174 expandPE = false;
2175 for (int nest = 1; nest > 0; )
2177 c = readCh();
2178 switch (c)
2180 case '<':
2181 if (tryRead("!["))
2183 nest++;
2185 case ']':
2186 if (tryRead("]>"))
2188 nest--;
2192 expandPE = true;
2194 else
2196 error("conditional section must begin with INCLUDE or IGNORE");
2200 private void parseCharRef()
2201 throws SAXException, IOException
2203 parseCharRef(true /* do flushDataBuffer by default */);
2207 * Try to read a character reference without consuming data from buffer.
2208 * <pre>
2209 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2210 * </pre>
2211 * <p>NOTE: the '&#' has already been read.
2213 private void tryReadCharRef()
2214 throws SAXException, IOException
2216 int value = 0;
2217 char c;
2219 if (tryRead('x'))
2221 loop1:
2222 while (true)
2224 c = readCh();
2225 if (c == ';')
2227 break loop1;
2229 else
2231 int n = Character.digit(c, 16);
2232 if (n == -1)
2234 error("illegal character in character reference", c, null);
2235 break loop1;
2237 value *= 16;
2238 value += n;
2242 else
2244 loop2:
2245 while (true)
2247 c = readCh();
2248 if (c == ';')
2250 break loop2;
2252 else
2254 int n = Character.digit(c, 10);
2255 if (n == -1)
2257 error("illegal character in character reference", c, null);
2258 break loop2;
2260 value *= 10;
2261 value += n;
2266 // check for character refs being legal XML
2267 if ((value < 0x0020
2268 && ! (value == '\n' || value == '\t' || value == '\r'))
2269 || (value >= 0xD800 && value <= 0xDFFF)
2270 || value == 0xFFFE || value == 0xFFFF
2271 || value > 0x0010ffff)
2273 error("illegal XML character reference U+"
2274 + Integer.toHexString(value));
2277 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2278 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2279 if (value > 0x0010ffff)
2281 // too big for surrogate
2282 error("character reference " + value + " is too large for UTF-16",
2283 new Integer(value).toString(), null);
2289 * Read and interpret a character reference.
2290 * <pre>
2291 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2292 * </pre>
2293 * <p>NOTE: the '&#' has already been read.
2295 private void parseCharRef(boolean doFlush)
2296 throws SAXException, IOException
2298 int value = 0;
2299 char c;
2301 if (tryRead('x'))
2303 loop1:
2304 while (true)
2306 c = readCh();
2307 if (c == ';')
2309 break loop1;
2311 else
2313 int n = Character.digit(c, 16);
2314 if (n == -1)
2316 error("illegal character in character reference", c, null);
2317 break loop1;
2319 value *= 16;
2320 value += n;
2324 else
2326 loop2:
2327 while (true)
2329 c = readCh();
2330 if (c == ';')
2332 break loop2;
2334 else
2336 int n = Character.digit(c, 10);
2337 if (n == -1)
2339 error("illegal character in character reference", c, null);
2340 break loop2;
2342 value *= 10;
2343 value += c - '0';
2348 // check for character refs being legal XML
2349 if ((value < 0x0020
2350 && ! (value == '\n' || value == '\t' || value == '\r'))
2351 || (value >= 0xD800 && value <= 0xDFFF)
2352 || value == 0xFFFE || value == 0xFFFF
2353 || value > 0x0010ffff)
2355 error("illegal XML character reference U+"
2356 + Integer.toHexString(value));
2359 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2360 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2361 if (value <= 0x0000ffff)
2363 // no surrogates needed
2364 dataBufferAppend((char) value);
2366 else if (value <= 0x0010ffff)
2368 value -= 0x10000;
2369 // > 16 bits, surrogate needed
2370 dataBufferAppend((char) (0xd800 | (value >> 10)));
2371 dataBufferAppend((char) (0xdc00 | (value & 0x0003ff)));
2373 else
2375 // too big for surrogate
2376 error("character reference " + value + " is too large for UTF-16",
2377 new Integer(value).toString(), null);
2379 if (doFlush)
2381 dataBufferFlush();
2386 * Parse and expand an entity reference.
2387 * <pre>
2388 * [68] EntityRef ::= '&' Name ';'
2389 * </pre>
2390 * <p>NOTE: the '&amp;' has already been read.
2391 * @param externalAllowed External entities are allowed here.
2393 private void parseEntityRef(boolean externalAllowed)
2394 throws SAXException, IOException
2396 String name;
2398 name = readNmtoken(true);
2399 require(';');
2400 switch (getEntityType(name))
2402 case ENTITY_UNDECLARED:
2403 // NOTE: XML REC describes amazingly convoluted handling for
2404 // this case. Nothing as meaningful as being a WFness error
2405 // unless the processor might _legitimately_ not have seen a
2406 // declaration ... which is what this implements.
2407 String message;
2409 message = "reference to undeclared general entity " + name;
2410 if (skippedPE && !docIsStandalone)
2412 handler.verror(message);
2413 // we don't know this entity, and it might be external...
2414 if (externalAllowed)
2416 handler.skippedEntity(name);
2419 else
2421 error(message);
2423 break;
2424 case ENTITY_INTERNAL:
2425 pushString(name, getEntityValue(name));
2427 //workaround for possible input pop before marking
2428 //the buffer reading position
2429 char t = readCh();
2430 unread(t);
2431 int bufferPosMark = readBufferPos;
2433 int end = readBufferPos + getEntityValue(name).length();
2434 for (int k = readBufferPos; k < end; k++)
2436 t = readCh();
2437 if (t == '&')
2439 t = readCh();
2440 if (t == '#')
2442 //try to match a character ref
2443 tryReadCharRef();
2445 //everything has been read
2446 if (readBufferPos >= end)
2448 break;
2450 k = readBufferPos;
2451 continue;
2453 else if (Character.isLetter(t))
2455 //looks like an entity ref
2456 unread(t);
2457 readNmtoken(true);
2458 require(';');
2460 //everything has been read
2461 if (readBufferPos >= end)
2463 break;
2465 k = readBufferPos;
2466 continue;
2468 error(" malformed entity reference");
2472 readBufferPos = bufferPosMark;
2473 break;
2474 case ENTITY_TEXT:
2475 if (externalAllowed)
2477 pushURL(false, name, getEntityIds(name),
2478 null, null, null, true);
2480 else
2482 error("reference to external entity in attribute value.",
2483 name, null);
2485 break;
2486 case ENTITY_NDATA:
2487 if (externalAllowed)
2489 error("unparsed entity reference in content", name, null);
2491 else
2493 error("reference to external entity in attribute value.",
2494 name, null);
2496 break;
2497 default:
2498 throw new RuntimeException();
2503 * Parse and expand a parameter entity reference.
2504 * <pre>
2505 * [69] PEReference ::= '%' Name ';'
2506 * </pre>
2507 * <p>NOTE: the '%' has already been read.
2509 private void parsePEReference()
2510 throws SAXException, IOException
2512 String name;
2514 name = "%" + readNmtoken(true);
2515 require(';');
2516 switch (getEntityType(name))
2518 case ENTITY_UNDECLARED:
2519 // VC: Entity Declared
2520 handler.verror("reference to undeclared parameter entity " + name);
2522 // we should disable handling of all subsequent declarations
2523 // unless this is a standalone document (info discarded)
2524 break;
2525 case ENTITY_INTERNAL:
2526 if (inLiteral)
2528 pushString(name, getEntityValue(name));
2530 else
2532 pushString(name, ' ' + getEntityValue(name) + ' ');
2534 break;
2535 case ENTITY_TEXT:
2536 if (!inLiteral)
2538 pushString(null, " ");
2540 pushURL(true, name, getEntityIds(name), null, null, null, true);
2541 if (!inLiteral)
2543 pushString(null, " ");
2545 break;
2550 * Parse an entity declaration.
2551 * <pre>
2552 * [70] EntityDecl ::= GEDecl | PEDecl
2553 * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
2554 * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
2555 * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
2556 * [74] PEDef ::= EntityValue | ExternalID
2557 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2558 * | 'PUBLIC' S PubidLiteral S SystemLiteral
2559 * [76] NDataDecl ::= S 'NDATA' S Name
2560 * </pre>
2561 * <p>NOTE: the '&lt;!ENTITY' has already been read.
2563 private void parseEntityDecl()
2564 throws Exception
2566 boolean peFlag = false;
2567 int flags = 0;
2569 // Check for a parameter entity.
2570 expandPE = false;
2571 requireWhitespace();
2572 if (tryRead('%'))
2574 peFlag = true;
2575 requireWhitespace();
2577 expandPE = true;
2579 // Read the entity name, and prepend
2580 // '%' if necessary.
2581 String name = readNmtoken(true);
2582 //NE08
2583 if (name.indexOf(':') >= 0)
2585 error("Illegal character(':') in entity name ", name, null);
2587 if (peFlag)
2589 name = "%" + name;
2592 // Read the entity value.
2593 requireWhitespace();
2594 char c = readCh();
2595 unread (c);
2596 if (c == '"' || c == '\'')
2598 // Internal entity ... replacement text has expanded refs
2599 // to characters and PEs, but not to general entities
2600 String value = readLiteral(flags);
2601 setInternalEntity(name, value);
2603 else
2605 // Read the external IDs
2606 ExternalIdentifiers ids = readExternalIds(false, false);
2608 // Check for NDATA declaration.
2609 boolean white = tryWhitespace();
2610 if (!peFlag && tryRead("NDATA"))
2612 if (!white)
2614 error("whitespace required before NDATA");
2616 requireWhitespace();
2617 String notationName = readNmtoken(true);
2618 if (!skippedPE)
2620 setExternalEntity(name, ENTITY_NDATA, ids, notationName);
2621 handler.unparsedEntityDecl(name, ids.publicId, ids.systemId,
2622 ids.baseUri, notationName);
2625 else if (!skippedPE)
2627 setExternalEntity(name, ENTITY_TEXT, ids, null);
2628 handler.getDeclHandler()
2629 .externalEntityDecl(name, ids.publicId,
2630 handler.resolveURIs()
2631 // FIXME: ASSUMES not skipped
2632 // "false" forces error on bad URI
2633 ? handler.absolutize(ids.baseUri,
2634 ids.systemId,
2635 false)
2636 : ids.systemId);
2640 // Finish the declaration.
2641 skipWhitespace();
2642 require('>');
2646 * Parse a notation declaration.
2647 * <pre>
2648 * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
2649 * (ExternalID | PublicID) S? '&gt;'
2650 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2651 * </pre>
2652 * <P>NOTE: the '&lt;!NOTATION' has already been read.
2654 private void parseNotationDecl()
2655 throws Exception
2657 String nname;
2658 ExternalIdentifiers ids;
2660 requireWhitespace();
2661 nname = readNmtoken(true);
2662 //NE08
2663 if (nname.indexOf(':') >= 0)
2665 error("Illegal character(':') in notation name ", nname, null);
2667 requireWhitespace();
2669 // Read the external identifiers.
2670 ids = readExternalIds(true, false);
2672 // Register the notation.
2673 setNotation(nname, ids);
2675 skipWhitespace();
2676 require('>');
2680 * Parse character data.
2681 * <pre>
2682 * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
2683 * </pre>
2685 private void parseCharData()
2686 throws Exception
2688 char c;
2689 int state = 0;
2690 boolean pureWhite = false;
2692 // assert (dataBufferPos == 0);
2694 // are we expecting pure whitespace? it might be dirty...
2695 if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
2697 pureWhite = true;
2700 // always report right out of readBuffer
2701 // to minimize (pointless) buffer copies
2702 while (true)
2704 int lineAugment = 0;
2705 int columnAugment = 0;
2706 int i;
2708 loop:
2709 for (i = readBufferPos; i < readBufferLength; i++)
2711 switch (c = readBuffer[i])
2713 case '\n':
2714 lineAugment++;
2715 columnAugment = 0;
2716 // pureWhite unmodified
2717 break;
2718 case '\r': // should not happen!!
2719 case '\t':
2720 case ' ':
2721 // pureWhite unmodified
2722 columnAugment++;
2723 break;
2724 case '&':
2725 case '<':
2726 columnAugment++;
2727 // pureWhite unmodified
2728 // CLEAN end of text sequence
2729 state = 1;
2730 break loop;
2731 case ']':
2732 // that's not a whitespace char, and
2733 // can not terminate pure whitespace either
2734 pureWhite = false;
2735 if ((i + 2) < readBufferLength)
2737 if (readBuffer [i + 1] == ']'
2738 && readBuffer [i + 2] == '>')
2740 // ERROR end of text sequence
2741 state = 2;
2742 break loop;
2745 else
2747 // FIXME missing two end-of-buffer cases
2749 columnAugment++;
2750 break;
2751 default:
2752 if ((c < 0x0020 || c > 0xFFFD)
2753 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
2754 && xmlVersion == XML_11))
2756 error("illegal XML character U+"
2757 + Integer.toHexString(c));
2759 // that's not a whitespace char
2760 pureWhite = false;
2761 columnAugment++;
2765 // report text thus far
2766 if (lineAugment > 0)
2768 line += lineAugment;
2769 column = columnAugment;
2771 else
2773 column += columnAugment;
2776 // report characters/whitspace
2777 int length = i - readBufferPos;
2779 if (length != 0)
2781 if (pureWhite)
2783 handler.ignorableWhitespace(readBuffer,
2784 readBufferPos, length);
2786 else
2788 handler.charData(readBuffer, readBufferPos, length);
2790 readBufferPos = i;
2793 if (state != 0)
2795 break;
2798 // fill next buffer from this entity, or
2799 // pop stack and continue with previous entity
2800 unread(readCh());
2802 if (!pureWhite)
2804 isDirtyCurrentElement = true;
2806 // finish, maybe with error
2807 if (state != 1) // finish, no error
2809 error("character data may not contain ']]>'");
2813 //////////////////////////////////////////////////////////////////////
2814 // High-level reading and scanning methods.
2815 //////////////////////////////////////////////////////////////////////
2818 * Require whitespace characters.
2820 private void requireWhitespace()
2821 throws SAXException, IOException
2823 char c = readCh();
2824 if (isWhitespace(c))
2826 skipWhitespace();
2828 else
2830 error("whitespace required", c, null);
2835 * Skip whitespace characters.
2836 * <pre>
2837 * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2838 * </pre>
2840 private void skipWhitespace()
2841 throws SAXException, IOException
2843 // Start with a little cheat. Most of
2844 // the time, the white space will fall
2845 // within the current read buffer; if
2846 // not, then fall through.
2847 if (USE_CHEATS)
2849 int lineAugment = 0;
2850 int columnAugment = 0;
2852 loop:
2853 for (int i = readBufferPos; i < readBufferLength; i++)
2855 switch (readBuffer[i])
2857 case ' ':
2858 case '\t':
2859 case '\r':
2860 columnAugment++;
2861 break;
2862 case '\n':
2863 lineAugment++;
2864 columnAugment = 0;
2865 break;
2866 case '%':
2867 if (expandPE)
2869 break loop;
2871 // else fall through...
2872 default:
2873 readBufferPos = i;
2874 if (lineAugment > 0)
2876 line += lineAugment;
2877 column = columnAugment;
2879 else
2881 column += columnAugment;
2883 return;
2888 // OK, do it the slow way.
2889 char c = readCh ();
2890 while (isWhitespace(c))
2892 c = readCh();
2894 unread(c);
2898 * Read a name or (when parsing an enumeration) name token.
2899 * <pre>
2900 * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2901 * [7] Nmtoken ::= (NameChar)+
2902 * </pre>
2904 private String readNmtoken(boolean isName)
2905 throws SAXException, IOException
2907 char c;
2909 if (USE_CHEATS)
2911 loop:
2912 for (int i = readBufferPos; i < readBufferLength; i++)
2914 c = readBuffer[i];
2915 switch (c)
2917 case '%':
2918 if (expandPE)
2920 break loop;
2922 // else fall through...
2924 // What may legitimately come AFTER a name/nmtoken?
2925 case '<': case '>': case '&':
2926 case ',': case '|': case '*': case '+': case '?':
2927 case ')':
2928 case '=':
2929 case '\'': case '"':
2930 case '[':
2931 case ' ': case '\t': case '\r': case '\n':
2932 case ';':
2933 case '/':
2934 int start = readBufferPos;
2935 if (i == start)
2937 error("name expected", readBuffer[i], null);
2939 readBufferPos = i;
2940 return intern(readBuffer, start, i - start);
2942 default:
2943 // FIXME ... per IBM's OASIS test submission, these:
2944 // ? U+06dd
2945 // Combining U+309B
2946 //these switches are kind of ugly but at least we won't
2947 //have to go over the whole lits for each char
2948 if (isName && i == readBufferPos)
2950 char c2 = (char) (c & 0x00f0);
2951 switch (c & 0xff00)
2953 //starting with 01
2954 case 0x0100:
2955 switch (c2)
2957 case 0x0030:
2958 if (c == 0x0132 || c == 0x0133 || c == 0x013f)
2960 error("Not a name start character, U+"
2961 + Integer.toHexString(c));
2963 break;
2964 case 0x0040:
2965 if (c == 0x0140 || c == 0x0149)
2967 error("Not a name start character, U+"
2968 + Integer.toHexString(c));
2970 break;
2971 case 0x00c0:
2972 if (c == 0x01c4 || c == 0x01cc)
2974 error("Not a name start character, U+"
2975 + Integer.toHexString(c));
2977 break;
2978 case 0x00f0:
2979 if (c == 0x01f1 || c == 0x01f3)
2981 error("Not a name start character, U+"
2982 + Integer.toHexString(c));
2984 break;
2985 case 0x00b0:
2986 if (c == 0x01f1 || c == 0x01f3)
2988 error("Not a name start character, U+"
2989 + Integer.toHexString(c));
2991 break;
2992 default:
2993 if (c == 0x017f)
2995 error("Not a name start character, U+"
2996 + Integer.toHexString(c));
3000 break;
3001 //starting with 11
3002 case 0x1100:
3003 switch (c2)
3005 case 0x0000:
3006 if (c == 0x1104 || c == 0x1108 ||
3007 c == 0x110a || c == 0x110d)
3009 error("Not a name start character, U+"
3010 + Integer.toHexString(c));
3012 break;
3013 case 0x0030:
3014 if (c == 0x113b || c == 0x113f)
3016 error("Not a name start character, U+"
3017 + Integer.toHexString(c));
3019 break;
3020 case 0x0040:
3021 if (c == 0x1141 || c == 0x114d
3022 || c == 0x114f )
3024 error("Not a name start character, U+"
3025 + Integer.toHexString(c));
3027 break;
3028 case 0x0050:
3029 if (c == 0x1151 || c == 0x1156)
3031 error("Not a name start character, U+"
3032 + Integer.toHexString(c));
3034 break;
3035 case 0x0060:
3036 if (c == 0x1162 || c == 0x1164
3037 || c == 0x1166 || c == 0x116b
3038 || c == 0x116f)
3040 error("Not a name start character, U+"
3041 + Integer.toHexString(c));
3043 break;
3044 case 0x00b0:
3045 if (c == 0x11b6 || c == 0x11b9
3046 || c == 0x11bb || c == 0x116f)
3048 error("Not a name start character, U+"
3049 + Integer.toHexString(c));
3051 break;
3052 default:
3053 if (c == 0x1174 || c == 0x119f
3054 || c == 0x11ac || c == 0x11c3
3055 || c == 0x11f1)
3057 error("Not a name start character, U+"
3058 + Integer.toHexString(c));
3061 break;
3062 default:
3063 if (c == 0x0e46 || c == 0x1011
3064 || c == 0x212f || c == 0x0587
3065 || c == 0x0230 )
3067 error("Not a name start character, U+"
3068 + Integer.toHexString(c));
3072 // punt on exact tests from Appendix A; approximate
3073 // them using the Unicode ID start/part rules
3074 if (i == readBufferPos && isName)
3076 if (!Character.isUnicodeIdentifierStart(c)
3077 && c != ':' && c != '_')
3079 error("Not a name start character, U+"
3080 + Integer.toHexString(c));
3083 else if (!Character.isUnicodeIdentifierPart(c)
3084 && c != '-' && c != ':' && c != '_' && c != '.'
3085 && !isExtender(c))
3087 error("Not a name character, U+"
3088 + Integer.toHexString(c));
3094 nameBufferPos = 0;
3096 // Read the first character.
3097 loop:
3098 while (true)
3100 c = readCh();
3101 switch (c)
3103 case '%':
3104 case '<': case '>': case '&':
3105 case ',': case '|': case '*': case '+': case '?':
3106 case ')':
3107 case '=':
3108 case '\'': case '"':
3109 case '[':
3110 case ' ': case '\t': case '\n': case '\r':
3111 case ';':
3112 case '/':
3113 unread(c);
3114 if (nameBufferPos == 0)
3116 error ("name expected");
3118 // punt on exact tests from Appendix A, but approximate them
3119 if (isName
3120 && !Character.isUnicodeIdentifierStart(nameBuffer[0])
3121 && ":_".indexOf(nameBuffer[0]) == -1)
3123 error("Not a name start character, U+"
3124 + Integer.toHexString(nameBuffer[0]));
3126 String s = intern(nameBuffer, 0, nameBufferPos);
3127 nameBufferPos = 0;
3128 return s;
3129 default:
3130 // punt on exact tests from Appendix A, but approximate them
3132 if ((nameBufferPos != 0 || !isName)
3133 && !Character.isUnicodeIdentifierPart(c)
3134 && ":-_.".indexOf(c) == -1
3135 && !isExtender(c))
3137 error("Not a name character, U+"
3138 + Integer.toHexString(c));
3140 if (nameBufferPos >= nameBuffer.length)
3142 nameBuffer =
3143 (char[]) extendArray(nameBuffer,
3144 nameBuffer.length, nameBufferPos);
3146 nameBuffer[nameBufferPos++] = c;
3151 private static boolean isExtender(char c)
3153 // [88] Extender ::= ...
3154 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
3155 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
3156 || (c >= 0x3031 && c <= 0x3035)
3157 || (c >= 0x309d && c <= 0x309e)
3158 || (c >= 0x30fc && c <= 0x30fe);
3162 * Read a literal. With matching single or double quotes as
3163 * delimiters (and not embedded!) this is used to parse:
3164 * <pre>
3165 * [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
3166 * [10] AttValue ::= ... ([^<&] | Reference)* ...
3167 * [11] SystemLiteral ::= ... (URLchar - "'")* ...
3168 * [12] PubidLiteral ::= ... (PubidChar - "'")* ...
3169 * </pre>
3170 * as well as the quoted strings in XML and text declarations
3171 * (for version, encoding, and standalone) which have their
3172 * own constraints.
3174 private String readLiteral(int flags)
3175 throws SAXException, IOException
3177 char delim, c;
3178 int startLine = line;
3179 boolean saved = expandPE;
3180 boolean savedReport = doReport;
3182 // Find the first delimiter.
3183 delim = readCh();
3184 if (delim != '"' && delim != '\'')
3186 error("expected '\"' or \"'\"", delim, null);
3187 return null;
3189 inLiteral = true;
3190 if ((flags & LIT_DISABLE_PE) != 0)
3192 expandPE = false;
3194 doReport = false;
3196 // Each level of input source has its own buffer; remember
3197 // ours, so we won't read the ending delimiter from any
3198 // other input source, regardless of entity processing.
3199 char[] ourBuf = readBuffer;
3201 // Read the literal.
3204 c = readCh();
3205 boolean ampRead = false;
3206 loop:
3207 while (! (c == delim && readBuffer == ourBuf))
3209 switch (c)
3211 // attributes and public ids are normalized
3212 // in almost the same ways
3213 case '\n':
3214 case '\r':
3215 if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
3217 c = ' ';
3219 break;
3220 case '\t':
3221 if ((flags & LIT_ATTRIBUTE) != 0)
3223 c = ' ';
3225 break;
3226 case '&':
3227 c = readCh();
3228 // Char refs are expanded immediately, except for
3229 // all the cases where it's deferred.
3230 if (c == '#')
3232 if ((flags & LIT_DISABLE_CREF) != 0)
3234 dataBufferAppend('&');
3235 break;
3237 parseCharRef(false /* Do not do flushDataBuffer */);
3239 // exotic WFness risk: this is an entity literal,
3240 // dataBuffer [dataBufferPos - 1] == '&', and
3241 // following chars are a _partial_ entity/char ref
3243 // It looks like an entity ref ...
3245 else
3247 unread(c);
3248 // Expand it?
3249 if ((flags & LIT_ENTITY_REF) > 0)
3251 parseEntityRef(false);
3252 if (String.valueOf(readBuffer).equals("&#38;"))
3254 ampRead = true;
3256 //Is it just data?
3258 else if ((flags & LIT_DISABLE_EREF) != 0)
3260 dataBufferAppend('&');
3262 // OK, it will be an entity ref -- expanded later.
3264 else
3266 String name = readNmtoken(true);
3267 require(';');
3268 dataBufferAppend('&');
3269 dataBufferAppend(name);
3270 dataBufferAppend(';');
3273 c = readCh();
3274 continue loop;
3276 case '<':
3277 // and why? Perhaps so "&foo;" expands the same
3278 // inside and outside an attribute?
3279 if ((flags & LIT_ATTRIBUTE) != 0)
3281 error("attribute values may not contain '<'");
3283 break;
3285 // We don't worry about case '%' and PE refs, readCh does.
3287 default:
3288 break;
3290 dataBufferAppend(c);
3291 c = readCh();
3294 catch (EOFException e)
3296 error("end of input while looking for delimiter (started on line "
3297 + startLine + ')', null, new Character(delim).toString());
3299 inLiteral = false;
3300 expandPE = saved;
3301 doReport = savedReport;
3303 // Normalise whitespace if necessary.
3304 if ((flags & LIT_NORMALIZE) > 0)
3306 dataBufferNormalize();
3309 // Return the value.
3310 return dataBufferToString();
3314 * Try reading external identifiers.
3315 * A system identifier is not required for notations.
3316 * @param inNotation Are we parsing a notation decl?
3317 * @param isSubset Parsing external subset decl (may be omitted)?
3318 * @return A three-member String array containing the identifiers,
3319 * or nulls. Order: public, system, baseURI.
3321 private ExternalIdentifiers readExternalIds(boolean inNotation,
3322 boolean isSubset)
3323 throws Exception
3325 char c;
3326 ExternalIdentifiers ids = new ExternalIdentifiers();
3327 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
3329 if (tryRead("PUBLIC"))
3331 requireWhitespace();
3332 ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags);
3333 if (inNotation)
3335 skipWhitespace();
3336 c = readCh();
3337 unread(c);
3338 if (c == '"' || c == '\'')
3340 ids.systemId = readLiteral(flags);
3343 else
3345 requireWhitespace();
3346 ids.systemId = readLiteral(flags);
3349 for (int i = 0; i < ids.publicId.length(); i++)
3351 c = ids.publicId.charAt(i);
3352 if (c >= 'a' && c <= 'z')
3354 continue;
3356 if (c >= 'A' && c <= 'Z')
3358 continue;
3360 if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1)
3362 continue;
3364 error("illegal PUBLIC id character U+"
3365 + Integer.toHexString(c));
3368 else if (tryRead("SYSTEM"))
3370 requireWhitespace();
3371 ids.systemId = readLiteral(flags);
3373 else if (!isSubset)
3375 error("missing SYSTEM or PUBLIC keyword");
3378 if (ids.systemId != null)
3380 if (ids.systemId.indexOf('#') != -1)
3382 handler.verror("SYSTEM id has a URI fragment: " + ids.systemId);
3384 ids.baseUri = handler.getSystemId();
3385 if (ids.baseUri == null && uriWarnings)
3387 handler.warn("No base URI; hope URI is absolute: "
3388 + ids.systemId);
3392 return ids;
3396 * Test if a character is whitespace.
3397 * <pre>
3398 * [3] S ::= (#x20 | #x9 | #xd | #xa)+
3399 * </pre>
3400 * @param c The character to test.
3401 * @return true if the character is whitespace.
3403 private final boolean isWhitespace(char c)
3405 if (c > 0x20)
3407 return false;
3409 if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
3411 return true;
3413 return false; // illegal ...
3416 //////////////////////////////////////////////////////////////////////
3417 // Utility routines.
3418 //////////////////////////////////////////////////////////////////////
3421 * Add a character to the data buffer.
3423 private void dataBufferAppend(char c)
3425 // Expand buffer if necessary.
3426 if (dataBufferPos >= dataBuffer.length)
3428 dataBuffer = (char[]) extendArray(dataBuffer,
3429 dataBuffer.length, dataBufferPos);
3431 dataBuffer[dataBufferPos++] = c;
3435 * Add a string to the data buffer.
3437 private void dataBufferAppend(String s)
3439 dataBufferAppend(s.toCharArray(), 0, s.length());
3443 * Append (part of) a character array to the data buffer.
3445 private void dataBufferAppend(char[] ch, int start, int length)
3447 dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3448 dataBufferPos + length);
3450 System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
3451 dataBufferPos += length;
3455 * Normalise space characters in the data buffer.
3457 private void dataBufferNormalize()
3459 int i = 0;
3460 int j = 0;
3461 int end = dataBufferPos;
3463 // Skip spaces at the start.
3464 while (j < end && dataBuffer[j] == ' ')
3466 j++;
3469 // Skip whitespace at the end.
3470 while (end > j && dataBuffer[end - 1] == ' ')
3472 end --;
3475 // Start copying to the left.
3476 while (j < end)
3479 char c = dataBuffer[j++];
3481 // Normalise all other spaces to
3482 // a single space.
3483 if (c == ' ')
3485 while (j < end && dataBuffer[j++] == ' ')
3487 continue;
3489 dataBuffer[i++] = ' ';
3490 dataBuffer[i++] = dataBuffer[j - 1];
3492 else
3494 dataBuffer[i++] = c;
3498 // The new length is <= the old one.
3499 dataBufferPos = i;
3503 * Convert the data buffer to a string.
3505 private String dataBufferToString()
3507 String s = new String(dataBuffer, 0, dataBufferPos);
3508 dataBufferPos = 0;
3509 return s;
3513 * Flush the contents of the data buffer to the handler, as
3514 * appropriate, and reset the buffer for new input.
3516 private void dataBufferFlush()
3517 throws SAXException
3519 if (currentElementContent == CONTENT_ELEMENTS
3520 && dataBufferPos > 0
3521 && !inCDATA)
3523 // We can't just trust the buffer to be whitespace, there
3524 // are (error) cases when it isn't
3525 for (int i = 0; i < dataBufferPos; i++)
3527 if (!isWhitespace(dataBuffer[i]))
3529 handler.charData(dataBuffer, 0, dataBufferPos);
3530 dataBufferPos = 0;
3533 if (dataBufferPos > 0)
3535 handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
3536 dataBufferPos = 0;
3539 else if (dataBufferPos > 0)
3541 handler.charData(dataBuffer, 0, dataBufferPos);
3542 dataBufferPos = 0;
3547 * Require a string to appear, or throw an exception.
3548 * <p><em>Precondition:</em> Entity expansion is not required.
3549 * <p><em>Precondition:</em> data buffer has no characters that
3550 * will get sent to the application.
3552 private void require(String delim)
3553 throws SAXException, IOException
3555 int length = delim.length();
3556 char[] ch;
3558 if (length < dataBuffer.length)
3560 ch = dataBuffer;
3561 delim.getChars(0, length, ch, 0);
3563 else
3565 ch = delim.toCharArray();
3568 if (USE_CHEATS && length <= (readBufferLength - readBufferPos))
3570 int offset = readBufferPos;
3572 for (int i = 0; i < length; i++, offset++)
3574 if (ch[i] != readBuffer[offset])
3576 error ("required string", null, delim);
3579 readBufferPos = offset;
3582 else
3584 for (int i = 0; i < length; i++)
3586 require(ch[i]);
3592 * Require a character to appear, or throw an exception.
3594 private void require(char delim)
3595 throws SAXException, IOException
3597 char c = readCh();
3599 if (c != delim)
3601 error("required character", c, new Character(delim).toString());
3606 * Create an interned string from a character array.
3607 * &AElig;lfred uses this method to create an interned version
3608 * of all names and name tokens, so that it can test equality
3609 * with <code>==</code> instead of <code>String.equals ()</code>.
3611 * <p>This is much more efficient than constructing a non-interned
3612 * string first, and then interning it.
3614 * @param ch an array of characters for building the string.
3615 * @param start the starting position in the array.
3616 * @param length the number of characters to place in the string.
3617 * @return an interned string.
3618 * @see #intern (String)
3619 * @see java.lang.String#intern
3621 public String intern(char[] ch, int start, int length)
3623 int index = 0;
3624 int hash = 0;
3625 Object[] bucket;
3627 // Generate a hash code. This is a widely used string hash,
3628 // often attributed to Brian Kernighan.
3629 for (int i = start; i < start + length; i++)
3631 hash = 31 * hash + ch[i];
3633 hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
3635 // Get the bucket -- consists of {array,String} pairs
3636 if ((bucket = symbolTable[hash]) == null)
3638 // first string in this bucket
3639 bucket = new Object[8];
3641 // Search for a matching tuple, and
3642 // return the string if we find one.
3644 else
3646 while (index < bucket.length)
3648 char[] chFound = (char[]) bucket[index];
3650 // Stop when we hit an empty entry.
3651 if (chFound == null)
3653 break;
3656 // If they're the same length, check for a match.
3657 if (chFound.length == length)
3659 for (int i = 0; i < chFound.length; i++)
3661 // continue search on failure
3662 if (ch[start + i] != chFound[i])
3664 break;
3666 else if (i == length - 1)
3668 // That's it, we have a match!
3669 return (String) bucket[index + 1];
3673 index += 2;
3675 // Not found -- we'll have to add it.
3677 // Do we have to grow the bucket?
3678 bucket = (Object[]) extendArray(bucket, bucket.length, index);
3680 symbolTable[hash] = bucket;
3682 // OK, add it to the end of the bucket -- "local" interning.
3683 // Intern "globally" to let applications share interning benefits.
3684 // That is, "!=" and "==" work on our strings, not just equals().
3685 String s = new String(ch, start, length).intern();
3686 bucket[index] = s.toCharArray();
3687 bucket[index + 1] = s;
3688 return s;
3692 * Ensure the capacity of an array, allocating a new one if
3693 * necessary. Usually extends only for name hash collisions.
3695 private Object extendArray(Object array, int currentSize, int requiredSize)
3697 if (requiredSize < currentSize)
3699 return array;
3701 else
3703 Object newArray = null;
3704 int newSize = currentSize * 2;
3706 if (newSize <= requiredSize)
3708 newSize = requiredSize + 1;
3711 if (array instanceof char[])
3713 newArray = new char[newSize];
3715 else if (array instanceof Object[])
3717 newArray = new Object[newSize];
3719 else
3721 throw new RuntimeException();
3724 System.arraycopy(array, 0, newArray, 0, currentSize);
3725 return newArray;
3729 //////////////////////////////////////////////////////////////////////
3730 // XML query routines.
3731 //////////////////////////////////////////////////////////////////////
3733 boolean isStandalone()
3735 return docIsStandalone;
3739 // Elements
3742 private int getContentType(ElementDecl element, int defaultType)
3744 int retval;
3746 if (element == null)
3748 return defaultType;
3750 retval = element.contentType;
3751 if (retval == CONTENT_UNDECLARED)
3753 retval = defaultType;
3755 return retval;
3759 * Look up the content type of an element.
3760 * @param name The element type name.
3761 * @return An integer constant representing the content type.
3762 * @see #CONTENT_UNDECLARED
3763 * @see #CONTENT_ANY
3764 * @see #CONTENT_EMPTY
3765 * @see #CONTENT_MIXED
3766 * @see #CONTENT_ELEMENTS
3768 public int getElementContentType(String name)
3770 ElementDecl element = (ElementDecl) elementInfo.get(name);
3771 return getContentType(element, CONTENT_UNDECLARED);
3775 * Register an element.
3776 * Array format:
3777 * [0] element type name
3778 * [1] content model (mixed, elements only)
3779 * [2] attribute hash table
3781 private void setElement(String name, int contentType,
3782 String contentModel, HashMap attributes)
3783 throws SAXException
3785 if (skippedPE)
3787 return;
3790 ElementDecl element = (ElementDecl) elementInfo.get(name);
3792 // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
3793 if (element == null)
3795 element = new ElementDecl();
3796 element.contentType = contentType;
3797 element.contentModel = contentModel;
3798 element.attributes = attributes;
3799 elementInfo.put(name, element);
3800 return;
3803 // <!ELEMENT ...> declaration?
3804 if (contentType != CONTENT_UNDECLARED)
3806 // ... following an associated <!ATTLIST ...>
3807 if (element.contentType == CONTENT_UNDECLARED)
3809 element.contentType = contentType;
3810 element.contentModel = contentModel;
3812 else
3814 // VC: Unique Element Type Declaration
3815 handler.verror("multiple declarations for element type: "
3816 + name);
3820 // first <!ATTLIST ...>, before <!ELEMENT ...> ?
3821 else if (attributes != null)
3823 element.attributes = attributes;
3828 * Look up the attribute hash table for an element.
3829 * The hash table is the second item in the element array.
3831 private HashMap getElementAttributes(String name)
3833 ElementDecl element = (ElementDecl) elementInfo.get(name);
3834 return (element == null) ? null : element.attributes;
3838 // Attributes
3842 * Get the declared attributes for an element type.
3843 * @param elname The name of the element type.
3844 * @return An iterator over all the attributes declared for
3845 * a specific element type. The results will be valid only
3846 * after the DTD (if any) has been parsed.
3847 * @see #getAttributeType
3848 * @see #getAttributeEnumeration
3849 * @see #getAttributeDefaultValueType
3850 * @see #getAttributeDefaultValue
3851 * @see #getAttributeExpandedValue
3853 private Iterator declaredAttributes(ElementDecl element)
3855 HashMap attlist;
3857 if (element == null)
3859 return null;
3861 if ((attlist = element.attributes) == null)
3863 return null;
3865 return attlist.keySet().iterator();
3869 * Get the declared attributes for an element type.
3870 * @param elname The name of the element type.
3871 * @return An iterator over all the attributes declared for
3872 * a specific element type. The results will be valid only
3873 * after the DTD (if any) has been parsed.
3874 * @see #getAttributeType
3875 * @see #getAttributeEnumeration
3876 * @see #getAttributeDefaultValueType
3877 * @see #getAttributeDefaultValue
3878 * @see #getAttributeExpandedValue
3880 public Iterator declaredAttributes(String elname)
3882 return declaredAttributes((ElementDecl) elementInfo.get(elname));
3886 * Retrieve the declared type of an attribute.
3887 * @param name The name of the associated element.
3888 * @param aname The name of the attribute.
3889 * @return An interend string denoting the type, or null
3890 * indicating an undeclared attribute.
3892 public String getAttributeType(String name, String aname)
3894 AttributeDecl attribute = getAttribute(name, aname);
3895 return (attribute == null) ? null : attribute.type;
3899 * Retrieve the allowed values for an enumerated attribute type.
3900 * @param name The name of the associated element.
3901 * @param aname The name of the attribute.
3902 * @return A string containing the token list.
3904 public String getAttributeEnumeration(String name, String aname)
3906 AttributeDecl attribute = getAttribute(name, aname);
3907 // assert: attribute.enumeration is "ENUMERATION" or "NOTATION"
3908 return (attribute == null) ? null : attribute.enumeration;
3912 * Retrieve the default value of a declared attribute.
3913 * @param name The name of the associated element.
3914 * @param aname The name of the attribute.
3915 * @return The default value, or null if the attribute was
3916 * #IMPLIED or simply undeclared and unspecified.
3917 * @see #getAttributeExpandedValue
3919 public String getAttributeDefaultValue(String name, String aname)
3921 AttributeDecl attribute = getAttribute(name, aname);
3922 return (attribute == null) ? null : attribute.value;
3927 // FIXME: Leaving this in, until W3C finally resolves the confusion
3928 // between parts of the XML 2nd REC about when entity declarations
3929 // are guaranteed to be known. Current code matches what section 5.1
3930 // (conformance) describes, but some readings of the self-contradicting
3931 // text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that
3932 // attribute expansion/normalization must be deferred in some cases
3933 // (just TRY to identify them!).
3935 * Retrieve the expanded value of a declared attribute.
3936 * <p>General entities (and char refs) will be expanded (once).
3937 * @param name The name of the associated element.
3938 * @param aname The name of the attribute.
3939 * @return The expanded default value, or null if the attribute was
3940 * #IMPLIED or simply undeclared
3941 * @see #getAttributeDefaultValue
3942 public String getAttributeExpandedValue (String name, String aname)
3943 throws Exception
3945 AttributeDecl attribute = getAttribute (name, aname);
3947 if (attribute == null) {
3948 return null;
3949 } else if (attribute.defaultValue == null && attribute.value != null) {
3950 // we MUST use the same buf for both quotes else the literal
3951 // can't be properly terminated
3952 char buf [] = new char [1];
3953 int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
3954 String type = getAttributeType (name, aname);
3956 if (type != "CDATA" && type != null)
3957 flags |= LIT_NORMALIZE;
3958 buf [0] = '"';
3959 pushCharArray (null, buf, 0, 1);
3960 pushString (null, attribute.value);
3961 pushCharArray (null, buf, 0, 1);
3962 attribute.defaultValue = readLiteral (flags);
3964 return attribute.defaultValue;
3969 * Retrieve the default value mode of a declared attribute.
3970 * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3971 * @see #ATTRIBUTE_DEFAULT_IMPLIED
3972 * @see #ATTRIBUTE_DEFAULT_REQUIRED
3973 * @see #ATTRIBUTE_DEFAULT_FIXED
3975 public int getAttributeDefaultValueType(String name, String aname)
3977 AttributeDecl attribute = getAttribute(name, aname);
3978 return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED :
3979 attribute.valueType;
3983 * Register an attribute declaration for later retrieval.
3984 * Format:
3985 * - String type
3986 * - String default value
3987 * - int value type
3988 * - enumeration
3989 * - processed default value
3991 private void setAttribute(String elName, String name, String type,
3992 String enumeration, String value, int valueType)
3993 throws Exception
3995 HashMap attlist;
3997 if (skippedPE)
3999 return;
4002 // Create a new hashtable if necessary.
4003 attlist = getElementAttributes(elName);
4004 if (attlist == null)
4006 attlist = new HashMap();
4009 // ignore multiple attribute declarations!
4010 if (attlist.get(name) != null)
4012 // warn ...
4013 return;
4015 else
4017 AttributeDecl attribute = new AttributeDecl();
4018 attribute.type = type;
4019 attribute.value = value;
4020 attribute.valueType = valueType;
4021 attribute.enumeration = enumeration;
4022 attlist.put(name, attribute);
4024 // save; but don't overwrite any existing <!ELEMENT ...>
4025 setElement(elName, CONTENT_UNDECLARED, null, attlist);
4030 * Retrieve the attribute declaration for the given element name and name.
4032 private AttributeDecl getAttribute(String elName, String name)
4034 HashMap attlist = getElementAttributes(elName);
4035 return (attlist == null) ? null : (AttributeDecl) attlist.get(name);
4039 // Entities
4043 * Find the type of an entity.
4044 * @returns An integer constant representing the entity type.
4045 * @see #ENTITY_UNDECLARED
4046 * @see #ENTITY_INTERNAL
4047 * @see #ENTITY_NDATA
4048 * @see #ENTITY_TEXT
4050 public int getEntityType(String ename)
4052 EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4053 return (entity == null) ? ENTITY_UNDECLARED : entity.type;
4057 * Return an external entity's identifiers.
4058 * @param ename The name of the external entity.
4059 * @return The entity's public identifier, system identifier, and base URI.
4060 * Null if the entity was not declared as an external entity.
4061 * @see #getEntityType
4063 public ExternalIdentifiers getEntityIds(String ename)
4065 EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4066 return (entity == null) ? null : entity.ids;
4070 * Return an internal entity's replacement text.
4071 * @param ename The name of the internal entity.
4072 * @return The entity's replacement text, or null if
4073 * the entity was not declared as an internal entity.
4074 * @see #getEntityType
4076 public String getEntityValue(String ename)
4078 EntityInfo entity = (EntityInfo) entityInfo.get(ename);
4079 return (entity == null) ? null : entity.value;
4083 * Register an entity declaration for later retrieval.
4085 private void setInternalEntity(String eName, String value)
4086 throws SAXException
4088 if (skippedPE)
4090 return;
4093 if (entityInfo.get(eName) == null)
4095 EntityInfo entity = new EntityInfo();
4096 entity.type = ENTITY_INTERNAL;
4097 entity.value = value;
4098 entityInfo.put(eName, entity);
4100 if (handler.stringInterning)
4102 if ("lt" == eName || "gt" == eName || "quot" == eName
4103 || "apos" == eName || "amp" == eName)
4105 return;
4108 else
4110 if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName)
4111 || "apos".equals(eName) || "amp".equals(eName))
4113 return;
4116 handler.getDeclHandler().internalEntityDecl(eName, value);
4120 * Register an external entity declaration for later retrieval.
4122 private void setExternalEntity(String eName, int eClass,
4123 ExternalIdentifiers ids, String nName)
4125 if (entityInfo.get(eName) == null)
4127 EntityInfo entity = new EntityInfo();
4128 entity.type = eClass;
4129 entity.ids = ids;
4130 entity.notationName = nName;
4131 entityInfo.put(eName, entity);
4136 // Notations.
4140 * Report a notation declaration, checking for duplicates.
4142 private void setNotation(String nname, ExternalIdentifiers ids)
4143 throws SAXException
4145 if (skippedPE)
4147 return;
4150 handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri);
4151 if (notationInfo.get(nname) == null)
4153 notationInfo.put(nname, nname);
4155 else
4157 // VC: Unique Notation Name
4158 handler.verror("Duplicate notation name decl: " + nname);
4163 // Location.
4167 * Return the current line number.
4169 public int getLineNumber()
4171 return line;
4175 * Return the current column number.
4177 public int getColumnNumber()
4179 return column;
4182 //////////////////////////////////////////////////////////////////////
4183 // High-level I/O.
4184 //////////////////////////////////////////////////////////////////////
/**
 * Read a single character from the readBuffer.
 * <p>The readDataChunk () method maintains the buffer.
 * <p>If we hit the end of an entity, try to pop the stack and
 * keep going.
 * <p> (This approach doesn't really enforce XML's rules about
 * entity boundaries, but this is not currently a validating
 * parser).
 * <p>This routine also attempts to keep track of the current
 * position in external entities, but it's not entirely accurate.
 * <p>A '%' read while <code>expandPE</code> is set is not returned:
 * the parameter entity reference is parsed and the next character
 * is read instead.
 * @return The next available input character.
 * @see #unread (char)
 * @see #readDataChunk
 * @see #readBuffer
 * @see #line
 */
private char readCh()
  throws SAXException, IOException {
  // As long as there's nothing in the
  // read buffer, try reading more data
  // (for an external entity) or popping
  // the entity stack (for either).
  while (readBufferPos >= readBufferLength) {
    switch (sourceType) {
    case INPUT_READER:
    case INPUT_STREAM:
      readDataChunk();
      // Still empty: this source is exhausted, so fall back to the
      // previous one (popInput throws once the stack is empty).
      while (readBufferLength < 1) {
        popInput();
        if (readBufferLength < 1) {
          readDataChunk();
        }
      }
      break;

    default:

      popInput();
      break;
    }
  }

  char c = readBuffer[readBufferPos++];

  if (c == '\n') {
    // New line: bump the line counter and restart the column count.
    line++;
    column = 0;
  } else {
    if (c == '<') {
      /* the most common return to parseContent () ... NOP */
    } else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
               || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085)
                   && xmlVersion == XML_11)) {
      // Control characters other than TAB/CR, anything above U+FFFD,
      // and (for XML 1.1 only) U+007F..U+009F except U+0085.
      error("illegal XML character U+" + Integer.toHexString(c));
    }

    // If we're in the DTD and in a context where PEs get expanded,
    // do so ... 1/14/2000 errata identify those contexts.  There
    // are also spots in the internal subset where PE refs are fatal
    // errors, hence yet another flag.
    else if (c == '%' && expandPE) {
      if (peIsError) {
        error("PE reference within decl in internal subset.");
      }
      parsePEReference();
      // Continue with whatever input parsePEReference() left current.
      return readCh();
    }
    column++;
  }

  return c;
}
4274 * Push a single character back onto the current input stream.
4275 * <p>This method usually pushes the character back onto
4276 * the readBuffer.
4277 * <p>I don't think that this would ever be called with
4278 * readBufferPos = 0, because the methods always reads a character
4279 * before unreading it, but just in case, I've added a boundary
4280 * condition.
4281 * @param c The character to push back.
4282 * @see #readCh
4283 * @see #unread (char[])
4284 * @see #readBuffer
4286 private void unread(char c)
4287 throws SAXException
4289 // Normal condition.
4290 if (c == '\n')
4292 line--;
4293 column = -1;
4295 if (readBufferPos > 0)
4297 readBuffer[--readBufferPos] = c;
4299 else
4301 pushString(null, new Character(c).toString());
4306 * Push a char array back onto the current input stream.
4307 * <p>NOTE: you must <em>never</em> push back characters that you
4308 * haven't actually read: use pushString () instead.
4309 * @see #readCh
4310 * @see #unread (char)
4311 * @see #readBuffer
4312 * @see #pushString
4314 private void unread(char[] ch, int length)
4315 throws SAXException
4317 for (int i = 0; i < length; i++)
4319 if (ch[i] == '\n')
4321 line--;
4322 column = -1;
4325 if (length < readBufferPos)
4327 readBufferPos -= length;
4329 else
4331 pushCharArray(null, ch, 0, length);
4336 * Push, or skip, a new external input source.
4337 * The source will be some kind of parsed entity, such as a PE
4338 * (including the external DTD subset) or content for the body.
4340 * @param url The java.net.URL object for the entity.
4341 * @see SAXDriver#resolveEntity
4342 * @see #pushString
4343 * @see #sourceType
4344 * @see #pushInput
4345 * @see #detectEncoding
4346 * @see #sourceType
4347 * @see #readBuffer
4349 private void pushURL(boolean isPE,
4350 String ename,
4351 ExternalIdentifiers ids,
4352 Reader reader,
4353 InputStream stream,
4354 String encoding,
4355 boolean doResolve)
4356 throws SAXException, IOException
4358 boolean ignoreEncoding;
4359 String systemId;
4360 InputSource source;
4362 if (!isPE)
4364 dataBufferFlush();
4367 scratch.setPublicId(ids.publicId);
4368 scratch.setSystemId(ids.systemId);
4370 // See if we should skip or substitute the entity.
4371 // If we're not skipping, resolving reports startEntity()
4372 // and updates the (handler's) stack of URIs.
4373 if (doResolve)
4375 // assert (stream == null && reader == null && encoding == null)
4376 source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri);
4377 if (source == null)
4379 handler.warn("skipping entity: " + ename);
4380 handler.skippedEntity(ename);
4381 if (isPE)
4383 skippedPE = true;
4385 return;
4388 // we might be using alternate IDs/encoding
4389 systemId = source.getSystemId();
4390 // The following warning and setting systemId was deleted bcause
4391 // the application has the option of not setting systemId
4392 // provided that it has set the characte/byte stream.
4394 if (systemId == null) {
4395 handler.warn ("missing system ID, using " + ids.systemId);
4396 systemId = ids.systemId;
4400 else
4402 // "[document]", or "[dtd]" via getExternalSubset()
4403 scratch.setCharacterStream(reader);
4404 scratch.setByteStream(stream);
4405 scratch.setEncoding(encoding);
4406 source = scratch;
4407 systemId = ids.systemId;
4408 if (handler.stringInterning)
4410 handler.startExternalEntity(ename, systemId,
4411 "[document]" == ename);
4413 else
4415 handler.startExternalEntity(ename, systemId,
4416 "[document]".equals(ename));
4420 // we may have been given I/O streams directly
4421 if (source.getCharacterStream() != null)
4423 if (source.getByteStream() != null)
4424 error("InputSource has two streams!");
4425 reader = source.getCharacterStream();
4427 else if (source.getByteStream() != null)
4429 encoding = source.getEncoding();
4430 if (encoding == null)
4432 stream = source.getByteStream();
4434 else
4438 reader = new InputStreamReader(source.getByteStream(),
4439 encoding);
4441 catch (IOException e)
4443 stream = source.getByteStream();
4447 else if (systemId == null)
4449 error("InputSource has no URI!");
4451 scratch.setCharacterStream(null);
4452 scratch.setByteStream(null);
4453 scratch.setEncoding(null);
4455 // Push the existing status.
4456 pushInput(ename);
4458 // Create a new read buffer.
4459 // (Note the four-character margin)
4460 readBuffer = new char[READ_BUFFER_MAX + 4];
4461 readBufferPos = 0;
4462 readBufferLength = 0;
4463 readBufferOverflow = -1;
4464 is = null;
4465 line = 1;
4466 column = 0;
4467 currentByteCount = 0;
4469 // If there's an explicit character stream, just
4470 // ignore encoding declarations.
4471 if (reader != null)
4473 sourceType = INPUT_READER;
4474 this.reader = reader;
4475 tryEncodingDecl(true);
4476 return;
4479 // Else we handle the conversion, and need to ensure
4480 // it's done right.
4481 sourceType = INPUT_STREAM;
4482 if (stream != null)
4484 is = stream;
4486 else
4488 // We have to open our own stream to the URL.
4489 URL url = new URL(systemId);
4491 externalEntity = url.openConnection();
4492 externalEntity.connect();
4493 is = externalEntity.getInputStream();
4496 // If we get to here, there must be
4497 // an InputStream available.
4498 if (!is.markSupported())
4500 is = new BufferedInputStream(is);
4503 // Get any external encoding label.
4504 if (encoding == null && externalEntity != null)
4506 // External labels can be untrustworthy; filesystems in
4507 // particular often have the wrong default for content
4508 // that wasn't locally originated. Those we autodetect.
4509 if (!"file".equals(externalEntity.getURL().getProtocol()))
4511 int temp;
4513 // application/xml;charset=something;otherAttr=...
4514 // ... with many variants on 'something'
4515 encoding = externalEntity.getContentType();
4517 // MHK code (fix for Saxon 5.5.1/007):
4518 // protect against encoding==null
4519 if (encoding == null)
4521 temp = -1;
4523 else
4525 temp = encoding.indexOf("charset");
4528 // RFC 2376 sez MIME text defaults to ASCII, but since the
4529 // JDK will create a MIME type out of thin air, we always
4530 // autodetect when there's no explicit charset attribute.
4531 if (temp < 0)
4533 encoding = null; // autodetect
4535 else
4537 // only this one attribute
4538 if ((temp = encoding.indexOf(';')) > 0)
4540 encoding = encoding.substring(0, temp);
4543 if ((temp = encoding.indexOf('=', temp + 7)) > 0)
4545 encoding = encoding.substring(temp + 1);
4547 // attributes can have comment fields (RFC 822)
4548 if ((temp = encoding.indexOf('(')) > 0)
4550 encoding = encoding.substring(0, temp);
4552 // ... and values may be quoted
4553 if ((temp = encoding.indexOf('"')) > 0)
4555 encoding =
4556 encoding.substring(temp + 1,
4557 encoding.indexOf('"', temp + 2));
4559 encoding = encoding.trim();
4561 else
4563 handler.warn("ignoring illegal MIME attribute: "
4564 + encoding);
4565 encoding = null;
4571 // if we got an external encoding label, use it ...
4572 if (encoding != null)
4574 this.encoding = ENCODING_EXTERNAL;
4575 setupDecoding(encoding);
4576 ignoreEncoding = true;
4578 // ... else autodetect from first bytes.
4580 else
4582 detectEncoding();
4583 ignoreEncoding = false;
4586 // Read any XML or text declaration.
4587 // If we autodetected, it may tell us the "real" encoding.
4590 tryEncodingDecl(ignoreEncoding);
4592 catch (UnsupportedEncodingException x)
4594 encoding = x.getMessage();
4596 // if we don't handle the declared encoding,
4597 // try letting a JVM InputStreamReader do it
4600 if (sourceType != INPUT_STREAM)
4602 throw x;
4605 is.reset();
4606 readBufferPos = 0;
4607 readBufferLength = 0;
4608 readBufferOverflow = -1;
4609 line = 1;
4610 currentByteCount = column = 0;
4612 sourceType = INPUT_READER;
4613 this.reader = new InputStreamReader(is, encoding);
4614 is = null;
4616 tryEncodingDecl(true);
4619 catch (IOException e)
4621 error("unsupported text encoding",
4622 encoding,
4623 null);
4629 * Check for an encoding declaration. This is the second part of the
4630 * XML encoding autodetection algorithm, relying on detectEncoding to
4631 * get to the point that this part can read any encoding declaration
4632 * in the document (using only US-ASCII characters).
4634 * <p> Because this part starts to fill parser buffers with this data,
4635 * it's tricky to setup a reader so that Java's built-in decoders can be
4636 * used for the character encodings that aren't built in to this parser
4637 * (such as EUC-JP, KOI8-R, Big5, etc).
4639 * @return any encoding in the declaration, uppercased; or null
4640 * @see detectEncoding
4642 private String tryEncodingDecl(boolean ignoreEncoding)
4643 throws SAXException, IOException
4645 // Read the XML/text declaration.
4646 if (tryRead("<?xml"))
4648 if (tryWhitespace())
4650 if (inputStack.size() > 0)
4652 return parseTextDecl(ignoreEncoding);
4654 else
4656 return parseXMLDecl(ignoreEncoding);
4659 else
4661 // <?xml-stylesheet ...?> or similar
4662 unread('l');
4663 unread('m');
4664 unread('x');
4665 unread('?');
4666 unread('<');
4669 return null;
4673 * Attempt to detect the encoding of an entity.
4674 * <p>The trick here (as suggested in the XML standard) is that
4675 * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
4676 * <b>must</b> begin with an XML declaration or an encoding
4677 * declaration; we simply have to look for "&lt;?xml" in various
4678 * encodings.
4679 * <p>This method has no way to distinguish among 8-bit encodings.
4680 * Instead, it sets up for UTF-8, then (possibly) revises its assumption
4681 * later in setupDecoding (). Any ASCII-derived 8-bit encoding
4682 * should work, but most will be rejected later by setupDecoding ().
4683 * @see #tryEncoding (byte[], byte, byte, byte, byte)
4684 * @see #tryEncoding (byte[], byte, byte)
4685 * @see #setupDecoding
4687 private void detectEncoding()
4688 throws SAXException, IOException
4690 byte[] signature = new byte[4];
4692 // Read the first four bytes for
4693 // autodetection.
4694 is.mark(4);
4695 is.read(signature);
4696 is.reset();
4699 // FIRST: four byte encodings (who uses these?)
4701 if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4702 (byte) 0x00, (byte) 0x3c))
4704 // UCS-4 must begin with "<?xml"
4705 // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
4706 // "UTF-32BE"
4707 encoding = ENCODING_UCS_4_1234;
4709 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4710 (byte) 0x00, (byte) 0x00))
4712 // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
4713 // "UTF-32LE"
4714 encoding = ENCODING_UCS_4_4321;
4716 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4717 (byte) 0x3c, (byte) 0x00))
4719 // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
4720 encoding = ENCODING_UCS_4_2143;
4722 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4723 (byte) 0x00, (byte) 0x00))
4725 // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
4726 encoding = ENCODING_UCS_4_3412;
4728 // 00 00 fe ff UCS_4_1234 (with BOM)
4729 // ff fe 00 00 UCS_4_4321 (with BOM)
4733 // SECOND: two byte encodings
4734 // note ... with 1/14/2000 errata the XML spec identifies some
4735 // more "broken UTF-16" autodetection cases, with no XML decl,
4736 // which we don't handle here (that's legal too).
4738 else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff))
4740 // UCS-2 with a byte-order marker. (UTF-16)
4741 // 0xfe 0xff: UCS-2, big-endian (12)
4742 encoding = ENCODING_UCS_2_12;
4743 is.read(); is.read();
4745 else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe))
4747 // UCS-2 with a byte-order marker. (UTF-16)
4748 // 0xff 0xfe: UCS-2, little-endian (21)
4749 encoding = ENCODING_UCS_2_21;
4750 is.read(); is.read();
4752 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4753 (byte) 0x00, (byte) 0x3f))
4755 // UTF-16BE (otherwise, malformed UTF-16)
4756 // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
4757 encoding = ENCODING_UCS_2_12;
4758 error("no byte-order mark for UCS-2 entity");
4760 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4761 (byte) 0x3f, (byte) 0x00))
4763 // UTF-16LE (otherwise, malformed UTF-16)
4764 // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
4765 encoding = ENCODING_UCS_2_21;
4766 error("no byte-order mark for UCS-2 entity");
4770 // THIRD: ASCII-derived encodings, fixed and variable lengths
4772 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f,
4773 (byte) 0x78, (byte) 0x6d))
4775 // ASCII derived
4776 // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
4777 encoding = ENCODING_UTF_8;
4778 prefetchASCIIEncodingDecl();
4780 else if (signature[0] == (byte) 0xef
4781 && signature[1] == (byte) 0xbb
4782 && signature[2] == (byte) 0xbf)
4784 // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text)
4785 // this un-needed notion slipped into XML 2nd ed through a
4786 // "non-normative" erratum; now required by MSFT and UDDI,
4787 // and E22 made it normative.
4788 encoding = ENCODING_UTF_8;
4789 is.read(); is.read(); is.read();
4791 else
4793 // 4c 6f a7 94 ... we don't understand EBCDIC flavors
4794 // ... but we COULD at least kick in some fixed code page
4796 // (default) UTF-8 without encoding/XML declaration
4797 encoding = ENCODING_UTF_8;
4802 * Check for a four-byte signature.
4803 * <p>Utility routine for detectEncoding ().
4804 * <p>Always looks for some part of "<?XML" in a specific encoding.
4805 * @param sig The first four bytes read.
4806 * @param b1 The first byte of the signature
4807 * @param b2 The second byte of the signature
4808 * @param b3 The third byte of the signature
4809 * @param b4 The fourth byte of the signature
4810 * @see #detectEncoding
4812 private static boolean tryEncoding(byte[] sig, byte b1, byte b2,
4813 byte b3, byte b4)
4815 return (sig[0] == b1 && sig[1] == b2
4816 && sig[2] == b3 && sig[3] == b4);
4820 * Check for a two-byte signature.
4821 * <p>Looks for a UCS-2 byte-order mark.
4822 * <p>Utility routine for detectEncoding ().
4823 * @param sig The first four bytes read.
4824 * @param b1 The first byte of the signature
4825 * @param b2 The second byte of the signature
4826 * @see #detectEncoding
4828 private static boolean tryEncoding(byte[] sig, byte b1, byte b2)
4830 return ((sig[0] == b1) && (sig[1] == b2));
4834 * This method pushes a string back onto input.
4835 * <p>It is useful either as the expansion of an internal entity,
4836 * or for backtracking during the parse.
4837 * <p>Call pushCharArray () to do the actual work.
4838 * @param s The string to push back onto input.
4839 * @see #pushCharArray
4841 private void pushString(String ename, String s)
4842 throws SAXException
4844 char[] ch = s.toCharArray();
4845 pushCharArray(ename, ch, 0, ch.length);
4849 * Push a new internal input source.
4850 * <p>This method is useful for expanding an internal entity,
4851 * or for unreading a string of characters. It creates a new
4852 * readBuffer containing the characters in the array, instead
4853 * of characters converted from an input byte stream.
4854 * @param ch The char array to push.
4855 * @see #pushString
4856 * @see #pushURL
4857 * @see #readBuffer
4858 * @see #sourceType
4859 * @see #pushInput
4861 private void pushCharArray(String ename, char[] ch, int start, int length)
4862 throws SAXException
4864 // Push the existing status
4865 pushInput(ename);
4866 if (ename != null && doReport)
4868 dataBufferFlush();
4869 handler.startInternalEntity(ename);
4871 sourceType = INPUT_INTERNAL;
4872 readBuffer = ch;
4873 readBufferPos = start;
4874 readBufferLength = length;
4875 readBufferOverflow = -1;
4879 * Save the current input source onto the stack.
4880 * <p>This method saves all of the global variables associated with
4881 * the current input source, so that they can be restored when a new
4882 * input source has finished. It also tests for entity recursion.
4883 * <p>The method saves the following global variables onto a stack
4884 * using a fixed-length array:
4885 * <ol>
4886 * <li>sourceType
4887 * <li>externalEntity
4888 * <li>readBuffer
4889 * <li>readBufferPos
4890 * <li>readBufferLength
4891 * <li>line
4892 * <li>encoding
4893 * </ol>
4894 * @param ename The name of the entity (if any) causing the new input.
4895 * @see #popInput
4896 * @see #sourceType
4897 * @see #externalEntity
4898 * @see #readBuffer
4899 * @see #readBufferPos
4900 * @see #readBufferLength
4901 * @see #line
4902 * @see #encoding
4904 private void pushInput(String ename)
4905 throws SAXException
4907 // Check for entity recursion.
4908 if (ename != null)
4910 Iterator entities = entityStack.iterator();
4911 while (entities.hasNext())
4913 String e = (String) entities.next();
4914 if (e != null && e == ename)
4916 error("recursive reference to entity", ename, null);
4920 entityStack.addLast(ename);
4922 // Don't bother if there is no current input.
4923 if (sourceType == INPUT_NONE)
4925 return;
4928 // Set up a snapshot of the current
4929 // input source.
4930 Input input = new Input();
4932 input.sourceType = sourceType;
4933 input.externalEntity = externalEntity;
4934 input.readBuffer = readBuffer;
4935 input.readBufferPos = readBufferPos;
4936 input.readBufferLength = readBufferLength;
4937 input.line = line;
4938 input.encoding = encoding;
4939 input.readBufferOverflow = readBufferOverflow;
4940 input.is = is;
4941 input.currentByteCount = currentByteCount;
4942 input.column = column;
4943 input.reader = reader;
4945 // Push it onto the stack.
4946 inputStack.addLast(input);
4950 * Restore a previous input source.
4951 * <p>This method restores all of the global variables associated with
4952 * the current input source.
4953 * @exception java.io.EOFException
4954 * If there are no more entries on the input stack.
4955 * @see #pushInput
4956 * @see #sourceType
4957 * @see #externalEntity
4958 * @see #readBuffer
4959 * @see #readBufferPos
4960 * @see #readBufferLength
4961 * @see #line
4962 * @see #encoding
4964 private void popInput()
4965 throws SAXException, IOException
4967 String ename = (String) entityStack.removeLast();
4969 if (ename != null && doReport)
4971 dataBufferFlush();
4973 switch (sourceType)
4975 case INPUT_STREAM:
4976 handler.endExternalEntity(ename);
4977 is.close();
4978 break;
4979 case INPUT_READER:
4980 handler.endExternalEntity(ename);
4981 reader.close();
4982 break;
4983 case INPUT_INTERNAL:
4984 if (ename != null && doReport)
4986 handler.endInternalEntity(ename);
4988 break;
4991 // Throw an EOFException if there
4992 // is nothing else to pop.
4993 if (inputStack.isEmpty())
4995 throw new EOFException("no more input");
4998 Input input = (Input) inputStack.removeLast();
5000 sourceType = input.sourceType;
5001 externalEntity = input.externalEntity;
5002 readBuffer = input.readBuffer;
5003 readBufferPos = input.readBufferPos;
5004 readBufferLength = input.readBufferLength;
5005 line = input.line;
5006 encoding = input.encoding;
5007 readBufferOverflow = input.readBufferOverflow;
5008 is = input.is;
5009 currentByteCount = input.currentByteCount;
5010 column = input.column;
5011 reader = input.reader;
5015 * Return true if we can read the expected character.
5016 * <p>Note that the character will be removed from the input stream
5017 * on success, but will be put back on failure. Do not attempt to
5018 * read the character again if the method succeeds.
5019 * @param delim The character that should appear next. For a
5020 * insensitive match, you must supply this in upper-case.
5021 * @return true if the character was successfully read, or false if
5022 * it was not.
5023 * @see #tryRead (String)
5025 private boolean tryRead(char delim)
5026 throws SAXException, IOException
5028 char c;
5030 // Read the character
5031 c = readCh();
5033 // Test for a match, and push the character
5034 // back if the match fails.
5035 if (c == delim)
5037 return true;
5039 else
5041 unread(c);
5042 return false;
5047 * Return true if we can read the expected string.
5048 * <p>This is simply a convenience method.
5049 * <p>Note that the string will be removed from the input stream
5050 * on success, but will be put back on failure. Do not attempt to
5051 * read the string again if the method succeeds.
5052 * <p>This method will push back a character rather than an
5053 * array whenever possible (probably the majority of cases).
5054 * @param delim The string that should appear next.
5055 * @return true if the string was successfully read, or false if
5056 * it was not.
5057 * @see #tryRead (char)
5059 private boolean tryRead(String delim)
5060 throws SAXException, IOException
5062 return tryRead(delim.toCharArray());
5065 private boolean tryRead(char[] ch)
5066 throws SAXException, IOException
5068 char c;
5070 // Compare the input, character-
5071 // by character.
5073 for (int i = 0; i < ch.length; i++)
5075 c = readCh();
5076 if (c != ch[i])
5078 unread(c);
5079 if (i != 0)
5081 unread(ch, i);
5083 return false;
5086 return true;
5090 * Return true if we can read some whitespace.
5091 * <p>This is simply a convenience method.
5092 * <p>This method will push back a character rather than an
5093 * array whenever possible (probably the majority of cases).
5094 * @return true if whitespace was found.
5096 private boolean tryWhitespace()
5097 throws SAXException, IOException
5099 char c;
5100 c = readCh();
5101 if (isWhitespace(c))
5103 skipWhitespace();
5104 return true;
5106 else
5108 unread(c);
5109 return false;
5114 * Read all data until we find the specified string.
5115 * This is useful for scanning CDATA sections and PIs.
5116 * <p>This is inefficient right now, since it calls tryRead ()
5117 * for every character.
5118 * @param delim The string delimiter
5119 * @see #tryRead (String, boolean)
5120 * @see #readCh
5122 private void parseUntil(String delim)
5123 throws SAXException, IOException
5125 parseUntil(delim.toCharArray());
5128 private void parseUntil(char[] delim)
5129 throws SAXException, IOException
5131 char c;
5132 int startLine = line;
5136 while (!tryRead(delim))
5138 c = readCh();
5139 dataBufferAppend(c);
5142 catch (EOFException e)
5144 error("end of input while looking for delimiter "
5145 + "(started on line " + startLine
5146 + ')', null, new String(delim));
5150 //////////////////////////////////////////////////////////////////////
5151 // Low-level I/O.
5152 //////////////////////////////////////////////////////////////////////
5155 * Prefetch US-ASCII XML/text decl from input stream into read buffer.
5156 * Doesn't buffer more than absolutely needed, so that when an encoding
5157 * decl says we need to create an InputStreamReader, we can discard our
5158 * buffer and reset(). Caller knows the first chars of the decl exist
5159 * in the input stream.
5161 private void prefetchASCIIEncodingDecl()
5162 throws SAXException, IOException
5164 int ch;
5165 readBufferPos = readBufferLength = 0;
5167 is.mark(readBuffer.length);
5168 while (true)
5170 ch = is.read();
5171 readBuffer[readBufferLength++] = (char) ch;
5172 switch (ch)
5174 case (int) '>':
5175 return;
5176 case -1:
5177 error("file ends before end of XML or encoding declaration.",
5178 null, "?>");
5180 if (readBuffer.length == readBufferLength)
5182 error("unfinished XML or encoding declaration");
/**
 * Read a chunk of data from an external input source.
 * <p>For a character stream the readBuffer is filled directly.  For a
 * byte stream this is simply a front-end that fills the rawReadBuffer
 * with bytes, then calls the appropriate encoding handler.
 * <p>A carriage return held over from the previous chunk
 * (readBufferOverflow) is placed at the front of the buffer first.
 * At end of input readBufferLength is left at 0, unless a held-over
 * CR is still to be delivered.
 * @see #encoding
 * @see #rawReadBuffer
 * @see #readBuffer
 * @see #filterCR
 * @see #copyUtf8ReadBuffer
 * @see #copyIso8859_1ReadBuffer
 * @see #copyUcs2ReadBuffer
 * @see #copyUcs4ReadBuffer
 */
private void readDataChunk()
  throws SAXException, IOException {
  int count;

  // See if we have any overflow (filterCR sets for CR at end)
  if (readBufferOverflow > -1) {
    readBuffer[0] = (char) readBufferOverflow;
    readBufferOverflow = -1;
    // New data goes after the held-over character.
    readBufferPos = 1;
    sawCR = true;
  } else {
    readBufferPos = 0;
    sawCR = false;
  }

  // input from a character stream.
  if (sourceType == INPUT_READER) {
    count = reader.read(readBuffer,
                        readBufferPos, READ_BUFFER_MAX - readBufferPos);
    if (count < 0) {
      // End of stream: only the held-over character (if any) remains.
      readBufferLength = readBufferPos;
    } else {
      readBufferLength = readBufferPos + count;
    }
    if (readBufferLength > 0) {
      // count >= 0 means more data may follow from this reader.
      filterCR(count >= 0);
    }
    sawCR = false;
    return;
  }

  // Read as many bytes as possible into the raw buffer.
  count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);

  // Dispatch to an encoding-specific reader method to populate
  // the readBuffer.  In most parser speed profiles, these routines
  // show up at the top of the CPU usage chart.
  if (count > 0) {
    switch (encoding) {
    // one byte builtins
    case ENCODING_ASCII:
      copyIso8859_1ReadBuffer(count, (char) 0x0080);
      break;
    case ENCODING_UTF_8:
      copyUtf8ReadBuffer(count);
      break;
    case ENCODING_ISO_8859_1:
      copyIso8859_1ReadBuffer(count, (char) 0);
      break;

    // two byte builtins
    case ENCODING_UCS_2_12:
      copyUcs2ReadBuffer(count, 8, 0);
      break;
    case ENCODING_UCS_2_21:
      copyUcs2ReadBuffer(count, 0, 8);
      break;

    // four byte builtins
    case ENCODING_UCS_4_1234:
      copyUcs4ReadBuffer(count, 24, 16, 8, 0);
      break;
    case ENCODING_UCS_4_4321:
      copyUcs4ReadBuffer(count, 0, 8, 16, 24);
      break;
    case ENCODING_UCS_4_2143:
      copyUcs4ReadBuffer(count, 16, 24, 0, 8);
      break;
    case ENCODING_UCS_4_3412:
      copyUcs4ReadBuffer(count, 8, 0, 24, 16);
      break;
    }
  } else {
    // Nothing was read: only the held-over character (if any) remains.
    readBufferLength = readBufferPos;
  }

  // Reading resumes from the start of the buffer, which includes any
  // held-over character stored at index 0.
  readBufferPos = 0;

  // Filter out all carriage returns if we've seen any
  // (including any saved from a previous read)
  if (sawCR) {
    filterCR(count >= 0);
    sawCR = false;

    // must actively report EOF, lest some CRs get lost.
    if (readBufferLength == 0 && count >= 0) {
      readDataChunk();
    }
  }

  // Only bytes actually read by this call are counted.
  if (count > 0) {
    currentByteCount += count;
  }
}
/**
 * Filter carriage returns in the read buffer.
 * CRLF becomes LF; CR becomes LF.
 * <p>The buffer is compacted in place from readBufferPos onwards and
 * readBufferLength is updated to the new end.  A CR in the last slot
 * cannot be classified yet when more data may follow; it is removed
 * from the buffer and stored in readBufferOverflow instead.
 * @param moreData true iff more data might come from the same source
 * @see #readDataChunk
 * @see #readBuffer
 * @see #readBufferOverflow
 */
private void filterCR(boolean moreData) {
  // i is the write index, j the read index; i never overtakes j.
  int i, j;

  readBufferOverflow = -1;

loop:
  for (i = j = readBufferPos; j < readBufferLength; i++, j++) {
    switch (readBuffer[j]) {
    case '\r':
      if (j == readBufferLength - 1) {
        if (moreData) {
          // Defer the decision until the next chunk is available.
          readBufferOverflow = '\r';
          readBufferLength--;
        } else { // CR at end of buffer
          readBuffer[i++] = '\n';
        }
        break loop;
      } else if (readBuffer[j + 1] == '\n') {
        // CRLF: skip the CR; the LF slot is consumed by this step.
        j++;
      }
      readBuffer[i] = '\n';
      break;

    case '\n':
    default:
      readBuffer[i] = readBuffer[j];
      break;
    }
  }
  readBufferLength = i;
}
/**
 * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
 * <p>When readDataChunk () calls this method, the raw bytes are in
 * rawReadBuffer, and the final characters will appear in
 * readBuffer, starting at readBufferPos.  readBufferLength is set to
 * the index after the last character written.
 * <p>Note that as of Unicode 3.1, good practice became a requirement,
 * so that each Unicode character has exactly one UTF-8 representation.
 * Overlong two-, three- and four-byte forms, encoded surrogates, and
 * values above U+10FFFF are reported through encodingError ().
 * <p>sawCR is set whenever a carriage return is stored, so that the
 * caller knows to run filterCR ().
 * @param count The number of bytes to convert.
 * @see #readDataChunk
 * @see #rawReadBuffer
 * @see #readBuffer
 * @see #getNextUtf8Byte
 */
private void copyUtf8ReadBuffer(int count)
  throws SAXException, IOException {
  int i = 0;               // read index into rawReadBuffer
  int j = readBufferPos;   // write index into readBuffer
  int b1;
  char c = 0;

  /*
  // check once, so the runtime won't (if it's smart enough)
  if (count < 0 || count > rawReadBuffer.length)
      throw new ArrayIndexOutOfBoundsException (Integer.toString (count));
  */

  while (i < count) {
    b1 = rawReadBuffer[i++];

    // Determine whether we are dealing
    // with a one-, two-, three-, or four-
    // byte sequence.
    // (rawReadBuffer holds signed bytes, so any byte with the high
    // bit set is negative here.)
    if (b1 < 0) {
      if ((b1 & 0xe0) == 0xc0) {
        // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
        c = (char) (((b1 & 0x1f) << 6)
                    | getNextUtf8Byte(i++, count));
        if (c < 0x0080) {
          encodingError("Illegal two byte UTF-8 sequence",
                        c, 0);
        }

        //Sec 2.11
        // [1] the two-character sequence #xD #xA
        // [2] the two-character sequence #xD #x85
        // NOTE(review): sawCR records that some CR was seen earlier
        // in this chunk, not necessarily immediately before this
        // character; confirm that this is intended.
        if ((c == 0x0085 || c == 0x000a) && sawCR) {
          continue;
        }

        // Sec 2.11
        // [3] the single character #x85

        if (c == 0x0085 && xmlVersion == XML_11) {
          // NOTE(review): control falls through to the common store
          // below, so U+0085 itself is also written after this CR and
          // sawCR is not set here; confirm that this is intended.
          readBuffer[j++] = '\r';
        }
      } else if ((b1 & 0xf0) == 0xe0) {
        // 3-byte sequence:
        // zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
        // most CJKV characters
        c = (char) (((b1 & 0x0f) << 12) |
                    (getNextUtf8Byte(i++, count) << 6) |
                    getNextUtf8Byte(i++, count));
        //sec 2.11
        //[4] the single character #x2028
        if (c == 0x2028 && xmlVersion == XML_11) {
          readBuffer[j++] = '\r';
          sawCR = true;
          continue;
        }
        if (c < 0x0800 || (c >= 0xd800 && c <= 0xdfff)) {
          encodingError("Illegal three byte UTF-8 sequence",
                        c, 0);
        }
      } else if ((b1 & 0xf8) == 0xf0) {
        // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx
        //     = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
        // (uuuuu = wwww + 1)
        // "Surrogate Pairs" ... from the "Astral Planes"
        // Unicode 3.1 assigned the first characters there
        // (07 is an octal literal, equal to 7: the low three bits.)
        int iso646 = b1 & 07;
        iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
        iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);
        iso646 = (iso646 << 6) + getNextUtf8Byte(i++, count);

        if (iso646 <= 0xffff) {
          encodingError("Illegal four byte UTF-8 sequence",
                        iso646, 0);
        } else {
          if (iso646 > 0x0010ffff) {
            encodingError("UTF-8 value out of range for Unicode",
                          iso646, 0);
          }
          // Store as a UTF-16 surrogate pair (two chars).
          iso646 -= 0x010000;
          readBuffer[j++] = (char) (0xd800 | (iso646 >> 10));
          readBuffer[j++] = (char) (0xdc00 | (iso646 & 0x03ff));
          continue;
        }
      } else {
        // The five and six byte encodings aren't supported;
        // they exceed the Unicode (and XML) range.
        encodingError("unsupported five or six byte UTF-8 sequence",
                      0xff & b1, i);
        // NOTREACHED
        c = 0;
      }
    } else {
      // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
      // (US-ASCII character, "common" case, one branch to here)
      c = (char) b1;
    }
    // Common store for every path that did not 'continue' above.
    readBuffer[j++] = c;
    if (c == '\r') {
      sawCR = true;
    }
  }
  // How many characters have we read?
  readBufferLength = j;
}
5502 * Return the next byte value in a UTF-8 sequence.
5503 * If it is not possible to get a byte from the current
5504 * entity, throw an exception.
5505 * @param pos The current position in the rawReadBuffer.
5506 * @param count The number of bytes in the rawReadBuffer
5507 * @return The significant six bits of a non-initial byte in
5508 * a UTF-8 sequence.
5509 * @exception EOFException If the sequence is incomplete.
5511 private int getNextUtf8Byte(int pos, int count)
5512 throws SAXException, IOException
5514 int val;
5516 // Take a character from the buffer
5517 // or from the actual input stream.
5518 if (pos < count)
5520 val = rawReadBuffer[pos];
5522 else
5524 val = is.read();
5525 if (val == -1)
5527 encodingError("unfinished multi-byte UTF-8 sequence at EOF",
5528 -1, pos);
5532 // Check for the correct bits at the start.
5533 if ((val & 0xc0) != 0x80)
5535 encodingError("bad continuation of multi-byte UTF-8 sequence",
5536 val, pos + 1);
5539 // Return the significant bits.
5540 return (val & 0x3f);
5544 * Convert a buffer of US-ASCII or ISO-8859-1-encoded bytes into
5545 * UTF-16 characters.
5547 * <p>When readDataChunk () calls this method, the raw bytes are in
5548 * rawReadBuffer, and the final characters will appear in
5549 * readBuffer.
5551 * @param count The number of bytes to convert.
5552 * @param mask For ASCII conversion, 0x7f; else, 0xff.
5553 * @see #readDataChunk
5554 * @see #rawReadBuffer
5555 * @see #readBuffer
5557 private void copyIso8859_1ReadBuffer(int count, char mask)
5558 throws IOException
5560 int i, j;
5561 for (i = 0, j = readBufferPos; i < count; i++, j++)
5563 char c = (char) (rawReadBuffer[i] & 0xff);
5564 if ((c & mask) != 0)
5566 throw new CharConversionException("non-ASCII character U+"
5567 + Integer.toHexString(c));
5569 if (c == 0x0085 && xmlVersion == XML_11)
5571 c = '\r';
5573 readBuffer[j] = c;
5574 if (c == '\r')
5576 sawCR = true;
5579 readBufferLength = j;
5583 * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters
5584 * (as used in Java string manipulation).
5586 * <p>When readDataChunk () calls this method, the raw bytes are in
5587 * rawReadBuffer, and the final characters will appear in
5588 * readBuffer.
5589 * @param count The number of bytes to convert.
5590 * @param shift1 The number of bits to shift byte 1.
5591 * @param shift2 The number of bits to shift byte 2
5592 * @see #readDataChunk
5593 * @see #rawReadBuffer
5594 * @see #readBuffer
5596 private void copyUcs2ReadBuffer(int count, int shift1, int shift2)
5597 throws SAXException
5599 int j = readBufferPos;
5601 if (count > 0 && (count % 2) != 0)
5603 encodingError("odd number of bytes in UCS-2 encoding", -1, count);
5605 // The loops are faster with less internal brancing; hence two
5606 if (shift1 == 0)
5607 { // "UTF-16-LE"
5608 for (int i = 0; i < count; i += 2)
5610 char c = (char) (rawReadBuffer[i + 1] << 8);
5611 c |= 0xff & rawReadBuffer[i];
5612 readBuffer[j++] = c;
5613 if (c == '\r')
5615 sawCR = true;
5619 else
5620 { // "UTF-16-BE"
5621 for (int i = 0; i < count; i += 2)
5623 char c = (char) (rawReadBuffer[i] << 8);
5624 c |= 0xff & rawReadBuffer[i + 1];
5625 readBuffer[j++] = c;
5626 if (c == '\r')
5628 sawCR = true;
5632 readBufferLength = j;
5636 * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
5638 * <p>When readDataChunk () calls this method, the raw bytes are in
5639 * rawReadBuffer, and the final characters will appear in
5640 * readBuffer.
5641 * <p>Java has Unicode chars, and this routine uses surrogate pairs
5642 * for ISO-10646 values between 0x00010000 and 0x000fffff. An
5643 * exception is thrown if the ISO-10646 character has no Unicode
5644 * representation.
5646 * @param count The number of bytes to convert.
5647 * @param shift1 The number of bits to shift byte 1.
5648 * @param shift2 The number of bits to shift byte 2
5649 * @param shift3 The number of bits to shift byte 2
5650 * @param shift4 The number of bits to shift byte 2
5651 * @see #readDataChunk
5652 * @see #rawReadBuffer
5653 * @see #readBuffer
5655 private void copyUcs4ReadBuffer(int count, int shift1, int shift2,
5656 int shift3, int shift4)
5657 throws SAXException
5659 int j = readBufferPos;
5661 if (count > 0 && (count % 4) != 0)
5663 encodingError("number of bytes in UCS-4 encoding " +
5664 "not divisible by 4",
5665 -1, count);
5667 for (int i = 0; i < count; i += 4)
5669 int value = (((rawReadBuffer [i] & 0xff) << shift1) |
5670 ((rawReadBuffer [i + 1] & 0xff) << shift2) |
5671 ((rawReadBuffer [i + 2] & 0xff) << shift3) |
5672 ((rawReadBuffer [i + 3] & 0xff) << shift4));
5673 if (value < 0x0000ffff)
5675 readBuffer [j++] = (char) value;
5676 if (value == (int) '\r')
5678 sawCR = true;
5681 else if (value < 0x0010ffff)
5683 value -= 0x010000;
5684 readBuffer[j++] = (char) (0xd8 | ((value >> 10) & 0x03ff));
5685 readBuffer[j++] = (char) (0xdc | (value & 0x03ff));
5687 else
5689 encodingError("UCS-4 value out of range for Unicode",
5690 value, i);
5693 readBufferLength = j;
5697 * Report a character encoding error.
5699 private void encodingError(String message, int value, int offset)
5700 throws SAXException
5702 if (value != -1)
5704 message = message + " (character code: 0x" +
5705 Integer.toHexString(value) + ')';
5706 error(message);
5710 //////////////////////////////////////////////////////////////////////
5711 // Local Variables.
5712 //////////////////////////////////////////////////////////////////////
5715 * Re-initialize the variables for each parse.
5717 private void initializeVariables()
5719 // First line
5720 line = 1;
5721 column = 0;
5723 // Set up the buffers for data and names
5724 dataBufferPos = 0;
5725 dataBuffer = new char[DATA_BUFFER_INITIAL];
5726 nameBufferPos = 0;
5727 nameBuffer = new char[NAME_BUFFER_INITIAL];
5729 // Set up the DTD hash tables
5730 elementInfo = new HashMap();
5731 entityInfo = new HashMap();
5732 notationInfo = new HashMap();
5733 skippedPE = false;
5735 // Set up the variables for the current
5736 // element context.
5737 currentElement = null;
5738 currentElementContent = CONTENT_UNDECLARED;
5740 // Set up the input variables
5741 sourceType = INPUT_NONE;
5742 inputStack = new LinkedList();
5743 entityStack = new LinkedList();
5744 externalEntity = null;
5745 tagAttributePos = 0;
5746 tagAttributes = new String[100];
5747 rawReadBuffer = new byte[READ_BUFFER_MAX];
5748 readBufferOverflow = -1;
5750 scratch = new InputSource();
5752 inLiteral = false;
5753 expandPE = false;
5754 peIsError = false;
5756 doReport = false;
5758 inCDATA = false;
5760 symbolTable = new Object[SYMBOL_TABLE_LENGTH][];
5763 static class ExternalIdentifiers
5766 String publicId;
5767 String systemId;
5768 String baseUri;
5770 ExternalIdentifiers()
5774 ExternalIdentifiers(String publicId, String systemId, String baseUri)
5776 this.publicId = publicId;
5777 this.systemId = systemId;
5778 this.baseUri = baseUri;
5783 static class EntityInfo
5786 int type;
5787 ExternalIdentifiers ids;
5788 String value;
5789 String notationName;
5793 static class AttributeDecl
5796 String type;
5797 String value;
5798 int valueType;
5799 String enumeration;
5800 String defaultValue;
5804 static class ElementDecl
5807 int contentType;
5808 String contentModel;
5809 HashMap attributes;
5813 static class Input
5816 int sourceType;
5817 URLConnection externalEntity;
5818 char[] readBuffer;
5819 int readBufferPos;
5820 int readBufferLength;
5821 int line;
5822 int encoding;
5823 int readBufferOverflow;
5824 InputStream is;
5825 int currentByteCount;
5826 int column;
5827 Reader reader;