Imported GNU Classpath 0.19 + gcj-import-20051115.
[official-gcc.git] / libjava / classpath / gnu / xml / pipeline / ValidationConsumer.java
blob8391767490c0c3ab01e69347335dc2ea53278dd4
1 /* ValidationConsumer.java --
2 Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
4 This file is part of GNU Classpath.
6 GNU Classpath is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
11 GNU Classpath is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with GNU Classpath; see the file COPYING. If not, write to the
18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 02110-1301 USA.
21 Linking this library statically or dynamically with other modules is
22 making a combined work based on this library. Thus, the terms and
23 conditions of the GNU General Public License cover the whole
24 combination.
26 As a special exception, the copyright holders of this library give you
27 permission to link this library with independent modules to produce an
28 executable, regardless of the license terms of these independent
29 modules, and to copy and distribute the resulting executable under
30 terms of your choice, provided that you also meet, for each linked
31 independent module, the terms and conditions of the license of that
32 module. An independent module is a module which is not derived from
33 or based on this library. If you modify this library, you may extend
34 this exception to your version of the library, but you are not
35 obligated to do so. If you do not wish to do so, delete this
36 exception statement from your version. */
38 package gnu.xml.pipeline;
40 import java.io.IOException;
41 import java.io.StringReader;
42 import java.io.StringWriter;
43 import java.util.EmptyStackException;
44 import java.util.Enumeration;
45 import java.util.Hashtable;
46 import java.util.Stack;
47 import java.util.StringTokenizer;
48 import java.util.Vector;
50 import org.xml.sax.Attributes;
51 import org.xml.sax.EntityResolver;
52 import org.xml.sax.ErrorHandler;
53 import org.xml.sax.InputSource;
54 import org.xml.sax.Locator;
55 import org.xml.sax.SAXException;
56 import org.xml.sax.SAXParseException;
57 import org.xml.sax.XMLReader;
58 import org.xml.sax.helpers.XMLReaderFactory;
60 /**
61 * This class checks SAX2 events to report validity errors; it works as
62 * both a filter and a terminus on an event pipeline. It relies on the
63 * producer of SAX events to: </p> <ol>
65 * <li> Conform to the specification of a non-validating XML parser that
66 * reads all external entities, reported using SAX2 events. </li>
68 * <li> Report ignorable whitespace as such (through the ContentHandler
69 * interface). This is, strictly speaking, optional for nonvalidating
70 * XML processors. </li>
72 * <li> Make SAX2 DeclHandler callbacks, with default
73 * attribute values already normalized (and without "&lt;").</li>
75 * <li> Make SAX2 LexicalHandler startDTD() and endDTD ()
76 * callbacks. </li>
78 * <li> Act as if the <em>(URI)/namespace-prefixes</em> property were
79 * set to true, by providing XML 1.0 names and all <code>xmlns*</code>
80 * attributes (rather than omitting either or both). </li>
82 * </ol>
84 * <p> At this writing, the major SAX2 parsers (such as &AElig;lfred2,
85 * Crimson, and Xerces) meet these requirements, and this validation
86 * module is used by the optional &AElig;lfred2 validation support.
87 * </p>
89 * <p> Note that because this is a layered validator, it has to duplicate some
90 * work that the parser is doing; there are also other costs to layering.
91 * However, <em>because of layering it doesn't need a parser</em> in order
92 * to work! You can use it with anything that generates SAX events, such
93 * as an application component that wants to detect invalid content in
94 * a changed area without validating an entire document, or which wants to
95 * ensure that it doesn't write invalid data to a communications partner.</p>
97 * <p> Also, note that because this is a layered validator, the line numbers
98 * reported for some errors may seem strange. For example, if an element does
99 * not permit character content, the validator
100 * will use the locator provided to it.
101 * That might reflect the last character of a <em>characters</em> event
102 * callback, rather than the first non-whitespace character. </p>
104 * <hr />
106 * <!--
107 * <p> Of interest is the fact that unlike most currently known XML validators,
108 * this one can report some cases of non-determinism in element content models.
109 * It is a compile-time option, enabled by default. This will only report
110 * such XML errors if they relate to content actually appearing in a document;
111 * content models aren't aggressively scanned for non-deterministic structure.
112 * Documents which trigger such non-deterministic transitions may be handled
113 * differently by different validating parsers, without losing conformance
114 * to the XML specification. </p>
115 * -->
117 * <p> Current limitations of the validation performed are in roughly three
118 * categories. </p>
120 * <p> The first category represents constraints which demand violations
121 * of software layering: exposing lexical details, one of the first things
122 * that <em>application</em> programming interfaces (APIs) hide. These
123 * invariably relate to XML entity handling, and to historical oddities
124 * of the XML validation semantics. Curiously,
125 * recent (Autumn 1999) conformance testing showed that these constraints are
126 * among those handled worst by existing XML validating parsers. Arguments
127 * have been made that each of these VCs should be turned into WFCs (most
128 * of them) or discarded (popular for the standalone declaration); in short,
129 * that these are bugs in the XML specification (not all via SGML): </p><ul>
131 * <li> The <em>Proper Declaration/PE Nesting</em> and
132 * <em>Proper Group/PE Nesting</em> VCs can't be tested because they
133 * require access to particularly low level lexical level information.
134 * In essence, the reason XML isn't a simple thing to parse is that
135 * it's not a context free grammar, and these constraints elevate that
136 * SGML-derived context sensitivity to the level of a semantic rule.
138 * <li> The <em>Standalone Document Declaration</em> VC can't be
139 * tested. This is for two reasons. First, this flag isn't made
140 * available through SAX2. Second, it also requires breaking that
141 * lexical layering boundary. (If you ever wondered why classes
142 * in compiler construction or language design barely mention the
143 * existence of context-sensitive grammars, it's because of messy
144 * issues like these.)
146 * <li> The <em>Entity Declared</em> VC can't be tested, because it
147 * also requires breaking that lexical layering boundary! There's also
148 * another issue: the VC wording (and seemingly intent) is ambiguous.
149 * (This is still true in the "Second edition" XML spec.)
150 * Since there is a WFC of the same name, everyone's life would be
151 * easier if references to undeclared parsed entities were always well
152 * formedness errors, regardless of whether they're parameter entities
153 * or not. (Note that nonvalidating parsers are not required
154 * to report all such well formedness errors if they don't read external
155 * parameter entities, although currently most XML parsers read them
156 * in an attempt to avoid problems from inconsistent parser behavior.)
158 * </ul>
160 * <p> The second category of limitations on this validation represent
161 * constraints associated with information that is not guaranteed to be
162 * available (or in one case, <em>is guaranteed not to be available</em>)
163 * through the SAX2 API: </p><ul>
165 * <li> The <em>Unique Element Type Declaration</em> VC may not be
166 * reportable, if the underlying parser happens not to expose
167 * multiple declarations. (&AElig;lfred2 reports these validity
168 * errors directly.)</li>
170 * <li> Similarly, the <em>Unique Notation Name</em> VC, added in the
171 * 14-January-2000 XML spec errata to restrict typing models used by
172 * elements, may not be reportable. (&AElig;lfred reports these
173 * validity errors directly.) </li>
175 * </ul>
177 * <p> A third category relates to ease of implementation. (Think of this
178 * as "bugs".) The most notable issue here is character handling. Rather
179 * than attempting to implement the voluminous character tables in the XML
180 * specification (Appendix B), Unicode rules are used directly from
181 * the java.lang.Character class. Recent JVMs have begun to diverge from
182 * the original specification for that class (Unicode 2.0), meaning that
183 * different JVMs may handle that aspect of conformance differently.
184 * </p>
186 * <p> Note that for some of the validity errors that SAX2 does not
187 * expose, a nonvalidating parser is permitted (by the XML specification)
188 * to report validity errors. When used with a parser that does so for
189 * the validity constraints mentioned above (or any other SAX2 event
190 * stream producer that does the same thing), overall conformance is
191 * substantially improved.
193 * @see gnu.xml.aelfred2.SAXDriver
194 * @see gnu.xml.aelfred2.XmlReader
196 * @author David Brownell
198 public final class ValidationConsumer extends EventFilter
200 // report error if we happen to notice a non-deterministic choice?
201 // we won't report buggy content models; just buggy instances
202 private static final boolean warnNonDeterministic = false;
204 // for tracking active content models
205 private String rootName;
206 private Stack contentStack = new Stack ();
208 // flags for "saved DTD" processing
209 private boolean disableDeclarations;
210 private boolean disableReset;
213 // most VCs get tested when we see element start tags. the per-element
214 // info (including attributes) recorded here duplicates that found inside
215 // many nonvalidating parsers, hence dual lookups etc ... that's why a
216 // layered validator isn't going to be as fast as a non-layered one.
219 // key = element name; value = ElementInfo
220 private Hashtable elements = new Hashtable ();
222 // some VCs relate to ID/IDREF/IDREFS attributes
223 // key = id; value = boolean true (defd) or false (refd)
224 private Hashtable ids = new Hashtable ();
226 // we just record declared notation and unparsed entity names.
227 // the implementation here is simple/slow; these features
228 // are seldom used, one hopes they'll wither away soon
229 private Vector notations = new Vector (5, 5);
230 private Vector nDeferred = new Vector (5, 5);
231 private Vector unparsed = new Vector (5, 5);
232 private Vector uDeferred = new Vector (5, 5);
234 // note: DocBk 3.1.7 XML defines over 2 dozen notations,
235 // used when defining unparsed entities for graphics
236 // (and maybe in other places)
/**
 * Builds a validator that acts as the end of a pipeline: every event
 * passed to it is consumed.  Validity errors are reported as if they
 * were fatal errors, unless an error handler is assigned.
 *
 * @see #setErrorHandler
 */
// constructor used by PipelineFactory
// ... and want one taking system ID of an external subset
public ValidationConsumer ()
{
    // no downstream consumer
    this ((EventConsumer) null);
}
/**
 * Builds a validator that acts as a pipeline filter: validity errors
 * are reported, and events are then handed to the next consumer if
 * they were not fatal.
 *
 * @param next the consumer that receives events after validation;
 *	the no-argument constructor passes null here
 *
 * @see #setErrorHandler
 */
// constructor used by PipelineFactory
// ... and want one taking system ID of an external subset
// (which won't send declaration events)
public ValidationConsumer (EventConsumer next)
{
    super (next);

    // this object registers itself for every kind of event it checks
    setContentHandler (this);
    setDTDHandler (this);

    // the two property registrations are attempted independently, so
    // a failure of the first never prevents the second
    try {
	setProperty (DECL_HANDLER, this);
    } catch (Exception ignored) {
	/* "can't happen" */
    }
    try {
	setProperty (LEXICAL_HANDLER, this);
    } catch (Exception ignored) {
	/* "can't happen" */
    }
}
276 private static final String fakeRootName
277 = ":Nobody:in:their_Right.Mind_would:use:this-name:1x:";
280 * Creates a validation consumer which is preloaded with the DTD provided.
281 * It does this by constructing a document with that DTD, then parsing
282 * that document and recording its DTD declarations. Then it arranges
283 * not to modify that information.
285 * <p> The resulting validation consumer will only validate against
286 * the specified DTD, regardless of whether some other DTD is found
287 * in a document being parsed.
289 * @param rootName The name of the required root element; if this is
290 * null, any root element name will be accepted.
291 * @param publicId If non-null and there is a non-null systemId, this
292 * identifier provides an alternate access identifier for the DTD's
293 * external subset.
294 * @param systemId If non-null, this is a URI (normally URL) that
295 * may be used to access the DTD's external subset.
296 * @param internalSubset If non-null, holds literal markup declarations
297 * comprising the DTD's internal subset.
298 * @param resolver If non-null, this will be provided to the parser for
299 * use when resolving parameter entities (including any external subset).
300 * @param resolver If non-null, this will be provided to the parser for
301 * use when resolving parameter entities (including any external subset).
302 * @param minimalElement If non-null, a minimal valid document.
304 * @exception SAXNotSupportedException If the default SAX parser does
305 * not support the standard lexical or declaration handlers.
306 * @exception SAXParseException If the specified DTD has either
307 * well-formedness or validity errors
308 * @exception IOException If the specified DTD can't be read for
309 * some reason
311 public ValidationConsumer (
312 String rootName,
313 String publicId,
314 String systemId,
315 String internalSubset,
316 EntityResolver resolver,
317 String minimalDocument
318 ) throws SAXException, IOException
320 this (null);
322 disableReset = true;
323 if (rootName == null)
324 rootName = fakeRootName;
327 // Synthesize document with that DTD; is it possible to do
328 // better for the declaration of the root element?
330 // NOTE: can't use SAX2 to write internal subsets.
332 StringWriter writer = new StringWriter ();
334 writer.write ("<!DOCTYPE ");
335 writer.write (rootName);
336 if (systemId != null) {
337 writer.write ("\n ");
338 if (publicId != null) {
339 writer.write ("PUBLIC '");
340 writer.write (publicId);
341 writer.write ("'\n\t'");
342 } else
343 writer.write ("SYSTEM '");
344 writer.write (systemId);
345 writer.write ("'");
347 writer.write (" [ ");
348 if (rootName == fakeRootName) {
349 writer.write ("\n<!ELEMENT ");
350 writer.write (rootName);
351 writer.write (" EMPTY>");
353 if (internalSubset != null)
354 writer.write (internalSubset);
355 writer.write ("\n ]>");
357 if (minimalDocument != null) {
358 writer.write ("\n");
359 writer.write (minimalDocument);
360 writer.write ("\n");
361 } else {
362 writer.write (" <");
363 writer.write (rootName);
364 writer.write ("/>\n");
366 minimalDocument = writer.toString ();
369 // OK, load it
371 XMLReader producer;
373 producer = XMLReaderFactory.createXMLReader ();
374 bind (producer, this);
376 if (resolver != null)
377 producer.setEntityResolver (resolver);
379 InputSource in;
381 in = new InputSource (new StringReader (minimalDocument));
382 producer.parse (in);
384 disableDeclarations = true;
385 if (rootName == fakeRootName)
386 this.rootName = null;
389 private void resetState ()
391 if (!disableReset) {
392 rootName = null;
393 contentStack.removeAllElements ();
394 elements.clear ();
395 ids.clear ();
397 notations.removeAllElements ();
398 nDeferred.removeAllElements ();
399 unparsed.removeAllElements ();
400 uDeferred.removeAllElements ();
405 private void warning (String description)
406 throws SAXException
408 ErrorHandler errHandler = getErrorHandler ();
409 Locator locator = getDocumentLocator ();
410 SAXParseException err;
412 if (errHandler == null)
413 return;
415 if (locator == null)
416 err = new SAXParseException (description, null, null, -1, -1);
417 else
418 err = new SAXParseException (description, locator);
419 errHandler.warning (err);
422 // package private (for ChildrenRecognizer)
423 private void error (String description)
424 throws SAXException
426 ErrorHandler errHandler = getErrorHandler ();
427 Locator locator = getDocumentLocator ();
428 SAXParseException err;
430 if (locator == null)
431 err = new SAXParseException (description, null, null, -1, -1);
432 else
433 err = new SAXParseException (description, locator);
434 if (errHandler != null)
435 errHandler.error (err);
436 else // else we always treat it as fatal!
437 throw err;
440 private void fatalError (String description)
441 throws SAXException
443 ErrorHandler errHandler = getErrorHandler ();
444 Locator locator = getDocumentLocator ();
445 SAXParseException err;
447 if (locator != null)
448 err = new SAXParseException (description, locator);
449 else
450 err = new SAXParseException (description, null, null, -1, -1);
451 if (errHandler != null)
452 errHandler.fatalError (err);
453 // we always treat this as fatal, regardless of the handler
454 throw err;
458 private static boolean isExtender (char c)
460 // [88] Extender ::= ...
461 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
462 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
463 || (c >= 0x3031 && c <= 0x3035)
464 || (c >= 0x309d && c <= 0x309e)
465 || (c >= 0x30fc && c <= 0x30fe);
469 // use augmented Unicode rules, not full XML rules
470 private boolean isName (String name, String context, String id)
471 throws SAXException
473 char buf [] = name.toCharArray ();
474 boolean pass = true;
476 if (!Character.isUnicodeIdentifierStart (buf [0])
477 && ":_".indexOf (buf [0]) == -1)
478 pass = false;
479 else {
480 int max = buf.length;
481 for (int i = 1; pass && i < max; i++) {
482 char c = buf [i];
483 if (!Character.isUnicodeIdentifierPart (c)
484 && ":-_.".indexOf (c) == -1
485 && !isExtender (c))
486 pass = false;
490 if (!pass)
491 error ("In " + context + " for " + id
492 + ", '" + name + "' is not a name");
493 return pass; // true == OK
496 // use augmented Unicode rules, not full XML rules
497 private boolean isNmtoken (String nmtoken, String context, String id)
498 throws SAXException
500 char buf [] = nmtoken.toCharArray ();
501 boolean pass = true;
502 int max = buf.length;
504 // XXX make this share code with isName
506 for (int i = 0; pass && i < max; i++) {
507 char c = buf [i];
508 if (!Character.isUnicodeIdentifierPart (c)
509 && ":-_.".indexOf (c) == -1
510 && !isExtender (c))
511 pass = false;
514 if (!pass)
515 error ("In " + context + " for " + id
516 + ", '" + nmtoken + "' is not a name token");
517 return pass; // true == OK
520 private void checkEnumeration (String value, String type, String name)
521 throws SAXException
523 if (!hasMatch (value, type))
524 // VC: Enumeration
525 error ("Value '" + value
526 + "' for attribute '" + name
527 + "' is not permitted: " + type);
530 // used to test enumerated attributes and mixed content models
531 // package private
532 static boolean hasMatch (String value, String orList)
534 int len = value.length ();
535 int max = orList.length () - len;
537 for (int start = 0;
538 (start = orList.indexOf (value, start)) != -1;
539 start++) {
540 char c;
542 if (start > max)
543 break;
544 c = orList.charAt (start - 1);
545 if (c != '|' && c != '('/*)*/)
546 continue;
547 c = orList.charAt (start + len);
548 if (c != '|' && /*(*/ c != ')')
549 continue;
550 return true;
552 return false;
556 * <b>LexicalHandler</b> Records the declaration of the root
557 * element, so it can be verified later.
558 * Passed to the next consumer, unless this one was
559 * preloaded with a particular DTD.
561 public void startDTD (String name, String publicId, String systemId)
562 throws SAXException
564 if (disableDeclarations)
565 return;
567 rootName = name;
568 super.startDTD (name, publicId, systemId);
572 * <b>LexicalHandler</b> Verifies that all referenced notations
573 * and unparsed entities have been declared.
574 * Passed to the next consumer, unless this one was
575 * preloaded with a particular DTD.
577 public void endDTD ()
578 throws SAXException
580 if (disableDeclarations)
581 return;
583 // this is a convenient hook for end-of-dtd checks, but we
584 // could also trigger it in the first startElement call.
585 // locator info is more appropriate here though.
587 // VC: Notation Declared (NDATA can refer to them before decls,
588 // as can NOTATION attribute enumerations and defaults)
589 int length = nDeferred.size ();
590 for (int i = 0; i < length; i++) {
591 String notation = (String) nDeferred.elementAt (i);
592 if (!notations.contains (notation)) {
593 error ("A declaration referred to notation '" + notation
594 + "' which was never declared");
597 nDeferred.removeAllElements ();
599 // VC: Entity Name (attribute values can refer to them
600 // before they're declared); VC Attribute Default Legal
601 length = uDeferred.size ();
602 for (int i = 0; i < length; i++) {
603 String entity = (String) uDeferred.elementAt (i);
604 if (!unparsed.contains (entity)) {
605 error ("An attribute default referred to entity '" + entity
606 + "' which was never declared");
609 uDeferred.removeAllElements ();
610 super.endDTD ();
614 // These are interned, so we can rely on "==" to find the type of
615 // all attributes except enumerations ...
616 // "(this|or|that|...)" and "NOTATION (this|or|that|...)"
617 static final String types [] = {
618 "CDATA",
619 "ID", "IDREF", "IDREFS",
620 "NMTOKEN", "NMTOKENS",
621 "ENTITY", "ENTITIES"
626 * <b>DecllHandler</b> Records attribute declaration for later use
627 * in validating document content, and checks validity constraints
628 * that are applicable to attribute declarations.
629 * Passed to the next consumer, unless this one was
630 * preloaded with a particular DTD.
632 public void attributeDecl (
633 String eName,
634 String aName,
635 String type,
636 String mode,
637 String value
638 ) throws SAXException
640 if (disableDeclarations)
641 return;
643 ElementInfo info = (ElementInfo) elements.get (eName);
644 AttributeInfo ainfo = new AttributeInfo ();
645 boolean checkOne = false;
646 boolean interned = false;
648 // cheap interning of type names and #FIXED, #REQUIRED
649 // for faster startElement (we can use "==")
650 for (int i = 0; i < types.length; i++) {
651 if (types [i].equals (type)) {
652 type = types [i];
653 interned = true;
654 break;
657 if ("#FIXED".equals (mode))
658 mode = "#FIXED";
659 else if ("#REQUIRED".equals (mode))
660 mode = "#REQUIRED";
662 ainfo.type = type;
663 ainfo.mode = mode;
664 ainfo.value = value;
666 // we might not have seen the content model yet
667 if (info == null) {
668 info = new ElementInfo (eName);
669 elements.put (eName, info);
671 if ("ID" == type) {
672 checkOne = true;
673 if (!("#REQUIRED" == mode || "#IMPLIED".equals (mode))) {
674 // VC: ID Attribute Default
675 error ("ID attribute '" + aName
676 + "' must be #IMPLIED or #REQUIRED");
679 } else if (!interned && type.startsWith ("NOTATION ")) {
680 checkOne = true;
682 // VC: Notation Attributes (notations must be declared)
683 StringTokenizer tokens = new StringTokenizer (
684 type.substring (10, type.lastIndexOf (')')),
685 "|");
686 while (tokens.hasMoreTokens ()) {
687 String token = tokens.nextToken ();
688 if (!notations.contains (token))
689 nDeferred.addElement (token);
692 if (checkOne) {
693 for (Enumeration e = info.attributes.keys ();
694 e.hasMoreElements ();
695 /* NOP */) {
696 String name;
697 AttributeInfo ainfo2;
699 name = (String) e.nextElement ();
700 ainfo2 = (AttributeInfo) info.attributes.get (name);
701 if (type == ainfo2.type || !interned /* NOTATION */) {
702 // VC: One ID per Element Type
703 // VC: One Notation per Element TYpe
704 error ("Element '" + eName
705 + "' already has an attribute of type "
706 + (interned ? "NOTATION" : type)
707 + " ('" + name
708 + "') so '" + aName
709 + "' is a validity error");
714 // VC: Attribute Default Legal
715 if (value != null) {
717 if ("CDATA" == type) {
718 // event source rejected '<'
720 } else if ("NMTOKEN" == type) {
721 // VC: Name Token (is a nmtoken)
722 isNmtoken (value, "attribute default", aName);
724 } else if ("NMTOKENS" == type) {
725 // VC: Name Token (is a nmtoken; at least one value)
726 StringTokenizer tokens = new StringTokenizer (value);
727 if (!tokens.hasMoreTokens ())
728 error ("Default for attribute '" + aName
729 + "' must have at least one name token.");
730 else do {
731 String token = tokens.nextToken ();
732 isNmtoken (token, "attribute default", aName);
733 } while (tokens.hasMoreTokens ());
735 } else if ("IDREF" == type || "ENTITY" == type) {
736 // VC: Entity Name (is a name)
737 // VC: IDREF (is a name) (is declared)
738 isName (value, "attribute default", aName);
739 if ("ENTITY" == type && !unparsed.contains (value))
740 uDeferred.addElement (value);
742 } else if ("IDREFS" == type || "ENTITIES" == type) {
743 // VC: Entity Name (is a name; at least one value)
744 // VC: IDREF (is a name; at least one value)
745 StringTokenizer names = new StringTokenizer (value);
746 if (!names.hasMoreTokens ())
747 error ("Default for attribute '" + aName
748 + "' must have at least one name.");
749 else do {
750 String name = names.nextToken ();
751 isName (name, "attribute default", aName);
752 if ("ENTITIES" == type && !unparsed.contains (name))
753 uDeferred.addElement (value);
754 } while (names.hasMoreTokens ());
756 } else if (type.charAt (0) == '(' /*)*/ ) {
757 // VC: Enumeration (must match)
758 checkEnumeration (value, type, aName);
760 } else if (!interned && checkOne) { /* NOTATION */
761 // VC: Notation attributes (must be names)
762 isName (value, "attribute default", aName);
764 // VC: Notation attributes (must be declared)
765 if (!notations.contains (value))
766 nDeferred.addElement (value);
768 // VC: Enumeration (must match)
769 checkEnumeration (value, type, aName);
771 } else if ("ID" != type)
772 throw new RuntimeException ("illegal attribute type: " + type);
775 if (info.attributes.get (aName) == null)
776 info.attributes.put (aName, ainfo);
778 else
779 warning ("Element '" + eName
780 + "' already has an attribute named '" + aName + "'");
783 if ("xml:space".equals (aName)) {
784 if (!("(default|preserve)".equals (type)
785 || "(preserve|default)".equals (type)
786 // these next two are arguable; XHTML's DTD doesn't
787 // deserve errors. After all, it's not like any
788 // illegal _value_ could pass ...
789 || "(preserve)".equals (type)
790 || "(default)".equals (type)
792 error (
793 "xml:space attribute type must be like '(default|preserve)'"
794 + " not '" + type + "'"
798 super.attributeDecl (eName, aName, type, mode, value);
802 * <b>DecllHandler</b> Records the element declaration for later use
803 * when checking document content, and checks validity constraints that
804 * apply to element declarations. Passed to the next consumer, unless
805 * this one was preloaded with a particular DTD.
807 public void elementDecl (String name, String model)
808 throws SAXException
810 if (disableDeclarations)
811 return;
813 ElementInfo info = (ElementInfo) elements.get (name);
815 // we might have seen an attribute decl already
816 if (info == null) {
817 info = new ElementInfo (name);
818 elements.put (name, info);
820 if (info.model != null) {
821 // NOTE: not all parsers can report such duplicates.
822 // VC: Unique Element Type Declaration
823 error ("Element type '" + name
824 + "' was already declared.");
825 } else {
826 info.model = model;
828 // VC: No Duplicate Types (in mixed content models)
829 if (model.charAt (1) == '#') // (#PCDATA...
830 info.getRecognizer (this);
832 super.elementDecl (name, model);
836 * <b>DecllHandler</b> passed to the next consumer, unless this
837 * one was preloaded with a particular DTD
839 public void internalEntityDecl (String name, String value)
840 throws SAXException
842 if (!disableDeclarations)
843 super.internalEntityDecl (name, value);
847 * <b>DecllHandler</b> passed to the next consumer, unless this
848 * one was preloaded with a particular DTD
850 public void externalEntityDecl (String name,
851 String publicId, String systemId)
852 throws SAXException
854 if (!disableDeclarations)
855 super.externalEntityDecl (name, publicId, systemId);
860 * <b>DTDHandler</b> Records the notation name, for checking
861 * NOTATIONS attribute values and declararations of unparsed
862 * entities. Passed to the next consumer, unless this one was
863 * preloaded with a particular DTD.
865 public void notationDecl (String name, String publicId, String systemId)
866 throws SAXException
868 if (disableDeclarations)
869 return;
871 notations.addElement (name);
872 super.notationDecl (name, publicId, systemId);
876 * <b>DTDHandler</b> Records the entity name, for checking
877 * ENTITY and ENTITIES attribute values; records the notation
878 * name if it hasn't yet been declared. Passed to the next consumer,
879 * unless this one was preloaded with a particular DTD.
881 public void unparsedEntityDecl (
882 String name,
883 String publicId,
884 String systemId,
885 String notationName
886 ) throws SAXException
888 if (disableDeclarations)
889 return;
891 unparsed.addElement (name);
892 if (!notations.contains (notationName))
893 nDeferred.addElement (notationName);
894 super.unparsedEntityDecl (name, publicId, systemId, notationName);
899 * <b>ContentHandler</b> Ensures that state from any previous parse
900 * has been deleted.
901 * Passed to the next consumer.
903 public void startDocument ()
904 throws SAXException
906 resetState ();
907 super.startDocument ();
911 private static boolean isAsciiLetter (char c)
913 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
918 * <b>ContentHandler</b> Reports a fatal exception. Validating
919 * XML processors may not skip any entities.
921 public void skippedEntity (String name)
922 throws SAXException
924 fatalError ("may not skip entities");
928 * SAX2 doesn't expand non-PE refs in attribute defaults...
930 private String expandDefaultRefs (String s)
931 throws SAXException
933 if (s.indexOf ('&') < 0)
934 return s;
936 // FIXME: handle &#nn; &#xnn; &name;
937 String message = "Can't expand refs in attribute default: " + s;
938 warning (message);
940 return s;
944 * <b>ContentHandler</b> Performs validity checks against element
945 * (and document) content models, and attribute values.
946 * Passed to the next consumer.
948 public void startElement (
949 String uri,
950 String localName,
951 String qName,
952 Attributes atts
953 ) throws SAXException
956 // First check content model for the enclosing scope.
958 if (contentStack.isEmpty ()) {
959 // VC: Root Element Type
960 if (!qName.equals (rootName)) {
961 if (rootName == null)
962 warning ("This document has no DTD, can't be valid");
963 else
964 error ("Root element type '" + qName
965 + "' was declared to be '" + rootName + "'");
967 } else {
968 Recognizer state = (Recognizer) contentStack.peek ();
970 if (state != null) {
971 Recognizer newstate = state.acceptElement (qName);
973 if (newstate == null)
974 error ("Element type '" + qName
975 + "' in element '" + state.type.name
976 + "' violates content model " + state.type.model
978 if (newstate != state) {
979 contentStack.pop ();
980 contentStack.push (newstate);
986 // Then check that this element was declared, and push the
987 // object used to validate its content model onto our stack.
989 // This is where the recognizer gets created, if needed; if
990 // it's a "children" (elements) content model, an NDFA is
991 // created. (One recognizer is used per content type, no
992 // matter how complex that recognizer is.)
994 ElementInfo info;
996 info = (ElementInfo) elements.get (qName);
997 if (info == null || info.model == null) {
998 // VC: Element Valid (base clause)
999 error ("Element type '" + qName + "' was not declared");
1000 contentStack.push (null);
1002 // for less diagnostic noise, fake a declaration.
1003 elementDecl (qName, "ANY");
1004 } else
1005 contentStack.push (info.getRecognizer (this));
1008 // Then check each attribute present
1010 int len;
1011 String aname;
1012 AttributeInfo ainfo;
1014 if (atts != null)
1015 len = atts.getLength ();
1016 else
1017 len = 0;
1019 for (int i = 0; i < len; i++) {
1020 aname = atts.getQName (i);
1022 if (info == null
1023 || (ainfo = (AttributeInfo) info.attributes.get (aname))
1024 == null) {
1025 // VC: Attribute Value Type
1026 error ("Attribute '" + aname
1027 + "' was not declared for element type " + qName);
1028 continue;
1031 String value = atts.getValue (i);
1033 // note that "==" for type names and "#FIXED" is correct
1034 // (and fast) since we've interned those literals.
1036 if ("#FIXED" == ainfo.mode) {
1037 String expanded = expandDefaultRefs (ainfo.value);
1039 // VC: Fixed Attribute Default
1040 if (!value.equals (expanded)) {
1041 error ("Attribute '" + aname
1042 + "' must match " + expanded
1044 continue;
1048 if ("CDATA" == ainfo.type)
1049 continue;
1052 // For all other attribute types, there are various
1053 // rules to follow.
1056 if ("ID" == ainfo.type) {
1057 // VC: ID (must be a name)
1058 if (isName (value, "ID attribute", aname)) {
1059 if (Boolean.TRUE == ids.get (value))
1060 // VC: ID (appears once)
1061 error ("ID attribute " + aname
1062 + " uses an ID value '" + value
1063 + "' which was already declared.");
1064 else
1065 // any forward refs are no longer problems
1066 ids.put (value, Boolean.TRUE);
1068 continue;
1071 if ("IDREF" == ainfo.type) {
1072 // VC: IDREF (value must be a name)
1073 if (isName (value, "IDREF attribute", aname)) {
1074 // VC: IDREF (must match some ID attribute)
1075 if (ids.get (value) == null)
1076 // new -- assume it's a forward ref
1077 ids.put (value, Boolean.FALSE);
1079 continue;
1082 if ("IDREFS" == ainfo.type) {
1083 StringTokenizer tokens = new StringTokenizer (value, " ");
1085 if (!tokens.hasMoreTokens ()) {
1086 // VC: IDREF (one or more values)
1087 error ("IDREFS attribute " + aname
1088 + " must have at least one ID ref");
1089 } else do {
1090 String id = tokens.nextToken ();
1092 // VC: IDREF (value must be a name)
1093 if (isName (id, "IDREFS attribute", aname)) {
1094 // VC: IDREF (must match some ID attribute)
1095 if (ids.get (id) == null)
1096 // new -- assume it's a forward ref
1097 ids.put (id, Boolean.FALSE);
1099 } while (tokens.hasMoreTokens ());
1100 continue;
1103 if ("NMTOKEN" == ainfo.type) {
1104 // VC: Name Token (is a name token)
1105 isNmtoken (value, "NMTOKEN attribute", aname);
1106 continue;
1109 if ("NMTOKENS" == ainfo.type) {
1110 StringTokenizer tokens = new StringTokenizer (value, " ");
1112 if (!tokens.hasMoreTokens ()) {
1113 // VC: Name Token (one or more values)
1114 error ("NMTOKENS attribute " + aname
1115 + " must have at least one name token");
1116 } else do {
1117 String token = tokens.nextToken ();
1119 // VC: Name Token (is a name token)
1120 isNmtoken (token, "NMTOKENS attribute", aname);
1121 } while (tokens.hasMoreTokens ());
1122 continue;
1125 if ("ENTITY" == ainfo.type) {
1126 if (!unparsed.contains (value))
1127 // VC: Entity Name
1128 error ("Value of attribute '" + aname
1129 + "' refers to unparsed entity '" + value
1130 + "' which was not declared.");
1131 continue;
1134 if ("ENTITIES" == ainfo.type) {
1135 StringTokenizer tokens = new StringTokenizer (value, " ");
1137 if (!tokens.hasMoreTokens ()) {
1138 // VC: Entity Name (one or more values)
1139 error ("ENTITIES attribute " + aname
1140 + " must have at least one name token");
1141 } else do {
1142 String entity = tokens.nextToken ();
1144 if (!unparsed.contains (entity))
1145 // VC: Entity Name
1146 error ("Value of attribute '" + aname
1147 + "' refers to unparsed entity '" + entity
1148 + "' which was not declared.");
1149 } while (tokens.hasMoreTokens ());
1150 continue;
1154 // check for enumerations last; more expensive
1156 if (ainfo.type.charAt (0) == '(' /*)*/
1157 || ainfo.type.startsWith ("NOTATION ")
1159 // VC: Enumeration (value must be defined)
1160 checkEnumeration (value, ainfo.type, aname);
1161 continue;
1166 // Last, check that all #REQUIRED attributes were provided
1168 if (info != null) {
1169 Hashtable table = info.attributes;
1171 if (table.size () != 0) {
1172 Enumeration e = table.keys ();
1174 // XXX table.keys uses the heap, bleech -- slows things
1176 while (e.hasMoreElements ()) {
1177 aname = (String) e.nextElement ();
1178 ainfo = (AttributeInfo) table.get (aname);
1180 // "#REQUIRED" mode was interned in attributeDecl
1181 if ("#REQUIRED" == ainfo.mode
1182 && atts.getValue (aname) == null) {
1183 // VC: Required Attribute
1184 error ("Attribute '" + aname + "' must be specified "
1185 + "for element type " + qName);
1190 super.startElement (uri, localName, qName, atts);
1194 * <b>ContentHandler</b> Reports a validity error if the element's content
1195 * model does not permit character data.
1196 * Passed to the next consumer.
1198 public void characters (char ch [], int start, int length)
1199 throws SAXException
1201 Recognizer state;
1203 if (contentStack.empty ())
1204 state = null;
1205 else
1206 state = (Recognizer) contentStack.peek ();
1208 // NOTE: if this ever supports with SAX parsers that don't
1209 // report ignorable whitespace as such (only XP?), this class
1210 // needs to morph it into ignorableWhitespace() as needed ...
1212 if (state != null && !state.acceptCharacters ())
1213 // VC: Element Valid (clauses three, four -- see recognizer)
1214 error ("Character content not allowed in element "
1215 + state.type.name);
1217 super.characters (ch, start, length);
1222 * <b>ContentHandler</b> Reports a validity error if the element's content
1223 * model does not permit end-of-element yet, or a well formedness error
1224 * if there was no matching startElement call.
1225 * Passed to the next consumer.
1227 public void endElement (String uri, String localName, String qName)
1228 throws SAXException
1230 try {
1231 Recognizer state = (Recognizer) contentStack.pop ();
1233 if (state != null && !state.completed ())
1234 // VC: Element valid (clauses two, three, four; see Recognizer)
1235 error ("Premature end for element '"
1236 + state.type.name
1237 + "', content model "
1238 + state.type.model);
1240 // could insist on match of start element, but that's
1241 // something the input stream must to guarantee.
1243 } catch (EmptyStackException e) {
1244 fatalError ("endElement without startElement: " + qName
1245 + ((uri == null)
1246 ? ""
1247 : ( " { '" + uri + "', " + localName + " }")));
1249 super.endElement (uri, localName, qName);
1253 * <b>ContentHandler</b> Checks whether all ID values that were
1254 * referenced have been declared, and releases all resources.
1255 * Passed to the next consumer.
1257 * @see #setDocumentLocator
1259 public void endDocument ()
1260 throws SAXException
1262 for (Enumeration idNames = ids.keys ();
1263 idNames.hasMoreElements ();
1264 /* NOP */) {
1265 String id = (String) idNames.nextElement ();
1267 if (Boolean.FALSE == ids.get (id)) {
1268 // VC: IDREF (must match ID)
1269 error ("Undeclared ID value '" + id
1270 + "' was referred to by an IDREF/IDREFS attribute");
1274 resetState ();
1275 super.endDocument ();
1279 /** Holds per-element declarations */
1280 static private final class ElementInfo
1282 String name;
1283 String model;
1285 // key = attribute name; value = AttributeInfo
1286 Hashtable attributes = new Hashtable (11);
1288 ElementInfo (String n) { name = n; }
1290 private Recognizer recognizer;
1292 // for validating content models: one per type, shared,
1293 // and constructed only on demand ... so unused elements do
1294 // not need to consume resources.
1295 Recognizer getRecognizer (ValidationConsumer consumer)
1296 throws SAXException
1298 if (recognizer == null) {
1299 if ("ANY".equals (model))
1300 recognizer = ANY;
1301 else if ("EMPTY".equals (model))
1302 recognizer = new EmptyRecognizer (this);
1303 else if ('#' == model.charAt (1))
1304 // n.b. this constructor does a validity check
1305 recognizer = new MixedRecognizer (this, consumer);
1306 else
1307 recognizer = new ChildrenRecognizer (this, consumer);
1309 return recognizer;
1313 /** Holds per-attribute declarations */
1314 static private final class AttributeInfo
1316 String type;
1317 String mode; // #REQUIRED, etc (or null)
1318 String value; // or null
//
// Content model validation
//

// Recognizer used for the "ANY" content model.  It is an instance of
// the base Recognizer class, created with no element type (null).
static private final Recognizer ANY = new Recognizer (null);
1329 // Base class defines the calls used to validate content,
1330 // and supports the "ANY" content model
1331 static private class Recognizer
1333 final ElementInfo type;
1335 Recognizer (ElementInfo t) { type = t; }
1337 // return true iff character data is legal here
1338 boolean acceptCharacters ()
1339 throws SAXException
1340 // VC: Element Valid (third and fourth clauses)
1341 { return true; }
1343 // null return = failure
1344 // otherwise, next state (like an FSM)
1345 // prerequisite: tested that name was declared
1346 Recognizer acceptElement (String name)
1347 throws SAXException
1348 // VC: Element Valid (fourth clause)
1349 { return this; }
1351 // return true iff model is completed, can finish
1352 boolean completed ()
1353 throws SAXException
1354 // VC: Element Valid (fourth clause)
1355 { return true; }
1357 public String toString ()
1358 // n.b. "children" is the interesting case!
1359 { return (type == null) ? "ANY" : type.model; }
1362 // "EMPTY" content model -- no characters or elements
1363 private static final class EmptyRecognizer extends Recognizer
1365 public EmptyRecognizer (ElementInfo type)
1366 { super (type); }
1368 // VC: Element Valid (first clause)
1369 boolean acceptCharacters ()
1370 { return false; }
1372 // VC: Element Valid (first clause)
1373 Recognizer acceptElement (String name)
1374 { return null; }
1377 // "Mixed" content model -- ANY, but restricts elements
1378 private static final class MixedRecognizer extends Recognizer
1380 private String permitted [];
1382 // N.B. constructor tests for duplicated element names (VC)
1383 public MixedRecognizer (ElementInfo t, ValidationConsumer v)
1384 throws SAXException
1386 super (t);
1388 // (#PCDATA...)* or (#PCDATA) ==> ... or empty
1389 // with the "..." being "|elname|..."
1390 StringTokenizer tokens = new StringTokenizer (
1391 t.model.substring (8, t.model.lastIndexOf (')')),
1392 "|");
1393 Vector vec = new Vector ();
1395 while (tokens.hasMoreTokens ()) {
1396 String token = tokens.nextToken ();
1398 if (vec.contains (token))
1399 v.error ("element " + token
1400 + " is repeated in mixed content model: "
1401 + t.model);
1402 else
1403 vec.addElement (token.intern ());
1405 permitted = new String [vec.size ()];
1406 for (int i = 0; i < permitted.length; i++)
1407 permitted [i] = (String) vec.elementAt (i);
1409 // in one large machine-derived DTD sample, most of about
1410 // 250 mixed content models were empty, and 25 had ten or
1411 // more entries. 2 had over a hundred elements. Linear
1412 // search isn't obviously wrong.
1415 // VC: Element Valid (third clause)
1416 Recognizer acceptElement (String name)
1418 int length = permitted.length;
1420 // first pass -- optimistic w.r.t. event source interning
1421 // (and document validity)
1422 for (int i = 0; i < length; i++)
1423 if (permitted [i] == name)
1424 return this;
1425 // second pass -- pessimistic w.r.t. event source interning
1426 for (int i = 0; i < length; i++)
1427 if (permitted [i].equals (name))
1428 return this;
1429 return null;
// recognizer loop flags, see later
// F_LOOPHEAD:  set on the CHOICE node that a loop points back to
private static final int F_LOOPHEAD = 0x01;
// F_LOOPNEXT:  set on a cloned loop body; patchNext() returns at once
// for such nodes, and so neither repatches nor follows them
private static final int F_LOOPNEXT = 0x02;

// for debugging -- used to label/count nodes in toString()
private static int nodeCount;
1442 * "Children" content model -- these are nodes in NDFA state graphs.
1443 * They work in fixed space. Note that these graphs commonly have
1444 * cycles, handling features such as zero-or-more and one-or-more.
1446 * <p>It's readonly, so only one copy is ever needed. The content model
1447 * stack may have any number of pointers into each graph, when a model
1448 * happens to be needed more than once due to element nesting. Since
1449 * traversing the graph just moves to another node, and never changes
1450 * it, traversals never interfere with each other.
1452 * <p>There is an option to report non-deterministic models. These are
1453 * always XML errors, but ones which are not often reported despite the
1454 * fact that they can lead to different validating parsers giving
1455 * different results for the same input. (The XML spec doesn't require
1456 * them to be reported.)
1458 * <p><b>FIXME</b> There's currently at least one known bug here, in that
1459 * it's not actually detecting the non-determinism it tries to detect.
1460 * (Of the "optional.xml" test, the once-or-twice-2* tests are all non-D;
1461 * maybe some others.) This may relate to the issue flagged below as
1462 * "should not" happen (but it was), which showed up when patching the
1463 * graph to have one exit node (or more EMPTY nodes).
1465 private static final class ChildrenRecognizer extends Recognizer
1466 implements Cloneable
1468 // for reporting non-deterministic content models
1469 // ... a waste of space if we're not reporting those!
1470 // ... along with the 'model' member (in base class)
1471 private ValidationConsumer consumer;
1473 // for CHOICE nodes -- each component is an arc that
1474 // accepts a different NAME (or is EMPTY indicating
1475 // NDFA termination).
1476 private Recognizer components [];
1478 // for NAME/SEQUENCE nodes -- accepts that NAME and
1479 // then goes to the next node (CHOICE, NAME, EMPTY).
1480 private String name;
1481 private Recognizer next;
1483 // loops always point back to a CHOICE node. we mark such choice
1484 // nodes (F_LOOPHEAD) for diagnostics and faster deep cloning.
1485 // We also mark nodes before back pointers (F_LOOPNEXT), to ensure
1486 // termination when we patch sequences and loops.
1487 private int flags;
1490 // prevent a needless indirection between 'this' and 'node'
1491 private void copyIn (ChildrenRecognizer node)
1493 // model & consumer are already set
1494 components = node.components;
1495 name = node.name;
1496 next = node.next;
1497 flags = node.flags;
1500 // used to construct top level "children" content models,
1501 public ChildrenRecognizer (ElementInfo type, ValidationConsumer vc)
1503 this (vc, type);
1504 populate (type.model.toCharArray (), 0);
1505 patchNext (new EmptyRecognizer (type), null);
// used internally; populating is separate.
// n.b. the parameter order is swapped relative to the public
// constructor, which is what tells the two apart.
private ChildrenRecognizer (ValidationConsumer vc, ElementInfo type)
{
    super (type);
    this.consumer = vc;
}
1517 // When rewriting some graph nodes we need deep clones in one case;
1518 // mostly shallow clones (what the JVM handles for us) are fine.
1520 private ChildrenRecognizer shallowClone ()
1522 try {
1523 return (ChildrenRecognizer) clone ();
1524 } catch (CloneNotSupportedException e) {
1525 throw new Error ("clone");
1529 private ChildrenRecognizer deepClone ()
1531 return deepClone (new Hashtable (37));
1534 private ChildrenRecognizer deepClone (Hashtable table)
1536 ChildrenRecognizer retval;
1538 if ((flags & F_LOOPHEAD) != 0) {
1539 retval = (ChildrenRecognizer) table.get (this);
1540 if (retval != null)
1541 return this;
1543 retval = shallowClone ();
1544 table.put (this, retval);
1545 } else
1546 retval = shallowClone ();
1548 if (next != null) {
1549 if (next instanceof ChildrenRecognizer)
1550 retval.next = ((ChildrenRecognizer)next)
1551 .deepClone (table);
1552 else if (!(next instanceof EmptyRecognizer))
1553 throw new RuntimeException ("deepClone");
1556 if (components != null) {
1557 retval.components = new Recognizer [components.length];
1558 for (int i = 0; i < components.length; i++) {
1559 Recognizer temp = components [i];
1561 if (temp == null)
1562 retval.components [i] = null;
1563 else if (temp instanceof ChildrenRecognizer)
1564 retval.components [i] = ((ChildrenRecognizer)temp)
1565 .deepClone (table);
1566 else if (!(temp instanceof EmptyRecognizer))
1567 throw new RuntimeException ("deepClone");
1571 return retval;
1574 // connect subgraphs, first to next (sequencing)
1575 private void patchNext (Recognizer theNext, Hashtable table)
1577 // backpointers must not be repatched or followed
1578 if ((flags & F_LOOPNEXT) != 0)
1579 return;
1581 // XXX this table "shouldn't" be needed, right?
1582 // but some choice nodes looped if it isn't there.
1583 if (table != null && table.get (this) != null)
1584 return;
1585 if (table == null)
1586 table = new Hashtable ();
1588 // NAME/SEQUENCE
1589 if (name != null) {
1590 if (next == null)
1591 next = theNext;
1592 else if (next instanceof ChildrenRecognizer) {
1593 ((ChildrenRecognizer)next).patchNext (theNext, table);
1594 } else if (!(next instanceof EmptyRecognizer))
1595 throw new RuntimeException ("patchNext");
1596 return;
1599 // CHOICE
1600 for (int i = 0; i < components.length; i++) {
1601 if (components [i] == null)
1602 components [i] = theNext;
1603 else if (components [i] instanceof ChildrenRecognizer) {
1604 ((ChildrenRecognizer)components [i])
1605 .patchNext (theNext, table);
1606 } else if (!(components [i] instanceof EmptyRecognizer))
1607 throw new RuntimeException ("patchNext");
1610 if (table != null && (flags & F_LOOPHEAD) != 0)
1611 table.put (this, this);
/**
 * Parses a 'children' spec (or recursively 'cp') and makes this
 * become a regular graph node.
 *
 * <p>The particle starting at startPos is parsed first, turning this
 * node into a NAME node, a CHOICE node, or the head of a SEQUENCE.
 * If an arity character ('?', '*', '+') follows, the node is then
 * rewritten in terms of those same node kinds.  Exits that are not
 * connected yet are left null, to be filled in by patchNext().
 *
 * @param parseBuf the content model text
 * @param startPos index of the first character of this particle
 * @return index after this particle
 * @throws IndexOutOfBoundsException if no character follows startPos
 * @throws RuntimeException if a group is not closed by a parenthesis
 */
private int populate (char parseBuf [], int startPos)
{
    int nextPos = startPos + 1;
    char c;

    if (nextPos < 0 || nextPos >= parseBuf.length)
        throw new IndexOutOfBoundsException ();

    // Grammar of the string is from the XML spec, but
    // with whitespace removed by the SAX parser.
    //
    // children ::= (choice | seq) ('?' | '*' | '+')?
    // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
    // choice ::= '(' cp ('|' cp)+ ')'
    // seq ::= '(' cp (',' cp)* ')'

    // interior nodes only
    //   cp ::= name ...
    if (parseBuf [startPos] != '('/*)*/) {
        boolean done = false;
        // scan ahead to the character that ends the name; inside the
        // switch, "continue" goes on to the loop's "while" test
        do {
            switch (c = parseBuf [nextPos]) {
                case '?': case '*': case '+':
                case '|': case ',':
                case /*(*/ ')':
                    done = true;
                    continue;
                default:
                    nextPos++;
                    continue;
            }
        } while (!done);
        name = new String (parseBuf, startPos, nextPos - startPos);

    // interior OR toplevel nodes
    //   cp ::= choice ..
    //   cp ::= seq ..
    } else {
        // collect everything as a separate list, and merge it
        // into "this" later if we can (SEQUENCE or singleton)
        ChildrenRecognizer first;

        first = new ChildrenRecognizer (consumer, type);
        nextPos = first.populate (parseBuf, nextPos);
        // the character after the first particle tells the group kind
        c = parseBuf [nextPos++];

        if (c == ',' || c == '|') {
            ChildrenRecognizer current = first;
            char separator = c;
            Vector v = null;

            // only a choice needs its alternatives collected
            if (separator == '|') {
                v = new Vector ();
                v.addElement (first);
            }

            do {
                ChildrenRecognizer link;

                link = new ChildrenRecognizer (consumer, type);
                nextPos = link.populate (parseBuf, nextPos);

                if (separator == ',') {
                    // sequence:  chain this particle after the previous one
                    current.patchNext (link, null);
                    current = link;
                } else
                    v.addElement (link);

                c = parseBuf [nextPos++];
            } while (c == separator);

            // choice ... collect everything into one array.
            if (separator == '|') {
                // assert v.size() > 1
                components = new Recognizer [v.size ()];
                for (int i = 0; i < components.length; i++) {
                    components [i] = (Recognizer)
                            v.elementAt (i);
                }
                // assert flags == 0

            // sequence ... merge into "this" to be smaller.
            } else
                copyIn (first);

        // treat singletons like one-node sequences.
        } else
            copyIn (first);

        // whatever ended the list of particles must close the group
        if (c != /*(*/ ')')
            throw new RuntimeException ("corrupt content model");
    }

    //
    // Arity is optional, and the root of all fun.  We keep the
    // FSM state graph simple by only having NAME/SEQUENCE and
    // CHOICE nodes (or EMPTY to terminate a model), easily
    // evaluated.  So we rewrite each node that has arity, using
    // those primitives.  We create loops here, if needed.
    //
    if (nextPos < parseBuf.length) {
        c = parseBuf [nextPos];
        if (c == '?' || c == '*' || c == '+') {
            nextPos++;

            // Rewrite 'zero-or-one' "?" arity to a CHOICE:
            //   - SEQUENCE (clone, what's next)
            //   - or, what's next
            // Size cost: N --> N + 1
            if (c == '?') {
                // the clone keeps what this node was; this node
                // itself becomes the two-way CHOICE
                Recognizer once = shallowClone ();

                components = new Recognizer [2];
                components [0] = once;
                // components [1] initted to null
                name = null;
                next = null;
                flags = 0;


            // Rewrite 'zero-or-more' "*" arity to a CHOICE.
            //   - LOOP (clone, back to this CHOICE)
            //   - or, what's next
            // Size cost: N --> N + 1
            } else if (c == '*') {
                ChildrenRecognizer loop = shallowClone ();

                // the clone's open exits lead back to this node;
                // the flag is set only after that patching is done
                loop.patchNext (this, null);
                loop.flags |= F_LOOPNEXT;
                flags = F_LOOPHEAD;

                components = new Recognizer [2];
                components [0] = loop;
                // components [1] initted to null
                name = null;
                next = null;


            // Rewrite 'one-or-more' "+" arity to a SEQUENCE.
            // Basically (a)+ --> ((a),(a)*).
            //   - this
            //   - CHOICE
            //       * LOOP (clone, back to the CHOICE)
            //       * or, whatever's next
            // Size cost: N --> 2N + 1
            } else if (c == '+') {
                ChildrenRecognizer loop = deepClone ();
                ChildrenRecognizer choice;

                choice = new ChildrenRecognizer (consumer, type);
                // the clone's open exits lead back to the new CHOICE
                loop.patchNext (choice, null);
                loop.flags |= F_LOOPNEXT;
                choice.flags = F_LOOPHEAD;

                choice.components = new Recognizer [2];
                choice.components [0] = loop;
                // choice.components [1] initted to null
                // choice.name, choice.next initted to null

                // this node keeps its shape; its open exits lead
                // to the new CHOICE
                patchNext (choice, null);
            }
        }
    }

    return nextPos;
}
1787 // VC: Element Valid (second clause)
1788 boolean acceptCharacters ()
1789 { return false; }
1791 // VC: Element Valid (second clause)
1792 Recognizer acceptElement (String type)
1793 throws SAXException
1795 // NAME/SEQUENCE
1796 if (name != null) {
1797 if (name.equals (type))
1798 return next;
1799 return null;
1802 // CHOICE ... optionally reporting nondeterminism we
1803 // run across. we won't check out every transition
1804 // for nondeterminism; only the ones we follow.
1805 Recognizer retval = null;
1807 for (int i = 0; i < components.length; i++) {
1808 Recognizer temp = components [i].acceptElement (type);
1810 if (temp == null)
1811 continue;
1812 else if (!warnNonDeterministic)
1813 return temp;
1814 else if (retval == null)
1815 retval = temp;
1816 else if (retval != temp)
1817 consumer.error ("Content model " + this.type.model
1818 + " is non-deterministic for " + type);
1820 return retval;
1823 // VC: Element Valid (second clause)
1824 boolean completed ()
1825 throws SAXException
1827 // expecting a specific element
1828 if (name != null)
1829 return false;
1831 // choice, some sequences
1832 for (int i = 0; i < components.length; i++) {
1833 if (components [i].completed ())
1834 return true;
1837 return false;
1840 /** /
1841 // FOR DEBUGGING ... flattens the graph for printing.
1843 public String toString ()
1845 StringBuffer buf = new StringBuffer ();
1847 // only one set of loop labels can be generated
1848 // at a time...
1849 synchronized (ANY) {
1850 nodeCount = 0;
1852 toString (buf, new Hashtable ());
1853 return buf.toString ();
1857 private void toString (StringBuffer buf, Hashtable table)
1859 // When we visit a node, label and count it.
1860 // Nodes are never visited/counted more than once.
1861 // For small models labels waste space, but if arity
1862 // mappings were used the savings are substantial.
1863 // (Plus, the output can be more readily understood.)
1864 String temp = (String) table.get (this);
1866 if (temp != null) {
1867 buf.append ('{');
1868 buf.append (temp);
1869 buf.append ('}');
1870 return;
1871 } else {
1872 StringBuffer scratch = new StringBuffer (15);
1874 if ((flags & F_LOOPHEAD) != 0)
1875 scratch.append ("loop");
1876 else
1877 scratch.append ("node");
1878 scratch.append ('-');
1879 scratch.append (++nodeCount);
1880 temp = scratch.toString ();
1882 table.put (this, temp);
1883 buf.append ('[');
1884 buf.append (temp);
1885 buf.append (']');
1886 buf.append (':');
1889 // NAME/SEQUENCE
1890 if (name != null) {
1891 // n.b. some output encodings turn some name chars into '?'
1892 // e.g. with Japanese names and ASCII output
1893 buf.append (name);
1894 if (components != null) // bug!
1895 buf.append ('$');
1896 if (next == null)
1897 buf.append (",*");
1898 else if (next instanceof EmptyRecognizer) // patch-to-next
1899 buf.append (",{}");
1900 else if (next instanceof ChildrenRecognizer) {
1901 buf.append (',');
1902 ((ChildrenRecognizer)next).toString (buf, table);
1903 } else // bug!
1904 buf.append (",+");
1905 return;
1908 // CHOICE
1909 buf.append ("<");
1910 for (int i = 0; i < components.length; i++) {
1911 if (i != 0)
1912 buf.append ("|");
1913 if (components [i] instanceof EmptyRecognizer) {
1914 buf.append ("{}");
1915 } else if (components [i] == null) { // patch-to-next
1916 buf.append ('*');
1917 } else {
1918 ChildrenRecognizer r;
1920 r = (ChildrenRecognizer) components [i];
1921 r.toString (buf, table);
1924 buf.append (">");
1926 /**/