libjava/classpath/gnu/xml/pipeline/ValidationConsumer.java

   1 /* ValidationConsumer.java --
   2    Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
   3
   4 This file is part of GNU Classpath.
   5
   6 GNU Classpath is free software; you can redistribute it and/or modify
   7 it under the terms of the GNU General Public License as published by
   8 the Free Software Foundation; either version 2, or (at your option)
   9 any later version.
  10
  11 GNU Classpath is distributed in the hope that it will be useful, but
  12 WITHOUT ANY WARRANTY; without even the implied warranty of
  13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 General Public License for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GNU Classpath; see the file COPYING.  If not, write to the
  18 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19 02110-1301 USA.
  20
  21 Linking this library statically or dynamically with other modules is
  22 making a combined work based on this library.  Thus, the terms and
  23 conditions of the GNU General Public License cover the whole
  24 combination.
  25
  26 As a special exception, the copyright holders of this library give you
  27 permission to link this library with independent modules to produce an
  28 executable, regardless of the license terms of these independent
  29 modules, and to copy and distribute the resulting executable under
  30 terms of your choice, provided that you also meet, for each linked
  31 independent module, the terms and conditions of the license of that
  32 module.  An independent module is a module which is not derived from
  33 or based on this library.  If you modify this library, you may extend
  34 this exception to your version of the library, but you are not
  35 obligated to do so.  If you do not wish to do so, delete this
  36 exception statement from your version. */
  37
  38 package gnu.xml.pipeline;
  39
  40 import java.io.IOException;
  41 import java.io.StringReader;
  42 import java.io.StringWriter;
  43 import java.util.EmptyStackException;
  44 import java.util.Enumeration;
  45 import java.util.Hashtable;
  46 import java.util.Stack;
  47 import java.util.StringTokenizer;
  48 import java.util.Vector;
  49
  50 import org.xml.sax.Attributes;
  51 import org.xml.sax.EntityResolver;
  52 import org.xml.sax.ErrorHandler;
  53 import org.xml.sax.InputSource;
  54 import org.xml.sax.Locator;
  55 import org.xml.sax.SAXException;
  56 import org.xml.sax.SAXParseException;
  57 import org.xml.sax.XMLReader;
  58 import org.xml.sax.helpers.XMLReaderFactory;
  59
  60 /**
  61  * This class checks SAX2 events to report validity errors; it works as
  62  * both a filter and a terminus on an event pipeline.  It relies on the
  63  * producer of SAX events to:  </p> <ol>
  64  *
  65  *      <li> Conform to the specification of a non-validating XML parser that
  66  *      reads all external entities, reported using SAX2 events. </li>
  67  *
  68  *      <li> Report ignorable whitespace as such (through the ContentHandler
  69  *      interface).  This is, strictly speaking, optional for nonvalidating
  70  *      XML processors.  </li>
  71  *
  72  *      <li> Make SAX2 DeclHandler callbacks, with default
  73  *      attribute values already normalized (and without "&lt;").</li>
  74  *
  75  *      <li> Make SAX2 LexicalHandler startDTD() and endDTD ()
  76  *      callbacks. </li>
  77  *
  78  *      <li> Act as if the <em>(URI)/namespace-prefixes</em> property were
  79  *      set to true, by providing XML 1.0 names and all <code>xmlns*</code>
  80  *      attributes (rather than omitting either or both). </li>
  81  *
  82  *      </ol>
  83  *
  84  * <p> At this writing, the major SAX2 parsers (such as &AElig;lfred2,
  85  * Crimson, and Xerces) meet these requirements, and this validation
  86  * module is used by the optional &AElig;lfred2 validation support.
  87  * </p>
  88  *
  89  * <p> Note that because this is a layered validator, it has to duplicate some
 * work that the parser is doing; there are also other costs to layering.
  91  * However, <em>because of layering it doesn't need a parser</em> in order
  92  * to work! You can use it with anything that generates SAX events, such
  93  * as an application component that wants to detect invalid content in
  94  * a changed area without validating an entire document, or which wants to
  95  * ensure that it doesn't write invalid data to a communications partner.</p>
  96  *
  97  * <p> Also, note that because this is a layered validator, the line numbers
  98  * reported for some errors may seem strange.  For example, if an element does
  99  * not permit character content, the validator
 100  * will use the locator provided to it.
 101  * That might reflect the last character of a <em>characters</em> event
 102  * callback, rather than the first non-whitespace character. </p>
 103  *
 104  * <hr />
 105  *
 106  * <!--
 107  * <p> Of interest is the fact that unlike most currently known XML validators,
 108  * this one can report some cases of non-determinism in element content models.
 109  * It is a compile-time option, enabled by default.  This will only report
 110  * such XML errors if they relate to content actually appearing in a document;
 111  * content models aren't aggressively scanned for non-deterministic structure.
 112  * Documents which trigger such non-deterministic transitions may be handled
 113  * differently by different validating parsers, without losing conformance
 114  * to the XML specification. </p>
 115  * -->
 116  *
 117  * <p> Current limitations of the validation performed are in roughly three
 118  * categories.  </p>
 119  *
 120  * <p> The first category represents constraints which demand violations
 121  * of software layering:  exposing lexical details, one of the first things
 122  * that <em>application</em> programming interfaces (APIs) hide.  These
 123  * invariably relate to XML entity handling, and to historical oddities
 124  * of the XML validation semantics.  Curiously,
 125  * recent (Autumn 1999) conformance testing showed that these constraints are
 126  * among those handled worst by existing XML validating parsers.  Arguments
 127  * have been made that each of these VCs should be turned into WFCs (most
 128  * of them) or discarded (popular for the standalone declaration); in short,
 129  * that these are bugs in the XML specification (not all via SGML): </p><ul>
 130  *
 131  *      <li> The <em>Proper Declaration/PE Nesting</em> and
 132  *      <em>Proper Group/PE Nesting</em> VCs can't be tested because they
 133  *      require access to particularly low level lexical level information.
 134  *      In essence, the reason XML isn't a simple thing to parse is that
 135  *      it's not a context free grammar, and these constraints elevate that
 136  *      SGML-derived context sensitivity to the level of a semantic rule.
 137  *
 138  *      <li> The <em>Standalone Document Declaration</em> VC can't be
 139  *      tested.  This is for two reasons.  First, this flag isn't made
 140  *      available through SAX2.  Second, it also requires breaking that
 141  *      lexical layering boundary.  (If you ever wondered why classes
 142  *      in compiler construction or language design barely mention the
 143  *      existence of context-sensitive grammars, it's because of messy
 144  *      issues like these.)
 145  *
 146  *      <li> The <em>Entity Declared</em> VC can't be tested, because it
 147  *      also requires breaking that lexical layering boundary!  There's also
 148  *      another issue: the VC wording (and seemingly intent) is ambiguous.
 149  *      (This is still true in the "Second edition" XML spec.)
 150  *      Since there is a WFC of the same name, everyone's life would be
 151  *      easier if references to undeclared parsed entities were always well
 152  *      formedness errors, regardless of whether they're parameter entities
 153  *      or not.  (Note that nonvalidating parsers are not required
 154  *      to report all such well formedness errors if they don't read external
 155  *      parameter entities, although currently most XML parsers read them
 156  *      in an attempt to avoid problems from inconsistent parser behavior.)
 157  *
 158  *      </ul>
 159  *
 160  * <p> The second category of limitations on this validation represent
 161  * constraints associated with information that is not guaranteed to be
 * available (or in one case, <em>is guaranteed not to be available</em>),
 163  * through the SAX2 API: </p><ul>
 164  *
 165  *      <li> The <em>Unique Element Type Declaration</em> VC may not be
 166  *      reportable, if the underlying parser happens not to expose
 167  *      multiple declarations.   (&AElig;lfred2 reports these validity
 168  *      errors directly.)</li>
 169  *
 170  *      <li> Similarly, the <em>Unique Notation Name</em> VC, added in the
 171  *      14-January-2000 XML spec errata to restrict typing models used by
 172  *      elements, may not be reportable.  (&AElig;lfred reports these
 173  *      validity errors directly.) </li>
 174  *
 175  *      </ul>
 176  *
 177  * <p> A third category relates to ease of implementation.  (Think of this
 178  * as "bugs".)  The most notable issue here is character handling.  Rather
 179  * than attempting to implement the voluminous character tables in the XML
 180  * specification (Appendix B), Unicode rules are used directly from
 181  * the java.lang.Character class.  Recent JVMs have begun to diverge from
 182  * the original specification for that class (Unicode 2.0), meaning that
 183  * different JVMs may handle that aspect of conformance differently.
 184  * </p>
 185  *
 186  * <p> Note that for some of the validity errors that SAX2 does not
 187  * expose, a nonvalidating parser is permitted (by the XML specification)
 188  * to report validity errors.  When used with a parser that does so for
 189  * the validity constraints mentioned above (or any other SAX2 event
 190  * stream producer that does the same thing), overall conformance is
 191  * substantially improved.
 192  *
 193  * @see gnu.xml.aelfred2.SAXDriver
 194  * @see gnu.xml.aelfred2.XmlReader
 195  *
 196  * @author David Brownell
 197  */
 198 public final class ValidationConsumer extends EventFilter
 199 {
    // report error if we happen to notice a non-deterministic choice?
    // we won't report buggy content models; just buggy instances
    // (compile-time constant; currently switched off)
    private static final boolean        warnNonDeterministic = false;

    // for tracking active content models
    // (rootName is recorded by startDTD and cleared by resetState)
    private String              rootName;
    private Stack               contentStack = new Stack ();

    // flags for "saved DTD" processing
    // disableDeclarations: when set, declaration callbacks return
    //     without recording anything or forwarding the event
    // disableReset: when set, resetState() leaves all tables untouched
    private boolean             disableDeclarations;
    private boolean             disableReset;

    //
    // most VCs get tested when we see element start tags.  the per-element
    // info (including attributes) recorded here duplicates that found inside
    // many nonvalidating parsers, hence dual lookups etc ... that's why a
    // layered validator isn't going to be as fast as a non-layered one.
    //

    // key = element name; value = ElementInfo
    private Hashtable           elements = new Hashtable ();

    // some VCs relate to ID/IDREF/IDREFS attributes
    // key = id; value = boolean true (defd) or false (refd)
    private Hashtable           ids = new Hashtable ();

    // we just record declared notation and unparsed entity names.
    // the implementation here is simple/slow; these features
    // are seldom used, one hopes they'll wither away soon
    //
    // nDeferred / uDeferred hold names that were referenced before
    // being declared; endDTD() checks them against notations / unparsed
    // and then empties the deferred lists.
    private Vector              notations = new Vector (5, 5);
    private Vector              nDeferred = new Vector (5, 5);
    private Vector              unparsed = new Vector (5, 5);
    private Vector              uDeferred = new Vector (5, 5);
 233
 234         // note: DocBk 3.1.7 XML defines over 2 dozen notations,
 235         // used when defining unparsed entities for graphics
 236         // (and maybe in other places)
 237
 238
 239
    /**
     * Creates a pipeline terminus which consumes all events passed to
     * it; this will report validity errors as if they were fatal errors,
     * unless an error handler is assigned.
     *
     * <p> This simply delegates to the single-argument constructor,
     * passing null as the next consumer.
     *
     * @see #setErrorHandler
     */
        // constructor used by PipelineFactory
            // ... and want one taking system ID of an external subset
    public ValidationConsumer ()
    {
        this (null);
    }
 253
 254     /**
 255      * Creates a pipeline filter which reports validity errors and then
 256      * passes events on to the next consumer if they were not fatal.
 257      *
 258      * @see #setErrorHandler
 259      */
 260         // constructor used by PipelineFactory
 261             // ... and want one taking system ID of an external subset
 262             // (which won't send declaration events)
 263     public ValidationConsumer (EventConsumer next)
 264     {
 265         super (next);
 266
 267         setContentHandler (this);
 268         setDTDHandler (this);
 269         try { setProperty (DECL_HANDLER, this); }
 270         catch (Exception e) { /* "can't happen" */ }
 271         try { setProperty (LEXICAL_HANDLER, this); }
 272         catch (Exception e) { /* "can't happen" */ }
 273     }
 274
 275
    // Placeholder root element name, substituted by the DTD-preloading
    // constructor when its caller passes a null root name.  That
    // constructor recognizes the placeholder by identity ("=="), not by
    // string equality.
    private static final String fakeRootName
        = ":Nobody:in:their_Right.Mind_would:use:this-name:1x:";
 278
 279     /**
 280      * Creates a validation consumer which is preloaded with the DTD provided.
 281      * It does this by constructing a document with that DTD, then parsing
 282      * that document and recording its DTD declarations.  Then it arranges
 283      * not to modify that information.
 284      *
 285      * <p> The resulting validation consumer will only validate against
 286      * the specified DTD, regardless of whether some other DTD is found
 287      * in a document being parsed.
 288      *
 289      * @param rootName The name of the required root element; if this is
 290      *  null, any root element name will be accepted.
 291      * @param publicId If non-null and there is a non-null systemId, this
 292      *  identifier provides an alternate access identifier for the DTD's
 293      *  external subset.
 294      * @param systemId If non-null, this is a URI (normally URL) that
 295      *  may be used to access the DTD's external subset.
 296      * @param internalSubset If non-null, holds literal markup declarations
 297      *  comprising the DTD's internal subset.
 298      * @param resolver If non-null, this will be provided to the parser for
 299      *  use when resolving parameter entities (including any external subset).
 300      * @param resolver If non-null, this will be provided to the parser for
 301      *  use when resolving parameter entities (including any external subset).
 302      * @param minimalElement If non-null, a minimal valid document.
 303      *
 304      * @exception SAXNotSupportedException If the default SAX parser does
 305      *  not support the standard lexical or declaration handlers.
 306      * @exception SAXParseException If the specified DTD has either
 307      *  well-formedness or validity errors
 308      * @exception IOException If the specified DTD can't be read for
 309      *  some reason
 310      */
 311     public ValidationConsumer (
 312         String          rootName,
 313         String          publicId,
 314         String          systemId,
 315         String          internalSubset,
 316         EntityResolver  resolver,
 317         String          minimalDocument
 318     ) throws SAXException, IOException
 319     {
 320         this (null);
 321
 322         disableReset = true;
 323         if (rootName == null)
 324             rootName = fakeRootName;
 325
 326         //
 327         // Synthesize document with that DTD; is it possible to do
 328         // better for the declaration of the root element?
 329         //
 330         // NOTE:  can't use SAX2 to write internal subsets.
 331         //
 332         StringWriter    writer = new StringWriter ();
 333
 334         writer.write ("<!DOCTYPE ");
 335         writer.write (rootName);
 336         if (systemId != null) {
 337             writer.write ("\n  ");
 338             if (publicId != null) {
 339                 writer.write ("PUBLIC '");
 340                 writer.write (publicId);
 341                 writer.write ("'\n\t'");
 342             } else
 343                 writer.write ("SYSTEM '");
 344             writer.write (systemId);
 345             writer.write ("'");
 346         }
 347         writer.write (" [ ");
 348         if (rootName == fakeRootName) {
 349             writer.write ("\n<!ELEMENT ");
 350             writer.write (rootName);
 351             writer.write (" EMPTY>");
 352         }
 353         if (internalSubset != null)
 354             writer.write (internalSubset);
 355         writer.write ("\n ]>");
 356
 357         if (minimalDocument != null) {
 358             writer.write ("\n");
 359             writer.write (minimalDocument);
 360             writer.write ("\n");
 361         } else {
 362             writer.write (" <");
 363             writer.write (rootName);
 364             writer.write ("/>\n");
 365         }
 366         minimalDocument = writer.toString ();
 367
 368         //
 369         // OK, load it
 370         //
 371         XMLReader       producer;
 372
 373         producer = XMLReaderFactory.createXMLReader ();
 374         bind (producer, this);
 375
 376         if (resolver != null)
 377             producer.setEntityResolver (resolver);
 378
 379         InputSource     in;
 380
 381         in = new InputSource (new StringReader (minimalDocument));
 382         producer.parse (in);
 383
 384         disableDeclarations = true;
 385         if (rootName == fakeRootName)
 386             this.rootName = null;
 387     }
 388
 389     private void resetState ()
 390     {
 391         if (!disableReset) {
 392             rootName = null;
 393             contentStack.removeAllElements ();
 394             elements.clear ();
 395             ids.clear ();
 396
 397             notations.removeAllElements ();
 398             nDeferred.removeAllElements ();
 399             unparsed.removeAllElements ();
 400             uDeferred.removeAllElements ();
 401         }
 402     }
 403
 404
 405     private void warning (String description)
 406     throws SAXException
 407     {
 408         ErrorHandler            errHandler = getErrorHandler ();
 409         Locator                 locator = getDocumentLocator ();
 410         SAXParseException       err;
 411
 412         if (errHandler == null)
 413             return;
 414
 415         if (locator == null)
 416             err = new SAXParseException (description, null, null, -1, -1);
 417         else
 418             err = new SAXParseException (description, locator);
 419         errHandler.warning (err);
 420     }
 421
 422     // package private (for ChildrenRecognizer)
 423     private void error (String description)
 424     throws SAXException
 425     {
 426         ErrorHandler            errHandler = getErrorHandler ();
 427         Locator                 locator = getDocumentLocator ();
 428         SAXParseException       err;
 429
 430         if (locator == null)
 431             err = new SAXParseException (description, null, null, -1, -1);
 432         else
 433             err = new SAXParseException (description, locator);
 434         if (errHandler != null)
 435             errHandler.error (err);
 436         else    // else we always treat it as fatal!
 437             throw err;
 438     }
 439
 440     private void fatalError (String description)
 441     throws SAXException
 442     {
 443         ErrorHandler            errHandler = getErrorHandler ();
 444         Locator                 locator = getDocumentLocator ();
 445         SAXParseException       err;
 446
 447         if (locator != null)
 448             err = new SAXParseException (description, locator);
 449         else
 450             err = new SAXParseException (description, null, null, -1, -1);
 451         if (errHandler != null)
 452             errHandler.fatalError (err);
 453         // we always treat this as fatal, regardless of the handler
 454         throw err;
 455     }
 456
 457
 458     private static boolean isExtender (char c)
 459     {
 460         // [88] Extender ::= ...
 461         return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
 462                || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
 463                || (c >= 0x3031 && c <= 0x3035)
 464                || (c >= 0x309d && c <= 0x309e)
 465                || (c >= 0x30fc && c <= 0x30fe);
 466     }
 467
 468
 469     // use augmented Unicode rules, not full XML rules
 470     private boolean isName (String name, String context, String id)
 471     throws SAXException
 472     {
 473         char    buf [] = name.toCharArray ();
 474         boolean pass = true;
 475
 476         if (!Character.isUnicodeIdentifierStart (buf [0])
 477                 && ":_".indexOf (buf [0]) == -1)
 478             pass = false;
 479         else {
 480             int max = buf.length;
 481             for (int i = 1; pass && i < max; i++) {
 482                 char c = buf [i];
 483                 if (!Character.isUnicodeIdentifierPart (c)
 484                         && ":-_.".indexOf (c) == -1
 485                         && !isExtender (c))
 486                     pass = false;
 487             }
 488         }
 489
 490         if (!pass)
 491             error ("In " + context + " for " + id
 492                 + ", '" + name + "' is not a name");
 493         return pass;    // true == OK
 494     }
 495
 496     // use augmented Unicode rules, not full XML rules
 497     private boolean isNmtoken (String nmtoken, String context, String id)
 498     throws SAXException
 499     {
 500         char    buf [] = nmtoken.toCharArray ();
 501         boolean pass = true;
 502         int     max = buf.length;
 503
 504         // XXX make this share code with isName
 505
 506         for (int i = 0; pass && i < max; i++) {
 507                 char c = buf [i];
 508             if (!Character.isUnicodeIdentifierPart (c)
 509                     && ":-_.".indexOf (c) == -1
 510                     && !isExtender (c))
 511                 pass = false;
 512         }
 513
 514         if (!pass)
 515             error ("In " + context + " for " + id
 516                 + ", '" + nmtoken + "' is not a name token");
 517         return pass;    // true == OK
 518     }
 519
 520     private void checkEnumeration (String value, String type, String name)
 521     throws SAXException
 522     {
 523         if (!hasMatch (value, type))
 524             // VC: Enumeration
 525             error ("Value '" + value
 526                 + "' for attribute '" + name
 527                 + "' is not permitted: " + type);
 528     }
 529
 530     // used to test enumerated attributes and mixed content models
 531     // package private
 532     static boolean hasMatch (String value, String orList)
 533     {
 534         int len = value.length ();
 535         int max = orList.length () - len;
 536
 537         for (int start = 0;
 538                 (start = orList.indexOf (value, start)) != -1;
 539                 start++) {
 540             char c;
 541
 542             if (start > max)
 543                 break;
 544             c = orList.charAt (start - 1);
 545             if (c != '|' && c != '('/*)*/)
 546                 continue;
 547             c = orList.charAt (start + len);
 548             if (c != '|' && /*(*/ c != ')')
 549                 continue;
 550             return true;
 551         }
 552         return false;
 553     }
 554
 555     /**
 556      * <b>LexicalHandler</b> Records the declaration of the root
 557      * element, so it can be verified later.
 558      * Passed to the next consumer, unless this one was
 559      * preloaded with a particular DTD.
 560      */
 561     public void startDTD (String name, String publicId, String systemId)
 562     throws SAXException
 563     {
 564         if (disableDeclarations)
 565             return;
 566
 567         rootName = name;
 568         super.startDTD (name, publicId, systemId);
 569     }
 570
 571     /**
 572      * <b>LexicalHandler</b> Verifies that all referenced notations
 573      * and unparsed entities have been declared.
 574      * Passed to the next consumer, unless this one was
 575      * preloaded with a particular DTD.
 576      */
 577     public void endDTD ()
 578     throws SAXException
 579     {
 580         if (disableDeclarations)
 581             return;
 582
 583         // this is a convenient hook for end-of-dtd checks, but we
 584         // could also trigger it in the first startElement call.
 585         // locator info is more appropriate here though.
 586
 587         // VC: Notation Declared (NDATA can refer to them before decls,
 588         //      as can NOTATION attribute enumerations and defaults)
 589         int length = nDeferred.size ();
 590         for (int i = 0; i < length; i++) {
 591             String notation = (String) nDeferred.elementAt (i);
 592             if (!notations.contains (notation)) {
 593                 error ("A declaration referred to notation '" + notation
 594                         + "' which was never declared");
 595             }
 596         }
 597         nDeferred.removeAllElements ();
 598
 599         // VC: Entity Name (attribute values can refer to them
 600         //      before they're declared); VC Attribute Default Legal
 601         length = uDeferred.size ();
 602         for (int i = 0; i < length; i++) {
 603             String entity = (String) uDeferred.elementAt (i);
 604             if (!unparsed.contains (entity)) {
 605                 error ("An attribute default referred to entity '" + entity
 606                         + "' which was never declared");
 607             }
 608         }
 609         uDeferred.removeAllElements ();
 610         super.endDTD ();
 611     }
 612
 613
    // These are interned, so we can rely on "==" to find the type of
    // all attributes except enumerations ...
    // "(this|or|that|...)" and "NOTATION (this|or|that|...)"
    //
    // attributeDecl() replaces any declared type that equals one of
    // these entries with the entry itself, which is what makes the
    // later identity comparisons work.
    static final String types [] = {
        "CDATA",
        "ID", "IDREF", "IDREFS",
        "NMTOKEN", "NMTOKENS",
        "ENTITY", "ENTITIES"
    };
 623
 624
 625     /**
     * <b>DeclHandler</b> Records attribute declaration for later use
 627      * in validating document content, and checks validity constraints
 628      * that are applicable to attribute declarations.
 629      * Passed to the next consumer, unless this one was
 630      * preloaded with a particular DTD.
 631      */
 632     public void attributeDecl (
 633         String eName,
 634         String aName,
 635         String type,
 636         String mode,
 637         String value
 638     ) throws SAXException
 639     {
 640         if (disableDeclarations)
 641             return;
 642
 643         ElementInfo     info = (ElementInfo) elements.get (eName);
 644         AttributeInfo   ainfo = new AttributeInfo ();
 645         boolean         checkOne = false;
 646         boolean         interned = false;
 647
 648         // cheap interning of type names and #FIXED, #REQUIRED
 649         // for faster startElement (we can use "==")
 650         for (int i = 0; i < types.length; i++) {
 651             if (types [i].equals (type)) {
 652                 type = types [i];
 653                 interned = true;
 654                 break;
 655             }
 656         }
 657         if ("#FIXED".equals (mode))
 658             mode = "#FIXED";
 659         else if ("#REQUIRED".equals (mode))
 660             mode = "#REQUIRED";
 661
 662         ainfo.type = type;
 663         ainfo.mode = mode;
 664         ainfo.value = value;
 665
 666         // we might not have seen the content model yet
 667         if (info == null) {
 668             info = new ElementInfo (eName);
 669             elements.put (eName, info);
 670         }
 671         if ("ID" == type) {
 672             checkOne = true;
 673             if (!("#REQUIRED" == mode || "#IMPLIED".equals (mode))) {
 674                 // VC: ID Attribute Default
 675                 error ("ID attribute '" + aName
 676                     + "' must be #IMPLIED or #REQUIRED");
 677             }
 678
 679         } else if (!interned && type.startsWith ("NOTATION ")) {
 680             checkOne = true;
 681
 682             // VC: Notation Attributes (notations must be declared)
 683             StringTokenizer     tokens = new StringTokenizer (
 684                 type.substring (10, type.lastIndexOf (')')),
 685                 "|");
 686             while (tokens.hasMoreTokens ()) {
 687                 String  token = tokens.nextToken ();
 688                 if (!notations.contains (token))
 689                     nDeferred.addElement (token);
 690             }
 691         }
 692         if (checkOne) {
 693             for (Enumeration e = info.attributes.keys ();
 694                     e.hasMoreElements ();
 695                     /* NOP */) {
 696                 String          name;
 697                 AttributeInfo   ainfo2;
 698
 699                 name = (String) e.nextElement ();
 700                 ainfo2 = (AttributeInfo) info.attributes.get (name);
 701                 if (type == ainfo2.type || !interned /* NOTATION */) {
 702                     // VC: One ID per Element Type
 703                     // VC: One Notation per Element TYpe
 704                     error ("Element '" + eName
 705                         + "' already has an attribute of type "
 706                         + (interned ? "NOTATION" : type)
 707                         + " ('" + name
 708                         + "') so '" + aName
 709                         + "' is a validity error");
 710                 }
 711             }
 712         }
 713
 714         // VC: Attribute Default Legal
 715         if (value != null) {
 716
 717             if ("CDATA" == type) {
 718                 // event source rejected '<'
 719
 720             } else if ("NMTOKEN" == type) {
 721                 // VC: Name Token (is a nmtoken)
 722                 isNmtoken (value, "attribute default", aName);
 723
 724             } else if ("NMTOKENS" == type) {
 725                 // VC: Name Token (is a nmtoken; at least one value)
 726                 StringTokenizer tokens = new StringTokenizer (value);
 727                 if (!tokens.hasMoreTokens ())
 728                     error ("Default for attribute '" + aName
 729                         + "' must have at least one name token.");
 730                 else do {
 731                     String token = tokens.nextToken ();
 732                     isNmtoken (token, "attribute default", aName);
 733                 } while (tokens.hasMoreTokens ());
 734
 735             } else if ("IDREF" == type || "ENTITY" == type) {
 736                 // VC: Entity Name (is a name)
 737                 // VC: IDREF (is a name) (is declared)
 738                 isName (value, "attribute default", aName);
 739                 if ("ENTITY" == type && !unparsed.contains (value))
 740                     uDeferred.addElement (value);
 741
 742             } else if ("IDREFS" == type || "ENTITIES" == type) {
 743                 // VC: Entity Name (is a name; at least one value)
 744                 // VC: IDREF (is a name; at least one value)
 745                 StringTokenizer names = new StringTokenizer (value);
 746                 if (!names.hasMoreTokens ())
 747                     error ("Default for attribute '" + aName
 748                         + "' must have at least one name.");
 749                 else do {
 750                     String name = names.nextToken ();
 751                     isName (name, "attribute default", aName);
 752                     if ("ENTITIES" == type && !unparsed.contains (name))
 753                         uDeferred.addElement (value);
 754                 } while (names.hasMoreTokens ());
 755
 756             } else if (type.charAt (0) == '(' /*)*/ ) {
 757                 // VC: Enumeration (must match)
 758                 checkEnumeration (value, type, aName);
 759
 760             } else if (!interned && checkOne) { /* NOTATION */
 761                 // VC: Notation attributes (must be names)
 762                 isName (value, "attribute default", aName);
 763
 764                 // VC: Notation attributes (must be declared)
 765                 if (!notations.contains (value))
 766                     nDeferred.addElement (value);
 767
 768                 // VC: Enumeration (must match)
 769                 checkEnumeration (value, type, aName);
 770
 771             } else if ("ID" != type)
 772                 throw new RuntimeException ("illegal attribute type: " + type);
 773         }
 774
 775         if (info.attributes.get (aName) == null)
 776             info.attributes.put (aName, ainfo);
 777         /*
 778         else
 779             warning ("Element '" + eName
 780                 + "' already has an attribute named '" + aName + "'");
 781         */
 782
 783         if ("xml:space".equals (aName)) {
 784             if (!("(default|preserve)".equals (type)
 785                     || "(preserve|default)".equals (type)
 786                         // these next two are arguable; XHTML's DTD doesn't
 787                         // deserve errors.  After all, it's not like any
 788                         // illegal _value_ could pass ...
 789                     || "(preserve)".equals (type)
 790                     || "(default)".equals (type)
 791                     ))
 792                 error (
 793                     "xml:space attribute type must be like '(default|preserve)'"
 794                     + " not '" + type + "'"
 795                     );
 796
 797         }
 798         super.attributeDecl (eName, aName, type, mode, value);
 799     }
 800
 801     /**
 802      * <b>DecllHandler</b> Records the element declaration for later use
 803      * when checking document content, and checks validity constraints that
 804      * apply to element declarations.  Passed to the next consumer, unless
 805      * this one was preloaded with a particular DTD.
 806      */
 807     public void elementDecl (String name, String model)
 808     throws SAXException
 809     {
 810         if (disableDeclarations)
 811             return;
 812
 813         ElementInfo     info = (ElementInfo) elements.get (name);
 814
 815         // we might have seen an attribute decl already
 816         if (info == null) {
 817             info = new ElementInfo (name);
 818             elements.put (name, info);
 819         }
 820         if (info.model != null) {
 821             // NOTE:  not all parsers can report such duplicates.
 822             // VC: Unique Element Type Declaration
 823             error ("Element type '" + name
 824                 + "' was already declared.");
 825         } else {
 826             info.model = model;
 827
 828             // VC: No Duplicate Types (in mixed content models)
 829             if (model.charAt (1) == '#')        // (#PCDATA...
 830                 info.getRecognizer (this);
 831         }
 832         super.elementDecl (name, model);
 833     }
 834
 835     /**
 836      * <b>DecllHandler</b> passed to the next consumer, unless this
 837      * one was preloaded with a particular DTD
 838      */
 839     public void internalEntityDecl (String name, String value)
 840     throws SAXException
 841     {
 842         if (!disableDeclarations)
 843             super.internalEntityDecl (name, value);
 844     }
 845
 846     /**
 847      * <b>DecllHandler</b> passed to the next consumer, unless this
 848      * one was preloaded with a particular DTD
 849      */
 850     public void externalEntityDecl (String name,
 851         String publicId, String systemId)
 852     throws SAXException
 853     {
 854         if (!disableDeclarations)
 855             super.externalEntityDecl (name, publicId, systemId);
 856     }
 857
 858
 859     /**
 860      * <b>DTDHandler</b> Records the notation name, for checking
 861      * NOTATIONS attribute values and declararations of unparsed
 862      * entities.  Passed to the next consumer, unless this one was
 863      * preloaded with a particular DTD.
 864      */
 865     public void notationDecl (String name, String publicId, String systemId)
 866     throws SAXException
 867     {
 868         if (disableDeclarations)
 869             return;
 870
 871         notations.addElement (name);
 872         super.notationDecl (name, publicId, systemId);
 873     }
 874
 875     /**
 876      * <b>DTDHandler</b> Records the entity name, for checking
 877      * ENTITY and ENTITIES attribute values; records the notation
 878      * name if it hasn't yet been declared.  Passed to the next consumer,
 879      * unless this one was preloaded with a particular DTD.
 880      */
 881     public void unparsedEntityDecl (
 882         String name,
 883         String publicId,
 884         String systemId,
 885         String notationName
 886     ) throws SAXException
 887     {
 888         if (disableDeclarations)
 889             return;
 890
 891         unparsed.addElement (name);
 892         if (!notations.contains (notationName))
 893             nDeferred.addElement (notationName);
 894         super.unparsedEntityDecl (name, publicId, systemId, notationName);
 895     }
 896
 897
 898     /**
 899      * <b>ContentHandler</b> Ensures that state from any previous parse
 900      * has been deleted.
 901      * Passed to the next consumer.
 902      */
 903     public void startDocument ()
 904     throws SAXException
 905     {
 906         resetState ();
 907         super.startDocument ();
 908     }
 909
 910
 911     private static boolean isAsciiLetter (char c)
 912     {
 913         return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
 914     }
 915
 916
 917     /**
 918      * <b>ContentHandler</b> Reports a fatal exception.  Validating
 919      * XML processors may not skip any entities.
 920      */
 921     public void skippedEntity (String name)
 922     throws SAXException
 923     {
 924         fatalError ("may not skip entities");
 925     }
 926
 927     /*
 928      * SAX2 doesn't expand non-PE refs in attribute defaults...
 929      */
 930     private String expandDefaultRefs (String s)
 931     throws SAXException
 932     {
 933         if (s.indexOf ('&') < 0)
 934             return s;
 935
 936 // FIXME: handle &#nn; &#xnn; &name;
 937         String message = "Can't expand refs in attribute default: " + s;
 938         warning (message);
 939
 940         return s;
 941     }
 942
 943     /**
 944      * <b>ContentHandler</b> Performs validity checks against element
 945      * (and document) content models, and attribute values.
 946      * Passed to the next consumer.
 947      */
 948     public void startElement (
 949         String          uri,
 950         String          localName,
 951         String          qName,
 952         Attributes      atts
 953     ) throws SAXException
 954     {
 955         //
 956         // First check content model for the enclosing scope.
 957         //
 958         if (contentStack.isEmpty ()) {
 959             // VC:  Root Element Type
 960             if (!qName.equals (rootName)) {
 961                 if (rootName == null)
 962                     warning ("This document has no DTD, can't be valid");
 963                 else
 964                     error ("Root element type '" + qName
 965                         + "' was declared to be '" + rootName + "'");
 966             }
 967         } else {
 968             Recognizer state = (Recognizer) contentStack.peek ();
 969
 970             if (state != null) {
 971                 Recognizer newstate = state.acceptElement (qName);
 972
 973                 if (newstate == null)
 974                     error ("Element type '" + qName
 975                         + "' in element '" + state.type.name
 976                         + "' violates content model " + state.type.model
 977                         );
 978                 if (newstate != state) {
 979                     contentStack.pop ();
 980                     contentStack.push (newstate);
 981                 }
 982             }
 983         }
 984
 985         //
 986         // Then check that this element was declared, and push the
 987         // object used to validate its content model onto our stack.
 988         //
 989         // This is where the recognizer gets created, if needed; if
 990         // it's a "children" (elements) content model, an NDFA is
 991         // created.  (One recognizer is used per content type, no
 992         // matter how complex that recognizer is.)
 993         //
 994         ElementInfo             info;
 995
 996         info = (ElementInfo) elements.get (qName);
 997         if (info == null || info.model == null) {
 998             // VC: Element Valid (base clause)
 999             error ("Element type '" + qName + "' was not declared");
1000             contentStack.push (null);
1001
1002             // for less diagnostic noise, fake a declaration.
1003             elementDecl (qName, "ANY");
1004         } else
1005             contentStack.push (info.getRecognizer (this));
1006
1007         //
1008         // Then check each attribute present
1009         //
1010         int                     len;
1011         String                  aname;
1012         AttributeInfo           ainfo;
1013
1014         if (atts != null)
1015             len = atts.getLength ();
1016         else
1017             len = 0;
1018
1019         for (int i = 0; i < len; i++) {
1020             aname = atts.getQName (i);
1021
1022             if (info == null
1023                     || (ainfo = (AttributeInfo) info.attributes.get (aname))
1024                             == null) {
1025                 // VC: Attribute Value Type
1026                 error ("Attribute '" + aname
1027                     + "' was not declared for element type " + qName);
1028                 continue;
1029             }
1030
1031             String value = atts.getValue (i);
1032
1033             // note that "==" for type names and "#FIXED" is correct
1034             // (and fast) since we've interned those literals.
1035
1036             if ("#FIXED" == ainfo.mode) {
1037                 String expanded = expandDefaultRefs (ainfo.value);
1038
1039                 // VC: Fixed Attribute Default
1040                 if (!value.equals (expanded)) {
1041                     error ("Attribute '" + aname
1042                         + "' must match " + expanded
1043                         );
1044                     continue;
1045                 }
1046             }
1047
1048             if ("CDATA" == ainfo.type)
1049                 continue;
1050
1051             //
1052             // For all other attribute types, there are various
1053             // rules to follow.
1054             //
1055
1056             if ("ID" == ainfo.type) {
1057                 // VC: ID (must be a name)
1058                 if (isName (value, "ID attribute", aname)) {
1059                     if (Boolean.TRUE == ids.get (value))
1060                         // VC: ID (appears once)
1061                         error ("ID attribute " + aname
1062                             + " uses an ID value '" + value
1063                             + "' which was already declared.");
1064                     else
1065                         // any forward refs are no longer problems
1066                         ids.put (value, Boolean.TRUE);
1067                 }
1068                 continue;
1069             }
1070
1071             if ("IDREF" == ainfo.type) {
1072                 // VC: IDREF (value must be a name)
1073                 if (isName (value, "IDREF attribute", aname)) {
1074                     // VC: IDREF (must match some ID attribute)
1075                     if (ids.get (value) == null)
1076                         // new -- assume it's a forward ref
1077                         ids.put (value, Boolean.FALSE);
1078                 }
1079                 continue;
1080             }
1081
1082             if ("IDREFS" == ainfo.type) {
1083                 StringTokenizer tokens = new StringTokenizer (value, " ");
1084
1085                 if (!tokens.hasMoreTokens ()) {
1086                     // VC: IDREF (one or more values)
1087                     error ("IDREFS attribute " + aname
1088                         + " must have at least one ID ref");
1089                 } else do {
1090                     String id = tokens.nextToken ();
1091
1092                     // VC: IDREF (value must be a name)
1093                     if (isName (id, "IDREFS attribute", aname)) {
1094                         // VC: IDREF (must match some ID attribute)
1095                         if (ids.get (id) == null)
1096                             // new -- assume it's a forward ref
1097                             ids.put (id, Boolean.FALSE);
1098                     }
1099                 } while (tokens.hasMoreTokens ());
1100                 continue;
1101             }
1102
1103             if ("NMTOKEN" == ainfo.type) {
1104                 // VC: Name Token (is a name token)
1105                 isNmtoken (value, "NMTOKEN attribute", aname);
1106                 continue;
1107             }
1108
1109             if ("NMTOKENS" == ainfo.type) {
1110                 StringTokenizer tokens = new StringTokenizer (value, " ");
1111
1112                 if (!tokens.hasMoreTokens ()) {
1113                     // VC: Name Token (one or more values)
1114                     error ("NMTOKENS attribute " + aname
1115                         + " must have at least one name token");
1116                 } else do {
1117                     String token = tokens.nextToken ();
1118
1119                     // VC: Name Token (is a name token)
1120                     isNmtoken (token, "NMTOKENS attribute", aname);
1121                 } while (tokens.hasMoreTokens ());
1122                 continue;
1123             }
1124
1125             if ("ENTITY" == ainfo.type) {
1126                 if (!unparsed.contains (value))
1127                     // VC: Entity Name
1128                     error ("Value of attribute '" + aname
1129                         + "' refers to unparsed entity '" + value
1130                         + "' which was not declared.");
1131                 continue;
1132             }
1133
1134             if ("ENTITIES" == ainfo.type) {
1135                 StringTokenizer tokens = new StringTokenizer (value, " ");
1136
1137                 if (!tokens.hasMoreTokens ()) {
1138                     // VC: Entity Name (one or more values)
1139                     error ("ENTITIES attribute " + aname
1140                         + " must have at least one name token");
1141                 } else do {
1142                     String entity = tokens.nextToken ();
1143
1144                     if (!unparsed.contains (entity))
1145                         // VC: Entity Name
1146                         error ("Value of attribute '" + aname
1147                             + "' refers to unparsed entity '" + entity
1148                             + "' which was not declared.");
1149                 } while (tokens.hasMoreTokens ());
1150                 continue;
1151             }
1152
1153             //
1154             // check for enumerations last; more expensive
1155             //
1156             if (ainfo.type.charAt (0) == '(' /*)*/
1157                     || ainfo.type.startsWith ("NOTATION ")
1158                     ) {
1159                 // VC: Enumeration (value must be defined)
1160                 checkEnumeration (value, ainfo.type, aname);
1161                 continue;
1162             }
1163         }
1164
1165         //
1166         // Last, check that all #REQUIRED attributes were provided
1167         //
1168         if (info != null) {
1169             Hashtable   table = info.attributes;
1170
1171             if (table.size () != 0) {
1172                 Enumeration     e = table.keys ();
1173
1174                 // XXX table.keys uses the heap, bleech -- slows things
1175
1176                 while (e.hasMoreElements ()) {
1177                     aname = (String) e.nextElement ();
1178                     ainfo = (AttributeInfo) table.get (aname);
1179
1180                     // "#REQUIRED" mode was interned in attributeDecl
1181                     if ("#REQUIRED" == ainfo.mode
1182                             && atts.getValue (aname) == null) {
1183                         // VC: Required Attribute
1184                         error ("Attribute '" + aname + "' must be specified "
1185                             + "for element type " + qName);
1186                     }
1187                 }
1188             }
1189         }
1190         super.startElement (uri, localName, qName, atts);
1191     }
1192
1193     /**
1194      * <b>ContentHandler</b> Reports a validity error if the element's content
1195      * model does not permit character data.
1196      * Passed to the next consumer.
1197      */
1198     public void characters (char ch [], int start, int length)
1199     throws SAXException
1200     {
1201         Recognizer state;
1202
1203         if (contentStack.empty ())
1204             state = null;
1205         else
1206             state = (Recognizer) contentStack.peek ();
1207
1208         // NOTE:  if this ever supports with SAX parsers that don't
1209         // report ignorable whitespace as such (only XP?), this class
1210         // needs to morph it into ignorableWhitespace() as needed ...
1211
1212         if (state != null && !state.acceptCharacters ())
1213             // VC: Element Valid (clauses three, four -- see recognizer)
1214             error ("Character content not allowed in element "
1215                 + state.type.name);
1216
1217         super.characters (ch, start, length);
1218     }
1219
1220
1221     /**
1222      * <b>ContentHandler</b> Reports a validity error if the element's content
1223      * model does not permit end-of-element yet, or a well formedness error
1224      * if there was no matching startElement call.
1225      * Passed to the next consumer.
1226      */
1227     public void endElement (String uri, String localName, String qName)
1228     throws SAXException
1229     {
1230         try {
1231             Recognizer state = (Recognizer) contentStack.pop ();
1232
1233             if (state != null && !state.completed ())
1234                 // VC: Element valid (clauses two, three, four; see Recognizer)
1235                 error ("Premature end for element '"
1236                     + state.type.name
1237                     + "', content model "
1238                     + state.type.model);
1239
1240             // could insist on match of start element, but that's
1241             // something the input stream must to guarantee.
1242
1243         } catch (EmptyStackException e) {
1244             fatalError ("endElement without startElement: " + qName
1245                 + ((uri == null)
1246                     ? ""
1247                     : ( " { '" + uri + "', " + localName + " }")));
1248         }
1249         super.endElement (uri, localName, qName);
1250     }
1251
1252     /**
1253      * <b>ContentHandler</b> Checks whether all ID values that were
1254      * referenced have been declared, and releases all resources.
1255      * Passed to the next consumer.
1256      *
1257      * @see #setDocumentLocator
1258      */
1259     public void endDocument ()
1260     throws SAXException
1261     {
1262         for (Enumeration idNames = ids.keys ();
1263                 idNames.hasMoreElements ();
1264                 /* NOP */) {
1265             String id = (String) idNames.nextElement ();
1266
1267             if (Boolean.FALSE == ids.get (id)) {
1268                 // VC: IDREF (must match ID)
1269                 error ("Undeclared ID value '" + id
1270                     + "' was referred to by an IDREF/IDREFS attribute");
1271             }
1272         }
1273
1274         resetState ();
1275         super.endDocument ();
1276     }
1277
1278
1279     /** Holds per-element declarations */
1280     static private final class ElementInfo
1281     {
1282         String                  name;
1283         String                  model;
1284
1285         // key = attribute name; value = AttributeInfo
1286         Hashtable               attributes = new Hashtable (11);
1287
1288         ElementInfo (String n) { name = n; }
1289
1290         private Recognizer      recognizer;
1291
1292         // for validating content models:  one per type, shared,
1293         // and constructed only on demand ... so unused elements do
1294         // not need to consume resources.
1295         Recognizer      getRecognizer (ValidationConsumer consumer)
1296         throws SAXException
1297         {
1298             if (recognizer == null) {
1299                 if ("ANY".equals (model))
1300                     recognizer = ANY;
1301                 else if ("EMPTY".equals (model))
1302                     recognizer = new EmptyRecognizer (this);
1303                 else if ('#' == model.charAt (1))
1304                     // n.b. this constructor does a validity check
1305                     recognizer = new MixedRecognizer (this, consumer);
1306                 else
1307                     recognizer = new ChildrenRecognizer (this, consumer);
1308             }
1309             return recognizer;
1310         }
1311     }
1312
1313     /** Holds per-attribute declarations */
1314     static private final class AttributeInfo
1315     {
1316         String  type;
1317         String  mode;           // #REQUIRED, etc (or null)
1318         String  value;          // or null
1319     }
1320
1321
1322     //
1323     // Content model validation
1324     //
1325
1326     static private final Recognizer     ANY = new Recognizer (null);
1327
1328
1329     // Base class defines the calls used to validate content,
1330     // and supports the "ANY" content model
1331     static private class Recognizer
1332     {
1333         final ElementInfo       type;
1334
1335         Recognizer (ElementInfo t) { type = t; }
1336
1337         // return true iff character data is legal here
1338         boolean acceptCharacters ()
1339         throws SAXException
1340             // VC: Element Valid (third and fourth clauses)
1341             { return true; }
1342
1343         // null return = failure
1344         // otherwise, next state (like an FSM)
1345         // prerequisite: tested that name was declared
1346         Recognizer acceptElement (String name)
1347         throws SAXException
1348             // VC: Element Valid (fourth clause)
1349             { return this; }
1350
1351         // return true iff model is completed, can finish
1352         boolean completed ()
1353         throws SAXException
1354             // VC: Element Valid (fourth clause)
1355             { return true; }
1356
1357         public String toString ()
1358             // n.b. "children" is the interesting case!
1359             { return (type == null) ? "ANY" : type.model; }
1360     }
1361
1362     // "EMPTY" content model -- no characters or elements
1363     private static final class EmptyRecognizer extends Recognizer
1364     {
1365         public EmptyRecognizer (ElementInfo type)
1366             { super (type); }
1367
1368         // VC: Element Valid (first clause)
1369         boolean acceptCharacters ()
1370             { return false; }
1371
1372         // VC: Element Valid (first clause)
1373         Recognizer acceptElement (String name)
1374             { return null; }
1375     }
1376
1377     // "Mixed" content model -- ANY, but restricts elements
1378     private static final class MixedRecognizer extends Recognizer
1379     {
1380         private String  permitted [];
1381
1382         // N.B. constructor tests for duplicated element names (VC)
1383         public MixedRecognizer (ElementInfo t, ValidationConsumer v)
1384         throws SAXException
1385         {
1386             super (t);
1387
1388             // (#PCDATA...)* or (#PCDATA) ==> ... or empty
1389             // with the "..." being "|elname|..."
1390             StringTokenizer     tokens = new StringTokenizer (
1391                 t.model.substring (8, t.model.lastIndexOf (')')),
1392                 "|");
1393             Vector              vec = new Vector ();
1394
1395             while (tokens.hasMoreTokens ()) {
1396                 String token = tokens.nextToken ();
1397
1398                 if (vec.contains (token))
1399                     v.error ("element " + token
1400                         + " is repeated in mixed content model: "
1401                         + t.model);
1402                 else
1403                     vec.addElement (token.intern ());
1404             }
1405             permitted = new String [vec.size ()];
1406             for (int i = 0; i < permitted.length; i++)
1407                 permitted [i] = (String) vec.elementAt (i);
1408
1409             // in one large machine-derived DTD sample, most of about
1410             // 250 mixed content models were empty, and 25 had ten or
1411             // more entries.  2 had over a hundred elements.  Linear
1412             // search isn't obviously wrong.
1413         }
1414
1415         // VC: Element Valid (third clause)
1416         Recognizer acceptElement (String name)
1417         {
1418             int         length = permitted.length;
1419
1420             // first pass -- optimistic w.r.t. event source interning
1421             // (and document validity)
1422             for (int i = 0; i < length; i++)
1423                 if (permitted [i] == name)
1424                     return this;
1425             // second pass -- pessimistic w.r.t. event source interning
1426             for (int i = 0; i < length; i++)
1427                 if (permitted [i].equals (name))
1428                     return this;
1429             return null;
1430         }
1431     }
1432
1433
1434     // recognizer loop flags, see later
1435     private static final int            F_LOOPHEAD = 0x01;
1436     private static final int            F_LOOPNEXT = 0x02;
1437
1438     // for debugging -- used to label/count nodes in toString()
1439     private static int                  nodeCount;
1440
1441     /**
1442      * "Children" content model -- these are nodes in NDFA state graphs.
1443      * They work in fixed space.  Note that these graphs commonly have
1444      * cycles, handling features such as zero-or-more and one-or-more.
1445      *
1446      * <p>It's readonly, so only one copy is ever needed.  The content model
1447      * stack may have any number of pointers into each graph, when a model
1448      * happens to be needed more than once due to element nesting.  Since
1449      * traversing the graph just moves to another node, and never changes
1450      * it, traversals never interfere with each other.
1451      *
1452      * <p>There is an option to report non-deterministic models.  These are
1453      * always XML errors, but ones which are not often reported despite the
1454      * fact that they can lead to different validating parsers giving
1455      * different results for the same input.  (The XML spec doesn't require
1456      * them to be reported.)
1457      *
1458      * <p><b>FIXME</b> There's currently at least one known bug here, in that
1459      * it's not actually detecting the non-determinism it tries to detect.
1460      * (Of the "optional.xml" test, the once-or-twice-2* tests are all non-D;
1461      * maybe some others.)  This may relate to the issue flagged below as
1462      * "should not" happen (but it was), which showed up when patching the
1463      * graph to have one exit node (or more EMPTY nodes).
1464      */
1465     private static final class ChildrenRecognizer extends Recognizer
1466         implements Cloneable
1467     {
1468         // for reporting non-deterministic content models
1469         // ... a waste of space if we're not reporting those!
1470         // ... along with the 'model' member (in base class)
1471         private ValidationConsumer      consumer;
1472
1473         // for CHOICE nodes -- each component is an arc that
1474         // accepts a different NAME (or is EMPTY indicating
1475         // NDFA termination).
1476         private Recognizer              components [];
1477
1478         // for NAME/SEQUENCE nodes -- accepts that NAME and
1479         // then goes to the next node (CHOICE, NAME, EMPTY).
1480         private String                  name;
1481         private Recognizer              next;
1482
1483         // loops always point back to a CHOICE node. we mark such choice
1484         // nodes (F_LOOPHEAD) for diagnostics and faster deep cloning.
1485         // We also mark nodes before back pointers (F_LOOPNEXT), to ensure
1486         // termination when we patch sequences and loops.
1487         private int                     flags;
1488
1489
1490         // prevent a needless indirection between 'this' and 'node'
1491         private void copyIn (ChildrenRecognizer node)
1492         {
1493             // model & consumer are already set
1494             components = node.components;
1495             name = node.name;
1496             next = node.next;
1497             flags = node.flags;
1498         }
1499
1500         // used to construct top level "children" content models,
1501         public ChildrenRecognizer (ElementInfo type, ValidationConsumer vc)
1502         {
1503             this (vc, type);
1504             populate (type.model.toCharArray (), 0);
1505             patchNext (new EmptyRecognizer (type), null);
1506         }
1507
1508         // used internally; populating is separate
1509         private ChildrenRecognizer (ValidationConsumer vc, ElementInfo type)
1510         {
1511             super (type);
1512             consumer = vc;
1513         }
1514
1515
1516         //
1517         // When rewriting some graph nodes we need deep clones in one case;
1518         // mostly shallow clones (what the JVM handles for us) are fine.
1519         //
1520         private ChildrenRecognizer shallowClone ()
1521         {
1522             try {
1523                 return (ChildrenRecognizer) clone ();
1524             } catch (CloneNotSupportedException e) {
1525                 throw new Error ("clone");
1526             }
1527         }
1528
1529         private ChildrenRecognizer deepClone ()
1530         {
1531             return deepClone (new Hashtable (37));
1532         }
1533
1534         private ChildrenRecognizer deepClone (Hashtable table)
1535         {
1536             // Clones the graph reachable from this node; returns the clone.
1537             // 'table' maps each loop head cloned so far to its clone, so
1538             // that a loop is cloned only once and closes within the copy.
1539             ChildrenRecognizer retval;
1540
1541             if ((flags & F_LOOPHEAD) != 0) {
1542                 retval = (ChildrenRecognizer) table.get (this);
1543                 if (retval != null)
1544                     return retval;      // the clone, never the original
1545                 retval = shallowClone ();
1546                 table.put (this, retval);
1547             } else
1548                 retval = shallowClone ();
1549
1550             // successor:  null or an EmptyRecognizer is kept as cloned;
1551             // any other kind of Recognizer is unexpected here
1552             if (next instanceof ChildrenRecognizer)
1553                 retval.next = ((ChildrenRecognizer)next).deepClone (table);
1554             else if (next != null && !(next instanceof EmptyRecognizer))
1555                 throw new RuntimeException ("deepClone");
1556
1557             // choices:  the clone gets its own array, where null (not yet
1558             // patched) and EmptyRecognizer entries are carried over as is
1559             if (components != null) {
1560                 retval.components = new Recognizer [components.length];
1561                 for (int i = 0; i < components.length; i++) {
1562                     Recognizer temp = components [i];
1563                     if (temp instanceof ChildrenRecognizer)
1564                         retval.components [i] = ((ChildrenRecognizer)temp)
1565                                 .deepClone (table);
1566                     else if (temp == null || temp instanceof EmptyRecognizer)
1567                         retval.components [i] = temp;
1568                     else
1569                         throw new RuntimeException ("deepClone");
1570                 }
1571             }
1572             return retval;
1573         }
1573
1574         // sequencing: every still-open exit of this subgraph goes to theNext
1575         private void patchNext (Recognizer theNext, Hashtable table)
1576         {
1577             // never repatch a back pointer, nor walk through one
1578             if ((flags & F_LOOPNEXT) != 0)
1579                 return;
1580
1581             // XXX tabling the loop heads already patched "shouldn't" be
1582             // needed, right?  but some choice nodes looped without it.
1583             if (table == null)
1584                 table = new Hashtable ();
1585             else if (table.get (this) != null)
1586                 return;
1587
1588             // NAME/SEQUENCE
1589             if (name != null) {
1590                 if (next instanceof ChildrenRecognizer)
1591                     ((ChildrenRecognizer)next).patchNext (theNext, table);
1592                 else if (next == null)
1593                     next = theNext;
1594                 else if (!(next instanceof EmptyRecognizer))
1595                     throw new RuntimeException ("patchNext");
1596                 return;
1597             }
1598
1599             // CHOICE
1600             for (int slot = 0; slot < components.length; slot++) {
1601                 Recognizer arc = components [slot];
1602                 if (arc instanceof ChildrenRecognizer)
1603                     ((ChildrenRecognizer)arc).patchNext (theNext, table);
1604                 else if (arc == null)
1605                     components [slot] = theNext;
1606                 else if (!(arc instanceof EmptyRecognizer))
1607                     throw new RuntimeException ("patchNext");
1608             }
1609
1610             // a table always exists by now
1611             if ((flags & F_LOOPHEAD) != 0)
1612                 table.put (this, this);
1613         }
1613
1614         /**
1615          * Parses a 'children' spec (or recursively a 'cp') found at index
1616          * startPos of parseBuf, and makes this become a regular graph node.
1617          * A group not closed by ')' causes a RuntimeException.
1618          * @return index after this particle, arity marker included
1619          */
1620         private int populate (char parseBuf [], int startPos)
1621         {
1622             int         nextPos = startPos + 1;
1623             char        c;
1624             // at least one more character has to follow startPos
1625             if (nextPos < 0 || nextPos >= parseBuf.length)
1626                 throw new IndexOutOfBoundsException ();
1627
1628             // Grammar of the string is from the XML spec, but
1629             // with whitespace removed by the SAX parser.
1630
1631             // children ::= (choice | seq) ('?' | '*' | '+')?
1632             // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1633             // choice ::= '(' cp ('|' cp)+ ')'
1634             // seq ::= '(' cp (',' cp)* ')'
1635
1636             // interior nodes only
1637             //   cp ::= name ...
1638             if (parseBuf [startPos] != '('/*)*/) {
1639                 boolean         done = false;
1640                 do {
1641                     switch (c = parseBuf [nextPos]) {
1642                         case '?': case '*': case '+':
1643                         case '|': case ',':
1644                         case /*(*/ ')':
1645                             done = true;        // a delimiter ends the name
1646                             continue;
1647                         default:
1648                             nextPos++;
1649                             continue;
1650                     }
1651                 } while (!done);
1652                 name = new String (parseBuf, startPos, nextPos - startPos);
1653
1654             // interior OR toplevel nodes
1655             //   cp ::= choice ..
1656             //   cp ::= seq ..
1657             } else {
1658                 // collect everything as a separate list, and merge it
1659                 // into "this" later if we can (SEQUENCE or singleton)
1660                 ChildrenRecognizer      first;
1661                 // the first (perhaps the only) particle of the group
1662                 first = new ChildrenRecognizer (consumer, type);
1663                 nextPos = first.populate (parseBuf, nextPos);
1664                 c = parseBuf [nextPos++];
1665
1666                 if (c == ',' || c == '|') {
1667                     ChildrenRecognizer  current = first;
1668                     char                separator = c;
1669                     Vector              v = null;
1670
1671                     if (separator == '|') {
1672                         v = new Vector ();
1673                         v.addElement (first);
1674                     }
1675                     // one more particle for each separator of that kind
1676                     do {
1677                         ChildrenRecognizer link;
1678
1679                         link = new ChildrenRecognizer (consumer, type);
1680                         nextPos = link.populate (parseBuf, nextPos);
1681
1682                         if (separator == ',') {
1683                             current.patchNext (link, null);
1684                             current = link;
1685                         } else
1686                             v.addElement (link);
1687
1688                         c = parseBuf [nextPos++];
1689                     } while (c == separator);
1690
1691                     // choice ... collect everything into one array.
1692                     if (separator == '|') {
1693                         // assert v.size() > 1
1694                         components = new Recognizer [v.size ()];
1695                         for (int i = 0; i < components.length; i++) {
1696                             components [i] = (Recognizer)
1697                                     v.elementAt (i);
1698                         }
1699                         // assert flags == 0
1700
1701                     // sequence ... merge into "this" to be smaller.
1702                     } else
1703                         copyIn (first);
1704
1705                 // treat singletons like one-node sequences.
1706                 } else
1707                     copyIn (first);
1708                 // this also rejects groups mixing ',' with '|'
1709                 if (c != /*(*/ ')')
1710                     throw new RuntimeException ("corrupt content model");
1711             }
1712
1713             //
1714             // Arity is optional, and the root of all fun.  We keep the
1715             // FSM state graph simple by only having NAME/SEQUENCE and
1716             // CHOICE nodes (or EMPTY to terminate a model), easily
1717             // evaluated.  So we rewrite each node that has arity, using
1718             // those primitives.  We create loops here, if needed.
1719             //
1720             if (nextPos < parseBuf.length) {
1721                 c = parseBuf [nextPos];
1722                 if (c == '?' || c == '*' || c == '+') {
1723                     nextPos++;
1724
1725                     // Rewrite 'zero-or-one' "?" arity to a CHOICE:
1726                     //   - SEQUENCE (clone, what's next)
1727                     //   - or, what's next
1728                     // Size cost: N --> N + 1
1729                     if (c == '?') {
1730                         Recognizer              once = shallowClone ();
1731                         // the clone keeps the particle, this is the CHOICE
1732                         components = new Recognizer [2];
1733                         components [0] = once;
1734                         // components [1] initted to null
1735                         name = null;
1736                         next = null;
1737                         flags = 0;
1738
1739
1740                     // Rewrite 'zero-or-more' "*" arity to a CHOICE.
1741                     //   - LOOP (clone, back to this CHOICE)
1742                     //   - or, what's next
1743                     // Size cost: N --> N + 1
1744                     } else if (c == '*') {
1745                         ChildrenRecognizer      loop = shallowClone ();
1746                         // the clone's open exits lead back to this CHOICE
1747                         loop.patchNext (this, null);
1748                         loop.flags |= F_LOOPNEXT;
1749                         flags = F_LOOPHEAD;
1750
1751                         components = new Recognizer [2];
1752                         components [0] = loop;
1753                         // components [1] initted to null
1754                         name = null;
1755                         next = null;
1756
1757
1758                     // Rewrite 'one-or-more' "+" arity to a SEQUENCE.
1759                     // Basically (a)+ --> ((a),(a)*).
1760                     //   - this
1761                     //   - CHOICE
1762                     //      * LOOP (clone, back to the CHOICE)
1763                     //      * or, whatever's next
1764                     // Size cost: N --> 2N + 1
1765                     } else if (c == '+') {
1766                         ChildrenRecognizer loop = deepClone ();
1767                         ChildrenRecognizer choice;
1768                         // the deep clone's open exits lead to the new CHOICE
1769                         choice = new ChildrenRecognizer (consumer, type);
1770                         loop.patchNext (choice, null);
1771                         loop.flags |= F_LOOPNEXT;
1772                         choice.flags = F_LOOPHEAD;
1773
1774                         choice.components = new Recognizer [2];
1775                         choice.components [0] = loop;
1776                         // choice.components [1] initted to null
1777                         // choice.name, choice.next initted to null
1778                         // and so do the open exits of 'this' itself
1779                         patchNext (choice, null);
1780                     }
1781                 }
1782             }
1783
1784             return nextPos;
1785         }
1786
1787         // VC: Element Valid (second clause) ... always false for this node
1788         boolean acceptCharacters ()
1789             { return false; }
1790
1791         // VC: Element Valid (second clause)
1792         // Answers the node to go on from, or null if 'type' is refused.
1793         Recognizer acceptElement (String type)
1794         throws SAXException
1795         {
1796             // NAME/SEQUENCE ... only the expected element leads onward
1797             if (name != null)
1798                 return name.equals (type) ? next : null;
1799
1800             // CHOICE ... normally the first component accepting the
1801             // element wins.  When warning about nondeterminism, the
1802             // remaining components get asked too; that checks just the
1803             // transitions we follow, not the whole content model.
1804             Recognizer  chosen = null;
1805
1806             for (int arc = 0; arc < components.length; arc++) {
1807                 Recognizer target = components [arc].acceptElement (type);
1808
1809                 if (target == null)
1810                     continue;
1811                 if (!warnNonDeterministic)
1812                     return target;
1813                 if (chosen == null)
1814                     chosen = target;
1815                 else if (chosen != target)
1816                     consumer.error ("Content model " + this.type.model
1817                         + " is non-deterministic for " + type);
1818             }
1819
1820             return chosen;
1821         }
1822
1823         // VC: Element Valid (second clause)
1824         boolean completed ()
1825         throws SAXException
1826         {
1827             // NAME/SEQUENCE nodes still expect a specific element.
1828             // CHOICE nodes (and so, some sequences) are complete as
1829             // soon as any one of their components is.
1830             boolean done = false;
1831
1832             if (name == null) {
1833                 for (int i = 0; !done && i < components.length; i++)
1834                     done = components [i].completed ();
1835             }
1836
1837             return done;
1838         }
1839
1840 /** /
1841         // FOR DEBUGGING ... flattens the graph for printing.
1842
1843         public String toString ()
1844         {
1845             StringBuffer buf = new StringBuffer ();
1846
1847             // only one set of loop labels can be generated
1848             // at a time...
1849             synchronized (ANY) {
1850                 nodeCount = 0;
1851
1852                 toString (buf, new Hashtable ());
1853                 return buf.toString ();
1854             }
1855         }
1856
1857         private void toString (StringBuffer buf, Hashtable table)
1858         {
1859             // When we visit a node, label and count it.
1860             // Nodes are never visited/counted more than once.
1861             // For small models labels waste space, but if arity
1862             // mappings were used the savings are substantial.
1863             // (Plus, the output can be more readily understood.)
1864             String temp = (String) table.get (this);
1865
1866             if (temp != null) {
1867                 buf.append ('{');
1868                 buf.append (temp);
1869                 buf.append ('}');
1870                 return;
1871             } else {
1872                 StringBuffer scratch = new StringBuffer (15);
1873
1874                 if ((flags & F_LOOPHEAD) != 0)
1875                     scratch.append ("loop");
1876                 else
1877                     scratch.append ("node");
1878                 scratch.append ('-');
1879                 scratch.append (++nodeCount);
1880                 temp = scratch.toString ();
1881
1882                 table.put (this, temp);
1883                 buf.append ('[');
1884                 buf.append (temp);
1885                 buf.append (']');
1886                 buf.append (':');
1887             }
1888
1889             // NAME/SEQUENCE
1890             if (name != null) {
1891                 // n.b. some output encodings turn some name chars into '?'
1892                 // e.g. with Japanese names and ASCII output
1893                 buf.append (name);
1894                 if (components != null)         // bug!
1895                     buf.append ('$');
1896                 if (next == null)
1897                     buf.append (",*");
1898                 else if (next instanceof EmptyRecognizer) // patch-to-next
1899                     buf.append (",{}");
1900                 else if (next instanceof ChildrenRecognizer) {
1901                     buf.append (',');
1902                     ((ChildrenRecognizer)next).toString (buf, table);
1903                 } else                          // bug!
1904                     buf.append (",+");
1905                 return;
1906             }
1907
1908             // CHOICE
1909             buf.append ("<");
1910             for (int i = 0; i < components.length; i++) {
1911                 if (i != 0)
1912                     buf.append ("|");
1913                 if (components [i] instanceof EmptyRecognizer) {
1914                     buf.append ("{}");
1915                 } else if (components [i] == null) {    // patch-to-next
1916                     buf.append ('*');
1917                 } else {
1918                     ChildrenRecognizer r;
1919
1920                     r = (ChildrenRecognizer) components [i];
1921                     r.toString (buf, table);
1922                 }
1923             }
1924             buf.append (">");
1925         }
1926 /**/
1927     }
1928 }