Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/memoire/mst/MstXmlParser.java


1   
2   
3   package com.memoire.mst;
4   import com.memoire.mst.*;
5   
6   // MstXmlParser.java: the main parser class.
7   // NO WARRANTY! See README, and copyright below.
8   // $Id: MstXmlParser.java,v 1.3 2002/12/16 18:56:26 desnoix Exp $
9   
10  
11  
12  import java.io.BufferedInputStream;
13  import java.io.EOFException;
14  import java.io.InputStream;
15  import java.io.Reader;
16  import java.net.URL;
17  import java.net.URLConnection;
18  import java.util.Enumeration;
19  import java.util.Hashtable;
20  import java.util.Stack;
21  
22  // @GDX
23  import java.io.FileNotFoundException;
24  import java.io.ByteArrayInputStream;
25  
26  /**
27    * Parse XML documents and return parse events through call-backs.
28    * <p>You need to define a class implementing the <code>MstXmlHandler</code>
29    * interface: an object belonging to this class will receive the
30    * callbacks for the events.  (As an alternative to implementing
31    * the full MstXmlHandler interface, you can simply extend the 
32    * <code>MstHandlerBase</code> convenience class.)
33    * <p>Usage (assuming that <code>MyHandler</code> is your implementation
34    * of the <code>MstXmlHandler</code> interface):
35    * <pre>
36    * MstXmlHandler handler = new MyHandler();
37    * MstXmlParser parser = new MstXmlParser();
38    * parser.setHandler(handler);
39    * try {
40    *   parser.parse("http://www.host.com/doc.xml", null);
41    * } catch (Exception e) {
42    *   [do something interesting]
43    * }
44    * </pre>
45    * <p>Alternatively, you can use the standard SAX interfaces
46    * with the <code>SAXDriver</code> class as your entry point.
47    * @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
48    * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
49    * @version 1.1
50    * @see MstXmlHandler
51    * @see MstHandlerBase
52    * @see SAXDriver
53    */
54  public class MstXmlParser {
55  
56    //
57    // Use special cheats that speed up the code (currently about 50%),
58    // but may cause problems with future maintenance and add to the
59    // class file size (about 500 bytes).
60    //
61    private static final boolean USE_CHEATS = true;
62  
63  
64  
65    //////////////////////////////////////////////////////////////////////
66    // Constructors.
67    ////////////////////////////////////////////////////////////////////////
68  
69  
70    /**
71      * Construct a new parser with no associated handler.
72      * @see #setHandler
73      * @see #parse
74      */
75    public MstXmlParser ()
76    {
77    }
78  
79  
80    /**
81      * Set the handler that will receive parsing events.
82      * @param handler The handler to receive callback events.
83      * @see #parse
84      * @see MstXmlHandler
85      */
86    public void setHandler (MstXmlHandler handler)
87    {
88      this.handler = handler;
89    }
90  
91  
92    /**
93      * Parse an XML document from a URI.
94      * <p>You may parse a document more than once, but only one thread
95      * may call this method for an object at one time.
96      * @param systemId The URI of the document.
97      * @param publicId The public identifier of the document, or null.
98      * @param encoding The suggested encoding, or null if unknown.
99      * @exception java.lang.Exception Any exception thrown by your
100     *            own handlers, or any derivation of java.io.IOException
101     *            thrown by the parser itself.
102     */
103   public void parse (String systemId, String publicId, String encoding)
104     throws java.lang.Exception
105   {
106     doParse(systemId, publicId, null, null, encoding);
107   }
108 
109 
110   /**
111     * Parse an XML document from a byte stream.
112     * <p>The URI that you supply will become the base URI for
113     * resolving relative links, but &AElig;lfred will actually read
114     * the document from the supplied input stream.
115     * <p>You may parse a document more than once, but only one thread
116     * may call this method for an object at one time.
117     * @param systemId The base URI of the document, or null if not
118     *                 known.
119     * @param publicId The public identifier of the document, or null
120     *                 if not known.
121     * @param stream A byte input stream.
122     * @param encoding The suggested encoding, or null if unknown.
123     * @exception java.lang.Exception Any exception thrown by your
124     *            own handlers, or any derivation of java.io.IOException
125     *            thrown by the parser itself.
126     */
127   public void parse (String systemId, String publicId,
128          InputStream stream, String encoding)
129     throws java.lang.Exception
130   {
131     doParse(systemId, publicId, null, stream, encoding);
132   }
133 
134 
135   /**
136     * Parse an XML document from a character stream.
137     * <p>The URI that you supply will become the base URI for
138     * resolving relative links, but &AElig;lfred will actually read
139     * the document from the supplied input stream.
140     * <p>You may parse a document more than once, but only one thread
141     * may call this method for an object at one time.
142     * @param systemId The base URI of the document, or null if not
143     *                 known.
144     * @param publicId The public identifier of the document, or null
145     *                 if not known.
146     * @param reader A character stream.
147     * @exception java.lang.Exception Any exception thrown by your
148     *            own handlers, or any derivation of java.io.IOException
149     *            thrown by the parser itself.
150     */
151   public void parse (String systemId, String publicId, Reader reader)
152     throws java.lang.Exception
153   {
154     doParse(systemId, publicId, reader, null, null);
155   }
156 
157 
158   private synchronized void doParse (String systemId, String publicId,
159              Reader reader, InputStream stream,
160              String encoding)
161     throws java.lang.Exception
162   {
163     basePublicId = publicId;
164     baseURI = systemId;
165     baseReader = reader;
166     baseInputStream = stream;
167 
168     initializeVariables();
169 
170         // Set the default entities here.
171     setInternalEntity(intern("amp"), "&#38;");
172     setInternalEntity(intern("lt"), "&#60;");
173     setInternalEntity(intern("gt"), "&#62;");
174     setInternalEntity(intern("apos"), "&#39;");
175     setInternalEntity(intern("quot"), "&#34;");
176 
177     if (handler != null) {
178       handler.startDocument();
179     }
180 
181     pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream,
182       encoding);
183 
184     parseDocument();
185 
186     if (handler != null) {
187       handler.endDocument();
188     }
189     cleanupVariables();
190   }
191 
192 
193 
194   ////////////////////////////////////////////////////////////////////////
195   // Constants.
196   ////////////////////////////////////////////////////////////////////////
197 
198   //
199   // Constants for element content type.
200   //
201 
202   /**
203     * Constant: an element has not been declared.
204     * @see #getElementContentType
205     */
206   public static final int CONTENT_UNDECLARED = 0;
207 
208   /**
209     * Constant: the element has a content model of ANY.
210     * @see #getElementContentType
211     */
212   public static final int CONTENT_ANY = 1;
213 
214   /**
215     * Constant: the element has declared content of EMPTY.
216     * @see #getElementContentType
217     */
218   public static final int CONTENT_EMPTY = 2;
219 
220   /**
221     * Constant: the element has mixed content.
222     * @see #getElementContentType
223     */
224   public static final int CONTENT_MIXED = 3;
225 
226   /**
227     * Constant: the element has element content.
228     * @see #getElementContentType
229     */
230   public static final int CONTENT_ELEMENTS = 4;
231 
232 
233   //
234   // Constants for the entity type.
235   //
236 
237   /**
238     * Constant: the entity has not been declared.
239     * @see #getEntityType
240     */
241   public static final int ENTITY_UNDECLARED = 0;
242 
243   /**
244     * Constant: the entity is internal.
245     * @see #getEntityType
246     */
247   public static final int ENTITY_INTERNAL = 1;
248 
249   /**
250     * Constant: the entity is external, non-XML data.
251     * @see #getEntityType
252     */
253   public static final int ENTITY_NDATA = 2;
254 
255   /**
256     * Constant: the entity is external XML data.
257     * @see #getEntityType
258     */
259   public static final int ENTITY_TEXT = 3;
260 
261 
262   //
263   // Constants for attribute type.
264   //
265 
266   /**
267     * Constant: the attribute has not been declared for this element type.
268     * @see #getAttributeType
269     */
270   public static final int ATTRIBUTE_UNDECLARED = 0;
271 
272   /**
273     * Constant: the attribute value is a string value.
274     * @see #getAttributeType
275     */
276   public static final int ATTRIBUTE_CDATA = 1;
277 
278   /**
279     * Constant: the attribute value is a unique identifier.
280     * @see #getAttributeType
281     */
282   public static final int ATTRIBUTE_ID = 2;
283 
284   /**
285     * Constant: the attribute value is a reference to a unique identifier.
286     * @see #getAttributeType
287     */
288   public static final int ATTRIBUTE_IDREF = 3;
289 
290   /**
291     * Constant: the attribute value is a list of ID references.
292     * @see #getAttributeType
293     */
294   public static final int ATTRIBUTE_IDREFS = 4;
295 
296   /**
297     * Constant: the attribute value is the name of an entity.
298     * @see #getAttributeType
299     */
300   public static final int ATTRIBUTE_ENTITY = 5;
301 
302   /**
303     * Constant: the attribute value is a list of entity names.
304     * @see #getAttributeType
305     */
306   public static final int ATTRIBUTE_ENTITIES = 6;
307 
308   /**
309     * Constant: the attribute value is a name token.
310     * @see #getAttributeType
311     */
312   public static final int ATTRIBUTE_NMTOKEN = 7;
313 
314   /**
315     * Constant: the attribute value is a list of name tokens.
316     * @see #getAttributeType
317     */
318   public static final int ATTRIBUTE_NMTOKENS = 8;
319 
320   /**
321     * Constant: the attribute value is a token from an enumeration.
322     * @see #getAttributeType
323     */
324   public static final int ATTRIBUTE_ENUMERATED = 9;
325 
326   /**
327     * Constant: the attribute is the name of a notation.
328     * @see #getAttributeType
329     */
330   public static final int ATTRIBUTE_NOTATION = 10;
331 
332 
333   //
334   // When the class is loaded, populate the hash table of
335   // attribute types.
336   //
337 
338   /**
339     * Hash table of attribute types.
340     */
341   private static Hashtable attributeTypeHash;
342   static {
343     attributeTypeHash = new Hashtable();
344     attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA));
345     attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID));
346     attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF));
347     attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS));
348     attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY));
349     attributeTypeHash.put("ENTITIES", new Integer(ATTRIBUTE_ENTITIES));
350     attributeTypeHash.put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN));
351     attributeTypeHash.put("NMTOKENS", new Integer(ATTRIBUTE_NMTOKENS));
352     attributeTypeHash.put("NOTATION", new Integer(ATTRIBUTE_NOTATION));
353   }
354 
355 
356   //
357   // Constants for supported encodings.
358   //
359   private static final int ENCODING_UTF_8 = 1;
360   private static final int ENCODING_ISO_8859_1 = 2;
361   private static final int ENCODING_UCS_2_12 = 3;
362   private static final int ENCODING_UCS_2_21 = 4;
363   private static final int ENCODING_UCS_4_1234 = 5;
364   private static final int ENCODING_UCS_4_4321 = 6;
365   private static final int ENCODING_UCS_4_2143 = 7;
366   private static final int ENCODING_UCS_4_3412 = 8;
367 
368 
369   //
370   // Constants for attribute default value.
371   //
372 
373   /**
374     * Constant: the attribute is not declared.
375     * @see #getAttributeDefaultValueType
376     */
377   public static final int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
378 
379   /**
380     * Constant: the attribute has a literal default value specified.
381     * @see #getAttributeDefaultValueType
382     * @see #getAttributeDefaultValue
383     */
384   public static final int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
385 
386   /**
387     * Constant: the attribute was declared #IMPLIED.
388     * @see #getAttributeDefaultValueType
389     */
390   public static final int ATTRIBUTE_DEFAULT_IMPLIED = 2;
391 
392   /**
393     * Constant: the attribute was declared #REQUIRED.
394     * @see #getAttributeDefaultValueType
395     */
396   public static final int ATTRIBUTE_DEFAULT_REQUIRED = 3;
397 
398   /**
399     * Constant: the attribute was declared #FIXED.
400     * @see #getAttributeDefaultValueType
401     * @see #getAttributeDefaultValue
402     */
403   public static final int ATTRIBUTE_DEFAULT_FIXED = 4;
404 
405 
406   //
407   // Constants for input.
408   //
409   private static final int INPUT_NONE = 0;
410   private static final int INPUT_INTERNAL = 1;
411   private static final int INPUT_EXTERNAL = 2;
412   private static final int INPUT_STREAM = 3;
413   private static final int INPUT_BUFFER = 4;
414   private static final int INPUT_READER = 5;
415 
416 
417   //
418   // Flags for reading literals.
419   //
420   private static final int LIT_CHAR_REF = 1;
421   private static final int LIT_ENTITY_REF = 2;
422   private static final int LIT_PE_REF = 4;
423   private static final int LIT_NORMALIZE = 8;
424 
425 
426   //
427   // Flags for parsing context.
428   //
429   private static final int CONTEXT_NONE = 0;
430   private static final int CONTEXT_DTD = 1;
431   private static final int CONTEXT_ENTITYVALUE = 2;
432   private static final int CONTEXT_ATTRIBUTEVALUE = 3;
433 
434 
435 
436   //////////////////////////////////////////////////////////////////////
437   // Error reporting.
438   //////////////////////////////////////////////////////////////////////
439 
440 
441   /**
442     * Report an error.
443     * @param message The error message.
444     * @param textFound The text that caused the error (or null).
445     * @see MstXmlHandler#error
446     * @see #line
447     */
448   void error (String message, String textFound, String textExpected)
449     throws java.lang.Exception
450   {
451     errorCount++;
452     if (textFound != null) {
453       message = message + " (found \"" + textFound + "\")";
454     }
455     if (textExpected != null) {
456       message = message + " (expected \"" + textExpected + "\")";
457     }
458     if (handler != null) {
459       String uri = null;
460 
461       if (externalEntity != null) {
462   uri = externalEntity.getURL().toString();
463       }
464       handler.error(message, uri, line, column);
465     }
466   }
467 
468 
469   /**
470     * Report a serious error.
471     * @param message The error message.
472     * @param textFound The text that caused the error (or null).
473     */
474   void error (String message, char textFound, String textExpected)
475     throws java.lang.Exception
476   {
477     error(message, new Character(textFound).toString(), textExpected);
478   }
479 
480 
481 
482   //////////////////////////////////////////////////////////////////////
483   // Major syntactic productions.
484   //////////////////////////////////////////////////////////////////////
485 
486 
487   /**
488     * Parse an XML document.
489     * <pre>
490     * [1] document ::= prolog element Misc*
491     * </pre>
492     * <p>This is the top-level parsing function for a single XML
493     * document.  As a minimum, a well-formed document must have
494     * a document element, and a valid document must have a prolog
495     * as well.
496     */
497   void parseDocument ()
498     throws java.lang.Exception
499     {
500     char c;
501 
502     parseProlog();
503     require('<');
504     parseElement();
505     try
506       {
507       parseMisc();  //skip all white, PIs, and comments
508       c=readCh();   //if this doesn't throw an exception...
509       error("unexpected characters after document end",c,null);
510       }
511     catch (EOFException e)
512       {return;}
513     }
514 
515 
516   /**
517     * Skip a comment.
518     * <pre>
519     * [18] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
520     * </pre>
521     * <p>(The <code>&lt;!--</code> has already been read.)
522     */
523   void parseComment ()
524     throws java.lang.Exception
525   {
526     skipUntil("-->");
527   }
528 
529 
530   /**
531     * Parse a processing instruction and do a call-back.
532     * <pre>
533     * [19] PI ::= '&lt;?' Name (S (Char* - (Char* '?&gt;' Char*)))? '?&gt;'
534     * </pre>
535     * <p>(The <code>&lt;?</code> has already been read.)
536     * <p>An XML processing instruction <em>must</em> begin with
537     * a Name, which is the instruction's target.
538     */
539   void parsePI ()
540     throws java.lang.Exception
541   {
542     String name;
543 
544     name = readNmtoken(true);
545     if (!tryRead("?>")) {
546       requireWhitespace();
547       parseUntil("?>");
548     }
549     if (handler != null) {
550       handler.processingInstruction(name, dataBufferToString());
551     }
552   }
553 
554 
555   /**
556     * Parse a CDATA marked section.
557     * <pre>
558     * [20] CDSect ::= CDStart CData CDEnd
559     * [21] CDStart ::= '&lt;![CDATA['
560     * [22] CData ::= (Char* - (Char* ']]&gt;' Char*))
561     * [23] CDEnd ::= ']]&gt;'
562     * </pre>
563     * <p>(The '&lt;![CDATA[' has already been read.)
564     * <p>Note that this just appends characters to the dataBuffer,
565     * without actually generating an event.
566     */
567   void parseCDSect ()
568     throws java.lang.Exception
569   {
570     parseUntil("]]>");
571   }
572 
573 
574   /**
575     * Parse the prolog of an XML document.
576     * <pre>
577     * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
578     * </pre>
579     * <p>There are a couple of tricks here.  First, it is necessary to
580     * declare the XML default attributes after the DTD (if present)
581     * has been read.  Second, it is not possible to expand general
582     * references in attribute value literals until after the entire
583     * DTD (if present) has been parsed.
584     * <p>We do not look for the XML declaration here, because it is
585     * handled by pushURL().
586     * @see pushURL
587     */
588   void parseProlog ()
589     throws java.lang.Exception
590   {
591     parseMisc();
592 
593     if (tryRead("<!DOCTYPE")) {
594       parseDoctypedecl();
595       parseMisc();
596     }
597   }
598 
599 
600   /**
601     * Parse the XML declaration.
602     * <pre>
603     * [25] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
604     * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
605     * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
606     *               | S 'standalone' Eq '"' ("yes" | "no") '"'
607     * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
608     * </pre>
609     * <p>([80] to [82] are also significant.)
610     * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
611     * <p>TODO: validate value of standalone.
612     * @see #parseTextDecl
613     * @see #checkEncoding
614     */
615   void parseXMLDecl (boolean ignoreEncoding)
616     throws java.lang.Exception
617   {
618     String version;
619     String encodingName = null;
620     String standalone = null;
621 
622         // Read the version.
623     require("version");
624     parseEq();
625     version = readLiteral(0);
626     if (!version.equals("1.0")) {
627       error("unsupported XML version", version, "1.0");
628     }
629 
630         // Try reading an encoding declaration.
631     skipWhitespace();
632     if (tryRead("encoding")) {
633       parseEq();
634       encodingName = readLiteral(0);
635       checkEncoding(encodingName, ignoreEncoding);
636     }
637 
638         // Try reading a standalone declaration
639     skipWhitespace();
640     if (tryRead("standalone")) {
641       parseEq();
642       standalone = readLiteral(0);
643     }
644 
645     skipWhitespace();
646     require("?>");
647   }
648 
649 
650   /**
651     * Parse the Encoding PI.
652     * <pre>
653     * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
654     * [79] EncodingPI ::= '&lt;?xml' S 'encoding' Eq QEncoding S? '?&gt;'
655     * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
656     * [81] Encoding ::= LatinName
657     * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
658     * </pre>
659     * <p>(The <code>&lt;?xml</code>' and whitespace have already been read.)
660     * @see #parseXMLDecl
661     * @see #checkEncoding
662     */
663   void parseTextDecl (boolean ignoreEncoding)
664     throws java.lang.Exception
665   {
666     String encodingName = null;
667     
668         // Read an optional version.
669     if (tryRead("version")) {
670       String version;
671       parseEq();
672       version = readLiteral(0);
673       if (!version.equals("1.0")) {
674   error("unsupported XML version", version, "1.0");
675       }
676       requireWhitespace();
677     }
678       
679 
680         // Read the encoding.
681     require("encoding");
682     parseEq();
683     encodingName = readLiteral(0);
684     checkEncoding(encodingName, ignoreEncoding);
685 
686     skipWhitespace();
687     require("?>");
688   }
689 
690 
691   /**
692     * Check that the encoding specified makes sense.
693     * <p>Compare what the author has specified in the XML declaration
694     * or encoding PI with what we have detected.
695     * <p>This is also important for distinguishing among the various
696     * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
697     * those).
698     * @param encodingName The name of the encoding specified by the user.
699     * @see #parseXMLDecl
700     * @see #parseTextDecl
701     */
702   void checkEncoding (String encodingName, boolean ignoreEncoding)
703     throws java.lang.Exception
704   {
705     encodingName = encodingName.toUpperCase();
706 
707     if (ignoreEncoding) {
708       return;
709     }
710 
711     switch (encoding) {
712         // 8-bit encodings
713     case ENCODING_UTF_8:
714       if (encodingName.equals("ISO-8859-1")) {
715   encoding = ENCODING_ISO_8859_1;
716       } else if (!encodingName.equals("UTF-8")) {
717   error("unsupported 8-bit encoding",
718         encodingName,
719         "UTF-8 or ISO-8859-1");
720       }
721       break;
722         // 16-bit encodings
723     case ENCODING_UCS_2_12:
724     case ENCODING_UCS_2_21:
725       if (!encodingName.equals("ISO-10646-UCS-2") &&
726     !encodingName.equals("UTF-16")) {
727   error("unsupported 16-bit encoding",
728         encodingName,
729         "ISO-10646-UCS-2");
730       }
731       break;
732         // 32-bit encodings
733     case ENCODING_UCS_4_1234:
734     case ENCODING_UCS_4_4321:
735     case ENCODING_UCS_4_2143:
736     case ENCODING_UCS_4_3412:
737       if (!encodingName.equals("ISO-10646-UCS-4")) {
738   error("unsupported 32-bit encoding",
739         encodingName,
740         "ISO-10646-UCS-4");
741       }
742     }
743   }
744 
745 
746   /**
747     * Parse miscellaneous markup outside the document element and DOCTYPE
748     * declaration.
749     * <pre>
750     * [27] Misc ::= Comment | PI | S
751     * </pre>
752     */
753   void parseMisc ()
754     throws java.lang.Exception
755     {
756     while (true)
757       {
758       skipWhitespace();
759       if (tryRead("<?"))
760         {parsePI();}
761       else if (tryRead("<!--"))
762         {parseComment();}
763       else
764         {return;}
765       }
766     }
767 
768 
769   /**
770     * Parse a document type declaration.
771     * <pre>
772     * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
773     *                      ('[' %markupdecl* ']' S?)? '&gt;'
774     * </pre>
775     * <p>(The <code>&lt;!DOCTYPE</code> has already been read.)
776     */
777   void parseDoctypedecl ()
778     throws java.lang.Exception
779   {
780     char c;
781     String doctypeName, ids[];
782 
783         // Read the document type name.
784     requireWhitespace();
785     doctypeName = readNmtoken(true);
786 
787         // Read the ExternalIDs.
788     skipWhitespace();
789     ids = readExternalIds(false);
790 
791         // Look for a declaration subset.
792     skipWhitespace();
793     if (tryRead('[')) {
794 
795         // loop until the subset ends
796       while (true) {
797   context = CONTEXT_DTD;
798   skipWhitespace();
799   context = CONTEXT_NONE;
800   if (tryRead(']')) {
801     break;    // end of subset
802   } else {
803     context = CONTEXT_DTD;
804     parseMarkupdecl();
805     context = CONTEXT_NONE;
806   }
807       }
808     }
809 
810         // Read the external subset, if any
811     if (ids[1] != null) {
812       pushURL("[external subset]", ids[0], ids[1], null, null, null);
813 
814         // Loop until we end up back at '>'
815       while (true) {
816   context = CONTEXT_DTD;
817   skipWhitespace();
818   context = CONTEXT_NONE;
819   if (tryRead('>')) {
820     break;
821   } else {
822     context = CONTEXT_DTD;
823     parseMarkupdecl();
824     context = CONTEXT_NONE;
825   }
826       }
827     } else {
828         // No external subset.
829       skipWhitespace();
830       require('>');
831     }
832 
833     if (handler != null) {
834       handler.doctypeDecl(doctypeName, ids[0], ids[1]);
835     }
836 
837         // Expand general entities in
838         // default values of attributes.
839         // (Do this after the doctypeDecl
840         // event!).
841     // expandAttributeDefaultValues();
842   }
843 
844 
845   /**
846     * Parse a markup declaration in the internal or external DTD subset.
847     * <pre>
848     * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
849     *                       %NotationDecl | %PI | %S | %Comment |
850     *                       InternalPERef )
851     * [30] InternalPERef ::= PEReference
852     * [31] extSubset ::= (%markupdecl | %conditionalSect)*
853     * </pre>
854     */
855   void parseMarkupdecl ()
856     throws java.lang.Exception
857   {
858     if (tryRead("<!ELEMENT")) {
859       parseElementdecl();
860     } else if (tryRead("<!ATTLIST")) {
861       parseAttlistDecl();
862     } else if (tryRead("<!ENTITY")) {
863       parseEntityDecl();
864     } else if (tryRead("<!NOTATION")) {
865       parseNotationDecl();
866     } else if (tryRead("<?")) {
867       parsePI();
868     } else if (tryRead("<!--")) {
869       parseComment();
870     } else if (tryRead("<![")) {
871       parseConditionalSect();
872     } else {
873       error("expected markup declaration", null, null);
874     }
875   }
876 
877 
878   /**
879     * Parse an element, with its tags.
880     * <pre>
881     * [33] STag ::= '&lt;' Name (S Attribute)* S? '&gt;' [WFC: unique Att spec]
882     * [38] element ::= EmptyElement | STag content ETag
883     * [39] EmptyElement ::= '&lt;' Name (S Attribute)* S? '/&gt;'
884     *                       [WFC: unique Att spec]
885     * </pre>
886     * <p>(The '&lt;' has already been read.)
887     * <p>NOTE: this method actually chains onto parseContent(), if necessary,
888     * and parseContent() will take care of calling parseETag().
889     */
890   void parseElement ()
891     throws java.lang.Exception
892   {
893     String gi;
894     char c;
895     int oldElementContent = currentElementContent;
896     String oldElement = currentElement;
897 
898         // This is the (global) counter for the
899         // array of specified attributes.
900     tagAttributePos = 0;
901 
902         // Read the element type name.
903     gi = readNmtoken(true);
904 
905         // Determine the current content type.
906     currentElement = gi;
907     currentElementContent = getElementContentType(gi);
908     if (currentElementContent == CONTENT_UNDECLARED) {
909       currentElementContent = CONTENT_ANY;
910     }
911 
912         // Read the attributes, if any.
913         // After this loop, we should be just
914         // in front of the closing delimiter.
915     skipWhitespace();
916     c = readCh();
917     while (c != '/' && c != '>') {
918       unread(c);
919       parseAttribute(gi);
920       skipWhitespace();
921       c = readCh();
922     }
923     unread(c);
924 
925         // Supply any defaulted attributes.
926     Enumeration atts = declaredAttributes(gi);
927     if (atts != null) {
928       String aname;
929     loop: while (atts.hasMoreElements()) {
930       aname = (String)atts.nextElement();
931         // See if it was specified.
932       for (int i = 0; i < tagAttributePos; i++) {
933   if (tagAttributes[i] == aname) {
934     continue loop;
935   }
936       }
937         // I guess not...
938       if (handler != null) {
939   handler.attribute(aname,
940         getAttributeExpandedValue(gi, aname),
941         false);
942       }
943     }
944     }
945 
946         // Figure out if this is a start tag
947         // or an empty element, and dispatch an
948         // event accordingly.
949     c = readCh();
950     switch (c) {
951     case '>':
952       if (handler != null) {
953   handler.startElement(gi);
954       }
955       parseContent();
956       break;
957     case '/':
958       require('>');
959       if (handler != null) {
960   handler.startElement(gi);
961   handler.endElement(gi);
962       }
963       break;
964     }
965 
966         // Restore the previous state.
967     currentElement = oldElement;
968     currentElementContent = oldElementContent;
969   }
970 
971 
972   /**
973     * Parse an attribute assignment.
974     * <pre>
975     * [34] Attribute ::= Name Eq AttValue
976     * </pre>
977     * @param name The name of the attribute's element.
978     * @see MstXmlHandler#attribute
979     */
980   void parseAttribute (String name)
981     throws java.lang.Exception
982   {
983     String aname;
984     int type;
985     String value;
986 
987         // Read the attribute name.
988     aname = readNmtoken(true).intern();
989     type = getAttributeDefaultValueType(name, aname);
990 
991         // Parse '='
992     parseEq();
993 
994         // Read the value, normalizing whitespace
995         // if it is not CDATA.
996     if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
997       value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF);
998     } else {
999       value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE);
1000    }
1001
1002        // Inform the handler about the
1003        // attribute.
1004    if (handler != null) {
1005      handler.attribute(aname, value, true);
1006    }
1007    dataBufferPos = 0;
1008
1009        // Note that the attribute has been
1010        // specified.
1011    if (tagAttributePos == tagAttributes.length) {
1012      String newAttrib[] = new String[tagAttributes.length * 2];
1013      System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1014      tagAttributes = newAttrib;
1015    }
1016    tagAttributes[tagAttributePos++] = aname;
1017  }
1018
1019
1020  /**
1021    * Parse an equals sign surrounded by optional whitespace.
1022    * [35] Eq ::= S? '=' S?
1023    */
1024  void parseEq ()
1025    throws java.lang.Exception
1026  {
1027    skipWhitespace();
1028    require('=');
1029    skipWhitespace();
1030  }
1031
1032
1033  /**
1034    * Parse an end tag.
1035    * [36] ETag ::= '</' Name S? '>'
1036    * *NOTE: parseContent() chains to here.
1037    */
1038  void parseETag ()
1039    throws java.lang.Exception
1040  {
1041    String name;
1042    name = readNmtoken(true);
1043    if (name != currentElement) {
1044      error("mismatched end tag", name, currentElement);
1045    }
1046    skipWhitespace();
1047    require('>');
1048    if (handler != null) {
1049      handler.endElement(name);
1050    }
1051  }
1052
1053
1054  /**
1055    * Parse the content of an element.
1056    * [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)*
1057    * [68] Reference ::= EntityRef | CharRef
1058    */
1059  void parseContent ()
1060    throws java.lang.Exception
1061  {
1062    String data;
1063    char c;
1064
1065    while (true) {
1066
1067      switch (currentElementContent) {
1068      case CONTENT_ANY:
1069      case CONTENT_MIXED:
1070  parsePCData();
1071  break;
1072      case CONTENT_ELEMENTS:
1073  parseWhitespace();
1074  break;
1075      }
1076
1077        // Handle delimiters
1078      c = readCh();
1079      switch (c) {
1080
1081      case '&':      // Found "&"
1082  c = readCh();
1083  if (c == '#') {
1084    parseCharRef();
1085  } else {
1086    unread(c);
1087    parseEntityRef(true);
1088  }
1089  break;
1090
1091      case '<':      // Found "<"
1092
1093  c = readCh();
1094  switch (c) {
1095
1096  case '!':    // Found "<!"
1097    c = readCh();
1098    switch (c) {
1099    case '-':    // Found "<!-"
1100      require('-');
1101      parseComment();
1102      break;
1103    case '[':    // Found "<!["
1104      require("CDATA[");
1105      parseCDSect();
1106      break;
1107    default:
1108      error("expected comment or CDATA section", c, null);
1109      break;
1110    }
1111    break;
1112
1113  case '?':    // Found "<?"
1114    dataBufferFlush();
1115    parsePI();
1116    break;
1117
1118  case '/':    // Found "</"
1119    dataBufferFlush();
1120    parseETag();
1121    return;
1122
1123  default:    // Found "<" followed by something else
1124    dataBufferFlush();
1125    unread(c);
1126    parseElement();
1127    break;
1128  }
1129      }
1130    }
1131  }
1132
1133
1134  /**
1135    * Parse an element type declaration.
1136    * [40] elementdecl ::= '<!ELEMENT' S %Name S (%S S)? %contentspec S? '>'
1137    *                      [VC: Unique Element Declaration]
1138    * *NOTE: the '<!ELEMENT' has already been read.
1139    */
1140  void parseElementdecl ()
1141    throws java.lang.Exception
1142  {
1143    String name;
1144
1145    requireWhitespace();
1146        // Read the element type name.
1147    name = readNmtoken(true);
1148
1149    requireWhitespace();
1150        // Read the content model.
1151    parseContentspec(name);
1152
1153    skipWhitespace();
1154    require('>');
1155  }
1156
1157
1158  /**
1159    * Content specification.
1160    * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1161    */
1162  void parseContentspec (String name)
1163    throws java.lang.Exception
1164  {
1165    if (tryRead("EMPTY")) {
1166      setElement(name, CONTENT_EMPTY, null, null);
1167      return;
1168    } else if (tryRead("ANY")) {
1169      setElement(name, CONTENT_ANY, null, null);
1170      return;
1171    } else {
1172      require('(');
1173      dataBufferAppend('(');
1174      skipWhitespace();
1175      if (tryRead("#PCDATA")) {
1176  dataBufferAppend("#PCDATA");
1177  parseMixed();
1178  setElement(name, CONTENT_MIXED, dataBufferToString(), null);
1179      } else {
1180  parseElements();
1181  setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null);
1182      }
1183    }
1184  }
1185
1186
1187  /**
1188    * Parse an element-content model.
1189    * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
1190    * [44] cps ::= S? %cp S?
1191    * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
1192    * [46] ctokplus ::= cps ('|' cps)+
1193    * [47] ctoks ::= cps ('|' cps)*
1194    * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
1195    * [49] stoks ::= cps (',' cps)*
1196    * *NOTE: the opening '(' and S have already been read.
1197    * *TODO: go over parameter entity boundaries more carefully.
1198    */
1199  void parseElements ()
1200    throws java.lang.Exception
1201  {
1202    char c;
1203    char sep;
1204
1205        // Parse the first content particle
1206    skipWhitespace();
1207    parseCp();
1208
1209        // Check for end or for a separator.
1210    skipWhitespace();
1211    c = readCh();
1212    switch (c) {
1213    case ')':
1214      dataBufferAppend(')');
1215      c = readCh();
1216      switch (c) {
1217      case '*':
1218      case '+':
1219      case '?':
1220  dataBufferAppend(c);
1221  break;
1222      default:
1223  unread(c);
1224      }
1225      return;
1226    case ',':      // Register the separator.
1227    case '|':
1228      sep = c;
1229      dataBufferAppend(c);
1230      break;
1231    default:
1232      error("bad separator in content model", c, null);
1233      return;
1234    }
1235
1236        // Parse the rest of the content model.
1237    while (true) {
1238      skipWhitespace();
1239      parseCp();
1240      skipWhitespace();
1241      c = readCh();
1242      if (c == ')') {
1243  dataBufferAppend(')');
1244  break;
1245      } else if (c != sep) {
1246  error("bad separator in content model", c, null);
1247  return;
1248      } else {
1249  dataBufferAppend(c);
1250      }
1251    }
1252
1253        // Check for the occurrence indicator.
1254    c = readCh();
1255    switch (c) {
1256    case '?':
1257    case '*':
1258    case '+':
1259      dataBufferAppend(c);
1260      return;
1261    default:
1262      unread(c);
1263      return;
1264    }
1265  }
1266
1267
1268  /**
1269    * Parse a content particle.
1270    * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
1271    * *NOTE: I actually use a slightly different production here:
1272    *        cp ::= (elements | (Name ('?' | '*' | '+')?))
1273    */
1274  void parseCp ()
1275    throws java.lang.Exception
1276  {
1277    char c;
1278
1279    if (tryRead('(')) {
1280      dataBufferAppend('(');
1281      parseElements();
1282    } else {
1283      dataBufferAppend(readNmtoken(true));
1284      c = readCh();
1285      switch (c) {
1286      case '?':
1287      case '*':
1288      case '+':
1289  dataBufferAppend(c);
1290  break;
1291      default:
1292  unread(c);
1293  break;
1294      }
1295    }
1296  }
1297
1298
1299  /**
1300    * Parse mixed content.
1301    * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
1302    *              | '(' S? %('#PCDATA') S? ')'
1303    * [51] Mtoks ::= %Name (S? '|' S? %Name)*
1304    * *NOTE: the S and '#PCDATA' have already been read.
1305    */
1306  void parseMixed ()
1307    throws java.lang.Exception
1308  {
1309    char c;
1310
1311        // Check for PCDATA alone.
1312    skipWhitespace();
1313    if (tryRead(')')) {
1314      dataBufferAppend(")*");
1315      tryRead('*');
1316      return;
1317    }
1318
1319        // Parse mixed content.
1320    skipWhitespace();
1321    while (!tryRead(")*")) {
1322      require('|');
1323      dataBufferAppend('|');
1324      skipWhitespace();
1325      dataBufferAppend(readNmtoken(true));
1326      skipWhitespace();
1327    }
1328    dataBufferAppend(")*");
1329  }
1330
1331
1332  /**
1333    * Parse an attribute list declaration.
1334    * [52] AttlistDecl ::= '<!ATTLIST' S %Name S? %AttDef+ S? '>'
1335    * *NOTE: the '<!ATTLIST' has already been read.
1336    */
1337  void parseAttlistDecl ()
1338    throws java.lang.Exception
1339  {
1340    String elementName;
1341
1342    requireWhitespace();
1343    elementName = readNmtoken(true);
1344    requireWhitespace();
1345    while (!tryRead('>')) {
1346      parseAttDef(elementName);
1347      skipWhitespace();
1348    }
1349  }
1350
1351
1352  /**
1353    * Parse a single attribute definition.
1354    * [53] AttDef ::= S %Name S %AttType S %Default
1355    */
1356  void parseAttDef (String elementName)
1357    throws java.lang.Exception
1358  {
1359    String name;
1360    int type;
1361    String enum = null;
1362
1363        // Read the attribute name.
1364    name = readNmtoken(true);
1365
1366        // Read the attribute type.
1367    requireWhitespace();
1368    type = readAttType();
1369
1370        // Get the string of enumerated values
1371        // if necessary.
1372    if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1373      enum = dataBufferToString();
1374    }
1375
1376        // Read the default value.
1377    requireWhitespace();
1378    parseDefault(elementName, name, type, enum);
1379  }
1380
1381
1382  /**
1383    * Parse the attribute type.
1384    * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1385    * [55] StringType ::= 'CDATA'
1386    * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' |
1387    *                        'NMTOKEN' | 'NMTOKENS'
1388    * [57] EnumeratedType ::= NotationType | Enumeration
1389    * *TODO: validate the type!!
1390    */
1391  int readAttType ()
1392    throws java.lang.Exception
1393  {
1394    String typeString;
1395    Integer type;
1396
1397    if (tryRead('(')) {
1398      parseEnumeration();
1399      return ATTRIBUTE_ENUMERATED;
1400    } else {
1401      typeString = readNmtoken(true);
1402      if (typeString.equals("NOTATION")) {
1403  parseNotationType();
1404      }
1405      type = (Integer)attributeTypeHash.get(typeString);
1406      if (type == null) {
1407  error("illegal attribute type", typeString, null);
1408  return ATTRIBUTE_UNDECLARED;
1409      } else {
1410  return type.intValue();
1411      }
1412    }
1413  }
1414
1415
1416  /**
1417    * Parse an enumeration.
1418    * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')'
1419    * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)*
1420    * *NOTE: the '(' has already been read.
1421    */
1422  void parseEnumeration ()
1423    throws java.lang.Exception
1424  {
1425    char c;
1426
1427    dataBufferAppend('(');
1428
1429        // Read the first token.
1430    skipWhitespace();
1431    dataBufferAppend(readNmtoken(true));
1432        // Read the remaining tokens.
1433    skipWhitespace();
1434    while (!tryRead(')')) {
1435      require('|');
1436      dataBufferAppend('|');
1437      skipWhitespace();
1438      dataBufferAppend(readNmtoken(true));
1439      skipWhitespace();
1440    }
1441    dataBufferAppend(')');
1442  }
1443
1444
1445  /**
1446    * Parse a notation type for an attribute.
1447    * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)*
1448    *                       S? ')'
1449    * [59] Ntoks ::= %Name (S? '|' S? %Name)
1450    * *NOTE: the 'NOTATION' has already been read
1451    */
1452  void parseNotationType ()
1453    throws java.lang.Exception
1454  {
1455    requireWhitespace();
1456    require('(');
1457
1458    parseEnumeration();
1459  }
1460
1461
1462  /**
1463    * Parse the default value for an attribute.
1464    * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue
1465    */
1466  void parseDefault (String elementName, String name, int type, String enum)
1467    throws java.lang.Exception
1468  {
1469    int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1470    String value = null;
1471    boolean normalizeWSFlag;
1472
1473    if (tryRead('#')) {
1474      if (tryRead("FIXED")) {
1475  valueType = ATTRIBUTE_DEFAULT_FIXED;
1476  requireWhitespace();
1477  context = CONTEXT_ATTRIBUTEVALUE;
1478  value = readLiteral(LIT_CHAR_REF);
1479  context = CONTEXT_DTD;
1480      } else if (tryRead("REQUIRED")) {
1481  valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1482      } else if (tryRead("IMPLIED")) {
1483  valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1484      } else {
1485  error("illegal keyword for attribute default value", null, null);
1486      }
1487    } else {
1488      context = CONTEXT_ATTRIBUTEVALUE;
1489      value = readLiteral(LIT_CHAR_REF);
1490      context = CONTEXT_DTD;
1491    }
1492    setAttribute(elementName, name, type, enum, value, valueType);
1493  }
1494
1495
1496  /**
1497    * Parse a conditional section.
1498    * [63] conditionalSect ::= includeSect || ignoreSect
1499    * [64] includeSect ::= '<![' %'INCLUDE' '[' (%markupdecl*)* ']]>'
1500    * [65] ignoreSect ::= '<![' %'IGNORE' '[' ignoreSectContents* ']]>'
1501    * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>'))
1502    *                           | ('<![' ignoreSectContents* ']]>')
1503    *                           | (Char - (']' | [<'"]))
1504    *                           | ('<!' (Char - ('-' | '[')))
1505    * *NOTE: the '<![' has already been read.
1506    * *TODO: verify that I am handling ignoreSectContents right.
1507    */
1508  void parseConditionalSect ()
1509    throws java.lang.Exception
1510  {
1511    skipWhitespace();
1512    if (tryRead("INCLUDE")) {
1513      skipWhitespace();
1514      require('[');
1515      skipWhitespace();
1516      while (!tryRead("]]>")) {
1517  parseMarkupdecl();
1518  skipWhitespace();
1519      }
1520    } else if (tryRead("IGNORE")) {
1521      skipWhitespace();
1522      require('[');
1523      int nesting = 1;
1524      char c;
1525      for (int nest = 1; nest > 0; ) {
1526  c = readCh();
1527  switch (c) {
1528  case '<':
1529    if (tryRead("![")) {
1530      nest++;
1531    }
1532  case ']':
1533    if (tryRead("]>")) {
1534      nest--;
1535    }
1536  }
1537      }
1538    } else {
1539      error("conditional section must begin with INCLUDE or IGNORE",
1540      null, null);
1541    }
1542  }
1543
1544
1545  /**
1546    * Read a character reference.
1547    * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1548    * *NOTE: the '&#' has already been read.
1549    */
1550  void parseCharRef ()
1551    throws java.lang.Exception
1552  {
1553    int value = 0;
1554    char c;
1555
1556    if (tryRead('x')) {
1557      loop1: while (true) {
1558  c = readCh();
1559  switch (c) {
1560  case '0':
1561  case '1':
1562  case '2':
1563  case '3':
1564  case '4':
1565  case '5':
1566  case '6':
1567  case '7':
1568  case '8':
1569  case '9':
1570  case 'a':
1571  case 'A':
1572  case 'b':
1573  case 'B':
1574  case 'c':
1575  case 'C':
1576  case 'd':
1577  case 'D':
1578  case 'e':
1579  case 'E':
1580  case 'f':
1581  case 'F':
1582    value *= 16;
1583    value += Integer.parseInt(new Character(c).toString(), 16);
1584    break;
1585  case ';':
1586    break loop1;
1587  default:
1588    error("illegal character in character reference", c, null);
1589    break loop1;
1590  }
1591      }
1592    } else {
1593      loop2: while (true) {
1594  c = readCh();
1595  switch (c) {
1596  case '0':
1597  case '1':
1598  case '2':
1599  case '3':
1600  case '4':
1601  case '5':
1602  case '6':
1603  case '7':
1604  case '8':
1605  case '9':
1606    value *= 10;
1607    value += Integer.parseInt(new Character(c).toString(), 10);
1608    break;
1609  case ';':
1610    break loop2;
1611  default:
1612    error("illegal character in character reference", c, null);
1613    break loop2;
1614  }
1615      }
1616    }
1617
1618    // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1619    //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 
1620    if (value <= 0x0000ffff) {
1621        // no surrogates needed
1622      dataBufferAppend((char)value);
1623    } else if (value <= 0x000fffff) {
1624        // > 16 bits, surrogate needed
1625      dataBufferAppend((char)(0xd8 | ((value & 0x000ffc00) >> 10)));
1626      dataBufferAppend((char)(0xdc | (value & 0x0003ff)));
1627    } else {
1628        // too big for surrogate
1629      error("character reference " + value + " is too large for UTF-16",
1630      new Integer(value).toString(), null);
1631    }
1632  }
1633
1634
1635  /**
1636    * Parse a reference.
1637    * [69] EntityRef ::= '&' Name ';'
1638    * *NOTE: the '&' has already been read.
1639    * @param externalAllowed External entities are allowed here.
1640    */
1641  void parseEntityRef (boolean externalAllowed)
1642    throws java.lang.Exception
1643  {
1644    String name;
1645
<