Source code: com/memoire/mst/MstXmlParser.java
1
2
3 package com.memoire.mst;
4 import com.memoire.mst.*;
5
6 // MstXmlParser.java: the main parser class.
7 // NO WARRANTY! See README, and copyright below.
8 // $Id: MstXmlParser.java,v 1.3 2002/12/16 18:56:26 desnoix Exp $
9
10
11
12 import java.io.BufferedInputStream;
13 import java.io.EOFException;
14 import java.io.InputStream;
15 import java.io.Reader;
16 import java.net.URL;
17 import java.net.URLConnection;
18 import java.util.Enumeration;
19 import java.util.Hashtable;
20 import java.util.Stack;
21
22 // @GDX
23 import java.io.FileNotFoundException;
24 import java.io.ByteArrayInputStream;
25
26 /**
27 * Parse XML documents and return parse events through call-backs.
28 * <p>You need to define a class implementing the <code>MstXmlHandler</code>
29 * interface: an object belonging to this class will receive the
30 * callbacks for the events. (As an alternative to implementing
31 * the full MstXmlHandler interface, you can simply extend the
32 * <code>MstHandlerBase</code> convenience class.)
33 * <p>Usage (assuming that <code>MyHandler</code> is your implementation
34 * of the <code>MstXmlHandler</code> interface):
35 * <pre>
36 * MstXmlHandler handler = new MyHandler();
37 * MstXmlParser parser = new MstXmlParser();
38 * parser.setHandler(handler);
39 * try {
 *   parser.parse("http://www.host.com/doc.xml", null, (String) null);
41 * } catch (Exception e) {
42 * [do something interesting]
43 * }
44 * </pre>
45 * <p>Alternatively, you can use the standard SAX interfaces
46 * with the <code>SAXDriver</code> class as your entry point.
47 * @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
49 * @version 1.1
50 * @see MstXmlHandler
51 * @see MstHandlerBase
52 * @see SAXDriver
53 */
54 public class MstXmlParser {
55
56 //
57 // Use special cheats that speed up the code (currently about 50%),
58 // but may cause problems with future maintenance and add to the
59 // class file size (about 500 bytes).
60 //
61 private static final boolean USE_CHEATS = true;
62
63
64
65 //////////////////////////////////////////////////////////////////////
66 // Constructors.
67 ////////////////////////////////////////////////////////////////////////
68
69
70 /**
71 * Construct a new parser with no associated handler.
72 * @see #setHandler
73 * @see #parse
74 */
75 public MstXmlParser ()
76 {
77 }
78
79
80 /**
81 * Set the handler that will receive parsing events.
82 * @param handler The handler to receive callback events.
83 * @see #parse
84 * @see MstXmlHandler
85 */
86 public void setHandler (MstXmlHandler handler)
87 {
88 this.handler = handler;
89 }
90
91
92 /**
93 * Parse an XML document from a URI.
94 * <p>You may parse a document more than once, but only one thread
95 * may call this method for an object at one time.
96 * @param systemId The URI of the document.
97 * @param publicId The public identifier of the document, or null.
98 * @param encoding The suggested encoding, or null if unknown.
99 * @exception java.lang.Exception Any exception thrown by your
100 * own handlers, or any derivation of java.io.IOException
101 * thrown by the parser itself.
102 */
103 public void parse (String systemId, String publicId, String encoding)
104 throws java.lang.Exception
105 {
106 doParse(systemId, publicId, null, null, encoding);
107 }
108
109
110 /**
111 * Parse an XML document from a byte stream.
112 * <p>The URI that you supply will become the base URI for
113 * resolving relative links, but Ælfred will actually read
114 * the document from the supplied input stream.
115 * <p>You may parse a document more than once, but only one thread
116 * may call this method for an object at one time.
117 * @param systemId The base URI of the document, or null if not
118 * known.
119 * @param publicId The public identifier of the document, or null
120 * if not known.
121 * @param stream A byte input stream.
122 * @param encoding The suggested encoding, or null if unknown.
123 * @exception java.lang.Exception Any exception thrown by your
124 * own handlers, or any derivation of java.io.IOException
125 * thrown by the parser itself.
126 */
127 public void parse (String systemId, String publicId,
128 InputStream stream, String encoding)
129 throws java.lang.Exception
130 {
131 doParse(systemId, publicId, null, stream, encoding);
132 }
133
134
135 /**
136 * Parse an XML document from a character stream.
137 * <p>The URI that you supply will become the base URI for
138 * resolving relative links, but Ælfred will actually read
139 * the document from the supplied input stream.
140 * <p>You may parse a document more than once, but only one thread
141 * may call this method for an object at one time.
142 * @param systemId The base URI of the document, or null if not
143 * known.
144 * @param publicId The public identifier of the document, or null
145 * if not known.
146 * @param reader A character stream.
147 * @exception java.lang.Exception Any exception thrown by your
148 * own handlers, or any derivation of java.io.IOException
149 * thrown by the parser itself.
150 */
151 public void parse (String systemId, String publicId, Reader reader)
152 throws java.lang.Exception
153 {
154 doParse(systemId, publicId, reader, null, null);
155 }
156
157
158 private synchronized void doParse (String systemId, String publicId,
159 Reader reader, InputStream stream,
160 String encoding)
161 throws java.lang.Exception
162 {
163 basePublicId = publicId;
164 baseURI = systemId;
165 baseReader = reader;
166 baseInputStream = stream;
167
168 initializeVariables();
169
170 // Set the default entities here.
171 setInternalEntity(intern("amp"), "&");
172 setInternalEntity(intern("lt"), "<");
173 setInternalEntity(intern("gt"), ">");
174 setInternalEntity(intern("apos"), "'");
175 setInternalEntity(intern("quot"), """);
176
177 if (handler != null) {
178 handler.startDocument();
179 }
180
181 pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream,
182 encoding);
183
184 parseDocument();
185
186 if (handler != null) {
187 handler.endDocument();
188 }
189 cleanupVariables();
190 }
191
192
193
194 ////////////////////////////////////////////////////////////////////////
195 // Constants.
196 ////////////////////////////////////////////////////////////////////////
197
198 //
199 // Constants for element content type.
200 //
201
202 /**
203 * Constant: an element has not been declared.
204 * @see #getElementContentType
205 */
206 public static final int CONTENT_UNDECLARED = 0;
207
208 /**
209 * Constant: the element has a content model of ANY.
210 * @see #getElementContentType
211 */
212 public static final int CONTENT_ANY = 1;
213
214 /**
215 * Constant: the element has declared content of EMPTY.
216 * @see #getElementContentType
217 */
218 public static final int CONTENT_EMPTY = 2;
219
220 /**
221 * Constant: the element has mixed content.
222 * @see #getElementContentType
223 */
224 public static final int CONTENT_MIXED = 3;
225
226 /**
227 * Constant: the element has element content.
228 * @see #getElementContentType
229 */
230 public static final int CONTENT_ELEMENTS = 4;
231
232
233 //
234 // Constants for the entity type.
235 //
236
237 /**
238 * Constant: the entity has not been declared.
239 * @see #getEntityType
240 */
241 public static final int ENTITY_UNDECLARED = 0;
242
243 /**
244 * Constant: the entity is internal.
245 * @see #getEntityType
246 */
247 public static final int ENTITY_INTERNAL = 1;
248
249 /**
250 * Constant: the entity is external, non-XML data.
251 * @see #getEntityType
252 */
253 public static final int ENTITY_NDATA = 2;
254
255 /**
256 * Constant: the entity is external XML data.
257 * @see #getEntityType
258 */
259 public static final int ENTITY_TEXT = 3;
260
261
262 //
263 // Constants for attribute type.
264 //
265
266 /**
267 * Constant: the attribute has not been declared for this element type.
268 * @see #getAttributeType
269 */
270 public static final int ATTRIBUTE_UNDECLARED = 0;
271
272 /**
273 * Constant: the attribute value is a string value.
274 * @see #getAttributeType
275 */
276 public static final int ATTRIBUTE_CDATA = 1;
277
278 /**
279 * Constant: the attribute value is a unique identifier.
280 * @see #getAttributeType
281 */
282 public static final int ATTRIBUTE_ID = 2;
283
284 /**
285 * Constant: the attribute value is a reference to a unique identifier.
286 * @see #getAttributeType
287 */
288 public static final int ATTRIBUTE_IDREF = 3;
289
290 /**
291 * Constant: the attribute value is a list of ID references.
292 * @see #getAttributeType
293 */
294 public static final int ATTRIBUTE_IDREFS = 4;
295
296 /**
297 * Constant: the attribute value is the name of an entity.
298 * @see #getAttributeType
299 */
300 public static final int ATTRIBUTE_ENTITY = 5;
301
302 /**
303 * Constant: the attribute value is a list of entity names.
304 * @see #getAttributeType
305 */
306 public static final int ATTRIBUTE_ENTITIES = 6;
307
308 /**
309 * Constant: the attribute value is a name token.
310 * @see #getAttributeType
311 */
312 public static final int ATTRIBUTE_NMTOKEN = 7;
313
314 /**
315 * Constant: the attribute value is a list of name tokens.
316 * @see #getAttributeType
317 */
318 public static final int ATTRIBUTE_NMTOKENS = 8;
319
320 /**
321 * Constant: the attribute value is a token from an enumeration.
322 * @see #getAttributeType
323 */
324 public static final int ATTRIBUTE_ENUMERATED = 9;
325
326 /**
327 * Constant: the attribute is the name of a notation.
328 * @see #getAttributeType
329 */
330 public static final int ATTRIBUTE_NOTATION = 10;
331
332
333 //
334 // When the class is loaded, populate the hash table of
335 // attribute types.
336 //
337
338 /**
339 * Hash table of attribute types.
340 */
341 private static Hashtable attributeTypeHash;
342 static {
343 attributeTypeHash = new Hashtable();
344 attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA));
345 attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID));
346 attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF));
347 attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS));
348 attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY));
349 attributeTypeHash.put("ENTITIES", new Integer(ATTRIBUTE_ENTITIES));
350 attributeTypeHash.put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN));
351 attributeTypeHash.put("NMTOKENS", new Integer(ATTRIBUTE_NMTOKENS));
352 attributeTypeHash.put("NOTATION", new Integer(ATTRIBUTE_NOTATION));
353 }
354
355
356 //
357 // Constants for supported encodings.
358 //
359 private static final int ENCODING_UTF_8 = 1;
360 private static final int ENCODING_ISO_8859_1 = 2;
361 private static final int ENCODING_UCS_2_12 = 3;
362 private static final int ENCODING_UCS_2_21 = 4;
363 private static final int ENCODING_UCS_4_1234 = 5;
364 private static final int ENCODING_UCS_4_4321 = 6;
365 private static final int ENCODING_UCS_4_2143 = 7;
366 private static final int ENCODING_UCS_4_3412 = 8;
367
368
369 //
370 // Constants for attribute default value.
371 //
372
373 /**
374 * Constant: the attribute is not declared.
375 * @see #getAttributeDefaultValueType
376 */
377 public static final int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
378
379 /**
380 * Constant: the attribute has a literal default value specified.
381 * @see #getAttributeDefaultValueType
382 * @see #getAttributeDefaultValue
383 */
384 public static final int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
385
386 /**
387 * Constant: the attribute was declared #IMPLIED.
388 * @see #getAttributeDefaultValueType
389 */
390 public static final int ATTRIBUTE_DEFAULT_IMPLIED = 2;
391
392 /**
393 * Constant: the attribute was declared #REQUIRED.
394 * @see #getAttributeDefaultValueType
395 */
396 public static final int ATTRIBUTE_DEFAULT_REQUIRED = 3;
397
398 /**
399 * Constant: the attribute was declared #FIXED.
400 * @see #getAttributeDefaultValueType
401 * @see #getAttributeDefaultValue
402 */
403 public static final int ATTRIBUTE_DEFAULT_FIXED = 4;
404
405
406 //
407 // Constants for input.
408 //
409 private static final int INPUT_NONE = 0;
410 private static final int INPUT_INTERNAL = 1;
411 private static final int INPUT_EXTERNAL = 2;
412 private static final int INPUT_STREAM = 3;
413 private static final int INPUT_BUFFER = 4;
414 private static final int INPUT_READER = 5;
415
416
417 //
418 // Flags for reading literals.
419 //
420 private static final int LIT_CHAR_REF = 1;
421 private static final int LIT_ENTITY_REF = 2;
422 private static final int LIT_PE_REF = 4;
423 private static final int LIT_NORMALIZE = 8;
424
425
426 //
427 // Flags for parsing context.
428 //
429 private static final int CONTEXT_NONE = 0;
430 private static final int CONTEXT_DTD = 1;
431 private static final int CONTEXT_ENTITYVALUE = 2;
432 private static final int CONTEXT_ATTRIBUTEVALUE = 3;
433
434
435
436 //////////////////////////////////////////////////////////////////////
437 // Error reporting.
438 //////////////////////////////////////////////////////////////////////
439
440
441 /**
442 * Report an error.
443 * @param message The error message.
444 * @param textFound The text that caused the error (or null).
445 * @see MstXmlHandler#error
446 * @see #line
447 */
448 void error (String message, String textFound, String textExpected)
449 throws java.lang.Exception
450 {
451 errorCount++;
452 if (textFound != null) {
453 message = message + " (found \"" + textFound + "\")";
454 }
455 if (textExpected != null) {
456 message = message + " (expected \"" + textExpected + "\")";
457 }
458 if (handler != null) {
459 String uri = null;
460
461 if (externalEntity != null) {
462 uri = externalEntity.getURL().toString();
463 }
464 handler.error(message, uri, line, column);
465 }
466 }
467
468
469 /**
470 * Report a serious error.
471 * @param message The error message.
472 * @param textFound The text that caused the error (or null).
473 */
474 void error (String message, char textFound, String textExpected)
475 throws java.lang.Exception
476 {
477 error(message, new Character(textFound).toString(), textExpected);
478 }
479
480
481
482 //////////////////////////////////////////////////////////////////////
483 // Major syntactic productions.
484 //////////////////////////////////////////////////////////////////////
485
486
487 /**
488 * Parse an XML document.
489 * <pre>
490 * [1] document ::= prolog element Misc*
491 * </pre>
492 * <p>This is the top-level parsing function for a single XML
493 * document. As a minimum, a well-formed document must have
494 * a document element, and a valid document must have a prolog
495 * as well.
496 */
497 void parseDocument ()
498 throws java.lang.Exception
499 {
500 char c;
501
502 parseProlog();
503 require('<');
504 parseElement();
505 try
506 {
507 parseMisc(); //skip all white, PIs, and comments
508 c=readCh(); //if this doesn't throw an exception...
509 error("unexpected characters after document end",c,null);
510 }
511 catch (EOFException e)
512 {return;}
513 }
514
515
516 /**
517 * Skip a comment.
518 * <pre>
519 * [18] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
520 * </pre>
521 * <p>(The <code><!--</code> has already been read.)
522 */
523 void parseComment ()
524 throws java.lang.Exception
525 {
526 skipUntil("-->");
527 }
528
529
530 /**
531 * Parse a processing instruction and do a call-back.
532 * <pre>
533 * [19] PI ::= '<?' Name (S (Char* - (Char* '?>' Char*)))? '?>'
534 * </pre>
535 * <p>(The <code><?</code> has already been read.)
536 * <p>An XML processing instruction <em>must</em> begin with
537 * a Name, which is the instruction's target.
538 */
539 void parsePI ()
540 throws java.lang.Exception
541 {
542 String name;
543
544 name = readNmtoken(true);
545 if (!tryRead("?>")) {
546 requireWhitespace();
547 parseUntil("?>");
548 }
549 if (handler != null) {
550 handler.processingInstruction(name, dataBufferToString());
551 }
552 }
553
554
555 /**
556 * Parse a CDATA marked section.
557 * <pre>
558 * [20] CDSect ::= CDStart CData CDEnd
559 * [21] CDStart ::= '<![CDATA['
560 * [22] CData ::= (Char* - (Char* ']]>' Char*))
561 * [23] CDEnd ::= ']]>'
562 * </pre>
563 * <p>(The '<![CDATA[' has already been read.)
564 * <p>Note that this just appends characters to the dataBuffer,
565 * without actually generating an event.
566 */
567 void parseCDSect ()
568 throws java.lang.Exception
569 {
570 parseUntil("]]>");
571 }
572
573
574 /**
575 * Parse the prolog of an XML document.
576 * <pre>
577 * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
578 * </pre>
579 * <p>There are a couple of tricks here. First, it is necessary to
580 * declare the XML default attributes after the DTD (if present)
581 * has been read. Second, it is not possible to expand general
582 * references in attribute value literals until after the entire
583 * DTD (if present) has been parsed.
584 * <p>We do not look for the XML declaration here, because it is
585 * handled by pushURL().
586 * @see pushURL
587 */
588 void parseProlog ()
589 throws java.lang.Exception
590 {
591 parseMisc();
592
593 if (tryRead("<!DOCTYPE")) {
594 parseDoctypedecl();
595 parseMisc();
596 }
597 }
598
599
600 /**
601 * Parse the XML declaration.
602 * <pre>
603 * [25] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
604 * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
605 * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
606 * | S 'standalone' Eq '"' ("yes" | "no") '"'
607 * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
608 * </pre>
609 * <p>([80] to [82] are also significant.)
610 * <p>(The <code><?xml</code> and whitespace have already been read.)
611 * <p>TODO: validate value of standalone.
612 * @see #parseTextDecl
613 * @see #checkEncoding
614 */
615 void parseXMLDecl (boolean ignoreEncoding)
616 throws java.lang.Exception
617 {
618 String version;
619 String encodingName = null;
620 String standalone = null;
621
622 // Read the version.
623 require("version");
624 parseEq();
625 version = readLiteral(0);
626 if (!version.equals("1.0")) {
627 error("unsupported XML version", version, "1.0");
628 }
629
630 // Try reading an encoding declaration.
631 skipWhitespace();
632 if (tryRead("encoding")) {
633 parseEq();
634 encodingName = readLiteral(0);
635 checkEncoding(encodingName, ignoreEncoding);
636 }
637
638 // Try reading a standalone declaration
639 skipWhitespace();
640 if (tryRead("standalone")) {
641 parseEq();
642 standalone = readLiteral(0);
643 }
644
645 skipWhitespace();
646 require("?>");
647 }
648
649
650 /**
651 * Parse the Encoding PI.
652 * <pre>
653 * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
654 * [79] EncodingPI ::= '<?xml' S 'encoding' Eq QEncoding S? '?>'
655 * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
656 * [81] Encoding ::= LatinName
657 * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
658 * </pre>
659 * <p>(The <code><?xml</code>' and whitespace have already been read.)
660 * @see #parseXMLDecl
661 * @see #checkEncoding
662 */
663 void parseTextDecl (boolean ignoreEncoding)
664 throws java.lang.Exception
665 {
666 String encodingName = null;
667
668 // Read an optional version.
669 if (tryRead("version")) {
670 String version;
671 parseEq();
672 version = readLiteral(0);
673 if (!version.equals("1.0")) {
674 error("unsupported XML version", version, "1.0");
675 }
676 requireWhitespace();
677 }
678
679
680 // Read the encoding.
681 require("encoding");
682 parseEq();
683 encodingName = readLiteral(0);
684 checkEncoding(encodingName, ignoreEncoding);
685
686 skipWhitespace();
687 require("?>");
688 }
689
690
691 /**
692 * Check that the encoding specified makes sense.
693 * <p>Compare what the author has specified in the XML declaration
694 * or encoding PI with what we have detected.
695 * <p>This is also important for distinguishing among the various
696 * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect
697 * those).
698 * @param encodingName The name of the encoding specified by the user.
699 * @see #parseXMLDecl
700 * @see #parseTextDecl
701 */
702 void checkEncoding (String encodingName, boolean ignoreEncoding)
703 throws java.lang.Exception
704 {
705 encodingName = encodingName.toUpperCase();
706
707 if (ignoreEncoding) {
708 return;
709 }
710
711 switch (encoding) {
712 // 8-bit encodings
713 case ENCODING_UTF_8:
714 if (encodingName.equals("ISO-8859-1")) {
715 encoding = ENCODING_ISO_8859_1;
716 } else if (!encodingName.equals("UTF-8")) {
717 error("unsupported 8-bit encoding",
718 encodingName,
719 "UTF-8 or ISO-8859-1");
720 }
721 break;
722 // 16-bit encodings
723 case ENCODING_UCS_2_12:
724 case ENCODING_UCS_2_21:
725 if (!encodingName.equals("ISO-10646-UCS-2") &&
726 !encodingName.equals("UTF-16")) {
727 error("unsupported 16-bit encoding",
728 encodingName,
729 "ISO-10646-UCS-2");
730 }
731 break;
732 // 32-bit encodings
733 case ENCODING_UCS_4_1234:
734 case ENCODING_UCS_4_4321:
735 case ENCODING_UCS_4_2143:
736 case ENCODING_UCS_4_3412:
737 if (!encodingName.equals("ISO-10646-UCS-4")) {
738 error("unsupported 32-bit encoding",
739 encodingName,
740 "ISO-10646-UCS-4");
741 }
742 }
743 }
744
745
746 /**
747 * Parse miscellaneous markup outside the document element and DOCTYPE
748 * declaration.
749 * <pre>
750 * [27] Misc ::= Comment | PI | S
751 * </pre>
752 */
753 void parseMisc ()
754 throws java.lang.Exception
755 {
756 while (true)
757 {
758 skipWhitespace();
759 if (tryRead("<?"))
760 {parsePI();}
761 else if (tryRead("<!--"))
762 {parseComment();}
763 else
764 {return;}
765 }
766 }
767
768
769 /**
770 * Parse a document type declaration.
771 * <pre>
772 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
773 * ('[' %markupdecl* ']' S?)? '>'
774 * </pre>
775 * <p>(The <code><!DOCTYPE</code> has already been read.)
776 */
777 void parseDoctypedecl ()
778 throws java.lang.Exception
779 {
780 char c;
781 String doctypeName, ids[];
782
783 // Read the document type name.
784 requireWhitespace();
785 doctypeName = readNmtoken(true);
786
787 // Read the ExternalIDs.
788 skipWhitespace();
789 ids = readExternalIds(false);
790
791 // Look for a declaration subset.
792 skipWhitespace();
793 if (tryRead('[')) {
794
795 // loop until the subset ends
796 while (true) {
797 context = CONTEXT_DTD;
798 skipWhitespace();
799 context = CONTEXT_NONE;
800 if (tryRead(']')) {
801 break; // end of subset
802 } else {
803 context = CONTEXT_DTD;
804 parseMarkupdecl();
805 context = CONTEXT_NONE;
806 }
807 }
808 }
809
810 // Read the external subset, if any
811 if (ids[1] != null) {
812 pushURL("[external subset]", ids[0], ids[1], null, null, null);
813
814 // Loop until we end up back at '>'
815 while (true) {
816 context = CONTEXT_DTD;
817 skipWhitespace();
818 context = CONTEXT_NONE;
819 if (tryRead('>')) {
820 break;
821 } else {
822 context = CONTEXT_DTD;
823 parseMarkupdecl();
824 context = CONTEXT_NONE;
825 }
826 }
827 } else {
828 // No external subset.
829 skipWhitespace();
830 require('>');
831 }
832
833 if (handler != null) {
834 handler.doctypeDecl(doctypeName, ids[0], ids[1]);
835 }
836
837 // Expand general entities in
838 // default values of attributes.
839 // (Do this after the doctypeDecl
840 // event!).
841 // expandAttributeDefaultValues();
842 }
843
844
845 /**
846 * Parse a markup declaration in the internal or external DTD subset.
847 * <pre>
848 * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
849 * %NotationDecl | %PI | %S | %Comment |
850 * InternalPERef )
851 * [30] InternalPERef ::= PEReference
852 * [31] extSubset ::= (%markupdecl | %conditionalSect)*
853 * </pre>
854 */
855 void parseMarkupdecl ()
856 throws java.lang.Exception
857 {
858 if (tryRead("<!ELEMENT")) {
859 parseElementdecl();
860 } else if (tryRead("<!ATTLIST")) {
861 parseAttlistDecl();
862 } else if (tryRead("<!ENTITY")) {
863 parseEntityDecl();
864 } else if (tryRead("<!NOTATION")) {
865 parseNotationDecl();
866 } else if (tryRead("<?")) {
867 parsePI();
868 } else if (tryRead("<!--")) {
869 parseComment();
870 } else if (tryRead("<![")) {
871 parseConditionalSect();
872 } else {
873 error("expected markup declaration", null, null);
874 }
875 }
876
877
878 /**
879 * Parse an element, with its tags.
880 * <pre>
881 * [33] STag ::= '<' Name (S Attribute)* S? '>' [WFC: unique Att spec]
882 * [38] element ::= EmptyElement | STag content ETag
883 * [39] EmptyElement ::= '<' Name (S Attribute)* S? '/>'
884 * [WFC: unique Att spec]
885 * </pre>
886 * <p>(The '<' has already been read.)
887 * <p>NOTE: this method actually chains onto parseContent(), if necessary,
888 * and parseContent() will take care of calling parseETag().
889 */
890 void parseElement ()
891 throws java.lang.Exception
892 {
893 String gi;
894 char c;
895 int oldElementContent = currentElementContent;
896 String oldElement = currentElement;
897
898 // This is the (global) counter for the
899 // array of specified attributes.
900 tagAttributePos = 0;
901
902 // Read the element type name.
903 gi = readNmtoken(true);
904
905 // Determine the current content type.
906 currentElement = gi;
907 currentElementContent = getElementContentType(gi);
908 if (currentElementContent == CONTENT_UNDECLARED) {
909 currentElementContent = CONTENT_ANY;
910 }
911
912 // Read the attributes, if any.
913 // After this loop, we should be just
914 // in front of the closing delimiter.
915 skipWhitespace();
916 c = readCh();
917 while (c != '/' && c != '>') {
918 unread(c);
919 parseAttribute(gi);
920 skipWhitespace();
921 c = readCh();
922 }
923 unread(c);
924
925 // Supply any defaulted attributes.
926 Enumeration atts = declaredAttributes(gi);
927 if (atts != null) {
928 String aname;
929 loop: while (atts.hasMoreElements()) {
930 aname = (String)atts.nextElement();
931 // See if it was specified.
932 for (int i = 0; i < tagAttributePos; i++) {
933 if (tagAttributes[i] == aname) {
934 continue loop;
935 }
936 }
937 // I guess not...
938 if (handler != null) {
939 handler.attribute(aname,
940 getAttributeExpandedValue(gi, aname),
941 false);
942 }
943 }
944 }
945
946 // Figure out if this is a start tag
947 // or an empty element, and dispatch an
948 // event accordingly.
949 c = readCh();
950 switch (c) {
951 case '>':
952 if (handler != null) {
953 handler.startElement(gi);
954 }
955 parseContent();
956 break;
957 case '/':
958 require('>');
959 if (handler != null) {
960 handler.startElement(gi);
961 handler.endElement(gi);
962 }
963 break;
964 }
965
966 // Restore the previous state.
967 currentElement = oldElement;
968 currentElementContent = oldElementContent;
969 }
970
971
972 /**
973 * Parse an attribute assignment.
974 * <pre>
975 * [34] Attribute ::= Name Eq AttValue
976 * </pre>
977 * @param name The name of the attribute's element.
978 * @see MstXmlHandler#attribute
979 */
980 void parseAttribute (String name)
981 throws java.lang.Exception
982 {
983 String aname;
984 int type;
985 String value;
986
987 // Read the attribute name.
988 aname = readNmtoken(true).intern();
989 type = getAttributeDefaultValueType(name, aname);
990
991 // Parse '='
992 parseEq();
993
994 // Read the value, normalizing whitespace
995 // if it is not CDATA.
996 if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) {
997 value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF);
998 } else {
999 value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE);
1000 }
1001
1002 // Inform the handler about the
1003 // attribute.
1004 if (handler != null) {
1005 handler.attribute(aname, value, true);
1006 }
1007 dataBufferPos = 0;
1008
1009 // Note that the attribute has been
1010 // specified.
1011 if (tagAttributePos == tagAttributes.length) {
1012 String newAttrib[] = new String[tagAttributes.length * 2];
1013 System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1014 tagAttributes = newAttrib;
1015 }
1016 tagAttributes[tagAttributePos++] = aname;
1017 }
1018
1019
1020 /**
1021 * Parse an equals sign surrounded by optional whitespace.
1022 * [35] Eq ::= S? '=' S?
1023 */
1024 void parseEq ()
1025 throws java.lang.Exception
1026 {
1027 skipWhitespace();
1028 require('=');
1029 skipWhitespace();
1030 }
1031
1032
1033 /**
1034 * Parse an end tag.
1035 * [36] ETag ::= '</' Name S? '>'
1036 * *NOTE: parseContent() chains to here.
1037 */
1038 void parseETag ()
1039 throws java.lang.Exception
1040 {
1041 String name;
1042 name = readNmtoken(true);
1043 if (name != currentElement) {
1044 error("mismatched end tag", name, currentElement);
1045 }
1046 skipWhitespace();
1047 require('>');
1048 if (handler != null) {
1049 handler.endElement(name);
1050 }
1051 }
1052
1053
1054 /**
1055 * Parse the content of an element.
1056 * [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)*
1057 * [68] Reference ::= EntityRef | CharRef
1058 */
1059 void parseContent ()
1060 throws java.lang.Exception
1061 {
1062 String data;
1063 char c;
1064
1065 while (true) {
1066
1067 switch (currentElementContent) {
1068 case CONTENT_ANY:
1069 case CONTENT_MIXED:
1070 parsePCData();
1071 break;
1072 case CONTENT_ELEMENTS:
1073 parseWhitespace();
1074 break;
1075 }
1076
1077 // Handle delimiters
1078 c = readCh();
1079 switch (c) {
1080
1081 case '&': // Found "&"
1082 c = readCh();
1083 if (c == '#') {
1084 parseCharRef();
1085 } else {
1086 unread(c);
1087 parseEntityRef(true);
1088 }
1089 break;
1090
1091 case '<': // Found "<"
1092
1093 c = readCh();
1094 switch (c) {
1095
1096 case '!': // Found "<!"
1097 c = readCh();
1098 switch (c) {
1099 case '-': // Found "<!-"
1100 require('-');
1101 parseComment();
1102 break;
1103 case '[': // Found "<!["
1104 require("CDATA[");
1105 parseCDSect();
1106 break;
1107 default:
1108 error("expected comment or CDATA section", c, null);
1109 break;
1110 }
1111 break;
1112
1113 case '?': // Found "<?"
1114 dataBufferFlush();
1115 parsePI();
1116 break;
1117
1118 case '/': // Found "</"
1119 dataBufferFlush();
1120 parseETag();
1121 return;
1122
1123 default: // Found "<" followed by something else
1124 dataBufferFlush();
1125 unread(c);
1126 parseElement();
1127 break;
1128 }
1129 }
1130 }
1131 }
1132
1133
1134 /**
1135 * Parse an element type declaration.
1136 * [40] elementdecl ::= '<!ELEMENT' S %Name S (%S S)? %contentspec S? '>'
1137 * [VC: Unique Element Declaration]
1138 * *NOTE: the '<!ELEMENT' has already been read.
1139 */
1140 void parseElementdecl ()
1141 throws java.lang.Exception
1142 {
1143 String name;
1144
1145 requireWhitespace();
1146 // Read the element type name.
1147 name = readNmtoken(true);
1148
1149 requireWhitespace();
1150 // Read the content model.
1151 parseContentspec(name);
1152
1153 skipWhitespace();
1154 require('>');
1155 }
1156
1157
1158 /**
1159 * Content specification.
1160 * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1161 */
1162 void parseContentspec (String name)
1163 throws java.lang.Exception
1164 {
1165 if (tryRead("EMPTY")) {
1166 setElement(name, CONTENT_EMPTY, null, null);
1167 return;
1168 } else if (tryRead("ANY")) {
1169 setElement(name, CONTENT_ANY, null, null);
1170 return;
1171 } else {
1172 require('(');
1173 dataBufferAppend('(');
1174 skipWhitespace();
1175 if (tryRead("#PCDATA")) {
1176 dataBufferAppend("#PCDATA");
1177 parseMixed();
1178 setElement(name, CONTENT_MIXED, dataBufferToString(), null);
1179 } else {
1180 parseElements();
1181 setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null);
1182 }
1183 }
1184 }
1185
1186
1187 /**
1188 * Parse an element-content model.
1189 * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
1190 * [44] cps ::= S? %cp S?
1191 * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
1192 * [46] ctokplus ::= cps ('|' cps)+
1193 * [47] ctoks ::= cps ('|' cps)*
1194 * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
1195 * [49] stoks ::= cps (',' cps)*
1196 * *NOTE: the opening '(' and S have already been read.
1197 * *TODO: go over parameter entity boundaries more carefully.
1198 */
1199 void parseElements ()
1200 throws java.lang.Exception
1201 {
1202 char c;
1203 char sep;
1204
1205 // Parse the first content particle
1206 skipWhitespace();
1207 parseCp();
1208
1209 // Check for end or for a separator.
1210 skipWhitespace();
1211 c = readCh();
1212 switch (c) {
1213 case ')':
1214 dataBufferAppend(')');
1215 c = readCh();
1216 switch (c) {
1217 case '*':
1218 case '+':
1219 case '?':
1220 dataBufferAppend(c);
1221 break;
1222 default:
1223 unread(c);
1224 }
1225 return;
1226 case ',': // Register the separator.
1227 case '|':
1228 sep = c;
1229 dataBufferAppend(c);
1230 break;
1231 default:
1232 error("bad separator in content model", c, null);
1233 return;
1234 }
1235
1236 // Parse the rest of the content model.
1237 while (true) {
1238 skipWhitespace();
1239 parseCp();
1240 skipWhitespace();
1241 c = readCh();
1242 if (c == ')') {
1243 dataBufferAppend(')');
1244 break;
1245 } else if (c != sep) {
1246 error("bad separator in content model", c, null);
1247 return;
1248 } else {
1249 dataBufferAppend(c);
1250 }
1251 }
1252
1253 // Check for the occurrence indicator.
1254 c = readCh();
1255 switch (c) {
1256 case '?':
1257 case '*':
1258 case '+':
1259 dataBufferAppend(c);
1260 return;
1261 default:
1262 unread(c);
1263 return;
1264 }
1265 }
1266
1267
1268 /**
1269 * Parse a content particle.
1270 * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
1271 * *NOTE: I actually use a slightly different production here:
1272 * cp ::= (elements | (Name ('?' | '*' | '+')?))
1273 */
1274 void parseCp ()
1275 throws java.lang.Exception
1276 {
1277 char c;
1278
1279 if (tryRead('(')) {
1280 dataBufferAppend('(');
1281 parseElements();
1282 } else {
1283 dataBufferAppend(readNmtoken(true));
1284 c = readCh();
1285 switch (c) {
1286 case '?':
1287 case '*':
1288 case '+':
1289 dataBufferAppend(c);
1290 break;
1291 default:
1292 unread(c);
1293 break;
1294 }
1295 }
1296 }
1297
1298
1299 /**
1300 * Parse mixed content.
1301 * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
1302 * | '(' S? %('#PCDATA') S? ')'
1303 * [51] Mtoks ::= %Name (S? '|' S? %Name)*
1304 * *NOTE: the S and '#PCDATA' have already been read.
1305 */
1306 void parseMixed ()
1307 throws java.lang.Exception
1308 {
1309 char c;
1310
1311 // Check for PCDATA alone.
1312 skipWhitespace();
1313 if (tryRead(')')) {
1314 dataBufferAppend(")*");
1315 tryRead('*');
1316 return;
1317 }
1318
1319 // Parse mixed content.
1320 skipWhitespace();
1321 while (!tryRead(")*")) {
1322 require('|');
1323 dataBufferAppend('|');
1324 skipWhitespace();
1325 dataBufferAppend(readNmtoken(true));
1326 skipWhitespace();
1327 }
1328 dataBufferAppend(")*");
1329 }
1330
1331
1332 /**
1333 * Parse an attribute list declaration.
1334 * [52] AttlistDecl ::= '<!ATTLIST' S %Name S? %AttDef+ S? '>'
1335 * *NOTE: the '<!ATTLIST' has already been read.
1336 */
1337 void parseAttlistDecl ()
1338 throws java.lang.Exception
1339 {
1340 String elementName;
1341
1342 requireWhitespace();
1343 elementName = readNmtoken(true);
1344 requireWhitespace();
1345 while (!tryRead('>')) {
1346 parseAttDef(elementName);
1347 skipWhitespace();
1348 }
1349 }
1350
1351
1352 /**
1353 * Parse a single attribute definition.
1354 * [53] AttDef ::= S %Name S %AttType S %Default
1355 */
1356 void parseAttDef (String elementName)
1357 throws java.lang.Exception
1358 {
1359 String name;
1360 int type;
1361 String enum = null;
1362
1363 // Read the attribute name.
1364 name = readNmtoken(true);
1365
1366 // Read the attribute type.
1367 requireWhitespace();
1368 type = readAttType();
1369
1370 // Get the string of enumerated values
1371 // if necessary.
1372 if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) {
1373 enum = dataBufferToString();
1374 }
1375
1376 // Read the default value.
1377 requireWhitespace();
1378 parseDefault(elementName, name, type, enum);
1379 }
1380
1381
1382 /**
1383 * Parse the attribute type.
1384 * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1385 * [55] StringType ::= 'CDATA'
1386 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' |
1387 * 'NMTOKEN' | 'NMTOKENS'
1388 * [57] EnumeratedType ::= NotationType | Enumeration
1389 * *TODO: validate the type!!
1390 */
1391 int readAttType ()
1392 throws java.lang.Exception
1393 {
1394 String typeString;
1395 Integer type;
1396
1397 if (tryRead('(')) {
1398 parseEnumeration();
1399 return ATTRIBUTE_ENUMERATED;
1400 } else {
1401 typeString = readNmtoken(true);
1402 if (typeString.equals("NOTATION")) {
1403 parseNotationType();
1404 }
1405 type = (Integer)attributeTypeHash.get(typeString);
1406 if (type == null) {
1407 error("illegal attribute type", typeString, null);
1408 return ATTRIBUTE_UNDECLARED;
1409 } else {
1410 return type.intValue();
1411 }
1412 }
1413 }
1414
1415
1416 /**
1417 * Parse an enumeration.
1418 * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')'
1419 * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)*
1420 * *NOTE: the '(' has already been read.
1421 */
1422 void parseEnumeration ()
1423 throws java.lang.Exception
1424 {
1425 char c;
1426
1427 dataBufferAppend('(');
1428
1429 // Read the first token.
1430 skipWhitespace();
1431 dataBufferAppend(readNmtoken(true));
1432 // Read the remaining tokens.
1433 skipWhitespace();
1434 while (!tryRead(')')) {
1435 require('|');
1436 dataBufferAppend('|');
1437 skipWhitespace();
1438 dataBufferAppend(readNmtoken(true));
1439 skipWhitespace();
1440 }
1441 dataBufferAppend(')');
1442 }
1443
1444
1445 /**
1446 * Parse a notation type for an attribute.
1447 * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)*
1448 * S? ')'
1449 * [59] Ntoks ::= %Name (S? '|' S? %Name)
1450 * *NOTE: the 'NOTATION' has already been read
1451 */
1452 void parseNotationType ()
1453 throws java.lang.Exception
1454 {
1455 requireWhitespace();
1456 require('(');
1457
1458 parseEnumeration();
1459 }
1460
1461
1462 /**
1463 * Parse the default value for an attribute.
1464 * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue
1465 */
1466 void parseDefault (String elementName, String name, int type, String enum)
1467 throws java.lang.Exception
1468 {
1469 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
1470 String value = null;
1471 boolean normalizeWSFlag;
1472
1473 if (tryRead('#')) {
1474 if (tryRead("FIXED")) {
1475 valueType = ATTRIBUTE_DEFAULT_FIXED;
1476 requireWhitespace();
1477 context = CONTEXT_ATTRIBUTEVALUE;
1478 value = readLiteral(LIT_CHAR_REF);
1479 context = CONTEXT_DTD;
1480 } else if (tryRead("REQUIRED")) {
1481 valueType = ATTRIBUTE_DEFAULT_REQUIRED;
1482 } else if (tryRead("IMPLIED")) {
1483 valueType = ATTRIBUTE_DEFAULT_IMPLIED;
1484 } else {
1485 error("illegal keyword for attribute default value", null, null);
1486 }
1487 } else {
1488 context = CONTEXT_ATTRIBUTEVALUE;
1489 value = readLiteral(LIT_CHAR_REF);
1490 context = CONTEXT_DTD;
1491 }
1492 setAttribute(elementName, name, type, enum, value, valueType);
1493 }
1494
1495
1496 /**
1497 * Parse a conditional section.
1498 * [63] conditionalSect ::= includeSect || ignoreSect
1499 * [64] includeSect ::= '<![' %'INCLUDE' '[' (%markupdecl*)* ']]>'
1500 * [65] ignoreSect ::= '<![' %'IGNORE' '[' ignoreSectContents* ']]>'
1501 * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>'))
1502 * | ('<![' ignoreSectContents* ']]>')
1503 * | (Char - (']' | [<'"]))
1504 * | ('<!' (Char - ('-' | '[')))
1505 * *NOTE: the '<![' has already been read.
1506 * *TODO: verify that I am handling ignoreSectContents right.
1507 */
1508 void parseConditionalSect ()
1509 throws java.lang.Exception
1510 {
1511 skipWhitespace();
1512 if (tryRead("INCLUDE")) {
1513 skipWhitespace();
1514 require('[');
1515 skipWhitespace();
1516 while (!tryRead("]]>")) {
1517 parseMarkupdecl();
1518 skipWhitespace();
1519 }
1520 } else if (tryRead("IGNORE")) {
1521 skipWhitespace();
1522 require('[');
1523 int nesting = 1;
1524 char c;
1525 for (int nest = 1; nest > 0; ) {
1526 c = readCh();
1527 switch (c) {
1528 case '<':
1529 if (tryRead("![")) {
1530 nest++;
1531 }
1532 case ']':
1533 if (tryRead("]>")) {
1534 nest--;
1535 }
1536 }
1537 }
1538 } else {
1539 error("conditional section must begin with INCLUDE or IGNORE",
1540 null, null);
1541 }
1542 }
1543
1544
1545 /**
1546 * Read a character reference.
1547 * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
1548 * *NOTE: the '&#' has already been read.
1549 */
1550 void parseCharRef ()
1551 throws java.lang.Exception
1552 {
1553 int value = 0;
1554 char c;
1555
1556 if (tryRead('x')) {
1557 loop1: while (true) {
1558 c = readCh();
1559 switch (c) {
1560 case '0':
1561 case '1':
1562 case '2':
1563 case '3':
1564 case '4':
1565 case '5':
1566 case '6':
1567 case '7':
1568 case '8':
1569 case '9':
1570 case 'a':
1571 case 'A':
1572 case 'b':
1573 case 'B':
1574 case 'c':
1575 case 'C':
1576 case 'd':
1577 case 'D':
1578 case 'e':
1579 case 'E':
1580 case 'f':
1581 case 'F':
1582 value *= 16;
1583 value += Integer.parseInt(new Character(c).toString(), 16);
1584 break;
1585 case ';':
1586 break loop1;
1587 default:
1588 error("illegal character in character reference", c, null);
1589 break loop1;
1590 }
1591 }
1592 } else {
1593 loop2: while (true) {
1594 c = readCh();
1595 switch (c) {
1596 case '0':
1597 case '1':
1598 case '2':
1599 case '3':
1600 case '4':
1601 case '5':
1602 case '6':
1603 case '7':
1604 case '8':
1605 case '9':
1606 value *= 10;
1607 value += Integer.parseInt(new Character(c).toString(), 10);
1608 break;
1609 case ';':
1610 break loop2;
1611 default:
1612 error("illegal character in character reference", c, null);
1613 break loop2;
1614 }
1615 }
1616 }
1617
1618 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
1619 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
1620 if (value <= 0x0000ffff) {
1621 // no surrogates needed
1622 dataBufferAppend((char)value);
1623 } else if (value <= 0x000fffff) {
1624 // > 16 bits, surrogate needed
1625 dataBufferAppend((char)(0xd8 | ((value & 0x000ffc00) >> 10)));
1626 dataBufferAppend((char)(0xdc | (value & 0x0003ff)));
1627 } else {
1628 // too big for surrogate
1629 error("character reference " + value + " is too large for UTF-16",
1630 new Integer(value).toString(), null);
1631 }
1632 }
1633
1634
1635 /**
1636 * Parse a reference.
1637 * [69] EntityRef ::= '&' Name ';'
1638 * *NOTE: the '&' has already been read.
1639 * @param externalAllowed External entities are allowed here.
1640 */
1641 void parseEntityRef (boolean externalAllowed)
1642 throws java.lang.Exception
1643 {
1644 String name;
1645
<