1 /*
2 * Copyright 1998-2006 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26 package javax.swing.text.html.parser;
27
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.ChangedCharSetException;
import java.io.*;
import java.util.Hashtable;
import java.util.Properties;
import java.util.Vector;
import java.util.Enumeration;
import java.util.Locale;
import java.net.URL;
37
38 import sun.misc.MessageUtils;
39
40 /**
41 * A simple DTD-driven HTML parser. The parser reads an
42 * HTML file from an InputStream and calls various methods
43 * (which should be overridden in a subclass) when tags and
44 * data are encountered.
45 * <p>
46 * Unfortunately there are many badly implemented HTML parsers
47 * out there, and as a result there are many badly formatted
48 * HTML files. This parser attempts to parse most HTML files.
49 * This means that the implementation sometimes deviates from
50 * the SGML specification in favor of HTML.
51 * <p>
52 * The parser treats \r and \r\n as \n. Newlines after starttags
53 * and before end tags are ignored just as specified in the SGML/HTML
54 * specification.
55 * <p>
56 * The html spec does not specify how spaces are to be coalesced very well.
57 * Specifically, the following scenarios are not discussed (note that a
58 * space should be used here, but I am using &nbsp to force the space to
59 * be displayed):
60 * <p>
61 * '<b>blah <i> <strike> foo' which can be treated as:
62 * '<b>blah <i><strike>foo'
63 * <p>as well as:
64 * '<p><a href="xx"> <em>Using</em></a></p>'
65 * which appears to be treated as:
66 * '<p><a href="xx"><em>Using</em></a></p>'
67 * <p>
 * If <code>strict</code> is false, when a tag that breaks flow
 * (<code>TagElement.breaksFlow</code>) or trailing whitespace is
70 * encountered, all whitespace will be ignored until a non whitespace
71 * character is encountered. This appears to give behavior closer to
72 * the popular browsers.
73 *
74 * @see DTD
75 * @see TagElement
76 * @see SimpleAttributeSet
77 * @author Arthur van Hoff
78 * @author Sunita Mani
79 */
80 public
81 class Parser implements DTDConstants {
82
// Buffer of pending text (PCDATA) characters; the first textpos entries
// are valid. Flushed to handleText/handleTitle by handleText(TagElement).
private char text[] = new char[1024];
private int textpos = 0;
// Tag most recently recorded by handleText(TagElement) or startTag.
private TagElement last;
// True while a space is pending output (see handleText(TagElement)).
private boolean space;

// Scratch buffer filled by addString and drained by getString/getChars;
// the first strpos entries are valid.
private char str[] = new char[128];
private int strpos = 0;

// The DTD supplied to the constructor.
protected DTD dtd = null;

// The current input character; -1 is treated as end of input.
private int ch;
// Current line number, as reported by getCurrentLine().
private int ln;
// The character source being parsed.
private Reader in;

// Element of the tag most recently pushed by startTag, or the element on
// top of the stack after endTag (null when the stack is empty).
private Element recent;
// Linked stack of currently open tags; null when no tag is open.
private TagStack stack;
// Result of the last ignoreElement() check made by legalElementContext().
private boolean skipTag = false;
// NOTE(review): not referenced in the visible part of this file.
private TagElement lastFormSent = null;
// Attribute set exposed through getAttributes() and emptied by
// flushAttributes().
private SimpleAttributeSet attributes = new SimpleAttributeSet();

// State for <html>, <head> and <body>. Since people like to slap
// together HTML documents without thinking, occasionally they
// have multiple instances of these tags. These booleans track
// the first sightings of these tags so they can be safely ignored
// by the parser if repeated.
private boolean seenHtml = false;
private boolean seenHead = false;
private boolean seenBody = false;

/**
 * The html spec does not specify how spaces are coalesced very well.
 * If strict == false, ignoreSpace is used to try and mimic the behavior
 * of the popular browsers.
 * <p>
 * The problematic scenarios are:
 * '<b>blah <i> <strike> foo' which can be treated as:
 * '<b>blah <i><strike>foo'
 * as well as:
 * '<p><a href="xx"> <em>Using</em></a></p>'
 * which appears to be treated as:
 * '<p><a href="xx"><em>Using</em></a></p>'
 * <p>
 * When a tag that breaks flow, or trailing whitespace is encountered
 * ignoreSpace is set to true. From then on, all whitespace will be
 * ignored.
 * ignoreSpace will be set back to false the first time a
 * non whitespace character is encountered. This appears to give
 * behavior closer to the popular browsers.
 */
private boolean ignoreSpace;

/**
 * This flag determines whether or not the Parser will be strict
 * in enforcing SGML compatibility. If false, it will be lenient
 * with certain common classes of erroneous HTML constructs.
 * Strict or not, in either case an error will be recorded.
 *
 */
protected boolean strict = false;


/** Number of \r\n's encountered. */
private int crlfCount;
/** Number of \r's encountered. A \r\n will not increment this. */
private int crCount;
/** Number of \n's encountered. A \r\n will not increment this. */
private int lfCount;

//
// To correctly identify the start of a tag/comment/text we need two
// ivars. Two are needed as handleText isn't invoked until the tag
// after the text has been parsed, that is the parser parses the text,
// then a tag, then invokes handleText followed by handleStart.
//
/** The start position of the current block. Block is overloaded here,
 * it really means the current start position for the current comment,
 * tag, text. Use getBlockStartPosition to access this. */
private int currentBlockStartPos;
/** Start position of the last block. */
private int lastBlockStartPos;
163
/**
 * Array for mapping numeric references in the range
 * 130-159 to displayable Unicode characters. Entry <code>i</code>
 * holds the replacement for reference code <code>130 + i</code>
 * (see mapNumericReference).
 */
private static final char[] cp1252Map = {
    8218,  // 130 -> U+201A
    402,   // 131 -> U+0192
    8222,  // 132 -> U+201E
    8230,  // 133 -> U+2026
    8224,  // 134 -> U+2020
    8225,  // 135 -> U+2021
    710,   // 136 -> U+02C6
    8240,  // 137 -> U+2030
    352,   // 138 -> U+0160
    8249,  // 139 -> U+2039
    338,   // 140 -> U+0152
    141,   // 141 maps to itself
    142,   // 142 maps to itself
    143,   // 143 maps to itself
    144,   // 144 maps to itself
    8216,  // 145 -> U+2018
    8217,  // 146 -> U+2019
    8220,  // 147 -> U+201C
    8221,  // 148 -> U+201D
    8226,  // 149 -> U+2022
    8211,  // 150 -> U+2013
    8212,  // 151 -> U+2014
    732,   // 152 -> U+02DC
    8482,  // 153 -> U+2122
    353,   // 154 -> U+0161
    8250,  // 155 -> U+203A
    339,   // 156 -> U+0153
    157,   // 157 maps to itself
    158,   // 158 maps to itself
    376    // 159 -> U+0178
};
200
/**
 * Creates a parser that consults the given DTD.
 *
 * @param dtd the DTD stored in the <code>dtd</code> field
 */
public Parser(DTD dtd) {
    this.dtd = dtd;
}
204
205
206 /**
207 * @return the line number of the line currently being parsed
208 */
209 protected int getCurrentLine() {
210 return ln;
211 }
212
213 /**
214 * Returns the start position of the current block. Block is
215 * overloaded here, it really means the current start position for
216 * the current comment tag, text, block.... This is provided for
217 * subclassers that wish to know the start of the current block when
218 * called with one of the handleXXX methods.
219 */
220 int getBlockStartPosition() {
221 return Math.max(0, lastBlockStartPos - 1);
222 }
223
224 /**
225 * Makes a TagElement.
226 */
227 protected TagElement makeTag(Element elem, boolean fictional) {
228 return new TagElement(elem, fictional);
229 }
230
231 protected TagElement makeTag(Element elem) {
232 return makeTag(elem, false);
233 }
234
235 protected SimpleAttributeSet getAttributes() {
236 return attributes;
237 }
238
239 protected void flushAttributes() {
240 attributes.removeAttributes(attributes);
241 }
242
243 /**
244 * Called when PCDATA is encountered.
245 */
246 protected void handleText(char text[]) {
247 }
248
249 /**
250 * Called when an HTML title tag is encountered.
251 */
252 protected void handleTitle(char text[]) {
253 // default behavior is to call handleText. Subclasses
254 // can override if necessary.
255 handleText(text);
256 }
257
258 /**
259 * Called when an HTML comment is encountered.
260 */
261 protected void handleComment(char text[]) {
262 }
263
264 protected void handleEOFInComment() {
265 // We've reached EOF. Our recovery strategy is to
266 // see if we have more than one line in the comment;
267 // if so, we pretend that the comment was an unterminated
268 // single line comment, and reparse the lines after the
269 // first line as normal HTML content.
270
271 int commentEndPos = strIndexOf('\n');
272 if (commentEndPos >= 0) {
273 handleComment(getChars(0, commentEndPos));
274 try {
275 in.close();
276 in = new CharArrayReader(getChars(commentEndPos + 1));
277 ch = '>';
278 } catch (IOException e) {
279 error("ioexception");
280 }
281
282 resetStrBuffer();
283 } else {
284 // no newline, so signal an error
285 error("eof.comment");
286 }
287 }
288
289 /**
290 * Called when an empty tag is encountered.
291 */
292 protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
293 }
294
295 /**
296 * Called when a start tag is encountered.
297 */
298 protected void handleStartTag(TagElement tag) {
299 }
300
301 /**
302 * Called when an end tag is encountered.
303 */
304 protected void handleEndTag(TagElement tag) {
305 }
306
307 /**
308 * An error has occurred.
309 */
310 protected void handleError(int ln, String msg) {
311 /*
312 Thread.dumpStack();
313 System.out.println("**** " + stack);
314 System.out.println("line " + ln + ": error: " + msg);
315 System.out.println();
316 */
317 }
318
319 /**
320 * Output text.
321 */
322 void handleText(TagElement tag) {
323 if (tag.breaksFlow()) {
324 space = false;
325 if (!strict) {
326 ignoreSpace = true;
327 }
328 }
329 if (textpos == 0) {
330 if ((!space) || (stack == null) || last.breaksFlow() ||
331 !stack.advance(dtd.pcdata)) {
332 last = tag;
333 space = false;
334 lastBlockStartPos = currentBlockStartPos;
335 return;
336 }
337 }
338 if (space) {
339 if (!ignoreSpace) {
340 // enlarge buffer if needed
341 if (textpos + 1 > text.length) {
342 char newtext[] = new char[text.length + 200];
343 System.arraycopy(text, 0, newtext, 0, text.length);
344 text = newtext;
345 }
346
347 // output pending space
348 text[textpos++] = ' ';
349 if (!strict && !tag.getElement().isEmpty()) {
350 ignoreSpace = true;
351 }
352 }
353 space = false;
354 }
355 char newtext[] = new char[textpos];
356 System.arraycopy(text, 0, newtext, 0, textpos);
357 // Handles cases of bad html where the title tag
358 // was getting lost when we did error recovery.
359 if (tag.getElement().getName().equals("title")) {
360 handleTitle(newtext);
361 } else {
362 handleText(newtext);
363 }
364 lastBlockStartPos = currentBlockStartPos;
365 textpos = 0;
366 last = tag;
367 space = false;
368 }
369
370 /**
371 * Invoke the error handler.
372 */
373 protected void error(String err, String arg1, String arg2,
374 String arg3) {
375 handleError(ln, err + " " + arg1 + " " + arg2 + " " + arg3);
376 }
377
378 protected void error(String err, String arg1, String arg2) {
379 error(err, arg1, arg2, "?");
380 }
381 protected void error(String err, String arg1) {
382 error(err, arg1, "?", "?");
383 }
384 protected void error(String err) {
385 error(err, "?", "?", "?");
386 }
387
388
389 /**
390 * Handle a start tag. The new tag is pushed
391 * onto the tag stack. The attribute list is
392 * checked for required attributes.
393 */
394 protected void startTag(TagElement tag) throws ChangedCharSetException {
395 Element elem = tag.getElement();
396
397 // If the tag is an empty tag and texpos != 0
398 // this implies that there is text before the
399 // start tag that needs to be processed before
400 // handling the tag.
401 //
402 if (!elem.isEmpty() ||
403 ((last != null) && !last.breaksFlow()) ||
404 (textpos != 0)) {
405 handleText(tag);
406 } else {
407 // this variable gets updated in handleText().
408 // Since in this case we do not call handleText()
409 // we need to update it here.
410 //
411 last = tag;
412 // Note that we should really check last.breakFlows before
413 // assuming this should be false.
414 space = false;
415 }
416 lastBlockStartPos = currentBlockStartPos;
417
418 // check required attributes
419 for (AttributeList a = elem.atts ; a != null ; a = a.next) {
420 if ((a.modifier == REQUIRED) &&
421 ((attributes.isEmpty()) ||
422 ((!attributes.isDefined(a.name)) &&
423 (!attributes.isDefined(HTML.getAttributeKey(a.name)))))) {
424 error("req.att ", a.getName(), elem.getName());
425 }
426 }
427
428 if (elem.isEmpty()) {
429 handleEmptyTag(tag);
430 /*
431 } else if (elem.getName().equals("form")) {
432 handleStartTag(tag);
433 */
434 } else {
435 recent = elem;
436 stack = new TagStack(tag, stack);
437 handleStartTag(tag);
438 }
439 }
440
441 /**
442 * Handle an end tag. The end tag is popped
443 * from the tag stack.
444 */
445 protected void endTag(boolean omitted) {
446 handleText(stack.tag);
447
448 if (omitted && !stack.elem.omitEnd()) {
449 error("end.missing", stack.elem.getName());
450 } else if (!stack.terminate()) {
451 error("end.unexpected", stack.elem.getName());
452 }
453
454 // handle the tag
455 handleEndTag(stack.tag);
456 stack = stack.next;
457 recent = (stack != null) ? stack.elem : null;
458 }
459
460
461 boolean ignoreElement(Element elem) {
462
463 String stackElement = stack.elem.getName();
464 String elemName = elem.getName();
465 /* We ignore all elements that are not valid in the context of
466 a table except <td>, <th> (these we handle in
467 legalElementContext()) and #pcdata. We also ignore the
468 <font> tag in the context of <ul> and <ol> We additonally
469 ignore the <meta> and the <style> tag if the body tag has
470 been seen. **/
471 if ((elemName.equals("html") && seenHtml) ||
472 (elemName.equals("head") && seenHead) ||
473 (elemName.equals("body") && seenBody)) {
474 return true;
475 }
476 if (elemName.equals("dt") || elemName.equals("dd")) {
477 TagStack s = stack;
478 while (s != null && !s.elem.getName().equals("dl")) {
479 s = s.next;
480 }
481 if (s == null) {
482 return true;
483 }
484 }
485
486 if (((stackElement.equals("table")) &&
487 (!elemName.equals("#pcdata")) && (!elemName.equals("input"))) ||
488 ((elemName.equals("font")) &&
489 (stackElement.equals("ul") || stackElement.equals("ol"))) ||
490 (elemName.equals("meta") && stack != null) ||
491 (elemName.equals("style") && seenBody) ||
492 (stackElement.equals("table") && elemName.equals("a"))) {
493 return true;
494 }
495 return false;
496 }
497
498
499 /**
500 * Marks the first time a tag has been seen in a document
501 */
502
503 protected void markFirstTime(Element elem) {
504 String elemName = elem.getName();
505 if (elemName.equals("html")) {
506 seenHtml = true;
507 } else if (elemName.equals("head")) {
508 seenHead = true;
509 } else if (elemName.equals("body")) {
510 if (buf.length == 1) {
511 // Refer to note in definition of buf for details on this.
512 char[] newBuf = new char[256];
513
514 newBuf[0] = buf[0];
515 buf = newBuf;
516 }
517 seenBody = true;
518 }
519 }
520
521 /**
522 * Create a legal content for an element.
523 */
524 boolean legalElementContext(Element elem) throws ChangedCharSetException {
525
526 // System.out.println("-- legalContext -- " + elem);
527
528 // Deal with the empty stack
529 if (stack == null) {
530 // System.out.println("-- stack is empty");
531 if (elem != dtd.html) {
532 // System.out.println("-- pushing html");
533 startTag(makeTag(dtd.html, true));
534 return legalElementContext(elem);
535 }
536 return true;
537 }
538
539 // Is it allowed in the current context
540 if (stack.advance(elem)) {
541 // System.out.println("-- legal context");
542 markFirstTime(elem);
543 return true;
544 }
545 boolean insertTag = false;
546
547 // The use of all error recovery strategies are contingent
548 // on the value of the strict property.
549 //
550 // These are commonly occuring errors. if insertTag is true,
551 // then we want to adopt an error recovery strategy that
552 // involves attempting to insert an additional tag to
553 // legalize the context. The two errors addressed here
554 // are:
555 // 1) when a <td> or <th> is seen soon after a <table> tag.
556 // In this case we insert a <tr>.
557 // 2) when any other tag apart from a <tr> is seen
558 // in the context of a <tr>. In this case we would
559 // like to add a <td>. If a <tr> is seen within a
560 // <tr> context, then we will close out the current
561 // <tr>.
562 //
563 // This insertion strategy is handled later in the method.
564 // The reason for checking this now, is that in other cases
565 // we would like to apply other error recovery strategies for example
566 // ignoring tags.
567 //
568 // In certain cases it is better to ignore a tag than try to
569 // fix the situation. So the first test is to see if this
570 // is what we need to do.
571 //
572 String stackElemName = stack.elem.getName();
573 String elemName = elem.getName();
574
575
576 if (!strict &&
577 ((stackElemName.equals("table") && elemName.equals("td")) ||
578 (stackElemName.equals("table") && elemName.equals("th")) ||
579 (stackElemName.equals("tr") && !elemName.equals("tr")))){
580 insertTag = true;
581 }
582
583
584 if (!strict && !insertTag && (stack.elem.getName() != elem.getName() ||
585 elem.getName().equals("body"))) {
586 if (skipTag = ignoreElement(elem)) {
587 error("tag.ignore", elem.getName());
588 return skipTag;
589 }
590 }
591
592 // Check for anything after the start of the table besides tr, td, th
593 // or caption, and if those aren't there, insert the <tr> and call
594 // legalElementContext again.
595 if (!strict && stackElemName.equals("table") &&
596 !elemName.equals("tr") && !elemName.equals("td") &&
597 !elemName.equals("th") && !elemName.equals("caption")) {
598 Element e = dtd.getElement("tr");
599 TagElement t = makeTag(e, true);
600 legalTagContext(t);
601 startTag(t);
602 error("start.missing", elem.getName());
603 return legalElementContext(elem);
604 }
605
606 // They try to find a legal context by checking if the current
607 // tag is valid in an enclosing context. If so
608 // close out the tags by outputing end tags and then
609 // insert the curent tag. If the tags that are
610 // being closed out do not have an optional end tag
611 // specification in the DTD then an html error is
612 // reported.
613 //
614 if (!insertTag && stack.terminate() && (!strict || stack.elem.omitEnd())) {
615 for (TagStack s = stack.next ; s != null ; s = s.next) {
616 if (s.advance(elem)) {
617 while (stack != s) {
618 endTag(true);
619 }
620 return true;
621 }
622 if (!s.terminate() || (strict && !s.elem.omitEnd())) {
623 break;
624 }
625 }
626 }
627
628 // Check if we know what tag is expected next.
629 // If so insert the tag. Report an error if the
630 // tag does not have its start tag spec in the DTD as optional.
631 //
632 Element next = stack.first();
633 if (next != null && (!strict || next.omitStart()) &&
634 !(next==dtd.head && elem==dtd.pcdata) ) {
635 // System.out.println("-- omitting start tag: " + next);
636 TagElement t = makeTag(next, true);
637 legalTagContext(t);
638 startTag(t);
639 if (!next.omitStart()) {
640 error("start.missing", elem.getName());
641 }
642 return legalElementContext(elem);
643 }
644
645
646 // Traverse the list of expected elements and determine if adding
647 // any of these elements would make for a legal context.
648 //
649
650 if (!strict) {
651 ContentModel content = stack.contentModel();
652 Vector elemVec = new Vector();
653 if (content != null) {
654 content.getElements(elemVec);
655 for (Enumeration v = elemVec.elements(); v.hasMoreElements();) {
656 Element e = (Element)v.nextElement();
657
658 // Ensure that this element has not been included as
659 // part of the exclusions in the DTD.
660 //
661 if (stack.excluded(e.getIndex())) {
662 continue;
663 }
664
665 boolean reqAtts = false;
666
667 for (AttributeList a = e.getAttributes(); a != null ; a = a.next) {
668 if (a.modifier == REQUIRED) {
669 reqAtts = true;
670 break;
671 }
672 }
673 // Ensure that no tag that has required attributes
674 // gets inserted.
675 //
676 if (reqAtts) {
677 continue;
678 }
679
680 ContentModel m = e.getContent();
681 if (m != null && m.first(elem)) {
682 // System.out.println("-- adding a legal tag: " + e);
683 TagElement t = makeTag(e, true);
684 legalTagContext(t);
685 startTag(t);
686 error("start.missing", e.getName());
687 return legalElementContext(elem);
688 }
689 }
690 }
691 }
692
693 // Check if the stack can be terminated. If so add the appropriate
694 // end tag. Report an error if the tag being ended does not have its
695 // end tag spec in the DTD as optional.
696 //
697 if (stack.terminate() && (stack.elem != dtd.body) && (!strict || stack.elem.omitEnd())) {
698 // System.out.println("-- omitting end tag: " + stack.elem);
699 if (!stack.elem.omitEnd()) {
700 error("end.missing", elem.getName());
701 }
702
703 endTag(true);
704 return legalElementContext(elem);
705 }
706
707 // At this point we know that something is screwed up.
708 return false;
709 }
710
711 /**
712 * Create a legal context for a tag.
713 */
714 void legalTagContext(TagElement tag) throws ChangedCharSetException {
715 if (legalElementContext(tag.getElement())) {
716 markFirstTime(tag.getElement());
717 return;
718 }
719
720 // Avoid putting a block tag in a flow tag.
721 if (tag.breaksFlow() && (stack != null) && !stack.tag.breaksFlow()) {
722 endTag(true);
723 legalTagContext(tag);
724 return;
725 }
726
727 // Avoid putting something wierd in the head of the document.
728 for (TagStack s = stack ; s != null ; s = s.next) {
729 if (s.tag.getElement() == dtd.head) {
730 while (stack != s) {
731 endTag(true);
732 }
733 endTag(true);
734 legalTagContext(tag);
735 return;
736 }
737 }
738
739 // Everything failed
740 error("tag.unexpected", tag.getElement().getName());
741 }
742
743 /**
744 * Error context. Something went wrong, make sure we are in
745 * the document's body context
746 */
747 void errorContext() throws ChangedCharSetException {
748 for (; (stack != null) && (stack.tag.getElement() != dtd.body) ; stack = stack.next) {
749 handleEndTag(stack.tag);
750 }
751 if (stack == null) {
752 legalElementContext(dtd.body);
753 startTag(makeTag(dtd.body, true));
754 }
755 }
756
757 /**
758 * Add a char to the string buffer.
759 */
760 void addString(int c) {
761 if (strpos == str.length) {
762 char newstr[] = new char[str.length + 128];
763 System.arraycopy(str, 0, newstr, 0, str.length);
764 str = newstr;
765 }
766 str[strpos++] = (char)c;
767 }
768
769 /**
770 * Get the string that's been accumulated.
771 */
772 String getString(int pos) {
773 char newStr[] = new char[strpos - pos];
774 System.arraycopy(str, pos, newStr, 0, strpos - pos);
775 strpos = pos;
776 return new String(newStr);
777 }
778
779 char[] getChars(int pos) {
780 char newStr[] = new char[strpos - pos];
781 System.arraycopy(str, pos, newStr, 0, strpos - pos);
782 strpos = pos;
783 return newStr;
784 }
785
786 char[] getChars(int pos, int endPos) {
787 char newStr[] = new char[endPos - pos];
788 System.arraycopy(str, pos, newStr, 0, endPos - pos);
789 // REMIND: it's not clear whether this version should set strpos or not
790 // strpos = pos;
791 return newStr;
792 }
793
794 void resetStrBuffer() {
795 strpos = 0;
796 }
797
798 int strIndexOf(char target) {
799 for (int i = 0; i < strpos; i++) {
800 if (str[i] == target) {
801 return i;
802 }
803 }
804
805 return -1;
806 }
807
808 /**
809 * Skip space.
810 * [5] 297:5
811 */
812 void skipSpace() throws IOException {
813 while (true) {
814 switch (ch) {
815 case '\n':
816 ln++;
817 ch = readCh();
818 lfCount++;
819 break;
820
821 case '\r':
822 ln++;
823 if ((ch = readCh()) == '\n') {
824 ch = readCh();
825 crlfCount++;
826 }
827 else {
828 crCount++;
829 }
830 break;
831 case ' ':
832 case '\t':
833 ch = readCh();
834 break;
835
836 default:
837 return;
838 }
839 }
840 }
841
842 /**
843 * Parse identifier. Uppercase characters are folded
844 * to lowercase when lower is true. Returns falsed if
845 * no identifier is found. [55] 346:17
846 */
847 boolean parseIdentifier(boolean lower) throws IOException {
848 switch (ch) {
849 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
850 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
851 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
852 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
853 case 'Y': case 'Z':
854 if (lower) {
855 ch = 'a' + (ch - 'A');
856 }
857
858 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
859 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
860 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
861 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
862 case 'y': case 'z':
863 break;
864
865 default:
866 return false;
867 }
868
869 while (true) {
870 addString(ch);
871
872 switch (ch = readCh()) {
873 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
874 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
875 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
876 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
877 case 'Y': case 'Z':
878 if (lower) {
879 ch = 'a' + (ch - 'A');
880 }
881
882 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
883 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
884 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
885 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
886 case 'y': case 'z':
887
888 case '0': case '1': case '2': case '3': case '4':
889 case '5': case '6': case '7': case '8': case '9':
890
891 case '.': case '-':
892
893 case '_': // not officially allowed
894 break;
895
896 default:
897 return true;
898 }
899 }
900 }
901
902 /**
903 * Parse an entity reference. [59] 350:17
904 */
905 private char[] parseEntityReference() throws IOException {
906 int pos = strpos;
907
908 if ((ch = readCh()) == '#') {
909 int n = 0;
910 ch = readCh();
911 if ((ch >= '0') && (ch <= '9') ||
912 ch == 'x' || ch == 'X') {
913
914 if ((ch >= '0') && (ch <= '9')) {
915 // parse decimal reference
916 while ((ch >= '0') && (ch <= '9')) {
917 n = (n * 10) + ch - '0';
918 ch = readCh();
919 }
920 } else {
921 // parse hexadecimal reference
922 ch = readCh();
923 char lch = (char) Character.toLowerCase(ch);
924 while ((lch >= '0') && (lch <= '9') ||
925 (lch >= 'a') && (lch <= 'f')) {
926 if (lch >= '0' && lch <= '9') {
927 n = (n * 16) + lch - '0';
928 } else {
929 n = (n * 16) + lch - 'a' + 10;
930 }
931 ch = readCh();
932 lch = (char) Character.toLowerCase(ch);
933 }
934 }
935 switch (ch) {
936 case '\n':
937 ln++;
938 ch = readCh();
939 lfCount++;
940 break;
941
942 case '\r':
943 ln++;
944 if ((ch = readCh()) == '\n') {
945 ch = readCh();
946 crlfCount++;
947 }
948 else {
949 crCount++;
950 }
951 break;
952
953 case ';':
954 ch = readCh();
955 break;
956 }
957 char data[] = {mapNumericReference((char) n)};
958 return data;
959 }
960 addString('#');
961 if (!parseIdentifier(false)) {
962 error("ident.expected");
963 strpos = pos;
964 char data[] = {'&', '#'};
965 return data;
966 }
967 } else if (!parseIdentifier(false)) {
968 char data[] = {'&'};
969 return data;
970 }
971 switch (ch) {
972 case '\n':
973 ln++;
974 ch = readCh();
975 lfCount++;
976 break;
977
978 case '\r':
979 ln++;
980 if ((ch = readCh()) == '\n') {
981 ch = readCh();
982 crlfCount++;
983 }
984 else {
985 crCount++;
986 }
987 break;
988
989 case ';':
990 ch = readCh();
991 break;
992 }
993
994 String nm = getString(pos);
995 Entity ent = dtd.getEntity(nm);
996
997 // entities are case sensitive - however if strict
998 // is false then we will try to make a match by
999 // converting the string to all lowercase.
1000 //
1001 if (!strict && (ent == null)) {
1002 ent = dtd.getEntity(nm.toLowerCase());
1003 }
1004 if ((ent == null) || !ent.isGeneral()) {
1005
1006 if (nm.length() == 0) {
1007 error("invalid.entref", nm);
1008 return new char[0];
1009 }
1010 /* given that there is not a match restore the entity reference */
1011 String str = "&" + nm + ";";
1012
1013 char b[] = new char[str.length()];
1014 str.getChars(0, b.length, b, 0);
1015 return b;
1016 }
1017 return ent.getData();
1018 }
1019
1020 /**
1021 * Converts numeric character reference to Unicode character.
1022 *
1023 * Normally the code in a reference should be always converted
1024 * to the Unicode character with the same code, but due to
1025 * wide usage of Cp1252 charset most browsers map numeric references
1026 * in the range 130-159 (which are control chars in Unicode set)
1027 * to displayable characters with other codes.
1028 *
1029 * @param c the code of numeric character reference.
1030 * @return the character corresponding to the reference code.
1031 */
1032 private char mapNumericReference(char c) {
1033 if (c < 130 || c > 159) {
1034 return c;
1035 }
1036 return cp1252Map[c - 130];
1037 }
1038
1039 /**
1040 * Parse a comment. [92] 391:7
1041 */
1042 void parseComment() throws IOException {
1043
1044 while (true) {
1045 int c = ch;
1046 switch (c) {
1047 case '-':
1048 /** Presuming that the start string of a comment "<!--" has
1049 already been parsed, the '-' character is valid only as
1050 part of a comment termination and further more it must
1051 be present in even numbers. Hence if strict is true, we
1052 presume the comment has been terminated and return.
1053 However if strict is false, then there is no even number
1054 requirement and this character can appear anywhere in the
1055 comment. The parser reads on until it sees the following
1056 pattern: "-->" or "--!>".
1057 **/
1058 if (!strict && (strpos != 0) && (str[strpos - 1] == '-')) {
1059 if ((ch = readCh()) == '>') {
1060 return;
1061 }
1062 if (ch == '!') {
1063 if ((ch = readCh()) == '>') {
1064 return;
1065 } else {
1066 /* to account for extra read()'s that happened */
1067 addString('-');
1068 addString('!');
1069 continue;
1070 }
1071 }
1072 break;
1073 }
1074
1075 if ((ch = readCh()) == '-') {
1076 ch = readCh();
1077 if (strict || ch == '>') {
1078 return;
1079 }
1080 if (ch == '!') {
1081 if ((ch = readCh()) == '>') {
1082 return;
1083 } else {
1084 /* to account for extra read()'s that happened */
1085 addString('-');
1086 addString('!');
1087 continue;
1088 }
1089 }
1090 /* to account for the extra read() */
1091 addString('-');
1092 }
1093 break;
1094
1095 case -1:
1096 handleEOFInComment();
1097 return;
1098
1099 case '\n':
1100 ln++;
1101 ch = readCh();
1102 lfCount++;
1103 break;
1104
1105 case '>':
1106 ch = readCh();
1107 break;
1108
1109 case '\r':
1110 ln++;
1111 if ((ch = readCh()) == '\n') {
1112 ch = readCh();
1113 crlfCount++;
1114 }
1115 else {
1116 crCount++;
1117 }
1118 c = '\n';
1119 break;
1120 default:
1121 ch = readCh();
1122 break;
1123 }
1124
1125 addString(c);
1126 }
1127 }
1128
1129 /**
1130 * Parse literal content. [46] 343:1 and [47] 344:1
1131 */
1132 void parseLiteral(boolean replace) throws IOException {
1133 while (true) {
1134 int c = ch;
1135 switch (c) {
1136 case -1:
1137 error("eof.literal", stack.elem.getName());
1138 endTag(true);
1139 return;
1140
1141 case '>':
1142 ch = readCh();
1143 int i = textpos - (stack.elem.name.length() + 2), j = 0;
1144
1145 // match end tag
1146 if ((i >= 0) && (text[i++] == '<') && (text[i] == '/')) {
1147 while ((++i < textpos) &&
1148 (Character.toLowerCase(text[i]) == stack.elem.name.charAt(j++)));
1149 if (i == textpos) {
1150 textpos -= (stack.elem.name.length() + 2);
1151 if ((textpos > 0) && (text[textpos-1] == '\n')) {
1152 textpos--;
1153 }
1154 endTag(false);
1155 return;
1156 }
1157 }
1158 break;
1159
1160 case '&':
1161 char data[] = parseEntityReference();
1162 if (textpos + data.length > text.length) {
1163 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
1164 System.arraycopy(text, 0, newtext, 0, text.length);
1165 text = newtext;
1166 }
1167 System.arraycopy(data, 0, text, textpos, data.length);
1168 textpos += data.length;
1169 continue;
1170
1171 case '\n':
1172 ln++;
1173 ch = readCh();
1174 lfCount++;
1175 break;
1176
1177 case '\r':
1178 ln++;
1179 if ((ch = readCh()) == '\n') {
1180 ch = readCh();
1181 crlfCount++;
1182 }
1183 else {
1184 crCount++;
1185 }
1186 c = '\n';
1187 break;
1188 default:
1189 ch = readCh();
1190 break;
1191 }
1192
1193 // output character
1194 if (textpos == text.length) {
1195 char newtext[] = new char[text.length + 128];
1196 System.arraycopy(text, 0, newtext, 0, text.length);
1197 text = newtext;
1198 }
1199 text[textpos++] = (char)c;
1200 }
1201 }
1202
1203 /**
1204 * Parse attribute value. [33] 331:1
1205 */
1206 String parseAttributeValue(boolean lower) throws IOException {
1207 int delim = -1;
1208
1209 // Check for a delimiter
1210 switch(ch) {
1211 case '\'':
1212 case '"':
1213 delim = ch;
1214 ch = readCh();
1215 break;
1216 }
1217
1218 // Parse the rest of the value
1219 while (true) {
1220 int c = ch;
1221
1222 switch (c) {
1223 case '\n':
1224 ln++;
1225 ch = readCh();
1226 lfCount++;
1227 if (delim < 0) {
1228 return getString(0);
1229 }
1230 break;
1231
1232 case '\r':
1233 ln++;
1234
1235 if ((ch = readCh()) == '\n') {
1236 ch = readCh();
1237 crlfCount++;
1238 }
1239 else {
1240 crCount++;
1241 }
1242 if (delim < 0) {
1243 return getString(0);
1244 }
1245 break;
1246
1247 case '\t':
1248 if (delim < 0)
1249 c = ' ';
1250 case ' ':
1251 ch = readCh();
1252 if (delim < 0) {
1253 return getString(0);
1254 }
1255 break;
1256
1257 case '>':
1258 case '<':
1259 if (delim < 0) {
1260 return getString(0);
1261 }
1262 ch = readCh();
1263 break;
1264
1265 case '\'':
1266 case '"':
1267 ch = readCh();
1268 if (c == delim) {
1269 return getString(0);
1270 } else if (delim == -1) {
1271 error("attvalerr");
1272 if (strict || ch == ' ') {
1273 return getString(0);
1274 } else {
1275 continue;
1276 }
1277 }
1278 break;
1279
1280 case '=':
1281 if (delim < 0) {
1282 /* In SGML a construct like <img src=/cgi-bin/foo?x=1>
1283 is considered invalid since an = sign can only be contained
1284 in an attributes value if the string is quoted.
1285 */
1286 error("attvalerr");
1287 /* If strict is true then we return with the string we have thus far.
1288 Otherwise we accept the = sign as part of the attribute's value and
1289 process the rest of the img tag. */
1290 if (strict) {
1291 return getString(0);
1292 }
1293 }
1294 ch = readCh();
1295 break;
1296
1297 case '&':
1298 if (strict && delim < 0) {
1299 ch = readCh();
1300 break;
1301 }
1302
1303 char data[] = parseEntityReference();
1304 for (int i = 0 ; i < data.length ; i++) {
1305 c = data[i];
1306 addString((lower && (c >= 'A') && (c <= 'Z')) ? 'a' + c - 'A' : c);
1307 }
1308 continue;
1309
1310 case -1:
1311 return getString(0);
1312
1313 default:
1314 if (lower && (c >= 'A') && (c <= 'Z')) {
1315 c = 'a' + c - 'A';
1316 }
1317 ch = readCh();
1318 break;
1319 }
1320 addString(c);
1321 }
1322 }
1323
1324
1325 /**
1326 * Parse attribute specification List. [31] 327:17
1327 */
1328 void parseAttributeSpecificationList(Element elem) throws IOException {
1329
1330 while (true) {
1331 skipSpace();
1332
1333 switch (ch) {
1334 case '/':
1335 case '>':
1336 case '<':
1337 case -1:
1338 return;
1339
1340 case '-':
1341 if ((ch = readCh()) == '-') {
1342 ch = readCh();
1343 parseComment();
1344 strpos = 0;
1345 } else {
1346 error("invalid.tagchar", "-", elem.getName());
1347 ch = readCh();
1348 }
1349 continue;
1350 }
1351
1352 AttributeList att = null;
1353 String attname = null;
1354 String attvalue = null;
1355
1356 if (parseIdentifier(true)) {
1357 attname = getString(0);
1358 skipSpace();
1359 if (ch == '=') {
1360 ch = readCh();
1361 skipSpace();
1362 att = elem.getAttribute(attname);
1363 // Bug ID 4102750
1364 // Load the NAME of an Attribute Case Sensitive
1365 // The case of the NAME must be intact
1366 // MG 021898
1367 attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION) && (att.type != NAME));
1368 // attvalue = parseAttributeValue((att != null) && (att.type != CDATA) && (att.type != NOTATION));
1369 } else {
1370 attvalue = attname;
1371 att = elem.getAttributeByValue(attvalue);
1372 if (att == null) {
1373 att = elem.getAttribute(attname);
1374 if (att != null) {
1375 attvalue = att.getValue();
1376 }
1377 else {
1378 // Make it null so that NULL_ATTRIBUTE_VALUE is
1379 // used
1380 attvalue = null;
1381 }
1382 }
1383 }
1384 } else if (!strict && ch == ',') { // allows for comma separated attribute-value pairs
1385 ch = readCh();
1386 continue;
1387 } else if (!strict && ch == '"') { // allows for quoted attributes
1388 ch = readCh();
1389 skipSpace();
1390 if (parseIdentifier(true)) {
1391 attname = getString(0);
1392 if (ch == '"') {
1393 ch = readCh();
1394 }
1395 skipSpace();
1396 if (ch == '=') {
1397 ch = readCh();
1398 skipSpace();
1399 att = elem.getAttribute(attname);
1400 attvalue = parseAttributeValue((att != null) &&
1401 (att.type != CDATA) &&
1402 (att.type != NOTATION));
1403 } else {
1404 attvalue = attname;
1405 att = elem.getAttributeByValue(attvalue);
1406 if (att == null) {
1407 att = elem.getAttribute(attname);
1408 if (att != null) {
1409 attvalue = att.getValue();
1410 }
1411 }
1412 }
1413 } else {
1414 char str[] = {(char)ch};
1415 error("invalid.tagchar", new String(str), elem.getName());
1416 ch = readCh();
1417 continue;
1418 }
1419 } else if (!strict && (attributes.isEmpty()) && (ch == '=')) {
1420 ch = readCh();
1421 skipSpace();
1422 attname = elem.getName();
1423 att = elem.getAttribute(attname);
1424 attvalue = parseAttributeValue((att != null) &&
1425 (att.type != CDATA) &&
1426 (att.type != NOTATION));
1427 } else if (!strict && (ch == '=')) {
1428 ch = readCh();
1429 skipSpace();
1430 attvalue = parseAttributeValue(true);
1431 error("attvalerr");
1432 return;
1433 } else {
1434 char str[] = {(char)ch};
1435 error("invalid.tagchar", new String(str), elem.getName());
1436 if (!strict) {
1437 ch = readCh();
1438 continue;
1439 } else {
1440 return;
1441 }
1442 }
1443
1444 if (att != null) {
1445 attname = att.getName();
1446 } else {
1447 error("invalid.tagatt", attname, elem.getName());
1448 }
1449
1450 // Check out the value
1451 if (attributes.isDefined(attname)) {
1452 error("multi.tagatt", attname, elem.getName());
1453 }
1454 if (attvalue == null) {
1455 attvalue = ((att != null) && (att.value != null)) ? att.value :
1456 HTML.NULL_ATTRIBUTE_VALUE;
1457 } else if ((att != null) && (att.values != null) && !att.values.contains(attvalue)) {
1458 error("invalid.tagattval", attname, elem.getName());
1459 }
1460 HTML.Attribute attkey = HTML.getAttributeKey(attname);
1461 if (attkey == null) {
1462 attributes.addAttribute(attname, attvalue);
1463 } else {
1464 attributes.addAttribute(attkey, attvalue);
1465 }
1466 }
1467 }
1468
1469 /**
1470 * Parses th Document Declaration Type markup declaration.
1471 * Currently ignores it.
1472 */
1473 public String parseDTDMarkup() throws IOException {
1474
1475 StringBuffer strBuff = new StringBuffer();
1476 ch = readCh();
1477 while(true) {
1478 switch (ch) {
1479 case '>':
1480 ch = readCh();
1481 return strBuff.toString();
1482 case -1:
1483 error("invalid.markup");
1484 return strBuff.toString();
1485 case '\n':
1486 ln++;
1487 ch = readCh();
1488 lfCount++;
1489 break;
1490 case '"':
1491 ch = readCh();
1492 break;
1493 case '\r':
1494 ln++;
1495 if ((ch = readCh()) == '\n') {
1496 ch = readCh();
1497 crlfCount++;
1498 }
1499 else {
1500 crCount++;
1501 }
1502 break;
1503 default:
1504 strBuff.append((char)(ch & 0xFF));
1505 ch = readCh();
1506 break;
1507 }
1508 }
1509 }
1510
1511 /**
1512 * Parse markup declarations.
1513 * Currently only handles the Document Type Declaration markup.
1514 * Returns true if it is a markup declaration false otherwise.
1515 */
1516 protected boolean parseMarkupDeclarations(StringBuffer strBuff) throws IOException {
1517
1518 /* Currently handles only the DOCTYPE */
1519 if ((strBuff.length() == "DOCTYPE".length()) &&
1520 (strBuff.toString().toUpperCase().equals("DOCTYPE"))) {
1521 parseDTDMarkup();
1522 return true;
1523 }
1524 return false;
1525 }
1526
1527 /**
1528 * Parse an invalid tag.
1529 */
1530 void parseInvalidTag() throws IOException {
1531 // ignore all data upto the close bracket '>'
1532 while (true) {
1533 skipSpace();
1534 switch (ch) {
1535 case '>':
1536 case -1:
1537 ch = readCh();
1538 return;
1539 case '<':
1540 return;
1541 default:
1542 ch = readCh();
1543
1544 }
1545 }
1546 }
1547
1548 /**
1549 * Parse a start or end tag.
1550 */
1551 void parseTag() throws IOException {
1552 Element elem = null;
1553 boolean net = false;
1554 boolean warned = false;
1555 boolean unknown = false;
1556
1557 switch (ch = readCh()) {
1558 case '!':
1559 switch (ch = readCh()) {
1560 case '-':
1561 // Parse comment. [92] 391:7
1562 while (true) {
1563 if (ch == '-') {
1564 if (!strict || ((ch = readCh()) == '-')) {
1565 ch = readCh();
1566 if (!strict && ch == '-') {
1567 ch = readCh();
1568 }
1569 // send over any text you might see
1570 // before parsing and sending the
1571 // comment
1572 if (textpos != 0) {
1573 char newtext[] = new char[textpos];
1574 System.arraycopy(text, 0, newtext, 0, textpos);
1575 handleText(newtext);
1576 lastBlockStartPos = currentBlockStartPos;
1577 textpos = 0;
1578 }
1579 parseComment();
1580 last = makeTag(dtd.getElement("comment"), true);
1581 handleComment(getChars(0));
1582 continue;
1583 } else if (!warned) {
1584 warned = true;
1585 error("invalid.commentchar", "-");
1586 }
1587 }
1588 skipSpace();
1589 switch (ch) {
1590 case '-':
1591 continue;
1592 case '>':
1593 ch = readCh();
1594 case -1:
1595 return;
1596 default:
1597 ch = readCh();
1598 if (!warned) {
1599 warned = true;
1600 error("invalid.commentchar",
1601 String.valueOf((char)ch));
1602 }
1603 break;
1604 }
1605 }
1606
1607 default:
1608 // deal with marked sections
1609 StringBuffer strBuff = new StringBuffer();
1610 while (true) {
1611 strBuff.append((char)ch);
1612 if (parseMarkupDeclarations(strBuff)) {
1613 return;
1614 }
1615 switch(ch) {
1616 case '>':
1617 ch = readCh();
1618 case -1:
1619 error("invalid.markup");
1620 return;
1621 case '\n':
1622 ln++;
1623 ch = readCh();
1624 lfCount++;
1625 break;
1626 case '\r':
1627 ln++;
1628 if ((ch = readCh()) == '\n') {
1629 ch = readCh();
1630 crlfCount++;
1631 }
1632 else {
1633 crCount++;
1634 }
1635 break;
1636
1637 default:
1638 ch = readCh();
1639 break;
1640 }
1641 }
1642 }
1643
1644 case '/':
1645 // parse end tag [19] 317:4
1646 switch (ch = readCh()) {
1647 case '>':
1648 ch = readCh();
1649 case '<':
1650 // empty end tag. either </> or </<
1651 if (recent == null) {
1652 error("invalid.shortend");
1653 return;
1654 }
1655 elem = recent;
1656 break;
1657
1658 default:
1659 if (!parseIdentifier(true)) {
1660 error("expected.endtagname");
1661 return;
1662 }
1663 skipSpace();
1664 switch (ch) {
1665 case '>':
1666 ch = readCh();
1667 case '<':
1668 break;
1669
1670 default:
1671 error("expected", "'>'");
1672 while ((ch != -1) && (ch != '\n') && (ch != '>')) {
1673 ch = readCh();
1674 }
1675 if (ch == '>') {
1676 ch = readCh();
1677 }
1678 break;
1679 }
1680 String elemStr = getString(0);
1681 if (!dtd.elementExists(elemStr)) {
1682 error("end.unrecognized", elemStr);
1683 // Ignore RE before end tag
1684 if ((textpos > 0) && (text[textpos-1] == '\n')) {
1685 textpos--;
1686 }
1687 elem = dtd.getElement("unknown");
1688 elem.name = elemStr;
1689 unknown = true;
1690 } else {
1691 elem = dtd.getElement(elemStr);
1692 }
1693 break;
1694 }
1695
1696
1697 // If the stack is null, we're seeing end tags without any begin
1698 // tags. Ignore them.
1699
1700 if (stack == null) {
1701 error("end.extra.tag", elem.getName());
1702 return;
1703 }
1704
1705 // Ignore RE before end tag
1706 if ((textpos > 0) && (text[textpos-1] == '\n')) {
1707 // In a pre tag, if there are blank lines
1708 // we do not want to remove the newline
1709 // before the end tag. Hence this code.
1710 //
1711 if (stack.pre) {
1712 if ((textpos > 1) && (text[textpos-2] != '\n')) {
1713 textpos--;
1714 }
1715 } else {
1716 textpos--;
1717 }
1718 }
1719
1720 // If the end tag is a form, since we did not put it
1721 // on the tag stack, there is no corresponding start
1722 // start tag to find. Hence do not touch the tag stack.
1723 //
1724
1725 /*
1726 if (!strict && elem.getName().equals("form")) {
1727 if (lastFormSent != null) {
1728 handleEndTag(lastFormSent);
1729 return;
1730 } else {
1731 // do nothing.
1732 return;
1733 }
1734 }
1735 */
1736
1737 if (unknown) {
1738 // we will not see a corresponding start tag
1739 // on the the stack. If we are seeing an
1740 // end tag, lets send this on as an empty
1741 // tag with the end tag attribute set to
1742 // true.
1743 TagElement t = makeTag(elem);
1744 handleText(t);
1745 attributes.addAttribute(HTML.Attribute.ENDTAG, "true");
1746 handleEmptyTag(makeTag(elem));
1747 unknown = false;
1748 return;
1749 }
1750
1751 // find the corresponding start tag
1752
1753 // A commonly occuring error appears to be the insertion
1754 // of extra end tags in a table. The intent here is ignore
1755 // such extra end tags.
1756 //
1757 if (!strict) {
1758 String stackElem = stack.elem.getName();
1759
1760 if (stackElem.equals("table")) {
1761 // If it isnt a valid end tag ignore it and return
1762 //
1763 if (!elem.getName().equals(stackElem)) {
1764 error("tag.ignore", elem.getName());
1765 return;
1766 }
1767 }
1768
1769
1770
1771 if (stackElem.equals("tr") ||
1772 stackElem.equals("td")) {
1773 if ((!elem.getName().equals("table")) &&
1774 (!elem.getName().equals(stackElem))) {
1775 error("tag.ignore", elem.getName());
1776 return;
1777 }
1778 }
1779 }
1780 TagStack sp = stack;
1781
1782 while ((sp != null) && (elem != sp.elem)) {
1783 sp = sp.next;
1784 }
1785 if (sp == null) {
1786 error("unmatched.endtag", elem.getName());
1787 return;
1788 }
1789
1790 // People put font ending tags in the darndest places.
1791 // Don't close other contexts based on them being between
1792 // a font tag and the corresponding end tag. Instead,
1793 // ignore the end tag like it doesn't exist and allow the end
1794 // of the document to close us out.
1795 String elemName = elem.getName();
1796 if (stack != sp &&
1797 (elemName.equals("font") ||
1798 elemName.equals("center"))) {
1799
1800 // Since closing out a center tag can have real wierd
1801 // effects on the formatting, make sure that tags
1802 // for which omitting an end tag is legimitate
1803 // get closed out.
1804 //
1805 if (elemName.equals("center")) {
1806 while(stack.elem.omitEnd() && stack != sp) {
1807 endTag(true);
1808 }
1809 if (stack.elem == elem) {
1810 endTag(false);
1811 }
1812 }
1813 return;
1814 }
1815 // People do the same thing with center tags. In this
1816 // case we would like to close off the center tag but
1817 // not necessarily all enclosing tags.
1818
1819
1820
1821 // end tags
1822 while (stack != sp) {
1823 endTag(true);
1824 }
1825
1826 endTag(false);
1827 return;
1828
1829 case -1:
1830 error("eof");
1831 return;
1832 }
1833
1834 // start tag [14] 314:1
1835 if (!parseIdentifier(true)) {
1836 elem = recent;
1837 if ((ch != '>') || (elem == null)) {
1838 error("expected.tagname");
1839 return;
1840 }
1841 } else {
1842 String elemStr = getString(0);
1843
1844 if (elemStr.equals("image")) {
1845 elemStr = "img";
1846 }
1847
1848 /* determine if this element is part of the dtd. */
1849
1850 if (!dtd.elementExists(elemStr)) {
1851 // parseInvalidTag();
1852 error("tag.unrecognized ", elemStr);
1853 elem = dtd.getElement("unknown");
1854 elem.name = elemStr;
1855 unknown = true;
1856 } else {
1857 elem = dtd.getElement(elemStr);
1858 }
1859 }
1860
1861 // Parse attributes
1862 parseAttributeSpecificationList(elem);
1863
1864 switch (ch) {
1865 case '/':
1866 net = true;
1867 case '>':
1868 ch = readCh();
1869 if (ch == '>' && net) {
1870 ch = readCh();
1871 }
1872 case '<':
1873 break;
1874
1875 default:
1876 error("expected", "'>'");
1877 break;
1878 }
1879
1880 if (!strict) {
1881 if (elem.getName().equals("script")) {
1882 error("javascript.unsupported");
1883 }
1884 }
1885
1886 // ignore RE after start tag
1887 //
1888 if (!elem.isEmpty()) {
1889 if (ch == '\n') {
1890 ln++;
1891 lfCount++;
1892 ch = readCh();
1893 } else if (ch == '\r') {
1894 ln++;
1895 if ((ch = readCh()) == '\n') {
1896 ch = readCh();
1897 crlfCount++;
1898 }
1899 else {
1900 crCount++;
1901 }
1902 }
1903 }
1904
1905 // ensure a legal context for the tag
1906 TagElement tag = makeTag(elem, false);
1907
1908
1909 /** In dealing with forms, we have decided to treat
1910 them as legal in any context. Also, even though
1911 they do have a start and an end tag, we will
1912 not put this tag on the stack. This is to deal
1913 several pages in the web oasis that choose to
1914 start and end forms in any possible location. **/
1915
1916 /*
1917 if (!strict && elem.getName().equals("form")) {
1918 if (lastFormSent == null) {
1919 lastFormSent = tag;
1920 } else {
1921 handleEndTag(lastFormSent);
1922 lastFormSent = tag;
1923 }
1924 } else {
1925 */
1926 // Smlly, if a tag is unknown, we will apply
1927 // no legalTagContext logic to it.
1928 //
1929 if (!unknown) {
1930 legalTagContext(tag);
1931
1932 // If skip tag is true, this implies that
1933 // the tag was illegal and that the error
1934 // recovery strategy adopted is to ignore
1935 // the tag.
1936 if (!strict && skipTag) {
1937 skipTag = false;
1938 return;
1939 }
1940 }
1941 /*
1942 }
1943 */
1944
1945 startTag(tag);
1946
1947 if (!elem.isEmpty()) {
1948 switch (elem.getType()) {
1949 case CDATA:
1950 parseLiteral(false);
1951 break;
1952 case RCDATA:
1953 parseLiteral(true);
1954 break;
1955 default:
1956 if (stack != null) {
1957 stack.net = net;
1958 }
1959 break;
1960 }
1961 }
1962 }
1963
// Delimiters of an HTML comment. parseContent() removes one leading and
// one trailing delimiter from the collected script text when both are present.
private static final String START_COMMENT = "<!--";
private static final String END_COMMENT = "-->";
// The script end tag in lower and in upper case. parseScript() accepts, at
// each position, the character from either array, so mixed case matches too.
private static final char[] SCRIPT_END_TAG = "</script>".toCharArray();
private static final char[] SCRIPT_END_TAG_UPPER_CASE =
                                    "</SCRIPT>".toCharArray();
1969
1970 void parseScript() throws IOException {
1971 char[] charsToAdd = new char[SCRIPT_END_TAG.length];
1972
1973 /* Here, ch should be the first character after <script> */
1974 while (true) {
1975 int i = 0;
1976 while (i < SCRIPT_END_TAG.length
1977 && (SCRIPT_END_TAG[i] == ch
1978 || SCRIPT_END_TAG_UPPER_CASE[i] == ch)) {
1979 charsToAdd[i] = (char) ch;
1980 ch = readCh();
1981 i++;
1982 }
1983 if (i == SCRIPT_END_TAG.length) {
1984
1985 /* '</script>' tag detected */
1986 /* Here, ch == '>' */
1987 ch = readCh();
1988 /* Here, ch == the first character after </script> */
1989 return;
1990 } else {
1991
1992 /* To account for extra read()'s that happened */
1993 for (int j = 0; j < i; j++) {
1994 addString(charsToAdd[j]);
1995 }
1996
1997 switch (ch) {
1998 case -1:
1999 error("eof.script");
2000 return;
2001 case '\n':
2002 ln++;
2003 ch = readCh();
2004 lfCount++;
2005 addString('\n');
2006 break;
2007 case '\r':
2008 ln++;
2009 if ((ch = readCh()) == '\n') {
2010 ch = readCh();
2011 crlfCount++;
2012 } else {
2013 crCount++;
2014 }
2015 addString('\n');
2016 break;
2017 default:
2018 addString(ch);
2019 ch = readCh();
2020 break;
2021 } // switch
2022 }
2023 } // while
2024 }
2025
2026 /**
2027 * Parse Content. [24] 320:1
2028 */
2029 void parseContent() throws IOException {
2030 Thread curThread = Thread.currentThread();
2031
2032 for (;;) {
2033 if (curThread.isInterrupted()) {
2034 curThread.interrupt(); // resignal the interrupt
2035 break;
2036 }
2037
2038 int c = ch;
2039 currentBlockStartPos = currentPosition;
2040
2041 if (recent == dtd.script) { // means: if after starting <script> tag
2042
2043 /* Here, ch has to be the first character after <script> */
2044 parseScript();
2045 last = makeTag(dtd.getElement("comment"), true);
2046
2047 /* Remove leading and trailing HTML comment declarations */
2048 String str = new String(getChars(0)).trim();
2049 int minLength = START_COMMENT.length() + END_COMMENT.length();
2050 if (str.startsWith(START_COMMENT) && str.endsWith(END_COMMENT)
2051 && str.length() >= (minLength)) {
2052 str = str.substring(START_COMMENT.length(),
2053 str.length() - END_COMMENT.length());
2054 }
2055
2056 /* Handle resulting chars as comment */
2057 handleComment(str.toCharArray());
2058 endTag(false);
2059 lastBlockStartPos = currentPosition;
2060 } else {
2061 switch (c) {
2062 case '<':
2063 parseTag();
2064 lastBlockStartPos = currentPosition;
2065 continue;
2066
2067 case '/':
2068 ch = readCh();
2069 if ((stack != null) && stack.net) {
2070 // null end tag.
2071 endTag(false);
2072 continue;
2073 }
2074 break;
2075
2076 case -1:
2077 return;
2078
2079 case '&':
2080 if (textpos == 0) {
2081 if (!legalElementContext(dtd.pcdata)) {
2082 error("unexpected.pcdata");
2083 }
2084 if (last.breaksFlow()) {
2085 space = false;
2086 }
2087 }
2088 char data[] = parseEntityReference();
2089 if (textpos + data.length + 1 > text.length) {
2090 char newtext[] = new char[Math.max(textpos + data.length + 128, text.length * 2)];
2091 System.arraycopy(text, 0, newtext, 0, text.length);
2092 text = newtext;
2093 }
2094 if (space) {
2095 space = false;
2096 text[textpos++] = ' ';
2097 }
2098 System.arraycopy(data, 0, text, textpos, data.length);
2099 textpos += data.length;
2100 ignoreSpace = false;
2101 continue;
2102
2103 case '\n':
2104 ln++;
2105 lfCount++;
2106 ch = readCh();
2107 if ((stack != null) && stack.pre) {
2108 break;
2109 }
2110 if (textpos == 0) {
2111 lastBlockStartPos = currentPosition;
2112 }
2113 if (!ignoreSpace) {
2114 space = true;
2115 }
2116 continue;
2117
2118 case '\r':
2119 ln++;
2120 c = '\n';
2121 if ((ch = readCh()) == '\n') {
2122 ch = readCh();
2123 crlfCount++;
2124 }
2125 else {
2126 crCount++;
2127 }
2128 if ((stack != null) && stack.pre) {
2129 break;
2130 }
2131 if (textpos == 0) {
2132 lastBlockStartPos = currentPosition;
2133 }
2134 if (!ignoreSpace) {
2135 space = true;
2136 }
2137 continue;
2138
2139
2140 case '\t':
2141 case ' ':
2142 ch = readCh();
2143 if ((stack != null) && stack.pre) {
2144 break;
2145 }
2146 if (textpos == 0) {
2147 lastBlockStartPos = currentPosition;
2148 }
2149 if (!ignoreSpace) {
2150 space = true;
2151 }
2152 continue;
2153
2154 default:
2155 if (textpos == 0) {
2156 if (!legalElementContext(dtd.pcdata)) {
2157 error("unexpected.pcdata");
2158 }
2159 if (last.breaksFlow()) {
2160 space = false;
2161 }
2162 }
2163 ch = readCh();
2164 break;
2165 }
2166 }
2167
2168 // enlarge buffer if needed
2169 if (textpos + 2 > text.length) {
2170 char newtext[] = new char[text.length + 128];
2171 System.arraycopy(text, 0, newtext, 0, text.length);
2172 text = newtext;
2173 }
2174
2175 // output pending space
2176 if (space) {
2177 if (textpos == 0) {
2178 lastBlockStartPos--;
2179 }
2180 text[textpos++] = ' ';
2181 space = false;
2182 }
2183 text[textpos++] = (char)c;
2184 ignoreSpace = false;
2185 }
2186 }
2187
2188 /**
2189 * Returns the end of line string. This will return the end of line
2190 * string that has been encountered the most, one of \r, \n or \r\n.
2191 */
2192 String getEndOfLineString() {
2193 if (crlfCount >= crCount) {
2194 if (lfCount >= crlfCount) {
2195 return "\n";
2196 }
2197 else {
2198 return "\r\n";
2199 }
2200 }
2201 else {
2202 if (crCount > lfCount) {
2203 return "\r";
2204 }
2205 else {
2206 return "\n";
2207 }
2208 }
2209 }
2210
2211 /**
2212 * Parse an HTML stream, given a DTD.
2213 */
2214 public synchronized void parse(Reader in) throws IOException {
2215 this.in = in;
2216
2217 this.ln = 1;
2218
2219 seenHtml = false;
2220 seenHead = false;
2221 seenBody = false;
2222
2223 crCount = lfCount = crlfCount = 0;
2224
2225 try {
2226 ch = readCh();
2227 text = new char[1024];
2228 str = new char[128];
2229
2230 parseContent();
2231 // NOTE: interruption may have occurred. Control flows out
2232 // of here normally.
2233 while (stack != null) {
2234 endTag(true);
2235 }
2236 in.close();
2237 } catch (IOException e) {
2238 errorContext();
2239 error("ioexception");
2240 throw e;
2241 } catch (Exception e) {
2242 errorContext();
2243 error("exception", e.getClass().getName(), e.getMessage());
2244 e.printStackTrace();
2245 } catch (ThreadDeath e) {
2246 errorContext();
2247 error("terminated");
2248 e.printStackTrace();
2249 throw e;
2250 } finally {
2251 for (; stack != null ; stack = stack.next) {
2252 handleEndTag(stack.tag);
2253 }
2254
2255 text = null;
2256 str = null;
2257 }
2258
2259 }
2260
2261
/*
 * Input cache.  This is much faster than calling down to a synchronized
 * method of BufferedReader for each byte.  Measurements done 5/30/97
 * show that there's no point in having a bigger buffer:  Increasing
 * the buffer to 8192 had no measurable impact for a program discarding
 * one character at a time (reading from an http URL to a local machine).
 * NOTE: If the current encoding is bogus, and we read too much
 * (past the content-type) we may suffer a MalformedInputException. For
 * this reason the initial size is 1 and when the body is encountered the
 * size is adjusted to 256.
 */
// Characters most recently read from the input; refilled by readCh().
private char buf[] = new char[1];
// Index in buf of the next character readCh() hands out.
private int pos;
// Result of the last in.read(buf) call: the number of characters placed
// in buf, or a value <= 0 once no more input is available.
private int len;
/*
   tracks position relative to the beginning of the
   document. readCh() increments it for every character it returns.
*/
private int currentPosition;
2281
2282
2283 private final int readCh() throws IOException {
2284
2285 if (pos >= len) {
2286
2287 // This loop allows us to ignore interrupts if the flag
2288 // says so
2289 for (;;) {
2290 try {
2291 len = in.read(buf);
2292 break;
2293 } catch (InterruptedIOException ex) {
2294 throw ex;
2295 }
2296 }
2297
2298 if (len <= 0) {
2299 return -1; // eof
2300 }
2301 pos = 0;
2302 }
2303 ++currentPosition;
2304
2305 return buf[pos++];
2306 }
2307
2308
2309 protected int getCurrentPos() {
2310 return currentPosition;
2311 }
2312 }