Source code: com/port80/html/tidy/Lexer.java
1 /*
2 * @(#)Lexer.java 1.11 2000/08/16
3 *
4 */
5
6 package com.port80.html.tidy;
7
8 /**
9 *
10 * Lexer for html parser
11 *
12 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13 * See Tidy.java for the copyright notice.
14 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
15 * HTML Tidy Release 4 Aug 2000</a>
16 *
17 * @author Dave Raggett <dsr@w3.org>
18 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19 * @version 1.0, 1999/05/22
20 * @version 1.0.1, 1999/05/29
21 * @version 1.1, 1999/06/18 Java Bean
22 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
23 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
24 * @version 1.4, 1999/09/04 DOM support
25 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
26 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
27 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
28 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
29 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
30 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
31 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
32 */
33
34 /*
35 Given a file stream fp it returns a sequence of tokens.
36
37 GetToken(fp) gets the next token
38 UngetToken(fp) provides one level undo
39
40 The tags include an attribute list:
41
42 - linked list of attribute/value nodes
43 - each node has 2 null-terminated strings.
44 - entities are replaced in attribute values
45
46 white space is compacted if not in preformatted mode
47 If not in preformatted mode then leading white space
48 is discarded and subsequent white space sequences
49 compacted to single space chars.
50
51 If XmlTags is no then Tag names are folded to upper
52 case and attribute names to lower case.
53
54 Not yet done:
55 - Doctype subset and marked sections
56 */
57
58 import java.io.FileNotFoundException;
59 import java.io.FileReader;
60 import java.io.Reader;
61 import java.util.Stack;
62
63 import com.port80.util.SystemWatch;
64
65 public class Lexer {
66
67 ////////////////////////////////////////////////////////////////////////////////////
68
/** Class name used as the prefix of internal diagnostic messages (see setState/getScript). */
private static final String NAME = "Lexer";
/** Enables the internal sanity checks guarded by {@code if (CHECK && ...)}. */
private static final boolean CHECK = true;
/** Verbose switch; not referenced in the visible part of this file. */
private static boolean VERBOSE = false;

/* the 3 URIs for the XHTML 1.0 DTDs */
private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
/** Namespace URI handed to fixHTMLNameSpace() by setXHTMLDocType(). */
private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
78
/*
 * Table of the W3C HTML versions known to the lexer. Each entry is searched
 * by its {@code code} (HTMLVersionName, fixDocType) and supplies the
 * {@code name}, {@code voyagerName} and {@code profile} read by those methods.
 * The constructor arguments presumably map to (name, voyagerName, profile,
 * code) in that order -- TODO confirm against W3CVersionInfo.
 *
 * The HTML 4.01 and HTML 4.0 rows share the same codes; every lookup in this
 * file stops at the first match, so the 4.01 rows win.
 */
private static Lexer.W3CVersionInfo[] W3CVersion =
{
    new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML40_STRICT),
    new W3CVersionInfo(
        "HTML 4.01 Transitional",
        "XHTML 1.0 Transitional",
        voyager_loose,
        Dict.VERS_HTML40_LOOSE),
    new W3CVersionInfo(
        "HTML 4.01 Frameset",
        "XHTML 1.0 Frameset",
        voyager_frameset,
        Dict.VERS_FRAMES),
    new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML40_STRICT),
    new W3CVersionInfo(
        "HTML 4.0 Transitional",
        "XHTML 1.0 Transitional",
        voyager_loose,
        Dict.VERS_HTML40_LOOSE),
    new W3CVersionInfo(
        "HTML 4.0 Frameset",
        "XHTML 1.0 Frameset",
        voyager_frameset,
        Dict.VERS_FRAMES),
    new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", voyager_loose, Dict.VERS_HTML32),
    new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML20)};
105
106 /* Private methods and fields */
107
108 /* lexer char types */
/* Bit flags stored in lexmap; one character may carry several of them. */
private static final short DIGIT = 1;
private static final short LETTER = 2;
private static final short NAMECHAR = 4; /* allowed after the first char of a name (see isValidAttrName) */
private static final short WHITE = 8;
private static final short NEWLINE = 16;
private static final short LOWERCASE = 32;
private static final short UPPERCASE = 64;
private static final short HEXDIGIT = 128; /* flag intended for hexadecimal digit characters */

/*
 * Mode values passed to getToken(). MODE_IGNORE_WHITESPACE is the only one
 * getToken() itself inspects (it suppresses the pending insert-space);
 * the rest are forwarded to the current LexerState.
 */
public static final short MODE_IGNORE_WHITESPACE = 0x00;
public static final short MODE_MIXED_CONTENT = 0x01; /* for elements which don't accept PCDATA */
public static final short MODE_PREFORMATTED = 0x02; /* white space preserved as is */
public static final short MODE_IGNORE_MARKUP = 0x04; /* for CDATA elements such as script, style */
public static final short MODE_JAVASCRIPT = 0x08; /* chosen by getScript() when isJavaScript(container) holds */
public static final short MODE_SCRIPT = 0x10; /* chosen by getScript() otherwise */

public static final short MODE_ATTR_VALUE = 0x20; /* not used in the visible part of this file */

/* used to classify chars for lexical purposes; indexed by ASCII code 0..127 (see MAP) */
private static short[] lexmap = new short[128];
129
/*
 * Fills the ASCII classification table.
 *
 * Fix: HEXDIGIT used to be set on the letters g-z and was missing from the
 * upper-case letters A-F. It is now set on exactly a-f and A-F.
 * NOTE(review): the decimal digits 0-9 never carried HEXDIGIT in this table
 * and are left unchanged here; confirm whether callers test DIGIT separately.
 */
static {
    mapStr("\r\n\f", (short) (NEWLINE | WHITE));
    mapStr(" \t", WHITE);
    mapStr("-.:_", NAMECHAR);
    mapStr("0123456789", (short) (DIGIT | NAMECHAR));
    mapStr("abcdefghijklmnopqrstuvwxyz", (short) (LOWERCASE | LETTER | NAMECHAR));
    mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short) (UPPERCASE | LETTER | NAMECHAR));
    // mapStr ORs flags in, so this only adds HEXDIGIT to the hex letters.
    mapStr("abcdefABCDEF", HEXDIGIT);
}
140
141 private static void mapStr(String str, short code) {
142 int j;
143 for (int i = 0; i < str.length(); i++) {
144 j = (int) str.charAt(i);
145 lexmap[j] |= code;
146 }
147 }
148
149 ////////////////////////////////////////////////////////////////////////////////////
150
public IHTMLReader iStream; /* input character source (readChar/ungetChar/getPosition) */
public short badAccess; /* for accessibility errors */
public short badLayout; /* for bad style errors */
public short badChars; /* for bad char encodings */
public short badForm; /* for mismatched/mispositioned form tags */
public short warnings; /* count of warnings in this document */
public short errors; /* count of errors */
public boolean excludeBlocks; /* Netscape compatibility */
public boolean exiled; /* true if moved out of table */
public boolean isvoyager; /* true if xmlns attribute on html element */
//
public short versions; /* bit vector of HTML versions */
public int doctype; /* version as given by doctype (if any) */
public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
private String fInputName; /* sasdjb 01May00 for GNU Emacs error parsing */
//
private int lineno; /* Line number at start of current token. */
private int column; /* Column at start of current token. */
private Node token; /* last token returned by the state machine; re-delivered after ungetToken() */
private boolean pushed; /* true after token has been pushed back */
private LexerState fState; /* state of lexer's finite state machine */

/*
lexer character buffer

parse tree nodes span onto this buffer
which contains the concatenated text
contents of all of the elements.

A fresh buffer is created by the constructor.
*/
private CharBuffer lexbuf; /* character buffer handed to every Node this lexer creates */

/* Inline stack for compatibility with Mosaic */
public Stack istack; /* holds IStack entries (see pushInline) */
public int istackbase; /* Start of frame */
/* For deferring text node which would be consumed after inline tags are inferred. */
public Node fSavedTextNode;
/* For inferring inline tags; -1 means no inline tags are pending (see getToken) */
public int insert;

public TidyConfiguration configuration;
public Style styles; /* Used for cleaning up presentation markup */

private TagTable fTagTable; /* obtained from configuration.getTagTable() */
protected boolean seenBodyEndTag; /* used by parser */
private char fInsertSpace; /* char emitted at the start of the next token; '\0' means none */
boolean fWasWhite;

/** Text content is ended with a line separator. */
boolean isEndWithLineBreak;
boolean isEndWithMultiLineBreak;
boolean isStartWithLineBreak;
boolean isStartWithMultiLineBreak;
205
206 ////////////////////////////////////////////////////////////////////////////////////
207
208 public Lexer(IHTMLReader in, String inputname, TidyConfiguration configuration) {
209 this.iStream = in;
210 fInputName = inputname;
211 this.configuration = configuration;
212 //
213 this.lineno = 1;
214 this.column = 1;
215 this.fState = initState();
216 this.badAccess = 0;
217 this.badLayout = 0;
218 this.badChars = 0;
219 this.badForm = 0;
220 this.warnings = 0;
221 this.errors = 0;
222 this.pushed = false;
223 this.exiled = false;
224 this.isvoyager = false;
225 this.versions = Dict.VERS_EVERYTHING;
226 this.doctype = Dict.VERS_UNKNOWN;
227 this.badDoctype = false;
228 this.token = null;
229 this.lexbuf = new CharBuffer();
230 this.fSavedTextNode = null;
231 this.insert = -1;
232 this.istack = new Stack();
233 this.istackbase = 0;
234 this.styles = null;
235 this.seenBodyEndTag = false;
236 //
237 fTagTable = configuration.getTagTable();
238 if (fInputName != null && (fInputName.endsWith(".jsp") || fInputName.endsWith(".JSP"))) {
239 fTagTable.setXML(true);
240 }
241 // Ensure config is self-consistent.
242 configuration.adjust();
243 }
244
245 private LexerState initState() {
246 LexerTagState.getDefault(this);
247 LexerEndTagState.getDefault(this);
248 LexerStartTagState.getDefault(this);
249 LexerCommentState.getDefault(this);
250 LexerDocTypeState.getDefault(this);
251 LexerProcInstState.getDefault(this);
252 LexerCDataState.getDefault(this);
253 LexerSectionState.getDefault(this);
254 LexerASPState.getDefault(this);
255 LexerJSTEState.getDefault(this);
256 LexerPHPState.getDefault(this);
257 LexerScriptState.getDefault(this);
258 return LexerContentState.getDefault(this);
259 }
260
261 ////////////////////////////////////////////////////////////////////////////////////
262
263 /* attr must be non-null */
264 public static boolean isValidAttrName(String attr) {
265 /* first character should be a letter */
266 if (!Character.isLetter(attr.charAt(0)))
267 return false;
268 /* remaining characters should be namechars */
269 char c;
270 for (int i = 1; i < attr.length(); i++) {
271 c = attr.charAt(i);
272 if (!Character.isLetterOrDigit(c) && (MAP(c) & NAMECHAR) == 0)
273 return false;
274 }
275 return true;
276 }
277
278 // // Should always be able convert to/from UTF-8, so encoding exceptions are
279 // // converted to an Error to avoid adding throws declarations in
280 // // lots of methods.
281 //
282 // public static byte[] getBytes(String str) {
283 // try {
284 // return str.getBytes("UTF8");
285 // } catch (java.io.UnsupportedEncodingException e) {
286 // throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
287 // }
288 // }
289 //
290 // public static String getString(byte[] bytes, int offset, int length) {
291 // try {
292 // return new String(bytes, offset, length, "UTF8");
293 // } catch (java.io.UnsupportedEncodingException e) {
294 // throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
295 // }
296 // }
297
298 public static String getString(CharBuffer buf, int offset, int len) {
299 return buf.substring(offset, offset + len);
300 }
301
302 public static boolean expectsContent(Node node) {
303 if (node.type != Node.StartTag)
304 return false;
305 /* unknown element? */
306 if (node.tag == null)
307 return true;
308 if ((node.tag.model & Dict.CM_EMPTY) != 0)
309 return false;
310 return true;
311 }
312
313 /* AQ: Try this for speed optimization */
314 public static int wstrcasecmp(String s1, String s2) {
315 return (s1.equalsIgnoreCase(s2) ? 0 : 1);
316 }
317
318 public static int wstrcaselexcmp(String s1, String s2) {
319 char c;
320 int i = 0;
321 while (i < s1.length() && i < s2.length()) {
322 c = s1.charAt(i);
323 if (Character.toLowerCase(c) != Character.toLowerCase(s2.charAt(i))) {
324 break;
325 }
326 i += 1;
327 }
328 if (i == s1.length() && i == s2.length()) {
329 return 0;
330 } else if (i == s1.length()) {
331 return -1;
332 } else if (i == s2.length()) {
333 return 1;
334 } else {
335 return (s1.charAt(i) > s2.charAt(i) ? 1 : -1);
336 }
337 }
338
339 // public static boolean wsubstr(String s1, String s2) {
340 // int i;
341 // int len1 = s1.length();
342 // int len2 = s2.length();
343 // for (i = 0; i <= len1 - len2; ++i) {
344 // if (s2.equalsIgnoreCase(s1.substring(i)))
345 // return true;
346 // }
347 // return false;
348 // }
349
350 ////////////////////////////////////////////////////////////////////////////////////
351
352 public Node newNode(int type) {
353 Node node = new Node(type);
354 return node;
355 }
356
357 public Node newNode(int type, int start, int end, int srcstart) {
358 Node node = new Node(type, this.lexbuf, start, end);
359 node.srcStart = srcstart;
360 node.srcEnd = getPosition();
361 return node;
362 }
363
364 public Node newNode(int type, int start, int end, int srcstart, String element) {
365 Node node = new Node(type, this.lexbuf, start, end, element, fTagTable);
366 node.srcStart = srcstart;
367 node.srcEnd = getPosition();
368 return node;
369 }
370
371 public Node newNode(int type, int start, int end, int srcstart, String element, AttVal attributes) {
372 Node node = new Node(type, this.lexbuf, start, end, element, fTagTable, attributes);
373 node.srcStart = srcstart;
374 node.srcEnd = getPosition();
375 return node;
376 }
377
378 public Node newTextNode(int start, int end, int srcstart, int srcend) {
379 Node node = new Node(Node.TextNode, this.lexbuf, start, end);
380 node.srcStart = srcstart;
381 node.srcEnd = srcend;
382 node.isEndWithLineBreak = isEndWithLineBreak;
383 node.isEndWithMultiLineBreak = isEndWithMultiLineBreak;
384 node.isStartWithMultiLineBreak = isStartWithMultiLineBreak;
385 return node;
386 }
387
388 /* used for creating preformatted text from Word2000 */
389 public Node newLineNode() {
390 int start = this.lexbuf.length();
391 append('\n');
392 return new Node(Node.TextNode, this.lexbuf, start, this.lexbuf.length());
393 }
394
395 public Node cloneNode(Node node) {
396 return (Node) node.clone();
397 }
398
399 public AttVal cloneAttributes(AttVal attrs) {
400 return (AttVal) attrs.clone();
401 }
402
403 ////////////////////////////////////////////////////////////////////////////////////
404
405 public void changeChar(char c) {
406 if (this.lexbuf.length() > 0) {
407 this.lexbuf.setCharAt(this.lexbuf.length() - 1, c);
408 }
409 }
410
411 // public void addByte(char c) {
412 // if (this.lexbuf.length() + 1 >= this.lexlength) {
413 // while (this.lexbuf.length() + 1 >= this.lexlength) {
414 // if (this.lexlength == 0)
415 // this.lexlength = 8192;
416 // else
417 // this.lexlength = this.lexlength * 2;
418 // }
419 // char[] temp = this.lexbuf;
420 // this.lexbuf = new char[this.lexlength];
421 // if (temp != null) {
422 // System.arraycopy(temp, 0, this.lexbuf, 0, temp.length);
423 // updateNodeTextArrays(temp, this.lexbuf);
424 // }
425 // }
426 // this.lexbuf.append(c);
427 // this.lexbuf.charAt(this.lexbuf.length()) = '\0'; /* debug */
428 // }
429
430 public final void append(char c) {
431 this.lexbuf.append(c);
432 // if (c < 128)
433 // addByte(c);
434 // else if (c <= 0x7FF) {
435 // addByte(0xC0 | (c >> 6));
436 // addByte(0x80 | (c & 0x3F));
437 // } else if (c <= 0xFFFF) {
438 // addByte(0xE0 | (c >> 12));
439 // addByte(0x80 | ((c >> 6) & 0x3F));
440 // addByte(0x80 | (c & 0x3F));
441 // } else if (c <= 0x1FFFFF) {
442 // addByte(0xF0 | (c >> 18));
443 // addByte(0x80 | ((c >> 12) & 0x3F));
444 // addByte(0x80 | ((c >> 6) & 0x3F));
445 // addByte(0x80 | (c & 0x3F));
446 // } else {
447 // addByte(0xF8 | (c >> 24));
448 // addByte(0x80 | ((c >> 18) & 0x3F));
449 // addByte(0x80 | ((c >> 12) & 0x3F));
450 // addByte(0x80 | ((c >> 6) & 0x3F));
451 // addByte(0x80 | (c & 0x3F));
452 // }
453 }
454
455 public final void append(String str) {
456 this.lexbuf.append(str);
457 }
458
459 final void decLength(int n) {
460 this.lexbuf.decLength(n);
461 }
462
463 final CharBuffer getBuffer() {
464 return this.lexbuf;
465 }
466
467 final TagTable getTagTable() {
468 return fTagTable;
469 }
470
471 final void setPosition(int line, int column) {
472 this.lineno = line;
473 this.column = column;
474 }
475
476 ////////////////////////////////////////////////////////////////////////////////////
477
478 public String getInputName() {
479 return fInputName;
480 }
481
482 public LexerState setState(LexerState state) {
483 if (CHECK && state == null) {
484 Report.error(NAME + ".setState(): state==null", this);
485 }
486 fState = state;
487 return fState;
488 }
489
490 public LexerState getState() {
491 return fState;
492 }
493
494 public void ungetToken() {
495 this.pushed = true;
496 }
497
498 /*
499 modes for GetToken()
500
501 MixedContent -- for elements which don't accept PCDATA
502 Preformatted -- white space preserved as is
503 IgnoreMarkup -- for CDATA elements such as script, style
504 */
505
506 public Node getToken(int mode) {
507 // Duplicate inlines in preference to pushed text nodes when appropriate.
508 if (this.pushed) {
509 if (this.token.type != Node.TextNode || (this.insert == -1 && this.fSavedTextNode == null)) {
510 this.pushed = false;
511 return this.token;
512 }
513 }
514 // At start of block elements, unclosed inline elements are inserted into the token stream.
515 if (this.insert != -1 || this.fSavedTextNode != null)
516 return insertedToken();
517 //
518 int start = lexbuf.length();
519 //FIXME: Check this.
520 fWasWhite = false;
521 if (fInsertSpace != '\0' && mode != MODE_IGNORE_WHITESPACE) {
522 append(fInsertSpace);
523 if (fInsertSpace == ' ')
524 fWasWhite = true;
525 fInsertSpace = '\0';
526 }
527 int c;
528 if ((c = iStream.readChar()) == IHTMLReader.EOF) {
529 return null;
530 } else
531 iStream.ungetChar(c);
532 this.token = fState.getToken(mode, start, iStream.getPosition());
533 return this.token;
534 }
535
536 /*
537 create a text node for the contents of
538 a CDATA element like style or script
539 which ends with </foo> for some foo.
540 */
541 public Node getScript(Node container) {
542 if (CHECK && this.pushed) {
543 Report.error(NAME + ".getScript(): Pushed back token should not exists.", this);
544 }
545 fState = LexerScriptState.getDefault();
546 ((LexerScriptState) fState).setContainer(container);
547 setInsertSpace('\0');
548 return getToken(isJavaScript(container) ? MODE_JAVASCRIPT : MODE_SCRIPT);
549 }
550
551 /*
552 push a copy of an inline node onto stack
553 but don't push if implicit or OBJECT or APPLET
554 (implicit tags are ones generated from the istack)
555
556 One issue arises with pushing inlines when
557 the tag is already pushed. For instance:
558
559 <p><em>text
560 <p><em>more text
561
562 Shouldn't be mapped to
563
564 <p><em>text</em></p>
565 <p><em><em>more text</em></em>
566 */
567 public void pushInline(Node node) {
568 IStack is;
569
570 if (node.implicit)
571 return;
572 if (node.tag == null)
573 return;
574 if ((node.tag.model & Dict.CM_INLINE) == 0)
575 return;
576 if ((node.tag.model & Dict.CM_OBJECT) != 0)
577 return;
578 if (node.tag != fTagTable.tagFont && isPushed(node))
579 return;
580
581 // make sure there is enough space for the stack
582 is = new IStack();
583 is.tag = node.tag;
584 is.element = node.element;
585 if (node.attributes != null)
586 is.attributes = cloneAttributes(node.attributes);
587 this.istack.push(is);
588 }
589
590 /** Pop inline tag stack. */
591 public void popInline(Node node) {
592 IStack is;
593 if (node != null) {
594 if (node.tag == null)
595 return;
596 if ((node.tag.model & Dict.CM_INLINE) == 0)
597 return;
598 if ((node.tag.model & Dict.CM_OBJECT) != 0)
599 return;
600 // If node is </a> then pop until we find an <a>
601 if (node.tag == fTagTable.tagA) {
602 while (this.istack.size() > 0) {
603 is = (IStack) this.istack.pop();
604 if (is.tag == fTagTable.tagA) {
605 break;
606 }
607 }
608 if (this.insert >= this.istack.size())
609 this.insert = -1;
610 return;
611 }
612 }
613 if (this.istack.size() > 0) {
614 is = (IStack) this.istack.pop();
615 if (this.insert >= this.istack.size())
616 this.insert = -1;
617 }
618 }
619
620 public boolean isPushed(Node node) {
621 int i;
622 IStack is;
623 for (i = this.istack.size() - 1; i >= 0; --i) {
624 is = (IStack) this.istack.elementAt(i);
625 if (is.tag == node.tag)
626 return true;
627 }
628 return false;
629 }
630
631 /*
632 This has the effect of inserting "missing" inline
633 elements around the contents of blocklevel elements
634 such as P, TD, TH, DIV, PRE etc. This procedure is
635 called at the start of ParseBlock. when the inline
636 stack is not empty, as will be the case in:
637
638 <i><h1>italic heading</h1></i>
639
640 which is then treated as equivalent to
641
642 <h1><i>italic heading</i></h1>
643
644 This is implemented by setting the lexer into a mode
645 where it gets tokens from the inline stack rather than
646 from the input stream.
647 */
648 public int inlineDup(Node node) {
649 if (!configuration.getDoReformat())
650 return 0;
651 int n = this.istack.size() - this.istackbase;
652 if (n > 0) {
653 this.insert = this.istackbase;
654 this.fSavedTextNode = node;
655 }
656 return n;
657 }
658
659 ////////////////////////////////////////////////////////////////////////////////////
660
661 /* choose what version to use for new doctype */
662 public short HTMLVersion() {
663 short versions = this.versions;
664 if ((versions & Dict.VERS_HTML20) != 0)
665 return Dict.VERS_HTML20;
666 if ((versions & Dict.VERS_HTML32) != 0)
667 return Dict.VERS_HTML32;
668 if ((versions & Dict.VERS_HTML40_STRICT) != 0)
669 return Dict.VERS_HTML40_STRICT;
670 if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
671 return Dict.VERS_HTML40_LOOSE;
672 if ((versions & Dict.VERS_FRAMES) != 0)
673 return Dict.VERS_FRAMES;
674 return Dict.VERS_UNKNOWN;
675 }
676
677 public String HTMLVersionName() {
678 short guessed;
679 int j;
680 guessed = apparentVersion();
681 for (j = 0; j < W3CVersion.length; ++j) {
682 if (guessed == W3CVersion[j].code) {
683 if (this.isvoyager)
684 return W3CVersion[j].voyagerName;
685 return W3CVersion[j].name;
686 }
687 }
688 return null;
689 }
690
691 /* add meta element for Tidy */
692 public boolean addGenerator(Node root) {
693 AttVal attval;
694 Node node;
695 Node head = root.findHEAD(fTagTable);
696 String generator = Report.getString("generator") + "; " + Report.RELEASE_DATE;
697 if (head != null) {
698 for (node = head.content; node != null;) {
699 if (node.tag == fTagTable.tagMeta) {
700 attval = node.getAttrByName("name");
701 if (attval != null
702 && attval.value != null
703 && Lexer.wstrcasecmp(attval.value, "generator") == 0) {
704 Node tmp = node;
705 node = node.next;
706 Node.removeNode(tmp);
707 continue;
708 }
709 }
710 node = node.next;
711 }
712 node = this.inferredTag("meta");
713 node.addAttribute("name", "Generator");
714 node.addAttribute("content", generator);
715 Node.insertNodeAtStart(head, node);
716 return true;
717 }
718 return false;
719 }
720
721 public boolean checkDocTypeKeyWords(Node doctype) {
722 int len = doctype.end - doctype.start;
723 String s = getString(this.lexbuf, doctype.start, len);
724
725 return !(
726 findBadSubString("SYSTEM", s, len)
727 || findBadSubString("PUBLIC", s, len)
728 || findBadSubString("//DTD", s, len)
729 || findBadSubString("//W3C", s, len)
730 || findBadSubString("//EN", s, len));
731 }
732
733 public boolean setXHTMLDocType(Node root) {
734 String fpi = " ";
735 String sysid = "";
736 String namespace = XHTML_NAMESPACE;
737 Node doctype;
738
739 doctype = root.findDocType();
740
741 if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
742 if (doctype != null)
743 Node.discardElement(doctype);
744 return true;
745 }
746
747 if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
748 /* see what flavor of XHTML this document matches */
749 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) { /* use XHTML strict */
750 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
751 sysid = voyager_strict;
752 } else if ((this.versions & Dict.VERS_LOOSE) != 0) {
753 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
754 sysid = voyager_loose;
755 } else if ((this.versions & Dict.VERS_FRAMES) != 0) { /* use XHTML frames */
756 fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
757 sysid = voyager_frameset;
758 } else /* lets assume XHTML transitional */ {
759 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
760 sysid = voyager_loose;
761 }
762 } else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
763 fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
764 sysid = voyager_strict;
765 } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
766 fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
767 sysid = voyager_loose;
768 }
769
770 fixHTMLNameSpace(root, namespace);
771
772 if (doctype == null) {
773 doctype = newNode(Node.DocTypeTag, 0, 0, -1);
774 doctype.next = root.content;
775 doctype.parent = root;
776 doctype.prev = null;
777 root.content = doctype;
778 }
779
780 if (configuration.docTypeMode == Configuration.DOCTYPE_USER && configuration.docTypeStr != null) {
781 fpi = configuration.docTypeStr;
782 sysid = "";
783 }
784
785 doctype.start = this.lexbuf.length();
786 /* add public identifier */
787 append("html PUBLIC ");
788 /* check if the fpi is quoted or not */
789 if (fpi.charAt(0) == '"')
790 append(fpi);
791 else {
792 append("\"");
793 append(fpi);
794 append("\"");
795 }
796 if (sysid.length() + 6 >= this.configuration.wraplen)
797 append("\n\"");
798 else
799 append("\n \"");
800 /* add system identifier */
801 append(sysid);
802 append("\"");
803 doctype.end = this.lexbuf.length();
804 return false;
805 }
806
807 /* fixup doctype if missing */
808 public boolean fixDocType(Node root) {
809 Node doctype;
810 int guessed = Dict.VERS_HTML40_STRICT, i;
811
812 if (this.badDoctype)
813 Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
814
815 if (configuration.XmlOut)
816 return true;
817
818 doctype = root.findDocType();
819
820 if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
821 if (doctype != null)
822 Node.discardElement(doctype);
823 return true;
824 }
825
826 if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
827 Node.discardElement(doctype);
828 doctype = null;
829 guessed = Dict.VERS_HTML40_STRICT;
830 } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
831 Node.discardElement(doctype);
832 doctype = null;
833 guessed = Dict.VERS_HTML40_LOOSE;
834 } else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
835 if (doctype != null) {
836 if (this.doctype == Dict.VERS_UNKNOWN)
837 return false;
838
839 switch (this.doctype) {
840 case Dict.VERS_UNKNOWN :
841 return false;
842
843 case Dict.VERS_HTML20 :
844 if ((this.versions & Dict.VERS_HTML20) != 0)
845 return true;
846
847 break; /* to replace old version by new */
848
849 case Dict.VERS_HTML32 :
850 if ((this.versions & Dict.VERS_HTML32) != 0)
851 return true;
852
853 break; /* to replace old version by new */
854
855 case Dict.VERS_HTML40_STRICT :
856 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
857 return true;
858
859 break; /* to replace old version by new */
860
861 case Dict.VERS_HTML40_LOOSE :
862 if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
863 return true;
864
865 break; /* to replace old version by new */
866
867 case Dict.VERS_FRAMES :
868 if ((this.versions & Dict.VERS_FRAMES) != 0)
869 return true;
870
871 break; /* to replace old version by new */
872 }
873
874 /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
875 }
876
877 /* choose new doctype */
878 guessed = HTMLVersion();
879 }
880
881 if (guessed == Dict.VERS_UNKNOWN)
882 return false;
883
884 /* for XML use the Voyager system identifier */
885 if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager) {
886 if (doctype != null)
887 Node.discardElement(doctype);
888
889 for (i = 0; i < W3CVersion.length; ++i) {
890 if (guessed == W3CVersion[i].code) {
891 fixHTMLNameSpace(root, W3CVersion[i].profile);
892 break;
893 }
894 }
895
896 return true;
897 }
898
899 if (doctype == null) {
900 doctype = newNode(Node.DocTypeTag, 0, 0, -1);
901 doctype.next = root.content;
902 doctype.parent = root;
903 doctype.prev = null;
904 root.content = doctype;
905 }
906
907 /* use the appropriate public identifier */
908 doctype.start = this.lexbuf.length();
909 append("html PUBLIC ");
910 if (configuration.docTypeMode == Configuration.DOCTYPE_USER && configuration.docTypeStr != null)
911 append(configuration.docTypeStr);
912 else if (guessed == Dict.VERS_HTML20)
913 append("\"-//IETF//DTD HTML 2.0//EN\"");
914 else {
915 append("\"-//W3C//DTD ");
916 for (i = 0; i < W3CVersion.length; ++i) {
917 if (guessed == W3CVersion[i].code) {
918 append(W3CVersion[i].name);
919 break;
920 }
921 }
922 append("//EN\"");
923 }
924 doctype.end = this.lexbuf.length();
925 ;
926 return true;
927 }
928
929 /* ensure XML document starts with <?XML version="1.0"?> */
930 public boolean fixXMLPI(Node root) {
931 Node xml;
932 int s;
933 if (root.content != null && root.content.type == Node.ProcInsTag) {
934 s = root.content.start;
935 if (this.lexbuf.charAt(s) == 'x'
936 && this.lexbuf.charAt(s + 1) == 'm'
937 && this.lexbuf.charAt(s + 2) == 'l')
938 return true;
939 }
940 xml = newNode(Node.ProcInsTag, 0, 0, -1);
941 xml.next = root.content;
942 if (root.content != null) {
943 root.content.prev = xml;
944 xml.next = root.content;
945 }
946 root.content = xml;
947 xml.start = this.lexbuf.length();
948 append("xml version=\"1.0\"");
949 if (this.configuration.CharEncoding == Configuration.LATIN1)
950 append(" encoding=\"ISO-8859-1\"");
951 xml.end = this.lexbuf.length();
952 return false;
953 }
954
955 public Node inferredTag(String name) {
956 Node node;
957 node = newNode(Node.StartTag, this.lexbuf.length(), this.lexbuf.length(), -1, name);
958 node.implicit = true;
959 return node;
960 }
961
962 /* duplicate name attribute as an id */
963 public void fixId(Node node) {
964 AttVal name = node.getAttrByName("name");
965 AttVal id = node.getAttrByName("id");
966
967 if (name != null) {
968 if (id != null) {
969 // Anchor name/id is case-sensitive (HTML4.0).
970 if (!id.value.equals(name.value))
971 Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
972 } else if (this.configuration.XmlOut)
973 node.addAttribute("id", name.value);
974 }
975 }
976
977 /**
978 * Defer duplicates when entering a table or other
979 * element where the inlines shouldn't be duplicated
980 */
981 public void deferDup() {
982 //CHECK: This seems to be discarding instead of deferring.
983 this.insert = -1;
984 this.fSavedTextNode = null;
985 }
986
987 public boolean canPrune(Node element) {
988 if (element.type == Node.TextNode)
989 return true;
990
991 if (element.content != null)
992 return false;
993
994 if (element.tag == fTagTable.tagA && element.attributes != null)
995 return false;
996
997 if (element.tag == fTagTable.tagP && !this.configuration.DropEmptyParas)
998 return false;
999
1000 if (element.tag == null)
1001 return false;
1002
1003 if ((element.tag.model & Dict.CM_ROW) != 0)
1004 return false;
1005
1006 if (element.tag == fTagTable.tagApplet)
1007 return false;
1008
1009 if (element.tag == fTagTable.tagObject)
1010 return false;
1011
1012 if (element.attributes != null
1013 && (element.getAttrByName("id") != null || element.getAttrByName("name") != null))
1014 return false;
1015
1016 return true;
1017 }
1018
1019 ////////////////////////////////////////////////////////////////////////
1020
1021 public final int length() {
1022 return lexbuf.length();
1023 }
1024
1025 public final char getChar(int i) {
1026 return lexbuf.charAt(i);
1027 }
1028
1029 public final int getLineNumber() {
1030 return iStream.getLineNumber();
1031 }
1032
1033 public final int getColumn() {
1034 return iStream.getColumn();
1035 }
1036
1037 public final int getPosition() {
1038 return iStream.getPosition();
1039 }
1040
1041 public final void setInsertSpace(char c) {
1042 fInsertSpace = c;
1043 }
1044
1045 public final void setWasWhite(boolean b) {
1046 fWasWhite = b;
1047 }
1048
1049 public void markPosition() {
1050 lineno = iStream.getLineNumber();
1051 column = iStream.getColumn();
1052 }
1053
1054 ////////////////////////////////////////////////////////////////////////////////////
1055
1056 /* return true if substring s is in p and isn't all in same case */
1057 /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
1058 /* len is how many chars to check in p */
1059 private static boolean findBadSubString(String s, String p, int len) {
1060 int n = s.length();
1061 int i = 0;
1062 String ps;
1063 while (n < len) {
1064 ps = p.substring(i, i + n);
1065 if (wstrcasecmp(s, ps) == 0)
1066 return (!ps.equals(s.substring(0, n)));
1067 ++i;
1068 --len;
1069 }
1070 return false;
1071 }
1072
1073 private static short MAP(char c) {
1074 return ((int) c < 128 ? lexmap[(int) c] : 0);
1075 }
1076
1077 private static boolean isJavaScript(Node node) {
1078 if (node.attributes == null)
1079 return true;
1080 for (AttVal attr = node.attributes; attr != null; attr = attr.next) {
1081 if (("language".equalsIgnoreCase(attr.attribute) || "type".equalsIgnoreCase(attr.attribute))
1082 && "javascript".equalsIgnoreCase(attr.value))
1083 return true;
1084 }
1085 return false;
1086 }
1087
1088 ////////////////////////////////////////////////////////////////////////
1089
1090 /* examine <!DOCTYPE> to identify version */
1091 short findGivenVersion(Node doctype) {
1092 String p, s;
1093 int i, j;
1094 String str1;
1095 String str2;
1096 int len;
1097 /* if root tag for doctype isn't html give up now */
1098 str1 = getString(this.lexbuf, doctype.start, 5);
1099 if (wstrcasecmp(str1, "html ") != 0)
1100 return 0;
1101
1102 if (!checkDocTypeKeyWords(doctype))
1103 Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
1104
1105 /* give up if all we are given is the system id for the doctype */
1106 int start = doctype.start + 5;
1107 str1 = getString(this.lexbuf, start, 7);
1108 if (wstrcasecmp(str1, "SYSTEM ") == 0) {
1109 /* but at least ensure the case is correct */
1110 this.lexbuf.replace(start, start + 6, "SYSTEM");
1111 return 0; /* unrecognized */
1112 }
1113
1114 if (wstrcasecmp(str1, "PUBLIC ") == 0) {
1115 this.lexbuf.replace(start, start + 6, "PUBLIC");
1116 } else
1117 this.badDoctype = true;
1118
1119 for (i = doctype.start; i < doctype.end; ++i) {
1120 if (this.lexbuf.charAt(i) == '"') {
1121 str1 = getString(this.lexbuf, i + 1, 12);
1122 str2 = getString(this.lexbuf, i + 1, 13);
1123 if (str1.equals("-//W3C//DTD ")) {
1124 /* compute length of identifier e.g. "HTML 4.0 Transitional" */
1125 for (j = i + 13; j < doctype.end && this.lexbuf.charAt(i) != '/'; ++j);
1126 len = j - i - 13;
1127 p = getString(this.lexbuf, i + 13, len);
1128
1129 for (j = 1; j < W3CVersion.length; ++j) {
1130 s = W3CVersion[j].name;
1131 if (len == s.length() && s.equals(p))
1132 return W3CVersion[j].code;
1133 }
1134
1135 /* else unrecognized version */
1136 } else if (str2.equals("-//IETF//DTD ")) {
1137 /* compute length of identifier e.g. "HTML 2.0" */
1138 for (j = i + 14; j < doctype.end && this.lexbuf.charAt(i) != '/'; ++j);
1139 len = j - i - 14;
1140
1141 p = getString(this.lexbuf, i + 14, len);
1142 s = W3CVersion[0].name;
1143 if (len == s.length() && s.equals(p))
1144 return W3CVersion[0].code;
1145
1146 /* else unrecognized version */
1147 }
1148 break;
1149 }
1150 }
1151
1152 return 0;
1153 }
1154
1155 private void fixHTMLNameSpace(Node root, String profile) {
1156 Node node;
1157 AttVal attr;
1158 for (node = root.content; node != null && node.tag != fTagTable.tagHtml; node = node.next);
1159 if (node != null) {
1160 for (attr = node.attributes; attr != null; attr = attr.next) {
1161 if (attr.attribute.equals("xmlns"))
1162 break;
1163 }
1164 if (attr != null) {
1165 if (!attr.value.equals(profile)) {
1166 Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
1167 attr.value = profile;
1168 }
1169 } else {
1170 attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
1171 attr.dict = configuration.getAttributeTable().findAttribute(attr);
1172 node.attributes = attr;
1173 }
1174 }
1175 }
1176
1177 private short apparentVersion() {
1178 switch (this.doctype) {
1179 case Dict.VERS_UNKNOWN :
1180 return HTMLVersion();
1181
1182 case Dict.VERS_HTML20 :
1183 if ((this.versions & Dict.VERS_HTML20) != 0)
1184 return Dict.VERS_HTML20;
1185
1186 break;
1187
1188 case Dict.VERS_HTML32 :
1189 if ((this.versions & Dict.VERS_HTML32) != 0)
1190 return Dict.VERS_HTML32;
1191
1192 break; /* to replace old version by new */
1193
1194 case Dict.VERS_HTML40_STRICT :
1195 if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
1196 return Dict.VERS_HTML40_STRICT;
1197
1198 break;
1199
1200 case Dict.VERS_HTML40_LOOSE :
1201 if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
1202 return Dict.VERS_HTML40_LOOSE;
1203
1204 break; /* to replace old version by new */
1205
1206 case Dict.VERS_FRAMES :
1207 if ((this.versions & Dict.VERS_FRAMES) != 0)
1208 return Dict.VERS_FRAMES;
1209
1210 break;
1211 }
1212
1213 Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
1214 return this.HTMLVersion();
1215 }
1216
1217 ////////////////////////////////////////////////////////////////////////
1218
1219 /**
1220 * Get tag name.
1221 * @enter <c or </c have been read, and c is saved at lexbuf.charAt(txtstart).
1222 */
1223 char parseTagName(char c) {
1224 if (CHECK && !Character.isLetter(c)) {
1225 Report.error(NAME + ".parseTagName(): Tag name should started with a letter: c=" + c, this);
1226 }
1227 /* fold case of first char in buffer */
1228 if (!this.configuration.XmlTags && Character.isUpperCase(c))
1229 c = Character.toLowerCase(c);
1230 append(c);
1231 while (true) {
1232 c = (char) this.iStream.readChar();
1233 if (iStream.isEOF())
1234 break;
1235 if (!Character.isLetterOrDigit(c) && (MAP(c) & NAMECHAR) == 0)
1236 break;
1237 /* fold case of subsequent chars */
1238 if (!this.configuration.XmlTags && Character.isUpperCase(c))
1239 c = Character.toLowerCase(c);
1240 append(c);
1241 }
1242 return (char) c;
1243 }
1244
1245 /**
1246 No longer attempts to insert missing ';' for unknown
entities unless one was present already, since this
1248 gives unexpected results.
1249
1250 For example: <a href="something.htm?foo&bar&fred">
1251 was tidied to: <a href="something.htm?foo&bar;&fred;">
1252 rather than: <a href="something.htm?foo&bar&fred">
1253
My thanks to Maurice Buxton for spotting this.
1255
1256 @param mode MODE_PREFORMATTED|MODE_ATTR_VALUE
1257 */
1258 void parseEntity(int mode) {
1259 final int FIRST = 0;
1260 final int NUMERIC = 1;
1261 final int NAME = 2;
1262 final int DIGIT = 3;
1263 final int HEXDIGIT = 4;
1264 final int SEMICOLON = 5;
1265 //
1266 short map;
1267 char c;
1268 String str;
1269 //
1270 int state = FIRST;
1271 int start = this.lexbuf.length() - 1; /* to start at "&" */
1272 int startcol = this.iStream.getColumn() - 1;
1273
1274 while (true) {
1275 c = (char) this.iStream.readChar();
1276 if (iStream.isEOF())
1277 break;
1278 if (c == ';') {
1279 state = SEMICOLON;
1280 break;
1281 }
1282 if (state == FIRST) {
1283 if (c == '#') {
1284 append(c);
1285 state = NUMERIC;
1286 continue;
1287