Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/port80/html/tidy/Lexer.java


1   /*
2    * @(#)Lexer.java   1.11 2000/08/16
3    *
4    */
5   
6   package com.port80.html.tidy;
7   
8   /**
9    *
10   * Lexer for html parser
11   *
12   * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
13   * See Tidy.java for the copyright notice.
14   * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
15   * HTML Tidy Release 4 Aug 2000</a>
16   *
17   * @author  Dave Raggett <dsr@w3.org>
18   * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
19   * @version 1.0, 1999/05/22
20   * @version 1.0.1, 1999/05/29
21   * @version 1.1, 1999/06/18 Java Bean
22   * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
23   * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
24   * @version 1.4, 1999/09/04 DOM support
25   * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
26   * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
27   * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
28   * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
29   * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
30   * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
31   * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
32   */
33  
34  /*
35    Given a file stream fp it returns a sequence of tokens.
36  
37       GetToken(fp) gets the next token
38       UngetToken(fp) provides one level undo
39  
40    The tags include an attribute list:
41  
42      - linked list of attribute/value nodes
43      - each node has 2 null-terminated strings.
44      - entities are replaced in attribute values
45  
46    white space is compacted if not in preformatted mode
47    If not in preformatted mode then leading white space
48    is discarded and subsequent white space sequences
49    compacted to single space chars.
50  
51    If XmlTags is no then Tag names are folded to upper
52    case and attribute names to lower case.
53  
54   Not yet done:
55      -   Doctype subset and marked sections
56  */
57  
58  import java.io.FileNotFoundException;
59  import java.io.FileReader;
60  import java.io.Reader;
61  import java.util.Stack;
62  
63  import com.port80.util.SystemWatch;
64  
65  public class Lexer {
66  
67    ////////////////////////////////////////////////////////////////////////////////////
68  
69    private static final String NAME = "Lexer"; /* prefix used in this class's internal error messages */
70    private static final boolean CHECK = true; /* enables the internal sanity checks in setState() and getScript() */
71    private static boolean VERBOSE = false;
72  
73    /* System identifiers (URIs) of the three XHTML 1.0 DTDs, plus the XHTML namespace URI */
74    private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
75    private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
76    private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
77    private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
78  
79    private static Lexer.W3CVersionInfo[] W3CVersion = /* searched linearly by version code; the first matching entry wins, so the 4.01 rows shadow the 4.0 rows with the same code */
80      { /* arguments are presumably (name, voyagerName, profile, code) -- confirm against W3CVersionInfo */
81        new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML40_STRICT),
82        new W3CVersionInfo(
83          "HTML 4.01 Transitional",
84          "XHTML 1.0 Transitional",
85          voyager_loose,
86          Dict.VERS_HTML40_LOOSE),
87        new W3CVersionInfo(
88          "HTML 4.01 Frameset",
89          "XHTML 1.0 Frameset",
90          voyager_frameset,
91          Dict.VERS_FRAMES),
92        new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML40_STRICT),
93        new W3CVersionInfo(
94          "HTML 4.0 Transitional",
95          "XHTML 1.0 Transitional",
96          voyager_loose,
97          Dict.VERS_HTML40_LOOSE),
98        new W3CVersionInfo(
99          "HTML 4.0 Frameset",
100         "XHTML 1.0 Frameset",
101         voyager_frameset,
102         Dict.VERS_FRAMES),
103       new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", voyager_loose, Dict.VERS_HTML32),
104       new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML20)};
105 
106   /* Private methods and fields */
107 
108   /* lexer char types */
/* Lexer character classes: bit flags stored in lexmap; one character may carry several. */
private static final short DIGIT = 1;
private static final short LETTER = 2;
private static final short NAMECHAR = 4;
private static final short WHITE = 8;
private static final short NEWLINE = 16;
private static final short LOWERCASE = 32;
private static final short UPPERCASE = 64;
private static final short HEXDIGIT = 128;

/* Lexing modes passed to getToken(int). */
public static final short MODE_IGNORE_WHITESPACE = 0x00;
public static final short MODE_MIXED_CONTENT = 0x01;
public static final short MODE_PREFORMATTED = 0x02;
public static final short MODE_IGNORE_MARKUP = 0x04;
public static final short MODE_JAVASCRIPT = 0x08;
public static final short MODE_SCRIPT = 0x10;

public static final short MODE_ATTR_VALUE = 0x20;

/* Classification flags for the 128 ASCII characters, indexed by character code. */
private static short[] lexmap = new short[128];

static {
  mapStr("\r\n\f", (short) (NEWLINE | WHITE));
  mapStr(" \t", WHITE);
  mapStr("-.:_", NAMECHAR);
  // NOTE(review): decimal digits are deliberately left without HEXDIGIT, as in the
  // original table; confirm how callers combine DIGIT and HEXDIGIT before changing it.
  mapStr("0123456789", (short) (DIGIT | NAMECHAR));
  // Only a-f / A-F are hexadecimal letters. The original table flagged g-z as
  // HEXDIGIT and left A-F unflagged.
  mapStr("abcdef", (short) (HEXDIGIT | LOWERCASE | LETTER | NAMECHAR));
  mapStr("ghijklmnopqrstuvwxyz", (short) (LOWERCASE | LETTER | NAMECHAR));
  mapStr("ABCDEF", (short) (HEXDIGIT | UPPERCASE | LETTER | NAMECHAR));
  mapStr("GHIJKLMNOPQRSTUVWXYZ", (short) (UPPERCASE | LETTER | NAMECHAR));
}

/**
 * ORs the given class flags into the lexmap entry of every character in str.
 *
 * @param str  characters to classify; each must have a code below 128,
 *             otherwise the array access throws
 * @param code bit mask of character-class flags to add
 */
private static void mapStr(String str, short code) {
  for (int i = 0; i < str.length(); i++) {
    lexmap[str.charAt(i)] |= code;
  }
}
148 
149   ////////////////////////////////////////////////////////////////////////////////////
150 
151   public IHTMLReader iStream; /* input character source read by getToken() */
152   public short badAccess; /* for accessibility errors */
153   public short badLayout; /* for bad style errors */
154   public short badChars; /* for bad char encodings */
155   public short badForm; /* for mismatched/mispositioned form tags */
156   public short warnings; /* count of warnings in this document */
157   public short errors; /* count of errors */
158   public boolean excludeBlocks; /* Netscape compatibility */
159   public boolean exiled; /* true if moved out of table */
160   public boolean isvoyager; /* true if xmlns attribute on html element */
161   //
162   public short versions; /* bit vector of HTML versions */
163   public int doctype; /* version as given by doctype (if any) */
164   public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
165   private String fInputName; /* name of the input, may be null; sasdjb 01May00 for GNU Emacs error parsing */
166   //
167   private int lineno; /* Line number at start of current token. */
168   private int column; /* Column at start of current token. */
169   private Node token; /* last token produced by getToken(); handed out again after ungetToken() */
170   private boolean pushed; /* true after token has been pushed back */
171   private LexerState fState; /* state of lexer's finite state machine */
172 
173   /* 
174     lexer character buffer
175   
176     parse tree nodes span onto this buffer
177     which contains the concatenated text
178     contents of all of the elements.
179   
180    A fresh buffer is created by the constructor for each Lexer.
181   */
182   private CharBuffer lexbuf; /* character buffer that node start/end offsets index into */
183 
184   /* Inline stack for compatibility with Mosaic */
185   public Stack istack; /* holds IStack entries pushed by pushInline() */
186   public int istackbase; /* Start of frame */
187   /* For deferring text node which would be consumed after inline tags are inferred. */
188   public Node fSavedTextNode;
189   /* For inferring inline tags: index into istack set by inlineDup(), or -1 when no insertion is pending */
190   public int insert;
191 
192   public TidyConfiguration configuration;
193   public Style styles; /* Used for cleaning up presentation markup */
194 
195   private TagTable fTagTable; /* tag table obtained from the configuration in the constructor */
196   protected boolean seenBodyEndTag; /* used by parser */
197   private char fInsertSpace; /* char that getToken() prepends to the next token; '\0' when none is pending */
198   boolean fWasWhite; /* set by getToken() to true when the prepended char was a space */
199 
200   /** Line-break flags copied onto text nodes by newTextNode(); presumably set by the lexer states -- confirm. */
201   boolean isEndWithLineBreak;
202   boolean isEndWithMultiLineBreak;
203   boolean isStartWithLineBreak;
204   boolean isStartWithMultiLineBreak;
205 
206   ////////////////////////////////////////////////////////////////////////////////////
207 
208   public Lexer(IHTMLReader in, String inputname, TidyConfiguration configuration) {
209     this.iStream = in;
210     fInputName = inputname;
211     this.configuration = configuration;
212     //
213     this.lineno = 1;
214     this.column = 1;
215     this.fState = initState();
216     this.badAccess = 0;
217     this.badLayout = 0;
218     this.badChars = 0;
219     this.badForm = 0;
220     this.warnings = 0;
221     this.errors = 0;
222     this.pushed = false;
223     this.exiled = false;
224     this.isvoyager = false;
225     this.versions = Dict.VERS_EVERYTHING;
226     this.doctype = Dict.VERS_UNKNOWN;
227     this.badDoctype = false;
228     this.token = null;
229     this.lexbuf = new CharBuffer();
230     this.fSavedTextNode = null;
231     this.insert = -1;
232     this.istack = new Stack();
233     this.istackbase = 0;
234     this.styles = null;
235     this.seenBodyEndTag = false;
236     //
237     fTagTable = configuration.getTagTable();
238     if (fInputName != null && (fInputName.endsWith(".jsp") || fInputName.endsWith(".JSP"))) {
239       fTagTable.setXML(true);
240     }
241     // Ensure config is self-consistent.
242     configuration.adjust();
243   }
244 
245   private LexerState initState() {
246     LexerTagState.getDefault(this);
247     LexerEndTagState.getDefault(this);
248     LexerStartTagState.getDefault(this);
249     LexerCommentState.getDefault(this);
250     LexerDocTypeState.getDefault(this);
251     LexerProcInstState.getDefault(this);
252     LexerCDataState.getDefault(this);
253     LexerSectionState.getDefault(this);
254     LexerASPState.getDefault(this);
255     LexerJSTEState.getDefault(this);
256     LexerPHPState.getDefault(this);
257     LexerScriptState.getDefault(this);
258     return LexerContentState.getDefault(this);
259   }
260 
261   ////////////////////////////////////////////////////////////////////////////////////
262 
263   /* attr must be non-null */
264   public static boolean isValidAttrName(String attr) {
265     /* first character should be a letter */
266     if (!Character.isLetter(attr.charAt(0)))
267       return false;
268     /* remaining characters should be namechars */
269     char c;
270     for (int i = 1; i < attr.length(); i++) {
271       c = attr.charAt(i);
272       if (!Character.isLetterOrDigit(c) && (MAP(c) & NAMECHAR) == 0)
273         return false;
274     }
275     return true;
276   }
277 
278   //  // Should always be able convert to/from UTF-8, so encoding exceptions are
279   //  // converted to an Error to avoid adding throws declarations in
280   //  // lots of methods.
281   //
282   //  public static byte[] getBytes(String str) {
283   //    try {
284   //      return str.getBytes("UTF8");
285   //    } catch (java.io.UnsupportedEncodingException e) {
286   //      throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
287   //    }
288   //  }
289   //
290   //  public static String getString(byte[] bytes, int offset, int length) {
291   //    try {
292   //      return new String(bytes, offset, length, "UTF8");
293   //    } catch (java.io.UnsupportedEncodingException e) {
294   //      throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
295   //    }
296   //  }
297 
298   public static String getString(CharBuffer buf, int offset, int len) {
299     return buf.substring(offset, offset + len);
300   }
301 
302   public static boolean expectsContent(Node node) {
303     if (node.type != Node.StartTag)
304       return false;
305     /* unknown element? */
306     if (node.tag == null)
307       return true;
308     if ((node.tag.model & Dict.CM_EMPTY) != 0)
309       return false;
310     return true;
311   }
312 
313   /* AQ: Try this for speed optimization */
314   public static int wstrcasecmp(String s1, String s2) {
315     return (s1.equalsIgnoreCase(s2) ? 0 : 1);
316   }
317 
318   public static int wstrcaselexcmp(String s1, String s2) {
319     char c;
320     int i = 0;
321     while (i < s1.length() && i < s2.length()) {
322       c = s1.charAt(i);
323       if (Character.toLowerCase(c) != Character.toLowerCase(s2.charAt(i))) {
324         break;
325       }
326       i += 1;
327     }
328     if (i == s1.length() && i == s2.length()) {
329       return 0;
330     } else if (i == s1.length()) {
331       return -1;
332     } else if (i == s2.length()) {
333       return 1;
334     } else {
335       return (s1.charAt(i) > s2.charAt(i) ? 1 : -1);
336     }
337   }
338 
339   //  public static boolean wsubstr(String s1, String s2) {
340   //    int i;
341   //    int len1 = s1.length();
342   //    int len2 = s2.length();
343   //    for (i = 0; i <= len1 - len2; ++i) {
344   //      if (s2.equalsIgnoreCase(s1.substring(i)))
345   //        return true;
346   //    }
347   //    return false;
348   //  }
349 
350   ////////////////////////////////////////////////////////////////////////////////////
351 
352   public Node newNode(int type) {
353     Node node = new Node(type);
354     return node;
355   }
356 
357   public Node newNode(int type, int start, int end, int srcstart) {
358     Node node = new Node(type, this.lexbuf, start, end);
359     node.srcStart = srcstart;
360     node.srcEnd = getPosition();
361     return node;
362   }
363 
364   public Node newNode(int type, int start, int end, int srcstart, String element) {
365     Node node = new Node(type, this.lexbuf, start, end, element, fTagTable);
366     node.srcStart = srcstart;
367     node.srcEnd = getPosition();
368     return node;
369   }
370 
371   public Node newNode(int type, int start, int end, int srcstart, String element, AttVal attributes) {
372     Node node = new Node(type, this.lexbuf, start, end, element, fTagTable, attributes);
373     node.srcStart = srcstart;
374     node.srcEnd = getPosition();
375     return node;
376   }
377 
378   public Node newTextNode(int start, int end, int srcstart, int srcend) {
379     Node node = new Node(Node.TextNode, this.lexbuf, start, end);
380     node.srcStart = srcstart;
381     node.srcEnd = srcend;
382     node.isEndWithLineBreak = isEndWithLineBreak;
383     node.isEndWithMultiLineBreak = isEndWithMultiLineBreak;
384     node.isStartWithMultiLineBreak = isStartWithMultiLineBreak;
385     return node;
386   }
387 
388   /* used for creating preformatted text from Word2000 */
389   public Node newLineNode() {
390     int start = this.lexbuf.length();
391     append('\n');
392     return new Node(Node.TextNode, this.lexbuf, start, this.lexbuf.length());
393   }
394 
395   public Node cloneNode(Node node) {
396     return (Node) node.clone();
397   }
398 
399   public AttVal cloneAttributes(AttVal attrs) {
400     return (AttVal) attrs.clone();
401   }
402 
403   ////////////////////////////////////////////////////////////////////////////////////
404 
405   public void changeChar(char c) {
406     if (this.lexbuf.length() > 0) {
407       this.lexbuf.setCharAt(this.lexbuf.length() - 1, c);
408     }
409   }
410 
411   //  public void addByte(char c) {
412   //    if (this.lexbuf.length() + 1 >= this.lexlength) {
413   //      while (this.lexbuf.length() + 1 >= this.lexlength) {
414   //        if (this.lexlength == 0)
415   //          this.lexlength = 8192;
416   //        else
417   //          this.lexlength = this.lexlength * 2;
418   //      }
419   //      char[] temp = this.lexbuf;
420   //      this.lexbuf = new char[this.lexlength];
421   //      if (temp != null) {
422   //        System.arraycopy(temp, 0, this.lexbuf, 0, temp.length);
423   //        updateNodeTextArrays(temp, this.lexbuf);
424   //      }
425   //    }
426   //    this.lexbuf.append(c);
427   //    this.lexbuf.charAt(this.lexbuf.length()) = '\0'; /* debug */
428   //  }
429 
430   public final void append(char c) {
431     this.lexbuf.append(c);
432     //    if (c < 128)
433     //      addByte(c);
434     //    else if (c <= 0x7FF) {
435     //      addByte(0xC0 | (c >> 6));
436     //      addByte(0x80 | (c & 0x3F));
437     //    } else if (c <= 0xFFFF) {
438     //      addByte(0xE0 | (c >> 12));
439     //      addByte(0x80 | ((c >> 6) & 0x3F));
440     //      addByte(0x80 | (c & 0x3F));
441     //    } else if (c <= 0x1FFFFF) {
442     //      addByte(0xF0 | (c >> 18));
443     //      addByte(0x80 | ((c >> 12) & 0x3F));
444     //      addByte(0x80 | ((c >> 6) & 0x3F));
445     //      addByte(0x80 | (c & 0x3F));
446     //    } else {
447     //      addByte(0xF8 | (c >> 24));
448     //      addByte(0x80 | ((c >> 18) & 0x3F));
449     //      addByte(0x80 | ((c >> 12) & 0x3F));
450     //      addByte(0x80 | ((c >> 6) & 0x3F));
451     //      addByte(0x80 | (c & 0x3F));
452     //    }
453   }
454 
455   public final void append(String str) {
456     this.lexbuf.append(str);
457   }
458 
459   final void decLength(int n) {
460     this.lexbuf.decLength(n);
461   }
462 
463   final CharBuffer getBuffer() {
464     return this.lexbuf;
465   }
466 
467   final TagTable getTagTable() {
468     return fTagTable;
469   }
470 
471   final void setPosition(int line, int column) {
472     this.lineno = line;
473     this.column = column;
474   }
475 
476   ////////////////////////////////////////////////////////////////////////////////////
477 
478   public String getInputName() {
479     return fInputName;
480   }
481 
482   public LexerState setState(LexerState state) {
483     if (CHECK && state == null) {
484       Report.error(NAME + ".setState(): state==null", this);
485     }
486     fState = state;
487     return fState;
488   }
489 
490   public LexerState getState() {
491     return fState;
492   }
493 
494   public void ungetToken() {
495     this.pushed = true;
496   }
497 
498   /*
499     modes for GetToken()
500   
501     MixedContent   -- for elements which don't accept PCDATA
502     Preformatted       -- white space preserved as is
503     IgnoreMarkup       -- for CDATA elements such as script, style
504   */
505 
506   public Node getToken(int mode) {
507     // Duplicate inlines in preference to pushed text nodes when appropriate.
508     if (this.pushed) {
509       if (this.token.type != Node.TextNode || (this.insert == -1 && this.fSavedTextNode == null)) {
510         this.pushed = false;
511         return this.token;
512       }
513     }
514     // At start of block elements, unclosed inline elements are inserted into the token stream.
515     if (this.insert != -1 || this.fSavedTextNode != null)
516       return insertedToken();
517     //
518     int start = lexbuf.length();
519     //FIXME: Check this.
520     fWasWhite = false;
521     if (fInsertSpace != '\0' && mode != MODE_IGNORE_WHITESPACE) {
522       append(fInsertSpace);
523       if (fInsertSpace == ' ')
524         fWasWhite = true;
525       fInsertSpace = '\0';
526     }
527     int c;
528     if ((c = iStream.readChar()) == IHTMLReader.EOF) {
529       return null;
530     } else
531       iStream.ungetChar(c);
532     this.token = fState.getToken(mode, start, iStream.getPosition());
533     return this.token;
534   }
535 
536   /*
537     create a text node for the contents of
538     a CDATA element like style or script
539     which ends with </foo> for some foo.
540   */
541   public Node getScript(Node container) {
542     if (CHECK && this.pushed) {
543       Report.error(NAME + ".getScript(): Pushed back token should not exists.", this);
544     }
545     fState = LexerScriptState.getDefault();
546     ((LexerScriptState) fState).setContainer(container);
547     setInsertSpace('\0');
548     return getToken(isJavaScript(container) ? MODE_JAVASCRIPT : MODE_SCRIPT);
549   }
550 
551   /*
552     push a copy of an inline node onto stack
553     but don't push if implicit or OBJECT or APPLET
554     (implicit tags are ones generated from the istack)
555   
556     One issue arises with pushing inlines when
557     the tag is already pushed. For instance:
558   
559         <p><em>text
560         <p><em>more text
561   
562     Shouldn't be mapped to
563   
564         <p><em>text</em></p>
565         <p><em><em>more text</em></em>
566   */
567   public void pushInline(Node node) {
568     IStack is;
569 
570     if (node.implicit)
571       return;
572     if (node.tag == null)
573       return;
574     if ((node.tag.model & Dict.CM_INLINE) == 0)
575       return;
576     if ((node.tag.model & Dict.CM_OBJECT) != 0)
577       return;
578     if (node.tag != fTagTable.tagFont && isPushed(node))
579       return;
580 
581     // make sure there is enough space for the stack
582     is = new IStack();
583     is.tag = node.tag;
584     is.element = node.element;
585     if (node.attributes != null)
586       is.attributes = cloneAttributes(node.attributes);
587     this.istack.push(is);
588   }
589 
590   /** Pop inline tag stack. */
591   public void popInline(Node node) {
592     IStack is;
593     if (node != null) {
594       if (node.tag == null)
595         return;
596       if ((node.tag.model & Dict.CM_INLINE) == 0)
597         return;
598       if ((node.tag.model & Dict.CM_OBJECT) != 0)
599         return;
600       // If node is </a> then pop until we find an <a>
601       if (node.tag == fTagTable.tagA) {
602         while (this.istack.size() > 0) {
603           is = (IStack) this.istack.pop();
604           if (is.tag == fTagTable.tagA) {
605             break;
606           }
607         }
608         if (this.insert >= this.istack.size())
609           this.insert = -1;
610         return;
611       }
612     }
613     if (this.istack.size() > 0) {
614       is = (IStack) this.istack.pop();
615       if (this.insert >= this.istack.size())
616         this.insert = -1;
617     }
618   }
619 
620   public boolean isPushed(Node node) {
621     int i;
622     IStack is;
623     for (i = this.istack.size() - 1; i >= 0; --i) {
624       is = (IStack) this.istack.elementAt(i);
625       if (is.tag == node.tag)
626         return true;
627     }
628     return false;
629   }
630 
631   /*
632     This has the effect of inserting "missing" inline
633     elements around the contents of blocklevel elements
634     such as P, TD, TH, DIV, PRE etc. This procedure is
635     called at the start of ParseBlock. when the inline
636     stack is not empty, as will be the case in:
637   
638       <i><h1>italic heading</h1></i>
639   
640     which is then treated as equivalent to
641   
642       <h1><i>italic heading</i></h1>
643   
644     This is implemented by setting the lexer into a mode
645     where it gets tokens from the inline stack rather than
646     from the input stream.
647   */
648   public int inlineDup(Node node) {
649     if (!configuration.getDoReformat())
650       return 0;
651     int n = this.istack.size() - this.istackbase;
652     if (n > 0) {
653       this.insert = this.istackbase;
654       this.fSavedTextNode = node;
655     }
656     return n;
657   }
658 
659   ////////////////////////////////////////////////////////////////////////////////////
660 
661   /* choose what version to use for new doctype */
662   public short HTMLVersion() {
663     short versions = this.versions;
664     if ((versions & Dict.VERS_HTML20) != 0)
665       return Dict.VERS_HTML20;
666     if ((versions & Dict.VERS_HTML32) != 0)
667       return Dict.VERS_HTML32;
668     if ((versions & Dict.VERS_HTML40_STRICT) != 0)
669       return Dict.VERS_HTML40_STRICT;
670     if ((versions & Dict.VERS_HTML40_LOOSE) != 0)
671       return Dict.VERS_HTML40_LOOSE;
672     if ((versions & Dict.VERS_FRAMES) != 0)
673       return Dict.VERS_FRAMES;
674     return Dict.VERS_UNKNOWN;
675   }
676 
677   public String HTMLVersionName() {
678     short guessed;
679     int j;
680     guessed = apparentVersion();
681     for (j = 0; j < W3CVersion.length; ++j) {
682       if (guessed == W3CVersion[j].code) {
683         if (this.isvoyager)
684           return W3CVersion[j].voyagerName;
685         return W3CVersion[j].name;
686       }
687     }
688     return null;
689   }
690 
691   /* add meta element for Tidy */
692   public boolean addGenerator(Node root) {
693     AttVal attval;
694     Node node;
695     Node head = root.findHEAD(fTagTable);
696     String generator = Report.getString("generator") + "; " + Report.RELEASE_DATE;
697     if (head != null) {
698       for (node = head.content; node != null;) {
699         if (node.tag == fTagTable.tagMeta) {
700           attval = node.getAttrByName("name");
701           if (attval != null
702             && attval.value != null
703             && Lexer.wstrcasecmp(attval.value, "generator") == 0) {
704             Node tmp = node;
705             node = node.next;
706             Node.removeNode(tmp);
707             continue;
708           }
709         }
710         node = node.next;
711       }
712       node = this.inferredTag("meta");
713       node.addAttribute("name", "Generator");
714       node.addAttribute("content", generator);
715       Node.insertNodeAtStart(head, node);
716       return true;
717     }
718     return false;
719   }
720 
721   public boolean checkDocTypeKeyWords(Node doctype) {
722     int len = doctype.end - doctype.start;
723     String s = getString(this.lexbuf, doctype.start, len);
724 
725     return !(
726       findBadSubString("SYSTEM", s, len)
727         || findBadSubString("PUBLIC", s, len)
728         || findBadSubString("//DTD", s, len)
729         || findBadSubString("//W3C", s, len)
730         || findBadSubString("//EN", s, len));
731   }
732 
733   public boolean setXHTMLDocType(Node root) {
734     String fpi = " ";
735     String sysid = "";
736     String namespace = XHTML_NAMESPACE;
737     Node doctype;
738 
739     doctype = root.findDocType();
740 
741     if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
742       if (doctype != null)
743         Node.discardElement(doctype);
744       return true;
745     }
746 
747     if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
748       /* see what flavor of XHTML this document matches */
749       if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) { /* use XHTML strict */
750         fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
751         sysid = voyager_strict;
752       } else if ((this.versions & Dict.VERS_LOOSE) != 0) {
753         fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
754         sysid = voyager_loose;
755       } else if ((this.versions & Dict.VERS_FRAMES) != 0) { /* use XHTML frames */
756         fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
757         sysid = voyager_frameset;
758       } else /* lets assume XHTML transitional */ {
759         fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
760         sysid = voyager_loose;
761       }
762     } else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
763       fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
764       sysid = voyager_strict;
765     } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
766       fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
767       sysid = voyager_loose;
768     }
769 
770     fixHTMLNameSpace(root, namespace);
771 
772     if (doctype == null) {
773       doctype = newNode(Node.DocTypeTag, 0, 0, -1);
774       doctype.next = root.content;
775       doctype.parent = root;
776       doctype.prev = null;
777       root.content = doctype;
778     }
779 
780     if (configuration.docTypeMode == Configuration.DOCTYPE_USER && configuration.docTypeStr != null) {
781       fpi = configuration.docTypeStr;
782       sysid = "";
783     }
784 
785     doctype.start = this.lexbuf.length();
786     /* add public identifier */
787     append("html PUBLIC ");
788     /* check if the fpi is quoted or not */
789     if (fpi.charAt(0) == '"')
790       append(fpi);
791     else {
792       append("\"");
793       append(fpi);
794       append("\"");
795     }
796     if (sysid.length() + 6 >= this.configuration.wraplen)
797       append("\n\"");
798     else
799       append("\n    \"");
800     /* add system identifier */
801     append(sysid);
802     append("\"");
803     doctype.end = this.lexbuf.length();
804     return false;
805   }
806 
807   /* fixup doctype if missing */
808   public boolean fixDocType(Node root) {
809     Node doctype;
810     int guessed = Dict.VERS_HTML40_STRICT, i;
811 
812     if (this.badDoctype)
813       Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
814 
815     if (configuration.XmlOut)
816       return true;
817 
818     doctype = root.findDocType();
819 
820     if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
821       if (doctype != null)
822         Node.discardElement(doctype);
823       return true;
824     }
825 
826     if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
827       Node.discardElement(doctype);
828       doctype = null;
829       guessed = Dict.VERS_HTML40_STRICT;
830     } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
831       Node.discardElement(doctype);
832       doctype = null;
833       guessed = Dict.VERS_HTML40_LOOSE;
834     } else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
835       if (doctype != null) {
836         if (this.doctype == Dict.VERS_UNKNOWN)
837           return false;
838 
839         switch (this.doctype) {
840           case Dict.VERS_UNKNOWN :
841             return false;
842 
843           case Dict.VERS_HTML20 :
844             if ((this.versions & Dict.VERS_HTML20) != 0)
845               return true;
846 
847             break; /* to replace old version by new */
848 
849           case Dict.VERS_HTML32 :
850             if ((this.versions & Dict.VERS_HTML32) != 0)
851               return true;
852 
853             break; /* to replace old version by new */
854 
855           case Dict.VERS_HTML40_STRICT :
856             if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
857               return true;
858 
859             break; /* to replace old version by new */
860 
861           case Dict.VERS_HTML40_LOOSE :
862             if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
863               return true;
864 
865             break; /* to replace old version by new */
866 
867           case Dict.VERS_FRAMES :
868             if ((this.versions & Dict.VERS_FRAMES) != 0)
869               return true;
870 
871             break; /* to replace old version by new */
872         }
873 
874         /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
875       }
876 
877       /* choose new doctype */
878       guessed = HTMLVersion();
879     }
880 
881     if (guessed == Dict.VERS_UNKNOWN)
882       return false;
883 
884     /* for XML use the Voyager system identifier */
885     if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager) {
886       if (doctype != null)
887         Node.discardElement(doctype);
888 
889       for (i = 0; i < W3CVersion.length; ++i) {
890         if (guessed == W3CVersion[i].code) {
891           fixHTMLNameSpace(root, W3CVersion[i].profile);
892           break;
893         }
894       }
895 
896       return true;
897     }
898 
899     if (doctype == null) {
900       doctype = newNode(Node.DocTypeTag, 0, 0, -1);
901       doctype.next = root.content;
902       doctype.parent = root;
903       doctype.prev = null;
904       root.content = doctype;
905     }
906 
907     /* use the appropriate public identifier */
908     doctype.start = this.lexbuf.length();
909     append("html PUBLIC ");
910     if (configuration.docTypeMode == Configuration.DOCTYPE_USER && configuration.docTypeStr != null)
911       append(configuration.docTypeStr);
912     else if (guessed == Dict.VERS_HTML20)
913       append("\"-//IETF//DTD HTML 2.0//EN\"");
914     else {
915       append("\"-//W3C//DTD ");
916       for (i = 0; i < W3CVersion.length; ++i) {
917         if (guessed == W3CVersion[i].code) {
918           append(W3CVersion[i].name);
919           break;
920         }
921       }
922       append("//EN\"");
923     }
924     doctype.end = this.lexbuf.length();
925     ;
926     return true;
927   }
928 
929   /* ensure XML document starts with <?XML version="1.0"?> */
930   public boolean fixXMLPI(Node root) {
931     Node xml;
932     int s;
933     if (root.content != null && root.content.type == Node.ProcInsTag) {
934       s = root.content.start;
935       if (this.lexbuf.charAt(s) == 'x'
936         && this.lexbuf.charAt(s + 1) == 'm'
937         && this.lexbuf.charAt(s + 2) == 'l')
938         return true;
939     }
940     xml = newNode(Node.ProcInsTag, 0, 0, -1);
941     xml.next = root.content;
942     if (root.content != null) {
943       root.content.prev = xml;
944       xml.next = root.content;
945     }
946     root.content = xml;
947     xml.start = this.lexbuf.length();
948     append("xml version=\"1.0\"");
949     if (this.configuration.CharEncoding == Configuration.LATIN1)
950       append(" encoding=\"ISO-8859-1\"");
951     xml.end = this.lexbuf.length();
952     return false;
953   }
954 
955   public Node inferredTag(String name) {
956     Node node;
957     node = newNode(Node.StartTag, this.lexbuf.length(), this.lexbuf.length(), -1, name);
958     node.implicit = true;
959     return node;
960   }
961 
962   /* duplicate name attribute as an id */
963   public void fixId(Node node) {
964     AttVal name = node.getAttrByName("name");
965     AttVal id = node.getAttrByName("id");
966 
967     if (name != null) {
968       if (id != null) {
969         // Anchor name/id is case-sensitive (HTML4.0).
970         if (!id.value.equals(name.value))
971           Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
972       } else if (this.configuration.XmlOut)
973         node.addAttribute("id", name.value);
974     }
975   }
976 
977   /**
978    * Defer duplicates when entering a table or other
979    * element where the inlines shouldn't be duplicated
980      */
981   public void deferDup() {
982     //CHECK: This seems to be discarding instead of deferring.
983     this.insert = -1;
984     this.fSavedTextNode = null;
985   }
986 
987   public boolean canPrune(Node element) {
988     if (element.type == Node.TextNode)
989       return true;
990 
991     if (element.content != null)
992       return false;
993 
994     if (element.tag == fTagTable.tagA && element.attributes != null)
995       return false;
996 
997     if (element.tag == fTagTable.tagP && !this.configuration.DropEmptyParas)
998       return false;
999 
1000    if (element.tag == null)
1001      return false;
1002
1003    if ((element.tag.model & Dict.CM_ROW) != 0)
1004      return false;
1005
1006    if (element.tag == fTagTable.tagApplet)
1007      return false;
1008
1009    if (element.tag == fTagTable.tagObject)
1010      return false;
1011
1012    if (element.attributes != null
1013      && (element.getAttrByName("id") != null || element.getAttrByName("name") != null))
1014      return false;
1015
1016    return true;
1017  }
1018
1019  ////////////////////////////////////////////////////////////////////////
1020
1021  public final int length() {
1022    return lexbuf.length();
1023  }
1024
1025  public final char getChar(int i) {
1026    return lexbuf.charAt(i);
1027  }
1028
1029  public final int getLineNumber() {
1030    return iStream.getLineNumber();
1031  }
1032
1033  public final int getColumn() {
1034    return iStream.getColumn();
1035  }
1036
1037  public final int getPosition() {
1038    return iStream.getPosition();
1039  }
1040
1041  public final void setInsertSpace(char c) {
1042    fInsertSpace = c;
1043  }
1044
1045  public final void setWasWhite(boolean b) {
1046    fWasWhite = b;
1047  }
1048
1049  public void markPosition() {
1050    lineno = iStream.getLineNumber();
1051    column = iStream.getColumn();
1052  }
1053
1054  ////////////////////////////////////////////////////////////////////////////////////
1055
1056  /* return true if substring s is in p and isn't all in same case */
1057  /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
1058  /* len is how many chars to check in p */
1059  private static boolean findBadSubString(String s, String p, int len) {
1060    int n = s.length();
1061    int i = 0;
1062    String ps;
1063    while (n < len) {
1064      ps = p.substring(i, i + n);
1065      if (wstrcasecmp(s, ps) == 0)
1066        return (!ps.equals(s.substring(0, n)));
1067      ++i;
1068      --len;
1069    }
1070    return false;
1071  }
1072
1073  private static short MAP(char c) {
1074    return ((int) c < 128 ? lexmap[(int) c] : 0);
1075  }
1076
1077  private static boolean isJavaScript(Node node) {
1078    if (node.attributes == null)
1079      return true;
1080    for (AttVal attr = node.attributes; attr != null; attr = attr.next) {
1081      if (("language".equalsIgnoreCase(attr.attribute) || "type".equalsIgnoreCase(attr.attribute))
1082        && "javascript".equalsIgnoreCase(attr.value))
1083        return true;
1084    }
1085    return false;
1086  }
1087
1088  ////////////////////////////////////////////////////////////////////////
1089
1090  /* examine <!DOCTYPE> to identify version */
1091  short findGivenVersion(Node doctype) {
1092    String p, s;
1093    int i, j;
1094    String str1;
1095    String str2;
1096    int len;
1097    /* if root tag for doctype isn't html give up now */
1098    str1 = getString(this.lexbuf, doctype.start, 5);
1099    if (wstrcasecmp(str1, "html ") != 0)
1100      return 0;
1101
1102    if (!checkDocTypeKeyWords(doctype))
1103      Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
1104
1105    /* give up if all we are given is the system id for the doctype */
1106    int start = doctype.start + 5;
1107    str1 = getString(this.lexbuf, start, 7);
1108    if (wstrcasecmp(str1, "SYSTEM ") == 0) {
1109      /* but at least ensure the case is correct */
1110      this.lexbuf.replace(start, start + 6, "SYSTEM");
1111      return 0; /* unrecognized */
1112    }
1113
1114    if (wstrcasecmp(str1, "PUBLIC ") == 0) {
1115      this.lexbuf.replace(start, start + 6, "PUBLIC");
1116    } else
1117      this.badDoctype = true;
1118
1119    for (i = doctype.start; i < doctype.end; ++i) {
1120      if (this.lexbuf.charAt(i) == '"') {
1121        str1 = getString(this.lexbuf, i + 1, 12);
1122        str2 = getString(this.lexbuf, i + 1, 13);
1123        if (str1.equals("-//W3C//DTD ")) {
1124          /* compute length of identifier e.g. "HTML 4.0 Transitional" */
1125          for (j = i + 13; j < doctype.end && this.lexbuf.charAt(i) != '/'; ++j);
1126          len = j - i - 13;
1127          p = getString(this.lexbuf, i + 13, len);
1128
1129          for (j = 1; j < W3CVersion.length; ++j) {
1130            s = W3CVersion[j].name;
1131            if (len == s.length() && s.equals(p))
1132              return W3CVersion[j].code;
1133          }
1134
1135          /* else unrecognized version */
1136        } else if (str2.equals("-//IETF//DTD ")) {
1137          /* compute length of identifier e.g. "HTML 2.0" */
1138          for (j = i + 14; j < doctype.end && this.lexbuf.charAt(i) != '/'; ++j);
1139          len = j - i - 14;
1140
1141          p = getString(this.lexbuf, i + 14, len);
1142          s = W3CVersion[0].name;
1143          if (len == s.length() && s.equals(p))
1144            return W3CVersion[0].code;
1145
1146          /* else unrecognized version */
1147        }
1148        break;
1149      }
1150    }
1151
1152    return 0;
1153  }
1154
1155  private void fixHTMLNameSpace(Node root, String profile) {
1156    Node node;
1157    AttVal attr;
1158    for (node = root.content; node != null && node.tag != fTagTable.tagHtml; node = node.next);
1159    if (node != null) {
1160      for (attr = node.attributes; attr != null; attr = attr.next) {
1161        if (attr.attribute.equals("xmlns"))
1162          break;
1163      }
1164      if (attr != null) {
1165        if (!attr.value.equals(profile)) {
1166          Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
1167          attr.value = profile;
1168        }
1169      } else {
1170        attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
1171        attr.dict = configuration.getAttributeTable().findAttribute(attr);
1172        node.attributes = attr;
1173      }
1174    }
1175  }
1176
1177  private short apparentVersion() {
1178    switch (this.doctype) {
1179      case Dict.VERS_UNKNOWN :
1180        return HTMLVersion();
1181
1182      case Dict.VERS_HTML20 :
1183        if ((this.versions & Dict.VERS_HTML20) != 0)
1184          return Dict.VERS_HTML20;
1185
1186        break;
1187
1188      case Dict.VERS_HTML32 :
1189        if ((this.versions & Dict.VERS_HTML32) != 0)
1190          return Dict.VERS_HTML32;
1191
1192        break; /* to replace old version by new */
1193
1194      case Dict.VERS_HTML40_STRICT :
1195        if ((this.versions & Dict.VERS_HTML40_STRICT) != 0)
1196          return Dict.VERS_HTML40_STRICT;
1197
1198        break;
1199
1200      case Dict.VERS_HTML40_LOOSE :
1201        if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0)
1202          return Dict.VERS_HTML40_LOOSE;
1203
1204        break; /* to replace old version by new */
1205
1206      case Dict.VERS_FRAMES :
1207        if ((this.versions & Dict.VERS_FRAMES) != 0)
1208          return Dict.VERS_FRAMES;
1209
1210        break;
1211    }
1212
1213    Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
1214    return this.HTMLVersion();
1215  }
1216
1217  ////////////////////////////////////////////////////////////////////////
1218
1219  /**
1220   * Get tag name.
1221   * @enter &lt;c or &lt;/c have been read, and c is saved at lexbuf.charAt(txtstart).
1222   */
1223  char parseTagName(char c) {
1224    if (CHECK && !Character.isLetter(c)) {
1225      Report.error(NAME + ".parseTagName(): Tag name should started with a letter: c=" + c, this);
1226    }
1227    /* fold case of first char in buffer */
1228    if (!this.configuration.XmlTags && Character.isUpperCase(c))
1229      c = Character.toLowerCase(c);
1230    append(c);
1231    while (true) {
1232      c = (char) this.iStream.readChar();
1233      if (iStream.isEOF())
1234        break;
1235      if (!Character.isLetterOrDigit(c) && (MAP(c) & NAMECHAR) == 0)
1236        break;
1237      /* fold case of subsequent chars */
1238      if (!this.configuration.XmlTags && Character.isUpperCase(c))
1239        c = Character.toLowerCase(c);
1240      append(c);
1241    }
1242    return (char) c;
1243  }
1244
1245  /**
1246    No longer attempts to insert missing ';' for unknown
1247    entities unless one was present already, since this
1248    gives unexpected results.
1249  
1250    For example:   <a href="something.htm?foo&bar&fred">
1251    was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
1252    rather than:   <a href="something.htm?foo&amp;bar&amp;fred">
1253  
1254    My thanks to Maurice Buxton for spotting this.
1255    
1256    @param mode MODE_PREFORMATTED|MODE_ATTR_VALUE
1257  */
1258  void parseEntity(int mode) {
1259    final int FIRST = 0;
1260    final int NUMERIC = 1;
1261    final int NAME = 2;
1262    final int DIGIT = 3;
1263    final int HEXDIGIT = 4;
1264    final int SEMICOLON = 5;
1265    //
1266    short map;
1267    char c;
1268    String str;
1269    //
1270    int state = FIRST;
1271    int start = this.lexbuf.length() - 1; /* to start at "&" */
1272    int startcol = this.iStream.getColumn() - 1;
1273
1274    while (true) {
1275      c = (char) this.iStream.readChar();
1276      if (iStream.isEOF())
1277        break;
1278      if (c == ';') {
1279        state = SEMICOLON;
1280        break;
1281      }
1282      if (state == FIRST) {
1283        if (c == '#') {
1284          append(c);
1285          state = NUMERIC;
1286          continue;
1287</