Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/port80/html/tidy/TidyHTMLParser.java


1   package com.port80.html.tidy;
2   
3   import java.io.BufferedReader;
4   import java.io.FileInputStream;
5   import java.io.FileNotFoundException;
6   import java.io.IOException;
7   import java.io.InputStreamReader;
8   import java.io.PrintWriter;
9   import java.io.Reader;
10  
11  import org.eclipse.jface.text.source.ISourceViewer;
12  import org.eclipse.ui.editors.text.TextEditor;
13  
14  /**
15   * HTML parser and pretty printer.
16   * 
17   * Adapted from Tidy v1.11 to use Java Reader/Writer instead of custom InputStream/OutputStream
18   * which make it easier to parse String input and return result in StringWriter.
19   */
20  
21  public class TidyHTMLParser implements java.io.Serializable {
22  
23    ////////////////////////////////////////////////////////////////////////
24  
25    private static final String NAME = "TidyHTMLParser";
26    static final long serialVersionUID = -2794371560623987718L;
27  
28    private static final int ERROR_INVALID_FILENAME = 1;
29  
30    ////////////////////////////////////////////////////////////////////////
31  
32    private TextEditor fEditor;
33    private ISourceViewer fViewer;
34    //
35    private TidyConfiguration configuration = null;
36    private String inputStreamName = "noname";
37    private int parseErrors = 0;
38    private int parseWarnings = 0;
39  
40    ////////////////////////////////////////////////////////////////////////
41  
42    public TidyHTMLParser(TidyConfiguration cf) {
43      init(cf);
44    }
45  
46    ////////////////////////////////////////////////////////////////////////
47  
48    public TidyConfiguration getConfiguration() {
49      return configuration;
50    }
51  
52    /**
53     * ParseErrors - the number of errors that occurred in the most
54     * recent parse operation
55     */
56  
57    public int getParseErrors() {
58      return parseErrors;
59    }
60  
61    /**
62     * ParseWarnings - the number of warnings that occurred in the most
63     * recent parse operation
64     */
65  
66    public int getParseWarnings() {
67      return parseWarnings;
68    }
69  
70    public void setCompactFormat(boolean b) {
71      configuration.CompactFormat = b;
72    }
73  
74    ////////////////////////////////////////////////////////////////////////
75  
76    /**
77     * First time initialization which should precede reading the command line.
78     */
79    private void init(TidyConfiguration cf) {
80      configuration = cf;
81      /* Unnecessary - same initial values in Configuration
82      Configuration.XmlTags       = false;
83      Configuration.XmlOut        = false;
84      Configuration.HideEndTags   = false;
85      Configuration.UpperCaseTags = false;
86      Configuration.MakeClean     = false;
87      Configuration.writeback     = false;
88      Configuration.OnlyErrors    = false;
89      */
90    }
91  
92    public Node parse(String file, PrintWriter out) throws FileNotFoundException, IOException {
93      Node document = null;
94      Reader in = null;
95      if (file == null) {
96        in = new BufferedReader(new InputStreamReader(System.in));
97        inputStreamName = "stdin";
98      } else {
99        in = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
100       inputStreamName = file;
101     }
102     document = parse(in, out);
103     // Try to close the InputStream but only if if we created it.
104     if (file != null && in != null) {
105       in.close();
106     }
107     return document;
108   }
109 
110   public Node parse(Reader in, String inputname, PrintWriter out) throws IOException {
111     inputStreamName = inputname;
112     return parse(in, out);
113   }
114 
115   /**
116    * Parse a complete HTML document.  Tags are inferred if needed.
117    */
118   public Node parse(Reader in, PrintWriter out) throws IOException {
119     Lexer lexer;
120     Node document = null;
121     //
122     parseErrors = 0;
123     parseWarnings = 0;
124     //
125     if (in == null) {
126       in = new BufferedReader(new InputStreamReader(System.in));
127       inputStreamName = "stdin";
128     }
129     lexer =
130       new Lexer(
131         new HTMLReader(in, configuration.getCharEncoding(), configuration.getTabsize()),
132         inputStreamName,
133         configuration);
134 
135     /* Tidy doesn't alter the doctype for generic XML docs */
136     if (configuration.getXmlTags()) {
137       document = ParserImpl.parseXMLDocument(lexer);
138     } else {
139       lexer.warnings = 0;
140       if (!configuration.getQuiet())
141         Report.helloMessage(inputStreamName);
142       //
143       document = ParserImpl.parseDocument(lexer);
144       //
145       if (!document.checkNodeIntegrity()) {
146         Report.badTree();
147         return null;
148       }
149       if (configuration.doReformat) {
150         Clean cleaner = new Clean(configuration.getTagTable());
151 
152         /* simplifies <b><b> ... </b> ...</b> etc. */
153         cleaner.nestedEmphasis(document);
154 
155         /* cleans up <dir>indented text</dir> etc. */
156         cleaner.list2BQ(document);
157         cleaner.bQ2Div(document);
158 
159         /* replaces i by em and b by strong */
160         if (configuration.getLogicalEmphasis())
161           cleaner.emFromI(document);
162 
163         if (configuration.getWord2000()
164           && cleaner.isWord2000(document, configuration.getTagTable())) {
165           /* prune Word2000's <![if ...]> ... <![endif]> */
166           cleaner.dropSections(lexer, document);
167 
168           /* drop style & class attributes and empty p, span elements */
169           cleaner.cleanWord2000(lexer, document);
170         }
171 
172         /* replaces presentational markup by style rules */
173         if (configuration.getMakeClean() || configuration.getDropFontTags())
174           cleaner.cleanTree(lexer, document);
175 
176         if (!document.checkNodeIntegrity()) {
177           Report.badTree();
178           return null;
179         }
180         if (document.getContent() != null) {
181           if (configuration.getXHTML())
182             lexer.setXHTMLDocType(document);
183           else
184             lexer.fixDocType(document);
185 
186           if (configuration.getTidyMark())
187             lexer.addGenerator(document);
188         }
189 
190         /* ensure presence of initial <?XML version="1.0"?> */
191         if (configuration.getXmlOut() && configuration.getXmlPi())
192           lexer.fixXMLPI(document);
193       }
194       if (!configuration.getQuiet() && document.getContent() != null) {
195         //Report.reportVersion(lexer, inputStreamName, doctype);
196         Report.reportNumWarnings(lexer);
197       }
198     }
199 
200     parseWarnings = lexer.warnings;
201     parseErrors = lexer.errors;
202     if (lexer.errors > 0)
203       Report.needsAuthorIntervention();
204     Report.errorSummary(lexer);
205 
206     if (!configuration.getOnlyErrors() && lexer.errors == 0) {
207       if (out != null) {
208         PrettyPrint pprint = new PrettyPrint(configuration, out);
209         if (configuration.getXmlTags())
210           pprint.printXMLTree(0, 0, lexer, document);
211         else
212           pprint.printTree(0, 0, lexer, document);
213         pprint.flush();
214       }
215     }
216     return document;
217   }
218 
219   /**
220    * Parses InputStream in and returns a DOM Document node.
221    * If out is non-null, pretty prints to OutputStream out.
222    */
223 
224   public org.w3c.dom.Document parseDOM(Reader in, PrintWriter out) throws IOException {
225     Node document = parse(in, out);
226     if (document != null)
227       return (org.w3c.dom.Document) document.getAdapter();
228     else
229       return null;
230   }
231 
232   /**
233    * Pretty-prints a DOM Document.
234    */
235 
236   public void pprint(org.w3c.dom.Document doc, PrintWriter out) {
237     if (!(doc instanceof DOMDocumentImpl)) {
238       return;
239     }
240     Node document = ((DOMDocumentImpl) doc).getDocument();
241     if (out != null) {
242       PrettyPrint pprint = new PrettyPrint(configuration, out);
243       if (configuration.getXmlTags())
244         pprint.printXMLTree(0, 0, null, document);
245       else
246         pprint.printTree(0, 0, null, document);
247       pprint.flush();
248     }
249   }
250 
251   ////////////////////////////////////////////////////////////////////////
252 
253   /**
254    * Creates an empty DOM Document.
255    */
256 
257   public static org.w3c.dom.Document createEmptyDocument() {
258     Node document = new Node(Node.RootNode, new CharBuffer(0), 0, 0);
259     Node node = new Node(Node.StartTag, new CharBuffer(0), 0, 0, "html", new TagTable());
260     if (document != null && node != null) {
261       Node.insertNodeAtStart(document, node);
262       return (org.w3c.dom.Document) document.getAdapter();
263     } else {
264       return null;
265     }
266   }
267 
268   /**
269    * Command line interface to parser and pretty printer.
270    */
271 
272   public static void main(String[] argv) {
273     int totalerrors = 0;
274     int totalwarnings = 0;
275     String file;
276     int argc = argv.length + 1;
277     int argIndex = 0;
278     String arg;
279 
280     TidyHTMLParser tidy = new TidyHTMLParser(new TidyConfiguration(null, null, null));
281     TidyConfiguration configuration = tidy.getConfiguration();
282 
283     /* read command line */
284 
285     while (argc > 0) {
286       if (argc > 1 && argv[argIndex].startsWith("-")) {
287         /* support -foo and --foo */
288         arg = argv[argIndex].substring(1);
289 
290         if (arg.length() > 0 && arg.charAt(0) == '-')
291           arg = arg.substring(1);
292 
293         if (arg.equals("xml"))
294           configuration.setXmlTags(true);
295         else if (arg.equals("asxml") || arg.equals("asxhtml"))
296           configuration.setXHTML(true);
297         else if (arg.equals("noindent")) {
298           configuration.setIndentContent(false);
299           configuration.setSmartIndent(false);
300         } else if (arg.equals("omit"))
301           configuration.setHideEndTags(true);
302         else if (arg.equals("upper"))
303           configuration.setUpperCaseTags(true);
304         else if (arg.equals("clean"))
305           configuration.setMakeClean(true);
306         else if (arg.equals("raw"))
307           configuration.setCharEncoding(Configuration.RAW);
308         else if (arg.equals("ascii"))
309           configuration.setCharEncoding(Configuration.ASCII);
310         else if (arg.equals("latin1"))
311           configuration.setCharEncoding(Configuration.LATIN1);
312         else if (arg.equals("utf8"))
313           configuration.setCharEncoding(Configuration.UTF8);
314         else if (arg.equals("iso2022"))
315           configuration.setCharEncoding(Configuration.ISO2022);
316         else if (arg.equals("mac"))
317           configuration.setCharEncoding(Configuration.MACROMAN);
318         else if (arg.equals("numeric"))
319           configuration.setNumEntities(true);
320         else if (arg.equals("modify"))
321           configuration.setWriteBack(true);
322         else if (arg.equals("change")) /* obsolete */
323           configuration.setWriteBack(true);
324         else if (arg.equals("update")) /* obsolete */
325           configuration.setWriteBack(true);
326         else if (arg.equals("errors"))
327           configuration.setOnlyErrors(true);
328         else if (arg.equals("quiet"))
329           configuration.setQuiet(true);
330         else if (arg.equals("slides"))
331           configuration.setBurstSlides(true);
332         else if (
333           arg.equals("help")
334             || argv[argIndex].charAt(1) == '?'
335             || argv[argIndex].charAt(1) == 'h') {
336           Report.helpText(NAME);
337           System.exit(1);
338         } else if (arg.equals("config")) {
339           if (argc >= 3) {
340             configuration.parseFile(argv[argIndex + 1]);
341             --argc;
342             ++argIndex;
343           }
344         } else if (
345           argv[argIndex].equals("-file")
346             || argv[argIndex].equals("--file")
347             || argv[argIndex].equals("-f")) {
348           if (argc >= 3) {
349             configuration.setErrfile(argv[argIndex + 1]);
350             --argc;
351             ++argIndex;
352           }
353         } else if (
354           argv[argIndex].equals("-wrap")
355             || argv[argIndex].equals("--wrap")
356             || argv[argIndex].equals("-w")) {
357           if (argc >= 3) {
358             configuration.setWraplen(Integer.parseInt(argv[argIndex + 1]));
359             --argc;
360             ++argIndex;
361           }
362         } else if (
363           argv[argIndex].equals("-version")
364             || argv[argIndex].equals("--version")
365             || argv[argIndex].equals("-v")) {
366           Report.showVersion();
367           System.exit(0);
368         } else {
369           String s = argv[argIndex];
370 
371           for (int i = 1; i < s.length(); i++) {
372             if (s.charAt(i) == 'i') {
373               configuration.setIndentContent(true);
374               configuration.setSmartIndent(true);
375             } else if (s.charAt(i) == 'o')
376               configuration.setHideEndTags(true);
377             else if (s.charAt(i) == 'u')
378               configuration.setUpperCaseTags(true);
379             else if (s.charAt(i) == 'c')
380               configuration.setMakeClean(true);
381             else if (s.charAt(i) == 'n')
382               configuration.setNumEntities(true);
383             else if (s.charAt(i) == 'm')
384               configuration.setWriteBack(true);
385             else if (s.charAt(i) == 'e')
386               configuration.setOnlyErrors(true);
387             else if (s.charAt(i) == 'q')
388               configuration.setQuiet(true);
389             else
390               Report.unknownOption(s.charAt(i));
391           }
392         }
393         --argc;
394         ++argIndex;
395         continue;
396       }
397 
398       if (argc > 1) {
399         file = argv[argIndex];
400       } else {
401         file = "stdin";
402       }
403 
404       Report.setWriter(configuration.getErrWriter());
405       try {
406         tidy.parse(file, new PrintWriter(System.out));
407         totalwarnings += tidy.parseWarnings;
408         totalerrors += tidy.parseErrors;
409       } catch (FileNotFoundException fnfe) {
410         Report.unknownFile(NAME, file);
411       } catch (IOException ioe) {
412         Report.unknownFile(NAME, file);
413       }
414 
415       --argc;
416       ++argIndex;
417 
418       if (argc <= 1)
419         break;
420     }
421 
422     if (totalerrors + totalwarnings > 0)
423       Report.generalInfo();
424     configuration.closeErrWriter();
425 
426     /* return status can be used by scripts */
427     if (totalerrors > 0)
428       System.exit(2);
429     if (totalwarnings > 0)
430       System.exit(1);
431     /* 0 signifies all is ok */
432     System.exit(0);
433   }
434 
435   ////////////////////////////////////////////////////////////////////////
436 }