Source code: com/port80/html/tidy/TidyHTMLParser.java
1 package com.port80.html.tidy;
2
3 import java.io.BufferedReader;
4 import java.io.FileInputStream;
5 import java.io.FileNotFoundException;
6 import java.io.IOException;
7 import java.io.InputStreamReader;
8 import java.io.PrintWriter;
9 import java.io.Reader;
10
11 import org.eclipse.jface.text.source.ISourceViewer;
12 import org.eclipse.ui.editors.text.TextEditor;
13
14 /**
15 * HTML parser and pretty printer.
16 *
17 * Adapted from Tidy v1.11 to use Java Reader/Writer instead of custom InputStream/OutputStream
18 * which make it easier to parse String input and return result in StringWriter.
19 */
20
21 public class TidyHTMLParser implements java.io.Serializable {
22
23 ////////////////////////////////////////////////////////////////////////
24
25 private static final String NAME = "TidyHTMLParser";
26 static final long serialVersionUID = -2794371560623987718L;
27
28 private static final int ERROR_INVALID_FILENAME = 1;
29
30 ////////////////////////////////////////////////////////////////////////
31
32 private TextEditor fEditor;
33 private ISourceViewer fViewer;
34 //
35 private TidyConfiguration configuration = null;
36 private String inputStreamName = "noname";
37 private int parseErrors = 0;
38 private int parseWarnings = 0;
39
40 ////////////////////////////////////////////////////////////////////////
41
42 public TidyHTMLParser(TidyConfiguration cf) {
43 init(cf);
44 }
45
46 ////////////////////////////////////////////////////////////////////////
47
48 public TidyConfiguration getConfiguration() {
49 return configuration;
50 }
51
52 /**
53 * ParseErrors - the number of errors that occurred in the most
54 * recent parse operation
55 */
56
57 public int getParseErrors() {
58 return parseErrors;
59 }
60
61 /**
62 * ParseWarnings - the number of warnings that occurred in the most
63 * recent parse operation
64 */
65
66 public int getParseWarnings() {
67 return parseWarnings;
68 }
69
70 public void setCompactFormat(boolean b) {
71 configuration.CompactFormat = b;
72 }
73
74 ////////////////////////////////////////////////////////////////////////
75
76 /**
77 * First time initialization which should precede reading the command line.
78 */
79 private void init(TidyConfiguration cf) {
80 configuration = cf;
81 /* Unnecessary - same initial values in Configuration
82 Configuration.XmlTags = false;
83 Configuration.XmlOut = false;
84 Configuration.HideEndTags = false;
85 Configuration.UpperCaseTags = false;
86 Configuration.MakeClean = false;
87 Configuration.writeback = false;
88 Configuration.OnlyErrors = false;
89 */
90 }
91
92 public Node parse(String file, PrintWriter out) throws FileNotFoundException, IOException {
93 Node document = null;
94 Reader in = null;
95 if (file == null) {
96 in = new BufferedReader(new InputStreamReader(System.in));
97 inputStreamName = "stdin";
98 } else {
99 in = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
100 inputStreamName = file;
101 }
102 document = parse(in, out);
103 // Try to close the InputStream but only if if we created it.
104 if (file != null && in != null) {
105 in.close();
106 }
107 return document;
108 }
109
110 public Node parse(Reader in, String inputname, PrintWriter out) throws IOException {
111 inputStreamName = inputname;
112 return parse(in, out);
113 }
114
115 /**
116 * Parse a complete HTML document. Tags are inferred if needed.
117 */
118 public Node parse(Reader in, PrintWriter out) throws IOException {
119 Lexer lexer;
120 Node document = null;
121 //
122 parseErrors = 0;
123 parseWarnings = 0;
124 //
125 if (in == null) {
126 in = new BufferedReader(new InputStreamReader(System.in));
127 inputStreamName = "stdin";
128 }
129 lexer =
130 new Lexer(
131 new HTMLReader(in, configuration.getCharEncoding(), configuration.getTabsize()),
132 inputStreamName,
133 configuration);
134
135 /* Tidy doesn't alter the doctype for generic XML docs */
136 if (configuration.getXmlTags()) {
137 document = ParserImpl.parseXMLDocument(lexer);
138 } else {
139 lexer.warnings = 0;
140 if (!configuration.getQuiet())
141 Report.helloMessage(inputStreamName);
142 //
143 document = ParserImpl.parseDocument(lexer);
144 //
145 if (!document.checkNodeIntegrity()) {
146 Report.badTree();
147 return null;
148 }
149 if (configuration.doReformat) {
150 Clean cleaner = new Clean(configuration.getTagTable());
151
152 /* simplifies <b><b> ... </b> ...</b> etc. */
153 cleaner.nestedEmphasis(document);
154
155 /* cleans up <dir>indented text</dir> etc. */
156 cleaner.list2BQ(document);
157 cleaner.bQ2Div(document);
158
159 /* replaces i by em and b by strong */
160 if (configuration.getLogicalEmphasis())
161 cleaner.emFromI(document);
162
163 if (configuration.getWord2000()
164 && cleaner.isWord2000(document, configuration.getTagTable())) {
165 /* prune Word2000's <![if ...]> ... <![endif]> */
166 cleaner.dropSections(lexer, document);
167
168 /* drop style & class attributes and empty p, span elements */
169 cleaner.cleanWord2000(lexer, document);
170 }
171
172 /* replaces presentational markup by style rules */
173 if (configuration.getMakeClean() || configuration.getDropFontTags())
174 cleaner.cleanTree(lexer, document);
175
176 if (!document.checkNodeIntegrity()) {
177 Report.badTree();
178 return null;
179 }
180 if (document.getContent() != null) {
181 if (configuration.getXHTML())
182 lexer.setXHTMLDocType(document);
183 else
184 lexer.fixDocType(document);
185
186 if (configuration.getTidyMark())
187 lexer.addGenerator(document);
188 }
189
190 /* ensure presence of initial <?XML version="1.0"?> */
191 if (configuration.getXmlOut() && configuration.getXmlPi())
192 lexer.fixXMLPI(document);
193 }
194 if (!configuration.getQuiet() && document.getContent() != null) {
195 //Report.reportVersion(lexer, inputStreamName, doctype);
196 Report.reportNumWarnings(lexer);
197 }
198 }
199
200 parseWarnings = lexer.warnings;
201 parseErrors = lexer.errors;
202 if (lexer.errors > 0)
203 Report.needsAuthorIntervention();
204 Report.errorSummary(lexer);
205
206 if (!configuration.getOnlyErrors() && lexer.errors == 0) {
207 if (out != null) {
208 PrettyPrint pprint = new PrettyPrint(configuration, out);
209 if (configuration.getXmlTags())
210 pprint.printXMLTree(0, 0, lexer, document);
211 else
212 pprint.printTree(0, 0, lexer, document);
213 pprint.flush();
214 }
215 }
216 return document;
217 }
218
219 /**
220 * Parses InputStream in and returns a DOM Document node.
221 * If out is non-null, pretty prints to OutputStream out.
222 */
223
224 public org.w3c.dom.Document parseDOM(Reader in, PrintWriter out) throws IOException {
225 Node document = parse(in, out);
226 if (document != null)
227 return (org.w3c.dom.Document) document.getAdapter();
228 else
229 return null;
230 }
231
232 /**
233 * Pretty-prints a DOM Document.
234 */
235
236 public void pprint(org.w3c.dom.Document doc, PrintWriter out) {
237 if (!(doc instanceof DOMDocumentImpl)) {
238 return;
239 }
240 Node document = ((DOMDocumentImpl) doc).getDocument();
241 if (out != null) {
242 PrettyPrint pprint = new PrettyPrint(configuration, out);
243 if (configuration.getXmlTags())
244 pprint.printXMLTree(0, 0, null, document);
245 else
246 pprint.printTree(0, 0, null, document);
247 pprint.flush();
248 }
249 }
250
251 ////////////////////////////////////////////////////////////////////////
252
253 /**
254 * Creates an empty DOM Document.
255 */
256
257 public static org.w3c.dom.Document createEmptyDocument() {
258 Node document = new Node(Node.RootNode, new CharBuffer(0), 0, 0);
259 Node node = new Node(Node.StartTag, new CharBuffer(0), 0, 0, "html", new TagTable());
260 if (document != null && node != null) {
261 Node.insertNodeAtStart(document, node);
262 return (org.w3c.dom.Document) document.getAdapter();
263 } else {
264 return null;
265 }
266 }
267
268 /**
269 * Command line interface to parser and pretty printer.
270 */
271
272 public static void main(String[] argv) {
273 int totalerrors = 0;
274 int totalwarnings = 0;
275 String file;
276 int argc = argv.length + 1;
277 int argIndex = 0;
278 String arg;
279
280 TidyHTMLParser tidy = new TidyHTMLParser(new TidyConfiguration(null, null, null));
281 TidyConfiguration configuration = tidy.getConfiguration();
282
283 /* read command line */
284
285 while (argc > 0) {
286 if (argc > 1 && argv[argIndex].startsWith("-")) {
287 /* support -foo and --foo */
288 arg = argv[argIndex].substring(1);
289
290 if (arg.length() > 0 && arg.charAt(0) == '-')
291 arg = arg.substring(1);
292
293 if (arg.equals("xml"))
294 configuration.setXmlTags(true);
295 else if (arg.equals("asxml") || arg.equals("asxhtml"))
296 configuration.setXHTML(true);
297 else if (arg.equals("noindent")) {
298 configuration.setIndentContent(false);
299 configuration.setSmartIndent(false);
300 } else if (arg.equals("omit"))
301 configuration.setHideEndTags(true);
302 else if (arg.equals("upper"))
303 configuration.setUpperCaseTags(true);
304 else if (arg.equals("clean"))
305 configuration.setMakeClean(true);
306 else if (arg.equals("raw"))
307 configuration.setCharEncoding(Configuration.RAW);
308 else if (arg.equals("ascii"))
309 configuration.setCharEncoding(Configuration.ASCII);
310 else if (arg.equals("latin1"))
311 configuration.setCharEncoding(Configuration.LATIN1);
312 else if (arg.equals("utf8"))
313 configuration.setCharEncoding(Configuration.UTF8);
314 else if (arg.equals("iso2022"))
315 configuration.setCharEncoding(Configuration.ISO2022);
316 else if (arg.equals("mac"))
317 configuration.setCharEncoding(Configuration.MACROMAN);
318 else if (arg.equals("numeric"))
319 configuration.setNumEntities(true);
320 else if (arg.equals("modify"))
321 configuration.setWriteBack(true);
322 else if (arg.equals("change")) /* obsolete */
323 configuration.setWriteBack(true);
324 else if (arg.equals("update")) /* obsolete */
325 configuration.setWriteBack(true);
326 else if (arg.equals("errors"))
327 configuration.setOnlyErrors(true);
328 else if (arg.equals("quiet"))
329 configuration.setQuiet(true);
330 else if (arg.equals("slides"))
331 configuration.setBurstSlides(true);
332 else if (
333 arg.equals("help")
334 || argv[argIndex].charAt(1) == '?'
335 || argv[argIndex].charAt(1) == 'h') {
336 Report.helpText(NAME);
337 System.exit(1);
338 } else if (arg.equals("config")) {
339 if (argc >= 3) {
340 configuration.parseFile(argv[argIndex + 1]);
341 --argc;
342 ++argIndex;
343 }
344 } else if (
345 argv[argIndex].equals("-file")
346 || argv[argIndex].equals("--file")
347 || argv[argIndex].equals("-f")) {
348 if (argc >= 3) {
349 configuration.setErrfile(argv[argIndex + 1]);
350 --argc;
351 ++argIndex;
352 }
353 } else if (
354 argv[argIndex].equals("-wrap")
355 || argv[argIndex].equals("--wrap")
356 || argv[argIndex].equals("-w")) {
357 if (argc >= 3) {
358 configuration.setWraplen(Integer.parseInt(argv[argIndex + 1]));
359 --argc;
360 ++argIndex;
361 }
362 } else if (
363 argv[argIndex].equals("-version")
364 || argv[argIndex].equals("--version")
365 || argv[argIndex].equals("-v")) {
366 Report.showVersion();
367 System.exit(0);
368 } else {
369 String s = argv[argIndex];
370
371 for (int i = 1; i < s.length(); i++) {
372 if (s.charAt(i) == 'i') {
373 configuration.setIndentContent(true);
374 configuration.setSmartIndent(true);
375 } else if (s.charAt(i) == 'o')
376 configuration.setHideEndTags(true);
377 else if (s.charAt(i) == 'u')
378 configuration.setUpperCaseTags(true);
379 else if (s.charAt(i) == 'c')
380 configuration.setMakeClean(true);
381 else if (s.charAt(i) == 'n')
382 configuration.setNumEntities(true);
383 else if (s.charAt(i) == 'm')
384 configuration.setWriteBack(true);
385 else if (s.charAt(i) == 'e')
386 configuration.setOnlyErrors(true);
387 else if (s.charAt(i) == 'q')
388 configuration.setQuiet(true);
389 else
390 Report.unknownOption(s.charAt(i));
391 }
392 }
393 --argc;
394 ++argIndex;
395 continue;
396 }
397
398 if (argc > 1) {
399 file = argv[argIndex];
400 } else {
401 file = "stdin";
402 }
403
404 Report.setWriter(configuration.getErrWriter());
405 try {
406 tidy.parse(file, new PrintWriter(System.out));
407 totalwarnings += tidy.parseWarnings;
408 totalerrors += tidy.parseErrors;
409 } catch (FileNotFoundException fnfe) {
410 Report.unknownFile(NAME, file);
411 } catch (IOException ioe) {
412 Report.unknownFile(NAME, file);
413 }
414
415 --argc;
416 ++argIndex;
417
418 if (argc <= 1)
419 break;
420 }
421
422 if (totalerrors + totalwarnings > 0)
423 Report.generalInfo();
424 configuration.closeErrWriter();
425
426 /* return status can be used by scripts */
427 if (totalerrors > 0)
428 System.exit(2);
429 if (totalwarnings > 0)
430 System.exit(1);
431 /* 0 signifies all is ok */
432 System.exit(0);
433 }
434
435 ////////////////////////////////////////////////////////////////////////
436 }