Source code: com/sitemesh/parser/DOMPageParser.java
1 package com.sitemesh.parser;
2
3 import com.sitemesh.Page;
4 import java.io.ByteArrayInputStream;
5 import java.io.IOException;
6 import org.openxml.parser.HTMLParser;
7 import org.w3c.dom.*;
8 import org.w3c.dom.html.*;
9 import org.xml.sax.InputSource;
10 import org.xml.sax.SAXException;
11
12 /**
13 * Implementation of {@link com.sitemesh.PageParser} that builds up a
14 * {@link com.sitemesh.HTMLPage} using DOM.
15 *
16 * <p>This implementation uses <a href="http://www.openxml.org/">OpenXML 1.2</a>
17 * as it is capable of parsing (and performing basic clean-up operations on)
18 * HTML.</p>
19 *
20 * @author <a href="mailto:joe@truemesh.com">Joe Walnes</a>
21 * @version $Revision: 1.7 $
22 */
23 public class DOMPageParser extends AbstractPageParser {
24 /**
25 * @label builds
26 * @directed
27 */
28 /*#private DOMPage lnkDOMPage;*/
29
30 /**
31 * Send data to DOM parser and build new
32 * {@link com.sitemesh.parser.DOMPage}. If the data
33 * cannot be parsed, an {@link com.sitemesh.parser.UnParsedPage}
34 * is returned instead.
35 */
36 public Page parse( byte[] data ) throws IOException {
37 try {
38 HTMLDocument document = parseHTML( data );
39 return new DOMPage( data, document );
40 }
41 catch ( SAXException e ) {
42 return new UnParsedPage( data );
43 }
44 }
45
46 /**
47 * Parse the data and return a DOM HTMLDocument.
48 */
49 private HTMLDocument parseHTML( byte[] data ) throws IOException, SAXException {
50 HTMLParser parser = new HTMLParser();
51 parser.parse(
52 new InputSource(
53 new ByteArrayInputStream( data ) ) );
54 return ( HTMLDocument )parser.getDocument();
55 }
56
57 }