Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/sitemesh/parser/DOMPageParser.java


1   package com.sitemesh.parser;
2   
3   import com.sitemesh.Page;
4   import java.io.ByteArrayInputStream;
5   import java.io.IOException;
6   import org.openxml.parser.HTMLParser;
7   import org.w3c.dom.*;
8   import org.w3c.dom.html.*;
9   import org.xml.sax.InputSource;
10  import org.xml.sax.SAXException;
11  
12  /**
13   * Implementation of {@link com.sitemesh.PageParser} that builds up a
14   * {@link com.sitemesh.HTMLPage} using DOM.
15   *
16   * <p>This implementation uses <a href="http://www.openxml.org/">OpenXML 1.2</a>
17   * as it is capable of parsing (and beform basic clean-up operations) on
18   * HTML.</p>
19   *
20   * @author <a href="joe@truemesh.com">Joe Walnes</a>
21   * @version $Revision: 1.7 $
22   */
23  public class DOMPageParser extends AbstractPageParser {
24    /**
25     * @label builds
26     * @directed
27     */
28    /*#private DOMPage lnkDOMPage;*/
29  
30    /**
31     * Send data to DOM parser and build new
32     * {@link com.sitemesh.parser.DOMPage}. If the data
33     * cannot be parsed, an {@link com.sitemesh.parser.UnParsedPage}
34     * is returned instead.
35     */
36    public Page parse( byte[] data ) throws IOException {
37      try {
38        HTMLDocument document = parseHTML( data );
39        return new DOMPage( data, document );
40      }
41      catch ( SAXException e ) {
42        return new UnParsedPage( data );
43      }
44    }
45  
46    /**
47     * Parse the data and return a DOM HTMLDocument.
48     */
49    private HTMLDocument parseHTML( byte[] data ) throws IOException, SAXException {
50      HTMLParser parser = new HTMLParser();
51      parser.parse(
52        new InputSource(
53        new ByteArrayInputStream( data ) ) );
54      return ( HTMLDocument )parser.getDocument();
55    }
56  
57  }