com.opensymphony.module.sitemesh.parser
public class: HTMLPageParser [javadoc |
source]
java.lang.Object
com.opensymphony.module.sitemesh.parser.HTMLPageParser
All Implemented Interfaces:
PageParser
Direct Known Subclasses:
DivExtractingPageParser
Builds an HTMLPage object from an HTML document. This behaves
similarly to the FastPageParser; however, it is a complete rewrite designed to make it simpler to add custom features, such as
extraction and transformation of elements.
To customize the rules used, extend this class and override the addUserDefinedRules() method.
| Method from com.opensymphony.module.sitemesh.parser.HTMLPageParser Detail: |
/**
 * Registers the optional, customizable rules on top of the core rules.
 *
 * <p>A separate {@code State} is created for content nested inside an
 * {@code <xml>} tag, and a transition to it is registered on the html state.
 * The property-extracting rules are added to the html state, while the
 * MS Office document properties rule is added to the xml state.
 *
 * <p>The method is {@code protected}, so a subclass can override it to
 * change the set of rules that are registered.
 *
 * @param html the state to which the html-level rules are added
 * @param page the page builder handed to every rule created here
 */
protected void addUserDefinedRules(State html,
        PageBuilder page) {
    // While inside an <xml> tag none of the html rules should fire. For example,
    // <xml><book><title>hello</title></book></xml> should not affect the title of the page.
    final State xmlState = new State();
    final StateTransitionRule enterXml = new StateTransitionRule("xml", xmlState);
    html.addRule(enterXml);

    // Useful properties gathered while in the html state.
    html.addRule(new HtmlAttributesRule(page));        // attributes on the <html> element
    html.addRule(new MetaTagRule(page));               // every <meta> tag
    html.addRule(new ParameterExtractingRule(page));   // <parameter> blocks
    html.addRule(new ContentBlockExtractingRule(page)); // <content> blocks

    // Properties written into documents by MS Office (author, version, company, etc).
    // Note: this rule is attached to the xml state, not the html state.
    final MSOfficeDocumentPropertiesRule officeProperties = new MSOfficeDocumentPropertiesRule(page);
    xmlState.addRule(officeProperties);
}
|
/**
 * Parses the supplied HTML characters into a page object.
 *
 * <p>Two {@code CharArray} buffers are allocated (one for the head, one for
 * the body) and shared between the resulting {@code TokenizedHTMLPage}, the
 * {@code HTMLProcessor} and the extraction rules. The core rules are added
 * to the processor's default state first, then
 * {@link #addUserDefinedRules(State, PageBuilder)} contributes the
 * customizable ones, and finally the processor is run.
 *
 * @param data the characters of the HTML document to parse
 * @return the {@code TokenizedHTMLPage} built for {@code data}
 * @throws IOException declared by the signature; presumably raised while
 *         the processor runs (not verifiable from this method alone)
 */
public Page parse(char[] data) throws IOException {
    final CharArray headBuffer = new CharArray(64);
    final CharArray bodyBuffer = new CharArray(4096);

    final TokenizedHTMLPage tokenizedPage = new TokenizedHTMLPage(data, bodyBuffer, headBuffer);
    final HTMLProcessor htmlProcessor = new HTMLProcessor(data, bodyBuffer);
    final State defaultState = htmlProcessor.defaultState();

    // Core rules: SiteMesh needs these in order to function.
    defaultState.addRule(new HeadExtractingRule(headBuffer));          // contents of <head>
    defaultState.addRule(new BodyTagRule(tokenizedPage, bodyBuffer));  // contents of <body>
    defaultState.addRule(new TitleExtractingRule(tokenizedPage));      // the <title>
    defaultState.addRule(new FramesetRule(tokenizedPage));             // whether the page is a frameset

    // Additional rules, intended to be tweaked by subclasses.
    addUserDefinedRules(defaultState, tokenizedPage);

    htmlProcessor.process();
    return tokenizedPage;
}
|