1 package com.opensymphony.module.sitemesh.parser;
2
3 import com.opensymphony.module.sitemesh.Page;
4 import com.opensymphony.module.sitemesh.PageParser;
5 import com.opensymphony.module.sitemesh.html.HTMLProcessor;
6 import com.opensymphony.module.sitemesh.html.State;
7 import com.opensymphony.module.sitemesh.html.StateTransitionRule;
8 import com.opensymphony.module.sitemesh.html.tokenizer.TagTokenizer;
9 import com.opensymphony.module.sitemesh.html.util.CharArray;
10 import com.opensymphony.module.sitemesh.html.rules.BodyTagRule;
11 import com.opensymphony.module.sitemesh.html.rules.ContentBlockExtractingRule;
12 import com.opensymphony.module.sitemesh.html.rules.FramesetRule;
13 import com.opensymphony.module.sitemesh.html.rules.HeadExtractingRule;
14 import com.opensymphony.module.sitemesh.html.rules.HtmlAttributesRule;
15 import com.opensymphony.module.sitemesh.html.rules.MSOfficeDocumentPropertiesRule;
16 import com.opensymphony.module.sitemesh.html.rules.MetaTagRule;
17 import com.opensymphony.module.sitemesh.html.rules.ParameterExtractingRule;
18 import com.opensymphony.module.sitemesh.html.rules.TitleExtractingRule;
19 import com.opensymphony.module.sitemesh.html.rules.PageBuilder;
20
21 import java.io.IOException;
22
23 /**
24 * <p>Builds an HTMLPage object from an HTML document. This behaves
25 * similarly to the FastPageParser, however it's a complete rewrite that is simpler to add custom features to such as
26 * extraction and transformation of elements.</p>
27 *
 * <p>To customize the rules used, extend this class and override the addUserDefinedRules() method.</p>
29 *
30 * @author Joe Walnes
31 *
32 * @see HTMLProcessor
33 */
34 public class HTMLPageParser implements PageParser {
35
36 public Page parse(char[] data) throws IOException {
37 CharArray head = new CharArray(64);
38 CharArray body = new CharArray(4096);
39 TokenizedHTMLPage page = new TokenizedHTMLPage(data, body, head);
40 HTMLProcessor processor = new HTMLProcessor(data, body);
41 State html = processor.defaultState();
42
43 // Core rules for SiteMesh to be functional.
44 html.addRule(new HeadExtractingRule(head)); // contents of <head>
45 html.addRule(new BodyTagRule(page, body)); // contents of <body>
46 html.addRule(new TitleExtractingRule(page)); // the <title>
47 html.addRule(new FramesetRule(page)); // if the page is a frameset
48
49 // Additional rules - designed to be tweaked.
50 addUserDefinedRules(html, page);
51
52 processor.process();
53 return page;
54 }
55
56 protected void addUserDefinedRules(State html, PageBuilder page) {
57 // Ensure that while in <xml> tag, none of the other rules kick in.
58 // For example <xml><book><title>hello</title></book></xml> should not change the affect the title of the page.
59 State xml = new State();
60 html.addRule(new StateTransitionRule("xml", xml));
61
62 // Useful properties
63 html.addRule(new HtmlAttributesRule(page)); // attributes in <html> element
64 html.addRule(new MetaTagRule(page)); // all <meta> tags
65 html.addRule(new ParameterExtractingRule(page)); // <parameter> blocks
66 html.addRule(new ContentBlockExtractingRule(page)); // <content> blocks
67
68 // Capture properties written to documents by MS Office (author, version, company, etc).
69 // Note: These properties are from the xml state, not the html state.
70 xml.addRule(new MSOfficeDocumentPropertiesRule(page));
71 }
72
73 }