Source code: org/htmlparser/scanners/CompositeTagScanner.java
1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/scanners/CompositeTagScanner.java,v 1.2 2004/02/10 13:41:09 woolfel Exp $
2 /*
3 * ====================================================================
4 * Copyright 2002-2004 The Apache Software Foundation.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19
20 // The developers of JMeter and Apache are grateful to the developers
21 // of HTMLParser for giving Apache Software Foundation a non-exclusive
22 // license. The performance benefits of HTMLParser are clear and the
23 // users of JMeter will benefit from the hard work of the HTMLParser
24 // team. For detailed information about HTMLParser, the project is
25 // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26 //
27 // HTMLParser was originally created by Somik Raha in 2000. Since then
28 // a healthy community of users has formed and helped refine the
29 // design so that it is able to tackle the difficult task of parsing
30 // dirty HTML. Derrick Oswald is the current lead developer and was kind
31 // enough to assist JMeter.
32
33 package org.htmlparser.scanners;
34
35 import java.util.HashSet;
36 import java.util.Set;
37
38 import org.htmlparser.Node;
39 import org.htmlparser.NodeReader;
40 import org.htmlparser.parserHelper.CompositeTagScannerHelper;
41 import org.htmlparser.tags.EndTag;
42 import org.htmlparser.tags.Tag;
43 import org.htmlparser.tags.data.CompositeTagData;
44 import org.htmlparser.tags.data.TagData;
45 import org.htmlparser.util.ParserException;
46
47 /**
48 * To create your own scanner that can hold children, create a subclass of this class.
49 * The composite tag scanner can be configured with:<br>
50 * <ul>
51 * <li>Tags which will trigger a match</li>
52 * <li>Tags which when encountered before a legal end tag, should force a correction</li>
53 * <li>Preventing more tags of its own type to appear as children
54 * </ul>
55 * Here are examples of each:<BR>
56 * <B>Tags which will trigger a match</B>
57 * If we wish to recognize <mytag>,
58 * <pre>
59 * MyScanner extends CompositeTagScanner {
60 * private static final String [] MATCH_IDS = { "MYTAG" };
61 * MyScanner() {
62 * super(MATCH_IDS);
63 * }
64 * ...
65 * }
66 * </pre>
67 * <B>Tags which force correction</B>
68 * If we wish to insert end tags if we get a </BODY> or </HTML> without recieving
69 * </mytag>
70 * <pre>
71 * MyScanner extends CompositeTagScanner {
72 * private static final String [] MATCH_IDS = { "MYTAG" };
73 * private static final String [] ENDERS = {};
74 * private static final String [] END_TAG_ENDERS = { "BODY", "HTML" };
75 * MyScanner() {
76 * super(MATCH_IDS, ENDERS, END_TAG_ENDERS, true);
77 * }
78 * ...
79 * }
80 * </pre>
81 * <B>Preventing children of same type</B>
82 * This is useful when you know that a certain tag can never hold children of its own type.
83 * e.g. <FORM> can never have more form tags within it. If it does, it is an error and should
84 * be corrected. The default behavior is to allow nesting.
85 * <pre>
86 * MyScanner extends CompositeTagScanner {
87 * private static final String [] MATCH_IDS = { "FORM" };
88 * private static final String [] ENDERS = {};
89 * private static final String [] END_TAG_ENDERS = { "BODY", "HTML" };
90 * MyScanner() {
91 * super(MATCH_IDS, ENDERS,END_TAG_ENDERS, false);
92 * }
93 * ...
94 * }
95 * </pre>
96 * Inside the scanner, use createTag() to specify what tag needs to be created.
97 */
98 public abstract class CompositeTagScanner extends TagScanner
99 {
100 protected String[] nameOfTagToMatch;
101 private boolean allowSelfChildren;
102 private Set tagEnderSet;
103 private Set endTagEnderSet;
104 private boolean balance_quotes;
105
106 public CompositeTagScanner(String[] nameOfTagToMatch)
107 {
108 this(nameOfTagToMatch, new String[] {
109 });
110 }
111
112 public CompositeTagScanner(String[] nameOfTagToMatch, String[] tagEnders)
113 {
114 this("", nameOfTagToMatch, tagEnders);
115 }
116
117 public CompositeTagScanner(
118 String[] nameOfTagToMatch,
119 String[] tagEnders,
120 boolean allowSelfChildren)
121 {
122 this("", nameOfTagToMatch, tagEnders, allowSelfChildren);
123 }
124
125 public CompositeTagScanner(String filter, String[] nameOfTagToMatch)
126 {
127 this(filter, nameOfTagToMatch, new String[] {
128 }, true);
129 }
130
131 public CompositeTagScanner(
132 String filter,
133 String[] nameOfTagToMatch,
134 String[] tagEnders)
135 {
136 this(filter, nameOfTagToMatch, tagEnders, true);
137 }
138
139 public CompositeTagScanner(
140 String filter,
141 String[] nameOfTagToMatch,
142 String[] tagEnders,
143 boolean allowSelfChildren)
144 {
145 this(filter, nameOfTagToMatch, tagEnders, new String[] {
146 }, allowSelfChildren);
147 }
148
149 public CompositeTagScanner(
150 String filter,
151 String[] nameOfTagToMatch,
152 String[] tagEnders,
153 String[] endTagEnders,
154 boolean allowSelfChildren)
155 {
156 this(
157 filter,
158 nameOfTagToMatch,
159 tagEnders,
160 endTagEnders,
161 allowSelfChildren,
162 false);
163 }
164
165 /**
166 * Constructor specifying all member fields.
167 * @param filter A string that is used to match which tags are to be allowed
168 * to pass through. This can be useful when one wishes to dynamically filter
169 * out all tags except one type which may be programmed later than the parser.
170 * @param nameOfTagToMatch The tag names recognized by this scanner.
171 * @param tagEnders The non-endtag tag names which signal that no closing
172 * end tag was found. For example, encountering <FORM> while
173 * scanning a <A> link tag would mean that no </A> was found
174 * and needs to be corrected.
175 * @param endTagEnders The endtag names which signal that no closing end
176 * tag was found. For example, encountering </HTML> while
177 * scanning a <BODY> tag would mean that no </BODY> was found
178 * and needs to be corrected. These items are not prefixed by a '/'.
179 * @param allowSelfChildren If <code>true</code> a tag of the same name is
180 * allowed within this tag. Used to determine when an endtag is missing.
181 * @param balance_quotes <code>true</code> if scanning string nodes needs to
182 * honour quotes. For example, ScriptScanner defines this <code>true</code>
183 * so that text within <SCRIPT></SCRIPT> ignores tag-like text
184 * within quotes.
185 */
186 public CompositeTagScanner(
187 String filter,
188 String[] nameOfTagToMatch,
189 String[] tagEnders,
190 String[] endTagEnders,
191 boolean allowSelfChildren,
192 boolean balance_quotes)
193 {
194 super(filter);
195 this.nameOfTagToMatch = nameOfTagToMatch;
196 this.allowSelfChildren = allowSelfChildren;
197 this.balance_quotes = balance_quotes;
198 this.tagEnderSet = new HashSet();
199 for (int i = 0; i < tagEnders.length; i++)
200 tagEnderSet.add(tagEnders[i]);
201 this.endTagEnderSet = new HashSet();
202 for (int i = 0; i < endTagEnders.length; i++)
203 endTagEnderSet.add(endTagEnders[i]);
204 }
205
206 public Tag scan(Tag tag, String url, NodeReader reader, String currLine)
207 throws ParserException
208 {
209 CompositeTagScannerHelper helper =
210 new CompositeTagScannerHelper(
211 this,
212 tag,
213 url,
214 reader,
215 currLine,
216 balance_quotes);
217 return helper.scan();
218 }
219
220 /**
221 * Override this method if you wish to create any data structures or do anything
222 * before the start of the scan. This is just after a tag has triggered the scanner
223 * but before the scanner begins its processing.
224 */
225 public void beforeScanningStarts()
226 {
227 }
228
229 /**
230 * This method is called everytime a child to the composite is found. It is useful when we
231 * need to store special children seperately. Though, all children are collected anyway into a node list.
232 */
233 public void childNodeEncountered(Node node)
234 {
235 }
236
237 /**
238 * You must override this method to create the tag of your choice upon successful parsing. Data required
239 * for construction of your tag can be found within tagData and compositeTagData
240 */
241 public abstract Tag createTag(
242 TagData tagData,
243 CompositeTagData compositeTagData)
244 throws ParserException;
245
246 public final boolean isTagToBeEndedFor(Tag tag)
247
248 {
249 boolean isEndTag = tag instanceof EndTag;
250 String tagName = tag.getTagName();
251 if ((isEndTag && endTagEnderSet.contains(tagName))
252 || (!isEndTag && tagEnderSet.contains(tagName)))
253 return true;
254 else
255 return false;
256 }
257
258 public final boolean isAllowSelfChildren()
259 {
260 return allowSelfChildren;
261 }
262
263 /**
264 * Override this method to implement scanner logic that determines if the current scanner is
265 * to be allowed. This is useful when there are rules which dont allow recursive tags of the same
266 * type. @see BulletScanner
267 * @return boolean true/false
268 */
269 public boolean shouldCreateEndTagAndExit()
270 {
271 return false;
272 }
273 }