Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/htmlparser/scanners/CompositeTagScanner.java


1   // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/scanners/CompositeTagScanner.java,v 1.2 2004/02/10 13:41:09 woolfel Exp $
2   /*
3    * ====================================================================
4    * Copyright 2002-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   * 
18   */
19  
20  // The developers of JMeter and Apache are grateful to the developers
21  // of HTMLParser for giving Apache Software Foundation a non-exclusive
22  // license. The performance benefits of HTMLParser are clear and the
23  // users of JMeter will benefit from the hard work of the HTMLParser
24  // team. For detailed information about HTMLParser, the project is
25  // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26  //
27  // HTMLParser was originally created by Somik Raha in 2000. Since then
28  // a healthy community of users has formed and helped refine the
29  // design so that it is able to tackle the difficult task of parsing
30  // dirty HTML. Derrick Oswald is the current lead developer and was kind
31  // enough to assist JMeter.
32  
33  package org.htmlparser.scanners;
34  
35  import java.util.HashSet;
36  import java.util.Set;
37  
38  import org.htmlparser.Node;
39  import org.htmlparser.NodeReader;
40  import org.htmlparser.parserHelper.CompositeTagScannerHelper;
41  import org.htmlparser.tags.EndTag;
42  import org.htmlparser.tags.Tag;
43  import org.htmlparser.tags.data.CompositeTagData;
44  import org.htmlparser.tags.data.TagData;
45  import org.htmlparser.util.ParserException;
46  
47  /**
48   * To create your own scanner that can hold children, create a subclass of this class.
49   * The composite tag scanner can be configured with:<br>
50   * <ul>
51   * <li>Tags which will trigger a match</li>
52   * <li>Tags which when encountered before a legal end tag, should force a correction</li>
53   * <li>Preventing more tags of its own type to appear as children 
54   * </ul> 
55   * Here are examples of each:<BR>
56   * <B>Tags which will trigger a match</B>
57   * If we wish to recognize &lt;mytag&gt;,
58   * <pre>
59   * MyScanner extends CompositeTagScanner {
60   *   private static final String [] MATCH_IDS = { "MYTAG" };
61   *   MyScanner() {
62   *    super(MATCH_IDS);
63   *   }
64   *   ...
65   * }
66   * </pre>
67   * <B>Tags which force correction</B>
68   * If we wish to insert end tags if we get a </BODY> or </HTML> without recieving
69   * &lt;/mytag&gt;
70   * <pre>
71   * MyScanner extends CompositeTagScanner {
72   *   private static final String [] MATCH_IDS = { "MYTAG" };
73   *   private static final String [] ENDERS = {};
74   *   private static final String [] END_TAG_ENDERS = { "BODY", "HTML" };
75   *   MyScanner() {
76   *    super(MATCH_IDS, ENDERS, END_TAG_ENDERS, true);
77   *   }
78   *   ...
79   * }
80   * </pre>
81   * <B>Preventing children of same type</B>
82   * This is useful when you know that a certain tag can never hold children of its own type.
83   * e.g. &lt;FORM&gt; can never have more form tags within it. If it does, it is an error and should 
84   * be corrected. The default behavior is to allow nesting.
85   * <pre>
86   * MyScanner extends CompositeTagScanner {
87   *   private static final String [] MATCH_IDS = { "FORM" };
88   *   private static final String [] ENDERS = {};
89   *   private static final String [] END_TAG_ENDERS = { "BODY", "HTML" };
90   *   MyScanner() {
91   *    super(MATCH_IDS, ENDERS,END_TAG_ENDERS, false);
92   *   }
93   *   ...
94   * }
95   * </pre>
96   * Inside the scanner, use createTag() to specify what tag needs to be created.
97   */
98  public abstract class CompositeTagScanner extends TagScanner
99  {
100     protected String[] nameOfTagToMatch;
101     private boolean allowSelfChildren;
102     private Set tagEnderSet;
103     private Set endTagEnderSet;
104     private boolean balance_quotes;
105 
106     public CompositeTagScanner(String[] nameOfTagToMatch)
107     {
108         this(nameOfTagToMatch, new String[] {
109         });
110     }
111 
112     public CompositeTagScanner(String[] nameOfTagToMatch, String[] tagEnders)
113     {
114         this("", nameOfTagToMatch, tagEnders);
115     }
116 
117     public CompositeTagScanner(
118         String[] nameOfTagToMatch,
119         String[] tagEnders,
120         boolean allowSelfChildren)
121     {
122         this("", nameOfTagToMatch, tagEnders, allowSelfChildren);
123     }
124 
125     public CompositeTagScanner(String filter, String[] nameOfTagToMatch)
126     {
127         this(filter, nameOfTagToMatch, new String[] {
128         }, true);
129     }
130 
131     public CompositeTagScanner(
132         String filter,
133         String[] nameOfTagToMatch,
134         String[] tagEnders)
135     {
136         this(filter, nameOfTagToMatch, tagEnders, true);
137     }
138 
139     public CompositeTagScanner(
140         String filter,
141         String[] nameOfTagToMatch,
142         String[] tagEnders,
143         boolean allowSelfChildren)
144     {
145         this(filter, nameOfTagToMatch, tagEnders, new String[] {
146         }, allowSelfChildren);
147     }
148 
149     public CompositeTagScanner(
150         String filter,
151         String[] nameOfTagToMatch,
152         String[] tagEnders,
153         String[] endTagEnders,
154         boolean allowSelfChildren)
155     {
156         this(
157             filter,
158             nameOfTagToMatch,
159             tagEnders,
160             endTagEnders,
161             allowSelfChildren,
162             false);
163     }
164 
165     /**
166      * Constructor specifying all member fields.
167      * @param filter A string that is used to match which tags are to be allowed
168      * to pass through. This can be useful when one wishes to dynamically filter
169      * out all tags except one type which may be programmed later than the parser.
170      * @param nameOfTagToMatch The tag names recognized by this scanner.
171      * @param tagEnders The non-endtag tag names which signal that no closing
172      * end tag was found. For example, encountering &lt;FORM&gt; while
173      * scanning a &lt;A&gt; link tag would mean that no &lt;/A&gt; was found
174      * and needs to be corrected.
175      * @param endTagEnders The endtag names which signal that no closing end
176      * tag was found. For example, encountering &lt;/HTML&gt; while
177      * scanning a &lt;BODY&gt; tag would mean that no &lt;/BODY&gt; was found
178      * and needs to be corrected. These items are not prefixed by a '/'.
179      * @param allowSelfChildren If <code>true</code> a tag of the same name is
180      * allowed within this tag. Used to determine when an endtag is missing.
181      * @param balance_quotes <code>true</code> if scanning string nodes needs to
182      * honour quotes. For example, ScriptScanner defines this <code>true</code>
183      * so that text within &lt;SCRIPT&gt;&lt;/SCRIPT&gt; ignores tag-like text
184      * within quotes.
185      */
186     public CompositeTagScanner(
187         String filter,
188         String[] nameOfTagToMatch,
189         String[] tagEnders,
190         String[] endTagEnders,
191         boolean allowSelfChildren,
192         boolean balance_quotes)
193     {
194         super(filter);
195         this.nameOfTagToMatch = nameOfTagToMatch;
196         this.allowSelfChildren = allowSelfChildren;
197         this.balance_quotes = balance_quotes;
198         this.tagEnderSet = new HashSet();
199         for (int i = 0; i < tagEnders.length; i++)
200             tagEnderSet.add(tagEnders[i]);
201         this.endTagEnderSet = new HashSet();
202         for (int i = 0; i < endTagEnders.length; i++)
203             endTagEnderSet.add(endTagEnders[i]);
204     }
205 
206     public Tag scan(Tag tag, String url, NodeReader reader, String currLine)
207         throws ParserException
208     {
209         CompositeTagScannerHelper helper =
210             new CompositeTagScannerHelper(
211                 this,
212                 tag,
213                 url,
214                 reader,
215                 currLine,
216                 balance_quotes);
217         return helper.scan();
218     }
219 
220     /**
221      * Override this method if you wish to create any data structures or do anything
222      * before the start of the scan. This is just after a tag has triggered the scanner
223      * but before the scanner begins its processing. 
224      */
225     public void beforeScanningStarts()
226     {
227     }
228 
229     /**
230      * This method is called everytime a child to the composite is found. It is useful when we 
231      * need to store special children seperately. Though, all children are collected anyway into a node list.
232      */
233     public void childNodeEncountered(Node node)
234     {
235     }
236 
237     /**
238        * You must override this method to create the tag of your choice upon successful parsing. Data required
239        * for construction of your tag can be found within tagData and compositeTagData
240      */
241     public abstract Tag createTag(
242         TagData tagData,
243         CompositeTagData compositeTagData)
244         throws ParserException;
245 
246     public final boolean isTagToBeEndedFor(Tag tag)
247     
248     {
249         boolean isEndTag = tag instanceof EndTag;
250         String tagName = tag.getTagName();
251         if ((isEndTag && endTagEnderSet.contains(tagName))
252             || (!isEndTag && tagEnderSet.contains(tagName)))
253             return true;
254         else
255             return false;
256     }
257 
258     public final boolean isAllowSelfChildren()
259     {
260         return allowSelfChildren;
261     }
262 
263     /**
264      * Override this method to implement scanner logic that determines if the current scanner is 
265      * to be allowed. This is useful when there are rules which dont allow recursive tags of the same
266      * type. @see BulletScanner
267      * @return boolean true/false
268      */
269     public boolean shouldCreateEndTagAndExit()
270     {
271         return false;
272     }
273 }