Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/htmlparser/tags/Tag.java


1   // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/tags/Tag.java,v 1.2.2.1 2004/10/24 01:22:59 sebb Exp $
2   /*
3    * ====================================================================
4    * Copyright 2002-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   * 
18   */
19  
20  // The developers of JMeter and Apache are grateful to the developers
21  // of HTMLParser for giving Apache Software Foundation a non-exclusive
22  // license. The performance benefits of HTMLParser are clear and the
23  // users of JMeter will benefit from the hard work of the HTMLParser
24  // team. For detailed information about HTMLParser, the project is
25  // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26  //
27  // HTMLParser was originally created by Somik Raha in 2000. Since then
28  // a healthy community of users has formed and helped refine the
29  // design so that it is able to tackle the difficult task of parsing
30  // dirty HTML. Derrick Oswald is the current lead developer and was kind
31  // enough to assist JMeter.
32  
33  package org.htmlparser.tags;
34  
35  import java.util.Enumeration;
36  import java.util.HashSet;
37  import java.util.Hashtable;
38  import java.util.Map;
39  
40  import org.htmlparser.Node;
41  import org.htmlparser.NodeReader;
42  import org.htmlparser.parserHelper.AttributeParser;
43  import org.htmlparser.parserHelper.TagParser;
44  import org.htmlparser.scanners.TagScanner;
45  import org.htmlparser.tags.data.TagData;
46  import org.htmlparser.util.NodeList;
47  import org.htmlparser.util.ParserException;
48  import org.htmlparser.visitors.NodeVisitor;
49  
50  /**
51   * Tag represents a generic tag. This class allows users to register specific
52   * tag scanners, which can identify links, or image references. This tag asks the
53   * scanners to run over the text, and identify. It can be used to dynamically
54   * configure a parser.
55   * @author Kaarle Kaila 23.10.2001
56   */
57  public class Tag extends Node
58  {
59      public static final String TYPE = "TAG";
60      /**
61       * Constant used as value for the value of the tag name
62       * in parseParameters  (Kaarle Kaila 3.8.2001)
63       */
64      public final static String TAGNAME = "$<TAGNAME>$";
65      public final static String EMPTYTAG = "$<EMPTYTAG>$";
66      private final static int TAG_BEFORE_PARSING_STATE = 1;
67      private final static int TAG_BEGIN_PARSING_STATE = 2;
68      private final static int TAG_FINISHED_PARSING_STATE = 3;
69      private final static int TAG_ILLEGAL_STATE = 4;
70      private final static int TAG_IGNORE_DATA_STATE = 5;
71      private final static int TAG_IGNORE_BEGIN_TAG_STATE = 6;
72      private final static String EMPTY_STRING = "";
73  
74      private static AttributeParser paramParser = new AttributeParser();
75      private static TagParser tagParser;
76      /**
77       * Tag contents will have the contents of the comment tag.
78       */
79      protected StringBuffer tagContents;
80      private boolean emptyXmlTag = false;
81      /**
82       * tag parameters parsed into this hashtable
83       * not implemented yet
84       * added by Kaarle Kaila 23.10.2001
85       */
86      protected Hashtable attributes = null;
87  
88      /**
89       * Scanner associated with this tag (useful for extraction of filtering data from a
90       * HTML node)
91       */
92      protected TagScanner thisScanner = null;
93      private java.lang.String tagLine;
94  
95      /**
96       * The combined text of all the lines spanned by this tag
97       */
98      private String[] tagLines;
99  
100     /**
101      * The line number on which this tag starts
102      */
103     private int startLine;
104 
105     /**
106      * Set of tags that breaks the flow.
107      */
108     protected static HashSet breakTags;
109     static {
110         breakTags = new HashSet(30);
111         breakTags.add("BLOCKQUOTE");
112         breakTags.add("BODY");
113         breakTags.add("BR");
114         breakTags.add("CENTER");
115         breakTags.add("DD");
116         breakTags.add("DIR");
117         breakTags.add("DIV");
118         breakTags.add("DL");
119         breakTags.add("DT");
120         breakTags.add("FORM");
121         breakTags.add("H1");
122         breakTags.add("H2");
123         breakTags.add("H3");
124         breakTags.add("H4");
125         breakTags.add("H5");
126         breakTags.add("H6");
127         breakTags.add("HEAD");
128         breakTags.add("HR");
129         breakTags.add("HTML");
130         breakTags.add("ISINDEX");
131         breakTags.add("LI");
132         breakTags.add("MENU");
133         breakTags.add("NOFRAMES");
134         breakTags.add("OL");
135         breakTags.add("P");
136         breakTags.add("PRE");
137         breakTags.add("TD");
138         breakTags.add("TH");
139         breakTags.add("TITLE");
140         breakTags.add("UL");
141     }
142 
143     /**
144      * Set the Tag with the beginning posn, ending posn and tag contents (in
145      * a tagData object.
146      * @param tagData The data for this tag
147      */
148     public Tag(TagData tagData)
149     {
150         super(tagData.getTagBegin(), tagData.getTagEnd());
151         this.startLine = tagData.getStartLine();
152         this.tagContents = new StringBuffer();
153         this.tagContents.append(tagData.getTagContents());
154         this.tagLine = tagData.getTagLine();
155         this.tagLines = new String[] { tagData.getTagLine()};
156         this.emptyXmlTag = tagData.isEmptyXmlTag();
157     }
158 
159     public void append(char ch)
160     {
161         tagContents.append(ch);
162     }
163 
164     public void append(String ch)
165     {
166         tagContents.append(ch);
167     }
168 
169     /**
170      * Locate the tag withing the input string, by parsing from the given position
171      * @param reader HTML reader to be provided so as to allow reading of next line
172      * @param input Input String
173      * @param position Position to start parsing from
174      */
175     public static Tag find(NodeReader reader, String input, int position)
176     {
177         return tagParser.find(reader, input, position);
178     }
179 
180     /**
181      * This method is not to be called by any scanner or tag. It is
182      * an expensive method, hence it has been made private. However,
183      * there might be some circumstances when a scanner wishes to force
184      * parsing of attributes over and above what has already been parsed.
185      * To make the choice clear - we have a method - redoParseAttributes(),
186      * which can be used.
187      * @return Hashtable
188      */
189     private Hashtable parseAttributes()
190     {
191         return paramParser.parseAttributes(this);
192     }
193 
194     /**
195      * In case the tag is parsed at the scan method this will return value of a
196      * parameter not implemented yet
197      * @param name of parameter
198      */
199     public String getAttribute(String name)
200     {
201         return (String) getAttributes().get(name.toUpperCase());
202     }
203 
204     /**
205      * Set attribute with given key, value pair.
206      * @param key
207      * @param value
208      */
209     public void setAttribute(String key, String value)
210     {
211         attributes.put(key, value);
212     }
213 
214     /**
215      * In case the tag is parsed at the scan method this will return value of a
216      * parameter not implemented yet
217      * @param name of parameter
218      * @deprecated use getAttribute instead
219      */
220     public String getParameter(String name)
221     {
222         return (String) getAttributes().get(name.toUpperCase());
223     }
224 
225     /**
226      * Gets the attributes in the tag.
227      * @return Returns a Hashtable of attributes
228      */
229     public Hashtable getAttributes()
230     {
231         if (attributes == null)
232         {
233             attributes = parseAttributes();
234         }
235         return attributes;
236     }
237 
238     public String getTagName()
239     {
240         return (String) getAttributes().get(TAGNAME);
241     }
242 
243     /**
244      * Returns the line where the tag was found
245      * @return java.lang.String
246      */
247     public String getTagLine()
248     {
249         return tagLine;
250     }
251 
252     /**
253      * Returns the combined text of all the lines spanned by this tag
254      * @return java.lang.String
255      */
256     public String[] getTagLines()
257     {
258         return tagLines;
259     }
260 
261     /**
262      * Return the text contained in this tag
263      */
264     public String getText()
265     {
266         return tagContents.toString();
267     }
268 
269     /**
270      * Return the scanner associated with this tag.
271      */
272     public TagScanner getThisScanner()
273     {
274         return thisScanner;
275     }
276 
277     /**
278      * Extract the first word from the given string.
279      * Words are delimited by whitespace or equals signs.
280      * @param s The string to get the word from.
281      * @return The first word.
282      */
283     public static String extractWord(String s)
284     {
285         int length;
286         boolean parse;
287         char ch;
288         StringBuffer ret;
289 
290         length = s.length();
291         ret = new StringBuffer(length);
292         parse = true;
293         for (int i = 0; i < length && parse; i++)
294         {
295             ch = s.charAt(i);
296             if (Character.isWhitespace(ch) || ch == '=')
297                 parse = false;
298             else
299                 ret.append(Character.toUpperCase(ch));
300         }
301 
302         return (ret.toString());
303     }
304 
305     /**
306      * Scan the tag to see using the registered scanners, and attempt identification.
307      * @param url URL at which HTML page is located
308      * @param reader The NodeReader that is to be used for reading the url
309      */
310     public Node scan(Map scanners, String url, NodeReader reader)
311         throws ParserException
312     {
313         if (tagContents.length() == 0)
314             return this;
315         try
316         {
317             boolean found = false;
318             Node retVal = null;
319             // Find the first word in the scanners
320             String firstWord = extractWord(tagContents.toString());
321             // Now, get the scanner associated with this.
322             TagScanner scanner = (TagScanner) scanners.get(firstWord);
323 
324             // Now do a deep check
325             if (scanner != null
326                 && scanner.evaluate(
327                     tagContents.toString(),
328                     reader.getPreviousOpenScanner()))
329             {
330                 found = true;
331                 TagScanner save;
332                 save = reader.getPreviousOpenScanner();
333                 reader.setPreviousOpenScanner(scanner);
334                 retVal = scanner.createScannedNode(this, url, reader, tagLine);
335                 reader.setPreviousOpenScanner(save);
336             }
337 
338             if (!found)
339                 return this;
340             else
341             {
342                 return retVal;
343             }
344         }
345         catch (Exception e)
346         {
347             String errorMsg;
348             if (tagContents != null)
349                 errorMsg = tagContents.toString();
350             else
351                 errorMsg = "null";
352             throw new ParserException(
353                 "Tag.scan() : Error while scanning tag, tag contents = "
354                     + errorMsg
355                     + ", tagLine = "
356                     + tagLine,
357                 e);
358         }
359     }
360 
361     /**
362      * Sets the parsed.
363      * @param parsed The parsed to set
364      */
365     public void setAttributes(Hashtable attributes)
366     {
367         this.attributes = attributes;
368     }
369 
370     /**
371      * Sets the nodeBegin.
372      * @param nodeBegin The nodeBegin to set
373      */
374     public void setTagBegin(int tagBegin)
375     {
376         this.nodeBegin = tagBegin;
377     }
378 
379     /**
380      * Gets the nodeBegin.
381      * @return The nodeBegin value.
382      */
383     public int getTagBegin()
384     {
385         return (nodeBegin);
386     }
387 
388     /**
389      * Sets the nodeEnd.
390      * @param nodeEnd The nodeEnd to set
391      */
392     public void setTagEnd(int tagEnd)
393     {
394         this.nodeEnd = tagEnd;
395     }
396 
397     /**
398      * Gets the nodeEnd.
399      * @return The nodeEnd value.
400      */
401     public int getTagEnd()
402     {
403         return (nodeEnd);
404     }
405 
406     /**
407      * Gets the line number on which this tag starts.
408      * @return the start line number
409      */
410     public int getTagStartLine()
411     {
412         return startLine;
413     }
414 
415     /**
416      * Gets the line number on which this tag ends.
417      * @return the end line number
418      */
419     public int getTagEndLine()
420     {
421         return startLine + tagLines.length - 1;
422     }
423 
424     public void setTagLine(java.lang.String newTagLine)
425     {
426         tagLine = newTagLine;
427 
428         // Note: Incur the overhead of resizing each time (versus
429         // preallocating a larger array), since the average tag
430         // generally doesn't span multiple lines
431         String[] newTagLines = new String[tagLines.length + 1];
432         for (int i = 0; i < tagLines.length; i++)
433             newTagLines[i] = tagLines[i];
434         newTagLines[tagLines.length] = newTagLine;
435         tagLines = newTagLines;
436     }
437 
438     public void setText(String text)
439     {
440         tagContents = new StringBuffer(text);
441     }
442 
443     public void setThisScanner(TagScanner scanner)
444     {
445         thisScanner = scanner;
446     }
447 
448     public String toPlainTextString()
449     {
450         return EMPTY_STRING;
451     }
452 
453     /**
454      * A call to a tag's toHTML() method will render it in HTML
455      * Most tags that do not have children and inherit from Tag,
456      * do not need to override toHTML().
457      * @see org.htmlparser.Node#toHTML()
458      */
459     public String toHtml()
460     {
461         StringBuffer sb = new StringBuffer();
462         sb.append("<");
463         sb.append(getTagName());
464         if (containsMoreThanOneKey())
465             sb.append(" ");
466         String key, value;
467         String empty = null;
468         int i = 0;
469         for (Enumeration e = attributes.keys(); e.hasMoreElements();)
470         {
471             key = (String) e.nextElement();
472             i++;
473             if (!key.equals(TAGNAME))
474             {
475                 if (key.equals(EMPTYTAG))
476                 {
477                     empty = "/";
478                 }
479                 else
480                 {
481                     value = getAttribute(key);
482                     sb.append(key + "=\"" + value + "\"");
483                     if (i < attributes.size())
484                         sb.append(" ");
485                 }
486             }
487         }
488         if (empty != null)
489             sb.append(empty);
490         if (isEmptyXmlTag())
491             sb.append("/");
492         sb.append(">");
493         return sb.toString();
494     }
495 
496     private boolean containsMoreThanOneKey()
497     {
498         return attributes.keySet().size() > 1;
499     }
500 
501     /**
502      * Print the contents of the tag
503      */
504     public String toString()
505     {
506         return "Begin Tag : "
507             + tagContents
508             + "; begins at : "
509             + elementBegin()
510             + "; ends at : "
511             + elementEnd();
512     }
513 
514     /**
515      * Sets the tagParser.
516      * @param tagParser The tagParser to set
517      */
518     public static void setTagParser(TagParser tagParser)
519     {
520         Tag.tagParser = tagParser;
521     }
522 
523     /**
524      * Determines if the given tag breaks the flow of text.
525      * @return <code>true</code> if following text would start on a new line,
526      * <code>false</code> otherwise.
527      */
528     public boolean breaksFlow()
529     {
530         return (breakTags.contains(getText().toUpperCase()));
531     }
532 
533     /**
534      * This method verifies that the current tag matches the provided
535      * filter. The match is based on the string object and not its contents,
536      * so ensure that you are using static final filter strings provided
537      * in the tag classes.
538      * @see org.htmlparser.Node#collectInto(NodeList, String)
539      */
540     public void collectInto(NodeList collectionList, String filter)
541     {
542         if (thisScanner != null && thisScanner.getFilter().equals(filter))
543             collectionList.add(this);
544     }
545 
546     /**
547      * Returns table of attributes in the tag
548      * @return Hashtable
549      * @deprecated This method is deprecated. Use getAttributes() instead.
550      */
551     public Hashtable getParsed()
552     {
553         return attributes;
554     }
555 
556     /**
557      * Sometimes, a scanner may need to request a re-evaluation of the
558      * attributes in a tag. This may happen when there is some correction
559      * activity. An example of its usage can be found in ImageTag.
560      * <br>
561      * <B>Note:<B> This is an intensive task, hence call only when
562      * really necessary
563      * @return Hashtable
564      */
565     public Hashtable redoParseAttributes()
566     {
567         return parseAttributes();
568     }
569 
570     public void accept(NodeVisitor visitor)
571     {
572         visitor.visitTag(this);
573     }
574 
575     public String getType()
576     {
577         return TYPE;
578     }
579 
580     /**
581      * Is this an empty xml tag of the form<br>
582      * &lt;tag/&gt; 
583      * @return boolean
584      */
585     public boolean isEmptyXmlTag()
586     {
587         return emptyXmlTag;
588     }
589 
590     public void setEmptyXmlTag(boolean emptyXmlTag)
591     {
592         this.emptyXmlTag = emptyXmlTag;
593     }
594 
595 }