Source code: org/htmlparser/tags/Tag.java
1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/tags/Tag.java,v 1.2.2.1 2004/10/24 01:22:59 sebb Exp $
2 /*
3 * ====================================================================
4 * Copyright 2002-2004 The Apache Software Foundation.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19
20 // The developers of JMeter and Apache are grateful to the developers
21 // of HTMLParser for giving Apache Software Foundation a non-exclusive
22 // license. The performance benefits of HTMLParser are clear and the
23 // users of JMeter will benefit from the hard work of the HTMLParser
24 // team. For detailed information about HTMLParser, the project is
25 // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26 //
27 // HTMLParser was originally created by Somik Raha in 2000. Since then
28 // a healthy community of users has formed and helped refine the
29 // design so that it is able to tackle the difficult task of parsing
30 // dirty HTML. Derrick Oswald is the current lead developer and was kind
31 // enough to assist JMeter.
32
33 package org.htmlparser.tags;
34
35 import java.util.Enumeration;
36 import java.util.HashSet;
37 import java.util.Hashtable;
38 import java.util.Map;
39
40 import org.htmlparser.Node;
41 import org.htmlparser.NodeReader;
42 import org.htmlparser.parserHelper.AttributeParser;
43 import org.htmlparser.parserHelper.TagParser;
44 import org.htmlparser.scanners.TagScanner;
45 import org.htmlparser.tags.data.TagData;
46 import org.htmlparser.util.NodeList;
47 import org.htmlparser.util.ParserException;
48 import org.htmlparser.visitors.NodeVisitor;
49
50 /**
51 * Tag represents a generic tag. This class allows users to register specific
52 * tag scanners, which can identify links, or image references. This tag asks the
53 * scanners to run over the text, and identify. It can be used to dynamically
54 * configure a parser.
55 * @author Kaarle Kaila 23.10.2001
56 */
57 public class Tag extends Node
58 {
59 public static final String TYPE = "TAG";
60 /**
61 * Constant used as value for the value of the tag name
62 * in parseParameters (Kaarle Kaila 3.8.2001)
63 */
64 public final static String TAGNAME = "$<TAGNAME>$";
65 public final static String EMPTYTAG = "$<EMPTYTAG>$";
66 private final static int TAG_BEFORE_PARSING_STATE = 1;
67 private final static int TAG_BEGIN_PARSING_STATE = 2;
68 private final static int TAG_FINISHED_PARSING_STATE = 3;
69 private final static int TAG_ILLEGAL_STATE = 4;
70 private final static int TAG_IGNORE_DATA_STATE = 5;
71 private final static int TAG_IGNORE_BEGIN_TAG_STATE = 6;
72 private final static String EMPTY_STRING = "";
73
74 private static AttributeParser paramParser = new AttributeParser();
75 private static TagParser tagParser;
76 /**
77 * Tag contents will have the contents of the comment tag.
78 */
79 protected StringBuffer tagContents;
80 private boolean emptyXmlTag = false;
81 /**
82 * tag parameters parsed into this hashtable
83 * not implemented yet
84 * added by Kaarle Kaila 23.10.2001
85 */
86 protected Hashtable attributes = null;
87
88 /**
89 * Scanner associated with this tag (useful for extraction of filtering data from a
90 * HTML node)
91 */
92 protected TagScanner thisScanner = null;
93 private java.lang.String tagLine;
94
95 /**
96 * The combined text of all the lines spanned by this tag
97 */
98 private String[] tagLines;
99
100 /**
101 * The line number on which this tag starts
102 */
103 private int startLine;
104
105 /**
106 * Set of tags that breaks the flow.
107 */
108 protected static HashSet breakTags;
109 static {
110 breakTags = new HashSet(30);
111 breakTags.add("BLOCKQUOTE");
112 breakTags.add("BODY");
113 breakTags.add("BR");
114 breakTags.add("CENTER");
115 breakTags.add("DD");
116 breakTags.add("DIR");
117 breakTags.add("DIV");
118 breakTags.add("DL");
119 breakTags.add("DT");
120 breakTags.add("FORM");
121 breakTags.add("H1");
122 breakTags.add("H2");
123 breakTags.add("H3");
124 breakTags.add("H4");
125 breakTags.add("H5");
126 breakTags.add("H6");
127 breakTags.add("HEAD");
128 breakTags.add("HR");
129 breakTags.add("HTML");
130 breakTags.add("ISINDEX");
131 breakTags.add("LI");
132 breakTags.add("MENU");
133 breakTags.add("NOFRAMES");
134 breakTags.add("OL");
135 breakTags.add("P");
136 breakTags.add("PRE");
137 breakTags.add("TD");
138 breakTags.add("TH");
139 breakTags.add("TITLE");
140 breakTags.add("UL");
141 }
142
143 /**
144 * Set the Tag with the beginning posn, ending posn and tag contents (in
145 * a tagData object.
146 * @param tagData The data for this tag
147 */
148 public Tag(TagData tagData)
149 {
150 super(tagData.getTagBegin(), tagData.getTagEnd());
151 this.startLine = tagData.getStartLine();
152 this.tagContents = new StringBuffer();
153 this.tagContents.append(tagData.getTagContents());
154 this.tagLine = tagData.getTagLine();
155 this.tagLines = new String[] { tagData.getTagLine()};
156 this.emptyXmlTag = tagData.isEmptyXmlTag();
157 }
158
159 public void append(char ch)
160 {
161 tagContents.append(ch);
162 }
163
164 public void append(String ch)
165 {
166 tagContents.append(ch);
167 }
168
169 /**
170 * Locate the tag withing the input string, by parsing from the given position
171 * @param reader HTML reader to be provided so as to allow reading of next line
172 * @param input Input String
173 * @param position Position to start parsing from
174 */
175 public static Tag find(NodeReader reader, String input, int position)
176 {
177 return tagParser.find(reader, input, position);
178 }
179
180 /**
181 * This method is not to be called by any scanner or tag. It is
182 * an expensive method, hence it has been made private. However,
183 * there might be some circumstances when a scanner wishes to force
184 * parsing of attributes over and above what has already been parsed.
185 * To make the choice clear - we have a method - redoParseAttributes(),
186 * which can be used.
187 * @return Hashtable
188 */
189 private Hashtable parseAttributes()
190 {
191 return paramParser.parseAttributes(this);
192 }
193
194 /**
195 * In case the tag is parsed at the scan method this will return value of a
196 * parameter not implemented yet
197 * @param name of parameter
198 */
199 public String getAttribute(String name)
200 {
201 return (String) getAttributes().get(name.toUpperCase());
202 }
203
204 /**
205 * Set attribute with given key, value pair.
206 * @param key
207 * @param value
208 */
209 public void setAttribute(String key, String value)
210 {
211 attributes.put(key, value);
212 }
213
214 /**
215 * In case the tag is parsed at the scan method this will return value of a
216 * parameter not implemented yet
217 * @param name of parameter
218 * @deprecated use getAttribute instead
219 */
220 public String getParameter(String name)
221 {
222 return (String) getAttributes().get(name.toUpperCase());
223 }
224
225 /**
226 * Gets the attributes in the tag.
227 * @return Returns a Hashtable of attributes
228 */
229 public Hashtable getAttributes()
230 {
231 if (attributes == null)
232 {
233 attributes = parseAttributes();
234 }
235 return attributes;
236 }
237
238 public String getTagName()
239 {
240 return (String) getAttributes().get(TAGNAME);
241 }
242
243 /**
244 * Returns the line where the tag was found
245 * @return java.lang.String
246 */
247 public String getTagLine()
248 {
249 return tagLine;
250 }
251
252 /**
253 * Returns the combined text of all the lines spanned by this tag
254 * @return java.lang.String
255 */
256 public String[] getTagLines()
257 {
258 return tagLines;
259 }
260
261 /**
262 * Return the text contained in this tag
263 */
264 public String getText()
265 {
266 return tagContents.toString();
267 }
268
269 /**
270 * Return the scanner associated with this tag.
271 */
272 public TagScanner getThisScanner()
273 {
274 return thisScanner;
275 }
276
277 /**
278 * Extract the first word from the given string.
279 * Words are delimited by whitespace or equals signs.
280 * @param s The string to get the word from.
281 * @return The first word.
282 */
283 public static String extractWord(String s)
284 {
285 int length;
286 boolean parse;
287 char ch;
288 StringBuffer ret;
289
290 length = s.length();
291 ret = new StringBuffer(length);
292 parse = true;
293 for (int i = 0; i < length && parse; i++)
294 {
295 ch = s.charAt(i);
296 if (Character.isWhitespace(ch) || ch == '=')
297 parse = false;
298 else
299 ret.append(Character.toUpperCase(ch));
300 }
301
302 return (ret.toString());
303 }
304
305 /**
306 * Scan the tag to see using the registered scanners, and attempt identification.
307 * @param url URL at which HTML page is located
308 * @param reader The NodeReader that is to be used for reading the url
309 */
310 public Node scan(Map scanners, String url, NodeReader reader)
311 throws ParserException
312 {
313 if (tagContents.length() == 0)
314 return this;
315 try
316 {
317 boolean found = false;
318 Node retVal = null;
319 // Find the first word in the scanners
320 String firstWord = extractWord(tagContents.toString());
321 // Now, get the scanner associated with this.
322 TagScanner scanner = (TagScanner) scanners.get(firstWord);
323
324 // Now do a deep check
325 if (scanner != null
326 && scanner.evaluate(
327 tagContents.toString(),
328 reader.getPreviousOpenScanner()))
329 {
330 found = true;
331 TagScanner save;
332 save = reader.getPreviousOpenScanner();
333 reader.setPreviousOpenScanner(scanner);
334 retVal = scanner.createScannedNode(this, url, reader, tagLine);
335 reader.setPreviousOpenScanner(save);
336 }
337
338 if (!found)
339 return this;
340 else
341 {
342 return retVal;
343 }
344 }
345 catch (Exception e)
346 {
347 String errorMsg;
348 if (tagContents != null)
349 errorMsg = tagContents.toString();
350 else
351 errorMsg = "null";
352 throw new ParserException(
353 "Tag.scan() : Error while scanning tag, tag contents = "
354 + errorMsg
355 + ", tagLine = "
356 + tagLine,
357 e);
358 }
359 }
360
361 /**
362 * Sets the parsed.
363 * @param parsed The parsed to set
364 */
365 public void setAttributes(Hashtable attributes)
366 {
367 this.attributes = attributes;
368 }
369
370 /**
371 * Sets the nodeBegin.
372 * @param nodeBegin The nodeBegin to set
373 */
374 public void setTagBegin(int tagBegin)
375 {
376 this.nodeBegin = tagBegin;
377 }
378
379 /**
380 * Gets the nodeBegin.
381 * @return The nodeBegin value.
382 */
383 public int getTagBegin()
384 {
385 return (nodeBegin);
386 }
387
388 /**
389 * Sets the nodeEnd.
390 * @param nodeEnd The nodeEnd to set
391 */
392 public void setTagEnd(int tagEnd)
393 {
394 this.nodeEnd = tagEnd;
395 }
396
397 /**
398 * Gets the nodeEnd.
399 * @return The nodeEnd value.
400 */
401 public int getTagEnd()
402 {
403 return (nodeEnd);
404 }
405
406 /**
407 * Gets the line number on which this tag starts.
408 * @return the start line number
409 */
410 public int getTagStartLine()
411 {
412 return startLine;
413 }
414
415 /**
416 * Gets the line number on which this tag ends.
417 * @return the end line number
418 */
419 public int getTagEndLine()
420 {
421 return startLine + tagLines.length - 1;
422 }
423
424 public void setTagLine(java.lang.String newTagLine)
425 {
426 tagLine = newTagLine;
427
428 // Note: Incur the overhead of resizing each time (versus
429 // preallocating a larger array), since the average tag
430 // generally doesn't span multiple lines
431 String[] newTagLines = new String[tagLines.length + 1];
432 for (int i = 0; i < tagLines.length; i++)
433 newTagLines[i] = tagLines[i];
434 newTagLines[tagLines.length] = newTagLine;
435 tagLines = newTagLines;
436 }
437
438 public void setText(String text)
439 {
440 tagContents = new StringBuffer(text);
441 }
442
443 public void setThisScanner(TagScanner scanner)
444 {
445 thisScanner = scanner;
446 }
447
448 public String toPlainTextString()
449 {
450 return EMPTY_STRING;
451 }
452
453 /**
454 * A call to a tag's toHTML() method will render it in HTML
455 * Most tags that do not have children and inherit from Tag,
456 * do not need to override toHTML().
457 * @see org.htmlparser.Node#toHTML()
458 */
459 public String toHtml()
460 {
461 StringBuffer sb = new StringBuffer();
462 sb.append("<");
463 sb.append(getTagName());
464 if (containsMoreThanOneKey())
465 sb.append(" ");
466 String key, value;
467 String empty = null;
468 int i = 0;
469 for (Enumeration e = attributes.keys(); e.hasMoreElements();)
470 {
471 key = (String) e.nextElement();
472 i++;
473 if (!key.equals(TAGNAME))
474 {
475 if (key.equals(EMPTYTAG))
476 {
477 empty = "/";
478 }
479 else
480 {
481 value = getAttribute(key);
482 sb.append(key + "=\"" + value + "\"");
483 if (i < attributes.size())
484 sb.append(" ");
485 }
486 }
487 }
488 if (empty != null)
489 sb.append(empty);
490 if (isEmptyXmlTag())
491 sb.append("/");
492 sb.append(">");
493 return sb.toString();
494 }
495
496 private boolean containsMoreThanOneKey()
497 {
498 return attributes.keySet().size() > 1;
499 }
500
501 /**
502 * Print the contents of the tag
503 */
504 public String toString()
505 {
506 return "Begin Tag : "
507 + tagContents
508 + "; begins at : "
509 + elementBegin()
510 + "; ends at : "
511 + elementEnd();
512 }
513
514 /**
515 * Sets the tagParser.
516 * @param tagParser The tagParser to set
517 */
518 public static void setTagParser(TagParser tagParser)
519 {
520 Tag.tagParser = tagParser;
521 }
522
523 /**
524 * Determines if the given tag breaks the flow of text.
525 * @return <code>true</code> if following text would start on a new line,
526 * <code>false</code> otherwise.
527 */
528 public boolean breaksFlow()
529 {
530 return (breakTags.contains(getText().toUpperCase()));
531 }
532
533 /**
534 * This method verifies that the current tag matches the provided
535 * filter. The match is based on the string object and not its contents,
536 * so ensure that you are using static final filter strings provided
537 * in the tag classes.
538 * @see org.htmlparser.Node#collectInto(NodeList, String)
539 */
540 public void collectInto(NodeList collectionList, String filter)
541 {
542 if (thisScanner != null && thisScanner.getFilter().equals(filter))
543 collectionList.add(this);
544 }
545
546 /**
547 * Returns table of attributes in the tag
548 * @return Hashtable
549 * @deprecated This method is deprecated. Use getAttributes() instead.
550 */
551 public Hashtable getParsed()
552 {
553 return attributes;
554 }
555
556 /**
557 * Sometimes, a scanner may need to request a re-evaluation of the
558 * attributes in a tag. This may happen when there is some correction
559 * activity. An example of its usage can be found in ImageTag.
560 * <br>
561 * <B>Note:<B> This is an intensive task, hence call only when
562 * really necessary
563 * @return Hashtable
564 */
565 public Hashtable redoParseAttributes()
566 {
567 return parseAttributes();
568 }
569
570 public void accept(NodeVisitor visitor)
571 {
572 visitor.visitTag(this);
573 }
574
575 public String getType()
576 {
577 return TYPE;
578 }
579
580 /**
581 * Is this an empty xml tag of the form<br>
582 * <tag/>
583 * @return boolean
584 */
585 public boolean isEmptyXmlTag()
586 {
587 return emptyXmlTag;
588 }
589
590 public void setEmptyXmlTag(boolean emptyXmlTag)
591 {
592 this.emptyXmlTag = emptyXmlTag;
593 }
594
595 }