Source code: org/htmlparser/Parser.java
1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/Parser.java,v 1.2 2004/02/10 13:41:10 woolfel Exp $
2 /*
3 * ====================================================================
4 * Copyright 2002-2004 The Apache Software Foundation.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19
20 // The developers of JMeter and Apache are grateful to the developers
21 // of HTMLParser for giving the Apache Software Foundation a non-exclusive
22 // license. The performance benefits of HTMLParser are clear and the
23 // users of JMeter will benefit from the hard work of the HTMLParser
24 // team. For detailed information about HTMLParser, the project is
25 // hosted on SourceForge at http://htmlparser.sourceforge.net/.
26 //
27 // HTMLParser was originally created by Somik Raha in 2000. Since then
28 // a healthy community of users has formed and helped refine the
29 // design so that it is able to tackle the difficult task of parsing
30 // dirty HTML. Derrick Oswald is the current lead developer and was kind
31 // enough to assist JMeter.
32
33 package org.htmlparser;
34 //////////////////
35 // Java Imports //
36 //////////////////
37 import java.io.BufferedInputStream;
38 import java.io.File;
39 import java.io.IOException;
40 import java.io.InputStream;
41 import java.io.InputStreamReader;
42 import java.io.ObjectInputStream;
43 import java.io.ObjectOutputStream;
44 import java.io.Serializable;
45 import java.io.StringReader;
46 import java.io.UnsupportedEncodingException;
47 import java.net.MalformedURLException;
48 import java.net.URLConnection;
49 import java.util.HashMap;
50 import java.util.Hashtable;
51 import java.util.Map;
52
53 import org.htmlparser.parserHelper.ParserHelper;
54 import org.htmlparser.parserHelper.TagParser;
55 import org.htmlparser.scanners.AppletScanner;
56 import org.htmlparser.scanners.BodyScanner;
57 import org.htmlparser.scanners.BulletListScanner;
58 import org.htmlparser.scanners.DivScanner;
59 import org.htmlparser.scanners.DoctypeScanner;
60 import org.htmlparser.scanners.FormScanner;
61 import org.htmlparser.scanners.FrameSetScanner;
62 import org.htmlparser.scanners.HeadScanner;
63 import org.htmlparser.scanners.HtmlScanner;
64 import org.htmlparser.scanners.JspScanner;
65 import org.htmlparser.scanners.LinkScanner;
66 import org.htmlparser.scanners.MetaTagScanner;
67 import org.htmlparser.scanners.ScriptScanner;
68 import org.htmlparser.scanners.StyleScanner;
69 import org.htmlparser.scanners.TableScanner;
70 import org.htmlparser.scanners.TagScanner;
71 import org.htmlparser.scanners.TitleScanner;
72 import org.htmlparser.tags.EndTag;
73 import org.htmlparser.tags.ImageTag;
74 import org.htmlparser.tags.LinkTag;
75 import org.htmlparser.tags.MetaTag;
76 import org.htmlparser.tags.Tag;
77 import org.htmlparser.util.DefaultParserFeedback;
78 import org.htmlparser.util.IteratorImpl;
79 import org.htmlparser.util.NodeIterator;
80 import org.htmlparser.util.NodeList;
81 import org.htmlparser.util.ParserException;
82 import org.htmlparser.util.ParserFeedback;
83 import org.htmlparser.visitors.NodeVisitor;
84
85 /**
86 * This is the class that the user will use, either to get an iterator into
87 * the html page or to directly parse the page and print the results
88 * <BR>
89 * Typical usage of the parser is as follows : <BR>
90 * [1] Create a parser object - passing the URL and a feedback object to the parser<BR>
91 * [2] Register the common scanners. See {@link #registerScanners()} <BR>
92 * You wouldn't do this if you want to configure a custom lightweight parser. In that case,
93 * you would add the scanners of your choice using {@link #addScanner(TagScanner)}<BR>
94 * [3] Enumerate through the elements from the parser object <BR>
95 * It is important to note that the parsing occurs when you enumerate, ON DEMAND. This is a thread-safe way,
96 * and you only get the control back after a particular element is parsed and returned.
97 *
98 * <BR>
99 * Below is some sample code to parse Yahoo.com and print all the tags.
100 * <pre>
101 * Parser parser = new Parser("http://www.yahoo.com",new DefaultParserFeedback());
102 * // In this example, we are registering all the common scanners
103 * parser.registerScanners();
104 * for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
105 * Node node = i.nextNode();
106 * node.print();
107 * }
108 * </pre> Below is some sample code to parse Yahoo.com and print only the text
109 * information. This scanning will run faster, as there are no scanners
110 * registered here.
111 * <pre>
112 * Parser parser = new Parser("http://www.yahoo.com",new DefaultParserFeedback());
113 * // In this example, none of the scanners need to be registered
114 * // as a string node is not a tag to be scanned for.
115 * for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
116 * Node node = i.nextNode();
117 * if (node instanceof StringNode) {
118 * StringNode stringNode =
119 * (StringNode)node;
120 * System.out.println(stringNode.getText());
121 * }
122 * }
123 * </pre>
124 * The above snippet will print out only the text contents in the html document.<br>
125 * Here's another snippet that will only print out the link urls in a document.
126 * This is an example of adding a link scanner.
127 * <pre>
128 * Parser parser = new Parser("http://www.yahoo.com",new DefaultParserFeedback());
129 * parser.addScanner(new LinkScanner("-l"));
130 * for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
131 * Node node = i.nextNode();
132 * if (node instanceof LinkTag) {
133 * LinkTag linkTag = (LinkTag)node;
134 * System.out.println(linkTag.getLink());
135 * }
136 * }
137 * </pre>
138 * @see Parser#elements()
139 */
140 public class Parser implements Serializable
141 {
142 // Please don't change the formatting of the version variables below.
143 // This is done so as to facilitate ant script processing.
144
145 /**
146 * The floating point version number; the whole part is the major version and the fraction the minor version.
147 */
148 public final static double VERSION_NUMBER = 1.3;
149
150 /**
151 * The type of version (build type).
152 */
153 public final static String VERSION_TYPE = "Release Build";
154
155 /**
156 * The date of the version (build date).
157 */
158 public final static String VERSION_DATE = "May 25, 2003";
159
160 /**
161 * The display version, assembled as "[number] ([type] [date])" from the three constants above.
162 */
163 public final static String VERSION_STRING =
164 "" + VERSION_NUMBER + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";
165
166 // End of formatting
167
168 /**
169 * The default charset, used whenever no charset can be determined.
170 * This should be <code>ISO-8859-1</code>,
171 * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1
172 * Another alias is "8859_1".
173 */
174 protected static final String DEFAULT_CHARSET = "ISO-8859-1";
175
176 /**
177 * Parameter name searched for in a Content-Type value to locate the charset directive.
178 */
179 protected static final String CHARSET_STRING = "charset";
180
181 /**
182 * Feedback object; setFeedback() substitutes noFeedback when given null.
183 */
184 protected ParserFeedback feedback;
185
186 /**
187 * The URL or filename to be parsed.
188 */
189 protected String resourceLocn;
190
191 /**
192 * The html reader associated with this parser (transient, so it is not serialized).
193 */
194 protected transient NodeReader reader;
195
196 /**
197 * The scanners to apply at the top level, keyed by each id the scanner reports from getID().
198 */
199 private Map scanners;
200
201 /**
202 * The name of the encoding being used to decode the connection input stream.
203 */
204 protected String character_set;
205
206 /**
207 * The connection supplying the HTML; null when the parser was given a reader directly.
208 */
209 protected transient URLConnection url_conn;
210
211 /**
212 * Buffered stream over the connection input, marked so it can be reset and decoded again.
213 */
214 protected transient BufferedInputStream input;
215
216 /**
217 * A quiet message sink.
218 * Use this for no feedback; it is also the default when a null feedback object is supplied.
219 */
220 public static ParserFeedback noFeedback =
221 new DefaultParserFeedback(DefaultParserFeedback.QUIET);
222
223 /**
224 * A verbose message sink.
225 * Use this for output on <code>System.out</code>; the one-argument constructors use it.
226 */
227 public static ParserFeedback stdout = new DefaultParserFeedback();
228 // Helper object constructed once for each parser instance.
229 private ParserHelper parserHelper = new ParserHelper();
230
231 //
232 // Static methods
233 //
234
235 /**
236 * @param lineSeparator New Line separator to be used
237 */
238 public static void setLineSeparator(String lineSeparator)
239 {
240 Node.setLineSeparator(lineSeparator);
241 }
242
243 /**
244 * Return the version string of this parser.
245 * @return A string of the form:
246 * <pre>
247 * "[floating point number] ([build-type] [build-date])"
248 * </pre>
249 */
250 public static String getVersion()
251 {
252 return (VERSION_STRING);
253 }
254
255 /**
256 * Return the version number of this parser.
257 * @return A floating point number, the whole number part is the major
258 * version, and the fractional part is the minor version.
259 */
260 public static double getVersionNumber()
261 {
262 return (VERSION_NUMBER);
263 }
264
265 //
266 // Constructors
267 //
268
/**
 * Creates a parser with no source attached.
 * The parser is safe to hold but cannot parse anything until a reader or
 * a connection is supplied.
 * @see #setReader(NodeReader)
 * @see #setConnection(URLConnection)
 */
public Parser() {
    // Passing null lets the setters install their defaults
    // (the quiet feedback sink and an empty scanner map).
    setFeedback(null);
    setScanners(null);

    // Nothing to read from yet.
    resourceLocn = null;
    reader = null;
    url_conn = null;
    input = null;
    character_set = DEFAULT_CHARSET;

    Tag.setTagParser(new TagParser(getFeedback()));
}
287
/**
 * Creates a parser that draws characters from a caller-supplied reader.
 * This is convenient for test cases backed by string buffers, and for any
 * other reader of the user's choice.
 * <p>
 * <B>Important:</B> to parse more than once with a parser built this way
 * (several calls to {@link #elements()}):
 * <ul>
 * <li>before the first parse, mark the reader for the length you expect
 * (the size of the stream);</li>
 * <li>before every later call to elements(), call
 * <pre>
 * parser.getReader().reset();
 * </pre>
 * </li>
 * </ul>
 * @param rd The reader to draw characters from. A <em>null</em> reader
 * leaves the parser without a source.
 * @param fb The object to use when information, warning and error
 * messages are produced. If <em>null</em> no feedback is provided.
 */
public Parser(NodeReader rd, ParserFeedback fb) {
    setFeedback(fb);
    setScanners(null);

    // Start from a clean slate before binding the reader.
    resourceLocn = null;
    reader = null;
    url_conn = null;
    input = null;
    character_set = DEFAULT_CHARSET;

    setReader(rd);
    Tag.setTagParser(new TagParser(feedback));
}
319
/**
 * Creates a parser for custom HTTP access over a caller-supplied connection.
 * @param connection A fully conditioned connection. The connect()
 * method will be called so it need not be connected yet.
 * @param fb The object to use for message communication. If <em>null</em>
 * no feedback is provided.
 * @exception ParserException if {@link #setConnection(URLConnection)} fails.
 */
public Parser(URLConnection connection, ParserFeedback fb) throws ParserException {
    setFeedback(fb);
    setScanners(null);

    // Start from a clean slate before opening the connection.
    resourceLocn = null;
    reader = null;
    url_conn = null;
    input = null;
    character_set = DEFAULT_CHARSET;

    // The tag parser is installed first; setConnection() may throw.
    Tag.setTagParser(new TagParser(feedback));
    setConnection(connection);
}
339
/**
 * Creates a parser for the resource at the given location (URL or file).
 * The connection is obtained from <code>ParserHelper.openConnection</code>
 * and handed to {@link #Parser(URLConnection,ParserFeedback)}.
 * @param resourceLocn Either the URL or the filename.
 * @param feedback The ParserFeedback object to use when information,
 * warning and error messages are produced. If <em>null</em> no feedback
 * is provided.
 * @exception ParserException if the connection cannot be opened or set.
 * @see #Parser(URLConnection,ParserFeedback)
 */
public Parser(String resourceLocn, ParserFeedback feedback) throws ParserException {
    this(ParserHelper.openConnection(resourceLocn, feedback), feedback);
}
355
/**
 * Creates a parser for the resource at the given location (URL or file),
 * reporting through the verbose {@link #stdout} feedback object.
 * @param resourceLocn Either the URL or the filename.
 * @exception ParserException if the connection cannot be opened or set.
 */
public Parser(String resourceLocn) throws ParserException {
    this(resourceLocn, stdout);
}
365
/**
 * Creates a parser on a caller-supplied reader, reporting through the
 * verbose {@link #stdout} feedback object.
 * <p>
 * <B>Important:</B> to parse more than once with a parser built this way
 * (several calls to {@link #elements()}):
 * <ul>
 * <li>before the first parse, mark the reader for the length you expect
 * (the size of the stream);</li>
 * <li>before every later call to elements(), call
 * <pre>
 * parser.getReader().reset();
 * </pre>
 * </li>
 * </ul>
 * @param reader The source for HTML to be parsed.
 */
public Parser(NodeReader reader) {
    this(reader, stdout);
}
385
/**
 * Creates a parser for non-standard access over a caller-supplied
 * connection, reporting through the verbose {@link #stdout} feedback object.
 * @param connection A fully conditioned connection. The connect()
 * method will be called so it need not be connected yet.
 * @exception ParserException if the connection cannot be set.
 * @see #Parser(URLConnection,ParserFeedback)
 */
public Parser(URLConnection connection) throws ParserException {
    this(connection, stdout);
}
397
398 //
399 // Serialization support
400 //
401
402 private void writeObject(ObjectOutputStream out) throws IOException
403 {
404 if ((null == getConnection()) || /*redundant*/
405 (null == getURL()))
406 if (null != getReader());
407 // commented out by Somik - why are we not allowed to serialize parsers without url
408 // throw new IOException ("can only serialize parsers with a URL");
409 out.defaultWriteObject();
410 }
411
412 private void readObject(ObjectInputStream in)
413 throws IOException, ClassNotFoundException
414 {
415 in.defaultReadObject();
416 try
417 {
418 // reopen the connection and create a reader which are transient fields
419 setURL(getURL());
420 }
421 catch (ParserException hpe)
422 {
423 throw new IOException(hpe.toString());
424 }
425 }
426
427 //
428 // Bean patterns
429 //
430
431 /**
432 * Set the connection for this parser.
433 * This method sets four of the fields in the parser object;
434 * <code>resourceLocn</code>, <code>url_conn</code>, <code>character_set</code>
435 * and <code>reader</code>. It does not adjust the <code>scanners</code> list
436 * or <code>feedback</code> object. The four fields are set atomicly by
437 * this method, either they are all set or none of them is set. Trying to
438 * set the connection to null is a noop.
439 * @param connection A fully conditioned connection. The connect()
440 * method will be called so it need not be connected yet.
441 * @exception ParserException if the character set specified in the
442 * HTTP header is not supported, or an i/o exception occurs creating the
443 * reader.
444 */
445 public void setConnection(URLConnection connection) throws ParserException
446 {
447 String res;
448 NodeReader rd;
449 String chs;
450 URLConnection con;
451
452 if (null != connection)
453 {
454 res = getURL();
455 rd = getReader();
456 chs = getEncoding();
457 con = getConnection();
458 try
459 {
460 resourceLocn = connection.getURL().toExternalForm();
461 url_conn = connection;
462 url_conn.connect();
463 character_set = getCharacterSet(url_conn);
464 createReader();
465 }
466 catch (IOException ioe)
467 {
468 String msg =
469 "setConnection() : Error in opening a connection to "
470 + connection.getURL().toExternalForm();
471 ParserException ex = new ParserException(msg, ioe);
472 feedback.error(msg, ex);
473 resourceLocn = res;
474 url_conn = con;
475 character_set = chs;
476 reader = rd;
477 throw ex;
478 }
479 }
480 }
481
482 /**
483 * Return the current connection.
484 * @return The connection either created by the parser or passed into this
485 * parser via <code>setConnection</code>.
486 * @see #setConnection(URLConnection)
487 */
488 public URLConnection getConnection()
489 {
490 return (url_conn);
491 }
492
493 /**
494 * Set the URL for this parser.
495 * This method sets four of the fields in the parser object;
496 * <code>resourceLocn</code>, <code>url_conn</code>, <code>character_set</code>
497 * and <code>reader</code>. It does not adjust the <code>scanners</code> list
498 * or <code>feedback</code> object.Trying to set the url to null or an
499 * empty string is a noop.
500 * @see #setConnection(URLConnection)
501 */
502 public void setURL(String url) throws ParserException
503 {
504 if ((null != url) && !"".equals(url))
505 setConnection(ParserHelper.openConnection(url, getFeedback()));
506 }
507
508 /**
509 * Return the current URL being parsed.
510 * @return The url passed into the constructor or the file name
511 * passed to the constructor modified to be a URL.
512 */
513 public String getURL()
514 {
515 return (resourceLocn);
516 }
517
518 /**
519 * Set the encoding for this parser.
520 * If there is no connection (getConnection() returns null) it simply sets
521 * the character set name stored in the parser (Note: the reader object
522 * which must have been set in the constructor or by <code>setReader()</code>,
523 * may or may not be using this character set).
524 * Otherwise (getConnection() doesn't return null) it does this by reopening the
525 * input stream of the connection and creating a reader that uses this
526 * character set. In this case, this method sets two of the fields in the
527 * parser object; <code>character_set</code> and <code>reader</code>.
528 * It does not adjust <code>resourceLocn</code>, <code>url_conn</code>,
529 * <code>scanners</code> or <code>feedback</code>. The two fields are set
530 * atomicly by this method, either they are both set or none of them is set.
531 * Trying to set the encoding to null or an empty string is a noop.
532 * @exception ParserException If the opening of the reader
533 */
534 public void setEncoding(String encoding) throws ParserException
535 {
536 String chs;
537 NodeReader rd;
538 BufferedInputStream in;
539
540 if ((null != encoding) && !"".equals(encoding))
541 if (null == getConnection())
542 character_set = encoding;
543 else
544 {
545 rd = getReader();
546 chs = getEncoding();
547 in = input;
548 try
549 {
550 character_set = encoding;
551 recreateReader();
552 }
553 catch (IOException ioe)
554 {
555 String msg =
556 "setEncoding() : Error in opening a connection to "
557 + getConnection().getURL().toExternalForm();
558 ParserException ex = new ParserException(msg, ioe);
559 feedback.error(msg, ex);
560 character_set = chs;
561 reader = rd;
562 input = in;
563 throw ex;
564 }
565 }
566 }
567
568 /**
569 * The current encoding.
570 * This item is et from the HTTP header but may be overridden by meta
571 * tags in the head, so this may change after the head has been parsed.
572 */
573 public String getEncoding()
574 {
575 return (character_set);
576 }
577
578 /**
579 * Set the reader for this parser.
580 * This method sets four of the fields in the parser object;
581 * <code>resourceLocn</code>, <code>url_conn</code>, <code>character_set</code>
582 * and <code>reader</code>. It does not adjust the <code>scanners</code> list
583 * or <code>feedback</code> object. The <code>url_conn</code> is set to
584 * null since this cannot be determined from the reader. The
585 * <code>character_set</code> is set to the default character set since
586 * this cannot be determined from the reader.
587 * Trying to set the reader to <code>null</code> is a noop.
588 * @param rd The reader object to use. This reader will be bound to this
589 * parser after this call.
590 */
591 public void setReader(NodeReader rd)
592 {
593 if (null != rd)
594 {
595 resourceLocn = rd.getURL();
596 reader = rd;
597 character_set = DEFAULT_CHARSET;
598 url_conn = null;
599 reader.setParser(this);
600 }
601 }
602
603 /**
604 * Returns the reader associated with the parser
605 * @return NodeReader
606 */
607 public NodeReader getReader()
608 {
609 return reader;
610 }
611
612 /**
613 * Get the number of scanners registered currently in the scanner.
614 * @return int number of scanners registered
615 */
616 public int getNumScanners()
617 {
618 return scanners.size();
619 }
620
621 /**
622 * This method is to be used to change the set of scanners in the current parser.
623 * @param newScanners Vector holding scanner objects to be used during the parsing process.
624 */
625 public void setScanners(Map newScanners)
626 {
627 scanners = (null == newScanners) ? new HashMap() : newScanners;
628 }
629
630 /**
631 * Get an enumeration of scanners registered currently in the parser
632 * @return Enumeration of scanners currently registered in the parser
633 */
634 public Map getScanners()
635 {
636 return scanners;
637 }
638
639 /**
640 * Sets the feedback object used in scanning.
641 * @param fb The new feedback object to use.
642 */
643 public void setFeedback(ParserFeedback fb)
644 {
645 feedback = (null == fb) ? noFeedback : fb;
646 }
647
648 /**
649 * Returns the feedback.
650 * @return HTMLParserFeedback
651 */
652 public ParserFeedback getFeedback()
653 {
654 return feedback;
655 }
656
657 //
658 // Internal methods
659 //
660
661 /**
662 * Open a stream reader on the <code>InputStream</code>.
663 * Revise the character set to it's default value if an
664 * <code>UnsupportedEncodingException</code> is thrown.
665 * @exception UnsupportedEncodingException in the unlikely event that
666 * the default character set is not supported on this platform.
667 */
668 protected InputStreamReader createInputStreamReader()
669 throws UnsupportedEncodingException
670 {
671 InputStreamReader ret;
672
673 try
674 {
675 ret = new InputStreamReader(input, character_set);
676 }
677 catch (UnsupportedEncodingException uee)
678 {
679 StringBuffer msg;
680 String message;
681
682 msg = new StringBuffer(1024);
683 msg.append(url_conn.getURL().toExternalForm());
684 msg.append(" has an encoding (");
685 msg.append(character_set);
686 msg.append(") which is not supported, using ");
687 msg.append(DEFAULT_CHARSET);
688 message = msg.toString();
689 feedback.warning(message);
690 character_set = DEFAULT_CHARSET;
691 ret = new InputStreamReader(input, character_set);
692 }
693
694 return (ret);
695 }
696
697 /**
698 * Create a new reader for the URLConnection object.
699 * The current character set is used to transform the input stream
700 * into a character reader.
701 * @exception IOException if there is a problem constructing the reader.
702 * @see #createInputStreamReader()
703 * @see #getEncoding()
704 */
705 protected void createReader() throws IOException
706 {
707 InputStream stream;
708 InputStreamReader in;
709
710 stream = url_conn.getInputStream();
711 input = new BufferedInputStream(stream);
712 input.mark(Integer.MAX_VALUE);
713 in = createInputStreamReader();
714 reader = new NodeReader(in, resourceLocn);
715 reader.setParser(this);
716 }
717
718 /**
719 * Create a new reader for the URLConnection object but reuse the input stream.
720 * The current character set is used to transform the input stream
721 * into a character reader. Defaults to <code>createReader()</code> if
722 * there is no existing input stream.
723 * @exception IOException if there is a problem constructing the reader.
724 * @see #createReader()
725 * @see #createInputStreamReader()
726 * @see #getEncoding()
727 */
728 protected void recreateReader() throws IOException
729 {
730 InputStreamReader in;
731
732 if (null == input)
733 createReader();
734 else
735 {
736 input.reset();
737 input.mark(Integer.MAX_VALUE);
738 in = createInputStreamReader();
739 reader = new NodeReader(in, resourceLocn);
740 reader.setParser(this);
741 }
742 }
743
744 /**
745 * Try and extract the character set from the HTTP header.
746 * @param connection The connection with the charset info.
747 * @return The character set name to use for this HTML page.
748 */
749 protected String getCharacterSet(URLConnection connection)
750 {
751 final String field = "Content-Type";
752
753 String string;
754 String ret;
755
756 ret = DEFAULT_CHARSET;
757 string = connection.getHeaderField(field);
758 if (null != string)
759 ret = getCharset(string);
760
761 return (ret);
762 }
763
764 /**
765 * Get a CharacterSet name corresponding to a charset parameter.
766 * @param content A text line of the form:
767 * <pre>
768 * text/html; charset=Shift_JIS
769 * </pre>
770 * which is applicable both to the HTTP header field Content-Type and
771 * the meta tag http-equiv="Content-Type".
772 * Note this method also handles non-compliant quoted charset directives such as:
773 * <pre>
774 * text/html; charset="UTF-8"
775 * </pre>
776 * and
777 * <pre>
778 * text/html; charset='UTF-8'
779 * </pre>
780 * @return The character set name to use when reading the input stream.
781 * For JDKs that have the Charset class this is qualified by passing
782 * the name to findCharset() to render it into canonical form.
783 * If the charset parameter is not found in the given string, the default
784 * character set is returned.
785 * @see ParserHelper#findCharset
786 * @see #DEFAULT_CHARSET
787 */
788 protected String getCharset(String content)
789 {
790 int index;
791 String ret;
792
793 ret = DEFAULT_CHARSET;
794 if (null != content)
795 {
796 index = content.indexOf(CHARSET_STRING);
797
798 if (index != -1)
799 {
800 content =
801 content.substring(index + CHARSET_STRING.length()).trim();
802 if (content.startsWith("="))
803 {
804 content = content.substring(1).trim();
805 index = content.indexOf(";");
806 if (index != -1)
807 content = content.substring(0, index);
808
809 //remove any double quotes from around charset string
810 if (content.startsWith("\"")
811 && content.endsWith("\"")
812 && (1 < content.length()))
813 content = content.substring(1, content.length() - 1);
814
815 //remove any single quote from around charset string
816 if (content.startsWith("'")
817 && content.endsWith("'")
818 && (1 < content.length()))
819 content = content.substring(1, content.length() - 1);
820
821 ret = ParserHelper.findCharset(content, ret);
822 // Charset names are not case-sensitive;
823 // that is, case is always ignored when comparing charset names.
824 if (!ret.equalsIgnoreCase(content))
825 {
826 feedback.info(
827 "detected charset \""
828 + content
829 + "\", using \""
830 + ret
831 + "\"");
832 }
833 }
834 }
835 }
836
837 return (ret);
838 }
839
840 //
841 // Public methods
842 //
843
844 /**
845 * Add a new Tag Scanner.
846 * In typical situations where you require a no-frills parser, use the registerScanners() method to add the most
847 * common parsers. But when you wish to either compose a parser with only certain scanners registered, use this method.
848 * It is advantageous to register only the scanners you want, in order to achieve faster parsing speed. This method
849 * would also be of use when you have developed custom scanners, and need to register them into the parser.
850 * @param scanner TagScanner object (or derivative) to be added to the list of registered scanners
851 */
852 public void addScanner(TagScanner scanner)
853 {
854 String ids[] = scanner.getID();
855 for (int i = 0; i < ids.length; i++)
856 {
857 scanners.put(ids[i], scanner);
858 }
859 scanner.setFeedback(feedback);
860 }
861
862 /**
863 * Returns an iterator (enumeration) to the html nodes. Each node can be a tag/endtag/
864 * string/link/image<br>
865 * This is perhaps the most important method of this class. In typical situations, you will need to use
866 * the parser like this :
867 * <pre>
868 * Parser parser = new Parser("http://www.yahoo.com");
869 * parser.registerScanners();
870 * for (NodeIterator i = parser.elements();i.hasMoreElements();) {
871 * Node node = i.nextHTMLNode();
872 * if (node instanceof StringNode) {
873 * // Downcasting to StringNode
874 * StringNode stringNode = (StringNode)node;
875 * // Do whatever processing you want with the string node
876 * System.out.println(stringNode.getText());
877 * }
878 * // Check for the node or tag that you want
879 * if (node instanceof ...) {
880 * // Downcast, and process
881 * }
882 * }
883 * </pre>
884 */
885 public NodeIterator elements() throws ParserException
886 {
887 boolean remove_scanner;
888 Node node;
889 MetaTag meta;
890 String httpEquiv;
891 String charset;
892 boolean restart;
893 EndTag end;
894 IteratorImpl ret;
895
896 remove_scanner = false;
897 restart = false;
898 ret = new IteratorImpl(reader, resourceLocn, feedback);
899 ret = createIteratorImpl(remove_scanner, ret);
900
901 return (ret);
902 }
903
904 public IteratorImpl createIteratorImpl(
905 boolean remove_scanner,
906 IteratorImpl ret)
907 throws ParserException
908 {
909 Node node;
910 MetaTag meta;
911 String httpEquiv;
912 String charset;
913 EndTag end;
914 if (null != url_conn)
915 try
916 {
917 if (null == scanners.get("-m"))
918 {
919 addScanner(new MetaTagScanner("-m"));
920 remove_scanner = true;
921 }
922
923 /* pre-read up to </HEAD> looking for charset directive */
924 while (null != (node = ret.peek()))
925 {
926 if (node instanceof MetaTag)
927 { // check for charset on Content-Type
928 meta = (MetaTag) node;
929 httpEquiv = meta.getAttribute("HTTP-EQUIV");
930 if ("Content-Type".equalsIgnoreCase(httpEquiv))
931 {
932 charset = getCharset(meta.getAttribute("CONTENT"));
933 if (!charset.equalsIgnoreCase(character_set))
934 { // oops, different character set, restart
935 character_set = charset;
936 recreateReader();
937 ret =
938 new IteratorImpl(
939 reader,
940 resourceLocn,
941 feedback);
942 }
943 // once we see the Content-Type meta tag we're finished the pre-read
944 break;
945 }
946 }
947 else if (node instanceof EndTag)
948 {
949 end = (EndTag) node;
950 if (end.getTagName().equalsIgnoreCase("HEAD"))
951 // or, once we see the </HEAD> tag we're finished the pre-read
952 break;
953 }
954 }
955 }
956 catch (UnsupportedEncodingException uee)
957 {
958 String msg =
959 "elements() : The content of "
960 + url_conn.getURL().toExternalForm()
961 + " has an encoding which is not supported";
962 ParserException ex = new ParserException(msg, uee);
963 feedback.error(msg, ex);
964 throw ex;
965 }
966 catch (IOException ioe)
967 {
968 String msg =
969 "elements() : Error in opening a connection to "
970 + url_conn.getURL().toExternalForm();
971 ParserException ex = new ParserException(msg, ioe);
972 feedback.error(msg, ex);
973 throw ex;
974 }
975 finally
976 {
977 if (remove_scanner)
978 scanners.remove("-m");
979 }
980 return ret;
981 }
982
983 /**
984 * Flush the current scanners registered. The registered scanners list becomes empty with this call.
985 */
986 public void flushScanners()
987 {
988 scanners = new Hashtable();
989 }
990
991 /**
992 * Return the scanner registered in the parser having the
993 * given id
994 * @param id The id of the requested scanner
995 * @return TagScanner The Tag Scanner
996 */
997 public TagScanner getScanner(String id)
998 {
999 return (TagScanner) scanners.get(id);
1000 }
1001
1002 /**
1003 * Parse the given resource, using the filter provided
1004 */
1005 public void parse(String filter) throws Exception
1006 {
1007 Node node;
1008 for (NodeIterator e = elements(); e.hasMoreNodes();)
1009 {
1010 node = e.nextNode();
1011 if (node != null)
1012 {
1013 if (filter == null)
1014 System.out.println(node.toString());
1015 else
1016 {
1017 // There is a filter. Find if the associated filter of this node
1018 // matches the specified filter
1019 if (!(node instanceof Tag))
1020 continue;
1021 Tag tag = (Tag) node;
1022 TagScanner scanner = tag.getThisScanner();
1023 if (scanner == null)
1024 continue;
1025
1026 String tagFilter = scanner.getFilter();
1027 if (tagFilter == null)
1028 continue;
1029 if (tagFilter.equals(filter))
1030 System.out.println(node.toString());
1031 }
1032 }
1033 else
1034 System.out.println("Node is null");
1035 }
1036
1037 }
1038
1039 /**
1040 * This method should be invoked in order to register some common scanners. The scanners that get added are : <br>
1041 * LinkScanner (filter key "-l")<br>
1042 * HTMLImageScanner (filter key "-i")<br>
1043 * HTMLScriptScanner (filter key "-s") <br>
1044 * HTMLStyleScanner (filter key "-t") <br>
1045 * HTMLJspScanner (filter key "-j") <br>
1046 * HTMLAppletScanner (filter key "-a") <br>
1047 * HTMLMetaTagScanner (filter key "-m") <br>
1048 * HTMLTitleScanner (filter key "-t") <br>
1049 * HTMLDoctypeScanner (filter key "-d") <br>
1050 * HTMLFormScanner (filter key "-f") <br>
1051 * HTMLFrameSetScanner(filter key "-r") <br>
1052 * HTMLBaseHREFScanner(filter key "-b") <br>
1053 * <br>
1054 * Call this method after creating the Parser object. e.g. <BR>
1055 * <pre>
1056 * Parser parser = new Parser("http://www.yahoo.com");
1057 * parser.registerScanners();
1058 * </pre>
1059 */
1060 public void registerScanners()
1061 {
1062 if (scanners.size() > 0)
1063 {
1064 System.err.println(
1065 "registerScanners() should be called first, when no other scanner has been registered.");
1066 System.err.println(
1067 "Other scanners already exist, hence this method call wont have any effect");
1068 return;
1069 }
1070 LinkScanner linkScanner = new LinkScanner(LinkTag.LINK_TAG_FILTER);
1071 // Note - The BaseHREF and Image scanners share the same
1072 // link processor - internally linked up with the factory
1073 // method in the link scanner class
1074 addScanner(linkScanner);
1075 addScanner(linkScanner.createImageScanner(ImageTag.IMAGE_TAG_FILTER));
1076 addScanner(new ScriptScanner("-s"));
1077 addScanner(new StyleScanner("-t"));
1078 addScanner(new JspScanner("-j"));
1079 addScanner(new AppletScanner("-a"));
1080 addScanner(new MetaTagScanner("-m"));
1081 addScanner(new TitleScanner("-T"));
1082 addScanner(new DoctypeScanner("-d"));
1083 addScanner(new FormScanner("-f", this));
1084 addScanner(new FrameSetScanner("-r"));
1085 addScanner(linkScanner.createBaseHREFScanner("-b"));
1086 addScanner(new BulletListScanner("-bulletList", this));
1087 // addScanner(new SpanScanner("-p"));
1088 addScanner(new DivScanner("-div"));
1089 addScanner(new TableScanner(this));
1090 }
1091
1092 /**
1093 * Make a call to registerDomScanners(), instead of registerScanners(),
1094 * when you are interested in retrieving a Dom representation of the html
1095 * page. Upon parsing, you will receive an Html object - which will contain
1096 * children, one of which would be the body. This is still evolving, and in
1097 * future releases, you might see consolidation of Html - to provide you
1098 * with methods to access the body and the head.
1099 */
1100 public void registerDomScanners()
1101 {
1102 registerScanners();
1103 addScanner(new HtmlScanner());
1104 addScanner(new BodyScanner());
1105 addScanner(new HeadScanner());
1106 }
1107
1108 /**
1109 * Removes a specified scanner object. You can create
1110 * an anonymous object as a parameter. This method
1111 * will use the scanner's key and remove it from the
1112 * registry of scanners.
1113 * e.g.
1114 * <pre>
1115 * removeScanner(new FormScanner(""));
1116 * </pre>
1117 * @param scanner TagScanner object to be removed from the list of registered scanners
1118 */
1119 public void removeScanner(TagScanner scanner)
1120 {
1121 scanners.remove(scanner.getID()[0]);
1122 }
1123
1124 /**
1125 * The main program, which can be executed from the command line
1126 */
1127 public static void main(String[] args)
1128 {
1129 System.out.println("HTMLParser v" + VERSION_STRING);
1130 if (args.length < 1 || args[0].equals("-help"))
1131 {
1132 System.out.println();
1133 System.out.println(
1134 "Syntax : java -jar htmlparser.jar <resourceLocn/website> -l");
1135 System.out.println(
1136 " <resourceLocn> the name of the file to be parsed (with complete path if not in current directory)");
1137 System.out.println(
1138 " -l Show only the link tags extracted from the document");
1139 System.out.println(
1140 " -i Show only the image tags extracted from the document");
1141 System.out.println(
1142 " -s Show only the Javascript code extracted from the document");
1143 System.out.println(
1144 " -t Show only the Style code extracted from the document");
1145 System.out.println(
1146 " -a Show only the Applet tag extracted from the document");
1147 System.out.println(" -j Parse JSP tags");
1148 System.out.println(" -m Parse Meta tags");
1149 System.out.println(" -T Extract the Title");
1150 System.out.println(" -f Extract forms");
1151 System.out.println(" -r Extract frameset");
1152 System.out.println(" -help This screen");
1153 System.out.println();
1154 System.out.println(
1155 "HTML Parser home page : http://htmlparser.sourceforge.net");
1156 System.out.println();
1157 System.out.println(
1158 "Example : java -jar htmlparser.jar http://www.yahoo.com");
1159 System.out.println();
1160 System.out.println(
1161 "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
1162 System.exit(-1);
1163 }
1164 try
1165 {
1166 if (args[0].indexOf("http") < 0)
1167 {
1168 File input = new File(args[0]);
1169 try
1170 {
1171 args[0] = input.toURL().toString();
1172 System.out.println("file converted to URL: " + args[0]);
1173 }
1174 catch (MalformedURLException e)
1175 {
1176 e.printStackTrace();
1177 }
1178 }
1179 Parser parser = new Parser(args[0]);
1180 System.out.println("Parsing " + parser.getURL());
1181 parser.registerScanners();
1182 try
1183 {
1184 long start = System.currentTimeMillis();
1185 if (args.length == 2)
1186 {
1187 parser.parse(args[1]);
1188 }
1189 else
1190 parser.parse(null);
1191 System.out.println(
1192 "Elapsed Time ms: " + (System.currentTimeMillis() - start));
1193 }
1194 catch (Exception e)
1195 {
1196 e.printStackTrace();
1197 }
1198 }
1199 catch (ParserException e)
1200 {
1201 e.printStackTrace();
1202 }
1203 }
1204
1205 public void visitAllNodesWith(NodeVisitor visitor) throws ParserException
1206 {
1207 Node node;
1208 for (NodeIterator e = elements(); e.hasMoreNodes();)
1209 {
1210 node = e.nextNode();
1211 node.accept(visitor);
1212 }
1213 visitor.finishedParsing();
1214 }
1215
1216 /** Initializes the parser with the given input HTML String.
1217 * @param inputHTML the input HTML that is to be parsed.
1218 */
1219 public void setInputHTML(String inputHTML)
1220 {
1221 if ("".equals(inputHTML))
1222 {
1223 reader = new NodeReader(new StringReader(inputHTML), "");
1224 }
1225 }
1226
1227 public Node[] extractAllNodesThatAre(Class nodeType) throws ParserException
1228 {
1229 NodeList nodeList = new NodeList();
1230 for (NodeIterator e = elements(); e.hasMoreNodes();)
1231 {
1232 e.nextNode().collectInto(nodeList, nodeType);
1233 }
1234 return nodeList.toNodeArray();
1235 }
1236
1237 /**
1238 * Creates the parser on an input string.
1239 * @param inputHTML
1240 * @return Parser
1241 */
1242 public static Parser createParser(String inputHTML)
1243 {
1244 NodeReader reader = new NodeReader(new StringReader(inputHTML), "");
1245 return new Parser(reader);
1246 }
1247
1248 public static Parser createLinkRecognizingParser(String inputHTML)
1249 {
1250 Parser parser = createParser(inputHTML);
1251 parser.addScanner(new LinkScanner(LinkTag.LINK_TAG_FILTER));
1252 return parser;
1253 }
1254}