Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/htmlparser/Parser.java


1   // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/Parser.java,v 1.2 2004/02/10 13:41:10 woolfel Exp $
2   /*
3    * ====================================================================
4    * Copyright 2002-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   * 
18   */
19  
20  // The developers of JMeter and Apache are grateful to the developers
21  // of HTMLParser for giving Apache Software Foundation a non-exclusive
22  // license. The performance benefits of HTMLParser are clear and the
23  // users of JMeter will benefit from the hard work of the HTMLParser
24  // team. For detailed information about HTMLParser, the project is
25  // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26  //
27  // HTMLParser was originally created by Somik Raha in 2000. Since then
28  // a healthy community of users has formed and helped refine the
29  // design so that it is able to tackle the difficult task of parsing
30  // dirty HTML. Derrick Oswald is the current lead developer and was kind
31  // enough to assist JMeter.
32  
33  package org.htmlparser;
34  //////////////////
35  // Java Imports //
36  //////////////////
37  import java.io.BufferedInputStream;
38  import java.io.File;
39  import java.io.IOException;
40  import java.io.InputStream;
41  import java.io.InputStreamReader;
42  import java.io.ObjectInputStream;
43  import java.io.ObjectOutputStream;
44  import java.io.Serializable;
45  import java.io.StringReader;
46  import java.io.UnsupportedEncodingException;
47  import java.net.MalformedURLException;
48  import java.net.URLConnection;
49  import java.util.HashMap;
50  import java.util.Hashtable;
51  import java.util.Map;
52  
53  import org.htmlparser.parserHelper.ParserHelper;
54  import org.htmlparser.parserHelper.TagParser;
55  import org.htmlparser.scanners.AppletScanner;
56  import org.htmlparser.scanners.BodyScanner;
57  import org.htmlparser.scanners.BulletListScanner;
58  import org.htmlparser.scanners.DivScanner;
59  import org.htmlparser.scanners.DoctypeScanner;
60  import org.htmlparser.scanners.FormScanner;
61  import org.htmlparser.scanners.FrameSetScanner;
62  import org.htmlparser.scanners.HeadScanner;
63  import org.htmlparser.scanners.HtmlScanner;
64  import org.htmlparser.scanners.JspScanner;
65  import org.htmlparser.scanners.LinkScanner;
66  import org.htmlparser.scanners.MetaTagScanner;
67  import org.htmlparser.scanners.ScriptScanner;
68  import org.htmlparser.scanners.StyleScanner;
69  import org.htmlparser.scanners.TableScanner;
70  import org.htmlparser.scanners.TagScanner;
71  import org.htmlparser.scanners.TitleScanner;
72  import org.htmlparser.tags.EndTag;
73  import org.htmlparser.tags.ImageTag;
74  import org.htmlparser.tags.LinkTag;
75  import org.htmlparser.tags.MetaTag;
76  import org.htmlparser.tags.Tag;
77  import org.htmlparser.util.DefaultParserFeedback;
78  import org.htmlparser.util.IteratorImpl;
79  import org.htmlparser.util.NodeIterator;
80  import org.htmlparser.util.NodeList;
81  import org.htmlparser.util.ParserException;
82  import org.htmlparser.util.ParserFeedback;
83  import org.htmlparser.visitors.NodeVisitor;
84  
85  /**
86   * This is the class that the user will use, either to get an iterator into 
87   * the html page or to directly parse the page and print the results
88   * <BR>
89   * Typical usage of the parser is as follows : <BR>
90   * [1] Create a parser object - passing the URL and a feedback object to the parser<BR>
91   * [2] Register the common scanners. See {@link #registerScanners()} <BR>
92   * You wouldn't do this if you want to configure a custom lightweight parser. In that case, 
93   * you would add the scanners of your choice using {@link #addScanner(TagScanner)}<BR>
94   * [3] Enumerate through the elements from the parser object <BR>
95   * It is important to note that the parsing occurs when you enumerate, ON DEMAND. This is a thread-safe way, 
96   * and you only get the control back after a particular element is parsed and returned.
97   * 
98   * <BR>
99   * Below is some sample code to parse Yahoo.com and print all the tags.
100  * <pre>
101  * Parser parser = new Parser("http://www.yahoo.com",new DefaultParserFeedback());
102  * // In this example, we are registering all the common scanners
103  * parser.registerScanners(); 
104  * for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
105  *   Node node = i.nextNode();
106  *  node.print();
107  * }
108  * </pre> Below is some sample code to parse Yahoo.com and print only the text
109  * information. This scanning will run faster, as there are no scanners
110  * registered here.
111  * <pre>
112  * Parser parser = new Parser("http://www.yahoo.com",new DefaultParserFeedback());
113  * // In this example, none of the scanners need to be registered
114  * // as a string node is not a tag to be scanned for.
115  * for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
116  *  Node node = i.nextNode();
117  *  if (node instanceof StringNode) {          
118  *     StringNode stringNode =
119  *    (StringNode)node;        
120  *     System.out.println(stringNode.getText());    
121  *   } 
122  * }
123  * </pre>
124  * The above snippet will print out only the text contents in the html document.<br>
125  * Here's another snippet that will only print out the link urls in a document. 
126  * This is an example of adding a link scanner.
127  * <pre>
128  * Parser parser = new Parser("http://www.yahoo.com",new DefaultParserFeedback());
129  * parser.addScanner(new LinkScanner("-l"));
130  * for (NodeIterator i = parser.elements();i.hasMoreNodes();) {
131  *   Node node = i.nextNode();    
132  *   if (node instanceof LinkTag) {
133  *     LinkTag linkTag = (LinkTag)node;        
134  *     System.out.println(linkTag.getLink());    
135  *   } 
136  * }
137  * </pre>
138  *  @see Parser#elements() 
139  */
140 public class Parser implements Serializable
141 {
142     // Please don't change the formatting of the version variables below:
143     // an ant script processes these declarations, so keep their layout intact.
144 
145     /**
146      * The floating point version number: whole part major, fractional part minor.
147      */
148     public final static double VERSION_NUMBER = 1.3;
149 
150     /**
151      * The type of version (build type); one component of VERSION_STRING.
152      */
153     public final static String VERSION_TYPE = "Release Build";
154 
155     /**
156      * The date of the version (build date); one component of VERSION_STRING.
157      */
158     public final static String VERSION_DATE = "May 25, 2003";
159 
160     /**
161      * The display version, "[number] ([type] [date])"; returned by getVersion().
162      */
163     public final static String VERSION_STRING =
164         "" + VERSION_NUMBER + " (" + VERSION_TYPE + " " + VERSION_DATE + ")";
165 
166     // End of formatting
167 
168     /**
169      * The default charset, used whenever no usable charset can be determined.
170      * This should be <code>ISO-8859-1</code>,
171      * see RFC 2616 (http://www.ietf.org/rfc/rfc2616.txt?number=2616) section 3.7.1
172      * Another alias is "8859_1".
173      */
174     protected static final String DEFAULT_CHARSET = "ISO-8859-1";
175 
176     /**
177      *  Trigger for charset detection: the parameter name getCharset() searches for.
178      */
179     protected static final String CHARSET_STRING = "charset";
180 
181     /**
182      * Feedback object; setFeedback() substitutes noFeedback when given null.
183      */
184     protected ParserFeedback feedback;
185 
186     /**
187      * The URL or filename to be parsed.
188      */
189     protected String resourceLocn;
190 
191     /** 
192      * The html reader associated with this parser; transient, readObject() rebuilds it via setURL().
193      */
194     protected transient NodeReader reader;
195 
196     /**
197      * The scanners to apply at the top level, stored under each ID from TagScanner.getID().
198      */
199     private Map scanners;
200 
201     /**
202      * The encoding being used to decode the connection input stream.
203      */
204     protected String character_set;
205 
206     /**
207      * The source for HTML; transient, null when the parser was given a reader directly.
208      */
209     protected transient URLConnection url_conn;
210 
211     /**
212      * The bytes extracted from the source; marked in createReader() so recreateReader() can reset it.
213      */
214     protected transient BufferedInputStream input;
215 
216     /**
217      * A quiet message sink; installed by setFeedback() when it is passed null.
218      * Use this for no feedback.
219      */
220     public static ParserFeedback noFeedback =
221         new DefaultParserFeedback(DefaultParserFeedback.QUIET);
222 
223     /**
224      * A verbose message sink; the default for constructors that take no feedback argument.
225      * Use this for output on <code>System.out</code>.
226      */
227     public static ParserFeedback stdout = new DefaultParserFeedback();
228 
229     private ParserHelper parserHelper = new ParserHelper(); // NOTE(review): no use of this instance is visible in this part of the file (only static ParserHelper calls) - confirm it is still needed
230 
231     //
232     // Static methods
233     //
234 
235     /**
236      * @param lineSeparator New Line separator to be used
237      */
238     public static void setLineSeparator(String lineSeparator)
239     {
240         Node.setLineSeparator(lineSeparator);
241     }
242 
243     /**
244      * Return the version string of this parser.
245      * @return A string of the form:
246      * <pre>
247      * "[floating point number] ([build-type] [build-date])"
248      * </pre>
249      */
250     public static String getVersion()
251     {
252         return (VERSION_STRING);
253     }
254 
255     /**
256      * Return the version number of this parser.
257      * @return A floating point number, the whole number part is the major
258      * version, and the fractional part is the minor version.
259      */
260     public static double getVersionNumber()
261     {
262         return (VERSION_NUMBER);
263     }
264 
265     //
266     // Constructors
267     //
268 
269     /**
270      * Zero argument constructor.
271      * The parser is in a safe but useless state.
272      * Set the reader or connection using setReader() or setConnection().
273      * @see #setReader(NodeReader)
274      * @see #setConnection(URLConnection)
275      */
276     public Parser()
277     {
278         setFeedback(null);
279         setScanners(null);
280         resourceLocn = null;
281         reader = null;
282         character_set = DEFAULT_CHARSET;
283         url_conn = null;
284         input = null;
285         Tag.setTagParser(new TagParser(getFeedback()));
286     }
287 
288     /**
289      * This constructor enables the construction of test cases, with readers
290      * associated with test string buffers. It can also be used with readers of the user's choice
291      * streaming data into the parser.<p/>
292      * <B>Important:</B> If you are using this constructor, and you would like to use the parser
293      * to parse multiple times (multiple calls to parser.elements()), you must ensure the following:<br>
294      * <ul>
295      * <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
296      * <li>After the first parse, calls to elements() must be preceded by calls to :
297      * <pre>
298      * parser.getReader().reset();
299      * </pre>
300      * </li>
301      * </ul>
302      * @param rd The reader to draw characters from.
303      * @param fb The object to use when information,
304      * warning and error messages are produced. If <em>null</em> no feedback
305      * is provided.
306      */
307     public Parser(NodeReader rd, ParserFeedback fb)
308     {
309         setFeedback(fb);
310         setScanners(null);
311         resourceLocn = null;
312         reader = null;
313         character_set = DEFAULT_CHARSET;
314         url_conn = null;
315         input = null;
316         setReader(rd);
317         Tag.setTagParser(new TagParser(feedback));
318     }
319 
320     /**
321      * Constructor for custom HTTP access.
322      * @param connection A fully conditioned connection. The connect()
323      * method will be called so it need not be connected yet.
324      * @param fb The object to use for message communication.
325      */
326     public Parser(URLConnection connection, ParserFeedback fb)
327         throws ParserException
328     {
329         setFeedback(fb);
330         setScanners(null);
331         resourceLocn = null;
332         reader = null;
333         character_set = DEFAULT_CHARSET;
334         url_conn = null;
335         input = null;
336         Tag.setTagParser(new TagParser(feedback));
337         setConnection(connection);
338     }
339 
340     /**
341      * Creates a Parser object with the location of the resource (URL or file)
342      * You would typically create a DefaultHTMLParserFeedback object and pass it in.
343      * @param resourceLocn Either the URL or the filename (autodetects).
344      * A standard HTTP GET is performed to read the content of the URL.
345      * @param feedback The HTMLParserFeedback object to use when information,
346      * warning and error messages are produced. If <em>null</em> no feedback
347      * is provided.
348      * @see #Parser(URLConnection,ParserFeedback)
349      */
350     public Parser(String resourceLocn, ParserFeedback feedback)
351         throws ParserException
352     {
353         this(ParserHelper.openConnection(resourceLocn, feedback), feedback);
354     }
355 
356     /**
357      * Creates a Parser object with the location of the resource (URL or file).
358      * A DefaultHTMLParserFeedback object is used for feedback.
359      * @param resourceLocn Either the URL or the filename (autodetects).
360      */
361     public Parser(String resourceLocn) throws ParserException
362     {
363         this(resourceLocn, stdout);
364     }
365 
366     /**
367      * This constructor is present to enable users to plugin their own readers. 
368      * A DefaultHTMLParserFeedback object is used for feedback. It can also be used with readers of the user's choice
369      * streaming data into the parser.<p/>
370      * <B>Important:</B> If you are using this constructor, and you would like to use the parser
371      * to parse multiple times (multiple calls to parser.elements()), you must ensure the following:<br>
372      * <ul>
373      * <li>Before the first parse, you must mark the reader for a length that you anticipate (the size of the stream).</li>
374      * <li>After the first parse, calls to elements() must be preceded by calls to :
375      * <pre>
376      * parser.getReader().reset();
377      * </pre>
378      * </li>
379      * @param reader The source for HTML to be parsed.
380      */
381     public Parser(NodeReader reader)
382     {
383         this(reader, stdout);
384     }
385 
386     /**
387      * Constructor for non-standard access.
388      * A DefaultHTMLParserFeedback object is used for feedback.
389      * @param connection A fully conditioned connection. The connect()
390      * method will be called so it need not be connected yet.
391      * @see #Parser(URLConnection,ParserFeedback)
392      */
393     public Parser(URLConnection connection) throws ParserException
394     {
395         this(connection, stdout);
396     }
397 
398     //
399     // Serialization support
400     //
401 
402     private void writeObject(ObjectOutputStream out) throws IOException
403     {
404         if ((null == getConnection()) || /*redundant*/
405              (null == getURL()))
406             if (null != getReader());
407         //  commented out by Somik - why are we not allowed to serialize parsers without url
408         //                throw new IOException ("can only serialize parsers with a URL");
409         out.defaultWriteObject();
410     }
411 
412     private void readObject(ObjectInputStream in)
413         throws IOException, ClassNotFoundException
414     {
415         in.defaultReadObject();
416         try
417         {
418             // reopen the connection and create a reader which are transient fields
419             setURL(getURL());
420         }
421         catch (ParserException hpe)
422         {
423             throw new IOException(hpe.toString());
424         }
425     }
426 
427     //
428     // Bean patterns
429     //
430 
431     /**
432      * Set the connection for this parser.
433      * This method sets four of the fields in the parser object;
434      * <code>resourceLocn</code>, <code>url_conn</code>, <code>character_set</code>
435      * and <code>reader</code>. It does not adjust the <code>scanners</code> list
436      * or <code>feedback</code> object. The four fields are set atomicly by
437      * this method, either they are all set or none of them is set. Trying to
438      * set the connection to null is a noop.
439      * @param connection A fully conditioned connection. The connect()
440      * method will be called so it need not be connected yet.
441      * @exception ParserException if the character set specified in the
442      * HTTP header is not supported, or an i/o exception occurs creating the
443      * reader.
444      */
445     public void setConnection(URLConnection connection) throws ParserException
446     {
447         String res;
448         NodeReader rd;
449         String chs;
450         URLConnection con;
451 
452         if (null != connection)
453         {
454             res = getURL();
455             rd = getReader();
456             chs = getEncoding();
457             con = getConnection();
458             try
459             {
460                 resourceLocn = connection.getURL().toExternalForm();
461                 url_conn = connection;
462                 url_conn.connect();
463                 character_set = getCharacterSet(url_conn);
464                 createReader();
465             }
466             catch (IOException ioe)
467             {
468                 String msg =
469                     "setConnection() : Error in opening a connection to "
470                         + connection.getURL().toExternalForm();
471                 ParserException ex = new ParserException(msg, ioe);
472                 feedback.error(msg, ex);
473                 resourceLocn = res;
474                 url_conn = con;
475                 character_set = chs;
476                 reader = rd;
477                 throw ex;
478             }
479         }
480     }
481 
482     /**
483      * Return the current connection.
484      * @return The connection either created by the parser or passed into this
485      * parser via <code>setConnection</code>.
486      * @see #setConnection(URLConnection)
487      */
488     public URLConnection getConnection()
489     {
490         return (url_conn);
491     }
492 
493     /**
494      * Set the URL for this parser.
495      * This method sets four of the fields in the parser object;
496      * <code>resourceLocn</code>, <code>url_conn</code>, <code>character_set</code>
497      * and <code>reader</code>. It does not adjust the <code>scanners</code> list
498      * or <code>feedback</code> object.Trying to set the url to null or an
499      * empty string is a noop.
500      * @see #setConnection(URLConnection)
501      */
502     public void setURL(String url) throws ParserException
503     {
504         if ((null != url) && !"".equals(url))
505             setConnection(ParserHelper.openConnection(url, getFeedback()));
506     }
507 
508     /**
509      * Return the current URL being parsed.
510      * @return The url passed into the constructor or the file name
511      * passed to the constructor modified to be a URL.
512      */
513     public String getURL()
514     {
515         return (resourceLocn);
516     }
517 
518     /**
519      * Set the encoding for this parser.
520      * If there is no connection (getConnection() returns null) it simply sets
521      * the character set name stored in the parser (Note: the reader object
522      * which must have been set in the constructor or by <code>setReader()</code>,
523      * may or may not be using this character set).
524      * Otherwise (getConnection() doesn't return null) it does this by reopening the
525      * input stream of the connection and creating a reader that uses this
526      * character set. In this case, this method sets two of the fields in the
527      * parser object; <code>character_set</code> and <code>reader</code>.
528      * It does not adjust <code>resourceLocn</code>, <code>url_conn</code>,
529      * <code>scanners</code> or <code>feedback</code>. The two fields are set
530      * atomicly by this method, either they are both set or none of them is set.
531      * Trying to set the encoding to null or an empty string is a noop.
532      * @exception ParserException If the opening of the reader
533      */
534     public void setEncoding(String encoding) throws ParserException
535     {
536         String chs;
537         NodeReader rd;
538         BufferedInputStream in;
539 
540         if ((null != encoding) && !"".equals(encoding))
541             if (null == getConnection())
542                 character_set = encoding;
543             else
544             {
545                 rd = getReader();
546                 chs = getEncoding();
547                 in = input;
548                 try
549                 {
550                     character_set = encoding;
551                     recreateReader();
552                 }
553                 catch (IOException ioe)
554                 {
555                     String msg =
556                         "setEncoding() : Error in opening a connection to "
557                             + getConnection().getURL().toExternalForm();
558                     ParserException ex = new ParserException(msg, ioe);
559                     feedback.error(msg, ex);
560                     character_set = chs;
561                     reader = rd;
562                     input = in;
563                     throw ex;
564                 }
565             }
566     }
567 
568     /**
569      * The current encoding.
570      * This item is et from the HTTP header but may be overridden by meta
571      * tags in the head, so this may change after the head has been parsed.
572      */
573     public String getEncoding()
574     {
575         return (character_set);
576     }
577 
578     /**
579      * Set the reader for this parser.
580      * This method sets four of the fields in the parser object;
581      * <code>resourceLocn</code>, <code>url_conn</code>, <code>character_set</code>
582      * and <code>reader</code>. It does not adjust the <code>scanners</code> list
583      * or <code>feedback</code> object. The <code>url_conn</code> is set to
584      * null since this cannot be determined from the reader. The 
585      * <code>character_set</code> is set to the default character set since
586      * this cannot be determined from the reader.
587      * Trying to set the reader to <code>null</code> is a noop.
588      * @param rd The reader object to use. This reader will be bound to this
589      * parser after this call.
590      */
591     public void setReader(NodeReader rd)
592     {
593         if (null != rd)
594         {
595             resourceLocn = rd.getURL();
596             reader = rd;
597             character_set = DEFAULT_CHARSET;
598             url_conn = null;
599             reader.setParser(this);
600         }
601     }
602 
603     /**
604      * Returns the reader associated with the parser
605      * @return NodeReader
606      */
607     public NodeReader getReader()
608     {
609         return reader;
610     }
611 
612     /**
613      * Get the number of scanners registered currently in the scanner.
614      * @return int number of scanners registered
615      */
616     public int getNumScanners()
617     {
618         return scanners.size();
619     }
620 
621     /**
622      * This method is to be used to change the set of scanners in the current parser.
623      * @param newScanners Vector holding scanner objects to be used during the parsing process.
624      */
625     public void setScanners(Map newScanners)
626     {
627         scanners = (null == newScanners) ? new HashMap() : newScanners;
628     }
629 
630     /**
631      * Get an enumeration of scanners registered currently in the parser
632      * @return Enumeration of scanners currently registered in the parser
633      */
634     public Map getScanners()
635     {
636         return scanners;
637     }
638 
639     /**
640      * Sets the feedback object used in scanning.
641      * @param fb The new feedback object to use.
642      */
643     public void setFeedback(ParserFeedback fb)
644     {
645         feedback = (null == fb) ? noFeedback : fb;
646     }
647 
648     /**
649      * Returns the feedback.
650      * @return HTMLParserFeedback
651      */
652     public ParserFeedback getFeedback()
653     {
654         return feedback;
655     }
656 
657     //
658     // Internal methods
659     //
660 
661     /**
662      * Open a stream reader on the <code>InputStream</code>.
663      * Revise the character set to it's default value if an
664      * <code>UnsupportedEncodingException</code> is thrown.
665      * @exception UnsupportedEncodingException in the unlikely event that
666      * the default character set is not supported on this platform.
667      */
668     protected InputStreamReader createInputStreamReader()
669         throws UnsupportedEncodingException
670     {
671         InputStreamReader ret;
672 
673         try
674         {
675             ret = new InputStreamReader(input, character_set);
676         }
677         catch (UnsupportedEncodingException uee)
678         {
679             StringBuffer msg;
680             String message;
681 
682             msg = new StringBuffer(1024);
683             msg.append(url_conn.getURL().toExternalForm());
684             msg.append(" has an encoding (");
685             msg.append(character_set);
686             msg.append(") which is not supported, using ");
687             msg.append(DEFAULT_CHARSET);
688             message = msg.toString();
689             feedback.warning(message);
690             character_set = DEFAULT_CHARSET;
691             ret = new InputStreamReader(input, character_set);
692         }
693 
694         return (ret);
695     }
696 
697     /**
698      * Create a new reader for the URLConnection object.
699      * The current character set is used to transform the input stream
700      * into a character reader.
701      * @exception IOException if there is a problem constructing the reader.
702      * @see #createInputStreamReader()
703      * @see #getEncoding()
704      */
705     protected void createReader() throws IOException
706     {
707         InputStream stream;
708         InputStreamReader in;
709 
710         stream = url_conn.getInputStream();
711         input = new BufferedInputStream(stream);
712         input.mark(Integer.MAX_VALUE);
713         in = createInputStreamReader();
714         reader = new NodeReader(in, resourceLocn);
715         reader.setParser(this);
716     }
717 
718     /**
719      * Create a new reader for the URLConnection object but reuse the input stream.
720      * The current character set is used to transform the input stream
721      * into a character reader. Defaults to <code>createReader()</code> if
722      * there is no existing input stream.
723      * @exception IOException if there is a problem constructing the reader.
724      * @see #createReader()
725      * @see #createInputStreamReader()
726      * @see #getEncoding()
727      */
728     protected void recreateReader() throws IOException
729     {
730         InputStreamReader in;
731 
732         if (null == input)
733             createReader();
734         else
735         {
736             input.reset();
737             input.mark(Integer.MAX_VALUE);
738             in = createInputStreamReader();
739             reader = new NodeReader(in, resourceLocn);
740             reader.setParser(this);
741         }
742     }
743 
744     /**
745      * Try and extract the character set from the HTTP header.
746      * @param connection The connection with the charset info.
747      * @return The character set name to use for this HTML page.
748      */
749     protected String getCharacterSet(URLConnection connection)
750     {
751         final String field = "Content-Type";
752 
753         String string;
754         String ret;
755 
756         ret = DEFAULT_CHARSET;
757         string = connection.getHeaderField(field);
758         if (null != string)
759             ret = getCharset(string);
760 
761         return (ret);
762     }
763 
764     /**
765      * Get a CharacterSet name corresponding to a charset parameter.
766      * @param content A text line of the form:
767      * <pre>
768      * text/html; charset=Shift_JIS
769      * </pre>
770      * which is applicable both to the HTTP header field Content-Type and
771      * the meta tag http-equiv="Content-Type".
772      * Note this method also handles non-compliant quoted charset directives such as:
773      * <pre>
774      * text/html; charset="UTF-8"
775      * </pre>
776      * and
777      * <pre>
778      * text/html; charset='UTF-8'
779      * </pre>
780      * @return The character set name to use when reading the input stream.
781      * For JDKs that have the Charset class this is qualified by passing
782      * the name to findCharset() to render it into canonical form.
783      * If the charset parameter is not found in the given string, the default
784      * character set is returned.
785      * @see ParserHelper#findCharset
786      * @see #DEFAULT_CHARSET
787      */
788     protected String getCharset(String content)
789     {
790         int index;
791         String ret;
792 
793         ret = DEFAULT_CHARSET;
794         if (null != content)
795         {
796             index = content.indexOf(CHARSET_STRING);
797 
798             if (index != -1)
799             {
800                 content =
801                     content.substring(index + CHARSET_STRING.length()).trim();
802                 if (content.startsWith("="))
803                 {
804                     content = content.substring(1).trim();
805                     index = content.indexOf(";");
806                     if (index != -1)
807                         content = content.substring(0, index);
808 
809                     //remove any double quotes from around charset string
810                     if (content.startsWith("\"")
811                         && content.endsWith("\"")
812                         && (1 < content.length()))
813                         content = content.substring(1, content.length() - 1);
814 
815                     //remove any single quote from around charset string
816                     if (content.startsWith("'")
817                         && content.endsWith("'")
818                         && (1 < content.length()))
819                         content = content.substring(1, content.length() - 1);
820 
821                     ret = ParserHelper.findCharset(content, ret);
822                     // Charset names are not case-sensitive;
823                     // that is, case is always ignored when comparing charset names.
824                     if (!ret.equalsIgnoreCase(content))
825                     {
826                         feedback.info(
827                             "detected charset \""
828                                 + content
829                                 + "\", using \""
830                                 + ret
831                                 + "\"");
832                     }
833                 }
834             }
835         }
836 
837         return (ret);
838     }
839 
840     //
841     // Public methods
842     //
843 
844     /**
845      * Add a new Tag Scanner.
846      * In typical situations where you require a no-frills parser, use the registerScanners() method to add the most
847      * common parsers. But when you wish to either compose a parser with only certain scanners registered, use this method.
848      * It is advantageous to register only the scanners you want, in order to achieve faster parsing speed. This method 
849      * would also be of use when you have developed custom scanners, and need to register them into the parser.
850      * @param scanner TagScanner object (or derivative) to be added to the list of registered scanners
851      */
852     public void addScanner(TagScanner scanner)
853     {
854         String ids[] = scanner.getID();
855         for (int i = 0; i < ids.length; i++)
856         {
857             scanners.put(ids[i], scanner);
858         }
859         scanner.setFeedback(feedback);
860     }
861 
862     /**
863      * Returns an iterator (enumeration) to the html nodes. Each node can be a tag/endtag/
864      * string/link/image<br>
865      * This is perhaps the most important method of this class. In typical situations, you will need to use
866      * the parser like this :
867      * <pre>
868      * Parser parser = new Parser("http://www.yahoo.com");
869      * parser.registerScanners();
870      * for (NodeIterator i = parser.elements();i.hasMoreElements();) {
871      *    Node node = i.nextHTMLNode();
872      *    if (node instanceof StringNode) {
873      *      // Downcasting to StringNode
874      *      StringNode stringNode = (StringNode)node;
875      *      // Do whatever processing you want with the string node
876      *      System.out.println(stringNode.getText());
877      *    }
878      *    // Check for the node or tag that you want
879      *    if (node instanceof ...) {
880      *      // Downcast, and process
881      *    }
882      * }
883      * </pre>
884      */
885     public NodeIterator elements() throws ParserException
886     {
887         boolean remove_scanner;
888         Node node;
889         MetaTag meta;
890         String httpEquiv;
891         String charset;
892         boolean restart;
893         EndTag end;
894         IteratorImpl ret;
895 
896         remove_scanner = false;
897         restart = false;
898         ret = new IteratorImpl(reader, resourceLocn, feedback);
899         ret = createIteratorImpl(remove_scanner, ret);
900 
901         return (ret);
902     }
903 
904     public IteratorImpl createIteratorImpl(
905         boolean remove_scanner,
906         IteratorImpl ret)
907         throws ParserException
908     {
909         Node node;
910         MetaTag meta;
911         String httpEquiv;
912         String charset;
913         EndTag end;
914         if (null != url_conn)
915             try
916             {
917                 if (null == scanners.get("-m"))
918                 {
919                     addScanner(new MetaTagScanner("-m"));
920                     remove_scanner = true;
921                 }
922 
923                 /* pre-read up to </HEAD> looking for charset directive */
924                 while (null != (node = ret.peek()))
925                 {
926                     if (node instanceof MetaTag)
927                     { // check for charset on Content-Type
928                         meta = (MetaTag) node;
929                         httpEquiv = meta.getAttribute("HTTP-EQUIV");
930                         if ("Content-Type".equalsIgnoreCase(httpEquiv))
931                         {
932                             charset = getCharset(meta.getAttribute("CONTENT"));
933                             if (!charset.equalsIgnoreCase(character_set))
934                             { // oops, different character set, restart
935                                 character_set = charset;
936                                 recreateReader();
937                                 ret =
938                                     new IteratorImpl(
939                                         reader,
940                                         resourceLocn,
941                                         feedback);
942                             }
943                             // once we see the Content-Type meta tag we're finished the pre-read
944                             break;
945                         }
946                     }
947                     else if (node instanceof EndTag)
948                     {
949                         end = (EndTag) node;
950                         if (end.getTagName().equalsIgnoreCase("HEAD"))
951                             // or, once we see the </HEAD> tag we're finished the pre-read
952                             break;
953                     }
954                 }
955             }
956             catch (UnsupportedEncodingException uee)
957             {
958                 String msg =
959                     "elements() : The content of "
960                         + url_conn.getURL().toExternalForm()
961                         + " has an encoding which is not supported";
962                 ParserException ex = new ParserException(msg, uee);
963                 feedback.error(msg, ex);
964                 throw ex;
965             }
966             catch (IOException ioe)
967             {
968                 String msg =
969                     "elements() : Error in opening a connection to "
970                         + url_conn.getURL().toExternalForm();
971                 ParserException ex = new ParserException(msg, ioe);
972                 feedback.error(msg, ex);
973                 throw ex;
974             }
975             finally
976             {
977                 if (remove_scanner)
978                     scanners.remove("-m");
979             }
980         return ret;
981     }
982 
983     /**
984      * Flush the current scanners registered. The registered scanners list becomes empty with this call.
985      */
986     public void flushScanners()
987     {
988         scanners = new Hashtable();
989     }
990 
991     /**
992      * Return the scanner registered in the parser having the
993      * given id
994      * @param id The id of the requested scanner
995      * @return TagScanner The Tag Scanner
996      */
997     public TagScanner getScanner(String id)
998     {
999         return (TagScanner) scanners.get(id);
1000    }
1001
1002    /**
1003     * Parse the given resource, using the filter provided
1004     */
1005    public void parse(String filter) throws Exception
1006    {
1007        Node node;
1008        for (NodeIterator e = elements(); e.hasMoreNodes();)
1009        {
1010            node = e.nextNode();
1011            if (node != null)
1012            {
1013                if (filter == null)
1014                    System.out.println(node.toString());
1015                else
1016                {
1017                    // There is a filter. Find if the associated filter of this node
1018                    // matches the specified filter
1019                    if (!(node instanceof Tag))
1020                        continue;
1021                    Tag tag = (Tag) node;
1022                    TagScanner scanner = tag.getThisScanner();
1023                    if (scanner == null)
1024                        continue;
1025
1026                    String tagFilter = scanner.getFilter();
1027                    if (tagFilter == null)
1028                        continue;
1029                    if (tagFilter.equals(filter))
1030                        System.out.println(node.toString());
1031                }
1032            }
1033            else
1034                System.out.println("Node is null");
1035        }
1036
1037    }
1038
1039    /**
1040     * This method should be invoked in order to register some common scanners. The scanners that get added are : <br>
1041     * LinkScanner    (filter key "-l")<br>
1042     * HTMLImageScanner   (filter key "-i")<br>
1043     * HTMLScriptScanner  (filter key "-s") <br>
1044     * HTMLStyleScanner   (filter key "-t") <br>
1045     * HTMLJspScanner     (filter key "-j") <br>
1046     * HTMLAppletScanner  (filter key "-a") <br>
1047     * HTMLMetaTagScanner (filter key "-m") <br>
1048     * HTMLTitleScanner   (filter key "-t") <br>
1049     * HTMLDoctypeScanner (filter key "-d") <br>
1050     * HTMLFormScanner    (filter key "-f") <br>
1051     * HTMLFrameSetScanner(filter key "-r") <br>
1052     * HTMLBaseHREFScanner(filter key "-b") <br>
1053     * <br>
1054     * Call this method after creating the Parser object. e.g. <BR>
1055     * <pre>
1056     * Parser parser = new Parser("http://www.yahoo.com");
1057     * parser.registerScanners();
1058     * </pre>
1059     */
1060    public void registerScanners()
1061    {
1062        if (scanners.size() > 0)
1063        {
1064            System.err.println(
1065                "registerScanners() should be called first, when no other scanner has been registered.");
1066            System.err.println(
1067                "Other scanners already exist, hence this method call wont have any effect");
1068            return;
1069        }
1070        LinkScanner linkScanner = new LinkScanner(LinkTag.LINK_TAG_FILTER);
1071        // Note - The BaseHREF and Image scanners share the same
1072        // link processor - internally linked up with the factory
1073        // method in the link scanner class
1074        addScanner(linkScanner);
1075        addScanner(linkScanner.createImageScanner(ImageTag.IMAGE_TAG_FILTER));
1076        addScanner(new ScriptScanner("-s"));
1077        addScanner(new StyleScanner("-t"));
1078        addScanner(new JspScanner("-j"));
1079        addScanner(new AppletScanner("-a"));
1080        addScanner(new MetaTagScanner("-m"));
1081        addScanner(new TitleScanner("-T"));
1082        addScanner(new DoctypeScanner("-d"));
1083        addScanner(new FormScanner("-f", this));
1084        addScanner(new FrameSetScanner("-r"));
1085        addScanner(linkScanner.createBaseHREFScanner("-b"));
1086        addScanner(new BulletListScanner("-bulletList", this));
1087        //  addScanner(new SpanScanner("-p"));
1088        addScanner(new DivScanner("-div"));
1089        addScanner(new TableScanner(this));
1090    }
1091
1092    /**
1093     * Make a call to registerDomScanners(), instead of registerScanners(),
1094     * when you are interested in retrieving a Dom representation of the html
1095     * page. Upon parsing, you will receive an Html object - which will contain
1096     * children, one of which would be the body. This is still evolving, and in 
1097     * future releases, you might see consolidation of Html - to provide you 
1098     * with methods to access the body and the head.
1099     */
1100    public void registerDomScanners()
1101    {
1102        registerScanners();
1103        addScanner(new HtmlScanner());
1104        addScanner(new BodyScanner());
1105        addScanner(new HeadScanner());
1106    }
1107
1108    /**
1109     * Removes a specified scanner object. You can create
1110     * an anonymous object as a parameter. This method
1111     * will use the scanner's key and remove it from the 
1112     * registry of scanners.
1113     * e.g.
1114     * <pre>
1115     * removeScanner(new FormScanner(""));
1116     * </pre>
1117     * @param scanner TagScanner object to be removed from the list of registered scanners
1118     */
1119    public void removeScanner(TagScanner scanner)
1120    {
1121        scanners.remove(scanner.getID()[0]);
1122    }
1123
1124    /**
1125     * The main program, which can be executed from the command line
1126     */
1127    public static void main(String[] args)
1128    {
1129        System.out.println("HTMLParser v" + VERSION_STRING);
1130        if (args.length < 1 || args[0].equals("-help"))
1131        {
1132            System.out.println();
1133            System.out.println(
1134                "Syntax : java -jar htmlparser.jar <resourceLocn/website> -l");
1135            System.out.println(
1136                "   <resourceLocn> the name of the file to be parsed (with complete path if not in current directory)");
1137            System.out.println(
1138                "   -l Show only the link tags extracted from the document");
1139            System.out.println(
1140                "   -i Show only the image tags extracted from the document");
1141            System.out.println(
1142                "   -s Show only the Javascript code extracted from the document");
1143            System.out.println(
1144                "   -t Show only the Style code extracted from the document");
1145            System.out.println(
1146                "   -a Show only the Applet tag extracted from the document");
1147            System.out.println("   -j Parse JSP tags");
1148            System.out.println("   -m Parse Meta tags");
1149            System.out.println("   -T Extract the Title");
1150            System.out.println("   -f Extract forms");
1151            System.out.println("   -r Extract frameset");
1152            System.out.println("   -help This screen");
1153            System.out.println();
1154            System.out.println(
1155                "HTML Parser home page : http://htmlparser.sourceforge.net");
1156            System.out.println();
1157            System.out.println(
1158                "Example : java -jar htmlparser.jar http://www.yahoo.com");
1159            System.out.println();
1160            System.out.println(
1161                "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
1162            System.exit(-1);
1163        }
1164        try
1165        {
1166            if (args[0].indexOf("http") < 0)
1167            {
1168                File input = new File(args[0]);
1169                try
1170                {
1171                    args[0] = input.toURL().toString();
1172                    System.out.println("file converted to URL: " + args[0]);
1173                }
1174                catch (MalformedURLException e)
1175                {
1176                    e.printStackTrace();
1177                }
1178            }
1179            Parser parser = new Parser(args[0]);
1180            System.out.println("Parsing " + parser.getURL());
1181            parser.registerScanners();
1182            try
1183            {
1184                long start = System.currentTimeMillis();
1185                if (args.length == 2)
1186                {
1187                    parser.parse(args[1]);
1188                }
1189                else
1190                    parser.parse(null);
1191                System.out.println(
1192                    "Elapsed Time ms: " + (System.currentTimeMillis() - start));
1193            }
1194            catch (Exception e)
1195            {
1196                e.printStackTrace();
1197            }
1198        }
1199        catch (ParserException e)
1200        {
1201            e.printStackTrace();
1202        }
1203    }
1204
1205    public void visitAllNodesWith(NodeVisitor visitor) throws ParserException
1206    {
1207        Node node;
1208        for (NodeIterator e = elements(); e.hasMoreNodes();)
1209        {
1210            node = e.nextNode();
1211            node.accept(visitor);
1212        }
1213        visitor.finishedParsing();
1214    }
1215
1216    /** Initializes the parser with the given input HTML String.
1217     * @param inputHTML the input HTML that is to be parsed.
1218     */
1219    public void setInputHTML(String inputHTML)
1220    {
1221        if ("".equals(inputHTML))
1222        {
1223            reader = new NodeReader(new StringReader(inputHTML), "");
1224        }
1225    }
1226
1227    public Node[] extractAllNodesThatAre(Class nodeType) throws ParserException
1228    {
1229        NodeList nodeList = new NodeList();
1230        for (NodeIterator e = elements(); e.hasMoreNodes();)
1231        {
1232            e.nextNode().collectInto(nodeList, nodeType);
1233        }
1234        return nodeList.toNodeArray();
1235    }
1236
1237    /**
1238     * Creates the parser on an input string.
1239     * @param inputHTML
1240     * @return Parser
1241     */
1242    public static Parser createParser(String inputHTML)
1243    {
1244        NodeReader reader = new NodeReader(new StringReader(inputHTML), "");
1245        return new Parser(reader);
1246    }
1247
1248    public static Parser createLinkRecognizingParser(String inputHTML)
1249    {
1250        Parser parser = createParser(inputHTML);
1251        parser.addScanner(new LinkScanner(LinkTag.LINK_TAG_FILTER));
1252        return parser;
1253    }
1254}