Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/htmlparser/parserapplications/LinkExtractor.java


1   // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/parserapplications/LinkExtractor.java,v 1.2 2004/02/10 13:41:07 woolfel Exp $
2   /*
3    * ====================================================================
4    * Copyright 2002-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   * 
18   */
19  
20  // The developers of JMeter and Apache are grateful to the developers
21  // of HTMLParser for giving Apache Software Foundation a non-exclusive
22  // license. The performance benefits of HTMLParser are clear and the
23  // users of JMeter will benefit from the hard work of the HTMLParser
24  // team. For detailed information about HTMLParser, the project is
25  // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26  //
27  // HTMLParser was originally created by Somik Raha in 2000. Since then
28  // a healthy community of users has formed and helped refine the
29  // design so that it is able to tackle the difficult task of parsing
30  // dirty HTML. Derrick Oswald is the current lead developer and was kind
31  // enough to assist JMeter.
32  
33  
34  package org.htmlparser.parserapplications;
35  
36  
37  import org.htmlparser.Node;
38  import org.htmlparser.Parser;
39  import org.htmlparser.tags.LinkTag;
40  import org.htmlparser.util.ParserException;
41  
42  /**
43   * LinkExtractor extracts all the links from the given webpage
44   * and prints them on standard output.
45   */
46  public class LinkExtractor
47  {
48      private String location;
49      private Parser parser;
50      public LinkExtractor(String location)
51      {
52          this.location = location;
53          try
54          {
55              this.parser = new Parser(location); // Create the parser object
56              parser.registerScanners();
57              // Register standard scanners (Very Important)
58          }
59          catch (ParserException e)
60          {
61              e.printStackTrace();
62          }
63  
64      }
65      public void extractLinks() throws ParserException
66      {
67          System.out.println("Parsing " + location + " for links...");
68          Node[] links = parser.extractAllNodesThatAre(LinkTag.class);
69          for (int i = 0; i < links.length; i++)
70          {
71              LinkTag linkTag = (LinkTag) links[i];
72              // Print it
73              //      System.out.println(linkTag.toString()); 
74              System.out.println(linkTag.getLink());
75              // To extract only mail addresses, uncomment the following line
76              //      if (linkTag.isMailLink()) System.out.println(linkTag.getLink());
77          }
78      }
79  
80      public static void main(String[] args)
81      {
82          if (args.length < 0)
83          {
84              System.err.println(
85                  "Syntax Error : Please provide the location(URL or file) to parse");
86              System.exit(-1);
87          }
88          LinkExtractor linkExtractor = new LinkExtractor(args[0]);
89          try
90          {
91              linkExtractor.extractLinks();
92          }
93          catch (ParserException e)
94          {
95              e.printStackTrace();
96          }
97      }
98  }