Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/htmlparser/parserapplications/MailRipper.java


1   // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/parserapplications/MailRipper.java,v 1.2 2004/02/10 13:41:07 woolfel Exp $
2   /*
3    * ====================================================================
4    * Copyright 2002-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   * 
18   */
19  
20  // The developers of JMeter and Apache are grateful to the developers
21  // of HTMLParser for giving Apache Software Foundation a non-exclusive
22  // license. The performance benefits of HTMLParser are clear and the
23  // users of JMeter will benefit from the hard work of the HTMLParser
24  // team. For detailed information about HTMLParser, the project is
25  // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26  //
27  // HTMLParser was originally created by Somik Raha in 2000. Since then
28  // a healthy community of users has formed and helped refine the
29  // design so that it is able to tackle the difficult task of parsing
30  // dirty HTML. Derrick Oswald is the current lead developer and was kind
31  // enough to assist JMeter.
32  
33  
34  package org.htmlparser.parserapplications;
35  import java.util.Enumeration;
36  import java.util.Vector;
37  
38  import org.htmlparser.Node;
39  import org.htmlparser.Parser;
40  import org.htmlparser.tags.LinkTag;
41  import org.htmlparser.util.DefaultParserFeedback;
42  import org.htmlparser.util.NodeIterator;
43  import org.htmlparser.util.ParserException;
44  
45  
46  /**
47   * MailRipper will rip out all the mail addresses from a given web page
48   * Pass a web site (or html file on your local disk) as an argument.
49   */
50  public class MailRipper
51  {
52      private org.htmlparser.Parser parser;
53      /**
54       * MailRipper c'tor takes the url to be ripped
55       * @param resourceLocation url to be ripped
56       */
57      public MailRipper(String resourceLocation)
58      {
59          try
60          {
61              parser = new Parser(resourceLocation, new DefaultParserFeedback());
62              parser.registerScanners();
63          }
64          catch (ParserException e)
65          {
66              System.err.println("Could not create parser object");
67              e.printStackTrace();
68          }
69      }
70      public static void main(String[] args)
71      {
72          System.out.println("Mail Ripper v" + Parser.getVersion());
73          if (args.length < 1 || args[0].equals("-help"))
74          {
75              System.out.println();
76              System.out.println(
77                  "Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.MailRipper <resourceLocn/website>");
78              System.out.println();
79              System.out.println(
80                  "   <resourceLocn> the name of the file to be parsed (with complete path ");
81              System.out.println(
82                  "                  if not in current directory)");
83              System.out.println("   -help This screen");
84              System.out.println();
85              System.out.println(
86                  "HTML Parser home page : http://htmlparser.sourceforge.net");
87              System.out.println();
88              System.out.println(
89                  "Example : java -classpath htmlparser.jar com.kizna.parserapplications.MailRipper http://htmlparser.sourceforge.net");
90              System.out.println();
91              System.out.println(
92                  "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
93              System.exit(-1);
94          }
95          String resourceLocation = "http://htmlparser.sourceforge.net";
96          if (args.length != 0)
97              resourceLocation = args[0];
98  
99          MailRipper ripper = new MailRipper(resourceLocation);
100         System.out.println("Ripping Site " + resourceLocation);
101         try
102         {
103             for (Enumeration e = ripper.rip(); e.hasMoreElements();)
104             {
105                 LinkTag tag = (LinkTag) e.nextElement();
106                 System.out.println("Ripped mail address : " + tag.getLink());
107             }
108         }
109         catch (ParserException e)
110         {
111             e.printStackTrace();
112         }
113     }
114     /**
115      * Rip all mail addresses from the given url, and return an enumeration of such mail addresses.
116      * @return Enumeration of mail addresses (a vector of LinkTag)
117      */
118     public Enumeration rip() throws ParserException
119     {
120         Node node;
121         Vector mailAddresses = new Vector();
122         for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
123         {
124             node = e.nextNode();
125             if (node instanceof LinkTag)
126             {
127                 LinkTag linkTag = (LinkTag) node;
128                 if (linkTag.isMailLink())
129                     mailAddresses.addElement(linkTag);
130             }
131         }
132         return mailAddresses.elements();
133     }
134 }