Source code: org/htmlparser/parserapplications/MailRipper.java
1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/parserapplications/MailRipper.java,v 1.2 2004/02/10 13:41:07 woolfel Exp $
2 /*
3 * ====================================================================
4 * Copyright 2002-2004 The Apache Software Foundation.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19
20 // The developers of JMeter and Apache are grateful to the developers
21 // of HTMLParser for giving Apache Software Foundation a non-exclusive
22 // license. The performance benefits of HTMLParser are clear and the
23 // users of JMeter will benefit from the hard work of the HTMLParser
24 // team. For detailed information about HTMLParser, the project is
25 // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26 //
27 // HTMLParser was originally created by Somik Raha in 2000. Since then
28 // a healthy community of users has formed and helped refine the
29 // design so that it is able to tackle the difficult task of parsing
30 // dirty HTML. Derrick Oswald is the current lead developer and was kind
31 // enough to assist JMeter.
32
33
34 package org.htmlparser.parserapplications;
35 import java.util.Enumeration;
36 import java.util.Vector;
37
38 import org.htmlparser.Node;
39 import org.htmlparser.Parser;
40 import org.htmlparser.tags.LinkTag;
41 import org.htmlparser.util.DefaultParserFeedback;
42 import org.htmlparser.util.NodeIterator;
43 import org.htmlparser.util.ParserException;
44
45
46 /**
47 * MailRipper will rip out all the mail addresses from a given web page
48 * Pass a web site (or html file on your local disk) as an argument.
49 */
50 public class MailRipper
51 {
52 private org.htmlparser.Parser parser;
53 /**
54 * MailRipper c'tor takes the url to be ripped
55 * @param resourceLocation url to be ripped
56 */
57 public MailRipper(String resourceLocation)
58 {
59 try
60 {
61 parser = new Parser(resourceLocation, new DefaultParserFeedback());
62 parser.registerScanners();
63 }
64 catch (ParserException e)
65 {
66 System.err.println("Could not create parser object");
67 e.printStackTrace();
68 }
69 }
70 public static void main(String[] args)
71 {
72 System.out.println("Mail Ripper v" + Parser.getVersion());
73 if (args.length < 1 || args[0].equals("-help"))
74 {
75 System.out.println();
76 System.out.println(
77 "Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.MailRipper <resourceLocn/website>");
78 System.out.println();
79 System.out.println(
80 " <resourceLocn> the name of the file to be parsed (with complete path ");
81 System.out.println(
82 " if not in current directory)");
83 System.out.println(" -help This screen");
84 System.out.println();
85 System.out.println(
86 "HTML Parser home page : http://htmlparser.sourceforge.net");
87 System.out.println();
88 System.out.println(
89 "Example : java -classpath htmlparser.jar com.kizna.parserapplications.MailRipper http://htmlparser.sourceforge.net");
90 System.out.println();
91 System.out.println(
92 "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
93 System.exit(-1);
94 }
95 String resourceLocation = "http://htmlparser.sourceforge.net";
96 if (args.length != 0)
97 resourceLocation = args[0];
98
99 MailRipper ripper = new MailRipper(resourceLocation);
100 System.out.println("Ripping Site " + resourceLocation);
101 try
102 {
103 for (Enumeration e = ripper.rip(); e.hasMoreElements();)
104 {
105 LinkTag tag = (LinkTag) e.nextElement();
106 System.out.println("Ripped mail address : " + tag.getLink());
107 }
108 }
109 catch (ParserException e)
110 {
111 e.printStackTrace();
112 }
113 }
114 /**
115 * Rip all mail addresses from the given url, and return an enumeration of such mail addresses.
116 * @return Enumeration of mail addresses (a vector of LinkTag)
117 */
118 public Enumeration rip() throws ParserException
119 {
120 Node node;
121 Vector mailAddresses = new Vector();
122 for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
123 {
124 node = e.nextNode();
125 if (node instanceof LinkTag)
126 {
127 LinkTag linkTag = (LinkTag) node;
128 if (linkTag.isMailLink())
129 mailAddresses.addElement(linkTag);
130 }
131 }
132 return mailAddresses.elements();
133 }
134 }