Source code: org/htmlparser/parserapplications/Robot.java
1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/parserapplications/Robot.java,v 1.2 2004/02/10 13:41:07 woolfel Exp $
2 /*
3 * ====================================================================
4 * Copyright 2002-2004 The Apache Software Foundation.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19
20 // The developers of JMeter and Apache are grateful to the developers
21 // of HTMLParser for giving the Apache Software Foundation a non-exclusive
22 // license. The performance benefits of HTMLParser are clear and the
23 // users of JMeter will benefit from the hard work of the HTMLParser
24 // team. For detailed information about HTMLParser, the project is
25 // hosted on SourceForge at http://htmlparser.sourceforge.net/.
26 //
27 // HTMLParser was originally created by Somik Raha in 2000. Since then
28 // a healthy community of users has formed and helped refine the
29 // design so that it is able to tackle the difficult task of parsing
30 // dirty HTML. Derrick Oswald is the current lead developer and was kind
31 // enough to assist JMeter.
32
33 package org.htmlparser.parserapplications;
34 import org.htmlparser.Node;
35 import org.htmlparser.Parser;
36 import org.htmlparser.tags.LinkTag;
37 import org.htmlparser.util.DefaultParserFeedback;
38 import org.htmlparser.util.NodeIterator;
39 import org.htmlparser.util.ParserException;
40 /**
41 * The Robot Crawler application will crawl through urls recursively, based on a depth value.
42 */
43 public class Robot
44 {
45 private org.htmlparser.Parser parser;
46 /**
47 * Robot crawler - Provide the starting url
48 */
49 public Robot(String resourceLocation)
50 {
51 try
52 {
53 parser = new Parser(resourceLocation, new DefaultParserFeedback());
54 parser.registerScanners();
55 }
56 catch (ParserException e)
57 {
58 System.err.println("Error, could not create parser object");
59 e.printStackTrace();
60 }
61 }
62 /**
63 * Crawl using a given crawl depth.
64 * @param crawlDepth Depth of crawling
65 */
66 public void crawl(int crawlDepth) throws ParserException
67 {
68 try
69 {
70 crawl(parser, crawlDepth);
71 }
72 catch (ParserException e)
73 {
74 throw new ParserException(
75 "HTMLParserException at crawl(" + crawlDepth + ")",
76 e);
77 }
78 }
79 /**
80 * Crawl using a given parser object, and a given crawl depth.
81 * @param parser Parser object
82 * @param crawlDepth Depth of crawling
83 */
84 public void crawl(Parser parser, int crawlDepth) throws ParserException
85 {
86 System.out.println(" crawlDepth = " + crawlDepth);
87 for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
88 {
89 Node node = e.nextNode();
90 if (node instanceof LinkTag)
91 {
92 LinkTag linkTag = (LinkTag) node;
93 {
94 if (!linkTag.isMailLink())
95 {
96 if (linkTag.getLink().toUpperCase().indexOf("HTM")
97 != -1
98 || linkTag.getLink().toUpperCase().indexOf("COM")
99 != -1
100 || linkTag.getLink().toUpperCase().indexOf("ORG")
101 != -1)
102 {
103 if (crawlDepth > 0)
104 {
105 Parser newParser =
106 new Parser(
107 linkTag.getLink(),
108 new DefaultParserFeedback());
109 newParser.registerScanners();
110 System.out.print(
111 "Crawling to " + linkTag.getLink());
112 crawl(newParser, crawlDepth - 1);
113 }
114 else
115 System.out.println(linkTag.getLink());
116 }
117 }
118 }
119 }
120 }
121 }
122
123 public static void main(String[] args)
124 {
125 System.out.println("Robot Crawler v" + Parser.getVersion());
126 if (args.length < 2 || args[0].equals("-help"))
127 {
128 System.out.println();
129 System.out.println(
130 "Syntax : java -classpath htmlparser.jar org.htmlparser.parserapplications.Robot <resourceLocn/website> <depth>");
131 System.out.println();
132 System.out.println(
133 " <resourceLocn> the name of the file to be parsed (with complete path ");
134 System.out.println(
135 " if not in current directory)");
136 System.out.println(
137 " <depth> No of links to be followed from each link");
138 System.out.println(" -help This screen");
139 System.out.println();
140 System.out.println(
141 "HTML Parser home page : http://htmlparser.sourceforge.net");
142 System.out.println();
143 System.out.println(
144 "Example : java -classpath htmlparser.jar com.kizna.parserapplications.Robot http://www.google.com 3");
145 System.out.println();
146 System.out.println(
147 "If you have any doubts, please join the HTMLParser mailing list (user/developer) from the HTML Parser home page instead of mailing any of the contributors directly. You will be surprised with the quality of open source support. ");
148 System.exit(-1);
149 }
150 String resourceLocation = "";
151 int crawlDepth = 1;
152 if (args.length != 0)
153 resourceLocation = args[0];
154 if (args.length == 2)
155 crawlDepth = Integer.valueOf(args[1]).intValue();
156
157 Robot robot = new Robot(resourceLocation);
158 System.out.println("Crawling Site " + resourceLocation);
159 try
160 {
161 robot.crawl(crawlDepth);
162 }
163 catch (ParserException e)
164 {
165 e.printStackTrace();
166 }
167 }
168 }