1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.nutch.crawl;
19
20 import java.util;
21 import java.text;
22
23 // Commons Logging imports
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26
27 import org.apache.hadoop.fs;
28 import org.apache.hadoop.conf;
29 import org.apache.hadoop.mapred;
30 import org.apache.nutch.parse.ParseSegment;
31 import org.apache.nutch.indexer.DeleteDuplicates;
32 import org.apache.nutch.indexer.IndexMerger;
33 import org.apache.nutch.indexer.Indexer;
34 import org.apache.nutch.util.HadoopFSUtil;
35 import org.apache.nutch.util.NutchConfiguration;
36 import org.apache.nutch.util.NutchJob;
37
38 import org.apache.nutch.fetcher.Fetcher;
39
40 public class Crawl {
41 public static final Log LOG = LogFactory.getLog(Crawl.class);
42
43 private static String getDate() {
44 return new SimpleDateFormat("yyyyMMddHHmmss").format
45 (new Date(System.currentTimeMillis()));
46 }
47
48
49 /* Perform complete crawling and indexing given a set of root urls. */
50 public static void main(String args[]) throws Exception {
51 if (args.length < 1) {
52 System.out.println
53 ("Usage: Crawl <urlDir> [-dir d] [-threads n] [-depth i] [-topN N]");
54 return;
55 }
56
57 Configuration conf = NutchConfiguration.create();
58 conf.addResource("crawl-tool.xml");
59 JobConf job = new NutchJob(conf);
60
61 Path rootUrlDir = null;
62 Path dir = new Path("crawl-" + getDate());
63 int threads = job.getInt("fetcher.threads.fetch", 10);
64 int depth = 5;
65 long topN = Long.MAX_VALUE;
66 for (int i = 0; i < args.length; i++) {
67 if ("-dir".equals(args[i])) {
68 dir = new Path(args[i+1]);
69 i++;
70 } else if ("-threads".equals(args[i])) {
71 threads = Integer.parseInt(args[i+1]);
72 i++;
73 } else if ("-depth".equals(args[i])) {
74 depth = Integer.parseInt(args[i+1]);
75 i++;
76 } else if ("-topN".equals(args[i])) {
77 topN = Integer.parseInt(args[i+1]);
78 i++;
79 } else if (args[i] != null) {
80 rootUrlDir = new Path(args[i]);
81 }
82 }
83
84 FileSystem fs = FileSystem.get(job);
85
86 if (LOG.isInfoEnabled()) {
87 LOG.info("crawl started in: " + dir);
88 LOG.info("rootUrlDir = " + rootUrlDir);
89 LOG.info("threads = " + threads);
90 LOG.info("depth = " + depth);
91 if (topN != Long.MAX_VALUE)
92 LOG.info("topN = " + topN);
93 }
94
95 Path crawlDb = new Path(dir + "/crawldb");
96 Path linkDb = new Path(dir + "/linkdb");
97 Path segments = new Path(dir + "/segments");
98 Path indexes = new Path(dir + "/indexes");
99 Path index = new Path(dir + "/index");
100
101 Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
102 Injector injector = new Injector(conf);
103 Generator generator = new Generator(conf);
104 Fetcher fetcher = new Fetcher(conf);
105 ParseSegment parseSegment = new ParseSegment(conf);
106 CrawlDb crawlDbTool = new CrawlDb(conf);
107 LinkDb linkDbTool = new LinkDb(conf);
108 Indexer indexer = new Indexer(conf);
109 DeleteDuplicates dedup = new DeleteDuplicates(conf);
110 IndexMerger merger = new IndexMerger(conf);
111
112 // initialize crawlDb
113 injector.inject(crawlDb, rootUrlDir);
114 int i;
115 for (i = 0; i < depth; i++) { // generate new segment
116 Path segment = generator.generate(crawlDb, segments, -1, topN, System
117 .currentTimeMillis());
118 if (segment == null) {
119 LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
120 break;
121 }
122 fetcher.fetch(segment, threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf)); // fetch it
123 if (!Fetcher.isParsing(job)) {
124 parseSegment.parse(segment); // parse it, if needed
125 }
126 crawlDbTool.update(crawlDb, new Path[]{segment}, true, true); // update crawldb
127 }
128 if (i > 0) {
129 linkDbTool.invert(linkDb, segments, true, true, false); // invert links
130
131 if(indexes != null) {
132 // Delete old indexes
133 if (fs.exists(indexes)) {
134 LOG.info("Deleting old indexes: " + indexes);
135 fs.delete(indexes, true);
136 }
137
138 // Delete old index
139 if (fs.exists(index)) {
140 LOG.info("Deleting old merged index: " + index);
141 fs.delete(index, true);
142 }
143 }
144
145 // index, dedup & merge
146 FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
147 indexer.index(indexes, crawlDb, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats)));
148 if(indexes != null) {
149 dedup.dedup(new Path[] { indexes });
150 fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
151 merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
152 }
153 } else {
154 LOG.warn("No URLs to fetch - check your seed list and URL filters.");
155 }
156 if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
157 }
158 }