org.apache.nutch.crawl
public class: CrawlDbMerger [javadoc |
source]
java.lang.Object
org.apache.hadoop.conf.Configured
org.apache.nutch.crawl.CrawlDbMerger
All Implemented Interfaces:
org.apache.hadoop.util.Tool
This tool merges several CrawlDb-s into one, optionally filtering
URLs through the current URLFilters, to skip prohibited
pages.
It's possible to use this tool just for filtering - in that case
only one CrawlDb should be specified in arguments.
If more than one CrawlDb contains information about the same URL,
only the most recent version is retained, as determined by the
value of org.apache.nutch.crawl.CrawlDatum#getFetchTime().
However, all metadata information from all versions is accumulated,
with newer values taking precedence over older values.
- author:
Andrzej Bialecki
| Method from org.apache.nutch.crawl.CrawlDbMerger Detail: |
/**
 * Builds the job configuration for a CrawlDb merge.
 *
 * The returned job reads sequence files, maps with {@link CrawlDbFilter},
 * reduces with {@code Merger} and writes a map file of
 * {@code Text} / {@code CrawlDatum} pairs. No input paths are added here;
 * the caller is expected to add them before running the job.
 *
 * @param conf      base configuration the job is created from
 * @param output    final CrawlDb location; used here only in the job name
 * @param normalize value stored under {@code CrawlDbFilter.URL_NORMALIZING}
 * @param filter    value stored under {@code CrawlDbFilter.URL_FILTERING}
 * @return the configured job, whose output path is a randomly named
 *         {@code crawldb-merge-<n>} path
 */
public static JobConf createMergeJob(Configuration conf,
    Path output,
    boolean normalize,
    boolean filter) {
  // Randomly suffixed path that receives the job output.
  final int randomSuffix = new Random().nextInt(Integer.MAX_VALUE);
  final Path tempOutput = new Path("crawldb-merge-" + randomSuffix);

  final JobConf mergeJob = new NutchJob(conf);
  mergeJob.setJobName("crawldb merge " + output);

  // Input side: sequence files run through the filtering mapper.
  mergeJob.setInputFormat(SequenceFileInputFormat.class);
  mergeJob.setMapperClass(CrawlDbFilter.class);
  mergeJob.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
  mergeJob.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);

  mergeJob.setReducerClass(Merger.class);

  // Output side: map file of Text -> CrawlDatum.
  FileOutputFormat.setOutputPath(mergeJob, tempOutput);
  mergeJob.setOutputFormat(MapFileOutputFormat.class);
  mergeJob.setOutputKeyClass(Text.class);
  mergeJob.setOutputValueClass(CrawlDatum.class);

  return mergeJob;
}
|
/**
 * Command-line entry point: runs this tool through {@code ToolRunner}
 * with a configuration from {@code NutchConfiguration.create()} and exits
 * the JVM with the tool's return code.
 *
 * @param args command-line arguments, passed through to the tool
 * @throws Exception if {@code ToolRunner.run} throws
 */
public static void main(String[] args) throws Exception {
  final Configuration conf = NutchConfiguration.create();
  final CrawlDbMerger merger = new CrawlDbMerger();
  final int exitCode = ToolRunner.run(conf, merger, args);
  System.exit(exitCode);
}
|
/**
 * Merges the given CrawlDb-s into a new CrawlDb at {@code output}.
 *
 * The merge job is created with {@link #createMergeJob}, the
 * {@code CrawlDb.CURRENT_NAME} sub-directory of every input db is added as
 * an input path, and the job is run. Afterwards the job's output is moved
 * to {@code output/CrawlDb.CURRENT_NAME}.
 *
 * @param output    directory of the resulting CrawlDb; created if needed
 * @param dbs       input CrawlDb directories
 * @param normalize passed through to {@link #createMergeJob}
 * @param filter    passed through to {@link #createMergeJob}
 * @throws java.io.IOException if the job output cannot be moved into place
 * @throws Exception           anything thrown while running the job or
 *                             accessing the file system
 */
public void merge(Path output,
    Path[] dbs,
    boolean normalize,
    boolean filter) throws Exception {
  JobConf job = createMergeJob(getConf(), output, normalize, filter);
  for (Path db : dbs) {
    FileInputFormat.addInputPath(job, new Path(db, CrawlDb.CURRENT_NAME));
  }
  JobClient.runJob(job);

  FileSystem fs = FileSystem.get(getConf());
  fs.mkdirs(output);
  Path jobOutput = FileOutputFormat.getOutputPath(job);
  Path target = new Path(output, CrawlDb.CURRENT_NAME);
  // rename() signals failure through its return value; ignoring it would
  // report a successful merge while the data is still in the job output path.
  if (!fs.rename(jobOutput, target)) {
    throw new java.io.IOException(
        "CrawlDb merge: could not move " + jobOutput + " to " + target);
  }
}
|
/**
 * Parses the command line and runs the merge.
 *
 * Expected arguments: the output CrawlDb first, then one or more input
 * CrawlDb-s, with the optional flags {@code -normalize} and {@code -filter}
 * anywhere after the output path.
 *
 * @param args command-line arguments
 * @return 0 when the merge completes; -1 on a usage error, when no input
 *         CrawlDb is given, or when the merge throws
 * @throws Exception declared by the tool interface; merge failures are
 *                   caught and logged here instead of being propagated
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    System.err.println("Usage: CrawlDbMerger < output_crawldb > < crawldb1 > [< crawldb2 > < crawldb3 > ...] [-normalize] [-filter]");
    System.err.println("\toutput_crawldb\toutput CrawlDb");
    System.err.println("\tcrawldb1 ...\tinput CrawlDb-s (single input CrawlDb is ok)");
    System.err.println("\t-normalize\tuse URLNormalizer on urls in the crawldb(s) (usually not needed)");
    System.err.println("\t-filter\tuse URLFilters on urls in the crawldb(s)");
    return -1;
  }
  Path output = new Path(args[0]);
  ArrayList< Path > dbs = new ArrayList< Path >();
  boolean filter = false;
  boolean normalize = false;
  for (int i = 1; i < args.length; i++) {
    String arg = args[i];
    if (arg.equals("-filter")) {
      filter = true;
    } else if (arg.equals("-normalize")) {
      normalize = true;
    } else {
      dbs.add(new Path(arg));
    }
  }
  // The length check above does not catch "output -filter": every argument
  // after the output path may be an option, leaving nothing to merge.
  if (dbs.isEmpty()) {
    System.err.println("CrawlDbMerger: no input CrawlDb specified");
    return -1;
  }
  try {
    merge(output, dbs.toArray(new Path[dbs.size()]), normalize, filter);
    return 0;
  } catch (Exception e) {
    LOG.fatal("CrawlDb merge: " + StringUtils.stringifyException(e));
    return -1;
  }
}
|