org.apache.nutch.crawl
public class: LinkDbMerger [javadoc |
source]
java.lang.Object
org.apache.hadoop.conf.Configured
org.apache.nutch.crawl.LinkDbMerger
All Implemented Interfaces:
org.apache.hadoop.util.Tool, org.apache.hadoop.mapred.Reducer
This tool merges several LinkDb-s into one, optionally filtering
URLs through the current URLFilters, to skip prohibited URLs and
links.
It's possible to use this tool just for filtering - in that case
only one LinkDb should be specified in arguments.
If more than one LinkDb contains information about the same URL,
all inlinks are accumulated, but only at most db.max.inlinks
inlinks will ever be added.
If activated, URLFilters will be applied to both the target URLs and
to any incoming link URL. If a target URL is prohibited, all
inlinks to that target will be removed, including the target URL. If
some of the incoming links are prohibited, only those links will be removed, and they
won't count when checking the above-mentioned maximum limit.
- author:
Andrzej Bialecki
| Method from org.apache.nutch.crawl.LinkDbMerger Detail: |
/**
 * Intentionally empty: this method performs no work.
 *
 * @throws IOException never thrown by this implementation; kept in the signature as declared
 */
public void close() throws IOException {
  // no-op
}
|
/**
 * Reads the inlink cap from the job configuration and stores it in
 * {@code maxInlinks}.
 *
 * @param job job configuration; the value of {@code db.max.inlinks} is used,
 *            falling back to 10000 when the property is not set
 */
public void configure(JobConf job) {
  final int inlinkLimit = job.getInt("db.max.inlinks", 10000);
  maxInlinks = inlinkLimit;
}
|
/**
 * Builds the job configuration for a LinkDb merge.
 *
 * <p>The job writes to a randomly named {@code linkdb-merge-<n>} path, not to
 * {@code linkDb}; {@code linkDb} is only used to label the job. No input paths
 * are added here.
 *
 * @param config    base configuration the job is created from
 * @param linkDb    path shown in the job name
 * @param normalize value stored under {@code LinkDbFilter.URL_NORMALIZING}
 * @param filter    value stored under {@code LinkDbFilter.URL_FILTERING}
 * @return the configured job
 */
public static JobConf createMergeJob(Configuration config,
    Path linkDb,
    boolean normalize,
    boolean filter) {
  // Random numeric suffix for the job's output path.
  final int suffix = new Random().nextInt(Integer.MAX_VALUE);
  final Path newLinkDb = new Path("linkdb-merge-" + suffix);

  final JobConf job = new NutchJob(config);
  job.setJobName("linkdb merge " + linkDb);

  // Input side.
  job.setInputFormat(SequenceFileInputFormat.class);

  // Map stage: LinkDbFilter, driven by the two flags below.
  job.setMapperClass(LinkDbFilter.class);
  job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
  job.setBoolean(LinkDbFilter.URL_FILTERING, filter);

  // Reduce stage.
  job.setReducerClass(LinkDbMerger.class);

  // Output side: compressed MapFile of Text -> Inlinks.
  FileOutputFormat.setOutputPath(job, newLinkDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setBoolean("mapred.output.compress", true);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Inlinks.class);

  return job;
}
|
/**
 * Command-line entry point: runs a new LinkDbMerger through ToolRunner and
 * exits the JVM with the returned status code.
 *
 * @param args command-line arguments, passed to ToolRunner unchanged
 * @throws Exception whatever {@code ToolRunner.run} throws
 */
public static void main(String[] args) throws Exception {
  System.exit(ToolRunner.run(NutchConfiguration.create(), new LinkDbMerger(), args));
}
|
/**
 * Runs the merge job over the given LinkDb-s and moves the job's output to
 * {@code output/LinkDb.CURRENT_NAME}.
 *
 * @param output    directory that receives the merged LinkDb
 * @param dbs       input LinkDb-s; the {@code LinkDb.CURRENT_NAME} child of each is added as job input
 * @param normalize passed through to {@code createMergeJob}
 * @param filter    passed through to {@code createMergeJob}
 * @throws IOException if the job output could not be renamed into {@code output}
 * @throws Exception   anything thrown while building or running the job
 */
public void merge(Path output,
    Path[] dbs,
    boolean normalize,
    boolean filter) throws Exception {
  JobConf job = createMergeJob(getConf(), output, normalize, filter);
  for (Path db : dbs) {
    FileInputFormat.addInputPath(job, new Path(db, LinkDb.CURRENT_NAME));
  }
  JobClient.runJob(job);

  FileSystem fs = FileSystem.get(getConf());
  fs.mkdirs(output);

  Path jobOutput = FileOutputFormat.getOutputPath(job);
  Path current = new Path(output, LinkDb.CURRENT_NAME);
  // rename reports failure through its return value; ignoring it would let a
  // merge whose result never reached the output directory look successful.
  if (!fs.rename(jobOutput, current)) {
    throw new IOException("LinkDbMerger: failed to move " + jobOutput + " to " + current);
  }
}
|
/**
 * Combines every Inlinks value received for {@code key} into a single Inlinks
 * and emits it, copying at most {@code maxInlinks} entries in total. Nothing
 * is emitted when the combined result is empty.
 *
 * @param key      key the values were grouped under; passed to the output unchanged
 * @param values   the Inlinks values to merge
 * @param output   receives the single (key, merged Inlinks) pair
 * @param reporter not used by this method
 * @throws IOException as declared; presumably raised by {@code output.collect}
 */
public void reduce(Text key,
    Iterator values,
    OutputCollector output,
    Reporter reporter) throws IOException {
  Inlinks result = new Inlinks();
  while (values.hasNext()) {
    int remaining = maxInlinks - result.size();
    if (remaining <= 0) {
      // Cap reached: no later value could contribute anything, so stop
      // pulling values instead of iterating them to no effect.
      break;
    }
    // values is declared as a raw Iterator, so the element needs a cast.
    Inlinks inlinks = (Inlinks) values.next();
    int end = Math.min(remaining, inlinks.size());
    Iterator<Inlink> it = inlinks.iterator();
    for (int i = 0; i < end && it.hasNext(); i++) {
      result.add(it.next());
    }
  }
  if (result.size() == 0) {
    return;
  }
  output.collect(key, result);
}
|
/**
 * Parses the command line and runs the merge.
 *
 * <p>{@code args[0]} is the output LinkDb. Every later argument is either the
 * flag {@code -filter}, the flag {@code -normalize}, or an input LinkDb path.
 * At least one input LinkDb is required; otherwise the usage text is printed.
 *
 * @param args command-line arguments
 * @return 0 on success, -1 on a usage error or when the merge throws
 * @throws Exception declared by the signature; exceptions from {@code merge}
 *                   are caught, logged and turned into a -1 return
 */
public int run(String[] args) throws Exception {
  boolean normalize = false;
  boolean filter = false;
  ArrayList<Path> dbs = new ArrayList<Path>();
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-filter")) {
      filter = true;
    } else if (args[i].equals("-normalize")) {
      normalize = true;
    } else {
      dbs.add(new Path(args[i]));
    }
  }

  // Checking the parsed inputs rather than args.length also rejects command
  // lines such as "out -filter", where flags are present but no input LinkDb
  // is. Fewer than two arguments always leaves dbs empty.
  if (dbs.isEmpty()) {
    System.err.println("Usage: LinkDbMerger < output_linkdb > < linkdb1 > [< linkdb2 > < linkdb3 > ...] [-normalize] [-filter]");
    System.err.println("\toutput_linkdb\toutput LinkDb");
    System.err.println("\tlinkdb1 ...\tinput LinkDb-s (single input LinkDb is ok)");
    System.err.println("\t-normalize\tuse URLNormalizer on both fromUrls and toUrls in linkdb(s) (usually not needed)");
    System.err.println("\t-filter\tuse URLFilters on both fromUrls and toUrls in linkdb(s)");
    return -1;
  }

  Path output = new Path(args[0]);
  try {
    merge(output, dbs.toArray(new Path[dbs.size()]), normalize, filter);
    return 0;
  } catch (Exception e) {
    LOG.fatal("LinkDbMerger: " + StringUtils.stringifyException(e));
    return -1;
  }
}
|