Save This Page
Home » nutch-1.0 » org.apache.nutch » crawl » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.nutch.crawl;
   19   
   20   import java.util;
   21   import java.text;
   22   
   23   // Commons Logging imports
   24   import org.apache.commons.logging.Log;
   25   import org.apache.commons.logging.LogFactory;
   26   
   27   import org.apache.hadoop.fs;
   28   import org.apache.hadoop.conf;
   29   import org.apache.hadoop.mapred;
   30   import org.apache.nutch.parse.ParseSegment;
   31   import org.apache.nutch.indexer.DeleteDuplicates;
   32   import org.apache.nutch.indexer.IndexMerger;
   33   import org.apache.nutch.indexer.Indexer;
   34   import org.apache.nutch.util.HadoopFSUtil;
   35   import org.apache.nutch.util.NutchConfiguration;
   36   import org.apache.nutch.util.NutchJob;
   37   
   38   import org.apache.nutch.fetcher.Fetcher;
   39   
   40   public class Crawl {
   41     public static final Log LOG = LogFactory.getLog(Crawl.class);
   42   
   43     private static String getDate() {
   44       return new SimpleDateFormat("yyyyMMddHHmmss").format
   45         (new Date(System.currentTimeMillis()));
   46     }
   47   
   48   
   49     /* Perform complete crawling and indexing given a set of root urls. */
   50     public static void main(String args[]) throws Exception {
   51       if (args.length < 1) {
   52         System.out.println
   53           ("Usage: Crawl <urlDir> [-dir d] [-threads n] [-depth i] [-topN N]");
   54         return;
   55       }
   56   
   57       Configuration conf = NutchConfiguration.create();
   58       conf.addResource("crawl-tool.xml");
   59       JobConf job = new NutchJob(conf);
   60   
   61       Path rootUrlDir = null;
   62       Path dir = new Path("crawl-" + getDate());
   63       int threads = job.getInt("fetcher.threads.fetch", 10);
   64       int depth = 5;
   65       long topN = Long.MAX_VALUE;
   66       for (int i = 0; i < args.length; i++) {
   67         if ("-dir".equals(args[i])) {
   68           dir = new Path(args[i+1]);
   69           i++;
   70         } else if ("-threads".equals(args[i])) {
   71           threads = Integer.parseInt(args[i+1]);
   72           i++;
   73         } else if ("-depth".equals(args[i])) {
   74           depth = Integer.parseInt(args[i+1]);
   75           i++;
   76         } else if ("-topN".equals(args[i])) {
   77             topN = Integer.parseInt(args[i+1]);
   78             i++;
   79         } else if (args[i] != null) {
   80           rootUrlDir = new Path(args[i]);
   81         }
   82       }
   83   
   84       FileSystem fs = FileSystem.get(job);
   85   
   86       if (LOG.isInfoEnabled()) {
   87         LOG.info("crawl started in: " + dir);
   88         LOG.info("rootUrlDir = " + rootUrlDir);
   89         LOG.info("threads = " + threads);
   90         LOG.info("depth = " + depth);
   91         if (topN != Long.MAX_VALUE)
   92           LOG.info("topN = " + topN);
   93       }
   94       
   95       Path crawlDb = new Path(dir + "/crawldb");
   96       Path linkDb = new Path(dir + "/linkdb");
   97       Path segments = new Path(dir + "/segments");
   98       Path indexes = new Path(dir + "/indexes");
   99       Path index = new Path(dir + "/index");
  100   
  101       Path tmpDir = job.getLocalPath("crawl"+Path.SEPARATOR+getDate());
  102       Injector injector = new Injector(conf);
  103       Generator generator = new Generator(conf);
  104       Fetcher fetcher = new Fetcher(conf);
  105       ParseSegment parseSegment = new ParseSegment(conf);
  106       CrawlDb crawlDbTool = new CrawlDb(conf);
  107       LinkDb linkDbTool = new LinkDb(conf);
  108       Indexer indexer = new Indexer(conf);
  109       DeleteDuplicates dedup = new DeleteDuplicates(conf);
  110       IndexMerger merger = new IndexMerger(conf);
  111         
  112       // initialize crawlDb
  113       injector.inject(crawlDb, rootUrlDir);
  114       int i;
  115       for (i = 0; i < depth; i++) {             // generate new segment
  116         Path segment = generator.generate(crawlDb, segments, -1, topN, System
  117             .currentTimeMillis());
  118         if (segment == null) {
  119           LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
  120           break;
  121         }
  122         fetcher.fetch(segment, threads, org.apache.nutch.fetcher.Fetcher.isParsing(conf));  // fetch it
  123         if (!Fetcher.isParsing(job)) {
  124           parseSegment.parse(segment);    // parse it, if needed
  125         }
  126         crawlDbTool.update(crawlDb, new Path[]{segment}, true, true); // update crawldb
  127       }
  128       if (i > 0) {
  129         linkDbTool.invert(linkDb, segments, true, true, false); // invert links
  130   
  131         if(indexes != null) {
  132           // Delete old indexes
  133           if (fs.exists(indexes)) {
  134             LOG.info("Deleting old indexes: " + indexes);
  135             fs.delete(indexes, true);
  136           }
  137   
  138           // Delete old index
  139           if (fs.exists(index)) {
  140             LOG.info("Deleting old merged index: " + index);
  141             fs.delete(index, true);
  142           }
  143         }
  144   
  145         // index, dedup & merge
  146         FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
  147         indexer.index(indexes, crawlDb, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats)));
  148         if(indexes != null) {
  149           dedup.dedup(new Path[] { indexes });
  150           fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
  151           merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
  152         }
  153       } else {
  154         LOG.warn("No URLs to fetch - check your seed list and URL filters.");
  155       }
  156       if (LOG.isInfoEnabled()) { LOG.info("crawl finished: " + dir); }
  157     }
  158   }

Save This Page
Home » nutch-1.0 » org.apache.nutch » crawl » [javadoc | source]