Save This Page
Home » nutch-1.0 » org.apache.nutch » indexer » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.nutch.indexer;
   19   
   20   import java.util.ArrayList;
   21   import java.util.HashMap;
   22   
   23   // Commons Logging imports
   24   import org.apache.commons.logging.Log;
   25   import org.apache.commons.logging.LogFactory;
   26   
   27   import org.apache.nutch.plugin;
   28   import org.apache.nutch.parse.Parse;
   29   import org.apache.nutch.util.ObjectCache;
   30   import org.apache.hadoop.conf.Configuration;
   31   import org.apache.nutch.crawl.CrawlDatum;
   32   import org.apache.nutch.crawl.Inlinks;
   33   import org.apache.hadoop.io.Text;
   34   
   35   /** Creates and caches {@link IndexingFilter} implementing plugins.*/
   36   public class IndexingFilters {
   37   
   38     public static final String INDEXINGFILTER_ORDER = "indexingfilter.order";
   39   
   40     public final static Log LOG = LogFactory.getLog(IndexingFilters.class);
   41   
   42     private IndexingFilter[] indexingFilters;
   43   
   44     public IndexingFilters(Configuration conf) {
   45       /* Get indexingfilter.order property */
   46       String order = conf.get(INDEXINGFILTER_ORDER);
   47       ObjectCache objectCache = ObjectCache.get(conf);
   48       this.indexingFilters = (IndexingFilter[]) objectCache
   49           .getObject(IndexingFilter.class.getName());
   50       if (this.indexingFilters == null) {
   51         /*
   52          * If ordered filters are required, prepare array of filters based on
   53          * property
   54          */
   55         String[] orderedFilters = null;
   56         if (order != null && !order.trim().equals("")) {
   57           orderedFilters = order.split("\\s+");
   58         }
   59         try {
   60           ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
   61               IndexingFilter.X_POINT_ID);
   62           if (point == null)
   63             throw new RuntimeException(IndexingFilter.X_POINT_ID + " not found.");
   64           Extension[] extensions = point.getExtensions();
   65           HashMap<String, IndexingFilter> filterMap =
   66             new HashMap<String, IndexingFilter>();
   67           for (int i = 0; i < extensions.length; i++) {
   68             Extension extension = extensions[i];
   69             IndexingFilter filter = (IndexingFilter) extension
   70                 .getExtensionInstance();
   71             LOG.info("Adding " + filter.getClass().getName());
   72             if (!filterMap.containsKey(filter.getClass().getName())) {
   73               filter.addIndexBackendOptions(conf);
   74               filterMap.put(filter.getClass().getName(), filter);
   75             }
   76           }
   77           /*
   78            * If no ordered filters required, just get the filters in an
   79            * indeterminate order
   80            */
   81           if (orderedFilters == null) {
   82             objectCache.setObject(IndexingFilter.class.getName(),
   83                 filterMap.values().toArray(
   84                     new IndexingFilter[0]));
   85             /* Otherwise run the filters in the required order */
   86           } else {
   87             ArrayList<IndexingFilter> filters = new ArrayList<IndexingFilter>();
   88             for (int i = 0; i < orderedFilters.length; i++) {
   89               IndexingFilter filter = filterMap
   90                   .get(orderedFilters[i]);
   91               if (filter != null) {
   92                 filter.addIndexBackendOptions(conf);
   93                 filters.add(filter);
   94               }
   95             }
   96             objectCache.setObject(IndexingFilter.class.getName(), filters
   97                 .toArray(new IndexingFilter[filters.size()]));
   98           }
   99         } catch (PluginRuntimeException e) {
  100           throw new RuntimeException(e);
  101         }
  102         this.indexingFilters = (IndexingFilter[]) objectCache
  103             .getObject(IndexingFilter.class.getName());
  104       }
  105     }                  
  106   
  107     /** Run all defined filters. */
  108     public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
  109         Inlinks inlinks) throws IndexingException {
  110       for (int i = 0; i < this.indexingFilters.length; i++) {
  111         doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
  112         // break the loop if an indexing filter discards the doc
  113         if (doc == null) return null;
  114       }
  115   
  116       return doc;
  117     }
  118   
  119   }

Save This Page
Home » nutch-1.0 » org.apache.nutch » indexer » [javadoc | source]