Save This Page
Home » nutch-1.0 » org.apache.nutch » net » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.nutch.net;
   19   
   20   import java.net.MalformedURLException;
   21   import java.util.ArrayList;
   22   import java.util.Arrays;
   23   import java.util.Collections;
   24   import java.util.HashMap;
   25   import java.util.HashSet;
   26   import java.util.Iterator;
   27   import java.util.List;
   28   import java.util.Set;
   29   import java.util.Vector;
   30   
   31   import org.apache.commons.logging.Log;
   32   import org.apache.commons.logging.LogFactory;
   33   import org.apache.hadoop.conf.Configuration;
   34   import org.apache.nutch.plugin.Extension;
   35   import org.apache.nutch.plugin.ExtensionPoint;
   36   import org.apache.nutch.plugin.PluginRepository;
   37   import org.apache.nutch.plugin.PluginRuntimeException;
   38   import org.apache.nutch.util.ObjectCache;
   39   
   40   /**
   41    * This class uses a "chained filter" pattern to run defined normalizers.
   42    * Different lists of normalizers may be defined for different "scopes", or
   43    * contexts where they are used (note however that they need to be activated
   44    * first through <tt>plugin.include</tt> property).
   45    * 
   46    * <p>There is one global scope defined by default, which consists of all
   47    * active normalizers. The order in which these normalizers
   48    * are executed may be defined in "urlnormalizer.order" property, which lists
   49    * space-separated implementation classes (if this property is missing normalizers
   50    * will be run in random order). If there are more
   51    * normalizers activated than explicitly named on this list, the remaining ones
   52    * will be run in random order after the ones specified on the list are executed.</p>
   53    * <p>You can define a set of contexts (or scopes) in which normalizers may be
   54    * called. Each scope can have its own list of normalizers (defined in
   55    * "urlnormalizer.scope.&lt;scope_name&gt;" property) and its own order (defined in
   56    * "urlnormalizer.order.&lt;scope_name&gt;" property). If any of these properties are
   57    * missing, default settings are used for the global scope.</p>
   58    * <p>In case no normalizers are required for any given scope, a
   59    * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should be used.</p>
   60    * <p>Each normalizer may further select among many configurations, depending on
   61    * the scope in which it is called, because the scope name is passed as a parameter
   62    * to each normalizer. You can also use the same normalizer for many scopes.</p>
   63    * <p>Several scopes have been defined, and various Nutch tools will attempt using
   64    * scope-specific normalizers first (and fall back to default config if scope-specific
   65    * configuration is missing).</p>
   66    * <p>Normalizers may be run several times, to ensure that modifications introduced
   67    * by normalizers at the end of the list can be further reduced by normalizers
   68    * executed at the beginning. By default this loop is executed just once - if you want
   69    * to ensure that all possible combinations have been applied you may want to run
   70    * this loop up to the number of activated normalizers. This loop count can be configured
   71    * through <tt>urlnormalizer.loop.count</tt> property. As soon as the url is
   72    * unchanged the loop will stop and return the result.</p>
   73    * 
   74    * @author Andrzej Bialecki
   75    */
   76   public final class URLNormalizers {
   77     
   78     /** Default scope. If no scope properties are defined then the configuration for
   79      * this scope will be used.
   80      */
   81     public static final String SCOPE_DEFAULT = "default";
   82     /** Scope used by {@link org.apache.nutch.crawl.PartitionUrlByHost}. */
   83     public static final String SCOPE_PARTITION = "partition";
   84     /** Scope used by {@link org.apache.nutch.crawl.Generator}. */
   85     public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count";
   86     /** Scope used by {@link org.apache.nutch.fetcher.Fetcher} when processing
   87      * redirect URLs.
   88      */
   89     public static final String SCOPE_FETCHER = "fetcher";
   90     /** Scope used when updating the CrawlDb with new URLs. */
   91     public static final String SCOPE_CRAWLDB = "crawldb";
   92     /** Scope used when updating the LinkDb with new URLs. */
   93     public static final String SCOPE_LINKDB = "linkdb";
   94     /** Scope used by {@link org.apache.nutch.crawl.Injector}. */
   95     public static final String SCOPE_INJECT = "inject";
   96     /** Scope used when constructing new {@link org.apache.nutch.parse.Outlink} instances. */
   97     public static final String SCOPE_OUTLINK = "outlink";
   98     
   99   
  100     public static final Log LOG = LogFactory.getLog(URLNormalizers.class);
  101   
  102     /* Empty extension list for caching purposes. */
  103     private final List<Extension> EMPTY_EXTENSION_LIST = Collections.EMPTY_LIST;
  104     
  105     private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0];
  106   
  107     private Configuration conf;
  108   
  109     private ExtensionPoint extensionPoint;
  110   
  111     private URLNormalizer[] normalizers;
  112     
  113     private int loopCount;
  114   
  115     public URLNormalizers(Configuration conf, String scope) {
  116       this.conf = conf;
  117       this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
  118               URLNormalizer.X_POINT_ID);
  119       ObjectCache objectCache = ObjectCache.get(conf);
  120       
  121       if (this.extensionPoint == null) {
  122         throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID
  123                 + " not found.");
  124       }
  125   
  126       normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + scope);
  127       if (normalizers == null) {
  128         normalizers = getURLNormalizers(scope);
  129       }
  130       if (normalizers == EMPTY_NORMALIZERS) {
  131         normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT);
  132         if (normalizers == null) {
  133           normalizers = getURLNormalizers(SCOPE_DEFAULT);
  134         }
  135       }
  136       
  137       loopCount = conf.getInt("urlnormalizer.loop.count", 1);
  138     }
  139   
  140     /**
  141      * Function returns an array of {@link URLNormalizer}s for a given scope,
  142      * with a specified order.
  143      * 
  144      * @param scope
  145      *          The scope to return the <code>Array</code> of
  146      *          {@link URLNormalizer}s for.
  147      * @return An <code>Array</code> of {@link URLNormalizer}s for the given
  148      *         scope.
  149      * @throws PluginRuntimeException
  150      */
  151     URLNormalizer[] getURLNormalizers(String scope) {
  152       List<Extension> extensions = getExtensions(scope);
  153       ObjectCache objectCache = ObjectCache.get(conf);
  154       
  155       if (extensions == EMPTY_EXTENSION_LIST) {
  156         return EMPTY_NORMALIZERS;
  157       }
  158       
  159       List<URLNormalizer> normalizers = new Vector<URLNormalizer>(extensions.size());
  160   
  161       Iterator<Extension> it = extensions.iterator();
  162       while (it.hasNext()) {
  163         Extension ext = it.next();
  164         URLNormalizer normalizer = null;
  165         try {
  166           // check to see if we've cached this URLNormalizer instance yet
  167           normalizer = (URLNormalizer) objectCache.getObject(ext.getId());
  168           if (normalizer == null) {
  169             // go ahead and instantiate it and then cache it
  170             normalizer = (URLNormalizer) ext.getExtensionInstance();
  171             objectCache.setObject(ext.getId(), normalizer);
  172           }
  173           normalizers.add(normalizer);
  174         } catch (PluginRuntimeException e) {
  175           e.printStackTrace();
  176           LOG.warn("URLNormalizers:PluginRuntimeException when "
  177                   + "initializing url normalizer plugin "
  178                   + ext.getDescriptor().getPluginId()
  179                   + " instance in getURLNormalizers "
  180                   + "function: attempting to continue instantiating plugins");
  181         }
  182       }
  183       return normalizers.toArray(new URLNormalizer[normalizers
  184               .size()]);
  185     }
  186   
  187     /**
  188      * Finds the best-suited normalizer plugin for a given scope.
  189      * 
  190      * @param scope
  191      *          Scope for which we seek a normalizer plugin.
  192      * @return a list of extensions to be used for this scope. If none, returns
  193      *         empty list.
  194      * @throws PluginRuntimeException
  195      */
  196     private List<Extension> getExtensions(String scope) {
  197       ObjectCache objectCache = ObjectCache.get(conf);
  198       List<Extension> extensions = 
  199         (List<Extension>) objectCache.getObject(URLNormalizer.X_POINT_ID + "_x_"
  200                                                   + scope);
  201   
  202       // Just compare the reference:
  203       // if this is the empty list, we know we will find no extension.
  204       if (extensions == EMPTY_EXTENSION_LIST) {
  205         return EMPTY_EXTENSION_LIST;
  206       }
  207   
  208       if (extensions == null) {
  209         extensions = findExtensions(scope);
  210         if (extensions != null) {
  211           objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, extensions);
  212         } else {
  213           // Put the empty extension list into cache
  214           // to remember we don't know any related extension.
  215           objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, EMPTY_EXTENSION_LIST);
  216           extensions = EMPTY_EXTENSION_LIST;
  217         }
  218       }
  219       return extensions;
  220     }
  221   
  222     /**
  223      * searches a list of suitable url normalizer plugins for the given scope.
  224      * 
  225      * @param scope
  226      *          Scope for which we seek a url normalizer plugin.
  227      * @return List - List of extensions to be used for this scope. If none,
  228      *         returns null.
  229      * @throws PluginRuntimeException
  230      */
  231     private List<Extension> findExtensions(String scope) {
  232   
  233       String[] orders = null;
  234       String orderlist = conf.get("urlnormalizer.order." + scope);
  235       if (orderlist == null) orderlist = conf.get("urlnormalizer.order");
  236       if (orderlist != null && !orderlist.trim().equals("")) {
  237         orders = orderlist.split("\\s+");
  238       }
  239       String scopelist = conf.get("urlnormalizer.scope." + scope);
  240       Set<String> impls = null;
  241       if (scopelist != null && !scopelist.trim().equals("")) {
  242         String[] names = scopelist.split("\\s+");
  243         impls = new HashSet<String>(Arrays.asList(names));
  244       }
  245       Extension[] extensions = this.extensionPoint.getExtensions();
  246       HashMap<String, Extension> normalizerExtensions = new HashMap<String, Extension>();
  247       for (int i = 0; i < extensions.length; i++) {
  248         Extension extension = extensions[i];
  249         if (impls != null && !impls.contains(extension.getClazz()))
  250           continue;
  251         normalizerExtensions.put(extension.getClazz(), extension);
  252       }
  253       List<Extension> res = new ArrayList<Extension>();
  254       if (orders == null) {
  255         res.addAll(normalizerExtensions.values());
  256       } else {
  257         // first add those explicitly named in correct order
  258         for (int i = 0; i < orders.length; i++) {
  259           Extension e = normalizerExtensions.get(orders[i]);
  260           if (e != null) {
  261             res.add(e);
  262             normalizerExtensions.remove(orders[i]);
  263           }
  264         }
  265         // then add all others in random order
  266         res.addAll(normalizerExtensions.values());
  267       }
  268       return res;
  269     }
  270   
  271     /**
  272      * Normalize
  273      * @param urlString The URL string to normalize.
  274      * @param scope The given scope.
  275      * @return A normalized String, using the given <code>scope</code>
  276      * @throws MalformedURLException If the given URL string is malformed.
  277      */
  278     public String normalize(String urlString, String scope)
  279             throws MalformedURLException {
  280       // optionally loop several times, and break if no further changes
  281       String initialString = urlString;
  282       for (int k = 0; k < loopCount; k++) {
  283         for (int i = 0; i < this.normalizers.length; i++) {
  284           if (urlString == null)
  285             return null;
  286           urlString = this.normalizers[i].normalize(urlString, scope);
  287         }
  288         if (initialString.equals(urlString)) break;
  289         initialString = urlString;
  290       }
  291       return urlString;
  292     }
  293   }

Save This Page
Home » nutch-1.0 » org.apache.nutch » net » [javadoc | source]