Save This Page
Home » nutch-1.0 » org.apache.nutch » collection » [javadoc | source]
    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   package org.apache.nutch.collection;
   18   
   19   import java.io.File;
   20   import java.io.FileNotFoundException;
   21   import java.io.FileOutputStream;
   22   import java.io.IOException;
   23   import java.io.InputStream;
   24   import java.net.URL;
   25   import java.util.Collection;
   26   import java.util.HashMap;
   27   import java.util.Iterator;
   28   import java.util.Map;
   29   
   30   import org.apache.commons.logging.Log;
   31   import org.apache.commons.logging.LogFactory;
   32   
   33   import org.apache.hadoop.conf.Configuration;
   34   import org.apache.hadoop.conf.Configured;
   35   import org.apache.nutch.util.DomUtil;
   36   import org.apache.nutch.util.LogUtil;
   37   import org.apache.nutch.util.NutchConfiguration;
   38   import org.apache.nutch.util.ObjectCache;
   39   import org.apache.xerces.dom.DocumentImpl;
   40   import org.w3c.dom.Document;
   41   import org.w3c.dom.Element;
   42   import org.w3c.dom.NodeList;
   43   
   44   public class CollectionManager extends Configured {
   45   
   46     public static final String DEFAULT_FILE_NAME = "subcollections.xml";
   47   
   48     static final Log LOG = LogFactory.getLog(CollectionManager.class);
   49   
   50     transient Map collectionMap = new HashMap();
   51   
   52     transient URL configfile;
   53     
   54     public CollectionManager(Configuration conf) {
   55       super(conf);
   56       init();
   57     }
   58     
   59     /** 
   60      * Used for testing
   61      */
   62     protected CollectionManager(){
   63       super(NutchConfiguration.create());
   64     }
   65   
   66     protected void init(){
   67       try {
   68         if (LOG.isInfoEnabled()) { LOG.info("initializing CollectionManager"); }
   69         // initialize known subcollections
   70         configfile = getConf().getResource(
   71             getConf().get("subcollections.config", DEFAULT_FILE_NAME));
   72   
   73         InputStream input = getConf().getConfResourceAsInputStream(
   74             getConf().get("subcollections.config", DEFAULT_FILE_NAME));
   75         parse(input);
   76       } catch (Exception e) {
   77         if (LOG.isWarnEnabled()) {
   78           LOG.warn("Error occured:" + e);
   79           e.printStackTrace(LogUtil.getWarnStream(LOG));
   80         }
   81       }
   82     }
   83   
   84     protected void parse(InputStream input) {
   85       Element collections = DomUtil.getDom(input);
   86   
   87       if (collections != null) {
   88         NodeList nodeList = collections
   89             .getElementsByTagName(Subcollection.TAG_COLLECTION);
   90   
   91         if (LOG.isInfoEnabled()) {
   92           LOG.info("file has" + nodeList.getLength() + " elements");
   93         }
   94         
   95         for (int i = 0; i < nodeList.getLength(); i++) {
   96           Element scElem = (Element) nodeList.item(i);
   97           Subcollection subCol = new Subcollection(getConf());
   98           subCol.initialize(scElem);
   99           collectionMap.put(subCol.name, subCol);
  100         }
  101       } else if (LOG.isInfoEnabled()) {
  102         LOG.info("Cannot find collections");
  103       }
  104     }
  105     
  106     public static CollectionManager getCollectionManager(Configuration conf) {
  107       String key = "collectionmanager";
  108       ObjectCache objectCache = ObjectCache.get(conf);
  109       CollectionManager impl = (CollectionManager)objectCache.getObject(key);
  110       if (impl == null) {
  111         try {
  112           if (LOG.isInfoEnabled()) {
  113             LOG.info("Instantiating CollectionManager");
  114           }
  115           impl=new CollectionManager(conf);
  116           objectCache.setObject(key,impl);
  117         } catch (Exception e) {
  118           throw new RuntimeException("Couldn't create CollectionManager",e);
  119         }
  120       }
  121       return impl;
  122     }
  123   
  124     /**
  125      * Returns named subcollection
  126      * 
  127      * @param id
  128      * @return Named SubCollection (or null if not existing)
  129      */
  130     public Subcollection getSubColection(final String id) {
  131       return (Subcollection) collectionMap.get(id);
  132     }
  133   
  134     /**
  135      * Delete named subcollection
  136      * 
  137      * @param id
  138      *          Id of SubCollection to delete
  139      */
  140     public void deleteSubCollection(final String id) throws IOException {
  141       final Subcollection subCol = getSubColection(id);
  142       if (subCol != null) {
  143         collectionMap.remove(id);
  144       }
  145     }
  146   
  147     /**
  148      * Create a new subcollection.
  149      * 
  150      * @param name
  151      *          Name of SubCollection to create
  152      * @return Created SubCollection or null if allready existed
  153      */
  154     public Subcollection createSubCollection(final String id, final String name) {
  155       Subcollection subCol = null;
  156   
  157       if (!collectionMap.containsKey(id)) {
  158         subCol = new Subcollection(id, name, getConf());
  159         collectionMap.put(id, subCol);
  160       }
  161   
  162       return subCol;
  163     }
  164   
  165     /**
  166      * Return names of collections url is part of
  167      * 
  168      * @param url
  169      *          The url to test against Collections
  170      * @return Space delimited string of collection names url is part of
  171      */
  172     public String getSubCollections(final String url) {
  173       String collections = "";
  174       final Iterator iterator = collectionMap.values().iterator();
  175   
  176       while (iterator.hasNext()) {
  177         final Subcollection subCol = (Subcollection) iterator.next();
  178         if (subCol.filter(url) != null) {
  179           collections += " " + subCol.name;
  180         }
  181       }
  182       if (LOG.isTraceEnabled()) { LOG.trace("subcollections:" + collections); }
  183       
  184       return collections;
  185     }
  186   
  187     /**
  188      * Returns all collections
  189      * 
  190      * @return All collections CollectionManager knows about
  191      */
  192     public Collection getAll() {
  193       return collectionMap.values();
  194     }
  195   
  196     /**
  197      * Save collections into file
  198      * 
  199      * @throws Exception
  200      */
  201     public void save() throws IOException {
  202       try {
  203         final FileOutputStream fos = new FileOutputStream(new File(configfile
  204             .getFile()));
  205         final Document doc = new DocumentImpl();
  206         final Element collections = doc
  207             .createElement(Subcollection.TAG_COLLECTIONS);
  208         final Iterator iterator = collectionMap.values().iterator();
  209   
  210         while (iterator.hasNext()) {
  211           final Subcollection subCol = (Subcollection) iterator.next();
  212           final Element collection = doc
  213               .createElement(Subcollection.TAG_COLLECTION);
  214           collections.appendChild(collection);
  215           final Element name = doc.createElement(Subcollection.TAG_NAME);
  216           name.setNodeValue(subCol.getName());
  217           collection.appendChild(name);
  218           final Element whiteList = doc
  219               .createElement(Subcollection.TAG_WHITELIST);
  220           whiteList.setNodeValue(subCol.getWhiteListString());
  221           collection.appendChild(whiteList);
  222           final Element blackList = doc
  223               .createElement(Subcollection.TAG_BLACKLIST);
  224           blackList.setNodeValue(subCol.getBlackListString());
  225           collection.appendChild(blackList);
  226         }
  227   
  228         DomUtil.saveDom(fos, collections);
  229         fos.flush();
  230         fos.close();
  231       } catch (FileNotFoundException e) {
  232         throw new IOException(e.toString());
  233       }
  234     }
  235   }

Save This Page
Home » nutch-1.0 » org.apache.nutch » collection » [javadoc | source]