Save This Page
Home » nutch-1.0 » org.apache.nutch » parse » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   package org.apache.nutch.parse;
   18   
   19   // Commons Logging imports
   20   import org.apache.commons.logging.Log;
   21   import org.apache.commons.logging.LogFactory;
   22   
   23   // Nutch Imports
   24   import org.apache.nutch.protocol.Content;
   25   
   26   // Hadoop imports
   27   import org.apache.hadoop.conf.Configuration;
   28   
   29   
   30   /**
   31    * A Utility class containing methods to simply perform parsing utilities such
   32    * as iterating through a preferred list of {@link Parser}s to obtain
   33    * {@link Parse} objects.
   34    *
   35    * @author mattmann
   36    * @author Jérôme Charron
   37    * @author Sébastien Le Callonnec
   38    */
   39   public class ParseUtil {
   40     
   41     /* our log stream */
   42     public static final Log LOG = LogFactory.getLog(ParseUtil.class);
   43     private ParserFactory parserFactory;
   44     
   45     /**
   46      * 
   47      * @param conf
   48      */
   49     public ParseUtil(Configuration conf) {
   50       this.parserFactory = new ParserFactory(conf);
   51     }
   52     
   53     /**
   54      * Performs a parse by iterating through a List of preferred {@link Parser}s
   55      * until a successful parse is performed and a {@link Parse} object is
   56      * returned. If the parse is unsuccessful, a message is logged to the
   57      * <code>WARNING</code> level, and an empty parse is returned.
   58      *
   59      * @param content The content to try and parse.
   60      * @return &lt;key, {@link Parse}&gt; pairs.
   61      * @throws ParseException If no suitable parser is found to perform the parse.
   62      */
   63     public ParseResult parse(Content content) throws ParseException {
   64       Parser[] parsers = null;
   65       
   66       try {
   67         parsers = this.parserFactory.getParsers(content.getContentType(), 
   68   	         content.getUrl() != null ? content.getUrl():"");
   69       } catch (ParserNotFound e) {
   70         if (LOG.isWarnEnabled()) {
   71           LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() +
   72                  " of type " + content.getContentType());
   73         }
   74         throw new ParseException(e.getMessage());
   75       }
   76       
   77       ParseResult parseResult = null;
   78       for (int i=0; i<parsers.length; i++) {
   79         if (LOG.isDebugEnabled()) {
   80           LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
   81         }
   82         parseResult = parsers[i].getParse(content);
   83         if (parseResult != null && !parseResult.isEmpty())
   84           return parseResult;
   85       }
   86      
   87       if (LOG.isWarnEnabled()) { 
   88         LOG.warn("Unable to successfully parse content " + content.getUrl() +
   89                  " of type " + content.getContentType());
   90       }
   91       return null;
   92     }
   93       
   94     /**
   95      * Method parses a {@link Content} object using the {@link Parser} specified
   96      * by the parameter <code>extId</code>, i.e., the Parser's extension ID.
   97      * If a suitable {@link Parser} is not found, then a <code>WARNING</code>
   98      * level message is logged, and a ParseException is thrown. If the parse is
   99      * uncessful for any other reason, then a <code>WARNING</code> level
  100      * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is
  101      * returned.
  102      *
  103      * @param extId The extension implementation ID of the {@link Parser} to use
  104      *              to parse the specified content.
  105      * @param content The content to parse.
  106      *
  107      * @return &lt;key, {@link Parse}&gt; pairs if the parse is successful, otherwise,
  108      *         a single &lt;key, <code>ParseStatus.getEmptyParse()</code>&gt; pair.
  109      *
  110      * @throws ParseException If there is no suitable {@link Parser} found
  111      *                        to perform the parse.
  112      */
  113     public ParseResult parseByExtensionId(String extId, Content content)
  114     throws ParseException {
  115       Parser p = null;
  116       
  117       try {
  118         p = this.parserFactory.getParserById(extId);
  119       } catch (ParserNotFound e) {
  120         if (LOG.isWarnEnabled()) {
  121           LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() +
  122               " of type " + content.getContentType());
  123         }
  124         throw new ParseException(e.getMessage());
  125       }
  126       
  127       ParseResult parseResult = p.getParse(content);
  128       if (parseResult != null && !parseResult.isEmpty()) {
  129         return parseResult;
  130       } else {
  131         if (LOG.isWarnEnabled()) {
  132           LOG.warn("Unable to successfully parse content " + content.getUrl() +
  133               " of type " + content.getContentType());
  134         }  
  135         return null;
  136       }
  137     }  
  138     
  139   }

Save This Page
Home » nutch-1.0 » org.apache.nutch » parse » [javadoc | source]