Save This Page
Home » nutch-1.0 » org.apache.nutch » parse » html » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.nutch.parse.html;
   19   
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;

import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
   24   
   25   /**
   26    * Class for parsing META Directives from DOM trees.  This class
   27    * handles specifically Robots META directives (all, none, nofollow,
   28    * noindex), finding BASE HREF tags, and HTTP-EQUIV no-cache
   29    * instructions. All meta directives are stored in a HTMLMetaTags instance.
   30    */
   31   public class HTMLMetaProcessor {
   32   
   33     /**
   34      * Utility class with indicators for the robots directives "noindex"
   35      * and "nofollow", and HTTP-EQUIV/no-cache
   36      */
   37     
   38     /**
   39      * Sets the indicators in <code>robotsMeta</code> to appropriate
   40      * values, based on any META tags found under the given
   41      * <code>node</code>.
   42      */
   43     public static final void getMetaTags (
   44       HTMLMetaTags metaTags, Node node, URL currURL) {
   45   
   46       metaTags.reset();
   47       getMetaTagsHelper(metaTags, node, currURL);
   48     }
   49   
   50     private static final void getMetaTagsHelper(
   51       HTMLMetaTags metaTags, Node node, URL currURL) {
   52   
   53       if (node.getNodeType() == Node.ELEMENT_NODE) {
   54   
   55         if ("body".equalsIgnoreCase(node.getNodeName())) {
   56           // META tags should not be under body
   57           return;
   58         }
   59   
   60         if ("meta".equalsIgnoreCase(node.getNodeName())) {
   61           NamedNodeMap attrs = node.getAttributes();
   62           Node nameNode = null;
   63           Node equivNode = null;
   64           Node contentNode = null;
   65           // Retrieves name, http-equiv and content attribues
   66           for (int i=0; i<attrs.getLength(); i++) {
   67             Node attr = attrs.item(i);
   68             String attrName = attr.getNodeName().toLowerCase();
   69             if (attrName.equals("name")) {
   70               nameNode = attr;
   71             } else if (attrName.equals("http-equiv")) {
   72               equivNode = attr;
   73             } else if (attrName.equals("content")) {
   74               contentNode = attr;
   75             }
   76           }
   77           
   78           if (nameNode != null) {
   79             if (contentNode != null) {
   80               String name = nameNode.getNodeValue().toLowerCase();
   81               metaTags.getGeneralTags().setProperty(name, contentNode.getNodeValue());
   82               if ("robots".equals(name)) {
   83     
   84                 if (contentNode != null) {
   85                   String directives = 
   86                     contentNode.getNodeValue().toLowerCase();
   87                   int index = directives.indexOf("none");
   88     
   89                   if (index >= 0) {
   90                     metaTags.setNoIndex();
   91                     metaTags.setNoFollow();
   92                   }
   93     
   94                   index = directives.indexOf("all");
   95                   if (index >= 0) {
   96                     // do nothing...
   97                   }
   98     
   99                   index = directives.indexOf("noindex");
  100                   if (index >= 0) {
  101                     metaTags.setNoIndex();
  102                   }
  103     
  104                   index = directives.indexOf("nofollow");
  105                   if (index >= 0) {
  106                     metaTags.setNoFollow();
  107                   }
  108                   
  109                   index = directives.indexOf("noarchive");
  110                   if (index >= 0) {
  111                     metaTags.setNoCache();
  112                   }
  113                 } 
  114     
  115               } // end if (name == robots)
  116             }
  117           }
  118   
  119           if (equivNode != null) {
  120             if (contentNode != null) {
  121               String name = equivNode.getNodeValue().toLowerCase();
  122               String content = contentNode.getNodeValue();
  123               metaTags.getHttpEquivTags().setProperty(name, content);
  124               if ("pragma".equals(name)) {
  125                 content = content.toLowerCase();
  126                 int index = content.indexOf("no-cache");
  127                 if (index >= 0) 
  128                   metaTags.setNoCache();
  129               } else if ("refresh".equals(name)) {
  130                 int idx = content.indexOf(';');
  131                 String time = null;
  132                 if (idx == -1) { // just the refresh time
  133                   time = content;
  134                 } else time = content.substring(0, idx);
  135                 try {
  136                   metaTags.setRefreshTime(Integer.parseInt(time));
  137                   // skip this if we couldn't parse the time
  138                   metaTags.setRefresh(true);
  139                 } catch (Exception e) {
  140                   ;
  141                 }
  142                 URL refreshUrl = null;
  143                 if (metaTags.getRefresh() && idx != -1) { // set the URL
  144                   idx = content.toLowerCase().indexOf("url=");
  145                   if (idx == -1) { // assume a mis-formatted entry with just the url
  146                     idx = content.indexOf(';') + 1;
  147                   } else idx += 4;
  148                   if (idx != -1) {
  149                     String url = content.substring(idx);
  150                     try {
  151                       refreshUrl = new URL(url);
  152                     } catch (Exception e) {
  153                       // XXX according to the spec, this has to be an absolute
  154                       // XXX url. However, many websites use relative URLs and
  155                       // XXX expect browsers to handle that.
  156                       // XXX Unfortunately, in some cases this may create a
  157                       // XXX infinitely recursive paths (a crawler trap)...
  158                       // if (!url.startsWith("/")) url = "/" + url;
  159                       try {
  160                         refreshUrl = new URL(currURL, url);
  161                       } catch (Exception e1) {
  162                         refreshUrl = null;
  163                       }
  164                     }
  165                   }
  166                 }
  167                 if (metaTags.getRefresh()) {
  168                   if (refreshUrl == null) {
  169                     // apparently only refresh time was present. set the URL
  170                     // to the same URL.
  171                     refreshUrl = currURL;
  172                   }
  173                   metaTags.setRefreshHref(refreshUrl);
  174                 }
  175               }
  176             }
  177           }
  178   
  179         } else if ("base".equalsIgnoreCase(node.getNodeName())) {
  180           NamedNodeMap attrs = node.getAttributes();
  181           Node hrefNode = attrs.getNamedItem("href");
  182   
  183           if (hrefNode != null) {
  184             String urlString = hrefNode.getNodeValue();
  185   
  186             URL url = null;
  187             try {
  188               if (currURL == null)
  189                 url = new URL(urlString);
  190               else 
  191                 url = new URL(currURL, urlString);
  192             } catch (Exception e) {
  193               ;
  194             }
  195   
  196             if (url != null) 
  197               metaTags.setBaseHref(url);
  198           }
  199   
  200         }
  201   
  202       }
  203   
  204       NodeList children = node.getChildNodes();
  205       if (children != null) {
  206         int len = children.getLength();
  207         for (int i = 0; i < len; i++) {
  208           getMetaTagsHelper(metaTags, children.item(i), currURL);
  209         }
  210       }
  211     }
  212   
  213   }

Save This Page
Home » nutch-1.0 » org.apache.nutch » parse » html » [javadoc | source]