Save This Page
Home » nutch-1.0 » org.apache.nutch » protocol » httpclient » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   package org.apache.nutch.protocol.httpclient;
   18   
   19   // JDK imports
   20   import java.io.InputStream;
   21   import java.io.IOException;
   22   import java.net.URL;
   23   import java.util.ArrayList;
   24   import javax.xml.parsers.DocumentBuilderFactory;
   25   import javax.xml.parsers.ParserConfigurationException;
   26   import org.xml.sax.SAXException;
   27   import org.w3c.dom.Document;
   28   import org.w3c.dom.Element;
   29   import org.w3c.dom.NodeList;
   30   import org.w3c.dom.Node;
   31   
   32   // Commons Logging imports
   33   import org.apache.commons.logging.Log;
   34   import org.apache.commons.logging.LogFactory;
   35   
   36   // HTTP Client imports
   37   import org.apache.commons.httpclient.Header;
   38   import org.apache.commons.httpclient.HostConfiguration;
   39   import org.apache.commons.httpclient.HttpClient;
   40   import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
   41   import org.apache.commons.httpclient.NTCredentials;
   42   import org.apache.commons.httpclient.auth.AuthScope;
   43   import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
   44   import org.apache.commons.httpclient.protocol.Protocol;
   45   
   46   // Nutch imports
   47   import org.apache.nutch.util.LogUtil;
   48   import org.apache.nutch.crawl.CrawlDatum;
   49   import org.apache.nutch.net.protocols.Response;
   50   import org.apache.nutch.protocol.ProtocolException;
   51   import org.apache.nutch.protocol.http.api.HttpBase;
   52   import org.apache.hadoop.conf.Configuration;
   53   import org.apache.nutch.util.NutchConfiguration;
   54   
   55   /**
   56    * This class is a protocol plugin that configures an HTTP client for
   57    * Basic, Digest and NTLM authentication schemes for web server as well
   58    * as proxy server. It takes care of HTTPS protocol as well as cookies
   59    * in a single fetch session.
   60    *
   61    * @author Susam Pal
   62    */
   63   public class Http extends HttpBase {
   64   
   65     public static final Log LOG = LogFactory.getLog(Http.class);
   66   
   67     private static MultiThreadedHttpConnectionManager connectionManager =
   68             new MultiThreadedHttpConnectionManager();
   69   
   70     // Since the Configuration has not yet been set,
   71     // then an unconfigured client is returned.
   72     private static HttpClient client = new HttpClient(connectionManager);
   73     private static String defaultUsername;
   74     private static String defaultPassword;
   75     private static String defaultRealm;
   76     private static String defaultScheme;
   77     private static String authFile;
   78     private static String agentHost;
   79     private static boolean authRulesRead = false;
   80     private static Configuration conf;
   81   
   82     int maxThreadsTotal = 10;
   83   
   84     private String proxyUsername;
   85     private String proxyPassword;
   86     private String proxyRealm;
   87   
   88   
   89     /**
   90      * Returns the configured HTTP client.
   91      *
   92      * @return HTTP client
   93      */
   94     static synchronized HttpClient getClient() {
   95       return client;
   96     }
   97   
   98     /**
   99      * Constructs this plugin.
  100      */
  101     public Http() {
  102       super(LOG);
  103     }
  104   
  105     /**
  106      * Reads the configuration from the Nutch configuration files and sets
  107      * the configuration.
  108      *
  109      * @param conf Configuration
  110      */
  111     public void setConf(Configuration conf) {
  112       super.setConf(conf);
  113       this.conf = conf;
  114       this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
  115       this.proxyUsername = conf.get("http.proxy.username", "");
  116       this.proxyPassword = conf.get("http.proxy.password", "");
  117       this.proxyRealm = conf.get("http.proxy.realm", "");
  118       agentHost = conf.get("http.agent.host", "");
  119       authFile = conf.get("http.auth.file", "");
  120       configureClient();
  121       try {
  122         setCredentials();
  123       } catch (Exception ex) {
  124         if (LOG.isFatalEnabled()) {
  125           LOG.fatal("Could not read " + authFile + " : " + ex.getMessage());
  126           ex.printStackTrace(LogUtil.getErrorStream(LOG));
  127         }
  128       }
  129     }
  130   
  131     /**
  132      * Main method.
  133      *
  134      * @param args Command line arguments
  135      */
  136     public static void main(String[] args) throws Exception {
  137       Http http = new Http();
  138       http.setConf(NutchConfiguration.create());
  139       main(http, args);
  140     }
  141   
  142     /**
  143      * Fetches the <code>url</code> with a configured HTTP client and
  144      * gets the response.
  145      *
  146      * @param url       URL to be fetched
  147      * @param datum     Crawl data
  148      * @param redirect  Follow redirects if and only if true
  149      * @return          HTTP response
  150      */
  151     protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
  152       throws ProtocolException, IOException {
  153       resolveCredentials(url);
  154       return new HttpResponse(this, url, datum, redirect);
  155     }
  156   
  157     /**
  158      * Configures the HTTP client
  159      */
  160     private void configureClient() {
  161   
  162       // Set up an HTTPS socket factory that accepts self-signed certs.
  163       Protocol https = new Protocol("https",
  164           new DummySSLProtocolSocketFactory(), 443);
  165       Protocol.registerProtocol("https", https);
  166   
  167       HttpConnectionManagerParams params = connectionManager.getParams();
  168       params.setConnectionTimeout(timeout);
  169       params.setSoTimeout(timeout);
  170       params.setSendBufferSize(BUFFER_SIZE);
  171       params.setReceiveBufferSize(BUFFER_SIZE);
  172       params.setMaxTotalConnections(maxThreadsTotal);
  173       if (maxThreadsTotal > maxThreadsPerHost) {
  174         params.setDefaultMaxConnectionsPerHost(maxThreadsPerHost);
  175       } else {
  176         params.setDefaultMaxConnectionsPerHost(maxThreadsTotal);
  177       }
  178   
  179       // executeMethod(HttpMethod) seems to ignore the connection timeout on the connection manager.
  180       // set it explicitly on the HttpClient.
  181       client.getParams().setConnectionManagerTimeout(timeout);
  182   
  183       HostConfiguration hostConf = client.getHostConfiguration();
  184       ArrayList headers = new ArrayList();
  185       // Set the User Agent in the header
  186       headers.add(new Header("User-Agent", userAgent));
  187       // prefer English
  188       headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
  189       // prefer UTF-8
  190       headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
  191       // prefer understandable formats
  192       headers.add(new Header("Accept",
  193               "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
  194       // accept gzipped content
  195       headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
  196       hostConf.getParams().setParameter("http.default-headers", headers);
  197   
  198       // HTTP proxy server details
  199       if (useProxy) {
  200         hostConf.setProxy(proxyHost, proxyPort);
  201   
  202         if (proxyUsername.length() > 0) {
  203   
  204           AuthScope proxyAuthScope = getAuthScope(
  205               this.proxyHost, this.proxyPort, this.proxyRealm);
  206   
  207           NTCredentials proxyCredentials = new NTCredentials(
  208               this.proxyUsername, this.proxyPassword,
  209               this.agentHost, this.proxyRealm);
  210   
  211           client.getState().setProxyCredentials(
  212               proxyAuthScope, proxyCredentials);
  213         }
  214       }
  215   
  216     }
  217   
  218     /**
  219      * Reads authentication configuration file (defined as
  220      * 'http.auth.file' in Nutch configuration file) and sets the
  221      * credentials for the configured authentication scopes in the HTTP
  222      * client object.
  223      *
  224      * @throws ParserConfigurationException  If a document builder can not
  225      *                                       be created.
  226      * @throws SAXException                  If any parsing error occurs.
  227      * @throws IOException                   If any I/O error occurs.
  228      */
  229     private static synchronized void setCredentials() throws 
  230         ParserConfigurationException, SAXException, IOException {
  231   
  232       if (authRulesRead)
  233         return;
  234   
  235       authRulesRead = true; // Avoid re-attempting to read
  236   
  237       InputStream is = conf.getConfResourceAsInputStream(authFile);    
  238       if (is != null) {
  239         Document doc = DocumentBuilderFactory.newInstance()
  240                        .newDocumentBuilder().parse(is);
  241   
  242         Element rootElement = doc.getDocumentElement();
  243         if (!"auth-configuration".equals(rootElement.getTagName())) {
  244           if (LOG.isWarnEnabled())
  245             LOG.warn("Bad auth conf file: root element <"
  246                 + rootElement.getTagName() + "> found in " + authFile
  247                 + " - must be <auth-configuration>");
  248         }
  249   
  250         // For each set of credentials
  251         NodeList credList = rootElement.getChildNodes();
  252         for (int i = 0; i < credList.getLength(); i++) {
  253           Node credNode = credList.item(i);
  254           if (!(credNode instanceof Element))
  255             continue;    
  256   
  257           Element credElement = (Element) credNode;
  258           if (!"credentials".equals(credElement.getTagName())) {
  259             if (LOG.isWarnEnabled())
  260               LOG.warn("Bad auth conf file: Element <"
  261               + credElement.getTagName() + "> not recognized in "
  262               + authFile + " - expected <credentials>");
  263             continue;
  264           }
  265   
  266           String username = credElement.getAttribute("username");
  267           String password = credElement.getAttribute("password");
  268   
  269           // For each authentication scope
  270           NodeList scopeList = credElement.getChildNodes();
  271           for (int j = 0; j < scopeList.getLength(); j++) {
  272             Node scopeNode = scopeList.item(j);
  273             if (!(scopeNode instanceof Element))
  274               continue;
  275             
  276             Element scopeElement = (Element) scopeNode;
  277   
  278             if ("default".equals(scopeElement.getTagName())) {
  279   
  280               // Determine realm and scheme, if any
  281               String realm = scopeElement.getAttribute("realm");
  282               String scheme = scopeElement.getAttribute("scheme");
  283   
  284               // Set default credentials
  285               defaultUsername = username;
  286               defaultPassword = password;
  287               defaultRealm = realm;
  288               defaultScheme = scheme;
  289   
  290               if (LOG.isTraceEnabled()) {
  291                 LOG.trace("Credentials - username: " + username 
  292                     + "; set as default"
  293                     + " for realm: " + realm + "; scheme: " + scheme);
  294               }
  295   
  296             } else if ("authscope".equals(scopeElement.getTagName())) {
  297   
  298               // Determine authentication scope details
  299               String host = scopeElement.getAttribute("host");
  300               int port = -1; // For setting port to AuthScope.ANY_PORT
  301               try {
  302                 port = Integer.parseInt(
  303                     scopeElement.getAttribute("port"));
  304               } catch (Exception ex) {
  305                 // do nothing, port is already set to any port
  306               }
  307               String realm = scopeElement.getAttribute("realm");
  308               String scheme = scopeElement.getAttribute("scheme");
  309   
  310               // Set credentials for the determined scope
  311               AuthScope authScope = getAuthScope(host, port, realm, scheme);
  312               NTCredentials credentials = new NTCredentials(
  313                   username, password, agentHost, realm);
  314   
  315               client.getState().setCredentials(authScope, credentials);
  316   
  317               if (LOG.isTraceEnabled()) {
  318                 LOG.trace("Credentials - username: " + username
  319                     + "; set for AuthScope - " + "host: " + host
  320                     + "; port: " + port + "; realm: " + realm
  321                     + "; scheme: " + scheme);
  322               }
  323   
  324             } else {
  325               if (LOG.isWarnEnabled())
  326                 LOG.warn("Bad auth conf file: Element <"
  327                     + scopeElement.getTagName() + "> not recognized in "
  328                     + authFile + " - expected <authscope>");
  329             }
  330           }
  331           is.close();
  332         }
  333       }
  334     }
  335   
  336     /**
  337      * If credentials for the authentication scope determined from the
  338      * specified <code>url</code> is not already set in the HTTP client,
  339      * then this method sets the default credentials to fetch the
  340      * specified <code>url</code>. If credentials are found for the
  341      * authentication scope, the method returns without altering the
  342      * client.
  343      *
  344      * @param url URL to be fetched
  345      */
  346     private void resolveCredentials(URL url) {
  347   
  348       if (defaultUsername != null && defaultUsername.length() > 0) {
  349   
  350         int port = url.getPort();
  351         if (port == -1) {
  352           if ("https".equals(url.getProtocol()))
  353             port = 443;
  354           else
  355             port = 80;
  356         }
  357   
  358         AuthScope scope = new AuthScope(url.getHost(), port);
  359   
  360         if (client.getState().getCredentials(scope) != null) {
  361           if (LOG.isTraceEnabled())
  362             LOG.trace("Pre-configured credentials with scope - host: "
  363                 + url.getHost() + "; port: " + port
  364                 + "; found for url: " + url);
  365   
  366           // Credentials are already configured, so do nothing and return
  367           return;
  368         }
  369   
  370         if (LOG.isTraceEnabled())
  371             LOG.trace("Pre-configured credentials with scope -  host: "
  372                 + url.getHost() + "; port: " + port
  373                 + "; not found for url: " + url);
  374   
  375         AuthScope serverAuthScope = getAuthScope(
  376             url.getHost(), port, defaultRealm, defaultScheme);
  377   
  378         NTCredentials serverCredentials = new NTCredentials(
  379             defaultUsername, defaultPassword,
  380             agentHost, defaultRealm);
  381   
  382         client.getState().setCredentials(
  383             serverAuthScope, serverCredentials);
  384       }
  385     }
  386   
  387     /**
  388      * Returns an authentication scope for the specified
  389      * <code>host</code>, <code>port</code>, <code>realm</code> and
  390      * <code>scheme</code>.
  391      *
  392      * @param host    Host name or address.
  393      * @param port    Port number.
  394      * @param realm   Authentication realm.
  395      * @param scheme  Authentication scheme.
  396      */
  397     private static AuthScope getAuthScope(String host, int port,
  398         String realm, String scheme) {
  399       
  400       if (host.length() == 0)
  401         host = null;
  402   
  403       if (port < 0)
  404         port = -1;
  405   
  406       if (realm.length() == 0)
  407         realm = null;
  408   
  409       if (scheme.length() == 0)
  410         scheme = null;
  411   
  412       return new AuthScope(host, port, realm, scheme);
  413     }
  414   
  415     /**
  416      * Returns an authentication scope for the specified
  417      * <code>host</code>, <code>port</code> and <code>realm</code>.
  418      *
  419      * @param host    Host name or address.
  420      * @param port    Port number.
  421      * @param realm   Authentication realm.
  422      */
  423     private static AuthScope getAuthScope(String host, int port,
  424         String realm) {
  425   
  426         return getAuthScope(host, port, realm, "");
  427     }
  428   }
  429   

Save This Page
Home » nutch-1.0 » org.apache.nutch » protocol » httpclient » [javadoc | source]