1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.nutch.protocol.httpclient;
18
19 // JDK imports
20 import java.io.InputStream;
21 import java.io.IOException;
22 import java.net.URL;
23 import java.util.ArrayList;
24 import javax.xml.parsers.DocumentBuilderFactory;
25 import javax.xml.parsers.ParserConfigurationException;
26 import org.xml.sax.SAXException;
27 import org.w3c.dom.Document;
28 import org.w3c.dom.Element;
29 import org.w3c.dom.NodeList;
30 import org.w3c.dom.Node;
31
32 // Commons Logging imports
33 import org.apache.commons.logging.Log;
34 import org.apache.commons.logging.LogFactory;
35
36 // HTTP Client imports
37 import org.apache.commons.httpclient.Header;
38 import org.apache.commons.httpclient.HostConfiguration;
39 import org.apache.commons.httpclient.HttpClient;
40 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
41 import org.apache.commons.httpclient.NTCredentials;
42 import org.apache.commons.httpclient.auth.AuthScope;
43 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
44 import org.apache.commons.httpclient.protocol.Protocol;
45
46 // Nutch imports
47 import org.apache.nutch.util.LogUtil;
48 import org.apache.nutch.crawl.CrawlDatum;
49 import org.apache.nutch.net.protocols.Response;
50 import org.apache.nutch.protocol.ProtocolException;
51 import org.apache.nutch.protocol.http.api.HttpBase;
52 import org.apache.hadoop.conf.Configuration;
53 import org.apache.nutch.util.NutchConfiguration;
54
55 /**
56 * This class is a protocol plugin that configures an HTTP client for
57 * Basic, Digest and NTLM authentication schemes for web server as well
58 * as proxy server. It takes care of HTTPS protocol as well as cookies
59 * in a single fetch session.
60 *
61 * @author Susam Pal
62 */
63 public class Http extends HttpBase {
64
65 public static final Log LOG = LogFactory.getLog(Http.class);
66
67 private static MultiThreadedHttpConnectionManager connectionManager =
68 new MultiThreadedHttpConnectionManager();
69
70 // Since the Configuration has not yet been set,
71 // then an unconfigured client is returned.
72 private static HttpClient client = new HttpClient(connectionManager);
73 private static String defaultUsername;
74 private static String defaultPassword;
75 private static String defaultRealm;
76 private static String defaultScheme;
77 private static String authFile;
78 private static String agentHost;
79 private static boolean authRulesRead = false;
80 private static Configuration conf;
81
82 int maxThreadsTotal = 10;
83
84 private String proxyUsername;
85 private String proxyPassword;
86 private String proxyRealm;
87
88
89 /**
90 * Returns the configured HTTP client.
91 *
92 * @return HTTP client
93 */
94 static synchronized HttpClient getClient() {
95 return client;
96 }
97
98 /**
99 * Constructs this plugin.
100 */
101 public Http() {
102 super(LOG);
103 }
104
105 /**
106 * Reads the configuration from the Nutch configuration files and sets
107 * the configuration.
108 *
109 * @param conf Configuration
110 */
111 public void setConf(Configuration conf) {
112 super.setConf(conf);
113 this.conf = conf;
114 this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);
115 this.proxyUsername = conf.get("http.proxy.username", "");
116 this.proxyPassword = conf.get("http.proxy.password", "");
117 this.proxyRealm = conf.get("http.proxy.realm", "");
118 agentHost = conf.get("http.agent.host", "");
119 authFile = conf.get("http.auth.file", "");
120 configureClient();
121 try {
122 setCredentials();
123 } catch (Exception ex) {
124 if (LOG.isFatalEnabled()) {
125 LOG.fatal("Could not read " + authFile + " : " + ex.getMessage());
126 ex.printStackTrace(LogUtil.getErrorStream(LOG));
127 }
128 }
129 }
130
131 /**
132 * Main method.
133 *
134 * @param args Command line arguments
135 */
136 public static void main(String[] args) throws Exception {
137 Http http = new Http();
138 http.setConf(NutchConfiguration.create());
139 main(http, args);
140 }
141
142 /**
143 * Fetches the <code>url</code> with a configured HTTP client and
144 * gets the response.
145 *
146 * @param url URL to be fetched
147 * @param datum Crawl data
148 * @param redirect Follow redirects if and only if true
149 * @return HTTP response
150 */
151 protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
152 throws ProtocolException, IOException {
153 resolveCredentials(url);
154 return new HttpResponse(this, url, datum, redirect);
155 }
156
157 /**
158 * Configures the HTTP client
159 */
160 private void configureClient() {
161
162 // Set up an HTTPS socket factory that accepts self-signed certs.
163 Protocol https = new Protocol("https",
164 new DummySSLProtocolSocketFactory(), 443);
165 Protocol.registerProtocol("https", https);
166
167 HttpConnectionManagerParams params = connectionManager.getParams();
168 params.setConnectionTimeout(timeout);
169 params.setSoTimeout(timeout);
170 params.setSendBufferSize(BUFFER_SIZE);
171 params.setReceiveBufferSize(BUFFER_SIZE);
172 params.setMaxTotalConnections(maxThreadsTotal);
173 if (maxThreadsTotal > maxThreadsPerHost) {
174 params.setDefaultMaxConnectionsPerHost(maxThreadsPerHost);
175 } else {
176 params.setDefaultMaxConnectionsPerHost(maxThreadsTotal);
177 }
178
179 // executeMethod(HttpMethod) seems to ignore the connection timeout on the connection manager.
180 // set it explicitly on the HttpClient.
181 client.getParams().setConnectionManagerTimeout(timeout);
182
183 HostConfiguration hostConf = client.getHostConfiguration();
184 ArrayList headers = new ArrayList();
185 // Set the User Agent in the header
186 headers.add(new Header("User-Agent", userAgent));
187 // prefer English
188 headers.add(new Header("Accept-Language", "en-us,en-gb,en;q=0.7,*;q=0.3"));
189 // prefer UTF-8
190 headers.add(new Header("Accept-Charset", "utf-8,ISO-8859-1;q=0.7,*;q=0.7"));
191 // prefer understandable formats
192 headers.add(new Header("Accept",
193 "text/html,application/xml;q=0.9,application/xhtml+xml,text/xml;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5"));
194 // accept gzipped content
195 headers.add(new Header("Accept-Encoding", "x-gzip, gzip, deflate"));
196 hostConf.getParams().setParameter("http.default-headers", headers);
197
198 // HTTP proxy server details
199 if (useProxy) {
200 hostConf.setProxy(proxyHost, proxyPort);
201
202 if (proxyUsername.length() > 0) {
203
204 AuthScope proxyAuthScope = getAuthScope(
205 this.proxyHost, this.proxyPort, this.proxyRealm);
206
207 NTCredentials proxyCredentials = new NTCredentials(
208 this.proxyUsername, this.proxyPassword,
209 this.agentHost, this.proxyRealm);
210
211 client.getState().setProxyCredentials(
212 proxyAuthScope, proxyCredentials);
213 }
214 }
215
216 }
217
218 /**
219 * Reads authentication configuration file (defined as
220 * 'http.auth.file' in Nutch configuration file) and sets the
221 * credentials for the configured authentication scopes in the HTTP
222 * client object.
223 *
224 * @throws ParserConfigurationException If a document builder can not
225 * be created.
226 * @throws SAXException If any parsing error occurs.
227 * @throws IOException If any I/O error occurs.
228 */
229 private static synchronized void setCredentials() throws
230 ParserConfigurationException, SAXException, IOException {
231
232 if (authRulesRead)
233 return;
234
235 authRulesRead = true; // Avoid re-attempting to read
236
237 InputStream is = conf.getConfResourceAsInputStream(authFile);
238 if (is != null) {
239 Document doc = DocumentBuilderFactory.newInstance()
240 .newDocumentBuilder().parse(is);
241
242 Element rootElement = doc.getDocumentElement();
243 if (!"auth-configuration".equals(rootElement.getTagName())) {
244 if (LOG.isWarnEnabled())
245 LOG.warn("Bad auth conf file: root element <"
246 + rootElement.getTagName() + "> found in " + authFile
247 + " - must be <auth-configuration>");
248 }
249
250 // For each set of credentials
251 NodeList credList = rootElement.getChildNodes();
252 for (int i = 0; i < credList.getLength(); i++) {
253 Node credNode = credList.item(i);
254 if (!(credNode instanceof Element))
255 continue;
256
257 Element credElement = (Element) credNode;
258 if (!"credentials".equals(credElement.getTagName())) {
259 if (LOG.isWarnEnabled())
260 LOG.warn("Bad auth conf file: Element <"
261 + credElement.getTagName() + "> not recognized in "
262 + authFile + " - expected <credentials>");
263 continue;
264 }
265
266 String username = credElement.getAttribute("username");
267 String password = credElement.getAttribute("password");
268
269 // For each authentication scope
270 NodeList scopeList = credElement.getChildNodes();
271 for (int j = 0; j < scopeList.getLength(); j++) {
272 Node scopeNode = scopeList.item(j);
273 if (!(scopeNode instanceof Element))
274 continue;
275
276 Element scopeElement = (Element) scopeNode;
277
278 if ("default".equals(scopeElement.getTagName())) {
279
280 // Determine realm and scheme, if any
281 String realm = scopeElement.getAttribute("realm");
282 String scheme = scopeElement.getAttribute("scheme");
283
284 // Set default credentials
285 defaultUsername = username;
286 defaultPassword = password;
287 defaultRealm = realm;
288 defaultScheme = scheme;
289
290 if (LOG.isTraceEnabled()) {
291 LOG.trace("Credentials - username: " + username
292 + "; set as default"
293 + " for realm: " + realm + "; scheme: " + scheme);
294 }
295
296 } else if ("authscope".equals(scopeElement.getTagName())) {
297
298 // Determine authentication scope details
299 String host = scopeElement.getAttribute("host");
300 int port = -1; // For setting port to AuthScope.ANY_PORT
301 try {
302 port = Integer.parseInt(
303 scopeElement.getAttribute("port"));
304 } catch (Exception ex) {
305 // do nothing, port is already set to any port
306 }
307 String realm = scopeElement.getAttribute("realm");
308 String scheme = scopeElement.getAttribute("scheme");
309
310 // Set credentials for the determined scope
311 AuthScope authScope = getAuthScope(host, port, realm, scheme);
312 NTCredentials credentials = new NTCredentials(
313 username, password, agentHost, realm);
314
315 client.getState().setCredentials(authScope, credentials);
316
317 if (LOG.isTraceEnabled()) {
318 LOG.trace("Credentials - username: " + username
319 + "; set for AuthScope - " + "host: " + host
320 + "; port: " + port + "; realm: " + realm
321 + "; scheme: " + scheme);
322 }
323
324 } else {
325 if (LOG.isWarnEnabled())
326 LOG.warn("Bad auth conf file: Element <"
327 + scopeElement.getTagName() + "> not recognized in "
328 + authFile + " - expected <authscope>");
329 }
330 }
331 is.close();
332 }
333 }
334 }
335
336 /**
337 * If credentials for the authentication scope determined from the
338 * specified <code>url</code> is not already set in the HTTP client,
339 * then this method sets the default credentials to fetch the
340 * specified <code>url</code>. If credentials are found for the
341 * authentication scope, the method returns without altering the
342 * client.
343 *
344 * @param url URL to be fetched
345 */
346 private void resolveCredentials(URL url) {
347
348 if (defaultUsername != null && defaultUsername.length() > 0) {
349
350 int port = url.getPort();
351 if (port == -1) {
352 if ("https".equals(url.getProtocol()))
353 port = 443;
354 else
355 port = 80;
356 }
357
358 AuthScope scope = new AuthScope(url.getHost(), port);
359
360 if (client.getState().getCredentials(scope) != null) {
361 if (LOG.isTraceEnabled())
362 LOG.trace("Pre-configured credentials with scope - host: "
363 + url.getHost() + "; port: " + port
364 + "; found for url: " + url);
365
366 // Credentials are already configured, so do nothing and return
367 return;
368 }
369
370 if (LOG.isTraceEnabled())
371 LOG.trace("Pre-configured credentials with scope - host: "
372 + url.getHost() + "; port: " + port
373 + "; not found for url: " + url);
374
375 AuthScope serverAuthScope = getAuthScope(
376 url.getHost(), port, defaultRealm, defaultScheme);
377
378 NTCredentials serverCredentials = new NTCredentials(
379 defaultUsername, defaultPassword,
380 agentHost, defaultRealm);
381
382 client.getState().setCredentials(
383 serverAuthScope, serverCredentials);
384 }
385 }
386
387 /**
388 * Returns an authentication scope for the specified
389 * <code>host</code>, <code>port</code>, <code>realm</code> and
390 * <code>scheme</code>.
391 *
392 * @param host Host name or address.
393 * @param port Port number.
394 * @param realm Authentication realm.
395 * @param scheme Authentication scheme.
396 */
397 private static AuthScope getAuthScope(String host, int port,
398 String realm, String scheme) {
399
400 if (host.length() == 0)
401 host = null;
402
403 if (port < 0)
404 port = -1;
405
406 if (realm.length() == 0)
407 realm = null;
408
409 if (scheme.length() == 0)
410 scheme = null;
411
412 return new AuthScope(host, port, realm, scheme);
413 }
414
415 /**
416 * Returns an authentication scope for the specified
417 * <code>host</code>, <code>port</code> and <code>realm</code>.
418 *
419 * @param host Host name or address.
420 * @param port Port number.
421 * @param realm Authentication realm.
422 */
423 private static AuthScope getAuthScope(String host, int port,
424 String realm) {
425
426 return getAuthScope(host, port, realm, "");
427 }
428 }
429