1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.nutch.net;
19
20 import java.net.MalformedURLException;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.HashMap;
25 import java.util.HashSet;
26 import java.util.Iterator;
27 import java.util.List;
28 import java.util.Set;
29 import java.util.Vector;
30
31 import org.apache.commons.logging.Log;
32 import org.apache.commons.logging.LogFactory;
33 import org.apache.hadoop.conf.Configuration;
34 import org.apache.nutch.plugin.Extension;
35 import org.apache.nutch.plugin.ExtensionPoint;
36 import org.apache.nutch.plugin.PluginRepository;
37 import org.apache.nutch.plugin.PluginRuntimeException;
38 import org.apache.nutch.util.ObjectCache;
39
40 /**
41 * This class uses a "chained filter" pattern to run defined normalizers.
42 * Different lists of normalizers may be defined for different "scopes", or
43 * contexts where they are used (note however that they need to be activated
44 * first through <tt>plugin.include</tt> property).
45 *
46 * <p>There is one global scope defined by default, which consists of all
47 * active normalizers. The order in which these normalizers
48 * are executed may be defined in "urlnormalizer.order" property, which lists
49 * space-separated implementation classes (if this property is missing normalizers
50 * will be run in random order). If there are more
51 * normalizers activated than explicitly named on this list, the remaining ones
52 * will be run in random order after the ones specified on the list are executed.</p>
53 * <p>You can define a set of contexts (or scopes) in which normalizers may be
54 * called. Each scope can have its own list of normalizers (defined in
55 * "urlnormalizer.scope.<scope_name>" property) and its own order (defined in
56 * "urlnormalizer.order.<scope_name>" property). If any of these properties are
57 * missing, default settings are used for the global scope.</p>
58 * <p>In case no normalizers are required for any given scope, a
59 * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should be used.</p>
60 * <p>Each normalizer may further select among many configurations, depending on
61 * the scope in which it is called, because the scope name is passed as a parameter
62 * to each normalizer. You can also use the same normalizer for many scopes.</p>
63 * <p>Several scopes have been defined, and various Nutch tools will attempt using
64 * scope-specific normalizers first (and fall back to default config if scope-specific
65 * configuration is missing).</p>
66 * <p>Normalizers may be run several times, to ensure that modifications introduced
67 * by normalizers at the end of the list can be further reduced by normalizers
68 * executed at the beginning. By default this loop is executed just once - if you want
69 * to ensure that all possible combinations have been applied you may want to run
70 * this loop up to the number of activated normalizers. This loop count can be configured
71 * through <tt>urlnormalizer.loop.count</tt> property. As soon as the url is
72 * unchanged the loop will stop and return the result.</p>
73 *
74 * @author Andrzej Bialecki
75 */
76 public final class URLNormalizers {
77
78 /** Default scope. If no scope properties are defined then the configuration for
79 * this scope will be used.
80 */
81 public static final String SCOPE_DEFAULT = "default";
82 /** Scope used by {@link org.apache.nutch.crawl.PartitionUrlByHost}. */
83 public static final String SCOPE_PARTITION = "partition";
84 /** Scope used by {@link org.apache.nutch.crawl.Generator}. */
85 public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count";
86 /** Scope used by {@link org.apache.nutch.fetcher.Fetcher} when processing
87 * redirect URLs.
88 */
89 public static final String SCOPE_FETCHER = "fetcher";
90 /** Scope used when updating the CrawlDb with new URLs. */
91 public static final String SCOPE_CRAWLDB = "crawldb";
92 /** Scope used when updating the LinkDb with new URLs. */
93 public static final String SCOPE_LINKDB = "linkdb";
94 /** Scope used by {@link org.apache.nutch.crawl.Injector}. */
95 public static final String SCOPE_INJECT = "inject";
96 /** Scope used when constructing new {@link org.apache.nutch.parse.Outlink} instances. */
97 public static final String SCOPE_OUTLINK = "outlink";
98
99
100 public static final Log LOG = LogFactory.getLog(URLNormalizers.class);
101
102 /* Empty extension list for caching purposes. */
103 private final List<Extension> EMPTY_EXTENSION_LIST = Collections.EMPTY_LIST;
104
105 private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0];
106
107 private Configuration conf;
108
109 private ExtensionPoint extensionPoint;
110
111 private URLNormalizer[] normalizers;
112
113 private int loopCount;
114
115 public URLNormalizers(Configuration conf, String scope) {
116 this.conf = conf;
117 this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
118 URLNormalizer.X_POINT_ID);
119 ObjectCache objectCache = ObjectCache.get(conf);
120
121 if (this.extensionPoint == null) {
122 throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID
123 + " not found.");
124 }
125
126 normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + scope);
127 if (normalizers == null) {
128 normalizers = getURLNormalizers(scope);
129 }
130 if (normalizers == EMPTY_NORMALIZERS) {
131 normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT);
132 if (normalizers == null) {
133 normalizers = getURLNormalizers(SCOPE_DEFAULT);
134 }
135 }
136
137 loopCount = conf.getInt("urlnormalizer.loop.count", 1);
138 }
139
140 /**
141 * Function returns an array of {@link URLNormalizer}s for a given scope,
142 * with a specified order.
143 *
144 * @param scope
145 * The scope to return the <code>Array</code> of
146 * {@link URLNormalizer}s for.
147 * @return An <code>Array</code> of {@link URLNormalizer}s for the given
148 * scope.
149 * @throws PluginRuntimeException
150 */
151 URLNormalizer[] getURLNormalizers(String scope) {
152 List<Extension> extensions = getExtensions(scope);
153 ObjectCache objectCache = ObjectCache.get(conf);
154
155 if (extensions == EMPTY_EXTENSION_LIST) {
156 return EMPTY_NORMALIZERS;
157 }
158
159 List<URLNormalizer> normalizers = new Vector<URLNormalizer>(extensions.size());
160
161 Iterator<Extension> it = extensions.iterator();
162 while (it.hasNext()) {
163 Extension ext = it.next();
164 URLNormalizer normalizer = null;
165 try {
166 // check to see if we've cached this URLNormalizer instance yet
167 normalizer = (URLNormalizer) objectCache.getObject(ext.getId());
168 if (normalizer == null) {
169 // go ahead and instantiate it and then cache it
170 normalizer = (URLNormalizer) ext.getExtensionInstance();
171 objectCache.setObject(ext.getId(), normalizer);
172 }
173 normalizers.add(normalizer);
174 } catch (PluginRuntimeException e) {
175 e.printStackTrace();
176 LOG.warn("URLNormalizers:PluginRuntimeException when "
177 + "initializing url normalizer plugin "
178 + ext.getDescriptor().getPluginId()
179 + " instance in getURLNormalizers "
180 + "function: attempting to continue instantiating plugins");
181 }
182 }
183 return normalizers.toArray(new URLNormalizer[normalizers
184 .size()]);
185 }
186
187 /**
188 * Finds the best-suited normalizer plugin for a given scope.
189 *
190 * @param scope
191 * Scope for which we seek a normalizer plugin.
192 * @return a list of extensions to be used for this scope. If none, returns
193 * empty list.
194 * @throws PluginRuntimeException
195 */
196 private List<Extension> getExtensions(String scope) {
197 ObjectCache objectCache = ObjectCache.get(conf);
198 List<Extension> extensions =
199 (List<Extension>) objectCache.getObject(URLNormalizer.X_POINT_ID + "_x_"
200 + scope);
201
202 // Just compare the reference:
203 // if this is the empty list, we know we will find no extension.
204 if (extensions == EMPTY_EXTENSION_LIST) {
205 return EMPTY_EXTENSION_LIST;
206 }
207
208 if (extensions == null) {
209 extensions = findExtensions(scope);
210 if (extensions != null) {
211 objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, extensions);
212 } else {
213 // Put the empty extension list into cache
214 // to remember we don't know any related extension.
215 objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, EMPTY_EXTENSION_LIST);
216 extensions = EMPTY_EXTENSION_LIST;
217 }
218 }
219 return extensions;
220 }
221
222 /**
223 * searches a list of suitable url normalizer plugins for the given scope.
224 *
225 * @param scope
226 * Scope for which we seek a url normalizer plugin.
227 * @return List - List of extensions to be used for this scope. If none,
228 * returns null.
229 * @throws PluginRuntimeException
230 */
231 private List<Extension> findExtensions(String scope) {
232
233 String[] orders = null;
234 String orderlist = conf.get("urlnormalizer.order." + scope);
235 if (orderlist == null) orderlist = conf.get("urlnormalizer.order");
236 if (orderlist != null && !orderlist.trim().equals("")) {
237 orders = orderlist.split("\\s+");
238 }
239 String scopelist = conf.get("urlnormalizer.scope." + scope);
240 Set<String> impls = null;
241 if (scopelist != null && !scopelist.trim().equals("")) {
242 String[] names = scopelist.split("\\s+");
243 impls = new HashSet<String>(Arrays.asList(names));
244 }
245 Extension[] extensions = this.extensionPoint.getExtensions();
246 HashMap<String, Extension> normalizerExtensions = new HashMap<String, Extension>();
247 for (int i = 0; i < extensions.length; i++) {
248 Extension extension = extensions[i];
249 if (impls != null && !impls.contains(extension.getClazz()))
250 continue;
251 normalizerExtensions.put(extension.getClazz(), extension);
252 }
253 List<Extension> res = new ArrayList<Extension>();
254 if (orders == null) {
255 res.addAll(normalizerExtensions.values());
256 } else {
257 // first add those explicitly named in correct order
258 for (int i = 0; i < orders.length; i++) {
259 Extension e = normalizerExtensions.get(orders[i]);
260 if (e != null) {
261 res.add(e);
262 normalizerExtensions.remove(orders[i]);
263 }
264 }
265 // then add all others in random order
266 res.addAll(normalizerExtensions.values());
267 }
268 return res;
269 }
270
271 /**
272 * Normalize
273 * @param urlString The URL string to normalize.
274 * @param scope The given scope.
275 * @return A normalized String, using the given <code>scope</code>
276 * @throws MalformedURLException If the given URL string is malformed.
277 */
278 public String normalize(String urlString, String scope)
279 throws MalformedURLException {
280 // optionally loop several times, and break if no further changes
281 String initialString = urlString;
282 for (int k = 0; k < loopCount; k++) {
283 for (int i = 0; i < this.normalizers.length; i++) {
284 if (urlString == null)
285 return null;
286 urlString = this.normalizers[i].normalize(urlString, scope);
287 }
288 if (initialString.equals(urlString)) break;
289 initialString = urlString;
290 }
291 return urlString;
292 }
293 }