1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.nutch.indexer;
19
20 import java.util.ArrayList;
21 import java.util.HashMap;
22
23 // Commons Logging imports
24 import org.apache.commons.logging.Log;
25 import org.apache.commons.logging.LogFactory;
26
27 import org.apache.nutch.plugin;
28 import org.apache.nutch.parse.Parse;
29 import org.apache.nutch.util.ObjectCache;
30 import org.apache.hadoop.conf.Configuration;
31 import org.apache.nutch.crawl.CrawlDatum;
32 import org.apache.nutch.crawl.Inlinks;
33 import org.apache.hadoop.io.Text;
34
35 /** Creates and caches {@link IndexingFilter} implementing plugins.*/
36 public class IndexingFilters {
37
38 public static final String INDEXINGFILTER_ORDER = "indexingfilter.order";
39
40 public final static Log LOG = LogFactory.getLog(IndexingFilters.class);
41
42 private IndexingFilter[] indexingFilters;
43
44 public IndexingFilters(Configuration conf) {
45 /* Get indexingfilter.order property */
46 String order = conf.get(INDEXINGFILTER_ORDER);
47 ObjectCache objectCache = ObjectCache.get(conf);
48 this.indexingFilters = (IndexingFilter[]) objectCache
49 .getObject(IndexingFilter.class.getName());
50 if (this.indexingFilters == null) {
51 /*
52 * If ordered filters are required, prepare array of filters based on
53 * property
54 */
55 String[] orderedFilters = null;
56 if (order != null && !order.trim().equals("")) {
57 orderedFilters = order.split("\\s+");
58 }
59 try {
60 ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
61 IndexingFilter.X_POINT_ID);
62 if (point == null)
63 throw new RuntimeException(IndexingFilter.X_POINT_ID + " not found.");
64 Extension[] extensions = point.getExtensions();
65 HashMap<String, IndexingFilter> filterMap =
66 new HashMap<String, IndexingFilter>();
67 for (int i = 0; i < extensions.length; i++) {
68 Extension extension = extensions[i];
69 IndexingFilter filter = (IndexingFilter) extension
70 .getExtensionInstance();
71 LOG.info("Adding " + filter.getClass().getName());
72 if (!filterMap.containsKey(filter.getClass().getName())) {
73 filter.addIndexBackendOptions(conf);
74 filterMap.put(filter.getClass().getName(), filter);
75 }
76 }
77 /*
78 * If no ordered filters required, just get the filters in an
79 * indeterminate order
80 */
81 if (orderedFilters == null) {
82 objectCache.setObject(IndexingFilter.class.getName(),
83 filterMap.values().toArray(
84 new IndexingFilter[0]));
85 /* Otherwise run the filters in the required order */
86 } else {
87 ArrayList<IndexingFilter> filters = new ArrayList<IndexingFilter>();
88 for (int i = 0; i < orderedFilters.length; i++) {
89 IndexingFilter filter = filterMap
90 .get(orderedFilters[i]);
91 if (filter != null) {
92 filter.addIndexBackendOptions(conf);
93 filters.add(filter);
94 }
95 }
96 objectCache.setObject(IndexingFilter.class.getName(), filters
97 .toArray(new IndexingFilter[filters.size()]));
98 }
99 } catch (PluginRuntimeException e) {
100 throw new RuntimeException(e);
101 }
102 this.indexingFilters = (IndexingFilter[]) objectCache
103 .getObject(IndexingFilter.class.getName());
104 }
105 }
106
107 /** Run all defined filters. */
108 public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
109 Inlinks inlinks) throws IndexingException {
110 for (int i = 0; i < this.indexingFilters.length; i++) {
111 doc = this.indexingFilters[i].filter(doc, parse, url, datum, inlinks);
112 // break the loop if an indexing filter discards the doc
113 if (doc == null) return null;
114 }
115
116 return doc;
117 }
118
119 }