1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.nutch.collection;
18
19 import java.io.File;
20 import java.io.FileNotFoundException;
21 import java.io.FileOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.net.URL;
25 import java.util.Collection;
26 import java.util.HashMap;
27 import java.util.Iterator;
28 import java.util.Map;
29
30 import org.apache.commons.logging.Log;
31 import org.apache.commons.logging.LogFactory;
32
33 import org.apache.hadoop.conf.Configuration;
34 import org.apache.hadoop.conf.Configured;
35 import org.apache.nutch.util.DomUtil;
36 import org.apache.nutch.util.LogUtil;
37 import org.apache.nutch.util.NutchConfiguration;
38 import org.apache.nutch.util.ObjectCache;
39 import org.apache.xerces.dom.DocumentImpl;
40 import org.w3c.dom.Document;
41 import org.w3c.dom.Element;
42 import org.w3c.dom.NodeList;
43
44 public class CollectionManager extends Configured {
45
46 public static final String DEFAULT_FILE_NAME = "subcollections.xml";
47
48 static final Log LOG = LogFactory.getLog(CollectionManager.class);
49
50 transient Map collectionMap = new HashMap();
51
52 transient URL configfile;
53
54 public CollectionManager(Configuration conf) {
55 super(conf);
56 init();
57 }
58
59 /**
60 * Used for testing
61 */
62 protected CollectionManager(){
63 super(NutchConfiguration.create());
64 }
65
66 protected void init(){
67 try {
68 if (LOG.isInfoEnabled()) { LOG.info("initializing CollectionManager"); }
69 // initialize known subcollections
70 configfile = getConf().getResource(
71 getConf().get("subcollections.config", DEFAULT_FILE_NAME));
72
73 InputStream input = getConf().getConfResourceAsInputStream(
74 getConf().get("subcollections.config", DEFAULT_FILE_NAME));
75 parse(input);
76 } catch (Exception e) {
77 if (LOG.isWarnEnabled()) {
78 LOG.warn("Error occured:" + e);
79 e.printStackTrace(LogUtil.getWarnStream(LOG));
80 }
81 }
82 }
83
84 protected void parse(InputStream input) {
85 Element collections = DomUtil.getDom(input);
86
87 if (collections != null) {
88 NodeList nodeList = collections
89 .getElementsByTagName(Subcollection.TAG_COLLECTION);
90
91 if (LOG.isInfoEnabled()) {
92 LOG.info("file has" + nodeList.getLength() + " elements");
93 }
94
95 for (int i = 0; i < nodeList.getLength(); i++) {
96 Element scElem = (Element) nodeList.item(i);
97 Subcollection subCol = new Subcollection(getConf());
98 subCol.initialize(scElem);
99 collectionMap.put(subCol.name, subCol);
100 }
101 } else if (LOG.isInfoEnabled()) {
102 LOG.info("Cannot find collections");
103 }
104 }
105
106 public static CollectionManager getCollectionManager(Configuration conf) {
107 String key = "collectionmanager";
108 ObjectCache objectCache = ObjectCache.get(conf);
109 CollectionManager impl = (CollectionManager)objectCache.getObject(key);
110 if (impl == null) {
111 try {
112 if (LOG.isInfoEnabled()) {
113 LOG.info("Instantiating CollectionManager");
114 }
115 impl=new CollectionManager(conf);
116 objectCache.setObject(key,impl);
117 } catch (Exception e) {
118 throw new RuntimeException("Couldn't create CollectionManager",e);
119 }
120 }
121 return impl;
122 }
123
124 /**
125 * Returns named subcollection
126 *
127 * @param id
128 * @return Named SubCollection (or null if not existing)
129 */
130 public Subcollection getSubColection(final String id) {
131 return (Subcollection) collectionMap.get(id);
132 }
133
134 /**
135 * Delete named subcollection
136 *
137 * @param id
138 * Id of SubCollection to delete
139 */
140 public void deleteSubCollection(final String id) throws IOException {
141 final Subcollection subCol = getSubColection(id);
142 if (subCol != null) {
143 collectionMap.remove(id);
144 }
145 }
146
147 /**
148 * Create a new subcollection.
149 *
150 * @param name
151 * Name of SubCollection to create
152 * @return Created SubCollection or null if allready existed
153 */
154 public Subcollection createSubCollection(final String id, final String name) {
155 Subcollection subCol = null;
156
157 if (!collectionMap.containsKey(id)) {
158 subCol = new Subcollection(id, name, getConf());
159 collectionMap.put(id, subCol);
160 }
161
162 return subCol;
163 }
164
165 /**
166 * Return names of collections url is part of
167 *
168 * @param url
169 * The url to test against Collections
170 * @return Space delimited string of collection names url is part of
171 */
172 public String getSubCollections(final String url) {
173 String collections = "";
174 final Iterator iterator = collectionMap.values().iterator();
175
176 while (iterator.hasNext()) {
177 final Subcollection subCol = (Subcollection) iterator.next();
178 if (subCol.filter(url) != null) {
179 collections += " " + subCol.name;
180 }
181 }
182 if (LOG.isTraceEnabled()) { LOG.trace("subcollections:" + collections); }
183
184 return collections;
185 }
186
187 /**
188 * Returns all collections
189 *
190 * @return All collections CollectionManager knows about
191 */
192 public Collection getAll() {
193 return collectionMap.values();
194 }
195
196 /**
197 * Save collections into file
198 *
199 * @throws Exception
200 */
201 public void save() throws IOException {
202 try {
203 final FileOutputStream fos = new FileOutputStream(new File(configfile
204 .getFile()));
205 final Document doc = new DocumentImpl();
206 final Element collections = doc
207 .createElement(Subcollection.TAG_COLLECTIONS);
208 final Iterator iterator = collectionMap.values().iterator();
209
210 while (iterator.hasNext()) {
211 final Subcollection subCol = (Subcollection) iterator.next();
212 final Element collection = doc
213 .createElement(Subcollection.TAG_COLLECTION);
214 collections.appendChild(collection);
215 final Element name = doc.createElement(Subcollection.TAG_NAME);
216 name.setNodeValue(subCol.getName());
217 collection.appendChild(name);
218 final Element whiteList = doc
219 .createElement(Subcollection.TAG_WHITELIST);
220 whiteList.setNodeValue(subCol.getWhiteListString());
221 collection.appendChild(whiteList);
222 final Element blackList = doc
223 .createElement(Subcollection.TAG_BLACKLIST);
224 blackList.setNodeValue(subCol.getBlackListString());
225 collection.appendChild(blackList);
226 }
227
228 DomUtil.saveDom(fos, collections);
229 fos.flush();
230 fos.close();
231 } catch (FileNotFoundException e) {
232 throw new IOException(e.toString());
233 }
234 }
235 }