Source code: recoinx/clef/irs/lucene/CLEFLuceneAdapter.java
1 /*
2 * Created on 15.08.2003
3 *
4 */
5 package recoinx.clef.irs.lucene;
6
7 import java.io.IOException;
8 import java.util.Hashtable;
9 import java.util.Vector;
10
11 import org.apache.log4j.Logger;
12 import org.apache.lucene.analysis.SimpleAnalyzer;
13 import org.apache.lucene.document.Document;
14 import org.apache.lucene.queryParser.ParseException;
15 import org.apache.lucene.queryParser.QueryParser;
16 import org.apache.lucene.search.Hits;
17 import org.apache.lucene.search.IndexSearcher;
18
19 import recoin.container.RetrievalContainer;
20 import recoin.exception.ComponentRunnableException;
21 import recoin.exception.InitiationException;
22 import recoin.group.Component;
23 import recoin.group.ComponentRunnable;
24 import recoin.group.ComponentSupport;
25 import recoin.group.ComponentWorker;
26
27 import recoinx.clef.*;
28 import recoinx.clef.CLEFConstants;
29 import recoinx.clef.CLEFQuery;
30 import recoinx.clef.CLEFResult;
31 import recoinx.clef.CLEFResultList;
32 import recoinx.clef.ScoreNormalizer;
33
34 /**
35 * Abstract super class for ComponentWorkers that use the Lucene search engine
36 * for retrieval.
37 * <br><br>
38 * This class provides attributes and methods to search the indexes of the
39 * Lucene search engine.
40 * @author Jan H. Scheufen
41 * @version 0.2.9
42 */
43 public abstract class CLEFLuceneAdapter extends ComponentWorker
44 {
45 /**
46 * The logger for this class.
47 */
48 static Logger logger;
49 /**
50 * The IndexSearcher.
51 */
52 public IndexSearcher searcher;
53 /**
54 * Specifies the weight that should be applied to the results found in
55 * the 'title' field of the collections.
56 * <br><br>
57 * This parameter is overwritten if a an attribute with the same name
58 * exists in the repository for this Component. Default is '1.0'.
59 */
60 protected float titleWeight = 1.0f;
61 /**
62 * Specifies the weight that should be applied to the results found in
63 * the 'text' field of the collections.
64 * <br><br>
65 * This parameter is overwritten if a an attribute with the same name
66 * exists in the repository for this Component. Default is '1.0'.
67 */
68 protected float textWeight = 1.0f;
69 /**
70 * The path of the indexes.
71 */
72 protected String indexPath;
73 /**
74 * Flag to signal whether any results should be normalized.
75 * <br><br>
76 * This parameter is overwritten if a an attribute with the same name
77 * exists in the repository for this Component. Default is 'false'.
78 */
79 protected boolean normalize = false;
80
81 /**
82 * Creates a new CLEFLuceneAdapter.
83 */
84 public CLEFLuceneAdapter()
85 {
86 // Initialize the logger for this class.
87 logger = Logger.getLogger( CLEFLuceneAdapter.class.getName() );
88 }
89
90 /**
91 * Initiates this CLEFLuceneAdapter by passing the specified Component to the
92 * superclass's initiate(Component) method and by initializing this class's
93 * attributes.
94 * @param c the Component
95 * @see recoin.group.ComponentWorker#initiate(recoin.group.Component)
96 */
97 public void initiate( Component c ) throws InitiationException
98 {
99 super.initiate( c );
100 Hashtable attributes = component.getAttributes();
101
102 if( attributes.get("normalize") != null )
103 this.normalize = Boolean.getBoolean( (String)attributes.get("normalize") );
104 if( attributes.get("titleWeight") != null )
105 this.titleWeight = Float.parseFloat( (String)attributes.get("titleWeight") );
106 if( attributes.get("textWeight") != null )
107 this.textWeight = Float.parseFloat( (String)attributes.get("textWeight") );
108
109 // indexPath is a mandatory attribute to be set in the repository
110 if( attributes.get("indexPath") != null )
111 indexPath = (String)attributes.get("indexPath");
112 else
113 {
114 logger.error("Attribute 'indexPath' not specified.");
115 throw new InitiationException("CLEFLuceneAdapter ChainID:"+component.generateChainID()+" not initiated correctly! Attribute 'indexPath' is missing.");
116 }
117
118 }
119
120 /**
121 * Queries the Lucene index found at the specified path corresponding to the
122 * specified index using the specified query. The specified field determines
123 * the field of the indexed documents that should be searched.
124 * <br><br>
125 * The method creates a new CLEFResultList that has its attributes set to represent
126 * the collection the results were found in.
127 * @param indexPath the path where the index can be found
128 * @param field the document field to query
129 * @param queryString the query
130 * @param index the the index
131 * @return a CLEFResultList
132 */
133 protected CLEFResultList executeQuery( String indexPath, String field, String queryString, int index )
134 {
135 String name = "";
136 String indexName = "";
137 int collection = -1;
138 switch( index )
139 {
140 case CLEFConstants.SP9495:
141 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.SP9495];
142 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.SP9495];
143 collection = CLEFConstants.SP9495;
144 break;
145
146 case CLEFConstants.SDA94:
147 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.SDA94];
148 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.SDA94];
149 collection = CLEFConstants.SDA94;
150 break;
151
152 case CLEFConstants.SDA95:
153 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.SDA95];
154 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.SDA95];
155 collection = CLEFConstants.SDA95;
156 break;
157
158 case CLEFConstants.FR94:
159 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.FR94];
160 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.FR94];
161 collection = CLEFConstants.FR94;
162 break;
163
164 case CLEFConstants.LAT94:
165 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.LAT94];
166 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.LAT94];
167 collection = CLEFConstants.LAT94;
168 break;
169
170 case CLEFConstants.GH95:
171 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.GH95];
172 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.GH95];
173 collection = CLEFConstants.GH95;
174 break;
175
176 case CLEFConstants.LM94:
177 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.LM94];
178 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.LM94];
179 collection = CLEFConstants.LM94;
180 break;
181
182 case CLEFConstants.ATS94:
183 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.ATS94];
184 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.ATS94];
185 collection = CLEFConstants.ATS94;
186 break;
187
188 case CLEFConstants.ATS95:
189 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.ATS95];
190 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.ATS95];
191 collection = CLEFConstants.ATS95;
192 break;
193
194 case CLEFConstants.EFE94:
195 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.EFE94];
196 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.EFE94];
197 collection = CLEFConstants.EFE94;
198 break;
199
200 case CLEFConstants.EFE95:
201 name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.EFE95];
202 indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.EFE95];
203 collection = CLEFConstants.EFE95;
204 break;
205
206 default:
207 logger.warn("Unable to find statement for table="+CLEFConstants.COLLECTIONTABLES[index]);
208 return null;
209 }
210
211 // this method takes care of setting the list's data correctly, except for the topic and language.
212 CLEFResultList list = new CLEFResultList();
213 list.setName( name );
214 list.setCollection( collection );
215 // chainID setzen
216 component.markComponentSupport( list );
217
218 try
219 {
220 searcher = new IndexSearcher( indexPath+System.getProperty("file.separator")+indexName );
221 Hits hits = searcher.search( QueryParser.parse( queryString, field, new SimpleAnalyzer()) );
222 logger.debug(hits.length()+" Hits found in collection '"+CLEFConstants.COLLECTIONTABLES[collection]+"'.");
223 if( hits.length() > 0 )
224 {
225 Document doc;
226 float score;
227 for( int x=0; x<hits.length(); x++ )
228 {
229 doc = hits.doc(x);
230 score = hits.score(x);
231 String doc_no = doc.get("doc_id");
232 list.addResult( new CLEFResult( list, doc_no, score ) );
233 }
234 }
235 }
236 catch (IOException e)
237 {
238 logger.error("IOException while creating CLEFResultList!", e);
239 }
240 catch (ParseException e)
241 {
242 logger.error("ParseException while creating CLEFResultList!", e);
243 }
244 return list;
245 }
246
247 /**
248 * The LuceneRunnable class implements the special behavior to query the indexes.
249 * It behaves differently, i.e. it searches different collections, according to
250 * the language with which it is created.
251 */
252 protected class LuceneRunnable extends ComponentRunnable
253 {
254 /**
255 * Language to use for retrieval.
256 * @see CLEFConstants
257 */
258 private int language;
259
260 /**
261 * Creates a new LuceneRunnable with the specified RetrievalContainer and
262 * ComponentWorker that performs retrieval for the specified language.
263 * @param container the RetrievalContainer
264 * @param worker the ComponentWorker
265 * @param lang the language
266 */
267 public LuceneRunnable(RetrievalContainer container, ComponentWorker worker, int lang)
268 {
269 super(container, worker);
270 this.language = lang;
271 }
272
273 /**
274 * Creates a new LuceneRunnable with the specified RetrievalContainer,
275 * ComponentSupport and ComponentWorker that performs retrieval for the
276 * specified language.
277 * @param container the RetrievalContainer
278 * @param support the ComponentSupport
279 * @param worker the ComponentWorker
280 * @param lang the language
281 */
282 public LuceneRunnable(RetrievalContainer container, ComponentSupport support, ComponentWorker worker, int lang)
283 {
284 super(container, support, worker);
285 this.language = lang;
286 }
287
288 /**
289 * Creates a new LuceneRunnable with the specified RetrievalContainer,
290 * ComponentSupport, ComponentRunnable and ComponentWorker that performs
291 * retrieval for the specified language.
292 * @param container the RetrievalContainer
293 * @param support the ComponentSupport
294 * @param runnable ComponentRunnable
295 * @param worker the ComponentWorker
296 * @param lang the language
297 */
298 public LuceneRunnable( RetrievalContainer container, ComponentSupport support, ComponentRunnable runnable, ComponentWorker worker, int lang )
299 {
300 super(container, support, runnable, worker);
301 this.language = lang;
302 }
303
304 /**
305 * Performs a search over indexes in a Lucene database according to the language
306 * with which it was created.
307 * <br><br>
308 * First the indexes, i.e. the collections which are
309 * to be searched are determined. Then, for each collection, a search is performed
310 * querying the 'title' and 'text' document fields. These two result lists are
311 * merged according to the specified weights (<code>titleWeight</code>,
312 * <code>textWeight</code>) for the document fields. At the end, all result lists
313 * from different collections are merged into one CLEFResultList.
314 */
315 public void run()
316 {
317 logger.debug(this.getClass().getName()+" started.");
318 if( support instanceof CLEFQuery ) // Verarbeitung eines bestimmten Query
319 {
320 logger.info("Processing CLEFQuery ChainID: "+support.getChainID());
321 CLEFQuery query = (CLEFQuery) support;
322 if( query.getTopic(this.language) != null )
323 {
324 // find the right topic
325 CLEFQuery.CLEFTopic topic = query.getTopic(this.language);
326
327 CLEFResultList titleList = null;
328 CLEFResultList textList = null;
329 CLEFResultList mergedList = null;
330 Vector resultLists = new Vector();
331
332 CLEFResultListMerger merger;
333 // create CLEFResultListMerger that uses cut off if necessary
334 if( query.getCutOff() > 0 )
335 merger = new CLEFResultListMerger( query.getCutOff() );
336 else
337 merger = new CLEFResultListMerger();
338
339 // get tables for language
340 int[] tables = CLEFConstants.getCollectionsByLanguage( language );
341
342 for( int x = 0; x < tables.length; x++ )
343 {
344 // Execute queries with searchstring
345 titleList = executeQuery( indexPath, "title", topic.getTopicContent(), tables[x] );
346 textList = executeQuery( indexPath, "text", topic.getTopicContent(), tables[x] );
347 // Merge lists and add them to container.
348 if( titleList != null && textList != null )
349 {
350
351 if( titleList.getResultCount()+textList.getResultCount() > 0 )
352 {
353 CLEFResultList tmpList = merger.mergeResultLists( titleList, textList, titleWeight, textWeight );
354 // ResultList is only added if Results were found.
355 if( tmpList.getResultCount() > 0 )
356 {
357 // normalize list if specified
358 if( normalize )
359 tmpList = ScoreNormalizer.normalize( tmpList );
360
361 // set the list's topic! Important, because it contains data about language, etc.
362 tmpList.setTopic( topic );
363
364 // also shorten lists now to speed up merging.
365 if( query.getCutOff() > 0 && query.getCutOff() < tmpList.getResultCount() )
366 {
367 logger.debug("Reducing ResultList from "+tmpList.getResultCount()+" to "+query.getCutOff()+" Results.");
368 tmpList.setResults( new Vector (tmpList.getResults().subList(0, query.getCutOff()) ) );
369 }
370 resultLists.add( tmpList );
371 }
372 }
373 else
374 logger.debug("ResultList not added to container.");
375 }
376 }
377
378 // merge the lists into one and store in container
379 mergedList = merger.mergeResultLists( resultLists );
380 // ResultList is only added if Results were found. Also, the merger
381 // might return NULL if there weren't enough ResultLists.
382 if( mergedList != null && mergedList.getResultCount() > 0 )
383 {
384 // set necessary data in mergedList
385 mergedList.setTopic( topic );
386 component.markComponentSupport( mergedList );
387 logger.debug("Adding CLEFResultList to container.");
388 container.addComponentSupport( mergedList );
389 }
390
391 }
392 else
393 {
394 logger.warn("Cannot process ComponentSupport "+support.getClass().getName()+". The language type does not match!");
395 container.addException( new ComponentRunnableException("The provided ComponentSupport cannot be processed.") );
396 }
397 }
398 else
399 {
400 logger.warn("Cannot process ComponentSupport "+support.getClass().getName());
401 container.addException( new ComponentRunnableException("The provided ComponentSupport cannot be processed.") );
402 }
403
404 setFinished(true);
405 }
406 }
407 }