Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: recoinx/clef/irs/lucene/CLEFLuceneAdapter.java


1   /*
2    * Created on 15.08.2003
3    *
4    */
5   package recoinx.clef.irs.lucene;
6   
7   import java.io.IOException;
8   import java.util.Hashtable;
9   import java.util.Vector;
10  
11  import org.apache.log4j.Logger;
12  import org.apache.lucene.analysis.SimpleAnalyzer;
13  import org.apache.lucene.document.Document;
14  import org.apache.lucene.queryParser.ParseException;
15  import org.apache.lucene.queryParser.QueryParser;
16  import org.apache.lucene.search.Hits;
17  import org.apache.lucene.search.IndexSearcher;
18  
19  import recoin.container.RetrievalContainer;
20  import recoin.exception.ComponentRunnableException;
21  import recoin.exception.InitiationException;
22  import recoin.group.Component;
23  import recoin.group.ComponentRunnable;
24  import recoin.group.ComponentSupport;
25  import recoin.group.ComponentWorker;
26  
27  import recoinx.clef.*;
28  import recoinx.clef.CLEFConstants;
29  import recoinx.clef.CLEFQuery;
30  import recoinx.clef.CLEFResult;
31  import recoinx.clef.CLEFResultList;
32  import recoinx.clef.ScoreNormalizer;
33  
34  /**
35   * Abstract super class for ComponentWorkers that use the Lucene search engine
36   * for retrieval.
37   * <br><br>
38   * This class provides attributes and methods to search the indexes of the
39   * Lucene search engine.
40   * @author Jan H. Scheufen
41   * @version 0.2.9
42   */
43  public abstract class CLEFLuceneAdapter extends ComponentWorker
44  {
45    /**
46     * The logger for this class.
47     */
48    static Logger logger;
49    /**
50     * The IndexSearcher.
51     */
52    public IndexSearcher searcher;
53    /**
54     * Specifies the weight that should be applied to the results found in
55     * the 'title' field of the collections.
56     * <br><br>
57     * This parameter is overwritten if a an attribute with the same name
58     * exists in the repository for this Component. Default is '1.0'. 
59     */  
60    protected float titleWeight = 1.0f;
61    /**
62     * Specifies the weight that should be applied to the results found in
63     * the 'text' field of the collections.
64     * <br><br>
65     * This parameter is overwritten if a an attribute with the same name
66     * exists in the repository for this Component. Default is '1.0'. 
67     */  
68    protected float textWeight = 1.0f;
69    /**
70     * The path of the indexes.
71     */
72    protected String indexPath;
73    /**
74     * Flag to signal whether any results should be normalized.
75     * <br><br>
76     * This parameter is overwritten if a an attribute with the same name
77     * exists in the repository for this Component. Default is 'false'. 
78     */
79    protected boolean normalize = false;
80    
81    /**
82     * Creates a new CLEFLuceneAdapter.
83     */
84    public CLEFLuceneAdapter()
85    {
86      // Initialize the logger for this class.
87      logger = Logger.getLogger( CLEFLuceneAdapter.class.getName() );
88    }
89    
90    /**
91     * Initiates this CLEFLuceneAdapter by passing the specified Component to the
92     * superclass's initiate(Component) method and by initializing this class's
93     * attributes.
94     * @param c the Component
95     * @see recoin.group.ComponentWorker#initiate(recoin.group.Component)
96     */
97    public void initiate( Component c ) throws InitiationException
98    {
99      super.initiate( c );
100     Hashtable attributes = component.getAttributes();
101 
102     if( attributes.get("normalize") != null )
103       this.normalize = Boolean.getBoolean( (String)attributes.get("normalize") );
104     if( attributes.get("titleWeight") != null )  
105       this.titleWeight = Float.parseFloat( (String)attributes.get("titleWeight") );
106     if( attributes.get("textWeight") != null )
107       this.textWeight = Float.parseFloat( (String)attributes.get("textWeight") );
108     
109     // indexPath is a mandatory attribute to be set in the repository
110     if( attributes.get("indexPath") != null )
111       indexPath = (String)attributes.get("indexPath");
112     else
113     {
114       logger.error("Attribute 'indexPath' not specified.");
115       throw new InitiationException("CLEFLuceneAdapter ChainID:"+component.generateChainID()+" not initiated correctly! Attribute 'indexPath' is missing.");
116     }
117       
118   }
119   
120   /**
121    * Queries the Lucene index found at the specified path corresponding to the
122    * specified index using the specified query. The specified field determines
123    * the field of the indexed documents that should be searched.
124    * <br><br>
125    * The method creates a new CLEFResultList that has its attributes set to represent
126    * the collection the results were found in.
127    * @param indexPath the path where the index can be found
128    * @param field the document field to query
129    * @param queryString the query
130    * @param index the the index
131    * @return a CLEFResultList
132    */
133   protected CLEFResultList executeQuery( String indexPath, String field, String queryString, int index )
134   {
135     String name = "";
136     String indexName = "";
137     int collection = -1;
138     switch( index )
139     {
140       case CLEFConstants.SP9495:
141         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.SP9495];
142         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.SP9495];
143         collection = CLEFConstants.SP9495;
144         break;
145 
146       case CLEFConstants.SDA94:
147         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.SDA94];
148         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.SDA94];
149         collection = CLEFConstants.SDA94;
150         break;
151 
152       case CLEFConstants.SDA95:
153         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.SDA95];
154         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.SDA95];
155         collection = CLEFConstants.SDA95;
156         break;
157 
158       case CLEFConstants.FR94:
159         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.FR94];
160         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.FR94];
161         collection = CLEFConstants.FR94;
162         break;
163 
164       case CLEFConstants.LAT94:
165         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.LAT94];
166         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.LAT94];
167         collection = CLEFConstants.LAT94;
168         break;
169 
170       case CLEFConstants.GH95:
171         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.GH95];
172         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.GH95];
173         collection = CLEFConstants.GH95;
174         break;
175 
176       case CLEFConstants.LM94:
177         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.LM94];
178         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.LM94];
179         collection = CLEFConstants.LM94;
180         break;
181 
182       case CLEFConstants.ATS94:
183         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.ATS94];
184         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.ATS94];
185         collection = CLEFConstants.ATS94;
186         break;
187 
188       case CLEFConstants.ATS95:
189         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.ATS95];
190         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.ATS95];
191         collection = CLEFConstants.ATS95;
192         break;
193 
194       case CLEFConstants.EFE94:
195         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.EFE94];
196         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.EFE94];
197         collection = CLEFConstants.EFE94;
198         break;
199 
200       case CLEFConstants.EFE95:
201         name = CLEFConstants.COLLECTIONNAMES[CLEFConstants.EFE95];
202         indexName = CLEFConstants.COLLECTIONTABLES[CLEFConstants.EFE95];
203         collection = CLEFConstants.EFE95;
204         break;
205 
206       default:
207         logger.warn("Unable to find statement for table="+CLEFConstants.COLLECTIONTABLES[index]);
208         return null;
209     }
210     
211     // this method takes care of setting the list's data correctly, except for the topic and language.
212     CLEFResultList list = new CLEFResultList();
213     list.setName( name );
214     list.setCollection( collection );
215     // chainID setzen
216     component.markComponentSupport( list );
217 
218     try
219     {
220       searcher = new IndexSearcher( indexPath+System.getProperty("file.separator")+indexName );
221       Hits hits = searcher.search( QueryParser.parse( queryString, field, new SimpleAnalyzer()) );
222       logger.debug(hits.length()+" Hits found in collection '"+CLEFConstants.COLLECTIONTABLES[collection]+"'.");
223       if( hits.length() > 0 )
224       {
225         Document doc;
226         float score;
227         for( int x=0; x<hits.length(); x++ )
228         {
229           doc = hits.doc(x);
230           score = hits.score(x);   
231           String doc_no = doc.get("doc_id");
232           list.addResult( new CLEFResult( list, doc_no, score ) );
233         }
234       }
235     }
236     catch (IOException e)
237     {
238       logger.error("IOException while creating CLEFResultList!", e);
239     }
240     catch (ParseException e)
241     {
242       logger.error("ParseException while creating CLEFResultList!", e);
243     }
244     return list;
245   }
246 
247   /**
248    * The LuceneRunnable class implements the special behavior to query the indexes.
249    * It behaves differently, i.e. it searches different collections, according to
250    * the language with which it is created.
251    */
252   protected class LuceneRunnable extends ComponentRunnable 
253   {
254     /**
255      * Language to use for retrieval.
256      * @see CLEFConstants
257      */
258     private int language;
259     
260     /**
261      * Creates a new LuceneRunnable with the specified RetrievalContainer and
262      * ComponentWorker that performs retrieval for the specified language.
263      * @param container the RetrievalContainer
264      * @param worker the ComponentWorker
265      * @param lang the language
266      */
267     public LuceneRunnable(RetrievalContainer container, ComponentWorker worker, int lang)
268     {
269       super(container, worker);
270       this.language = lang;
271     }
272     
273     /**
274      * Creates a new LuceneRunnable with the specified RetrievalContainer,
275      * ComponentSupport and ComponentWorker that performs retrieval for the
276      * specified language.
277      * @param container the RetrievalContainer
278      * @param support the ComponentSupport
279      * @param worker the ComponentWorker
280      * @param lang the language
281      */
282     public LuceneRunnable(RetrievalContainer container, ComponentSupport support, ComponentWorker worker, int lang) 
283     {
284       super(container, support, worker);
285       this.language = lang;
286     }
287     
288     /**
289      * Creates a new LuceneRunnable with the specified RetrievalContainer,
290      * ComponentSupport, ComponentRunnable and ComponentWorker that performs
291      * retrieval for the specified language.
292      * @param container the RetrievalContainer
293      * @param support the ComponentSupport
294      * @param runnable ComponentRunnable
295      * @param worker the ComponentWorker
296      * @param lang the language
297      */
298     public LuceneRunnable( RetrievalContainer container, ComponentSupport support, ComponentRunnable runnable, ComponentWorker worker, int lang )
299     {
300       super(container, support, runnable, worker);
301       this.language = lang;
302     }
303 
304     /**
305      * Performs a search over indexes in a Lucene database according to the language
306      * with which it was created.
307      * <br><br>
308      * First the indexes, i.e. the collections which are
309      * to be searched are determined. Then, for each collection, a search is performed
310      * querying the 'title' and 'text' document fields. These two result lists are
311      * merged according to the specified weights (<code>titleWeight</code>,
312      * <code>textWeight</code>) for the document fields. At the end, all result lists
313      * from different collections are merged into one CLEFResultList.
314      */
315     public void run() 
316     {
317       logger.debug(this.getClass().getName()+" started.");
318       if( support instanceof CLEFQuery )  // Verarbeitung eines bestimmten Query
319       {
320         logger.info("Processing CLEFQuery ChainID: "+support.getChainID());
321         CLEFQuery query = (CLEFQuery) support;
322         if( query.getTopic(this.language) != null )
323         {
324           // find the right topic
325           CLEFQuery.CLEFTopic topic = query.getTopic(this.language);
326 
327           CLEFResultList titleList = null;
328           CLEFResultList textList = null;
329           CLEFResultList mergedList = null;
330           Vector resultLists = new Vector();
331           
332           CLEFResultListMerger merger;
333           // create CLEFResultListMerger that uses cut off if necessary
334           if( query.getCutOff() > 0 )
335             merger = new CLEFResultListMerger( query.getCutOff() );
336           else
337             merger = new CLEFResultListMerger();
338 
339           // get tables for language
340           int[] tables = CLEFConstants.getCollectionsByLanguage( language );
341           
342           for( int x = 0; x < tables.length; x++ )
343           {
344             // Execute queries with searchstring
345             titleList = executeQuery( indexPath, "title", topic.getTopicContent(), tables[x] );
346             textList = executeQuery( indexPath, "text", topic.getTopicContent(), tables[x] );
347             // Merge lists and add them to container.
348             if( titleList != null && textList != null )
349             {
350               
351               if( titleList.getResultCount()+textList.getResultCount() > 0 )
352               {
353                 CLEFResultList tmpList = merger.mergeResultLists( titleList, textList, titleWeight, textWeight );
354                 // ResultList is only added if Results were found.
355                 if( tmpList.getResultCount() > 0 )
356                 {                    
357                   // normalize list if specified
358                   if( normalize )
359                     tmpList = ScoreNormalizer.normalize( tmpList );
360                   
361                   // set the list's topic! Important, because it contains data about language, etc.
362                   tmpList.setTopic( topic );
363 
364                   // also shorten lists now to speed up merging.
365                   if( query.getCutOff() > 0 && query.getCutOff() < tmpList.getResultCount() )
366                   {
367                     logger.debug("Reducing ResultList from "+tmpList.getResultCount()+" to "+query.getCutOff()+" Results.");
368                     tmpList.setResults( new Vector (tmpList.getResults().subList(0, query.getCutOff()) ) );
369                   }
370                   resultLists.add( tmpList );
371                 }
372               }
373               else
374                 logger.debug("ResultList not added to container.");
375             }
376           }
377 
378           // merge the lists into one and store in container
379           mergedList = merger.mergeResultLists( resultLists );
380           // ResultList is only added if Results were found. Also, the merger
381           // might return NULL if there weren't enough ResultLists.
382           if( mergedList != null && mergedList.getResultCount() > 0 )
383           {
384             // set necessary data in mergedList
385             mergedList.setTopic( topic );
386             component.markComponentSupport( mergedList );
387             logger.debug("Adding CLEFResultList to container.");
388             container.addComponentSupport( mergedList );
389           }
390 
391         }
392         else
393         {  
394           logger.warn("Cannot process ComponentSupport "+support.getClass().getName()+". The language type does not match!");
395           container.addException( new ComponentRunnableException("The provided ComponentSupport cannot be processed.") );
396         }
397       }
398       else
399       {
400         logger.warn("Cannot process ComponentSupport "+support.getClass().getName());
401         container.addException( new ComponentRunnableException("The provided ComponentSupport cannot be processed.") );
402       }
403       
404       setFinished(true);
405     }
406   }
407 }