Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: recoinx/clef/CLEFTopicTranslator.java


1   /*
2    * Created on 18.08.2003
3    *
4    */
5   package recoinx.clef;
6   
7   import java.io.File;
8   import java.io.FilenameFilter;
9   import java.io.IOException;
10  import java.io.StringReader;
11  import java.util.Enumeration;
12  import java.util.Hashtable;
13  import java.util.Iterator;
14  import java.util.Vector;
15  
16  import org.apache.log4j.Logger;
17  import org.apache.lucene.analysis.Token;
18  import org.apache.lucene.analysis.WhitespaceTokenizer;
19  import org.dom4j.Document;
20  import org.dom4j.DocumentException;
21  import org.dom4j.Element;
22  import org.dom4j.io.SAXReader;
23  
24  import recoin.container.RetrievalContainer;
25  import recoin.exception.ComponentRunnableException;
26  import recoin.exception.InitiationException;
27  import recoin.group.Component;
28  import recoin.group.ComponentRunnable;
29  import recoin.group.ComponentSupport;
30  import recoin.group.ComponentWorker;
31  
32  /**
33   * The CLEFTopicTranslator is a PreQuery ComponentWorker class that takes the English
34   * topic of a CLEFQuery and uses pre-translated files to fill the CLEFQuery's
35   * translated topics.
36   * @author Jan H. Scheufen
37   * @version 0.2.9
38   */
39  public class CLEFTopicTranslator extends ComponentWorker
40  {
41    /**
42     * The logger of this class.
43     */
44    static Logger logger;
45    /**
46     * The path to the directory where the translations can be found.
47     */
48    private String translationPath = null;
49    /**
50     * The path to the directory where the stopword files can be found.
51     */
52    private String stopwordPath = null;
53    /**
54     * A SAXReader to read XML files.
55     */
56    private SAXReader xmlReader;
57    
58    /**
59     * Creates a new CLEFTopicTranslator.
60     */
61    public CLEFTopicTranslator()
62    {
63      // Initialize the logger for this class.
64      logger = Logger.getLogger( CLEFTopicTranslator.class.getName() );
65    }
66    
67    /**
68     * Initiates this CLEFTopicTranslator using the specified Component.
69     * @see recoin.group.ComponentWorker#initiate(recoin.group.Component)
70     */
71    public void initiate(Component c) throws InitiationException
72    {
73      super.initiate(c);
74      // get attributes
75      Hashtable attributes = component.getAttributes();
76      translationPath = (String)attributes.get("translationPath");
77      stopwordPath = (String)attributes.get("stopwordPath");
78      xmlReader = new SAXReader();
79      logger.debug("CLEFTopicTranslator initiated with attributes translationPath="+translationPath+", stopwordPath="+stopwordPath);
80      component.setInitiated( true );
81    }
82    
83    /**
84     * Creates and returns a new TopicTranslatorRunnable with the specified RetrievalContainer.
85     * @param container the RetrievalContainer
86     * @return a TopicTranslatorRunnable
87     * @see recoin.group.ComponentWorker#createComponentRunnable(recoin.container.RetrievalContainer)
88     */
89    public ComponentRunnable createComponentRunnable(RetrievalContainer container)
90    {
91      if( !component.isInitiated() )
92      {
93        try
94        {
95          initiate(component);
96        }
97        catch (InitiationException e)
98        {
99          logger.error(e);
100         return null;
101       }
102     }
103     logger.debug("Returning new TopicTranslatorRunnable.");
104     return new TopicTranslatorRunnable(container, this);
105   }
106 
107   /**
108    * Creates and returns a new TopicTranslatorRunnable with the specified
109    * RetrievalContainer and Vector.
110    * @param container the RetrievalContainer
111    * @param supports a Vector of ComponentSupport objects
112    * @return a TopicTranslatorRunnable
113    * @see recoin.group.ComponentWorker#createComponentRunnable(recoin.container.RetrievalContainer, java.util.Vector)
114    */
115   public ComponentRunnable createComponentRunnable( RetrievalContainer container, Vector supports)
116   {
117     if( !component.isInitiated() )
118     {
119       try
120       {
121         initiate(component);
122       }
123       catch (InitiationException e)
124       {
125         logger.error(e);
126         return null;
127       }
128     }
129     logger.debug("Returning new TopicTranslatorRunnable.");
130     return new TopicTranslatorRunnable( container, (ComponentSupport)supports.firstElement(), this );
131   }
132 
133   /**
134    * Creates and returns a new TopicTranslatorRunnable with the specified
135    * RetrievalContainer, Vector and ComponentRunnable.
136    * @param container the RetrievalContainer
137    * @param supports a Vector of ComponentSupport objects
138    * @param runnable a ComponentRunnable
139    * @return a TopicTranslatorRunnable
140    * @see recoin.group.ComponentWorker#createComponentRunnable(recoin.container.RetrievalContainer, java.util.Vector, recoin.group.ComponentRunnable)
141    */
142   public ComponentRunnable createComponentRunnable( RetrievalContainer container, Vector supports, ComponentRunnable runnable)
143   {
144     if( !component.isInitiated() )
145     {
146       try
147       {
148         initiate(component);
149       }
150       catch (InitiationException e)
151       {
152         logger.error(e);
153         return null;
154       }
155     }
156     logger.debug("Returning new TopicTranslatorRunnable.");
157     return new TopicTranslatorRunnable( container, (ComponentSupport)supports.firstElement(), runnable, this );
158   }
159 
160   /**
161    * The TopicTranslatorRunnable class implements the special behavior to translate
162    * the English topic of a CLEFQuery into German, Spanish and French and complete
163    * the CLEFQuery.
164    */
165   protected class TopicTranslatorRunnable extends ComponentRunnable 
166   {
/**
 * Builds a runnable that only knows its container and its worker.
 * @param container the RetrievalContainer passed on to the superclass
 * @param worker the ComponentWorker passed on to the superclass
 */
public TopicTranslatorRunnable(RetrievalContainer container, ComponentWorker worker) {
  super(container, worker);
}

/**
 * Builds a runnable for the given container and support object.
 * @param rc the RetrievalContainer passed on to the superclass
 * @param cs the ComponentSupport passed on to the superclass
 * @param worker the ComponentWorker passed on to the superclass
 */
public TopicTranslatorRunnable(RetrievalContainer rc, ComponentSupport cs, ComponentWorker worker) {
  super(rc, cs, worker);
}

/**
 * Builds a runnable for the given container, support object and
 * ComponentRunnable.
 * @param rc the RetrievalContainer passed on to the superclass
 * @param cs the ComponentSupport passed on to the superclass
 * @param r the ComponentRunnable passed on to the superclass
 * @param worker the ComponentWorker passed on to the superclass
 */
public TopicTranslatorRunnable(RetrievalContainer rc, ComponentSupport cs, ComponentRunnable r, ComponentWorker worker) {
  super(rc, cs, r, worker);
}
201 
202     /**
203      * The run-method of this TopicTranslatorRunnable.<br>
204      * It takes the DocNo of the CLEFQuery and uses it to look up the different
205      * translations of the topic in the translation files. The translations are
206      * then stored as topics in the CLEFQuery. 
207      */
208     public void run() 
209     {
210       logger.debug(this.getClass().getName()+" started.");
211       if( support instanceof CLEFQuery )  // Verarbeitung eines bestimmten Query
212       {
213         if( translationPath != null && stopwordPath != null )
214         {
215           logger.info("Processing CLEFQuery.");
216           CLEFAnalyzer analyzer = new CLEFAnalyzer( stopwordPath );
217           
218           CLEFQuery query = (CLEFQuery) support;
219           // get english topic
220           CLEFQuery.CLEFTopic topic = query.getTopic(CLEFConstants.ENGLISH);
221           if( !topic.getTopicContent().equals("") )
222           {
223             // get topic content
224             String queryString = topic.getTopicContent();
225             logger.debug("Translating english topic '"+queryString+"'");
226             // replace original english content with stemmed version
227             query.addTopic( analyzer.analyze( queryString , CLEFConstants.ENGLISH ), CLEFConstants.ENGLISH );
228 
229             String translatedTopic;
230             // translate and add topics in other languages
231             translatedTopic = getTopicTranslated( query.getTopicNumber(), CLEFConstants.GERMAN );
232             query.addTopic( analyzer.analyze( translatedTopic , CLEFConstants.GERMAN ), CLEFConstants.GERMAN );
233 
234             translatedTopic = getTopicTranslated( query.getTopicNumber(), CLEFConstants.SPANISH );
235             query.addTopic( analyzer.analyze( translatedTopic , CLEFConstants.SPANISH ), CLEFConstants.SPANISH );
236 
237             translatedTopic = getTopicTranslated( query.getTopicNumber(), CLEFConstants.FRENCH );
238             query.addTopic( analyzer.analyze( translatedTopic , CLEFConstants.FRENCH ), CLEFConstants.FRENCH );
239             
240             component.markComponentSupport( query );
241           } 
242           else
243           {  
244             logger.warn("Cannot process CLEFQuery! English topic is empty!");
245             container.addException( new ComponentRunnableException("The provided ComponentSupport cannot be processed.") );
246           }
247         }
248         else
249         {
250           logger.error("Couldn't find path to translation files and stopword files!");
251         }
252       }
253       else
254       {
255         logger.warn("Cannot process ComponentSupport "+support.getClass().getName());
256         container.addException( new ComponentRunnableException("The provided ComponentSupport cannot be processed.") );
257       }
258       
259       setFinished(true);
260     }
261 
262     /**
263      * Returns a translation of the topic with the specified number in the specified
264      * language.
265      * <br><br>
266      * A Filter will look for files starting with 'DE_', 'ES_' or 'FR_' depending
267      * on the language and gather all translations for the topic. This implies that
268      * there may be several translated files for one language. The translations are
269      * merged and doublette tokens are removed before the translation is returned. 
270      * @param queryString the topic to translate
271      * @param language the target language
272      * @return the translated topic
273      */
274     private String getTopicTranslated(String topicNumber, int language)
275     {
276       logger.debug("Looking for translations for topic no. '"+topicNumber+"'");
277       // Find directory with translations
278       File dir = new File( translationPath );
279       if( dir.isDirectory() )
280       {
281         FilenameFilter filter;
282         // switch statement creates FilenameFilter
283         switch( language )
284         {
285           case CLEFConstants.GERMAN:
286             filter = new FilenameFilter(){
287               public boolean accept(File dir, String name)
288               {  return name.startsWith("DE_"); } };
289             break;
290           case CLEFConstants.SPANISH:
291             filter = new FilenameFilter(){
292               public boolean accept(File dir, String name)
293               {  return name.startsWith("ES_"); } };
294             break;
295           case CLEFConstants.FRENCH:
296             filter = new FilenameFilter(){
297               public boolean accept(File dir, String name)
298               {  return name.startsWith("FR_"); } };
299             break;
300           default:
301             filter = null;
302         }
303         
304         // get filtered file list
305         File[] files = dir.listFiles( filter );
306         String[] translations = new String[files.length];
307          
308          // gather translations from files
309         for( int i=0; i<files.length; i++ )
310         {
311           try
312           {
313             Document doc = createDocument( files[i] );
314             Iterator topicIter = getTopicIterator( doc );
315             while( topicIter.hasNext() )
316             {
317               Element topic = (Element)topicIter.next();
318               // find topic with specified topicNumber
319               if( topic.element("num").getTextTrim().equals( topicNumber ) )
320               {
321                 String languagePrefix = "";
322                 switch( language )
323                 {
324                   case CLEFConstants.GERMAN:
325                     languagePrefix = "DE-";
326                     break;
327                   case CLEFConstants.ENGLISH:
328                     languagePrefix = "EN-";
329                     break;
330                   case CLEFConstants.SPANISH:
331                     languagePrefix = "ES-";
332                     break;
333                   case CLEFConstants.FRENCH:
334                     languagePrefix = "FR-";
335                     break;
336                 }
337   
338                 translations[i] = topic.element(languagePrefix+"desc").getText();
339               }
340             }    
341             logger.debug("Found translation for topic no. '"+topicNumber+"' in file '"+files[i].getName()+"'");
342           }
343           catch (DocumentException e)
344           {
345             logger.error("Couldn't create Document!",e);
346           }
347         }
348         
349         // return merged translations
350         return mergeTranslations( translations );
351       }
352       else
353       {
354         logger.error("Specified path to translation files is not a directory! Path: "+translationPath);
355       }
356       return null;
357     }
358     
359     /**
360      * Merges the translations in the specified String[] into one String. Doublettes
361      * are eliminated and the merged translation is returned.
362      * @param translations the individual translations in a String[]
363      * @return the merged translation
364      */
365     private String mergeTranslations( String[] translations )
366     {
367       String merged = "";
368       // create the vector that holds the unique tokens off all translations.
369       Vector tokens = new Vector();
370       for( int x=0; x<translations.length; x++ )
371       {
372         WhitespaceTokenizer tokenizer = new WhitespaceTokenizer( new StringReader(translations[x]) );
373         Token token = null;
374         do
375         {
376           try
377           {
378             token = tokenizer.next();
379           }
380           catch (IOException e)
381           {
382             logger.error("IOException in getAnalyzedString.", e);
383           }
384           if( token != null )
385           {
386             // add content of token to vector.
387             if( !tokens.contains( token.termText() ) )
388               tokens.add( token.termText() );
389             else
390               logger.debug("Doublette token found while merging translations. Token: "+token.termText());
391           }
392         }
393         while( token != null );
394       }
395       
396       // get Strings from vector
397       for( Enumeration words = tokens.elements(); words.hasMoreElements(); )
398       {
399         String word = (String)words.nextElement();
400         merged += word;
401         if( words.hasMoreElements() )
402           merged += " ";
403       }
404 
405       logger.debug(translations.length+" translations merged into '"+merged+"'");
406       return merged;
407     }
408 
409     /**
410      * Returns a org.dom4j.Document that represents the specified file.
411      * @param file the XML file to parse
412      * @return the document representation
413      * @throws org.dom4j.DocumentException
414      */
415     private Document createDocument(File file) throws DocumentException 
416     {
417       Document document = null;
418       try
419       {
420           document = xmlReader.read(file);
421       }
422       catch( java.net.MalformedURLException e )
423       {}
424       return document;    
425     }
426 
427     private Iterator getTopicIterator( Document doc )
428     {
429       Element root = doc.getRootElement();
430       return root.elementIterator("top");
431     }
432     
433   }
434   
435 }