Source code: recoinx/clef/CLEFTopicTranslator.java
1 /*
2 * Created on 18.08.2003
3 *
4 */
5 package recoinx.clef;
6
7 import java.io.File;
8 import java.io.FilenameFilter;
9 import java.io.IOException;
10 import java.io.StringReader;
11 import java.util.Enumeration;
12 import java.util.Hashtable;
13 import java.util.Iterator;
14 import java.util.Vector;
15
16 import org.apache.log4j.Logger;
17 import org.apache.lucene.analysis.Token;
18 import org.apache.lucene.analysis.WhitespaceTokenizer;
19 import org.dom4j.Document;
20 import org.dom4j.DocumentException;
21 import org.dom4j.Element;
22 import org.dom4j.io.SAXReader;
23
24 import recoin.container.RetrievalContainer;
25 import recoin.exception.ComponentRunnableException;
26 import recoin.exception.InitiationException;
27 import recoin.group.Component;
28 import recoin.group.ComponentRunnable;
29 import recoin.group.ComponentSupport;
30 import recoin.group.ComponentWorker;
31
32 /**
33 * The CLEFTopicTranslator is a PreQuery ComponentWorker class that takes the English
34 * topic of a CLEFQuery and uses pre-translated files to fill the CLEFQuery's
35 * translated topics.
36 * @author Jan H. Scheufen
37 * @version 0.2.9
38 */
39 public class CLEFTopicTranslator extends ComponentWorker
40 {
41 /**
42 * The logger of this class.
43 */
44 static Logger logger;
45 /**
46 * The path to the directory where the translations can be found.
47 */
48 private String translationPath = null;
49 /**
50 * The path to the directory where the stopword files can be found.
51 */
52 private String stopwordPath = null;
53 /**
54 * A SAXReader to read XML files.
55 */
56 private SAXReader xmlReader;
57
58 /**
59 * Creates a new CLEFTopicTranslator.
60 */
61 public CLEFTopicTranslator()
62 {
63 // Initialize the logger for this class.
64 logger = Logger.getLogger( CLEFTopicTranslator.class.getName() );
65 }
66
67 /**
68 * Initiates this CLEFTopicTranslator using the specified Component.
69 * @see recoin.group.ComponentWorker#initiate(recoin.group.Component)
70 */
71 public void initiate(Component c) throws InitiationException
72 {
73 super.initiate(c);
74 // get attributes
75 Hashtable attributes = component.getAttributes();
76 translationPath = (String)attributes.get("translationPath");
77 stopwordPath = (String)attributes.get("stopwordPath");
78 xmlReader = new SAXReader();
79 logger.debug("CLEFTopicTranslator initiated with attributes translationPath="+translationPath+", stopwordPath="+stopwordPath);
80 component.setInitiated( true );
81 }
82
83 /**
84 * Creates and returns a new TopicTranslatorRunnable with the specified RetrievalContainer.
85 * @param container the RetrievalContainer
86 * @return a TopicTranslatorRunnable
87 * @see recoin.group.ComponentWorker#createComponentRunnable(recoin.container.RetrievalContainer)
88 */
89 public ComponentRunnable createComponentRunnable(RetrievalContainer container)
90 {
91 if( !component.isInitiated() )
92 {
93 try
94 {
95 initiate(component);
96 }
97 catch (InitiationException e)
98 {
99 logger.error(e);
100 return null;
101 }
102 }
103 logger.debug("Returning new TopicTranslatorRunnable.");
104 return new TopicTranslatorRunnable(container, this);
105 }
106
107 /**
108 * Creates and returns a new TopicTranslatorRunnable with the specified
109 * RetrievalContainer and Vector.
110 * @param container the RetrievalContainer
111 * @param supports a Vector of ComponentSupport objects
112 * @return a TopicTranslatorRunnable
113 * @see recoin.group.ComponentWorker#createComponentRunnable(recoin.container.RetrievalContainer, java.util.Vector)
114 */
115 public ComponentRunnable createComponentRunnable( RetrievalContainer container, Vector supports)
116 {
117 if( !component.isInitiated() )
118 {
119 try
120 {
121 initiate(component);
122 }
123 catch (InitiationException e)
124 {
125 logger.error(e);
126 return null;
127 }
128 }
129 logger.debug("Returning new TopicTranslatorRunnable.");
130 return new TopicTranslatorRunnable( container, (ComponentSupport)supports.firstElement(), this );
131 }
132
133 /**
134 * Creates and returns a new TopicTranslatorRunnable with the specified
135 * RetrievalContainer, Vector and ComponentRunnable.
136 * @param container the RetrievalContainer
137 * @param supports a Vector of ComponentSupport objects
138 * @param runnable a ComponentRunnable
139 * @return a TopicTranslatorRunnable
140 * @see recoin.group.ComponentWorker#createComponentRunnable(recoin.container.RetrievalContainer, java.util.Vector, recoin.group.ComponentRunnable)
141 */
142 public ComponentRunnable createComponentRunnable( RetrievalContainer container, Vector supports, ComponentRunnable runnable)
143 {
144 if( !component.isInitiated() )
145 {
146 try
147 {
148 initiate(component);
149 }
150 catch (InitiationException e)
151 {
152 logger.error(e);
153 return null;
154 }
155 }
156 logger.debug("Returning new TopicTranslatorRunnable.");
157 return new TopicTranslatorRunnable( container, (ComponentSupport)supports.firstElement(), runnable, this );
158 }
159
160 /**
161 * The TopicTranslatorRunnable class implements the special behavior to translate
162 * the english topic of a CLEFQuery into German, Spanish and English and complete
163 * the CLEFQuery.
164 */
165 protected class TopicTranslatorRunnable extends ComponentRunnable
166 {
167 /**
168 * Creates a new TopicTranslatorRunnable with the specified RetrievalContainer.
169 * @param container a RetrievalContainer
170 * @param worker the ComponentWorker
171 */
172 public TopicTranslatorRunnable(RetrievalContainer container, ComponentWorker worker)
173 {
174 super(container, worker);
175 }
176
177 /**
178 * Creates a new TopicTranslatorRunnable with the specified ComponentSupport and
179 * RetrievalContainer.
180 * @param rc the ResultContainer.
181 * @param cs the ComponentSupport.
182 * @param worker the ComponentWorker
183 */
184 public TopicTranslatorRunnable(RetrievalContainer rc, ComponentSupport cs, ComponentWorker worker)
185 {
186 super(rc, cs, worker);
187 }
188
189 /**
190 * Creates a new TopicTranslatorRunnable with the specified RetrievalContainer,
191 * ComponentSupport and ComponentRunnable.
192 * @param rc a RetrievalContainer
193 * @param cs a ComponentSupport
194 * @param r a ComponentRunnable
195 * @param worker the ComponentWorker
196 */
197 public TopicTranslatorRunnable( RetrievalContainer rc, ComponentSupport cs, ComponentRunnable r, ComponentWorker worker )
198 {
199 super(rc, cs, r, worker);
200 }
201
202 /**
203 * The run-method of this TopicTranslatorRunnable.<br>
204 * It takes the DocNo of the CLEFQuery and uses it to look up the different
205 * translations of the topic in the translation files. The translations are
206 * then stored as topics in the CLEFQuery.
207 */
208 public void run()
209 {
210 logger.debug(this.getClass().getName()+" started.");
211 if( support instanceof CLEFQuery ) // Verarbeitung eines bestimmten Query
212 {
213 if( translationPath != null && stopwordPath != null )
214 {
215 logger.info("Processing CLEFQuery.");
216 CLEFAnalyzer analyzer = new CLEFAnalyzer( stopwordPath );
217
218 CLEFQuery query = (CLEFQuery) support;
219 // get english topic
220 CLEFQuery.CLEFTopic topic = query.getTopic(CLEFConstants.ENGLISH);
221 if( !topic.getTopicContent().equals("") )
222 {
223 // get topic content
224 String queryString = topic.getTopicContent();
225 logger.debug("Translating english topic '"+queryString+"'");
226 // replace original english content with stemmed version
227 query.addTopic( analyzer.analyze( queryString , CLEFConstants.ENGLISH ), CLEFConstants.ENGLISH );
228
229 String translatedTopic;
230 // translate and add topics in other languages
231 translatedTopic = getTopicTranslated( query.getTopicNumber(), CLEFConstants.GERMAN );
232 query.addTopic( analyzer.analyze( translatedTopic , CLEFConstants.GERMAN ), CLEFConstants.GERMAN );
233
234 translatedTopic = getTopicTranslated( query.getTopicNumber(), CLEFConstants.SPANISH );
235 query.addTopic( analyzer.analyze( translatedTopic , CLEFConstants.SPANISH ), CLEFConstants.SPANISH );
236
237 translatedTopic = getTopicTranslated( query.getTopicNumber(), CLEFConstants.FRENCH );
238 query.addTopic( analyzer.analyze( translatedTopic , CLEFConstants.FRENCH ), CLEFConstants.FRENCH );
239
240 component.markComponentSupport( query );
241 }
242 else
243 {
244 logger.warn("Cannot process CLEFQuery! English topic is empty!");
245 container.addException( new ComponentRunnableException("The provided ComponentSupport cannot be processed.") );
246 }
247 }
248 else
249 {
250 logger.error("Couldn't find path to translation files and stopword files!");
251 }
252 }
253 else
254 {
255 logger.warn("Cannot process ComponentSupport "+support.getClass().getName());
256 container.addException( new ComponentRunnableException("The provided ComponentSupport cannot be processed.") );
257 }
258
259 setFinished(true);
260 }
261
262 /**
263 * Returns a translation of the topic with the specified number in the specified
264 * language.
265 * <br><br>
266 * A Filter will look for files starting with 'DE_', 'ES_' or 'FR_' depending
267 * on the language and gather all translations for the topic. This implies that
268 * there may be several translated files for one language. The translations are
269 * merged and doublette tokens are removed before the translation is returned.
270 * @param queryString the topic to translate
271 * @param language the target language
272 * @return the translated topic
273 */
274 private String getTopicTranslated(String topicNumber, int language)
275 {
276 logger.debug("Looking for translations for topic no. '"+topicNumber+"'");
277 // Find directory with translations
278 File dir = new File( translationPath );
279 if( dir.isDirectory() )
280 {
281 FilenameFilter filter;
282 // switch statement creates FilenameFilter
283 switch( language )
284 {
285 case CLEFConstants.GERMAN:
286 filter = new FilenameFilter(){
287 public boolean accept(File dir, String name)
288 { return name.startsWith("DE_"); } };
289 break;
290 case CLEFConstants.SPANISH:
291 filter = new FilenameFilter(){
292 public boolean accept(File dir, String name)
293 { return name.startsWith("ES_"); } };
294 break;
295 case CLEFConstants.FRENCH:
296 filter = new FilenameFilter(){
297 public boolean accept(File dir, String name)
298 { return name.startsWith("FR_"); } };
299 break;
300 default:
301 filter = null;
302 }
303
304 // get filtered file list
305 File[] files = dir.listFiles( filter );
306 String[] translations = new String[files.length];
307
308 // gather translations from files
309 for( int i=0; i<files.length; i++ )
310 {
311 try
312 {
313 Document doc = createDocument( files[i] );
314 Iterator topicIter = getTopicIterator( doc );
315 while( topicIter.hasNext() )
316 {
317 Element topic = (Element)topicIter.next();
318 // find topic with specified topicNumber
319 if( topic.element("num").getTextTrim().equals( topicNumber ) )
320 {
321 String languagePrefix = "";
322 switch( language )
323 {
324 case CLEFConstants.GERMAN:
325 languagePrefix = "DE-";
326 break;
327 case CLEFConstants.ENGLISH:
328 languagePrefix = "EN-";
329 break;
330 case CLEFConstants.SPANISH:
331 languagePrefix = "ES-";
332 break;
333 case CLEFConstants.FRENCH:
334 languagePrefix = "FR-";
335 break;
336 }
337
338 translations[i] = topic.element(languagePrefix+"desc").getText();
339 }
340 }
341 logger.debug("Found translation for topic no. '"+topicNumber+"' in file '"+files[i].getName()+"'");
342 }
343 catch (DocumentException e)
344 {
345 logger.error("Couldn't create Document!",e);
346 }
347 }
348
349 // return merged translations
350 return mergeTranslations( translations );
351 }
352 else
353 {
354 logger.error("Specified path to translation files is not a directory! Path: "+translationPath);
355 }
356 return null;
357 }
358
359 /**
360 * Merges the translations in the specified String[] into one String. Doublettes
361 * are eliminated and the merged translation is returned.
362 * @param translations the individual translations in a String[]
363 * @return the merged translation
364 */
365 private String mergeTranslations( String[] translations )
366 {
367 String merged = "";
368 // create the vector that holds the unique tokens off all translations.
369 Vector tokens = new Vector();
370 for( int x=0; x<translations.length; x++ )
371 {
372 WhitespaceTokenizer tokenizer = new WhitespaceTokenizer( new StringReader(translations[x]) );
373 Token token = null;
374 do
375 {
376 try
377 {
378 token = tokenizer.next();
379 }
380 catch (IOException e)
381 {
382 logger.error("IOException in getAnalyzedString.", e);
383 }
384 if( token != null )
385 {
386 // add content of token to vector.
387 if( !tokens.contains( token.termText() ) )
388 tokens.add( token.termText() );
389 else
390 logger.debug("Doublette token found while merging translations. Token: "+token.termText());
391 }
392 }
393 while( token != null );
394 }
395
396 // get Strings from vector
397 for( Enumeration words = tokens.elements(); words.hasMoreElements(); )
398 {
399 String word = (String)words.nextElement();
400 merged += word;
401 if( words.hasMoreElements() )
402 merged += " ";
403 }
404
405 logger.debug(translations.length+" translations merged into '"+merged+"'");
406 return merged;
407 }
408
409 /**
410 * Returns a org.dom4j.Document that represents the specified file.
411 * @param file the XML file to parse
412 * @return the document representation
413 * @throws org.dom4j.DocumentException
414 */
415 private Document createDocument(File file) throws DocumentException
416 {
417 Document document = null;
418 try
419 {
420 document = xmlReader.read(file);
421 }
422 catch( java.net.MalformedURLException e )
423 {}
424 return document;
425 }
426
427 private Iterator getTopicIterator( Document doc )
428 {
429 Element root = doc.getRootElement();
430 return root.elementIterator("top");
431 }
432
433 }
434
435 }