Source code: recoinx/clef/CLEFAnalyzer.java
1 package recoinx.clef;
2
3 import java.io.BufferedReader;
4 import java.io.File;
5 import java.io.FileNotFoundException;
6 import java.io.FileReader;
7 import java.io.IOException;
8 import java.io.StringReader;
9 import java.util.Vector;
10
11 import org.apache.log4j.Logger;
12 import org.apache.lucene.analysis.Token;
13 import org.apache.lucene.analysis.TokenStream;
14 import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
15
16 /**
17 * The CLEFAnalyzer class is the base class used for formatting, stemming, stop
18 * word removal, etc. It uses a SnowballAnalyzer for stemming and stopword removal
19 * and is capable of processing the languages English, German, French and Spanish.
20 * <br><br>
21 * The names of the stopword files must follow the naming convention
22 * <br><br>
23 * <center><LanguagePrefix>_stopwords.txt</center>
24 * <br><br>
25 * to be considered for stopword removal.<br>
26 * Example: <b>EN_stopwords.txt</b> for the English stopword file. The language
27 * prefixes are: EN (English), DE (German), FR (French), ES (Spanish). The single
28 * stopwords in the files must appear separately on different lines.
29 */
30 public class CLEFAnalyzer
31 {
32 /**
33 * The logger of this class.
34 */
35 static Logger logger;
36 /**
37 * The path where the stopword files can be found.
38 */
39 String stopwordPath = "";
40 /**
41 * A SnowballAnalyzer for German.
42 */
43 private SnowballAnalyzer germanAnalyzer;
44 /**
45 * A SnowballAnalyzer for Spanish.
46 */
47 private SnowballAnalyzer spanishAnalyzer;
48 /**
49 * A SnowballAnalyzer for French.
50 */
51 private SnowballAnalyzer frenchAnalyzer;
52 /**
53 * A SnowballAnalyzer for English.
54 */
55 private SnowballAnalyzer englishAnalyzer;
56
57 /**
58 * Creates a new CLEFAnalyzer with the specified path to the stopword files. The
59 * CLEFAnalyzer will have four String[] of stopwords and four SnowballAnalyzers
60 * for the different languages respectively.
61 * @param stopPath path to stopword files
62 */
63 public CLEFAnalyzer( String stopPath )
64 {
65 // Initialize the logger for this class.
66 logger = Logger.getLogger( CLEFAnalyzer.class.getName() );
67 logger.debug("Creating CLEFAnalyzer");
68
69 stopwordPath = stopPath;
70
71 String[] stopwordsDE = createStopwords( new File( stopPath+File.separator+"DE_stopwords.txt" ) );
72 String[] stopwordsES = createStopwords( new File( stopPath+File.separator+"ES_stopwords.txt" ) );
73 String[] stopwordsFR = createStopwords( new File( stopPath+File.separator+"FR_stopwords.txt" ) );
74 String[] stopwordsEN = createStopwords( new File( stopPath+File.separator+"EN_stopwords.txt" ) );
75 logger.debug("Creating SnowballAnalyzers.");
76 // first argument of SnowballAnalyzer constructor has to match classes in net.sf.snowball.ext package!
77 germanAnalyzer = new SnowballAnalyzer("German", stopwordsDE);
78 spanishAnalyzer = new SnowballAnalyzer("Spanish", stopwordsES);
79 frenchAnalyzer = new SnowballAnalyzer("French", stopwordsFR);
80 englishAnalyzer = new SnowballAnalyzer("English", stopwordsEN);
81 }
82
83 /**
84 * Performs stopword removal and stemming on the specified topic according to the
85 * specified language.<br>
86 * For the languages see {@link recoinx.clef.CLEFConstants CLEFConstants}.
87 * @param topic the topic
88 * @param language the language of the topic
89 * @return the analyzed topic
90 */
91 public String analyze( String topic, int language )
92 {
93 logger.debug("Analyzing topic: "+topic);
94 switch( language )
95 {
96 case CLEFConstants.GERMAN:
97 return getAnalyzedString( germanAnalyzer, topic );
98
99 case CLEFConstants.FRENCH:
100 return getAnalyzedString( frenchAnalyzer, topic );
101
102 case CLEFConstants.SPANISH:
103 return getAnalyzedString( spanishAnalyzer, topic );
104
105 case CLEFConstants.ENGLISH:
106 return getAnalyzedString( englishAnalyzer, topic );
107
108 default:
109 logger.warn("Topic could not be analyzed! Unknown language specified.");
110 return topic;
111 }
112 }
113
114 /**
115 * Parses the specified string and applies stemming using the specified
116 * SnowballAnalyzer.
117 * @param analyzer the SnowballAnalyzer
118 * @param string the String
119 * @return the analyzed String
120 */
121 public static String getAnalyzedString( SnowballAnalyzer analyzer, String string )
122 {
123 Token token = null;
124 String tokenString = "";
125 // read the specified string as a TokenStream from the analyzer
126 TokenStream stream = analyzer.tokenStream("", new StringReader(string) );
127
128 do
129 {
130 try
131 {
132 token = stream.next();
133 }
134 catch (IOException e)
135 {
136 logger.error("IOException in getAnalyzedString.",e);
137 }
138 if( token != null )
139 {
140 tokenString += (tokenString.equals("")) ? token.termText() : " "+token.termText();
141 }
142 }
143 while( token != null );
144
145 return tokenString;
146 }
147
148 /**
149 * Creates a String[] of stopwords from the specified file. Each stopword must
150 * appear on a separate line. If there are any errors reading the file, the
151 * returned list will be empty.
152 * @param file the stopword file
153 * @return a String[] containing the stopwords
154 */
155 public static String[] createStopwords( File file )
156 {
157 logger.debug("Creating stopwords from file "+file.getName());
158 BufferedReader reader = null;
159 String[] stopwords;
160 Vector words = new Vector();
161 try
162 {
163 reader = new BufferedReader( new FileReader( file ) );
164 String line = reader.readLine();
165 while( line != null )
166 {
167 words.add( line.trim() );
168 line = reader.readLine();
169 }
170 }
171 catch (FileNotFoundException e)
172 {
173 logger.error("FileNotFoundException in createStopwords. "+e);
174 }
175 catch (IOException e)
176 {
177 logger.error("IOException in createStopwords. "+e);
178 }
179
180 stopwords = new String[ words.size() ];
181 words.copyInto( stopwords );
182 logger.debug("Array with stopwords created. Length: "+stopwords.length);
183 return stopwords;
184 }
185 }