Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: recoinx/clef/CLEFAnalyzer.java


1   package recoinx.clef;
2   
3   import java.io.BufferedReader;
4   import java.io.File;
5   import java.io.FileNotFoundException;
6   import java.io.FileReader;
7   import java.io.IOException;
8   import java.io.StringReader;
9   import java.util.Vector;
10  
11  import org.apache.log4j.Logger;
12  import org.apache.lucene.analysis.Token;
13  import org.apache.lucene.analysis.TokenStream;
14  import org.apache.lucene.analysis.snowball.SnowballAnalyzer;
15  
16  /**
17   * The CLEFAnalyzer class is the base class used for formatting, stemming, stop
18   * word removal, etc. It uses a SnowballAnalyzer for stemming and stopword removal
19   * and is capable of processing the languages English, German, French and Spanish.
20   * <br><br>
21   * The names of the stopword files must follow the naming convention
22   * <br><br>
23   * <center>&lt;LanguagePrefix&gt;_stopwords.txt</center>
24   * <br><br>
25   * to be considered for stopword removal.<br>
26   * Example: <b>EN_stopwords.txt</b> for the English stopword file. The language
27   * prefixes are: EN (English), DE (German), FR (French), ES (Spanish). The single
28   * stopwords in the files must appear separately on different lines.
29   */
30  public class CLEFAnalyzer
31  {
32    /**
33     * The logger of this class.
34     */
35    static Logger logger;
36    /**
37     * The path where the stopword files can be found.
38     */
39    String stopwordPath = "";
40    /**
41     * A SnowballAnalyzer for German.
42     */
43    private SnowballAnalyzer germanAnalyzer;
44    /**
45     * A SnowballAnalyzer for Spanish.
46     */
47    private SnowballAnalyzer spanishAnalyzer;
48    /**
49     * A SnowballAnalyzer for French.
50     */
51    private SnowballAnalyzer frenchAnalyzer;
52    /**
53     * A SnowballAnalyzer for English.
54     */
55    private SnowballAnalyzer englishAnalyzer;
56  
57    /**
58     * Creates a new CLEFAnalyzer with the specified path to the stopword files. The
59     * CLEFAnalyzer will have four String[] of stopwords and four SnowballAnalyzers
60     * for the different languages respectively.
61     * @param stopPath path to stopword files
62     */
63    public CLEFAnalyzer( String stopPath )
64    {
65      // Initialize the logger for this class.
66      logger = Logger.getLogger( CLEFAnalyzer.class.getName() );
67      logger.debug("Creating CLEFAnalyzer");
68  
69      stopwordPath = stopPath;
70      
71      String[] stopwordsDE = createStopwords( new File( stopPath+File.separator+"DE_stopwords.txt" ) );
72      String[] stopwordsES = createStopwords( new File( stopPath+File.separator+"ES_stopwords.txt" ) );
73      String[] stopwordsFR = createStopwords( new File( stopPath+File.separator+"FR_stopwords.txt" ) );
74      String[] stopwordsEN = createStopwords( new File( stopPath+File.separator+"EN_stopwords.txt" ) );
75      logger.debug("Creating SnowballAnalyzers.");
76      // first argument of SnowballAnalyzer constructor has to match classes in net.sf.snowball.ext package!
77      germanAnalyzer = new SnowballAnalyzer("German", stopwordsDE);
78      spanishAnalyzer = new SnowballAnalyzer("Spanish", stopwordsES);
79      frenchAnalyzer = new SnowballAnalyzer("French", stopwordsFR);
80      englishAnalyzer = new SnowballAnalyzer("English", stopwordsEN);
81    }
82    
83    /**
84     * Performs stopword removal and stemming on the specified topic according to the
85     * specified language.<br>
86     * For the languages see {@link recoinx.clef.CLEFConstants CLEFConstants}.
87     * @param topic the topic
88     * @param language the language of the topic
89     * @return the analyzed topic
90     */
91    public String analyze( String topic, int language )
92    {
93      logger.debug("Analyzing topic: "+topic);
94      switch( language )
95      {
96        case CLEFConstants.GERMAN:
97          return getAnalyzedString( germanAnalyzer, topic );
98          
99        case CLEFConstants.FRENCH:
100         return getAnalyzedString( frenchAnalyzer, topic );
101         
102       case CLEFConstants.SPANISH:
103         return getAnalyzedString( spanishAnalyzer, topic );
104         
105       case CLEFConstants.ENGLISH:
106         return getAnalyzedString( englishAnalyzer, topic );
107 
108       default:
109         logger.warn("Topic could not be analyzed! Unknown language specified.");
110         return topic;
111     }
112   }
113     
114   /**
115    * Parses the specified string and applies stemming using the specified
116    * SnowballAnalyzer.
117    * @param analyzer the SnowballAnalyzer
118    * @param string the String
119    * @return the analyzed String
120    */
121   public static String getAnalyzedString( SnowballAnalyzer analyzer, String string )
122   {
123     Token token = null;
124     String tokenString = "";
125     // read the specified string as a TokenStream from the analyzer
126     TokenStream stream = analyzer.tokenStream("", new StringReader(string) );
127       
128     do
129     {
130       try
131       {
132         token = stream.next();
133       }
134       catch (IOException e)
135       {
136         logger.error("IOException in getAnalyzedString.",e);
137       }
138       if( token != null )
139       {
140         tokenString += (tokenString.equals("")) ? token.termText() : " "+token.termText();
141       }
142     }
143     while( token != null );
144     
145     return tokenString;
146   }
147   
148   /**
149    * Creates a String[] of stopwords from the specified file. Each stopword must
150    * appear on a separate line. If there are any errors reading the file, the
151    * returned list will be empty.
152    * @param file the stopword file
153    * @return a String[] containing the stopwords
154    */
155   public static String[] createStopwords( File file )
156   {
157     logger.debug("Creating stopwords from file "+file.getName());
158     BufferedReader reader = null;
159     String[] stopwords;
160     Vector words = new Vector();
161     try
162     {
163       reader = new BufferedReader( new FileReader( file ) );
164       String line = reader.readLine();
165       while( line != null )
166       {
167         words.add( line.trim() );
168         line = reader.readLine();
169       }
170     }
171     catch (FileNotFoundException e)
172     {
173       logger.error("FileNotFoundException in createStopwords. "+e);
174     }
175     catch (IOException e)
176     {
177       logger.error("IOException in createStopwords. "+e);
178     }
179     
180     stopwords = new String[ words.size() ];
181     words.copyInto( stopwords );
182     logger.debug("Array with stopwords created. Length: "+stopwords.length);
183     return stopwords;
184   }
185 }