Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: marf/Stats/StatisticalEstimators/StatisticalEstimator.java


1   package marf.Stats.StatisticalEstimators;
2   
3   import java.util.Vector;
4   
5   import marf.MARF;
6   import marf.MARF.NLP;
7   import marf.Stats.ProbabilityTable;
8   import marf.Storage.StorageException;
9   import marf.Storage.StorageManager;
10  import marf.nlp.util.NLPStreamTokenizer;
11  
12  
13  /**
14   * <p>Implements generic Statistical Estimator routines.
15   * Must be subclasses by concrete implemenations of statistical estimators.</p>
16   *
17   * $Id: StatisticalEstimator.java,v 1.24 2005/06/16 19:58:47 mokhov Exp $
18   *
19   * @author Serguei Mokhov
20   * @version $Revision: 1.24 $
21   * @since 0.3.0
22   */
23  public abstract class StatisticalEstimator
24  extends StorageManager
25  implements IStatisticalEstimator
26  {
27    protected ProbabilityTable oProbabilityTable = null;
28    protected NLPStreamTokenizer oStreamTokenizer = null;
29  
30    public StatisticalEstimator()
31    {
32      //super(oProbabilityTable, getFilename());
33      this.oProbabilityTable = new ProbabilityTable(NLP.getLanguage());
34      this.strFilename = getFilename();
35      this.oObjectToSerialize = this.oProbabilityTable;
36      //System.out.println("StatisticalEstimator()");
37    }
38  
39    /**
40     * N-gram-based classification.
41     * @return calculated probability value
42     */
43    //public abstract double P(String pstrSentence);
44    //public abstract double P(Ngram poNgram);
45    public final double p()
46    {
47      double dProbability = 0.0;
48  
49      try
50      {
51        // P(EOS,first-char)
52        boolean bBeginning = true;
53  
54        restore();
55        this.oProbabilityTable = (ProbabilityTable)this.oObjectToSerialize;
56  
57        String strToken;
58  
59  //      while(oStreamTokenizer.nextToken() != StreamTokenizer.TT_EOF)
60        while((strToken = oStreamTokenizer.getNextToken()) != null)
61        {
62  //        String strToken = getNextToken();
63  
64          // Something which what we didn't ask for (not a string)
65  //        if(strToken == null)
66  //        {
67  //          System.out.println("WARNING: Null token!");
68  //          continue;
69  //        }
70  
71          switch(MARF.NLP.getNgramModel())
72          {
73            case MARF.INgramModels.UNIGRAM:
74            {
75              Vector oNgram = new Vector();
76              oNgram.add(strToken);
77              dProbability += Math.log(oProbabilityTable.P(oNgram));
78              break;
79            }
80  
81            case MARF.INgramModels.BIGRAM:
82            {
83              Vector oNgram = new Vector();
84              oNgram.add(strToken);
85  
86  //            if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
87  //              break;
88  
89  //            String strToken2 = getNextToken();
90  
91              String strToken2 = oStreamTokenizer.getNextToken();
92  
93              if(strToken2 == null)
94                break;
95  
96              oNgram.add(strToken2);
97  
98              oStreamTokenizer.pushBack();
99  
100             dProbability += Math.log(oProbabilityTable.P(oNgram));
101 
102             break;
103           }
104 
105           case MARF.INgramModels.TRIGRAM:
106           {
107             Vector oNgram = new Vector();
108             oNgram.add(strToken);
109 
110 //            if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
111 //              break;
112 
113             String strToken2 = oStreamTokenizer.getNextToken();
114 
115             if(strToken2 == null)
116               break;
117 
118             oNgram.add(strToken2);
119 
120             int ttype2 = oStreamTokenizer.ttype;
121             double nval2 = oStreamTokenizer.nval;
122 
123 //            if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
124 //              break;
125 
126 //            String strToken3 = getNextToken();
127 
128             String strToken3 = oStreamTokenizer.getNextToken();
129 
130             if(strToken3 == null)
131               break;
132 
133             oNgram.add(strToken3);
134 
135             int ttype3 = oStreamTokenizer.ttype;
136             double nval3 = oStreamTokenizer.nval;
137 
138             oStreamTokenizer.ttype = ttype2;
139             oStreamTokenizer.nval = nval2;
140             oStreamTokenizer.sval = strToken2;
141 
142             oStreamTokenizer.pushBack();
143 
144             oStreamTokenizer.ttype = ttype3;
145             oStreamTokenizer.nval = nval3;
146             oStreamTokenizer.sval = strToken3;
147 
148             oStreamTokenizer.pushBack();
149 
150             dProbability += Math.log(oProbabilityTable.P(oNgram));
151             break;
152           }
153 
154           default:
155             break;
156         }
157       }
158     }
159     catch(Exception e)
160     {
161       System.err.println(e.getMessage());
162       e.printStackTrace(System.err);
163     }
164 
165     return dProbability;
166   }
167 
168   /**
169    * Every estimator needs to implement its specific training method.
170    * @return <code>true</code> if training was successful
171    */
172   public boolean train()
173   {
174     try
175     {
176       restore();
177 
178       this.oProbabilityTable = (ProbabilityTable)this.oObjectToSerialize;
179 
180       String strToken;
181 
182       // Collect Stats
183       //while(oStreamTokenizer.nextToken() != StreamTokenizer.TT_EOF)
184       while((strToken = oStreamTokenizer.getNextToken()) != null)
185       {
186         //String strToken = getNextToken();
187 
188         // Something which what we didn't ask for (not a string)
189         if(strToken == null)
190         {
191           System.err.println
192           (
193             "WARNING: Null token! st:[" + oStreamTokenizer.toString() + "]" +
194             oStreamTokenizer.sval + oStreamTokenizer.nval + oStreamTokenizer.ttype
195           );
196 
197           continue;
198         }
199 
200         switch(MARF.NLP.getNgramModel())
201         {
202           case MARF.INgramModels.UNIGRAM:
203           {
204             Vector oUnigram = new Vector();
205 
206             oUnigram.add(strToken);
207 
208 //            System.out.println("oNgram["+oUnigram+"]");
209             this.oProbabilityTable.incFrequency(oUnigram);
210 
211             break;
212           }
213 
214           case MARF.INgramModels.BIGRAM:
215           {
216             Vector oNgram = new Vector();
217 
218             oNgram.add(strToken);
219 
220 //            if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
221 //              break;
222 
223             String strToken2 = oStreamTokenizer.getNextToken();
224 
225             if(strToken2 == null)
226               break;
227 
228             oNgram.add(strToken2);
229             oStreamTokenizer.pushBack();
230 
231 //            System.out.println("oNgram["+oNgram+"]");
232             this.oProbabilityTable.incFrequency(oNgram);
233 
234             break;
235           }
236 
237           case MARF.INgramModels.TRIGRAM:
238           {
239             Vector oNgram = new Vector();
240             oNgram.add(strToken);
241 
242 //            if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
243 //              break;
244 
245             String strToken2 = oStreamTokenizer.getNextToken();
246 
247             if(strToken2 == null)
248               break;
249 
250             oNgram.add(strToken2);
251 
252             int ttype2 = oStreamTokenizer.ttype;
253             double nval2 = oStreamTokenizer.nval;
254 
255             //if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
256             //  break;
257 
258             String strToken3 = oStreamTokenizer.getNextToken();
259 
260             if(strToken3 == null)
261               break;
262 
263             oNgram.add(strToken3);
264 
265             int ttype3 = oStreamTokenizer.ttype;
266             double nval3 = oStreamTokenizer.nval;
267 
268             oStreamTokenizer.ttype = ttype2;
269             oStreamTokenizer.nval = nval2;
270             oStreamTokenizer.sval = strToken2;
271 
272             oStreamTokenizer.pushBack();
273 
274             oStreamTokenizer.ttype = ttype3;
275             oStreamTokenizer.nval = nval3;
276             oStreamTokenizer.sval = strToken3;
277 
278             oStreamTokenizer.pushBack();
279 
280 //            System.out.println("oNgram["+oNgram+"]");
281             this.oProbabilityTable.incFrequency(oNgram);
282 
283             break;
284           }
285 
286           default:
287             break;
288         }
289       }
290 
291       dump();
292 /*
293       smooth();
294 
295       dump();
296 
297       this.oProbabilityTable.dumpCSV();
298 */
299     }
300     catch(Exception e)
301     {
302       System.err.println(e.getMessage());
303       e.printStackTrace(System.err);
304     }
305 
306     return true;
307   }
308 /*
309   protected String getNextToken()
310   {
311     return new String((char)oStreamTokenizer.ttype + "");
312   }
313 */
314   /**
315    * Not implemented.
316    * @throws StorageException never thrown
317    */
318   public void dumpCSV()
319   throws StorageException
320   {
321   }
322 
323   /**
324    * Not implemented.
325    * @throws StorageException never thrown
326    */
327   public void dumpXML()
328   throws StorageException
329   {
330   }
331 
332   /**
333    * Not implemented.
334    * @throws StorageException never thrown
335    */
336   public void restoreCSV()
337   throws StorageException
338   {
339   }
340 
341   /**
342    * Not implemented.
343    * @throws StorageException never thrown
344    */
345   public void restoreXML()
346   throws StorageException
347   {
348   }
349 
350   public final void setStreamTokenizer(NLPStreamTokenizer poStreamTokenizer)
351   {
352     this.oStreamTokenizer = (NLPStreamTokenizer)poStreamTokenizer;
353   }
354 
355   public NLPStreamTokenizer getStreamTokenizer()
356   {
357     return this.oStreamTokenizer;
358   }
359 
360   public ProbabilityTable getProbabilityTable()
361   {
362     return this.oProbabilityTable;
363   }
364 
365   public final void setLanguage(final String pstrLang)
366   {
367     this.oProbabilityTable.setLang(pstrLang);
368   }
369 
370   public final String getLanguage()
371   {
372     return this.oProbabilityTable.getLang();
373   }
374 
375   public final String resetFilename()
376   {
377     return (strFilename = getFilename());
378   }
379 
380   public final String getFilename()
381   {
382     // e.g. nlp.StatisticalEstimators.Smoothing.WittenBell.1.en.gzbin
383     // <estimator/smoothing>.<ngram-model>.<lang>.gzbin
384     return
385       this.getClass().getName() +
386       "." + MARF.NLP.getNgramModel() +
387       "." + MARF.NLP.getLanguage() +
388       ".gzbin";
389   }
390 
391   /**
392    * Retrieves class' revision.
393    * @return revision string
394    */
395   public static String getMARFSourceCodeRevision()
396   {
397     return "$Revision: 1.24 $";
398   }
399 }
400 
401 // EOF