Source code: marf/Stats/StatisticalEstimators/StatisticalEstimator.java
1 package marf.Stats.StatisticalEstimators;
2
3 import java.util.Vector;
4
5 import marf.MARF;
6 import marf.MARF.NLP;
7 import marf.Stats.ProbabilityTable;
8 import marf.Storage.StorageException;
9 import marf.Storage.StorageManager;
10 import marf.nlp.util.NLPStreamTokenizer;
11
12
13 /**
14 * <p>Implements generic Statistical Estimator routines.
15 * Must be subclasses by concrete implemenations of statistical estimators.</p>
16 *
17 * $Id: StatisticalEstimator.java,v 1.24 2005/06/16 19:58:47 mokhov Exp $
18 *
19 * @author Serguei Mokhov
20 * @version $Revision: 1.24 $
21 * @since 0.3.0
22 */
23 public abstract class StatisticalEstimator
24 extends StorageManager
25 implements IStatisticalEstimator
26 {
27 protected ProbabilityTable oProbabilityTable = null;
28 protected NLPStreamTokenizer oStreamTokenizer = null;
29
30 public StatisticalEstimator()
31 {
32 //super(oProbabilityTable, getFilename());
33 this.oProbabilityTable = new ProbabilityTable(NLP.getLanguage());
34 this.strFilename = getFilename();
35 this.oObjectToSerialize = this.oProbabilityTable;
36 //System.out.println("StatisticalEstimator()");
37 }
38
39 /**
40 * N-gram-based classification.
41 * @return calculated probability value
42 */
43 //public abstract double P(String pstrSentence);
44 //public abstract double P(Ngram poNgram);
45 public final double p()
46 {
47 double dProbability = 0.0;
48
49 try
50 {
51 // P(EOS,first-char)
52 boolean bBeginning = true;
53
54 restore();
55 this.oProbabilityTable = (ProbabilityTable)this.oObjectToSerialize;
56
57 String strToken;
58
59 // while(oStreamTokenizer.nextToken() != StreamTokenizer.TT_EOF)
60 while((strToken = oStreamTokenizer.getNextToken()) != null)
61 {
62 // String strToken = getNextToken();
63
64 // Something which what we didn't ask for (not a string)
65 // if(strToken == null)
66 // {
67 // System.out.println("WARNING: Null token!");
68 // continue;
69 // }
70
71 switch(MARF.NLP.getNgramModel())
72 {
73 case MARF.INgramModels.UNIGRAM:
74 {
75 Vector oNgram = new Vector();
76 oNgram.add(strToken);
77 dProbability += Math.log(oProbabilityTable.P(oNgram));
78 break;
79 }
80
81 case MARF.INgramModels.BIGRAM:
82 {
83 Vector oNgram = new Vector();
84 oNgram.add(strToken);
85
86 // if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
87 // break;
88
89 // String strToken2 = getNextToken();
90
91 String strToken2 = oStreamTokenizer.getNextToken();
92
93 if(strToken2 == null)
94 break;
95
96 oNgram.add(strToken2);
97
98 oStreamTokenizer.pushBack();
99
100 dProbability += Math.log(oProbabilityTable.P(oNgram));
101
102 break;
103 }
104
105 case MARF.INgramModels.TRIGRAM:
106 {
107 Vector oNgram = new Vector();
108 oNgram.add(strToken);
109
110 // if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
111 // break;
112
113 String strToken2 = oStreamTokenizer.getNextToken();
114
115 if(strToken2 == null)
116 break;
117
118 oNgram.add(strToken2);
119
120 int ttype2 = oStreamTokenizer.ttype;
121 double nval2 = oStreamTokenizer.nval;
122
123 // if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
124 // break;
125
126 // String strToken3 = getNextToken();
127
128 String strToken3 = oStreamTokenizer.getNextToken();
129
130 if(strToken3 == null)
131 break;
132
133 oNgram.add(strToken3);
134
135 int ttype3 = oStreamTokenizer.ttype;
136 double nval3 = oStreamTokenizer.nval;
137
138 oStreamTokenizer.ttype = ttype2;
139 oStreamTokenizer.nval = nval2;
140 oStreamTokenizer.sval = strToken2;
141
142 oStreamTokenizer.pushBack();
143
144 oStreamTokenizer.ttype = ttype3;
145 oStreamTokenizer.nval = nval3;
146 oStreamTokenizer.sval = strToken3;
147
148 oStreamTokenizer.pushBack();
149
150 dProbability += Math.log(oProbabilityTable.P(oNgram));
151 break;
152 }
153
154 default:
155 break;
156 }
157 }
158 }
159 catch(Exception e)
160 {
161 System.err.println(e.getMessage());
162 e.printStackTrace(System.err);
163 }
164
165 return dProbability;
166 }
167
168 /**
169 * Every estimator needs to implement its specific training method.
170 * @return <code>true</code> if training was successful
171 */
172 public boolean train()
173 {
174 try
175 {
176 restore();
177
178 this.oProbabilityTable = (ProbabilityTable)this.oObjectToSerialize;
179
180 String strToken;
181
182 // Collect Stats
183 //while(oStreamTokenizer.nextToken() != StreamTokenizer.TT_EOF)
184 while((strToken = oStreamTokenizer.getNextToken()) != null)
185 {
186 //String strToken = getNextToken();
187
188 // Something which what we didn't ask for (not a string)
189 if(strToken == null)
190 {
191 System.err.println
192 (
193 "WARNING: Null token! st:[" + oStreamTokenizer.toString() + "]" +
194 oStreamTokenizer.sval + oStreamTokenizer.nval + oStreamTokenizer.ttype
195 );
196
197 continue;
198 }
199
200 switch(MARF.NLP.getNgramModel())
201 {
202 case MARF.INgramModels.UNIGRAM:
203 {
204 Vector oUnigram = new Vector();
205
206 oUnigram.add(strToken);
207
208 // System.out.println("oNgram["+oUnigram+"]");
209 this.oProbabilityTable.incFrequency(oUnigram);
210
211 break;
212 }
213
214 case MARF.INgramModels.BIGRAM:
215 {
216 Vector oNgram = new Vector();
217
218 oNgram.add(strToken);
219
220 // if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
221 // break;
222
223 String strToken2 = oStreamTokenizer.getNextToken();
224
225 if(strToken2 == null)
226 break;
227
228 oNgram.add(strToken2);
229 oStreamTokenizer.pushBack();
230
231 // System.out.println("oNgram["+oNgram+"]");
232 this.oProbabilityTable.incFrequency(oNgram);
233
234 break;
235 }
236
237 case MARF.INgramModels.TRIGRAM:
238 {
239 Vector oNgram = new Vector();
240 oNgram.add(strToken);
241
242 // if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
243 // break;
244
245 String strToken2 = oStreamTokenizer.getNextToken();
246
247 if(strToken2 == null)
248 break;
249
250 oNgram.add(strToken2);
251
252 int ttype2 = oStreamTokenizer.ttype;
253 double nval2 = oStreamTokenizer.nval;
254
255 //if(oStreamTokenizer.nextToken() == StreamTokenizer.TT_EOF)
256 // break;
257
258 String strToken3 = oStreamTokenizer.getNextToken();
259
260 if(strToken3 == null)
261 break;
262
263 oNgram.add(strToken3);
264
265 int ttype3 = oStreamTokenizer.ttype;
266 double nval3 = oStreamTokenizer.nval;
267
268 oStreamTokenizer.ttype = ttype2;
269 oStreamTokenizer.nval = nval2;
270 oStreamTokenizer.sval = strToken2;
271
272 oStreamTokenizer.pushBack();
273
274 oStreamTokenizer.ttype = ttype3;
275 oStreamTokenizer.nval = nval3;
276 oStreamTokenizer.sval = strToken3;
277
278 oStreamTokenizer.pushBack();
279
280 // System.out.println("oNgram["+oNgram+"]");
281 this.oProbabilityTable.incFrequency(oNgram);
282
283 break;
284 }
285
286 default:
287 break;
288 }
289 }
290
291 dump();
292 /*
293 smooth();
294
295 dump();
296
297 this.oProbabilityTable.dumpCSV();
298 */
299 }
300 catch(Exception e)
301 {
302 System.err.println(e.getMessage());
303 e.printStackTrace(System.err);
304 }
305
306 return true;
307 }
308 /*
309 protected String getNextToken()
310 {
311 return new String((char)oStreamTokenizer.ttype + "");
312 }
313 */
314 /**
315 * Not implemented.
316 * @throws StorageException never thrown
317 */
318 public void dumpCSV()
319 throws StorageException
320 {
321 }
322
323 /**
324 * Not implemented.
325 * @throws StorageException never thrown
326 */
327 public void dumpXML()
328 throws StorageException
329 {
330 }
331
332 /**
333 * Not implemented.
334 * @throws StorageException never thrown
335 */
336 public void restoreCSV()
337 throws StorageException
338 {
339 }
340
341 /**
342 * Not implemented.
343 * @throws StorageException never thrown
344 */
345 public void restoreXML()
346 throws StorageException
347 {
348 }
349
350 public final void setStreamTokenizer(NLPStreamTokenizer poStreamTokenizer)
351 {
352 this.oStreamTokenizer = (NLPStreamTokenizer)poStreamTokenizer;
353 }
354
355 public NLPStreamTokenizer getStreamTokenizer()
356 {
357 return this.oStreamTokenizer;
358 }
359
360 public ProbabilityTable getProbabilityTable()
361 {
362 return this.oProbabilityTable;
363 }
364
365 public final void setLanguage(final String pstrLang)
366 {
367 this.oProbabilityTable.setLang(pstrLang);
368 }
369
370 public final String getLanguage()
371 {
372 return this.oProbabilityTable.getLang();
373 }
374
375 public final String resetFilename()
376 {
377 return (strFilename = getFilename());
378 }
379
380 public final String getFilename()
381 {
382 // e.g. nlp.StatisticalEstimators.Smoothing.WittenBell.1.en.gzbin
383 // <estimator/smoothing>.<ngram-model>.<lang>.gzbin
384 return
385 this.getClass().getName() +
386 "." + MARF.NLP.getNgramModel() +
387 "." + MARF.NLP.getLanguage() +
388 ".gzbin";
389 }
390
391 /**
392 * Retrieves class' revision.
393 * @return revision string
394 */
395 public static String getMARFSourceCodeRevision()
396 {
397 return "$Revision: 1.24 $";
398 }
399 }
400
401 // EOF