Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: j3/utils.java


1   package j3;
2   
3   /** Lofty goals:
4    *  I would like to provide a method by which a user can present a dictionary file
5    *  like EDICT and a J3_Parser_Description file (or similarly named) so that after 
6    *  this reads in the parser file, it can read in and use the dictionary file.  This
7    *  is handy b/c I then won't have to care two-licks about what format the users'
8    *  dictionary is in; this sort of mirrors the idea of not caring about what language
9    *  the GUI's in either.
10   *
11   *
12   * 
13   *  I would like to support: 
14   *  Supported / Testing
15   *  ??
16   *  -* ISO-2022 (Designators: -CN, -CN-EXT, -KR;
17   *               single shift -CN, -CN-EXT;
18   *               shifting     -CN, -CN-EXT, -KR;
19   *               escape       -JP, -JP-1, -JP-2)
20   *  -* EUC
21   *  -- GBK
22   *  -- Big5
23   *  -- Big5+
24   *  -* Shift-JIS
25   *  -- Johab
26   */
27  
28  /** @author Jason Vertrees */
29  /** @see Ken Lunde's, "CJKV" book. */
30  
31  // So we may read from file
32  import java.io.File;
33  import java.io.FileInputStream;
34  import java.io.FileOutputStream;
35  import java.io.FileReader;
36  import java.io.InputStreamReader;
37  import java.io.UnsupportedEncodingException;
38  import java.io.Reader;
39  import java.io.BufferedReader;
40  
41  
42  // Exception handling
43  import java.io.FileNotFoundException;
44  import java.io.IOException;
45  
46  // Charsets and other goodies
47  import java.nio.charset.Charset;
48  
49  // utils
50  import java.util.Date;
51  import java.util.Enumeration;
52  import java.util.HashMap;
53  import java.util.regex.PatternSyntaxException;
54  import java.util.Properties;
55  import java.util.TreeMap;
56  
57  /** My J3 utils class.  Here one will find (hopefull many and good) utils
58      that will assist in i18n programming, right now designed for Japanese
59      and English
60  */
61  public class utils {
62  
63      /** String for finding dictionary stuff */
64      public static String dict = "Dictionary";
65  
66      /** system dependent lineSeparator */
67      public static String lineSep = System.getProperty("line.separator");
68      /** system dependent pathSeparator */
69      public static String pathSep = System.getProperty("path.separator");
70      /** system dependent fileSeparator */
71      public static String fileSep = System.getProperty("file.separator");
72  
73      /** For uhh loading charDics */
74      public static String[] charDics = { J3Dict.sKANJIDIC, J3Dict.sKANJI212 };
75      /** For loading String dics */
76      public static String[] stringDics = { J3Dict.sEDICT, J3Dict.sCEDICT, J3Dict.sXML };
77      /** EDICTS */
78      public static String[] EDICTDics = { J3Dict.sEDICT, J3Dict.sCEDICT };
79  
80      /** Used to convert the integer returnVal into a string that 
81    is more human friendly */
82      public static String[] codeArray = {
83    "ASCII",
84    "GB_2312",
85    "GB_7589",
86    "GB_7590",
87    "CNS", // 11643-1992 Plane 1 (ISO-2022-CN)",
88    "CNS", // 11643-1992 Plane 2 (ISO-2022-CN)",
89    "CNS", // 11643-1992 Plane 3 (ISO-2022-CN-EXT)",
90    "CNS", // 11643-1992 Plane 4 (ISO-2022-CN-EXT)",
91    "CNS", // 11643-1992 Plane 5 (ISO-2022-CN-EXT)",
92    "CNS", // 11643-1992 Plane 6 (ISO-2022-CN-EXT)",
93    "CNS", // 11643-1992 Plane 7 (ISO-2022-CN-EXT)",
94    "ISO-IR-165", // (ISO-2022-CN-EXT)",
95    "JIS", //-Roman (ISO-2022-JP)",
96    "JIS", // C 6226-1978 (ISO-2022-JP)",
97    "JIS", //  X 0208-1983 (ISO-2022-JP)",
98    "JIS", //  X 0208-1990",
99    "JIS", //  X 0208-1997",
100   "JIS", //  X 0212-1990 (ISO-2022-JP-1)",
101   "GB_2312", // (ISO-2022-JP-2",
102   "KS", // X 1001:1992 (ISO-2022-JP-2)",
103   "ISO_8859", //-1:1998 (ISO-2022-JP-2)",
104   "ISO_8859", //-7:1998 (ISO-2022-JP-2)",
105   "ISO-2022-KR",
106   "EUC-CN",
107   "EUC-TW",
108   "EUC-KR",
109   "EUC-JP",
110   "Big 5",
111   "Big 5 Plus",
112   "SJIS", //Shift-JIS X 0208:1997",
113   "JOHAB", //  (KS X 1001:1992)",
114   "UCS2",
115   "UCS4",
116   "UTF7",
117   "UTF8",
118   "UTF16",
119   "HALF_WIDTH_KATAKANA",
120   "EUCORSJIS",
121   "EUC",
122   "SJIS",
123   "NEC",
124   "ISO-2022",
125   "JIS"
126     };
127 
128     public static String[] verboseCodeArray = {
129   "ASCII",
130   "GB 2312-80 (ISO-2022-CN)",
131   "GB 7589-87",
132   "GB 7590-87",
133   "CNS 11643-1992 Plane 1 (ISO-2022-CN)",
134   "CNS 11643-1992 Plane 2 (ISO-2022-CN)",
135   "CNS 11643-1992 Plane 3 (ISO-2022-CN-EXT)",
136   "CNS 11643-1992 Plane 4 (ISO-2022-CN-EXT)",
137   "CNS 11643-1992 Plane 5 (ISO-2022-CN-EXT)",
138   "CNS 11643-1992 Plane 6 (ISO-2022-CN-EXT)",
139   "CNS 11643-1992 Plane 7 (ISO-2022-CN-EXT)",
140   "ISO-IR-165 (ISO-2022-CN-EXT)",
141   "JIS-Roman (ISO-2022-JP)",
142   "JIS C 6226-1978 (ISO-2022-JP)",
143   "JIS X 0208-1983 (ISO-2022-JP)",
144   "JIS X 0208-1990",
145   "JIS X 0208-1997",
146   "JIS X 0212-1990 (ISO-2022-JP-1)",
147   "GB 2312-80 (ISO-2022-JP-2",
148   "KS X 1001:1992 (ISO-2022-JP-2)",
149   "ISO 8859-1:1998 (ISO-2022-JP-2)",
150   "ISO 8859-7:1998 (ISO-2022-JP-2)",
151   "ISO-2022-KR",
152   "EUC-CN",
153   "EUC-TW",
154   "EUC-KR",
155   "EUC-JP",
156   "Big 5",
157   "Big 5 Plus",
158   "Shift-JIS X 0208:1997",
159   "Johab (KS X 1001:1992)",
160   "UCS-2",
161   "UCS-4",
162   "UTF-7",
163   "UTF-8",
164   "UTF-16",
165   "HALF_WIDTH_KATAKANA",
166   "EUCORSJIS",
167   "EUC",
168   "SJIS",
169   "NEC",
170   "ISO2022",
171   "JIS"
172     };
173 
174     public static int EOF                    = -1;
175     public static int ESC                    = 27;
176 
177     /** This was a pain in the ass. */
178     public static int ASCII                  = 0;
179     public static int GB_2312_80             = 1;
180     public static int GB_7589                = 2;
181     public static int GB_7590                = 3;
182     public static int CNS                    = 4;
183     public static int CNS_1                  = 5;
184     public static int CNS_2                  = 6;
185     public static int CNS_3                  = 7;
186     public static int CNS_4                  = 8;
187     public static int CNS_5                  = 9;
188     public static int CNS_6                  = 10;
189     public static int CNS_7                  = 11;
190     public static int ISO_IR_165             = 12;
191     public static int JIS_ROMAN              = 13;
192     public static int JIS_C_6226_1978        = 14;
193     public static int JIS_X_0208_1983        = 15;
194     public static int JIS_X_0208_1990        = 16;
195     public static int JIS_X_0208_1997        = 17;
196     public static int JIS_X_0212_1990        = 18;
197     public static int KS_X_1001              = 19;
198     public static int ISO_8859_1             = 20;
199     public static int ISO_8859_7             = 21;
200     public static int ISO_2022_KR            = 22;
201     public static int EUC_CN                 = 23;
202     public static int EUC_TW                 = 24;
203     public static int EUC_KR                 = 25;
204     public static int EUC_JP                 = 26;
205     public static int BIG_5                  = 27;
206     public static int BIG_5_PLUS             = 28;
207     public static int SHIFT_JIS_X_0208       = 29;
208     public static int JOHAB                  = 30;
209     public static int UCS_2                  = 31;
210     public static int UCS_4                  = 32;
211     public static int UTF_7                  = 33;
212     public static int UTF_8                  = 34;
213     public static int UTF_16                 = 35;
214     public static int HALF_WIDTH_KATAKANA    = 36;
215     public static int EUCORSJIS              = 37;
216     public static int EUC                    = 38;
217     public static int SJIS                   = 39;
218     public static int NEC                    = 40;
219     public static int ISO2022                = 41;
220     public static int JIS                    = 42;
221 
222     // File guessing coder thingy here.
223     // From the man -- Ken Lunde
224 
225 
226     /**
227      * <code>listCharsets</code> gets and lists
228      * all the charsets your VM knows about
229      */
230     public static void listCharsets(){
231 
232   // values gives me a sorted map
233   // sortedMap.values gives me a collection
234   // 
235   Charset[] ca = (Charset[]) Charset.availableCharsets().values().toArray( new Charset[0] );
236   
237   for ( int i = 0; i < ca.length; i++ ){
238       System.out.println( ca[i].name() );
239   }
240     }
241 
242 
243 
244     /** explicate, here, is an adaptation (with minimal changes) to Ken Lunde's
245   jcode.c.  Without sounding obseqious, Ken Lunde is the 'man' when it comes
246   to CJKV Information Processing.  It peeks at a text file and checks out
247   the encoding returning a value corresponding to the codeArray above
248     */
249     public static int explicate( String fileName ) {
250 
251   // need a similar thing to getc
252   FileInputStream fisInFile = null;
253 
254   try { 
255       fisInFile = new FileInputStream( fileName );
256   } catch (FileNotFoundException fnfe ) {
257       System.err.println("Could not find the file called, "
258              + fileName
259              + lineSep
260              + "Please make sure the file exists and try again");
261       System.exit(1);
262   }
263 
264   int c = 0;
265   int whatcode = ASCII;
266 
267   // break this down to detecting via (hemisphere/continent?)
268   // detectJapanese(...);
269   // detectKorean(...);
270   // detectChinese(...);
271   // ...
272 
273   try {
274 
275       while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != EOF) {
276     if ((c = fisInFile.read()) != EOF) {
277         // ESC
278         if (c == ESC) {
279       c = fisInFile.read();
280       // $
281       if (c == '$') {
282           c = fisInFile.read();
283           // ESC+$+B
284           if (c == 'B')       // JIS X 0208-1983
285         whatcode = JIS_X_0208_1983;
286           else if (c == '@')  // JIS C 6226-1978
287         whatcode = JIS_C_6226_1978;
288           else if (c == 'A')  // GB 2312-80
289         whatcode = GB_2312_80;
290           else if (c == '(') {  // JISx0212_1990 or KS?
291         c = fisInFile.read();
292         if (c == 'D')    //
293             whatcode = JIS_X_0212_1990; // JIS x 0212-1990
294         else if (c == 'C')
295             whatcode = KS_X_1001;
296           }
297           // ESC+$+)
298           else if (c == ')') { // CN stuff
299         c = fisInFile.read();
300         if (c == 'A')
301             whatcode = GB_2312_80;
302         else if (c == 'G')
303             whatcode = CNS_1;
304         else if (c == 'E')
305             whatcode = ISO_IR_165;
306           } // ESC+$++
307           else if (c == '+') { // more CN stuff
308         c = fisInFile.read();
309         if (c == 'I')
310             whatcode = CNS_3;
311         else if (c == 'J')
312             whatcode = CNS_4;
313         else if (c == 'K')
314             whatcode = CNS_5;
315         else if (c == 'L')
316             whatcode = CNS_6;
317         else if (c == 'M')
318             whatcode = CNS_7;
319           } // ESC+$+*
320           else if ( c == '*') { // more CN stuff
321         c = fisInFile.read();
322         if (c == 'H')
323             whatcode = CNS_2;
324           } // ESC+.+A and ESC+.+F
325       } // ESC-K
326       else if (c == 'K')
327           whatcode = NEC;
328       // ESC+(
329       else if (c == '(') {
330           c = fisInFile.read();
331           if ( c == 'B' )
332         whatcode = ASCII;
333           else if ( c == 'J' || c == 'H' )
334         whatcode = JIS_ROMAN;
335           else if ( c == 'I' )
336         whatcode = HALF_WIDTH_KATAKANA;
337       }
338       else if (c == '.') { // JP stuff
339           c = fisInFile.read();
340           if (c == 'A')
341         whatcode = ISO_8859_1;
342           else if (c == 'F')
343         whatcode = ISO_8859_7;
344       }
345         }
346         else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
347       whatcode = SJIS;
348         else if (c == 142) {
349       c = fisInFile.read();
350       if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252))
351           whatcode = SJIS;
352       else if (c >= 161 && c <= 223)
353           whatcode = EUCORSJIS;
354         }
355         else if (c >= 161 && c <= 223) {
356       c = fisInFile.read();
357       if (c >= 240 && c <= 254)
358           whatcode = EUC;
359       else if (c >= 161 && c <= 223)
360           whatcode = EUCORSJIS;
361       else if (c >= 224 && c <= 239) {
362           whatcode = EUCORSJIS;
363           while (c >= 64 && c != EOF && whatcode == EUCORSJIS) {
364         if (c >= 129) {
365             if (c <= 141 || (c >= 143 && c <= 159))
366           whatcode = SJIS;
367             else if (c >= 253 && c <= 254)
368           whatcode = EUC;
369         }
370         c = fisInFile.read();
371           }
372       }
373       else if (c <= 159)
374           whatcode = SJIS;
375         }
376         else if (c >= 240 && c <= 254)
377       whatcode = EUC;
378         else if (c >= 224 && c <= 239) {
379       c = fisInFile.read();
380       if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
381           whatcode = SJIS;
382       else if (c >= 253 && c <= 254)
383           whatcode = EUC;
384       else if (c >= 161 && c <= 252)
385           whatcode = EUCORSJIS;
386         }
387     }
388       }
389   } catch (IOException ioe){
390       System.err.println("IOException: Couldn't read in character from the file.");
391       return -1;
392   }
393 
394   return whatcode;
395     }
396 
397 
398     /** 'Table look up' -- go Novak! */
399     public static String convertToAcronym( int code ) {
400   
401   if ( code > codeArray.length ||
402        code < 0 )
403       return "Error - integer code out of bounds.";
404   else 
405       return codeArray[code];
406     }
407 
408 
409 
410     /** Converts to Java-knowable stream names */
411     public static int convertToKnownCode( int code ) {
412 
413   if ( code > codeArray.length ||
414        code < 0 )
415       return -1;
416   else if ( code == EUC                 || 
417       code == EUCORSJIS           ||    /// Don't think this one belongs here.
418       code == JIS_ROMAN           ||
419       code == JIS_X_0208_1997     ||
420       code == JIS_X_0212_1990     ||
421       code == HALF_WIDTH_KATAKANA )
422       return EUC_JP;
423 
424   else if ( code == GB_2312_80 )
425       return EUC_CN;  // Chinese
426 
427   else if ( code == CNS_1 ||
428       code == CNS_2 ||
429       code == CNS_3 ||
430       code == CNS_4 ||
431       code == CNS_5 ||
432       code == CNS_6 ||
433       code == CNS_7 )
434       return EUC_TW;  // Chinese (Taiwan)
435 
436   else if ( code == KS_X_1001 )
437       return EUC_KR;
438 
439   else if ( code == ISO_8859_1      ||
440       code == ISO_8859_7      ||
441       code == JIS_C_6226_1978 ||
442       code == JIS_X_0208_1983 )
443       // was ISO-2022
444       return JIS;  // Japanese
445 
446   else return code;
447   // ... and so on.
448     }
449 
450 
451 
452     /** Saves the current state of the init file
453      * @param - globalProps is a non-null properties file whose contents 
454      * will be written out to the disk, to the initFile as declared in J3. */
455     protected static void saveInitFile( Properties globalProps ){
456 
457   // stupid user
458   if ( globalProps == null )
459       return;
460 
461   // create a new File based on initFile
462   FileOutputStream outFile = null;
463   try { 
464       outFile = new FileOutputStream( j3.J3.initFile );
465   } catch (FileNotFoundException fnfe ) {
466       System.out.println("Could not open basic init file to write: "
467              + fnfe.toString()  );
468       return;
469   }
470   
471   // comment for the initFile
472   String comment = new String("J3's initialization file and settings. " 
473         + lineSep
474         + "#Please don't mess with this unless you know what you're doing.");
475 
476   // try writing the file.
477   try {
478       globalProps.store( outFile , comment );
479   } catch ( Exception e ) {
480       System.err.println("There was a problem writing the non-existent init file,"
481              + j3.J3.initFile );
482       System.err.println("You might check that the path exists, and that you can "
483              + "write to that path");
484       System.err.println("I'll try to continue anyway.");
485       return;
486   }
487     }  
488 
489 
490 
491   /** Helper method thtat gets the integer representing the highest 
492    * numbered dictionary entry and adds one to it: eg: If your ...properties
493    * file has the following in it:
494    *
495    * Dictionary.1
496    * Dictionary.2
497    * Dictionary.3
498    *
499    * This'll return "4"
500    */
501   protected static String getNextDictionaryNumber( Properties globalProps ) {
502       Integer highest = new Integer(0);
503 
504       for (Enumeration e = globalProps.propertyNames(); e.hasMoreElements() ;) {
505     String cur = (String) e.nextElement();
506 
507     if ( cur.startsWith( dict + "." )) {
508         Integer temp = new Integer( cur.substring(cur.length()-1) );
509         if ( temp.intValue() > highest.intValue() )
510       highest = temp;
511     }
512       }
513       return new Integer(highest.intValue() + 1).toString();
514   }
515 
516 
517 
518     /**
519      * <code>pause</code> just a sec.
520      *
521      * @param numberMillis a <code>long</code> value
522      */
523     public static void pause( long numberMillis ) {
524 
525   Date now = new Date();
526   numberMillis *= 100;
527         long exitTime = now.getTime() + numberMillis;
528 
529         while (true) {
530             now = new Date();
531             if (now.getTime() > exitTime)
532                 return;
533         }
534     }
535     
536 
537 
538 
539     /** example of how to use the code.
540   java -classpath C:\playground\ J3.utils C:\temp\test.sjis.txt
541   or similar will work
542     */
543     public static void main( String[] args ){
544   System.out.println( convertToAcronym(explicate(args[0]) ));
545   TreeMap parsedMap = j3.J3Dict.parseEDICTStyleDictionary( args[0] );
546   //  playWithDictionary( parsedMap );
547     }
548 }