Source code: j3/utils.java
1 package j3;
2
3 /** Lofty goals:
4 * I would like to provide a method by which a user can present a dictionary file
5 * like EDICT and a J3_Parser_Description file (or similarly named) so that after
6 * this reads in the parser file, it can read in and use the dictionary file. This
7 * is handy b/c I then won't have to care two-licks about what format the users'
8 * dictionary is in; this sort of mirrors the idea of not caring about what language
9 * the GUI's in either.
10 *
11 *
12 *
13 * I would like to support:
14 * Supported / Testing
15 * ??
16 * -* ISO-2022 (Designators: -CN, -CN-EXT, -KR;
17 * single shift -CN, -CN-EXT;
18 * shifting -CN, -CN-EXT, -KR;
19 * escape -JP, -JP-1, -JP-2)
20 * -* EUC
21 * -- GBK
22 * -- Big5
23 * -- Big5+
24 * -* Shift-JIS
25 * -- Johab
26 */
27
28 /** @author Jason Vertrees */
29 /** @see Ken Lunde's, "CJKV" book. */
30
31 // So we may read from file
32 import java.io.File;
33 import java.io.FileInputStream;
34 import java.io.FileOutputStream;
35 import java.io.FileReader;
36 import java.io.InputStreamReader;
37 import java.io.UnsupportedEncodingException;
38 import java.io.Reader;
39 import java.io.BufferedReader;
40
41
42 // Exception handling
43 import java.io.FileNotFoundException;
44 import java.io.IOException;
45
46 // Charsets and other goodies
47 import java.nio.charset.Charset;
48
49 // utils
50 import java.util.Date;
51 import java.util.Enumeration;
52 import java.util.HashMap;
53 import java.util.regex.PatternSyntaxException;
54 import java.util.Properties;
55 import java.util.TreeMap;
56
57 /** My J3 utils class. Here one will find (hopefully many and good) utils
58 that will assist in i18n programming, right now designed for Japanese
59 and English
60 */
61 public class utils {
62
/**
 * Prefix of the numbered dictionary property keys; getNextDictionaryNumber
 * looks for keys that start with this value followed by a dot
 * (e.g. "Dictionary.1").
 */
public static String dict = "Dictionary";

/** Value of the "line.separator" system property (platform line terminator). */
public static String lineSep = System.getProperty("line.separator");
/** Value of the "path.separator" system property (separator between entries of a path list). */
public static String pathSep = System.getProperty("path.separator");
/** Value of the "file.separator" system property (separator between directory names). */
public static String fileSep = System.getProperty("file.separator");

/**
 * Dictionary identifiers taken from J3Dict (sKANJIDIC, sKANJI212).
 * NOTE(review): presumably the per-character (kanji) dictionaries -- confirm against J3Dict.
 */
public static String[] charDics = { J3Dict.sKANJIDIC, J3Dict.sKANJI212 };
/**
 * Dictionary identifiers taken from J3Dict (sEDICT, sCEDICT, sXML).
 * NOTE(review): presumably the string/word dictionaries -- confirm against J3Dict.
 */
public static String[] stringDics = { J3Dict.sEDICT, J3Dict.sCEDICT, J3Dict.sXML };
/** The EDICT-style subset of the dictionary identifiers (sEDICT, sCEDICT). */
public static String[] EDICTDics = { J3Dict.sEDICT, J3Dict.sCEDICT };
79
80 /** Used to convert the integer returnVal into a string that
81 is more human friendly */
82 public static String[] codeArray = {
83 "ASCII",
84 "GB_2312",
85 "GB_7589",
86 "GB_7590",
87 "CNS", // 11643-1992 Plane 1 (ISO-2022-CN)",
88 "CNS", // 11643-1992 Plane 2 (ISO-2022-CN)",
89 "CNS", // 11643-1992 Plane 3 (ISO-2022-CN-EXT)",
90 "CNS", // 11643-1992 Plane 4 (ISO-2022-CN-EXT)",
91 "CNS", // 11643-1992 Plane 5 (ISO-2022-CN-EXT)",
92 "CNS", // 11643-1992 Plane 6 (ISO-2022-CN-EXT)",
93 "CNS", // 11643-1992 Plane 7 (ISO-2022-CN-EXT)",
94 "ISO-IR-165", // (ISO-2022-CN-EXT)",
95 "JIS", //-Roman (ISO-2022-JP)",
96 "JIS", // C 6226-1978 (ISO-2022-JP)",
97 "JIS", // X 0208-1983 (ISO-2022-JP)",
98 "JIS", // X 0208-1990",
99 "JIS", // X 0208-1997",
100 "JIS", // X 0212-1990 (ISO-2022-JP-1)",
101 "GB_2312", // (ISO-2022-JP-2",
102 "KS", // X 1001:1992 (ISO-2022-JP-2)",
103 "ISO_8859", //-1:1998 (ISO-2022-JP-2)",
104 "ISO_8859", //-7:1998 (ISO-2022-JP-2)",
105 "ISO-2022-KR",
106 "EUC-CN",
107 "EUC-TW",
108 "EUC-KR",
109 "EUC-JP",
110 "Big 5",
111 "Big 5 Plus",
112 "SJIS", //Shift-JIS X 0208:1997",
113 "JOHAB", // (KS X 1001:1992)",
114 "UCS2",
115 "UCS4",
116 "UTF7",
117 "UTF8",
118 "UTF16",
119 "HALF_WIDTH_KATAKANA",
120 "EUCORSJIS",
121 "EUC",
122 "SJIS",
123 "NEC",
124 "ISO-2022",
125 "JIS"
126 };
127
128 public static String[] verboseCodeArray = {
129 "ASCII",
130 "GB 2312-80 (ISO-2022-CN)",
131 "GB 7589-87",
132 "GB 7590-87",
133 "CNS 11643-1992 Plane 1 (ISO-2022-CN)",
134 "CNS 11643-1992 Plane 2 (ISO-2022-CN)",
135 "CNS 11643-1992 Plane 3 (ISO-2022-CN-EXT)",
136 "CNS 11643-1992 Plane 4 (ISO-2022-CN-EXT)",
137 "CNS 11643-1992 Plane 5 (ISO-2022-CN-EXT)",
138 "CNS 11643-1992 Plane 6 (ISO-2022-CN-EXT)",
139 "CNS 11643-1992 Plane 7 (ISO-2022-CN-EXT)",
140 "ISO-IR-165 (ISO-2022-CN-EXT)",
141 "JIS-Roman (ISO-2022-JP)",
142 "JIS C 6226-1978 (ISO-2022-JP)",
143 "JIS X 0208-1983 (ISO-2022-JP)",
144 "JIS X 0208-1990",
145 "JIS X 0208-1997",
146 "JIS X 0212-1990 (ISO-2022-JP-1)",
147 "GB 2312-80 (ISO-2022-JP-2",
148 "KS X 1001:1992 (ISO-2022-JP-2)",
149 "ISO 8859-1:1998 (ISO-2022-JP-2)",
150 "ISO 8859-7:1998 (ISO-2022-JP-2)",
151 "ISO-2022-KR",
152 "EUC-CN",
153 "EUC-TW",
154 "EUC-KR",
155 "EUC-JP",
156 "Big 5",
157 "Big 5 Plus",
158 "Shift-JIS X 0208:1997",
159 "Johab (KS X 1001:1992)",
160 "UCS-2",
161 "UCS-4",
162 "UTF-7",
163 "UTF-8",
164 "UTF-16",
165 "HALF_WIDTH_KATAKANA",
166 "EUCORSJIS",
167 "EUC",
168 "SJIS",
169 "NEC",
170 "ISO2022",
171 "JIS"
172 };
173
/** End-of-stream marker; explicate compares the result of each read() against it. */
public static int EOF = -1;
/** Byte value 27 (the ASCII escape character) that explicate treats as the start of an escape sequence. */
public static int ESC = 27;

/**
 * Integer codes identifying character sets and encodings. explicate
 * returns one of these values, and convertToAcronym uses the value as an
 * index into codeArray, so the numbering must stay contiguous from 0.
 */
public static int ASCII = 0;
public static int GB_2312_80 = 1;
public static int GB_7589 = 2;
public static int GB_7590 = 3;
// CNS codes: a generic code followed by one code per plane (1-7).
public static int CNS = 4;
public static int CNS_1 = 5;
public static int CNS_2 = 6;
public static int CNS_3 = 7;
public static int CNS_4 = 8;
public static int CNS_5 = 9;
public static int CNS_6 = 10;
public static int CNS_7 = 11;
public static int ISO_IR_165 = 12;
// JIS character sets.
public static int JIS_ROMAN = 13;
public static int JIS_C_6226_1978 = 14;
public static int JIS_X_0208_1983 = 15;
public static int JIS_X_0208_1990 = 16;
public static int JIS_X_0208_1997 = 17;
public static int JIS_X_0212_1990 = 18;
public static int KS_X_1001 = 19;
public static int ISO_8859_1 = 20;
public static int ISO_8859_7 = 21;
public static int ISO_2022_KR = 22;
// EUC family; convertToKnownCode maps several detected codes onto these.
public static int EUC_CN = 23;
public static int EUC_TW = 24;
public static int EUC_KR = 25;
public static int EUC_JP = 26;
public static int BIG_5 = 27;
public static int BIG_5_PLUS = 28;
public static int SHIFT_JIS_X_0208 = 29;
public static int JOHAB = 30;
// Unicode encodings.
public static int UCS_2 = 31;
public static int UCS_4 = 32;
public static int UTF_7 = 33;
public static int UTF_8 = 34;
public static int UTF_16 = 35;
public static int HALF_WIDTH_KATAKANA = 36;
// Coarse results of explicate's byte-range checks; EUCORSJIS means the
// bytes seen so far fit both EUC and Shift-JIS, so scanning continues.
public static int EUCORSJIS = 37;
public static int EUC = 38;
public static int SJIS = 39;
public static int NEC = 40;
public static int ISO2022 = 41;
public static int JIS = 42;
221
222 // File guessing coder thingy here.
223 // From the man -- Ken Lunde
224
225
226 /**
227 * <code>listCharsets</code> gets and lists
228 * all the charsets your VM knows about
229 */
230 public static void listCharsets(){
231
232 // values gives me a sorted map
233 // sortedMap.values gives me a collection
234 //
235 Charset[] ca = (Charset[]) Charset.availableCharsets().values().toArray( new Charset[0] );
236
237 for ( int i = 0; i < ca.length; i++ ){
238 System.out.println( ca[i].name() );
239 }
240 }
241
242
243
244 /** explicate, here, is an adaptation (with minimal changes) to Ken Lunde's
245 jcode.c. Without sounding obseqious, Ken Lunde is the 'man' when it comes
246 to CJKV Information Processing. It peeks at a text file and checks out
247 the encoding returning a value corresponding to the codeArray above
248 */
249 public static int explicate( String fileName ) {
250
251 // need a similar thing to getc
252 FileInputStream fisInFile = null;
253
254 try {
255 fisInFile = new FileInputStream( fileName );
256 } catch (FileNotFoundException fnfe ) {
257 System.err.println("Could not find the file called, "
258 + fileName
259 + lineSep
260 + "Please make sure the file exists and try again");
261 System.exit(1);
262 }
263
264 int c = 0;
265 int whatcode = ASCII;
266
267 // break this down to detecting via (hemisphere/continent?)
268 // detectJapanese(...);
269 // detectKorean(...);
270 // detectChinese(...);
271 // ...
272
273 try {
274
275 while ((whatcode == EUCORSJIS || whatcode == ASCII) && c != EOF) {
276 if ((c = fisInFile.read()) != EOF) {
277 // ESC
278 if (c == ESC) {
279 c = fisInFile.read();
280 // $
281 if (c == '$') {
282 c = fisInFile.read();
283 // ESC+$+B
284 if (c == 'B') // JIS X 0208-1983
285 whatcode = JIS_X_0208_1983;
286 else if (c == '@') // JIS C 6226-1978
287 whatcode = JIS_C_6226_1978;
288 else if (c == 'A') // GB 2312-80
289 whatcode = GB_2312_80;
290 else if (c == '(') { // JISx0212_1990 or KS?
291 c = fisInFile.read();
292 if (c == 'D') //
293 whatcode = JIS_X_0212_1990; // JIS x 0212-1990
294 else if (c == 'C')
295 whatcode = KS_X_1001;
296 }
297 // ESC+$+)
298 else if (c == ')') { // CN stuff
299 c = fisInFile.read();
300 if (c == 'A')
301 whatcode = GB_2312_80;
302 else if (c == 'G')
303 whatcode = CNS_1;
304 else if (c == 'E')
305 whatcode = ISO_IR_165;
306 } // ESC+$++
307 else if (c == '+') { // more CN stuff
308 c = fisInFile.read();
309 if (c == 'I')
310 whatcode = CNS_3;
311 else if (c == 'J')
312 whatcode = CNS_4;
313 else if (c == 'K')
314 whatcode = CNS_5;
315 else if (c == 'L')
316 whatcode = CNS_6;
317 else if (c == 'M')
318 whatcode = CNS_7;
319 } // ESC+$+*
320 else if ( c == '*') { // more CN stuff
321 c = fisInFile.read();
322 if (c == 'H')
323 whatcode = CNS_2;
324 } // ESC+.+A and ESC+.+F
325 } // ESC-K
326 else if (c == 'K')
327 whatcode = NEC;
328 // ESC+(
329 else if (c == '(') {
330 c = fisInFile.read();
331 if ( c == 'B' )
332 whatcode = ASCII;
333 else if ( c == 'J' || c == 'H' )
334 whatcode = JIS_ROMAN;
335 else if ( c == 'I' )
336 whatcode = HALF_WIDTH_KATAKANA;
337 }
338 else if (c == '.') { // JP stuff
339 c = fisInFile.read();
340 if (c == 'A')
341 whatcode = ISO_8859_1;
342 else if (c == 'F')
343 whatcode = ISO_8859_7;
344 }
345 }
346 else if ((c >= 129 && c <= 141) || (c >= 143 && c <= 159))
347 whatcode = SJIS;
348 else if (c == 142) {
349 c = fisInFile.read();
350 if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160) || (c >= 224 && c <= 252))
351 whatcode = SJIS;
352 else if (c >= 161 && c <= 223)
353 whatcode = EUCORSJIS;
354 }
355 else if (c >= 161 && c <= 223) {
356 c = fisInFile.read();
357 if (c >= 240 && c <= 254)
358 whatcode = EUC;
359 else if (c >= 161 && c <= 223)
360 whatcode = EUCORSJIS;
361 else if (c >= 224 && c <= 239) {
362 whatcode = EUCORSJIS;
363 while (c >= 64 && c != EOF && whatcode == EUCORSJIS) {
364 if (c >= 129) {
365 if (c <= 141 || (c >= 143 && c <= 159))
366 whatcode = SJIS;
367 else if (c >= 253 && c <= 254)
368 whatcode = EUC;
369 }
370 c = fisInFile.read();
371 }
372 }
373 else if (c <= 159)
374 whatcode = SJIS;
375 }
376 else if (c >= 240 && c <= 254)
377 whatcode = EUC;
378 else if (c >= 224 && c <= 239) {
379 c = fisInFile.read();
380 if ((c >= 64 && c <= 126) || (c >= 128 && c <= 160))
381 whatcode = SJIS;
382 else if (c >= 253 && c <= 254)
383 whatcode = EUC;
384 else if (c >= 161 && c <= 252)
385 whatcode = EUCORSJIS;
386 }
387 }
388 }
389 } catch (IOException ioe){
390 System.err.println("IOException: Couldn't read in character from the file.");
391 return -1;
392 }
393
394 return whatcode;
395 }
396
397
398 /** 'Table look up' -- go Novak! */
399 public static String convertToAcronym( int code ) {
400
401 if ( code > codeArray.length ||
402 code < 0 )
403 return "Error - integer code out of bounds.";
404 else
405 return codeArray[code];
406 }
407
408
409
410 /** Converts to Java-knowable stream names */
411 public static int convertToKnownCode( int code ) {
412
413 if ( code > codeArray.length ||
414 code < 0 )
415 return -1;
416 else if ( code == EUC ||
417 code == EUCORSJIS || /// Don't think this one belongs here.
418 code == JIS_ROMAN ||
419 code == JIS_X_0208_1997 ||
420 code == JIS_X_0212_1990 ||
421 code == HALF_WIDTH_KATAKANA )
422 return EUC_JP;
423
424 else if ( code == GB_2312_80 )
425 return EUC_CN; // Chinese
426
427 else if ( code == CNS_1 ||
428 code == CNS_2 ||
429 code == CNS_3 ||
430 code == CNS_4 ||
431 code == CNS_5 ||
432 code == CNS_6 ||
433 code == CNS_7 )
434 return EUC_TW; // Chinese (Taiwan)
435
436 else if ( code == KS_X_1001 )
437 return EUC_KR;
438
439 else if ( code == ISO_8859_1 ||
440 code == ISO_8859_7 ||
441 code == JIS_C_6226_1978 ||
442 code == JIS_X_0208_1983 )
443 // was ISO-2022
444 return JIS; // Japanese
445
446 else return code;
447 // ... and so on.
448 }
449
450
451
452 /** Saves the current state of the init file
453 * @param - globalProps is a non-null properties file whose contents
454 * will be written out to the disk, to the initFile as declared in J3. */
455 protected static void saveInitFile( Properties globalProps ){
456
457 // stupid user
458 if ( globalProps == null )
459 return;
460
461 // create a new File based on initFile
462 FileOutputStream outFile = null;
463 try {
464 outFile = new FileOutputStream( j3.J3.initFile );
465 } catch (FileNotFoundException fnfe ) {
466 System.out.println("Could not open basic init file to write: "
467 + fnfe.toString() );
468 return;
469 }
470
471 // comment for the initFile
472 String comment = new String("J3's initialization file and settings. "
473 + lineSep
474 + "#Please don't mess with this unless you know what you're doing.");
475
476 // try writing the file.
477 try {
478 globalProps.store( outFile , comment );
479 } catch ( Exception e ) {
480 System.err.println("There was a problem writing the non-existent init file,"
481 + j3.J3.initFile );
482 System.err.println("You might check that the path exists, and that you can "
483 + "write to that path");
484 System.err.println("I'll try to continue anyway.");
485 return;
486 }
487 }
488
489
490
491 /** Helper method thtat gets the integer representing the highest
492 * numbered dictionary entry and adds one to it: eg: If your ...properties
493 * file has the following in it:
494 *
495 * Dictionary.1
496 * Dictionary.2
497 * Dictionary.3
498 *
499 * This'll return "4"
500 */
501 protected static String getNextDictionaryNumber( Properties globalProps ) {
502 Integer highest = new Integer(0);
503
504 for (Enumeration e = globalProps.propertyNames(); e.hasMoreElements() ;) {
505 String cur = (String) e.nextElement();
506
507 if ( cur.startsWith( dict + "." )) {
508 Integer temp = new Integer( cur.substring(cur.length()-1) );
509 if ( temp.intValue() > highest.intValue() )
510 highest = temp;
511 }
512 }
513 return new Integer(highest.intValue() + 1).toString();
514 }
515
516
517
518 /**
519 * <code>pause</code> just a sec.
520 *
521 * @param numberMillis a <code>long</code> value
522 */
523 public static void pause( long numberMillis ) {
524
525 Date now = new Date();
526 numberMillis *= 100;
527 long exitTime = now.getTime() + numberMillis;
528
529 while (true) {
530 now = new Date();
531 if (now.getTime() > exitTime)
532 return;
533 }
534 }
535
536
537
538
539 /** example of how to use the code.
540 java -classpath C:\playground\ J3.utils C:\temp\test.sjis.txt
541 or similar will work
542 */
543 public static void main( String[] args ){
544 System.out.println( convertToAcronym(explicate(args[0]) ));
545 TreeMap parsedMap = j3.J3Dict.parseEDICTStyleDictionary( args[0] );
546 // playWithDictionary( parsedMap );
547 }
548 }