Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/mayhoo/kanji/KanjiDicReader.java


1   // $Id: KanjiDicReader.java,v 1.2 2002/04/20 18:10:24 djmay Exp $
2   package com.mayhoo.kanji;
3   
4   import java.io.*;
5   import java.util.*;
6   import java.nio.*;
7   import java.nio.channels.*;
8   import java.nio.charset.*;
9   import com.mayhoo.kanji.xml.*;
10  import com.mayhoo.kanji.xml.types.*;
11  
12  /**
13   * KanjiDicReader enables the reading of the Kanjidic file format
14   * created by Jim Breen. The class reads the file and produces
15   * a data structure related to the XML file format by Duane May,
16   * which was origianlly based on the kanjidic format.
17   * 
18   * The class consists of one function load and the classes constructor.
19   *
20   * A file is imported as follows:
21   *
22   * <code>
23   * KanjiDicReader reader = new KanjiDicReader( filename );
24   * KanjiDic theDictionary = reader.load();
25   * </code>
26   *
27   * @version $Id: KanjiDicReader.java,v 1.2 2002/04/20 18:10:24 djmay Exp $
28   * @author Duane J. May <djmay@mayhoo.com>
29   */
30  public class KanjiDicReader
31  {
32      /** Version information */
33      private final static String VERSION = 
34    "$Id: KanjiDicReader.java,v 1.2 2002/04/20 18:10:24 djmay Exp $";
35  
36      private String m_fileName = null;
37  
38      private int controlHigh = 0x0021;
39      private int csZeroLow = 0x0021;
40      private int csZeroHigh = 0x007E;
41      private int csOneLow = 0x00A1;
42      private int csOneHigh = 0x00FE;
43      private int csTwo = 0x008E;
44      private int csThree = 0x008F;
45  
46      /**
47       * The constructor. Creates a KanjiDic reader reading from the
48       * file, fileName.
49       */
50      public KanjiDicReader(String fileName) 
51      {
52    m_fileName = fileName;
53      }
54    
55      /** 
56       * Loads the file specified in the constructor and returns the
57       * KanjiDic struture containing the information in the file.
58       */
59      public Kanjidic load() 
60      {
61    Kanjidic kanjidic = new Kanjidic();      
62    byte bar[] = null;
63    int fileSize = 0;
64        
65    try {      
66        File file = new File( m_fileName );
67        if ( !file.canRead() || !file.isFile() ) {
68      throw new FileNotFoundException ( 
69         "Cannot could not read or write to the file: " 
70         + m_fileName );
71        }
72        long fileSizeLong = file.length();
73        if ( fileSizeLong > Integer.MAX_VALUE ) {
74      throw new RuntimeException ( 
75            "Cannot allocate more then an ints worth " +
76            "of elements in an array." );
77        }
78        fileSize = (int)fileSizeLong;
79            
80        FileInputStream fin = new FileInputStream( file );
81        FileChannel in = fin.getChannel();
82        
83        FileOutputStream fout = 
84      new FileOutputStream( m_fileName + ".unicode" );
85        FileChannel out = fout.getChannel();
86        Charset unicodeCharset = Charset.forName( "UTF-16" );
87        CharsetEncoder encoder = unicodeCharset.newEncoder();
88        ByteBuffer bb = ByteBuffer.allocateDirect( 2048 );
89        int bbPos = 0;
90        bar = new byte[fileSize];
91        
92        int readBytes = in.read(bb);
93        while( readBytes != -1 ) {
94      bb.flip();
95          
96      bb.get( bar, bbPos, readBytes );
97      bbPos += readBytes;
98              
99      bb.clear();
100         
101     readBytes = in.read(bb);
102       }
103       in.close();
104       fin.close();
105       
106       bb = null;
107       in = null;
108       fin = null;
109   } catch (Exception e) {
110       e.printStackTrace(System.err);
111   }
112       
113   char car[] = new char[fileSize];
114   int j = 0;
115   for( int i = 0; i < fileSize; i++ ) {
116       int b = (int)bar[i];
117       if ( b < 0 ) {
118     b = b & 0x000000FF;
119       }
120       char c = (char)b;
121 
122       if ( b < controlHigh ) {
123     // control characters
124     car[j] = c;
125     j++;
126       } else if ( b >= csZeroLow && b <= csZeroHigh ) {
127     // code set 0
128     car[j] = c;
129     j++;
130       } else if ( b >= csOneLow && b <= csOneHigh ) {
131     // code set 1
132     i++;
133     int b2 = (int)bar[i];
134     if ( b2 < 0 ) {
135         b2 = b2 & 0x000000FF;
136     }
137 
138     if ( b2 < 0xA1 || b2 > 0x000000FE ) {
139         throw new RuntimeException( "Corrupt file, second byte " +
140             "of code set 1 is not within allowed range." );
141     }
142             
143     if ( b == 0x00A4 ) {
144         // hiragana
145         car[j] = (char)( b2 + 0x00002FA0 );
146         j++;                  
147     } else if ( b == 0x00A5 ) {
148         // katakana
149         car[j] = (char)( b2 | 0x3000 );
150         j++;                  
151     } else {
152         // Kanji Character
153         // skip this and recreate it from the unicode value
154     }
155       } else if ( b == csTwo ) {
156                  // code set 2
157     throw new RuntimeException( "code set 2 is not supported" );
158       } else if ( b == csThree ) {
159     // code set 3
160           throw new RuntimeException( "code set 3 is not supported" );
161       } else {
162     throw new RuntimeException( "The character " + b + 
163         " is not allowed in the EUC-JP standard." );
164       }
165   }
166   bar = null;
167 
168   String completeFile = new String( car, 0, j );
169   StringTokenizer lines = new StringTokenizer( completeFile, "\n\r" );
170   int lineNumber = 0;
171   while ( lines.hasMoreTokens() ) {
172       String line = lines.nextToken();
173       lineNumber++;
174       int comment = line.indexOf( '#' );
175       if ( comment >= 0 ) {
176     line = line.substring( 0, comment );
177       }
178         
179       if ( line.length() == 0 ) {
180     continue;
181       }
182         
183       Kanji kanji = new Kanji();
184       kanjidic.addKanji( kanji );
185         
186       boolean hasStroke = false;
187       int field = 0;
188       int nanoriField = 0;
189       int radicalField = 0;
190       StringTokenizer tokens = new StringTokenizer( line );
191       while ( tokens.hasMoreTokens() ) {
192     String token = tokens.nextToken();
193       
194     char type = token.charAt( 0 );
195     char subType;
196     int temp;
197     String content = token.substring( 1 );
198     switch ( type ) {
199     case 'B':
200         temp = Integer.parseInt( content );
201         kanji.setBushu( temp );
202         kanji.setRadical( temp );
203         break;
204     case 'C':
205         temp = Integer.parseInt( content );
206         kanji.setRadical( temp );
207         break;
208     case 'D':
209         subType = content.charAt( 0 );
210         content = content.substring( 1 );
211           
212         switch ( subType ) {
213         case 'R':
214       temp = Integer.parseInt( content );
215       kanji.setDeroo( temp );
216       break;
217         case 'K':
218       temp = Integer.parseInt( content );
219       kanji.setHalpern2( temp );
220       break;
221         case 'O':
222       temp = Integer.parseInt( content );
223       kanji.setOneill( temp );
224       break;
225         }
226         break;
227     case 'E':
228         temp = Integer.parseInt( content );
229         kanji.setHenshall( temp );
230         break;
231     case 'F':
232         temp = Integer.parseInt( content );
233         kanji.setFrequency( temp );
234         break;
235     case 'G':
236         temp = Integer.parseInt( content );
237         switch ( temp ) {
238         case 1:
239         case 2:
240         case 3:
241         case 4:
242         case 5:
243         case 6:
244       kanji.setJouyou( temp );
245       break;
246         case 8:                
247       kanji.setGeneral( true );
248       break;
249         case 9:
250       kanji.setJinmeiyou( true );
251       break;
252         default:
253             throw new RuntimeException( "Grade value is invalid " +
254           " must be 1-6, general(8), or jinmeiyou(9); is " +
255           temp + "; on line " + lineNumber );
256         }
257         break;
258     case 'H':
259         temp = Integer.parseInt( content );
260         kanji.setHalpern( temp );
261         break;
262     case 'I':
263         subType = content.charAt( 0 );
264         content = content.substring( 1 );
265         if ( subType == 'N' ) {
266       temp = Integer.parseInt( content );
267       kanji.setSpahn2( temp );
268         } else {
269       kanji.setSpahn( content );
270         }
271         break;
272     case 'K':
273         temp = Integer.parseInt( content );
274         kanji.setGakken( temp );
275         break;
276     case 'L':
277         temp = Integer.parseInt( content );
278         kanji.setHeisig( temp );
279         break;
280     case 'M':
281         Morohashi morohashi = kanji.getMorohashi();
282         if ( morohashi == null ) {
283       morohashi = new Morohashi();
284       kanji.setMorohashi( morohashi );
285         }
286         subType = content.charAt( 0 );
287         content = content.substring( 1 );
288         if ( subType == 'N' ) {
289       if ( ! content.equals( "0" ) ) {
290           morohashi.setIndex( content );
291       }
292         } else if ( subType == 'P' ) {
293       int index = content.indexOf( '.' );
294       if ( index > 0 ) {
295           temp = Integer.parseInt( 
296               content.substring( 0, index ) );
297           if ( temp != 0 ) {
298         morohashi.setVolume( temp );
299           }
300           temp = Integer.parseInt( 
301               content.substring( index + 1 ) );
302           if ( temp != 0 ) {
303         morohashi.setPage( temp );
304           }
305       }
306         }
307         break;
308     case 'N':
309         temp = Integer.parseInt( content );
310         kanji.setNelson( temp );
311         break;
312     case 'O':
313         kanji.setOneill2( content );
314         break;
315     case 'P':
316         kanji.addSkip( content );
317         break;
318     case 'Q':
319         kanji.addCorner( content );
320         break;
321     case 'S':
322         Stroke stroke = new Stroke();
323         kanji.addStroke( stroke );
324         if ( hasStroke ) {
325       stroke.setMiscount( true );
326         }          
327         hasStroke = true;
328         temp = Integer.parseInt( content );
329         stroke.setContent( temp );
330         break;
331     case 'T':
332         temp = Integer.parseInt( content );
333         if ( temp == 1 ) {
334       nanoriField = field + 1;
335         } else if ( temp == 2 ) {
336       radicalField = field + 1;
337         } else {
338       throw new RuntimeException( "Unsupported T-Reading " +
339           "type " + temp + "; on line " + lineNumber );
340         }
341         break;
342     case 'U':
343         Unicode unicode = new Unicode();
344         kanji.setUnicode( unicode );
345         unicode.setHex( content );
346         temp = Integer.parseInt( content, 16 );
347         unicode.setDec( temp );
348         kanji.setChar( String.valueOf( (char)temp ) );
349         break;
350     case 'V':
351         temp = Integer.parseInt( content );
352         kanji.setHaig( temp );
353         break;
354     case 'W':
355         kanji.addKorean( content );
356         break;
357     case 'X':
358         System.err.println( "Cross Ref: " + content );
359         break;
360     case 'Y':
361         kanji.addPinyin( content );
362         break;
363     case 'Z':
364         //temp = Integer.parseInt( content );
365         //kanji.setNelson( temp );
366         System.err.println( "Mis Class: " + content );
367         break;
368     case '{':
369         StringBuffer buf = new StringBuffer( content );
370         int size = buf.length() - 1;
371         char last = buf.charAt( size );
372         while ( last != '}' ) {
373       buf.append( ' ' )
374           .append( tokens.nextToken() );
375       size = buf.length() - 1;
376       last = buf.charAt( size );
377         }
378         buf.deleteCharAt( size );
379         content = buf.toString();
380         Meaning meaning = new Meaning();
381         kanji.addMeaning( meaning );
382         meaning.setLang( LangType.EN );
383         meaning.setContent( content );
384         break;
385     default:                      
386         if ( field == 0 ) {
387       Jis jis = new Jis();
388       kanji.setJis( jis );
389       jis.setHex( token );
390       temp = Integer.parseInt( token, 16 );
391       jis.setDec( temp );
392         } else {
393       String readingStr = token;
394       Reading reading = new Reading();
395       kanji.addReading( reading );
396             
397       if ( type == '-' ) {
398           reading.setPrefix( true );
399           readingStr = readingStr.substring( 1 );
400           type = readingStr.charAt( 0 );
401       }
402 
403       if ( JapaneseCharacter.isKatakana( type ) ) {
404           reading.setType( TypeType.ONYOMI );
405       } else if ( JapaneseCharacter.isHiragana( type ) ) {
406           reading.setType( TypeType.KUNYOMI );
407       } else {
408           throw new RuntimeException( "Japanese reading " +
409               "is not Katakana nor Hiragana; on line " +
410         lineNumber + "; reading = " + readingStr );
411       }
412       
413       if ( nanoriField == field ) {
414           reading.setNanori( true );
415       }
416 
417       if ( radicalField == field ) {
418           reading.setRadical( true );
419       }
420                   
421       int index = readingStr.indexOf( '-' );
422       if ( index > 0 ) {
423           reading.setSuffix( true );
424           readingStr = readingStr.substring( 0, 
425               readingStr.length() - 1 );
426       }
427             
428       index = readingStr.indexOf( '.' );
429       if ( index > 0 ) {
430           reading.setOkurigana( 
431               readingStr.substring( index + 1 ) );
432           readingStr = readingStr.substring( 0, index );
433       }
434       reading.setContent( readingStr );
435       readingStr = JapaneseString.toRomaji( readingStr );
436       reading.setRomaji( readingStr );
437         }       
438         break;
439     }
440     field++;   
441       }
442       try {
443     kanji.validate();
444       } catch ( Exception e ) {
445     int size = kanjidic.getKanjiCount();
446           kanjidic.removeKanji( size - 1 );
447         System.err.println( "Validation error on input line#: "
448                 + lineNumber );
449     Exception ex = new RuntimeException( "Input file does not " +
450                     "conform to the XML output schema" );
451     ex.printStackTrace( System.err );
452     e.printStackTrace( System.err );
453       }
454   }          
455     
456   return kanjidic;    
457     }
458 }