Source code: com/mayhoo/kanji/KanjiDicReader.java
1 // $Id: KanjiDicReader.java,v 1.2 2002/04/20 18:10:24 djmay Exp $
2 package com.mayhoo.kanji;
3
4 import java.io.*;
5 import java.util.*;
6 import java.nio.*;
7 import java.nio.channels.*;
8 import java.nio.charset.*;
9 import com.mayhoo.kanji.xml.*;
10 import com.mayhoo.kanji.xml.types.*;
11
12 /**
13 * KanjiDicReader enables the reading of the Kanjidic file format
14 * created by Jim Breen. The class reads the file and produces
15 * a data structure related to the XML file format by Duane May,
16 * which was originally based on the kanjidic format.
17 *
18 * The class consists of one method, load, and the class's constructor.
19 *
20 * A file is imported as follows:
21 *
22 * <code>
23 * KanjiDicReader reader = new KanjiDicReader( filename );
24 * KanjiDic theDictionary = reader.load();
25 * </code>
26 *
27 * @version $Id: KanjiDicReader.java,v 1.2 2002/04/20 18:10:24 djmay Exp $
28 * @author Duane J. May <djmay@mayhoo.com>
29 */
30 public class KanjiDicReader
31 {
32 /** Version information */
33 private final static String VERSION =
34 "$Id: KanjiDicReader.java,v 1.2 2002/04/20 18:10:24 djmay Exp $";
35
36 private String m_fileName = null;
37
38 private int controlHigh = 0x0021;
39 private int csZeroLow = 0x0021;
40 private int csZeroHigh = 0x007E;
41 private int csOneLow = 0x00A1;
42 private int csOneHigh = 0x00FE;
43 private int csTwo = 0x008E;
44 private int csThree = 0x008F;
45
46 /**
47 * The constructor. Creates a KanjiDic reader reading from the
48 * file, fileName.
49 */
50 public KanjiDicReader(String fileName)
51 {
52 m_fileName = fileName;
53 }
54
55 /**
56 * Loads the file specified in the constructor and returns the
57 * KanjiDic struture containing the information in the file.
58 */
59 public Kanjidic load()
60 {
61 Kanjidic kanjidic = new Kanjidic();
62 byte bar[] = null;
63 int fileSize = 0;
64
65 try {
66 File file = new File( m_fileName );
67 if ( !file.canRead() || !file.isFile() ) {
68 throw new FileNotFoundException (
69 "Cannot could not read or write to the file: "
70 + m_fileName );
71 }
72 long fileSizeLong = file.length();
73 if ( fileSizeLong > Integer.MAX_VALUE ) {
74 throw new RuntimeException (
75 "Cannot allocate more then an ints worth " +
76 "of elements in an array." );
77 }
78 fileSize = (int)fileSizeLong;
79
80 FileInputStream fin = new FileInputStream( file );
81 FileChannel in = fin.getChannel();
82
83 FileOutputStream fout =
84 new FileOutputStream( m_fileName + ".unicode" );
85 FileChannel out = fout.getChannel();
86 Charset unicodeCharset = Charset.forName( "UTF-16" );
87 CharsetEncoder encoder = unicodeCharset.newEncoder();
88 ByteBuffer bb = ByteBuffer.allocateDirect( 2048 );
89 int bbPos = 0;
90 bar = new byte[fileSize];
91
92 int readBytes = in.read(bb);
93 while( readBytes != -1 ) {
94 bb.flip();
95
96 bb.get( bar, bbPos, readBytes );
97 bbPos += readBytes;
98
99 bb.clear();
100
101 readBytes = in.read(bb);
102 }
103 in.close();
104 fin.close();
105
106 bb = null;
107 in = null;
108 fin = null;
109 } catch (Exception e) {
110 e.printStackTrace(System.err);
111 }
112
113 char car[] = new char[fileSize];
114 int j = 0;
115 for( int i = 0; i < fileSize; i++ ) {
116 int b = (int)bar[i];
117 if ( b < 0 ) {
118 b = b & 0x000000FF;
119 }
120 char c = (char)b;
121
122 if ( b < controlHigh ) {
123 // control characters
124 car[j] = c;
125 j++;
126 } else if ( b >= csZeroLow && b <= csZeroHigh ) {
127 // code set 0
128 car[j] = c;
129 j++;
130 } else if ( b >= csOneLow && b <= csOneHigh ) {
131 // code set 1
132 i++;
133 int b2 = (int)bar[i];
134 if ( b2 < 0 ) {
135 b2 = b2 & 0x000000FF;
136 }
137
138 if ( b2 < 0xA1 || b2 > 0x000000FE ) {
139 throw new RuntimeException( "Corrupt file, second byte " +
140 "of code set 1 is not within allowed range." );
141 }
142
143 if ( b == 0x00A4 ) {
144 // hiragana
145 car[j] = (char)( b2 + 0x00002FA0 );
146 j++;
147 } else if ( b == 0x00A5 ) {
148 // katakana
149 car[j] = (char)( b2 | 0x3000 );
150 j++;
151 } else {
152 // Kanji Character
153 // skip this and recreate it from the unicode value
154 }
155 } else if ( b == csTwo ) {
156 // code set 2
157 throw new RuntimeException( "code set 2 is not supported" );
158 } else if ( b == csThree ) {
159 // code set 3
160 throw new RuntimeException( "code set 3 is not supported" );
161 } else {
162 throw new RuntimeException( "The character " + b +
163 " is not allowed in the EUC-JP standard." );
164 }
165 }
166 bar = null;
167
168 String completeFile = new String( car, 0, j );
169 StringTokenizer lines = new StringTokenizer( completeFile, "\n\r" );
170 int lineNumber = 0;
171 while ( lines.hasMoreTokens() ) {
172 String line = lines.nextToken();
173 lineNumber++;
174 int comment = line.indexOf( '#' );
175 if ( comment >= 0 ) {
176 line = line.substring( 0, comment );
177 }
178
179 if ( line.length() == 0 ) {
180 continue;
181 }
182
183 Kanji kanji = new Kanji();
184 kanjidic.addKanji( kanji );
185
186 boolean hasStroke = false;
187 int field = 0;
188 int nanoriField = 0;
189 int radicalField = 0;
190 StringTokenizer tokens = new StringTokenizer( line );
191 while ( tokens.hasMoreTokens() ) {
192 String token = tokens.nextToken();
193
194 char type = token.charAt( 0 );
195 char subType;
196 int temp;
197 String content = token.substring( 1 );
198 switch ( type ) {
199 case 'B':
200 temp = Integer.parseInt( content );
201 kanji.setBushu( temp );
202 kanji.setRadical( temp );
203 break;
204 case 'C':
205 temp = Integer.parseInt( content );
206 kanji.setRadical( temp );
207 break;
208 case 'D':
209 subType = content.charAt( 0 );
210 content = content.substring( 1 );
211
212 switch ( subType ) {
213 case 'R':
214 temp = Integer.parseInt( content );
215 kanji.setDeroo( temp );
216 break;
217 case 'K':
218 temp = Integer.parseInt( content );
219 kanji.setHalpern2( temp );
220 break;
221 case 'O':
222 temp = Integer.parseInt( content );
223 kanji.setOneill( temp );
224 break;
225 }
226 break;
227 case 'E':
228 temp = Integer.parseInt( content );
229 kanji.setHenshall( temp );
230 break;
231 case 'F':
232 temp = Integer.parseInt( content );
233 kanji.setFrequency( temp );
234 break;
235 case 'G':
236 temp = Integer.parseInt( content );
237 switch ( temp ) {
238 case 1:
239 case 2:
240 case 3:
241 case 4:
242 case 5:
243 case 6:
244 kanji.setJouyou( temp );
245 break;
246 case 8:
247 kanji.setGeneral( true );
248 break;
249 case 9:
250 kanji.setJinmeiyou( true );
251 break;
252 default:
253 throw new RuntimeException( "Grade value is invalid " +
254 " must be 1-6, general(8), or jinmeiyou(9); is " +
255 temp + "; on line " + lineNumber );
256 }
257 break;
258 case 'H':
259 temp = Integer.parseInt( content );
260 kanji.setHalpern( temp );
261 break;
262 case 'I':
263 subType = content.charAt( 0 );
264 content = content.substring( 1 );
265 if ( subType == 'N' ) {
266 temp = Integer.parseInt( content );
267 kanji.setSpahn2( temp );
268 } else {
269 kanji.setSpahn( content );
270 }
271 break;
272 case 'K':
273 temp = Integer.parseInt( content );
274 kanji.setGakken( temp );
275 break;
276 case 'L':
277 temp = Integer.parseInt( content );
278 kanji.setHeisig( temp );
279 break;
280 case 'M':
281 Morohashi morohashi = kanji.getMorohashi();
282 if ( morohashi == null ) {
283 morohashi = new Morohashi();
284 kanji.setMorohashi( morohashi );
285 }
286 subType = content.charAt( 0 );
287 content = content.substring( 1 );
288 if ( subType == 'N' ) {
289 if ( ! content.equals( "0" ) ) {
290 morohashi.setIndex( content );
291 }
292 } else if ( subType == 'P' ) {
293 int index = content.indexOf( '.' );
294 if ( index > 0 ) {
295 temp = Integer.parseInt(
296 content.substring( 0, index ) );
297 if ( temp != 0 ) {
298 morohashi.setVolume( temp );
299 }
300 temp = Integer.parseInt(
301 content.substring( index + 1 ) );
302 if ( temp != 0 ) {
303 morohashi.setPage( temp );
304 }
305 }
306 }
307 break;
308 case 'N':
309 temp = Integer.parseInt( content );
310 kanji.setNelson( temp );
311 break;
312 case 'O':
313 kanji.setOneill2( content );
314 break;
315 case 'P':
316 kanji.addSkip( content );
317 break;
318 case 'Q':
319 kanji.addCorner( content );
320 break;
321 case 'S':
322 Stroke stroke = new Stroke();
323 kanji.addStroke( stroke );
324 if ( hasStroke ) {
325 stroke.setMiscount( true );
326 }
327 hasStroke = true;
328 temp = Integer.parseInt( content );
329 stroke.setContent( temp );
330 break;
331 case 'T':
332 temp = Integer.parseInt( content );
333 if ( temp == 1 ) {
334 nanoriField = field + 1;
335 } else if ( temp == 2 ) {
336 radicalField = field + 1;
337 } else {
338 throw new RuntimeException( "Unsupported T-Reading " +
339 "type " + temp + "; on line " + lineNumber );
340 }
341 break;
342 case 'U':
343 Unicode unicode = new Unicode();
344 kanji.setUnicode( unicode );
345 unicode.setHex( content );
346 temp = Integer.parseInt( content, 16 );
347 unicode.setDec( temp );
348 kanji.setChar( String.valueOf( (char)temp ) );
349 break;
350 case 'V':
351 temp = Integer.parseInt( content );
352 kanji.setHaig( temp );
353 break;
354 case 'W':
355 kanji.addKorean( content );
356 break;
357 case 'X':
358 System.err.println( "Cross Ref: " + content );
359 break;
360 case 'Y':
361 kanji.addPinyin( content );
362 break;
363 case 'Z':
364 //temp = Integer.parseInt( content );
365 //kanji.setNelson( temp );
366 System.err.println( "Mis Class: " + content );
367 break;
368 case '{':
369 StringBuffer buf = new StringBuffer( content );
370 int size = buf.length() - 1;
371 char last = buf.charAt( size );
372 while ( last != '}' ) {
373 buf.append( ' ' )
374 .append( tokens.nextToken() );
375 size = buf.length() - 1;
376 last = buf.charAt( size );
377 }
378 buf.deleteCharAt( size );
379 content = buf.toString();
380 Meaning meaning = new Meaning();
381 kanji.addMeaning( meaning );
382 meaning.setLang( LangType.EN );
383 meaning.setContent( content );
384 break;
385 default:
386 if ( field == 0 ) {
387 Jis jis = new Jis();
388 kanji.setJis( jis );
389 jis.setHex( token );
390 temp = Integer.parseInt( token, 16 );
391 jis.setDec( temp );
392 } else {
393 String readingStr = token;
394 Reading reading = new Reading();
395 kanji.addReading( reading );
396
397 if ( type == '-' ) {
398 reading.setPrefix( true );
399 readingStr = readingStr.substring( 1 );
400 type = readingStr.charAt( 0 );
401 }
402
403 if ( JapaneseCharacter.isKatakana( type ) ) {
404 reading.setType( TypeType.ONYOMI );
405 } else if ( JapaneseCharacter.isHiragana( type ) ) {
406 reading.setType( TypeType.KUNYOMI );
407 } else {
408 throw new RuntimeException( "Japanese reading " +
409 "is not Katakana nor Hiragana; on line " +
410 lineNumber + "; reading = " + readingStr );
411 }
412
413 if ( nanoriField == field ) {
414 reading.setNanori( true );
415 }
416
417 if ( radicalField == field ) {
418 reading.setRadical( true );
419 }
420
421 int index = readingStr.indexOf( '-' );
422 if ( index > 0 ) {
423 reading.setSuffix( true );
424 readingStr = readingStr.substring( 0,
425 readingStr.length() - 1 );
426 }
427
428 index = readingStr.indexOf( '.' );
429 if ( index > 0 ) {
430 reading.setOkurigana(
431 readingStr.substring( index + 1 ) );
432 readingStr = readingStr.substring( 0, index );
433 }
434 reading.setContent( readingStr );
435 readingStr = JapaneseString.toRomaji( readingStr );
436 reading.setRomaji( readingStr );
437 }
438 break;
439 }
440 field++;
441 }
442 try {
443 kanji.validate();
444 } catch ( Exception e ) {
445 int size = kanjidic.getKanjiCount();
446 kanjidic.removeKanji( size - 1 );
447 System.err.println( "Validation error on input line#: "
448 + lineNumber );
449 Exception ex = new RuntimeException( "Input file does not " +
450 "conform to the XML output schema" );
451 ex.printStackTrace( System.err );
452 e.printStackTrace( System.err );
453 }
454 }
455
456 return kanjidic;
457 }
458 }