Source code: com/trapezium/parse/TokenFactory.java
1 /*
2 * @(#)TokenFactory.java
3 *
4 * Copyright (c) 1998 by Trapezium Development LLC. All Rights Reserved.
5 *
6 * The information in this file is the property of Trapezium Development LLC
7 * and may be used only in accordance with the terms of the license granted
8 * by Trapezium.
9 *
10 */
11 package com.trapezium.parse;
12
13 import com.trapezium.util.ReturnInteger;
14 import com.trapezium.util.KeywordList;
15 import com.trapezium.util.ByteString;
16
/**
 * Assigns a range of characters within a line to a token of a particular
 * type. Adapted from the initial 1.0 strategy of creating Token objects.
 * That approach was abandoned due to OutOfMemory exceptions; tokens are now
 * managed within TokenEnumerator using arrays.
 *
 * The optimization that required this has complicated the code, since
 * it was not initially designed this way. The TokenFactory is a single
 * class that has taken the place of several classes in the previous
 * architecture. The port from one architecture to the other was very
 * straightforward, but has resulted in what appears to be a very unusual
 * and messy class.
 *
 * @author Johannes N. Johannsen
 * @version 1.1, 31 Dec 1997
 *
 * @since 1.1
 * @see TokenTypes
 * @see TokenEnumerator
 */
37 public class TokenFactory implements TokenTypes {
38 boolean unterminatedString = false;
39 boolean allowUnterminatedString = true;
40
/** Creates a factory that allows quoted strings to remain unterminated. */
public TokenFactory() {
    this( true );
}

/**
 * Creates a factory.
 *
 * @param allowUnterminatedString whether setUnterminatedString is allowed
 *    to change the unterminated-string flag
 */
public TokenFactory( boolean allowUnterminatedString ) {
    this.allowUnterminatedString = allowUnterminatedString;
}
49
50 void setUnterminatedString( boolean value ) {
51 if ( allowUnterminatedString ) {
52 unterminatedString = value;
53 }
54 }
55
56 /**
57 * create token information for the next token in a string
58 *
59 * @param s the ByteString containing the token
60 * @param offset the start offset of the text to check for the token
61 * @param offsetReturn output parameter with offset of start of next token,
62 * set to -1 if no token is created
63 * @param typeReturn output parameter with type of next token
64 * @param sizeReturn output parameter with size of next token
65 */
66 public void tokenize( ByteString s, int offset, ReturnInteger offsetReturn,
67 ReturnInteger typeReturn, ReturnInteger sizeReturn ) {
68 int len = s.length();
69 if (( len - offset ) == 0 ) {
70 offsetReturn.setValue( 0 );
71 typeReturn.setValue( EmptyLine );
72 sizeReturn.setValue( 1 );
73 return;
74 } else if ( unterminatedString ) {
75 BlackToken_Factory( s, offset, offsetReturn, typeReturn, sizeReturn );
76 return;
77 }
78
79 // skip all white characters, we don't make tokens out of them
80 while (( offset < len ) && isWhiteChar( s.charAt( offset ))) {
81 offset++;
82 }
83
84 // If there is nothing left in this line, set offsetReturn to -1 to indicate this
85 if ( offset == len ) {
86 offsetReturn.setValue( -1 );
87 return;
88 }
89
90 BlackToken_Factory( s, offset, offsetReturn, typeReturn, sizeReturn );
91 }
92
//
// The following static public methods are used for specific character identifications:
//
// isWhiteChar
// isBlackChar
// isSpecialChar
// isCommentChar
// isQuoteChar
// isBracketChar (isLeftBracket, isRightBracket)
// isBraceChar (isLeftBrace, isRightBrace)
// isNumberChar
//
103
104 /** is the character white space? */
105 static public boolean isWhiteChar( char x ) {
106 switch ( x ) {
107 case ' ':
108 case '\t':
109 case ',':
110 case '\n':
111 case '\r':
112 case 0x1a:
113 return( true );
114 default:
115 return( false );
116 }
117 }
118
119 /** is the character black space? */
120 static public boolean isBlackChar( char x ) {
121 if ( isWhiteChar( x )) {
122 return( false );
123 } else {
124 return( true );
125 }
126 }
127
128 /** is the character a special vrml character #[]{}" */
129 static public boolean isSpecialChar( char x ) {
130 return(( x == '#' ) || isQuoteChar( x ) || isBraceChar( x ) || isBracketChar( x ));
131 }
132
133 /** is the character a vrml comment character # */
134 static public boolean isCommentChar( char x ) {
135 if ( x == '#' ) {
136 return( true );
137 } else {
138 return( false );
139 }
140 }
141
142 /** is the character a vrml quote character " */
143 static public boolean isQuoteChar( char x ) {
144 if ( x == '"' ) {
145 return( true );
146 } else {
147 return( false );
148 }
149 }
150
151 /** is the character a bracket [] */
152 static public boolean isBracketChar( char x ) {
153 if (( x == '[' ) || ( x == ']' )) {
154 return( true );
155 } else {
156 return( false );
157 }
158 }
159
160 /** is the character a left bracket? */
161 static public boolean isLeftBracket( char x ) {
162 return( x == '[' );
163 }
164
165 /** is the character a right bracket? */
166 static public boolean isRightBracket( char x ) {
167 return( x == ']' );
168 }
169
170 /** is the character a left brace? */
171 static public boolean isLeftBrace( char x ) {
172 return( x == '{' );
173 }
174
175 /** is the character a right brace? */
176 static public boolean isRightBrace( char x ) {
177 return( x == '}' );
178 }
179
180 /* is the character a brace {} */
181 static public boolean isBraceChar( char x ) {
182 if (( x == '{' ) || ( x == '}' )) {
183 return( true );
184 } else {
185 return( false );
186 }
187 }
188
189 /** could the character be part of a number? */
190 static public boolean isNumberChar( char x ) {
191 switch( x ) {
192 case '0':
193 case '1':
194 case '2':
195 case '3':
196 case '4':
197 case '5':
198 case '6':
199 case '7':
200 case '8':
201 case '9':
202 case '.':
203 case '-':
204 case '+':
205 case 'e':
206 case 'E':
207 return( true );
208 default:
209 return( false );
210 }
211 }
212
213 /** create the next token, already known to exist when this is called
214 *
215 * @param s the ByteString containing the token
216 * @param offset offset of the start of the token
217 * @param offsetReturn output parameter, start offset of the token
218 * @param typeReturn output parameter, type of the token
219 * @param sizeReturn output parameter, size of the token
220 */
221 void BlackToken_Factory( ByteString s, int offset, ReturnInteger offsetReturn,
222 ReturnInteger typeReturn, ReturnInteger sizeReturn ) {
223 if ( unterminatedString ) {
224 typeReturn.setValue( QuotedStringContinuation );
225 QuotedStringContinuation_Factory( s, offset, offsetReturn, sizeReturn );
226 return;
227 }
228
229 // determine type
230 char firstChar = s.charAt( offset );
231
232 if ((( firstChar >= 'a' ) && ( firstChar <= 'z' )) || (( firstChar >= 'A' ) && ( firstChar <= 'Z' ))) {
233 typeReturn.setValue( NameToken );
234 NameToken_Factory( s, offset, offsetReturn, sizeReturn );
235 keywordCheck( s, typeReturn, offset, sizeReturn.getValue() );
236 } else if ( isNumberSequence( s, offset )) {
237 NumberToken_Factory( s, offset, offsetReturn, sizeReturn, typeReturn );
238 } else if ( firstChar == '#' ) {
239 typeReturn.setValue( CommentToken );
240 CommentToken_Factory( s, offset, offsetReturn, sizeReturn );
241 } else if ( firstChar == '[' ) {
242 typeReturn.setValue( LeftBracket );
243 sizeReturn.setValue( 1 );
244 offsetReturn.setValue( offset );
245 } else if ( firstChar == '{' ) {
246 typeReturn.setValue( LeftBrace );
247 sizeReturn.setValue( 1 );
248 offsetReturn.setValue( offset );
249 } else if ( firstChar == ']' ) {
250 typeReturn.setValue( RightBracket );
251 sizeReturn.setValue( 1 );
252 offsetReturn.setValue( offset );
253 } else if ( firstChar == '}' ) {
254 typeReturn.setValue( RightBrace );
255 sizeReturn.setValue( 1 );
256 offsetReturn.setValue( offset );
257 } else if ( firstChar == '"' ) {
258 setUnterminatedString( true );
259 typeReturn.setValue( QuotedString );
260 QuotedString_Factory( s, offset, offsetReturn, sizeReturn );
261 } else {
262 typeReturn.setValue( NameToken );
263 NameToken_Factory( s, offset, offsetReturn, sizeReturn );
264 keywordCheck( s, typeReturn, offset, sizeReturn.getValue() );
265 }
266 }
267
268 /** convert the type to Keyword1Token or Keyword2Token of appropriate */
269 void keywordCheck( ByteString s, ReturnInteger typeReturn, int offset, int size ) {
270 char schar = s.charAt( offset );
271 KeywordList k1 = Keywords.getKeyList1( schar );
272 KeywordList k2 = Keywords.getKeyList2( schar );
273 if ( k1 != null ) {
274 if ( k1.find( s, offset, size )) {
275 typeReturn.setValue( Keyword1Token );
276 return;
277 }
278 }
279 if ( k2 != null ) {
280 if ( k2.find( s, offset, size )) {
281 typeReturn.setValue( Keyword2Token );
282 return;
283 }
284 }
285 }
286
287
288 /** can the character be the first character in a numeric token? */
289 boolean isFirstNumberChar( char x ) {
290 if ( x == '.' ) return( true );
291 if ( x == '+' ) return( true );
292 if ( x == '-' ) return( true );
293 if (( x >= '0' ) && ( x <= '9' )) return( true );
294 return( false );
295 }
296
297 /** is the character a digit? */
298 boolean isDigit( char x ) {
299 return(( x >= '0' ) && ( x <= '9' ));
300 }
301
302 /** is this a number sequence? */
303 boolean isNumberSequence( ByteString s, int offset ) {
304 if ( !isFirstNumberChar( s.charAt( offset ))) {
305 return( false );
306 }
307 boolean firstDigit = isDigit( s.charAt( offset ));
308 offset++;
309 int slen = s.length();
310 boolean additionalChars = false;
311 while (( offset < slen ) && isNumberChar( s.charAt( offset ))) {
312 offset++;
313 additionalChars = true;
314 }
315 if ( !firstDigit && !additionalChars ) {
316 return( false );
317 }
318 if ( offset == slen ) {
319 return( true );
320 }
321 return( isWhiteChar( s.charAt( offset )) || isBracketChar( s.charAt( offset )));
322 }
323
324 /** create a token
325 *
326 * @param s ByteString containing token
327 * @param offset starting offset of token
328 * @param sizeReturn output parameter, size of resulting token
329 */
330 void createToken( ByteString s, int offset, ReturnInteger sizeReturn ) {
331 int numberCharsProcessed = 0;
332 if ( s != null ) {
333 while ( numberCharsProcessed < ( s.length() - offset ) &&
334 isAppropriate( s.charAt( numberCharsProcessed + offset ))) {
335 numberCharsProcessed++;
336 }
337 }
338 sizeReturn.setValue( numberCharsProcessed );
339 }
340
341 /**
342 * is the character appropriate for the token currently being created?
343 */
344 boolean isAppropriate( char x ) {
345 switch( currentType ) {
346 case QuotedString:
347 case QuotedStringContinuation:
348 return( QuotedString_isAppropriate( x ));
349 case CommentToken:
350 return( CommentToken_isAppropriate( x ));
351 case NumberToken:
352 return( NumberToken_isAppropriate( x ));
353 case NameToken:
354 return( NameToken_isAppropriate( x ));
355 default:
356 return( false );
357 }
358 }
359
360
361 // isAppropriate was simpler when Tokens were objects and this was done with
362 // subclasses, now done with switch based on currentType
363 int currentType;
364
365 boolean firstQuoteFound = false;
366 boolean prevCharWasEscape = false;
367 boolean thatsAllFolks = false;
368
369 /** create a quoted string token */
370 public void QuotedString_Factory( ByteString s, int offset, ReturnInteger offsetReturn, ReturnInteger sizeReturn ) {
371 firstQuoteFound = false;
372 prevCharWasEscape = false;
373 thatsAllFolks = false;
374 currentType = QuotedString;
375 offsetReturn.setValue( offset );
376 createToken( s, offset, sizeReturn );
377 }
378
379 /** has a quote been located? */
380 public boolean firstQuoteLocated() {
381 return( firstQuoteFound );
382 }
383
384 /** is the character valid for a quoted string? */
385 public boolean QuotedString_isAppropriate( char x ) {
386 if ( thatsAllFolks ) {
387 return( false );
388 } else if ( !firstQuoteLocated() ) {
389 if ( x == '"' ) {
390 firstQuoteFound = true;
391 return( true );
392 } else {
393 return( false );
394 }
395 } else {
396 if ( x == '\\' ) {
397 prevCharWasEscape = true;
398 return( true );
399 } else if ( x == '"' ) {
400 if ( !prevCharWasEscape ) {
401 thatsAllFolks = true;
402 unterminatedString = false;
403 return( true );
404 }
405 prevCharWasEscape = false;
406 return( true );
407 } else {
408 prevCharWasEscape = false;
409 return( true );
410 }
411 }
412 }
413
414 /** create a quoted string continuation token */
415 public void QuotedStringContinuation_Factory( ByteString s, int offset, ReturnInteger offsetReturn, ReturnInteger sizeReturn ) {
416 firstQuoteFound = true;
417 prevCharWasEscape = false;
418 thatsAllFolks = false;
419 currentType = QuotedStringContinuation;
420 offsetReturn.setValue( offset );
421 createToken( s, offset, sizeReturn );
422 }
423
424 /** create a comment token */
425 public void CommentToken_Factory( ByteString s, int offset, ReturnInteger offsetReturn, ReturnInteger sizeReturn ) {
426 currentType = CommentToken;
427 offsetReturn.setValue( offset );
428 createToken( s, offset, sizeReturn );
429 }
430
431 /** is the character appropriate for a comment token */
432 public boolean CommentToken_isAppropriate( char x ) {
433 if (( x == '\n' ) || ( x == '\r' )) {
434 return( false );
435 } else {
436 return( true );
437 }
438 }
439
440 /** create a number token */
441 public void NumberToken_Factory( ByteString s, int offset,
442 ReturnInteger offsetReturn, ReturnInteger sizeReturn, ReturnInteger typeReturn ) {
443 currentType = NumberToken;
444 offsetReturn.setValue( offset );
445 createToken( s, offset, sizeReturn );
446 setNumberTypeReturn( s, offset, sizeReturn.getValue(), typeReturn );
447 }
448
449
450 boolean isBadValue( int ndigits, ByteString s, int offset, int size, boolean eAllowed, boolean dotAllowed ) {
451 if ( ndigits == size ) {
452 return( false );
453 } else {
454 boolean result = true;
455 char nextchar = s.charAt( offset + ndigits );
456 if ( eAllowed ) {
457 if (( nextchar == 'e' ) || ( nextchar == 'E' )) {
458 return( false );
459 }
460 }
461 if ( dotAllowed ) {
462 if ( nextchar == '.' ) {
463 return( false );
464 }
465 }
466 return( true );
467 }
468 }
469
470 public int getNumberValue( ByteString s, int offset, int size ) {
471 int result = 0;
472 int base = 10;
473 for ( int i = 0; i < size; i++ ) {
474 char c = s.charAt( offset + i );
475 if (( c >= '0' ) && ( c <= '9' )) {
476 int n = c - '0';
477 result = result*base + n;
478 } else {
479 break;
480 }
481 }
482 return( result );
483 }
484
485 public int getNumberValue2( ByteString s, int offset, int size ) {
486 if ( s.charAt( offset ) == '0' ) {
487 return( 999 );
488 } else {
489 return( getNumberValue( s, offset, size ));
490 }
491 }
492
493 public int getNumberDigits( ByteString s, int offset, int size ) {
494 int numberDigits = 0;
495 for ( int i = 0; i < size; i++ ) {
496 char c = s.charAt( offset + i );
497 if (( c >= '0' ) && ( c <= '9' )) {
498 numberDigits++;
499 } else {
500 break;
501 }
502 }
503 return( numberDigits );
504 }
505
506
507 public void setNumberTypeReturn( ByteString s, int offset, int size, ReturnInteger typeReturn ) {
508 // now simplified to set type to either NumberToken or BadNumber
509
510 // eat initial +,-,.
511 char firstChar = s.charAt( offset );
512 boolean gotDot = false;
513 boolean gotDigits = false;
514 if (( firstChar == '+' ) || ( firstChar == '-' ) || ( firstChar == '.' )) {
515 offset++;
516 size--;
517 if ( firstChar == '.' ) {
518 gotDot = true;
519 }
520 }
521 if (( firstChar == '+' ) || ( firstChar == '-' )) {
522 char nextChar = s.charAt( offset );
523 if ( nextChar == '.' ) {
524 gotDot = true;
525 offset++;
526 size--;
527 }
528 }
529
530 // get digits
531 while (( size > 0 ) && isDigit( s.charAt( offset ))) {
532 gotDigits = true;
533 offset++;
534 size--;
535 }
536
537 if ( !gotDigits ) {
538 typeReturn.setValue( BadNumber );
539 return;
540 }
541
542 // If there is more, it has to be either a '.' or an 'e' or an 'E'
543 boolean gotE = false;
544 if ( size == 0 ) {
545 typeReturn.setValue( NumberToken );
546 return;
547 }
548 char nextChar = s.charAt( offset );
549 if (( nextChar == '.' ) && gotDot ) {
550 typeReturn.setValue( BadNumber );
551 return;
552 }
553 if ( nextChar == '.' ) {
554 gotDot = true;
555 } else if (( nextChar == 'e' ) || ( nextChar == 'E' )) {
556 gotE = true;
557 } else {
558 typeReturn.setValue( BadNumber );
559 return;
560 }
561 offset++;
562 size--;
563
564 // if we got an E, there has to be more
565 if ( gotE ) {
566 if ( size == 0 ) {
567 typeReturn.setValue( BadNumber );
568 return;
569 }
570 }
571
572 // if we got an E, the next char may be a sign
573 if ( gotE ) {
574 nextChar = s.charAt( offset );
575 if (( nextChar == '+' ) || ( nextChar == '-' )) {
576 offset++;
577 size--;
578 }
579 }
580
581 // check for more digits
582 while (( size > 0 ) && isDigit( s.charAt( offset ))) {
583 offset++;
584 size--;
585 }
586
587 // If there is more, and we haven't got an 'e' or 'E' yet, that is
588 // the only valid thing
589 if ( size == 0 ) {
590 typeReturn.setValue( NumberToken );
591 return;
592 }
593
594 if ( gotE ) {
595 typeReturn.setValue( BadNumber );
596 return;
597 }
598
599 nextChar = s.charAt( offset );
600 if (( nextChar != 'e' ) && ( nextChar != 'E' )) {
601 typeReturn.setValue( BadNumber );
602 return;
603 }
604 offset++;
605 size--;
606 if ( size == 0 ) {
607 typeReturn.setValue( BadNumber );
608 return;
609 }
610
611 // After the 'e', sign is allowed
612 nextChar = s.charAt( offset );
613 if (( nextChar == '+' ) || ( nextChar == '-' )) {
614 offset++;
615 size--;
616 }
617
618 if ( size == 0 ) {
619 typeReturn.setValue( BadNumber );
620 return;
621 }
622
623 while (( size > 0 ) && isDigit( s.charAt( offset ))) {
624 size--;
625 offset++;
626 }
627
628 if ( size != 0 ) {
629 typeReturn.setValue( BadNumber );
630 } else {
631 typeReturn.setValue( NumberToken );
632 }
633 }
634
635 /** is the character appropriate for a number token */
636 public boolean NumberToken_isAppropriate( char x ) {
637 return( isNumberChar( x ));
638 }
639
640 /** create a name token
641 *
642 * @param s the text containing the token
643 * @param offset offset in ByteString of the start of the token
644 * @param offsetReturn output parameter, offset of start of token
645 * @param sizeReturn output parameter, size of the token
646 */
647 public void NameToken_Factory( ByteString s, int offset, ReturnInteger offsetReturn, ReturnInteger sizeReturn ) {
648 offsetReturn.setValue( offset );
649 currentType = NameToken;
650 createToken( s, offset, sizeReturn );
651 }
652
653 /** is the character appropriate for a name token */
654 public boolean NameToken_isAppropriate( char x ) {
655 return( !isWhiteChar( x ) && !isSpecialChar( x ));
656 }
657
658 }