Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/trapezium/parse/TokenFactory.java


1   /*
2    * @(#)TokenFactory.java
3    *
4    * Copyright (c) 1998 by Trapezium Development LLC.  All Rights Reserved.
5    *
6    * The information in this file is the property of Trapezium Development LLC
7    * and may be used only in accordance with the terms of the license granted
8    * by Trapezium.
9    *
10   */
11  package com.trapezium.parse;
12  
13  import com.trapezium.util.ReturnInteger;
14  import com.trapezium.util.KeywordList;
15  import com.trapezium.util.ByteString;
16  
17  /**
18   *  Assigns a range of characters within a line to a token of a particular
19   *  type.  Adapted from initial 1.0 strategy of creating Token objects.
20   *  This approach abandoned due to OutOfMemory exception, tokens now
21   *  managed within TokenEnumerator with arrays.
22   *
23   *  The optimization that required this has complicated the code, since
24   *  it was not initially designed this way.  The TokenFactory is a single
25   *  class that has taken the place of several classes in the previous
26   *  architecture.  The port from one architecture to the other was very
27   *  straightforward, but has resulted in what appears to be very unusual
28   *  and messy class.
29   *
30   *  @author          Johannes N. Johannsen
31   *  @version         1.1, 31 Dec 1997
32   *
33   *  @since           1.1
34   *  @see             TokenTypes
35   *  @see             TokenEnumerator
36   */
37  public class TokenFactory implements TokenTypes {
38      boolean unterminatedString = false;
39      boolean allowUnterminatedString = true;
40  
41      /** class constructor */
42      public TokenFactory() {
43          this( true );
44      }
45      
46      public TokenFactory( boolean allowUnterminatedString ) {
47          this.allowUnterminatedString = allowUnterminatedString;
48      }
49          
50      void setUnterminatedString( boolean value ) {
51          if ( allowUnterminatedString ) {
52              unterminatedString = value;
53          }
54      }
55  
56      /**
57       *  create token information for the next token in a string
58       *
59       *  @param  s      the ByteString containing the token
60       *  @param  offset the start offset of the text to check for the token
61       *  @param  offsetReturn  output parameter with offset of start of next token,
62       *                        set to -1 if no token is created
63       *  @param  typeReturn    output parameter with type of next token
64       *  @param  sizeReturn    output parameter with size of next token
65       */
66    public void tokenize( ByteString s, int offset, ReturnInteger offsetReturn,
67        ReturnInteger typeReturn, ReturnInteger sizeReturn ) {
68        int len = s.length();
69      if (( len - offset ) == 0 ) {
70          offsetReturn.setValue( 0 );
71          typeReturn.setValue( EmptyLine );
72          sizeReturn.setValue( 1 );
73          return;
74      } else if ( unterminatedString ) {
75        BlackToken_Factory( s, offset, offsetReturn, typeReturn, sizeReturn );
76        return;
77      }
78  
79          // skip all white characters, we don't make tokens out of them
80      while (( offset < len ) && isWhiteChar( s.charAt( offset ))) {
81          offset++;
82      }
83  
84      // If there is nothing left in this line, set offsetReturn to -1 to indicate this
85      if ( offset == len ) {
86          offsetReturn.setValue( -1 );
87          return;
88      }
89  
90          BlackToken_Factory( s, offset, offsetReturn, typeReturn, sizeReturn );
91    }
92  
93    //
94    //  The following static public methods are used for specific character identifications:
95    //
96    //     isWhiteChar
97    //     isBlackChar
98    //     isCommentChar
99    //     isBracketChar
100   //     isBraceChar
101   //     isNumber
102   //
103 
104     /** is the character white space? */
105   static public boolean isWhiteChar( char x ) {
106     switch ( x ) {
107     case ' ':
108     case '\t':
109     case ',':
110     case '\n':
111     case '\r':
112     case 0x1a:
113       return( true );
114     default:
115       return( false );
116     }
117   }
118 
119     /** is the character black space? */
120   static public boolean isBlackChar( char x ) {
121     if ( isWhiteChar( x )) {
122       return( false );
123     } else {
124       return( true );
125     }
126   }
127 
128     /** is the character a special vrml character #[]{}" */
129   static public boolean isSpecialChar( char x ) {
130     return(( x == '#' ) || isQuoteChar( x ) || isBraceChar( x ) || isBracketChar( x ));
131   }
132 
133     /** is the character a vrml comment character # */
134   static public boolean isCommentChar( char x ) {
135     if ( x == '#' ) {
136       return( true );
137     } else {
138       return( false );
139     }
140   }
141 
142     /** is the character a vrml quote character "  */
143   static public boolean isQuoteChar( char x ) {
144     if ( x == '"' ) {
145       return( true );
146     } else {
147       return( false );
148     }
149   }
150 
151     /** is the character a bracket [] */
152   static public boolean isBracketChar( char x ) {
153     if (( x == '[' ) || ( x == ']' )) {
154       return( true );
155     } else {
156       return( false );
157     }
158   }
159 
160     /** is the character a left bracket? */
161   static public boolean isLeftBracket( char x ) {
162       return( x == '[' );
163   }
164 
165     /** is the character a right bracket? */
166   static public boolean isRightBracket( char x ) {
167       return( x == ']' );
168   }
169 
170     /** is the character a left brace? */
171   static public boolean isLeftBrace( char x ) {
172       return( x == '{' );
173   }
174 
175     /** is the character a right brace? */
176   static public boolean isRightBrace( char x ) {
177       return( x == '}' );
178   }
179 
180     /* is the character a brace {} */
181   static public boolean isBraceChar( char x ) {
182     if (( x == '{' ) || ( x == '}' )) {
183       return( true );
184     } else {
185       return( false );
186     }
187   }
188 
189     /** could the character be part of a number? */
190   static public boolean isNumberChar( char x ) {
191     switch( x ) {
192     case '0':
193     case '1':
194     case '2':
195     case '3':
196     case '4':
197     case '5':
198     case '6':
199     case '7':
200     case '8':
201     case '9':
202     case '.':
203     case '-':
204     case '+':
205     case 'e':
206     case 'E':
207       return( true );
208     default:
209       return( false );
210     }
211   }
212 
213     /** create the next token, already known to exist when this is called
214      *
215      *  @param s      the ByteString containing the token
216      *  @param offset offset of the start of the token
217      *  @param offsetReturn  output parameter, start offset of the token
218      *  @param typeReturn    output parameter, type of the token
219      *  @param sizeReturn    output parameter, size of the token
220      */
221   void BlackToken_Factory( ByteString s, int offset, ReturnInteger offsetReturn,
222       ReturnInteger typeReturn, ReturnInteger sizeReturn ) {
223     if ( unterminatedString ) {
224         typeReturn.setValue( QuotedStringContinuation );
225         QuotedStringContinuation_Factory( s, offset, offsetReturn, sizeReturn );
226         return;
227     }
228 
229     // determine type
230     char firstChar = s.charAt( offset );
231 
232     if ((( firstChar >= 'a' ) && ( firstChar <= 'z' )) || (( firstChar >= 'A' ) && ( firstChar <= 'Z' ))) {
233         typeReturn.setValue( NameToken );
234       NameToken_Factory( s, offset, offsetReturn, sizeReturn );
235       keywordCheck( s, typeReturn, offset, sizeReturn.getValue() );
236     } else if ( isNumberSequence( s, offset )) {
237       NumberToken_Factory( s, offset, offsetReturn, sizeReturn, typeReturn );
238     } else if ( firstChar == '#' ) {
239         typeReturn.setValue( CommentToken );
240       CommentToken_Factory( s, offset, offsetReturn, sizeReturn );
241     } else if ( firstChar == '[' ) {
242         typeReturn.setValue( LeftBracket );
243         sizeReturn.setValue( 1 );
244         offsetReturn.setValue( offset );
245     } else if ( firstChar == '{' ) {
246         typeReturn.setValue( LeftBrace );
247         sizeReturn.setValue( 1 );
248         offsetReturn.setValue( offset );
249     } else if ( firstChar == ']' ) {
250         typeReturn.setValue( RightBracket );
251         sizeReturn.setValue( 1 );
252         offsetReturn.setValue( offset );
253     } else if ( firstChar == '}' ) {
254         typeReturn.setValue( RightBrace );
255         sizeReturn.setValue( 1 );
256         offsetReturn.setValue( offset );
257     } else if ( firstChar == '"' ) {
258         setUnterminatedString( true );
259       typeReturn.setValue( QuotedString );
260       QuotedString_Factory( s, offset, offsetReturn, sizeReturn );
261     } else {
262         typeReturn.setValue( NameToken );
263       NameToken_Factory( s, offset, offsetReturn, sizeReturn );
264       keywordCheck( s, typeReturn, offset, sizeReturn.getValue() );
265     }
266   }
267   
268   /** convert the type to Keyword1Token or Keyword2Token of appropriate */
269   void keywordCheck( ByteString s, ReturnInteger typeReturn, int offset, int size ) {
270       char schar = s.charAt( offset );
271       KeywordList k1 = Keywords.getKeyList1( schar );
272       KeywordList k2 = Keywords.getKeyList2( schar );
273       if ( k1 != null ) {
274           if ( k1.find( s, offset, size )) {
275               typeReturn.setValue( Keyword1Token );
276               return;
277           }
278       }
279       if ( k2 != null ) {
280           if ( k2.find( s, offset, size )) {
281               typeReturn.setValue( Keyword2Token );
282               return;
283           }
284       }
285   }
286   
287 
288     /** can the character be the first character in a numeric token? */
289     boolean isFirstNumberChar( char x ) {
290         if ( x == '.' ) return( true );
291         if ( x == '+' ) return( true );
292         if ( x == '-' ) return( true );
293         if (( x >= '0' ) && ( x <= '9' )) return( true );
294         return( false );
295     }
296     
297     /** is the character a digit? */
298     boolean isDigit( char x ) {
299         return(( x >= '0' ) && ( x <= '9' ));
300     }
301     
302     /** is this a number sequence? */
303     boolean isNumberSequence( ByteString s, int offset ) {
304         if ( !isFirstNumberChar( s.charAt( offset ))) {
305             return( false );
306         }
307         boolean firstDigit = isDigit( s.charAt( offset ));
308         offset++;
309         int slen = s.length();
310         boolean additionalChars = false;
311         while (( offset < slen ) && isNumberChar( s.charAt( offset ))) {
312             offset++;
313             additionalChars = true;
314         }
315         if ( !firstDigit && !additionalChars ) {
316             return( false );
317         }
318         if ( offset == slen ) {
319             return( true );
320         }
321         return( isWhiteChar( s.charAt( offset )) || isBracketChar( s.charAt( offset )));
322     }
323     
324     /** create a token 
325      *  
326      *  @param s      ByteString containing token
327      *  @param offset starting offset of token
328      *  @param sizeReturn  output parameter, size of resulting token
329      */
330   void createToken( ByteString s, int offset, ReturnInteger sizeReturn ) {
331     int numberCharsProcessed = 0;
332     if ( s != null ) {
333       while ( numberCharsProcessed < ( s.length() - offset ) &&
334         isAppropriate( s.charAt( numberCharsProcessed + offset ))) {
335         numberCharsProcessed++;
336       }
337     }
338     sizeReturn.setValue( numberCharsProcessed );
339   }
340 
341     /**
342      *  is the character appropriate for the token currently being created?
343      */
344   boolean isAppropriate( char x ) {
345       switch( currentType ) {
346         case QuotedString:
347         case QuotedStringContinuation:
348             return( QuotedString_isAppropriate( x ));
349         case CommentToken:
350             return( CommentToken_isAppropriate( x ));
351         case NumberToken:
352             return( NumberToken_isAppropriate( x ));
353         case NameToken:
354             return( NameToken_isAppropriate( x ));
355         default:
356             return( false );
357         }
358     }
359 
360 
361   // isAppropriate was simpler when Tokens were objects and this was done with
362     // subclasses, now done with switch based on currentType
363     int currentType;
364 
365   boolean firstQuoteFound = false;
366   boolean prevCharWasEscape = false;
367   boolean thatsAllFolks = false;
368 
369     /** create a quoted string token */
370   public void QuotedString_Factory( ByteString s, int offset, ReturnInteger offsetReturn, ReturnInteger sizeReturn ) {
371       firstQuoteFound = false;
372       prevCharWasEscape = false;
373       thatsAllFolks = false;
374       currentType = QuotedString;
375       offsetReturn.setValue( offset );
376       createToken( s, offset, sizeReturn );
377   }
378 
379     /** has a quote been located? */
380   public boolean firstQuoteLocated() {
381     return( firstQuoteFound );
382   }
383 
384     /** is the character valid for a quoted string? */
385   public boolean QuotedString_isAppropriate( char x ) {
386     if ( thatsAllFolks ) {
387       return( false );
388     } else if ( !firstQuoteLocated() ) {
389       if ( x == '"' ) {
390         firstQuoteFound = true;
391         return( true );
392       } else {
393         return( false );
394       }
395     } else {
396       if ( x == '\\' ) {
397         prevCharWasEscape = true;
398         return( true );
399       } else if ( x == '"' ) {
400         if ( !prevCharWasEscape ) {
401           thatsAllFolks = true;
402           unterminatedString = false;
403           return( true );
404         }
405         prevCharWasEscape = false;
406         return( true );
407       } else {
408         prevCharWasEscape = false;
409         return( true );
410       }
411     }
412   }
413 
414     /** create a quoted string continuation token */
415   public void QuotedStringContinuation_Factory( ByteString s, int offset, ReturnInteger offsetReturn, ReturnInteger sizeReturn ) {
416       firstQuoteFound = true;
417       prevCharWasEscape = false;
418       thatsAllFolks = false;
419       currentType = QuotedStringContinuation;
420       offsetReturn.setValue( offset );
421       createToken( s, offset, sizeReturn );
422   }
423 
424     /** create a comment token */
425     public void CommentToken_Factory( ByteString s, int offset, ReturnInteger offsetReturn, ReturnInteger sizeReturn ) {
426         currentType = CommentToken;
427         offsetReturn.setValue( offset );
428         createToken( s, offset, sizeReturn );
429     }
430 
431     /** is the character appropriate for a comment token */
432   public boolean CommentToken_isAppropriate( char x ) {
433     if (( x == '\n' ) || ( x == '\r' )) {
434       return( false );
435     } else {
436       return( true );
437     }
438   }
439 
440     /** create a number token */
441     public void NumberToken_Factory( ByteString s, int offset, 
442           ReturnInteger offsetReturn, ReturnInteger sizeReturn, ReturnInteger typeReturn ) {
443       currentType = NumberToken;
444       offsetReturn.setValue( offset );
445       createToken( s, offset, sizeReturn );
446       setNumberTypeReturn( s, offset, sizeReturn.getValue(), typeReturn );
447     }
448     
449     
450     boolean isBadValue( int ndigits, ByteString s, int offset, int size, boolean eAllowed, boolean dotAllowed ) {
451         if ( ndigits == size ) {
452             return( false );
453         } else {
454             boolean result = true;
455             char nextchar = s.charAt( offset + ndigits );
456             if ( eAllowed ) {
457                 if (( nextchar == 'e' ) || ( nextchar == 'E' )) {
458                     return( false );
459                 }
460             }
461             if ( dotAllowed ) {
462                 if ( nextchar == '.' ) {
463                     return( false );
464                 }
465             }
466             return( true );
467         }
468     }
469     
470     public int getNumberValue( ByteString s, int offset, int size ) {
471         int result = 0;
472         int base = 10;
473         for ( int i = 0; i < size; i++ ) {
474             char c = s.charAt( offset + i );
475             if (( c >= '0' ) && ( c <= '9' )) {
476                 int n = c - '0';
477                 result = result*base + n;
478             } else {
479                 break;
480             }
481         }
482         return( result );
483     }
484     
485     public int getNumberValue2( ByteString s, int offset, int size ) {
486         if ( s.charAt( offset ) == '0' ) {
487             return( 999 );
488         } else {
489             return( getNumberValue( s, offset, size ));
490         }
491     }        
492 
493     public int getNumberDigits( ByteString s, int offset, int size ) {
494         int numberDigits = 0;
495         for ( int i = 0; i < size; i++ ) {
496             char c = s.charAt( offset + i );
497             if (( c >= '0' ) && ( c <= '9' )) {
498                 numberDigits++;
499             } else {
500                 break;
501             }
502         }
503         return( numberDigits );
504     }
505     
506    
507     public void setNumberTypeReturn( ByteString s, int offset, int size, ReturnInteger typeReturn ) {
508         // now simplified to set type to either NumberToken or BadNumber
509         
510         // eat initial +,-,.
511         char firstChar = s.charAt( offset );
512         boolean gotDot = false;
513         boolean gotDigits = false;
514         if (( firstChar == '+' ) || ( firstChar == '-' ) || ( firstChar == '.' )) {
515             offset++;
516             size--;
517             if ( firstChar == '.' ) {
518                 gotDot = true;
519             }
520         }
521         if (( firstChar == '+' ) || ( firstChar == '-' )) {
522             char nextChar = s.charAt( offset );
523             if ( nextChar == '.' ) {
524                 gotDot = true;
525                 offset++;
526                 size--;
527             }
528         }
529         
530         // get digits
531         while (( size > 0 ) && isDigit( s.charAt( offset ))) {
532             gotDigits = true;
533             offset++;
534             size--;
535         }
536         
537         if ( !gotDigits ) {
538             typeReturn.setValue( BadNumber );
539             return;
540         }
541         
542         // If there is more, it has to be either a '.' or an 'e' or an 'E'
543         boolean gotE = false;
544         if ( size == 0 ) {
545             typeReturn.setValue( NumberToken );
546             return;
547         }
548         char nextChar = s.charAt( offset );
549         if (( nextChar == '.' ) && gotDot ) {
550             typeReturn.setValue( BadNumber );
551             return;
552         }
553         if ( nextChar == '.' ) {
554             gotDot = true;
555         } else if (( nextChar == 'e' ) || ( nextChar == 'E' )) {
556             gotE = true;
557         } else {
558             typeReturn.setValue( BadNumber );
559             return;
560         }
561         offset++;
562         size--;
563 
564         // if we got an E, there has to be more
565         if ( gotE ) {
566             if ( size == 0 ) {
567                 typeReturn.setValue( BadNumber );
568                 return;
569             }
570         }
571         
572         // if we got an E, the next char may be a sign
573         if ( gotE ) {
574             nextChar = s.charAt( offset );
575             if (( nextChar == '+' ) || ( nextChar == '-' )) {
576                 offset++;
577                 size--;
578             }
579         }
580         
581         // check for more digits
582         while (( size > 0 ) && isDigit( s.charAt( offset ))) {
583             offset++;
584             size--;
585         }
586         
587         // If there is more, and we haven't got an 'e' or 'E' yet, that is
588         // the only valid thing
589         if ( size == 0 ) {
590             typeReturn.setValue( NumberToken );
591             return;
592         }
593         
594         if ( gotE ) {
595             typeReturn.setValue( BadNumber );
596             return;
597         }
598         
599         nextChar = s.charAt( offset );
600         if (( nextChar != 'e' ) && ( nextChar != 'E' )) {
601             typeReturn.setValue( BadNumber );
602             return;
603         }
604         offset++;
605         size--;
606         if ( size == 0 ) {
607             typeReturn.setValue( BadNumber );
608             return;
609         }
610         
611         // After the 'e', sign is allowed
612         nextChar = s.charAt( offset );
613         if (( nextChar == '+' ) || ( nextChar == '-' )) {
614             offset++;
615             size--;
616         }
617         
618         if ( size == 0 ) {
619             typeReturn.setValue( BadNumber );
620             return;
621         }
622         
623         while (( size > 0 ) && isDigit( s.charAt( offset ))) {
624             size--;
625             offset++;
626         }
627         
628         if ( size != 0 ) {
629             typeReturn.setValue( BadNumber );
630         } else {
631             typeReturn.setValue( NumberToken );
632         }
633     }
634 
635     /** is the character appropriate for a number token */
636   public boolean NumberToken_isAppropriate( char x ) {
637     return( isNumberChar( x ));
638   }
639 
640     /** create a name token
641      *
642      *  @param s  the text containing the token
643      *  @param offset  offset in ByteString of the start of the token
644      *  @param offsetReturn  output parameter, offset of start of token
645      *  @param sizeReturn    output parameter, size of the token
646      */
647     public void NameToken_Factory( ByteString s, int offset, ReturnInteger offsetReturn, ReturnInteger sizeReturn ) {
648         offsetReturn.setValue( offset );
649         currentType = NameToken;
650         createToken( s, offset, sizeReturn );
651     }
652 
653     /** is the character appropriate for a name token */
654   public boolean NameToken_isAppropriate( char x ) {
655     return( !isWhiteChar( x ) && !isSpecialChar( x ));
656   }
657 
658 }