Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: java_cup/lexer.java


1   package java_cup;
2   
3   import java_cup.runtime.Symbol;
4   import java.util.Hashtable;
5   
6   /** This class implements a small scanner (aka lexical analyzer or lexer) for
7    *  the JavaCup specification.  This scanner reads characters from standard 
8    *  input (System.in) and returns integers corresponding to the terminal 
9    *  number of the next Symbol. Once end of input is reached the EOF Symbol is 
10   *  returned on every subsequent call.<p>
11   *  Symbols currently returned include: <pre>
12   *    Symbol        Constant Returned     Symbol        Constant Returned
13   *    ------        -----------------     ------        -----------------
14   *    "package"     PACKAGE               "import"      IMPORT 
15   *    "code"        CODE                  "action"      ACTION 
16   *    "parser"      PARSER                "terminal"    TERMINAL
17   *    "non"         NON                   "init"        INIT 
18   *    "scan"        SCAN                  "with"        WITH
19   *    "start"       START                 "precedence"  PRECEDENCE
20   *    "left"        LEFT      "right"       RIGHT
21   *    "nonassoc"    NONASSOC      "%prec        PRECENT_PREC  
22   *      [           LBRACK                  ]           RBRACK
23   *      ;           SEMI 
24   *      ,           COMMA                   *           STAR 
25   *      .           DOT                     :           COLON
26   *      ::=         COLON_COLON_EQUALS      |           BAR
27   *    identifier    ID                    {:...:}       CODE_STRING
28   *    "nonterminal" NONTERMINAL
29   *  </pre>
30   *  All symbol constants are defined in sym.java which is generated by 
31   *  JavaCup from parser.cup.<p>
32   * 
33   *  In addition to the scanner proper (called first via init() then with
34   *  next_token() to get each Symbol) this class provides simple error and 
35   *  warning routines and keeps a count of errors and warnings that is 
36   *  publicly accessible.<p>
37   *  
38   *  This class is "static" (i.e., it has only static members and methods).
39   *
40   * @version last updated: 7/3/96
41   * @author  Frank Flannery
42   */
43  public class lexer {
44  
45    /*-----------------------------------------------------------*/
46    /*--- Constructor(s) ----------------------------------------*/
47    /*-----------------------------------------------------------*/
48  
49    /** The only constructor is private, so no instances can be created. */
50    private lexer() { }
51  
52    /*-----------------------------------------------------------*/
53    /*--- Static (Class) Variables ------------------------------*/
54    /*-----------------------------------------------------------*/
55  
56    /** First character of lookahead. */
57    protected static int next_char; 
58  
59    /** Second character of lookahead. */
60    protected static int next_char2;
61  
62    /** Second character of lookahead. */
63    protected static int next_char3;
64  
65    /** Second character of lookahead. */
66    protected static int next_char4;
67  
68    /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
69  
70    /** EOF constant. */
71    protected static final int EOF_CHAR = -1;
72  
73    /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
74  
75    /** Table of keywords.  Keywords are initially treated as identifiers.
76     *  Just before they are returned we look them up in this table to see if
77     *  they match one of the keywords.  The string of the name is the key here,
78     *  which indexes Integer objects holding the symbol number. 
79     */
80    protected static Hashtable keywords = new Hashtable(23);
81  
82    /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
83  
84    /** Table of single character symbols.  For ease of implementation, we 
85     *  store all unambiguous single character Symbols in this table of Integer
86     *  objects keyed by Integer objects with the numerical value of the 
87     *  appropriate char (currently Character objects have a bug which precludes
88     *  their use in tables).
89     */
90    protected static Hashtable char_symbols = new Hashtable(11);
91  
92    /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
93  
94    /** Current line number for use in error messages. */
95    protected static int current_line = 1;
96  
97    /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
98  
99    /** Character position in current line. */
100   protected static int current_position = 1;
101 
102   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
103 
104   /** Character position in current line. */
105   protected static int absolute_position = 1;
106 
107   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
108 
109   /** Count of total errors detected so far. */
110   public static int error_count = 0;
111 
112   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
113 
114   /** Count of warnings issued so far */
115   public static int warning_count = 0;
116 
117   /*-----------------------------------------------------------*/
118   /*--- Static Methods ----------------------------------------*/
119   /*-----------------------------------------------------------*/
120 
121   /** Initialize the scanner.  This sets up the keywords and char_symbols
122     * tables and reads the first two characters of lookahead.  
123     */
124   public static void init() throws java.io.IOException
125     {
126       /* set up the keyword table */
127       keywords.put("package",    new Integer(sym.PACKAGE));
128       keywords.put("import",     new Integer(sym.IMPORT));
129       keywords.put("code",       new Integer(sym.CODE));
130       keywords.put("action",     new Integer(sym.ACTION));
131       keywords.put("parser",     new Integer(sym.PARSER));
132       keywords.put("terminal",   new Integer(sym.TERMINAL));
133       keywords.put("non",        new Integer(sym.NON));
134       keywords.put("nonterminal",new Integer(sym.NONTERMINAL));// [CSA]
135       keywords.put("init",       new Integer(sym.INIT));
136       keywords.put("scan",       new Integer(sym.SCAN));
137       keywords.put("with",       new Integer(sym.WITH));
138       keywords.put("start",      new Integer(sym.START));
139       keywords.put("precedence", new Integer(sym.PRECEDENCE));
140       keywords.put("left",       new Integer(sym.LEFT));
141       keywords.put("right",      new Integer(sym.RIGHT));
142       keywords.put("nonassoc",   new Integer(sym.NONASSOC));
143 
144       /* set up the table of single character symbols */
145       char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
146       char_symbols.put(new Integer(','), new Integer(sym.COMMA));
147       char_symbols.put(new Integer('*'), new Integer(sym.STAR));
148       char_symbols.put(new Integer('.'), new Integer(sym.DOT));
149       char_symbols.put(new Integer('|'), new Integer(sym.BAR));
150       char_symbols.put(new Integer('['), new Integer(sym.LBRACK));
151       char_symbols.put(new Integer(']'), new Integer(sym.RBRACK));
152 
153       /* read two characters of lookahead */
154       next_char = System.in.read();
155       if (next_char == EOF_CHAR) {
156   next_char2 = EOF_CHAR;
157         next_char3 = EOF_CHAR;
158         next_char4 = EOF_CHAR;
159       } else {
160   next_char2 = System.in.read();
161   if (next_char2 == EOF_CHAR) {
162     next_char3 = EOF_CHAR;
163     next_char4 = EOF_CHAR;
164   } else {
165     next_char3 = System.in.read();
166     if (next_char3 == EOF_CHAR) {
167       next_char4 = EOF_CHAR;
168     } else {
169       next_char4 = System.in.read();
170     }
171   }
172       }
173     }
174 
175   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
176 
177   /** Advance the scanner one character in the input stream.  This moves
178    * next_char2 to next_char and then reads a new next_char2.  
179    */
180   protected static void advance() throws java.io.IOException
181     {
182       int old_char;
183 
184       old_char = next_char;
185       next_char = next_char2;
186       if (next_char == EOF_CHAR) {
187   next_char2 = EOF_CHAR;
188         next_char3 = EOF_CHAR;
189   next_char4 = EOF_CHAR;
190       } else {
191   next_char2 = next_char3;
192   if (next_char2 == EOF_CHAR) {
193     next_char3 = EOF_CHAR;
194     next_char4 = EOF_CHAR;
195   } else {
196     next_char3 = next_char4;
197     if (next_char3 == EOF_CHAR) {
198       next_char4 = EOF_CHAR;
199     } else {
200       next_char4 = System.in.read();
201     }
202   }
203       }
204 
205       /* count this */
206       absolute_position++;
207       current_position++;
208       if (old_char == '\n')
209   {
210     current_line++;
211     current_position = 1;
212   }
213     }
214 
215   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
216 
217   /** Emit an error message.  The message will be marked with both the 
218    *  current line number and the position in the line.  Error messages
219    *  are printed on standard error (System.err).
220    * @param message the message to print.
221    */
222   public static void emit_error(String message)
223     {
224       System.err.println("Error at " + current_line + "(" + current_position +
225        "): " + message);
226       error_count++;
227     }
228 
229   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
230 
231   /** Emit a warning message.  The message will be marked with both the 
232    *  current line number and the position in the line.  Messages are 
233    *  printed on standard error (System.err).
234    * @param message the message to print.
235    */
236   public static void emit_warn(String message)
237     {
238       System.err.println("Warning at " + current_line + "(" + current_position +
239        "): " + message);
240       warning_count++;
241     }
242 
243   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
244 
245   /** Determine if a character is ok to start an id. 
246    * @param ch the character in question.
247    */
248   protected static boolean id_start_char(int ch)
249     {
250       /* allow for % in identifiers.  a hack to allow my
251    %prec in.  Should eventually make lex spec for this 
252    frankf */
253       return (ch >= 'a' &&  ch <= 'z') || (ch >= 'A' && ch <= 'Z') || 
254        (ch == '_');
255 
256       // later need to deal with non-8-bit chars here
257     }
258 
259   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
260 
261   /** Determine if a character is ok for the middle of an id.
262    * @param ch the character in question. 
263    */
264   protected static boolean id_char(int ch)
265     {
266       return id_start_char(ch) || (ch >= '0' && ch <= '9');
267     }
268 
269   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
270 
271   /** Try to look up a single character symbol, returns -1 for not found. 
272    * @param ch the character in question.
273    */
274   protected static int find_single_char(int ch)
275     {
276       Integer result;
277 
278       result = (Integer)char_symbols.get(new Integer((char)ch));
279       if (result == null) 
280   return -1;
281       else
282   return result.intValue();
283     }
284 
285   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
286 
287   /** Handle swallowing up a comment.  Both old style C and new style C++
288    *  comments are handled.
289    */
290   protected static void swallow_comment() throws java.io.IOException
291     {
292       /* next_char == '/' at this point */
293 
294       /* is it a traditional comment */
295       if (next_char2 == '*')
296   {
297     /* swallow the opener */
298     advance(); advance();
299 
300     /* swallow the comment until end of comment or EOF */
301     for (;;)
302       {
303         /* if its EOF we have an error */
304         if (next_char == EOF_CHAR)
305     {
306       emit_error("Specification file ends inside a comment");
307       return;
308     }
309 
310         /* if we can see the closer we are done */
311         if (next_char == '*' && next_char2 == '/')
312     {
313       advance();
314       advance();
315       return;
316     }
317 
318         /* otherwise swallow char and move on */
319         advance();
320       }
321   }
322 
323       /* is its a new style comment */
324       if (next_char2 == '/')
325   {
326     /* swallow the opener */
327     advance(); advance();
328 
329     /* swallow to '\n', '\f', or EOF */ 
330     while (next_char != '\n' && next_char != '\f' && next_char!=EOF_CHAR)
331       advance();
332 
333     return;
334 
335   }
336 
337       /* shouldn't get here, but... if we get here we have an error */
338       emit_error("Malformed comment in specification -- ignored");
339       advance();
340     }
341 
342   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
343 
344   /** Swallow up a code string.  Code strings begin with "{:" and include
345       all characters up to the first occurrence of ":}" (there is no way to 
346       include ":}" inside a code string).  The routine returns a String
347       object suitable for return by the scanner.
348    */
349   protected static Symbol do_code_string() throws java.io.IOException
350     {
351       StringBuffer result = new StringBuffer();
352 
353       /* at this point we have lookahead of "{:" -- swallow that */
354       advance(); advance();
355 
356       /* save chars until we see ":}" */
357       while (!(next_char == ':' && next_char2 == '}'))
358   {
359     /* if we have run off the end issue a message and break out of loop */
360     if (next_char == EOF_CHAR)
361       {
362         emit_error("Specification file ends inside a code string");
363         break;
364       }
365 
366     /* otherwise record the char and move on */
367     result.append(new Character((char)next_char));
368     advance();
369   }
370 
371       /* advance past the closer and build a return Symbol */
372       advance(); advance();
373       return new Symbol(sym.CODE_STRING, result.toString());
374     }
375 
376   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
377 
378   /** Process an identifier.  Identifiers begin with a letter, underscore,
379    *  or dollar sign, which is followed by zero or more letters, numbers,
380    *  underscores or dollar signs.  This routine returns a String suitable
381    *  for return by the scanner.
382    */
383   protected static Symbol do_id() throws java.io.IOException
384     {
385       StringBuffer result = new StringBuffer();
386       String       result_str;
387       Integer      keyword_num;
388       char         buffer[] = new char[1];
389 
390       /* next_char holds first character of id */
391       buffer[0] = (char)next_char;
392       result.append(buffer,0,1);
393       advance();
394 
395       /* collect up characters while they fit in id */ 
396       while(id_char(next_char))
397   {
398           buffer[0] = (char)next_char;
399     result.append(buffer,0,1);
400     advance();
401   }
402 
403       /* extract a string and try to look it up as a keyword */
404       result_str = result.toString();
405       keyword_num = (Integer)keywords.get(result_str);
406 
407       /* if we found something, return that keyword */
408       if (keyword_num != null)
409   return new Symbol(keyword_num.intValue());
410 
411       /* otherwise build and return an id Symbol with an attached string */
412       return new Symbol(sym.ID, result_str);
413     }
414 
415   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
416 
417   /** Return one Symbol.  This is the main external interface to the scanner.
418    *  It consumes sufficient characters to determine the next input Symbol
419    *  and returns it.  To help with debugging, this routine actually calls
420    *  real_next_token() which does the work.  If you need to debug the 
421    *  parser, this can be changed to call debug_next_token() which prints
422    *  a debugging message before returning the Symbol.
423    */
424   public static Symbol next_token() throws java.io.IOException
425     {
426       return real_next_token();
427     }
428 
429   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
430 
431   /** Debugging version of next_token().  This routine calls the real scanning
432    *  routine, prints a message on System.out indicating what the Symbol is,
433    *  then returns it.
434    */
435   public static Symbol debug_next_token() throws java.io.IOException
436     {
437       Symbol result = real_next_token();
438       System.out.println("# next_Symbol() => " + result.sym);
439       return result;
440     }
441 
442   /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
443 
444   /** The actual routine to return one Symbol.  This is normally called from
445    *  next_token(), but for debugging purposes can be called indirectly from
446    *  debug_next_token(). 
447    */
448   protected static Symbol real_next_token() throws java.io.IOException
449     {
450       int sym_num;
451 
452       for (;;)
453   {
454     /* look for white space */
455     if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
456         next_char == '\f' ||  next_char == '\r')
457       {
458         /* advance past it and try the next character */
459         advance();
460         continue;
461       }
462 
463     /* look for a single character symbol */
464     sym_num = find_single_char(next_char);
465     if (sym_num != -1)
466       {
467         /* found one -- advance past it and return a Symbol for it */
468         advance();
469         return new Symbol(sym_num);
470       }
471 
472     /* look for : or ::= */
473     if (next_char == ':')
474       {
475         /* if we don't have a second ':' return COLON */
476         if (next_char2 != ':') 
477     {
478       advance();
479       return new Symbol(sym.COLON);
480     }
481 
482         /* move forward and look for the '=' */
483         advance();
484         if (next_char2 == '=') 
485     {
486       advance(); advance();
487       return new Symbol(sym.COLON_COLON_EQUALS);
488     }
489         else
490     {
491       /* return just the colon (already consumed) */
492       return new Symbol(sym.COLON);
493     }
494       }
495 
496     /* find a "%prec" string and return it.  otherwise, a '%' was found,
497        which has no right being in the specification otherwise */
498     if (next_char == '%') {
499       advance();
500       if ((next_char == 'p') && (next_char2 == 'r') && (next_char3 == 'e') && 
501     (next_char4 == 'c')) {
502         advance();
503         advance();
504         advance();
505         advance();
506         return new Symbol(sym.PERCENT_PREC);
507       } else {
508         emit_error("Found extraneous percent sign");
509       }
510     }
511 
512     /* look for a comment */
513     if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
514       {
515         /* swallow then continue the scan */
516         swallow_comment();
517         continue;
518       }
519 
520     /* look for start of code string */
521     if (next_char == '{' && next_char2 == ':')
522       return do_code_string();
523 
524     /* look for an id or keyword */
525     if (id_start_char(next_char)) return do_id();
526 
527     /* look for EOF */
528     if (next_char == EOF_CHAR) return new Symbol(sym.EOF);
529 
530     /* if we get here, we have an unrecognized character */
531     emit_warn("Unrecognized character '" + 
532       new Character((char)next_char) + "'(" + next_char + 
533       ") -- ignored");
534 
535     /* advance past it */
536     advance();
537   }
538     }
539 
540   /*-----------------------------------------------------------*/
541 }
542