Source code: java_cup/lexer.java
1 package java_cup;
2
3 import java_cup.runtime.Symbol;
4 import java.util.Hashtable;
5
6 /** This class implements a small scanner (aka lexical analyzer or lexer) for
7 * the JavaCup specification. This scanner reads characters from standard
8 * input (System.in) and returns integers corresponding to the terminal
9 * number of the next Symbol. Once end of input is reached the EOF Symbol is
10 * returned on every subsequent call.<p>
11 * Symbols currently returned include: <pre>
12 * Symbol Constant Returned Symbol Constant Returned
13 * ------ ----------------- ------ -----------------
14 * "package" PACKAGE "import" IMPORT
15 * "code" CODE "action" ACTION
16 * "parser" PARSER "terminal" TERMINAL
17 * "non" NON "init" INIT
18 * "scan" SCAN "with" WITH
19 * "start" START "precedence" PRECEDENCE
20 * "left" LEFT "right" RIGHT
21 * "nonassoc" NONASSOC "%prec PRECENT_PREC
22 * [ LBRACK ] RBRACK
23 * ; SEMI
24 * , COMMA * STAR
25 * . DOT : COLON
26 * ::= COLON_COLON_EQUALS | BAR
27 * identifier ID {:...:} CODE_STRING
28 * "nonterminal" NONTERMINAL
29 * </pre>
30 * All symbol constants are defined in sym.java which is generated by
31 * JavaCup from parser.cup.<p>
32 *
33 * In addition to the scanner proper (called first via init() then with
34 * next_token() to get each Symbol) this class provides simple error and
35 * warning routines and keeps a count of errors and warnings that is
36 * publicly accessible.<p>
37 *
38 * This class is "static" (i.e., it has only static members and methods).
39 *
40 * @version last updated: 7/3/96
41 * @author Frank Flannery
42 */
43 public class lexer {
44
45 /*-----------------------------------------------------------*/
46 /*--- Constructor(s) ----------------------------------------*/
47 /*-----------------------------------------------------------*/
48
49 /** The only constructor is private, so no instances can be created. */
50 private lexer() { }
51
52 /*-----------------------------------------------------------*/
53 /*--- Static (Class) Variables ------------------------------*/
54 /*-----------------------------------------------------------*/
55
56 /** First character of lookahead. */
57 protected static int next_char;
58
59 /** Second character of lookahead. */
60 protected static int next_char2;
61
62 /** Second character of lookahead. */
63 protected static int next_char3;
64
65 /** Second character of lookahead. */
66 protected static int next_char4;
67
68 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
69
70 /** EOF constant. */
71 protected static final int EOF_CHAR = -1;
72
73 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
74
75 /** Table of keywords. Keywords are initially treated as identifiers.
76 * Just before they are returned we look them up in this table to see if
77 * they match one of the keywords. The string of the name is the key here,
78 * which indexes Integer objects holding the symbol number.
79 */
80 protected static Hashtable keywords = new Hashtable(23);
81
82 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
83
84 /** Table of single character symbols. For ease of implementation, we
85 * store all unambiguous single character Symbols in this table of Integer
86 * objects keyed by Integer objects with the numerical value of the
87 * appropriate char (currently Character objects have a bug which precludes
88 * their use in tables).
89 */
90 protected static Hashtable char_symbols = new Hashtable(11);
91
92 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
93
94 /** Current line number for use in error messages. */
95 protected static int current_line = 1;
96
97 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
98
99 /** Character position in current line. */
100 protected static int current_position = 1;
101
102 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
103
104 /** Character position in current line. */
105 protected static int absolute_position = 1;
106
107 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
108
109 /** Count of total errors detected so far. */
110 public static int error_count = 0;
111
112 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
113
114 /** Count of warnings issued so far */
115 public static int warning_count = 0;
116
117 /*-----------------------------------------------------------*/
118 /*--- Static Methods ----------------------------------------*/
119 /*-----------------------------------------------------------*/
120
121 /** Initialize the scanner. This sets up the keywords and char_symbols
122 * tables and reads the first two characters of lookahead.
123 */
124 public static void init() throws java.io.IOException
125 {
126 /* set up the keyword table */
127 keywords.put("package", new Integer(sym.PACKAGE));
128 keywords.put("import", new Integer(sym.IMPORT));
129 keywords.put("code", new Integer(sym.CODE));
130 keywords.put("action", new Integer(sym.ACTION));
131 keywords.put("parser", new Integer(sym.PARSER));
132 keywords.put("terminal", new Integer(sym.TERMINAL));
133 keywords.put("non", new Integer(sym.NON));
134 keywords.put("nonterminal",new Integer(sym.NONTERMINAL));// [CSA]
135 keywords.put("init", new Integer(sym.INIT));
136 keywords.put("scan", new Integer(sym.SCAN));
137 keywords.put("with", new Integer(sym.WITH));
138 keywords.put("start", new Integer(sym.START));
139 keywords.put("precedence", new Integer(sym.PRECEDENCE));
140 keywords.put("left", new Integer(sym.LEFT));
141 keywords.put("right", new Integer(sym.RIGHT));
142 keywords.put("nonassoc", new Integer(sym.NONASSOC));
143
144 /* set up the table of single character symbols */
145 char_symbols.put(new Integer(';'), new Integer(sym.SEMI));
146 char_symbols.put(new Integer(','), new Integer(sym.COMMA));
147 char_symbols.put(new Integer('*'), new Integer(sym.STAR));
148 char_symbols.put(new Integer('.'), new Integer(sym.DOT));
149 char_symbols.put(new Integer('|'), new Integer(sym.BAR));
150 char_symbols.put(new Integer('['), new Integer(sym.LBRACK));
151 char_symbols.put(new Integer(']'), new Integer(sym.RBRACK));
152
153 /* read two characters of lookahead */
154 next_char = System.in.read();
155 if (next_char == EOF_CHAR) {
156 next_char2 = EOF_CHAR;
157 next_char3 = EOF_CHAR;
158 next_char4 = EOF_CHAR;
159 } else {
160 next_char2 = System.in.read();
161 if (next_char2 == EOF_CHAR) {
162 next_char3 = EOF_CHAR;
163 next_char4 = EOF_CHAR;
164 } else {
165 next_char3 = System.in.read();
166 if (next_char3 == EOF_CHAR) {
167 next_char4 = EOF_CHAR;
168 } else {
169 next_char4 = System.in.read();
170 }
171 }
172 }
173 }
174
175 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
176
177 /** Advance the scanner one character in the input stream. This moves
178 * next_char2 to next_char and then reads a new next_char2.
179 */
180 protected static void advance() throws java.io.IOException
181 {
182 int old_char;
183
184 old_char = next_char;
185 next_char = next_char2;
186 if (next_char == EOF_CHAR) {
187 next_char2 = EOF_CHAR;
188 next_char3 = EOF_CHAR;
189 next_char4 = EOF_CHAR;
190 } else {
191 next_char2 = next_char3;
192 if (next_char2 == EOF_CHAR) {
193 next_char3 = EOF_CHAR;
194 next_char4 = EOF_CHAR;
195 } else {
196 next_char3 = next_char4;
197 if (next_char3 == EOF_CHAR) {
198 next_char4 = EOF_CHAR;
199 } else {
200 next_char4 = System.in.read();
201 }
202 }
203 }
204
205 /* count this */
206 absolute_position++;
207 current_position++;
208 if (old_char == '\n')
209 {
210 current_line++;
211 current_position = 1;
212 }
213 }
214
215 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
216
217 /** Emit an error message. The message will be marked with both the
218 * current line number and the position in the line. Error messages
219 * are printed on standard error (System.err).
220 * @param message the message to print.
221 */
222 public static void emit_error(String message)
223 {
224 System.err.println("Error at " + current_line + "(" + current_position +
225 "): " + message);
226 error_count++;
227 }
228
229 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
230
231 /** Emit a warning message. The message will be marked with both the
232 * current line number and the position in the line. Messages are
233 * printed on standard error (System.err).
234 * @param message the message to print.
235 */
236 public static void emit_warn(String message)
237 {
238 System.err.println("Warning at " + current_line + "(" + current_position +
239 "): " + message);
240 warning_count++;
241 }
242
243 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
244
245 /** Determine if a character is ok to start an id.
246 * @param ch the character in question.
247 */
248 protected static boolean id_start_char(int ch)
249 {
250 /* allow for % in identifiers. a hack to allow my
251 %prec in. Should eventually make lex spec for this
252 frankf */
253 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
254 (ch == '_');
255
256 // later need to deal with non-8-bit chars here
257 }
258
259 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
260
261 /** Determine if a character is ok for the middle of an id.
262 * @param ch the character in question.
263 */
264 protected static boolean id_char(int ch)
265 {
266 return id_start_char(ch) || (ch >= '0' && ch <= '9');
267 }
268
269 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
270
271 /** Try to look up a single character symbol, returns -1 for not found.
272 * @param ch the character in question.
273 */
274 protected static int find_single_char(int ch)
275 {
276 Integer result;
277
278 result = (Integer)char_symbols.get(new Integer((char)ch));
279 if (result == null)
280 return -1;
281 else
282 return result.intValue();
283 }
284
285 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
286
287 /** Handle swallowing up a comment. Both old style C and new style C++
288 * comments are handled.
289 */
290 protected static void swallow_comment() throws java.io.IOException
291 {
292 /* next_char == '/' at this point */
293
294 /* is it a traditional comment */
295 if (next_char2 == '*')
296 {
297 /* swallow the opener */
298 advance(); advance();
299
300 /* swallow the comment until end of comment or EOF */
301 for (;;)
302 {
303 /* if its EOF we have an error */
304 if (next_char == EOF_CHAR)
305 {
306 emit_error("Specification file ends inside a comment");
307 return;
308 }
309
310 /* if we can see the closer we are done */
311 if (next_char == '*' && next_char2 == '/')
312 {
313 advance();
314 advance();
315 return;
316 }
317
318 /* otherwise swallow char and move on */
319 advance();
320 }
321 }
322
323 /* is its a new style comment */
324 if (next_char2 == '/')
325 {
326 /* swallow the opener */
327 advance(); advance();
328
329 /* swallow to '\n', '\f', or EOF */
330 while (next_char != '\n' && next_char != '\f' && next_char!=EOF_CHAR)
331 advance();
332
333 return;
334
335 }
336
337 /* shouldn't get here, but... if we get here we have an error */
338 emit_error("Malformed comment in specification -- ignored");
339 advance();
340 }
341
342 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
343
344 /** Swallow up a code string. Code strings begin with "{:" and include
345 all characters up to the first occurrence of ":}" (there is no way to
346 include ":}" inside a code string). The routine returns a String
347 object suitable for return by the scanner.
348 */
349 protected static Symbol do_code_string() throws java.io.IOException
350 {
351 StringBuffer result = new StringBuffer();
352
353 /* at this point we have lookahead of "{:" -- swallow that */
354 advance(); advance();
355
356 /* save chars until we see ":}" */
357 while (!(next_char == ':' && next_char2 == '}'))
358 {
359 /* if we have run off the end issue a message and break out of loop */
360 if (next_char == EOF_CHAR)
361 {
362 emit_error("Specification file ends inside a code string");
363 break;
364 }
365
366 /* otherwise record the char and move on */
367 result.append(new Character((char)next_char));
368 advance();
369 }
370
371 /* advance past the closer and build a return Symbol */
372 advance(); advance();
373 return new Symbol(sym.CODE_STRING, result.toString());
374 }
375
376 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
377
378 /** Process an identifier. Identifiers begin with a letter, underscore,
379 * or dollar sign, which is followed by zero or more letters, numbers,
380 * underscores or dollar signs. This routine returns a String suitable
381 * for return by the scanner.
382 */
383 protected static Symbol do_id() throws java.io.IOException
384 {
385 StringBuffer result = new StringBuffer();
386 String result_str;
387 Integer keyword_num;
388 char buffer[] = new char[1];
389
390 /* next_char holds first character of id */
391 buffer[0] = (char)next_char;
392 result.append(buffer,0,1);
393 advance();
394
395 /* collect up characters while they fit in id */
396 while(id_char(next_char))
397 {
398 buffer[0] = (char)next_char;
399 result.append(buffer,0,1);
400 advance();
401 }
402
403 /* extract a string and try to look it up as a keyword */
404 result_str = result.toString();
405 keyword_num = (Integer)keywords.get(result_str);
406
407 /* if we found something, return that keyword */
408 if (keyword_num != null)
409 return new Symbol(keyword_num.intValue());
410
411 /* otherwise build and return an id Symbol with an attached string */
412 return new Symbol(sym.ID, result_str);
413 }
414
415 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
416
417 /** Return one Symbol. This is the main external interface to the scanner.
418 * It consumes sufficient characters to determine the next input Symbol
419 * and returns it. To help with debugging, this routine actually calls
420 * real_next_token() which does the work. If you need to debug the
421 * parser, this can be changed to call debug_next_token() which prints
422 * a debugging message before returning the Symbol.
423 */
424 public static Symbol next_token() throws java.io.IOException
425 {
426 return real_next_token();
427 }
428
429 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
430
431 /** Debugging version of next_token(). This routine calls the real scanning
432 * routine, prints a message on System.out indicating what the Symbol is,
433 * then returns it.
434 */
435 public static Symbol debug_next_token() throws java.io.IOException
436 {
437 Symbol result = real_next_token();
438 System.out.println("# next_Symbol() => " + result.sym);
439 return result;
440 }
441
442 /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/
443
444 /** The actual routine to return one Symbol. This is normally called from
445 * next_token(), but for debugging purposes can be called indirectly from
446 * debug_next_token().
447 */
448 protected static Symbol real_next_token() throws java.io.IOException
449 {
450 int sym_num;
451
452 for (;;)
453 {
454 /* look for white space */
455 if (next_char == ' ' || next_char == '\t' || next_char == '\n' ||
456 next_char == '\f' || next_char == '\r')
457 {
458 /* advance past it and try the next character */
459 advance();
460 continue;
461 }
462
463 /* look for a single character symbol */
464 sym_num = find_single_char(next_char);
465 if (sym_num != -1)
466 {
467 /* found one -- advance past it and return a Symbol for it */
468 advance();
469 return new Symbol(sym_num);
470 }
471
472 /* look for : or ::= */
473 if (next_char == ':')
474 {
475 /* if we don't have a second ':' return COLON */
476 if (next_char2 != ':')
477 {
478 advance();
479 return new Symbol(sym.COLON);
480 }
481
482 /* move forward and look for the '=' */
483 advance();
484 if (next_char2 == '=')
485 {
486 advance(); advance();
487 return new Symbol(sym.COLON_COLON_EQUALS);
488 }
489 else
490 {
491 /* return just the colon (already consumed) */
492 return new Symbol(sym.COLON);
493 }
494 }
495
496 /* find a "%prec" string and return it. otherwise, a '%' was found,
497 which has no right being in the specification otherwise */
498 if (next_char == '%') {
499 advance();
500 if ((next_char == 'p') && (next_char2 == 'r') && (next_char3 == 'e') &&
501 (next_char4 == 'c')) {
502 advance();
503 advance();
504 advance();
505 advance();
506 return new Symbol(sym.PERCENT_PREC);
507 } else {
508 emit_error("Found extraneous percent sign");
509 }
510 }
511
512 /* look for a comment */
513 if (next_char == '/' && (next_char2 == '*' || next_char2 == '/'))
514 {
515 /* swallow then continue the scan */
516 swallow_comment();
517 continue;
518 }
519
520 /* look for start of code string */
521 if (next_char == '{' && next_char2 == ':')
522 return do_code_string();
523
524 /* look for an id or keyword */
525 if (id_start_char(next_char)) return do_id();
526
527 /* look for EOF */
528 if (next_char == EOF_CHAR) return new Symbol(sym.EOF);
529
530 /* if we get here, we have an unrecognized character */
531 emit_warn("Unrecognized character '" +
532 new Character((char)next_char) + "'(" + next_char +
533 ") -- ignored");
534
535 /* advance past it */
536 advance();
537 }
538 }
539
540 /*-----------------------------------------------------------*/
541 }
542