Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/apache/oro/text/awk/AwkCompiler.java


1   /*
2    * $Id: AwkCompiler.java,v 1.10 2003/11/07 20:16:24 dfs Exp $
3    *
4    * ====================================================================
5    * The Apache Software License, Version 1.1
6    *
7    * Copyright (c) 2000 The Apache Software Foundation.  All rights
8    * reserved.
9    *
10   * Redistribution and use in source and binary forms, with or without
11   * modification, are permitted provided that the following conditions
12   * are met:
13   *
14   * 1. Redistributions of source code must retain the above copyright
15   *    notice, this list of conditions and the following disclaimer.
16   *
17   * 2. Redistributions in binary form must reproduce the above copyright
18   *    notice, this list of conditions and the following disclaimer in
19   *    the documentation and/or other materials provided with the
20   *    distribution.
21   *
22   * 3. The end-user documentation included with the redistribution,
23   *    if any, must include the following acknowledgment:
24   *       "This product includes software developed by the
25   *        Apache Software Foundation (http://www.apache.org/)."
26   *    Alternately, this acknowledgment may appear in the software itself,
27   *    if and wherever such third-party acknowledgments normally appear.
28   *
29   * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
30   *    must not be used to endorse or promote products derived from this
31   *    software without prior written permission. For written
32   *    permission, please contact apache@apache.org.
33   *
34   * 5. Products derived from this software may not be called "Apache" 
35   *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
36   *    name, without prior written permission of the Apache Software Foundation.
37   *
38   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49   * SUCH DAMAGE.
50   * ====================================================================
51   *
52   * This software consists of voluntary contributions made by many
53   * individuals on behalf of the Apache Software Foundation.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  
59  package org.apache.oro.text.awk;
60  
61  import org.apache.oro.text.regex.*;
62  
63  /**
64   * The AwkCompiler class is used to create compiled regular expressions
65   * conforming to the Awk regular expression syntax.  It generates
66   * AwkPattern instances upon compilation to be used in conjunction
67   * with an AwkMatcher instance.  AwkMatcher finds true leftmost-longest
68   * matches, so you must take care with how you formulate your regular
69   * expression to avoid matching more than you really want.
70   * <p>
71   * The supported regular expression syntax is a superset of traditional AWK,
72   * but NOT to be confused with GNU AWK or other AWK variants.  Additionally,
73   * this AWK implementation is DFA-based and only supports 8-bit ASCII.
74   * Consequently, these classes can perform very fast pattern matches in
75   * most cases.
76   * <p>
77   * This is the traditional Awk syntax that is supported:
78   * <ul>
79   * <li> Alternatives separated by |
80   * <li> Quantified atoms
81   * <dl compact>
82   *      <dt> *     <dd> Match 0 or more times.
83   *      <dt> +     <dd> Match 1 or more times.
84   *      <dt> ?     <dd> Match 0 or 1 times.
85   * </dl>
86   * <li> Atoms
87   * <ul>
88   *     <li> regular expression within parentheses
89   *     <li> a . matches everything including newline
90   *     <li> a ^ is a null token matching the beginning of a string
91   *          but has no relation to newlines (and is only valid at the
92   *          beginning of a regex; this differs from traditional awk
93   *          for the sake of efficiency in Java).
94   *     <li> a $ is a null token matching the end of a string but has
95   *          no relation to newlines (and is only valid at the
96   *          end of a regex; this differs from traditional awk for the
97   *          sake of efficiency in Java).
98   *     <li> Character classes (e.g., [abcd]) and ranges (e.g. [a-z])
99   *     <ul>
100  *         <li> Special backslashed characters work within a character class
101  *     </ul>
102  *     <li> Special backslashed characters
103  *     <dl compact>
104  *         <dt> \b <dd> backspace
105  *         <dt> \n <dd> newline
106  *         <dt> \r <dd> carriage return
107  *         <dt> \t <dd> tab
108  *         <dt> \f <dd> formfeed
109  *         <dt> \xnn <dd> hexadecimal representation of character
110  *         <dt> \nn or \nnn <dd> octal representation of character
111  *         <dt> Any other backslashed character matches itself
112  *     </dl>
113  * </ul></ul>
114  * <p>
115  * This is the extended syntax that is supported:
116  * <ul>
117  * <li> Quantified atoms
118  * <dl compact>
119  *      <dt> {n,m} <dd> Match at least n but not more than m times.
120  *  <dt> {n,}  <dd> Match at least n times.
121  *      <dt> {n}   <dd> Match exactly n times.  
122  * </dl>
123  * <li> Atoms
124  * <ul>
125  *     <li> Special backslashed characters
126  *     <dl compact>
127  *         <dt> \d <dd> digit [0-9]
128  *         <dt> \D <dd> non-digit [^0-9]
129  *         <dt> \w <dd> word character [0-9a-z_A-Z]
130  *         <dt> \W <dd> a non-word character [^0-9a-z_A-Z]
131  *         <dt> \s <dd> a whitespace character [ \t\n\r\f]
132  *         <dt> \S <dd> a non-whitespace character [^ \t\n\r\f]
133  *         <dt> \cD <dd> matches the corresponding control character
134  *         <dt> \0 <dd> matches null character
135  *     </dl>
136  * </ul></ul>
137  *
138  * @version @version@
139  * @since 1.0
140  * @see org.apache.oro.text.regex.PatternCompiler
141  * @see org.apache.oro.text.regex.MalformedPatternException
142  * @see AwkPattern
143  * @see AwkMatcher
144  */                        
145 public final class AwkCompiler implements PatternCompiler {
146 
147   /**
148    * The default mask for the {@link #compile compile} methods.
149    * It is equal to 0 and indicates no special options are active.
150    */
151   public static final int DEFAULT_MASK          = 0;
152 
153   /**
154    * A mask passed as an option to the {@link #compile compile} methods
155    * to indicate a compiled regular expression should be case insensitive.
156    */
157   public static final int CASE_INSENSITIVE_MASK = 0x0001;
158 
159 
160   /**
161    * A mask passed as an option to the  {@link #compile compile} methods
162    * to indicate a compiled regular expression should treat input as having
163    * multiple lines.  This option affects the interpretation of
164    * the <b> . </b> metacharacters.  When this mask is used,
165    * the <b> . </b> metacharacter will not match newlines.  The default
166    * behavior is for <b> . </b> to match newlines.
167    */
168   public static final int MULTILINE_MASK = 0x0002;
169 
170   static final char _END_OF_INPUT = '\uFFFF';
171   
172   // All of these are initialized by the compile() and _parse() methods
173   // so there is no need or use in initializing them in the constructor
174   // although this may change in the future.
175   private boolean __inCharacterClass, __caseSensitive, __multiline;
176   private boolean __beginAnchor, __endAnchor;
177   private char __lookahead;
178   private int __position, __bytesRead, __expressionLength;
179   private char[] __regularExpression;
180   private int __openParen, __closeParen;
181 
182   // We do not currently need to initialize any state, but keep this
183   // commented out as a reminder that we may have to at some point.
184   //public AwkCompiler() { }
185 
186   private static boolean __isMetachar(char token) {
187     return (token == '*' || token == '?' || token == '+' ||
188       token == '[' || token == ']' || token == '(' ||
189       token == ')' || token == '|' || /* token == '^' ||
190       token == '$' || */ token == '.');
191   }
192 
193   static boolean _isWordCharacter(char token) {
194     return ((token >= 'a' && token <= 'z') || 
195       (token >= 'A' && token <= 'Z') || 
196       (token >= '0' && token <= '9') || 
197       (token == '_'));
198   }
199 
200   static boolean _isLowerCase(char token){
201     return (token >= 'a' && token <= 'z');
202   }
203 
204   static boolean _isUpperCase(char token){
205     return (token >= 'A' && token <= 'Z');
206   }
207 
208   static char _toggleCase(char token){
209     if(_isUpperCase(token))
210       return (char)(token + 32);
211     else if(_isLowerCase(token))
212       return (char)(token - 32);
213 
214     return token;
215   }
216 
217 
218   private void __match(char token) throws MalformedPatternException {
219     if(token == __lookahead){
220       if(__bytesRead < __expressionLength)
221   __lookahead = __regularExpression[__bytesRead++];
222       else
223   __lookahead = _END_OF_INPUT;
224     }
225     else
226       throw new MalformedPatternException("token: " + token + 
227             " does not match lookahead: " +
228             __lookahead + " at position: " +
229                __bytesRead);
230   }
231 
232   private void __putback() {
233     if(__lookahead != _END_OF_INPUT)
234       --__bytesRead;
235     __lookahead = __regularExpression[__bytesRead - 1];
236   }
237 
238   private SyntaxNode __regex() throws MalformedPatternException {
239     SyntaxNode left;
240 
241     left = __branch();
242 
243     if(__lookahead == '|') {
244       __match('|');
245       return (new OrNode(left, __regex()));
246     } 
247 
248     return left;
249   }
250 
251 
252   private SyntaxNode __branch() throws MalformedPatternException {
253     CatNode current;
254     SyntaxNode left, root;
255 
256     left = __piece();
257 
258     if(__lookahead == ')'){
259       if(__openParen > __closeParen)
260   return left;
261       else
262     throw
263       new MalformedPatternException("Parse error: close parenthesis"
264        + " without matching open parenthesis at position " + __bytesRead);
265     } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT)
266       return left;
267 
268     root = current = new CatNode();
269     current._left = left;
270 
271     while(true) {
272       left = __piece();
273 
274       if(__lookahead == ')'){
275   if(__openParen > __closeParen){
276     current._right = left;
277     break;
278   }
279   else
280     throw
281       new MalformedPatternException("Parse error: close parenthesis"
282        + " without matching open parenthesis at position " + __bytesRead);
283       } else  if(__lookahead == '|' || __lookahead == _END_OF_INPUT){
284   current._right = left;
285   break;
286       }
287 
288       current._right = new CatNode();
289       current = (CatNode)current._right;
290       current._left   = left;
291     }
292 
293     return root;
294   }
295 
296 
297   private SyntaxNode __piece() throws MalformedPatternException {
298     SyntaxNode left;
299 
300     left = __atom();
301 
302     switch(__lookahead){
303     case '+' : __match('+'); return (new PlusNode(left));
304     case '?' : __match('?'); return (new QuestionNode(left));
305     case '*' : __match('*'); return (new StarNode(left));
306     case '{' : return __repetition(left);
307     }
308 
309     return left;
310   }
311 
312   // if numChars is 0, this means match as many as you want
313   private int __parseUnsignedInteger(int radix, int minDigits, int maxDigits)
314     throws MalformedPatternException {
315     int num, digits = 0;
316     StringBuffer buf;
317 
318     // We don't expect huge numbers, so an initial buffer of 4 is fine.
319     buf = new StringBuffer(4);
320 
321     while(Character.digit(__lookahead, radix) != -1 && digits < maxDigits){
322       buf.append((char)__lookahead);
323       __match(__lookahead);
324       ++digits;
325     }
326 
327     if(digits < minDigits || digits > maxDigits)
328       throw
329   new MalformedPatternException(
330         "Parse error: unexpected number of digits at position " + __bytesRead);
331 
332     try {
333       num = Integer.parseInt(buf.toString(), radix);
334     } catch(NumberFormatException e) {
335       throw
336   new MalformedPatternException("Parse error: numeric value at " +
337         "position " + __bytesRead + " is invalid");
338     }
339 
340     return num;
341   }
342 
343   private SyntaxNode __repetition(SyntaxNode atom)
344     throws MalformedPatternException {
345     int min, max, startPosition[];
346     SyntaxNode root = null;
347     CatNode catNode;
348 
349     __match('{');
350 
351     min = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
352     startPosition = new int[1];
353     startPosition[0] = __position;
354 
355     if(__lookahead == '}'){
356       // Match exactly min times.  Concatenate the atom min times.
357       __match('}');
358 
359       if(min == 0)
360   throw
361     new MalformedPatternException(
362               "Parse error: Superfluous interval specified at position " +
363               __bytesRead + ".  Number of occurences was set to zero.");
364 
365       if(min == 1)
366   return atom;
367 
368       root = catNode = new CatNode();
369       catNode._left = atom;
370 
371       while(--min > 1) {
372   atom = atom._clone(startPosition);
373 
374   catNode._right = new CatNode();
375   catNode       = (CatNode)catNode._right;
376   catNode._left  = atom;
377       }
378 
379       catNode._right = atom._clone(startPosition);
380     } else if(__lookahead == ','){
381       __match(',');
382 
383       if(__lookahead == '}') {
384   // match at least min times
385   __match('}');
386 
387   if(min == 0)
388     return new StarNode(atom);
389 
390   if(min == 1)
391     return new PlusNode(atom);
392 
393   root = catNode = new CatNode();
394   catNode._left = atom;
395 
396   while(--min > 0) {
397     atom = atom._clone(startPosition);
398 
399     catNode._right = new CatNode();
400     catNode       = (CatNode)catNode._right;
401     catNode._left  = atom;
402   }
403 
404   catNode._right = new StarNode(atom._clone(startPosition));
405       } else {
406   // match at least min times and at most max times
407   max = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
408   __match('}');
409 
410   if(max < min)
411     throw
412       new MalformedPatternException("Parse error: invalid interval; "
413        +  max + " is less than " + min + " at position " + __bytesRead);
414   if(max == 0)
415     throw
416       new MalformedPatternException(
417       "Parse error: Superfluous interval specified at position " +
418       __bytesRead + ".  Number of occurences was set to zero.");
419 
420   if(min == 0) {
421     if(max == 1)
422       return new QuestionNode(atom);
423 
424     root = catNode = new CatNode();
425     atom = new QuestionNode(atom);
426     catNode._left = atom;
427 
428     while(--max > 1) {
429       atom =  atom._clone(startPosition);
430 
431       catNode._right = new CatNode();
432       catNode       = (CatNode)catNode._right;
433       catNode._left  = atom;
434     }
435 
436     catNode._right = atom._clone(startPosition);
437   } else if(min == max) {
438     if(min == 1)
439       return atom;
440 
441     root = catNode = new CatNode();
442     catNode._left = atom;
443 
444     while(--min > 1) {
445       atom = atom._clone(startPosition);
446 
447       catNode._right = new CatNode();
448       catNode       = (CatNode)catNode._right;
449       catNode._left  = atom;
450     }
451 
452     catNode._right = atom._clone(startPosition);
453   } else {
454     int count;
455 
456     root = catNode = new CatNode();
457     catNode._left = atom;
458 
459     for(count=1; count < min; count++) {
460       atom = atom._clone(startPosition);
461 
462       catNode._right = new CatNode();
463       catNode       = (CatNode)catNode._right;
464       catNode._left  = atom;
465     }
466 
467     atom = new QuestionNode(atom._clone(startPosition));
468 
469     count = max-min;
470 
471     if(count == 1)
472       catNode._right = atom;
473     else {
474       catNode._right = new CatNode();
475       catNode = (CatNode)catNode._right;
476       catNode._left = atom;
477 
478       while(--count > 1) {
479         atom = atom._clone(startPosition);
480 
481         catNode._right = new CatNode();
482         catNode       = (CatNode)catNode._right;
483         catNode._left  = atom;
484       }
485 
486       catNode._right = atom._clone(startPosition);
487     }
488   }
489       }
490     } else
491       throw
492   new MalformedPatternException("Parse error: unexpected character " +
493     __lookahead + " in interval at position "  + __bytesRead);
494     __position = startPosition[0];
495     return root;
496   }
497 
498 
499   private SyntaxNode __backslashToken() throws MalformedPatternException {
500     SyntaxNode current;
501     char token;
502     int number;
503 
504     __match('\\');
505 
506     if(__lookahead == 'x'){
507       __match('x');
508       // Parse a hexadecimal number
509       current = _newTokenNode((char)__parseUnsignedInteger(16, 2, 2),
510            __position++);
511     } else if(__lookahead == 'c') {
512       __match('c');
513       // Create a control character
514       token = Character.toUpperCase(__lookahead);
515       token = (char)(token > 63 ? token - 64 : token + 64);
516       current = new TokenNode(token, __position++);
517       __match(__lookahead);
518     } else if(__lookahead >= '0' && __lookahead <= '9') {
519       __match(__lookahead);
520 
521       if(__lookahead >= '0' && __lookahead <= '9'){
522   // We have an octal character or a multi-digit backreference.
523   // Assume octal character for now.
524   __putback();
525   number = __parseUnsignedInteger(10, 2, 3);
526   number = Integer.parseInt(Integer.toString(number), 8);
527   current =  _newTokenNode((char)number, __position++);
528       } else {
529   // We have either \0, an escaped digit, or a backreference.
530   __putback();
531   if(__lookahead == '0'){
532     // \0 matches the null character
533     __match('0');
534     current = new TokenNode('\0', __position++);
535   } else {
536     // Either an escaped digit or backreference.
537     number = Character.digit(__lookahead, 10);
538     current =  _newTokenNode(__lookahead, __position++);
539     __match(__lookahead);
540   }
541       }
542     } else if(__lookahead == 'b') {
543       // Inside of a character class the \b means backspace, otherwise
544       // it means a word boundary
545       //if(__inCharacterClass)
546       // \b always means backspace
547       current = new TokenNode('\b', __position++);
548       /*
549       else 
550   current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN,
551         position++);
552         */
553       __match('b');
554     } /*else if(__lookahead == 'B' && !__inCharacterClass){
555       current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN,
556             position++);
557       __match('B');
558     } */ else {
559       CharacterClassNode characterSet;
560       token = __lookahead;
561 
562       switch(__lookahead){
563       case 'n' : token = '\n'; break;
564       case 'r' : token = '\r'; break;
565       case 't' : token = '\t'; break;
566       case 'f' : token = '\f'; break;
567       }
568 
569       switch(token) {
570       case 'd' :
571   characterSet = new CharacterClassNode(__position++);
572   characterSet._addTokenRange('0', '9');
573   current = characterSet;
574   break;
575       case 'D' :
576   characterSet = new NegativeCharacterClassNode(__position++);
577   characterSet._addTokenRange('0', '9');
578   current = characterSet;
579   break;
580       case 'w' :
581   characterSet = new CharacterClassNode(__position++);
582   characterSet._addTokenRange('0', '9');
583   characterSet._addTokenRange('a', 'z');
584   characterSet._addTokenRange('A', 'Z');
585   characterSet._addToken('_');
586   current = characterSet;
587   break;
588       case 'W' :
589   characterSet = new NegativeCharacterClassNode(__position++);
590   characterSet._addTokenRange('0', '9');
591   characterSet._addTokenRange('a', 'z');
592   characterSet._addTokenRange('A', 'Z');
593   characterSet._addToken('_');
594   current = characterSet;
595   break;
596       case 's' :
597   characterSet = new CharacterClassNode(__position++);
598   characterSet._addToken(' ');
599   characterSet._addToken('\f');
600   characterSet._addToken('\n');
601   characterSet._addToken('\r');
602   characterSet._addToken('\t');
603   current = characterSet;
604   break;
605       case 'S' :
606   characterSet = new NegativeCharacterClassNode(__position++);
607   characterSet._addToken(' ');
608   characterSet._addToken('\f');
609   characterSet._addToken('\n');
610   characterSet._addToken('\r');
611   characterSet._addToken('\t');
612   current = characterSet;
613   break;
614   default  : current = _newTokenNode(token, __position++); break;
615       }
616 
617       __match(__lookahead);
618     }
619 
620     return current;
621   }
622 
623   private SyntaxNode __atom() throws MalformedPatternException {
624     SyntaxNode current;
625 
626     if(__lookahead == '(') {
627       __match('(');
628       ++__openParen;
629       current = __regex();
630       __match(')');
631       ++__closeParen;
632     } else if(__lookahead == '[')
633       current = __characterClass();
634     else if(__lookahead == '.') {
635       CharacterClassNode characterSet;
636 
637       __match('.');
638       characterSet = new NegativeCharacterClassNode(__position++);
639       if(__multiline)
640   characterSet._addToken('\n');
641       current = characterSet;
642     } else if(__lookahead == '\\') {
643       current = __backslashToken();
644     } /*else if(__lookahead == '^') {
645       current =
646   new TokenNode((char)LeafNode._BEGIN_LINE_MARKER_TOKEN, __position++);
647       __match('^');
648     } else if(__lookahead == '$') {
649       current =
650   new TokenNode((char)LeafNode._END_LINE_MARKER_TOKEN, __position++);
651       __match('$');
652     } */ else if(!__isMetachar(__lookahead)) {
653       current = _newTokenNode(__lookahead, __position++);
654       __match(__lookahead);
655     } else
656       throw
657   new MalformedPatternException("Parse error: unexpected character " +
658         __lookahead + " at position " + __bytesRead);
659 
660     return current;
661   }
662 
663 
664   private SyntaxNode __characterClass() throws MalformedPatternException {
665     char lastToken, token;
666     SyntaxNode node;
667     CharacterClassNode current;
668 
669     __match('[');
670     __inCharacterClass = true;
671 
672     if(__lookahead == '^'){
673       __match('^');
674       current = new NegativeCharacterClassNode(__position++);
675     } else
676       current = new CharacterClassNode(__position++);
677 
678     while(__lookahead != ']' && __lookahead != _END_OF_INPUT) {
679 
680       if(__lookahead == '\\'){
681   node = __backslashToken();
682   --__position;
683 
684   // __backslashToken() (actually newTokenNode()) does not take care of
685         // case insensitivity when __inCharacterClass is true.
686   if(node instanceof TokenNode){
687     lastToken = ((TokenNode)node)._token;
688     current._addToken(lastToken);
689     if(!__caseSensitive)
690       current._addToken(_toggleCase(lastToken));
691   } else {
692     CharacterClassNode slash;
693     slash = (CharacterClassNode)node;
694     // This could be made more efficient by manipulating the
695     // characterSet elements of the CharacterClassNodes but
696     // for the moment, this is more clear.
697     for(token=0; token < LeafNode._NUM_TOKENS; token++){
698       if(slash._matches(token))
699         current._addToken(token);
700     }
701 
702     // A byproduct of this act is that when a '-' occurs after
703     // a \d, \w, etc. it is not interpreted as a range and no
704     // parse exception is thrown.
705     // This is considered a feature and not a bug for now.
706     continue;
707   }
708       } else {
709   lastToken = __lookahead;
710   current._addToken(__lookahead);
711   if(!__caseSensitive)
712     current._addToken(_toggleCase(__lookahead));
713   __match(__lookahead);
714       }
715 
716       // In Perl, a - is a token if it occurs at the beginning
717       // or end of the character class.  Anywhere else, it indicates
718       // a range.
719       // A byproduct of this implementation is that if a '-' occurs
720       // after the end of a range, it is interpreted as a '-' and no
721       // exception is thrown. e.g., the second dash in [a-z-x]
722       // This is considered a feature and not a bug for now.
723       if(__lookahead == '-'){
724   __match('-');
725   if(__lookahead == ']'){
726     current._addToken('-');
727     break;
728   } else if(__lookahead == '\\') {
729     node = __backslashToken();
730     --__position;
731     if(node instanceof TokenNode)
732       token = ((TokenNode)node)._token;
733     else
734       throw new MalformedPatternException(
735      "Parse error: invalid range specified at position " + __bytesRead);
736   } else {
737     token = __lookahead;
738     __match(__lookahead);
739   }
740 
741   if(token < lastToken)
742     throw new MalformedPatternException(
743    "Parse error: invalid range specified at position " + __bytesRead);
744   current._addTokenRange(lastToken + 1, token);
745   if(!__caseSensitive)
746     current._addTokenRange(_toggleCase((char)(lastToken + 1)),
747         _toggleCase(token));
748       }
749     }
750 
751     __match(']');
752     __inCharacterClass = false;
753     return current;
754   }
755 
756 
757   SyntaxNode _newTokenNode(char token, int position){
758     if(!__inCharacterClass && !__caseSensitive &&
759        (_isUpperCase(token) || _isLowerCase(token))){
760       CharacterClassNode node = new CharacterClassNode(position);
761       node._addToken(token);
762       node._addToken(_toggleCase(token));
763       return node;
764     }
765 
766     return new TokenNode(token, position);
767   }
768 
769 
770   SyntaxTree _parse(char[] expression) throws MalformedPatternException {
771     SyntaxTree tree;
772 
773     __openParen = __closeParen = 0;
774     __regularExpression = expression;
775     __bytesRead = 0;
776     __expressionLength = expression.length;
777     __inCharacterClass = false;
778 
779     __position = 0;
780     __match(__lookahead); // Call match to read first input.
781 
782     if(__lookahead == '^') {
783       __beginAnchor = true;
784       __match(__lookahead);
785     }
786 
787     if(__expressionLength > 0 && expression[__expressionLength - 1] == '$') {
788       --__expressionLength;
789       __endAnchor = true;
790     }
791 
792     if(__expressionLength > 1 || (__expressionLength == 1 && !__beginAnchor)) {
793       CatNode root;
794       root = new CatNode();
795       root._left  = __regex();
796       // end marker
797       root._right =
798   new TokenNode((char)LeafNode._END_MARKER_TOKEN, __position++);
799       tree = new SyntaxTree(root, __position);
800     } else 
801       tree = new
802   SyntaxTree(new TokenNode((char)LeafNode._END_MARKER_TOKEN, 0), 1);
803 
804     tree._computeFollowPositions();
805 
806     return tree;
807   }
808 
809 
810   /**
811    * Compiles an Awk regular expression into an AwkPattern instance that
812    * can be used by an AwkMatcher object to perform pattern matching.
813    * <p>
814    * @param pattern  An Awk regular expression to compile.
815    * @param options  A set of flags giving the compiler instructions on
816    *                 how to treat the regular expression.  Currently the
817    *                 only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
818    * @return A Pattern instance constituting the compiled regular expression.
819    *         This instance will always be an AwkPattern and can be reliably
820    *         be casted to an AwkPattern.
821    * @exception MalformedPatternException  If the compiled expression
822    *  is not a valid Awk regular expression.
823    */
824   public Pattern compile(char[] pattern, int options)
825        throws MalformedPatternException
826   {
827     SyntaxTree tree;
828     AwkPattern regexp;
829 
830     __beginAnchor   = __endAnchor = false;
831     __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
832     __multiline     = ((options & MULTILINE_MASK) != 0);
833     tree   = _parse(pattern);
834     regexp = new AwkPattern(new String(pattern), tree);
835     regexp._options = options;
836     regexp._hasBeginAnchor = __beginAnchor;
837     regexp._hasEndAnchor   = __endAnchor;
838 
839     return regexp;
840   }
841 
842 
843   /**
844    * Compiles an Awk regular expression into an AwkPattern instance that
845    * can be used by an AwkMatcher object to perform pattern matching.
846    * <p>
847    * @param pattern  An Awk regular expression to compile.
848    * @param options  A set of flags giving the compiler instructions on
849    *                 how to treat the regular expression.  Currently the
850    *                 only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
851    * @return A Pattern instance constituting the compiled regular expression.
852    *         This instance will always be an AwkPattern and can be reliably
853    *         be casted to an AwkPattern.
854    * @exception MalformedPatternException  If the compiled expression
855    *  is not a valid Awk regular expression.
856    */
857   public Pattern compile(String pattern, int options)
858        throws MalformedPatternException
859   {
860     SyntaxTree tree;
861     AwkPattern regexp;
862 
863     __beginAnchor = __endAnchor = false;
864     __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
865     __multiline     = ((options & MULTILINE_MASK) != 0);
866     tree   = _parse(pattern.toCharArray());
867     regexp = new AwkPattern(pattern, tree);
868     regexp._options = options;
869     regexp._hasBeginAnchor = __beginAnchor;
870     regexp._hasEndAnchor   = __endAnchor;
871 
872     return regexp;
873   }
874 
875   /**
876    * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
877    * <p>
878    * @param pattern  A regular expression to compile.
879    * @return A Pattern instance constituting the compiled regular expression.
880    *         This instance will always be an AwkPattern and can be reliably
881    *         be casted to an AwkPattern.
882    * @exception MalformedPatternException  If the compiled expression
883    *  is not a valid Awk regular expression.
884    */
885   public Pattern compile(char[] pattern) throws MalformedPatternException {
886     return compile(pattern, DEFAULT_MASK);
887   }
888 
889 
890   /**
891    * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
892    * <p>
893    * @param pattern  A regular expression to compile.
894    * @return A Pattern instance constituting the compiled regular expression.
895    *         This instance will always be an AwkPattern and can be reliably
896    *         be casted to an AwkPattern.
897    * @exception MalformedPatternException  If the compiled expression
898    *  is not a valid Awk regular expression.
899    */
900   public Pattern compile(String pattern) throws MalformedPatternException {
901     return compile(pattern, DEFAULT_MASK);
902   }
903 
904 }