Source code: org/apache/oro/text/awk/AwkCompiler.java
1 /*
2 * $Id: AwkCompiler.java,v 1.10 2003/11/07 20:16:24 dfs Exp $
3 *
4 * ====================================================================
5 * The Apache Software License, Version 1.1
6 *
7 * Copyright (c) 2000 The Apache Software Foundation. All rights
8 * reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in
19 * the documentation and/or other materials provided with the
20 * distribution.
21 *
22 * 3. The end-user documentation included with the redistribution,
23 * if any, must include the following acknowledgment:
24 * "This product includes software developed by the
25 * Apache Software Foundation (http://www.apache.org/)."
26 * Alternately, this acknowledgment may appear in the software itself,
27 * if and wherever such third-party acknowledgments normally appear.
28 *
29 * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
30 * must not be used to endorse or promote products derived from this
31 * software without prior written permission. For written
32 * permission, please contact apache@apache.org.
33 *
34 * 5. Products derived from this software may not be called "Apache"
35 * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
36 * name, without prior written permission of the Apache Software Foundation.
37 *
38 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49 * SUCH DAMAGE.
50 * ====================================================================
51 *
52 * This software consists of voluntary contributions made by many
53 * individuals on behalf of the Apache Software Foundation. For more
54 * information on the Apache Software Foundation, please see
55 * <http://www.apache.org/>.
56 */
57
58
59 package org.apache.oro.text.awk;
60
61 import org.apache.oro.text.regex.*;
62
63 /**
64 * The AwkCompiler class is used to create compiled regular expressions
65 * conforming to the Awk regular expression syntax. It generates
66 * AwkPattern instances upon compilation to be used in conjunction
67 * with an AwkMatcher instance. AwkMatcher finds true leftmost-longest
68 * matches, so you must take care with how you formulate your regular
69 * expression to avoid matching more than you really want.
70 * <p>
71 * The supported regular expression syntax is a superset of traditional AWK,
72 * but NOT to be confused with GNU AWK or other AWK variants. Additionally,
73 * this AWK implementation is DFA-based and only supports 8-bit ASCII.
74 * Consequently, these classes can perform very fast pattern matches in
75 * most cases.
76 * <p>
77 * This is the traditional Awk syntax that is supported:
78 * <ul>
79 * <li> Alternatives separated by |
80 * <li> Quantified atoms
81 * <dl compact>
82 * <dt> * <dd> Match 0 or more times.
83 * <dt> + <dd> Match 1 or more times.
84 * <dt> ? <dd> Match 0 or 1 times.
85 * </dl>
86 * <li> Atoms
87 * <ul>
88 * <li> regular expression within parentheses
89 * <li> a . matches everything including newline
90 * <li> a ^ is a null token matching the beginning of a string
91 * but has no relation to newlines (and is only valid at the
92 * beginning of a regex; this differs from traditional awk
93 * for the sake of efficiency in Java).
94 * <li> a $ is a null token matching the end of a string but has
95 * no relation to newlines (and is only valid at the
96 * end of a regex; this differs from traditional awk for the
97 * sake of efficiency in Java).
98 * <li> Character classes (e.g., [abcd]) and ranges (e.g. [a-z])
99 * <ul>
100 * <li> Special backslashed characters work within a character class
101 * </ul>
102 * <li> Special backslashed characters
103 * <dl compact>
104 * <dt> \b <dd> backspace
105 * <dt> \n <dd> newline
106 * <dt> \r <dd> carriage return
107 * <dt> \t <dd> tab
108 * <dt> \f <dd> formfeed
109 * <dt> \xnn <dd> hexadecimal representation of character
110 * <dt> \nn or \nnn <dd> octal representation of character
111 * <dt> Any other backslashed character matches itself
112 * </dl>
113 * </ul></ul>
114 * <p>
115 * This is the extended syntax that is supported:
116 * <ul>
117 * <li> Quantified atoms
118 * <dl compact>
119 * <dt> {n,m} <dd> Match at least n but not more than m times.
120 * <dt> {n,} <dd> Match at least n times.
121 * <dt> {n} <dd> Match exactly n times.
122 * </dl>
123 * <li> Atoms
124 * <ul>
125 * <li> Special backslashed characters
126 * <dl compact>
127 * <dt> \d <dd> digit [0-9]
128 * <dt> \D <dd> non-digit [^0-9]
129 * <dt> \w <dd> word character [0-9a-z_A-Z]
130 * <dt> \W <dd> a non-word character [^0-9a-z_A-Z]
131 * <dt> \s <dd> a whitespace character [ \t\n\r\f]
132 * <dt> \S <dd> a non-whitespace character [^ \t\n\r\f]
133 * <dt> \cD <dd> matches the corresponding control character
134 * <dt> \0 <dd> matches null character
135 * </dl>
136 * </ul></ul>
137 *
138 * @version @version@
139 * @since 1.0
140 * @see org.apache.oro.text.regex.PatternCompiler
141 * @see org.apache.oro.text.regex.MalformedPatternException
142 * @see AwkPattern
143 * @see AwkMatcher
144 */
145 public final class AwkCompiler implements PatternCompiler {
146
147 /**
148 * The default mask for the {@link #compile compile} methods.
149 * It is equal to 0 and indicates no special options are active.
150 */
151 public static final int DEFAULT_MASK = 0;
152
153 /**
154 * A mask passed as an option to the {@link #compile compile} methods
155 * to indicate a compiled regular expression should be case insensitive.
156 */
157 public static final int CASE_INSENSITIVE_MASK = 0x0001;
158
159
160 /**
161 * A mask passed as an option to the {@link #compile compile} methods
162 * to indicate a compiled regular expression should treat input as having
163 * multiple lines. This option affects the interpretation of
164 * the <b> . </b> metacharacters. When this mask is used,
165 * the <b> . </b> metacharacter will not match newlines. The default
166 * behavior is for <b> . </b> to match newlines.
167 */
168 public static final int MULTILINE_MASK = 0x0002;
169
170 static final char _END_OF_INPUT = '\uFFFF';
171
172 // All of these are initialized by the compile() and _parse() methods
173 // so there is no need or use in initializing them in the constructor
174 // although this may change in the future.
175 private boolean __inCharacterClass, __caseSensitive, __multiline;
176 private boolean __beginAnchor, __endAnchor;
177 private char __lookahead;
178 private int __position, __bytesRead, __expressionLength;
179 private char[] __regularExpression;
180 private int __openParen, __closeParen;
181
182 // We do not currently need to initialize any state, but keep this
183 // commented out as a reminder that we may have to at some point.
184 //public AwkCompiler() { }
185
186 private static boolean __isMetachar(char token) {
187 return (token == '*' || token == '?' || token == '+' ||
188 token == '[' || token == ']' || token == '(' ||
189 token == ')' || token == '|' || /* token == '^' ||
190 token == '$' || */ token == '.');
191 }
192
193 static boolean _isWordCharacter(char token) {
194 return ((token >= 'a' && token <= 'z') ||
195 (token >= 'A' && token <= 'Z') ||
196 (token >= '0' && token <= '9') ||
197 (token == '_'));
198 }
199
200 static boolean _isLowerCase(char token){
201 return (token >= 'a' && token <= 'z');
202 }
203
204 static boolean _isUpperCase(char token){
205 return (token >= 'A' && token <= 'Z');
206 }
207
208 static char _toggleCase(char token){
209 if(_isUpperCase(token))
210 return (char)(token + 32);
211 else if(_isLowerCase(token))
212 return (char)(token - 32);
213
214 return token;
215 }
216
217
218 private void __match(char token) throws MalformedPatternException {
219 if(token == __lookahead){
220 if(__bytesRead < __expressionLength)
221 __lookahead = __regularExpression[__bytesRead++];
222 else
223 __lookahead = _END_OF_INPUT;
224 }
225 else
226 throw new MalformedPatternException("token: " + token +
227 " does not match lookahead: " +
228 __lookahead + " at position: " +
229 __bytesRead);
230 }
231
232 private void __putback() {
233 if(__lookahead != _END_OF_INPUT)
234 --__bytesRead;
235 __lookahead = __regularExpression[__bytesRead - 1];
236 }
237
238 private SyntaxNode __regex() throws MalformedPatternException {
239 SyntaxNode left;
240
241 left = __branch();
242
243 if(__lookahead == '|') {
244 __match('|');
245 return (new OrNode(left, __regex()));
246 }
247
248 return left;
249 }
250
251
252 private SyntaxNode __branch() throws MalformedPatternException {
253 CatNode current;
254 SyntaxNode left, root;
255
256 left = __piece();
257
258 if(__lookahead == ')'){
259 if(__openParen > __closeParen)
260 return left;
261 else
262 throw
263 new MalformedPatternException("Parse error: close parenthesis"
264 + " without matching open parenthesis at position " + __bytesRead);
265 } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT)
266 return left;
267
268 root = current = new CatNode();
269 current._left = left;
270
271 while(true) {
272 left = __piece();
273
274 if(__lookahead == ')'){
275 if(__openParen > __closeParen){
276 current._right = left;
277 break;
278 }
279 else
280 throw
281 new MalformedPatternException("Parse error: close parenthesis"
282 + " without matching open parenthesis at position " + __bytesRead);
283 } else if(__lookahead == '|' || __lookahead == _END_OF_INPUT){
284 current._right = left;
285 break;
286 }
287
288 current._right = new CatNode();
289 current = (CatNode)current._right;
290 current._left = left;
291 }
292
293 return root;
294 }
295
296
297 private SyntaxNode __piece() throws MalformedPatternException {
298 SyntaxNode left;
299
300 left = __atom();
301
302 switch(__lookahead){
303 case '+' : __match('+'); return (new PlusNode(left));
304 case '?' : __match('?'); return (new QuestionNode(left));
305 case '*' : __match('*'); return (new StarNode(left));
306 case '{' : return __repetition(left);
307 }
308
309 return left;
310 }
311
312 // if numChars is 0, this means match as many as you want
313 private int __parseUnsignedInteger(int radix, int minDigits, int maxDigits)
314 throws MalformedPatternException {
315 int num, digits = 0;
316 StringBuffer buf;
317
318 // We don't expect huge numbers, so an initial buffer of 4 is fine.
319 buf = new StringBuffer(4);
320
321 while(Character.digit(__lookahead, radix) != -1 && digits < maxDigits){
322 buf.append((char)__lookahead);
323 __match(__lookahead);
324 ++digits;
325 }
326
327 if(digits < minDigits || digits > maxDigits)
328 throw
329 new MalformedPatternException(
330 "Parse error: unexpected number of digits at position " + __bytesRead);
331
332 try {
333 num = Integer.parseInt(buf.toString(), radix);
334 } catch(NumberFormatException e) {
335 throw
336 new MalformedPatternException("Parse error: numeric value at " +
337 "position " + __bytesRead + " is invalid");
338 }
339
340 return num;
341 }
342
343 private SyntaxNode __repetition(SyntaxNode atom)
344 throws MalformedPatternException {
345 int min, max, startPosition[];
346 SyntaxNode root = null;
347 CatNode catNode;
348
349 __match('{');
350
351 min = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
352 startPosition = new int[1];
353 startPosition[0] = __position;
354
355 if(__lookahead == '}'){
356 // Match exactly min times. Concatenate the atom min times.
357 __match('}');
358
359 if(min == 0)
360 throw
361 new MalformedPatternException(
362 "Parse error: Superfluous interval specified at position " +
363 __bytesRead + ". Number of occurences was set to zero.");
364
365 if(min == 1)
366 return atom;
367
368 root = catNode = new CatNode();
369 catNode._left = atom;
370
371 while(--min > 1) {
372 atom = atom._clone(startPosition);
373
374 catNode._right = new CatNode();
375 catNode = (CatNode)catNode._right;
376 catNode._left = atom;
377 }
378
379 catNode._right = atom._clone(startPosition);
380 } else if(__lookahead == ','){
381 __match(',');
382
383 if(__lookahead == '}') {
384 // match at least min times
385 __match('}');
386
387 if(min == 0)
388 return new StarNode(atom);
389
390 if(min == 1)
391 return new PlusNode(atom);
392
393 root = catNode = new CatNode();
394 catNode._left = atom;
395
396 while(--min > 0) {
397 atom = atom._clone(startPosition);
398
399 catNode._right = new CatNode();
400 catNode = (CatNode)catNode._right;
401 catNode._left = atom;
402 }
403
404 catNode._right = new StarNode(atom._clone(startPosition));
405 } else {
406 // match at least min times and at most max times
407 max = __parseUnsignedInteger(10, 1, Integer.MAX_VALUE);
408 __match('}');
409
410 if(max < min)
411 throw
412 new MalformedPatternException("Parse error: invalid interval; "
413 + max + " is less than " + min + " at position " + __bytesRead);
414 if(max == 0)
415 throw
416 new MalformedPatternException(
417 "Parse error: Superfluous interval specified at position " +
418 __bytesRead + ". Number of occurences was set to zero.");
419
420 if(min == 0) {
421 if(max == 1)
422 return new QuestionNode(atom);
423
424 root = catNode = new CatNode();
425 atom = new QuestionNode(atom);
426 catNode._left = atom;
427
428 while(--max > 1) {
429 atom = atom._clone(startPosition);
430
431 catNode._right = new CatNode();
432 catNode = (CatNode)catNode._right;
433 catNode._left = atom;
434 }
435
436 catNode._right = atom._clone(startPosition);
437 } else if(min == max) {
438 if(min == 1)
439 return atom;
440
441 root = catNode = new CatNode();
442 catNode._left = atom;
443
444 while(--min > 1) {
445 atom = atom._clone(startPosition);
446
447 catNode._right = new CatNode();
448 catNode = (CatNode)catNode._right;
449 catNode._left = atom;
450 }
451
452 catNode._right = atom._clone(startPosition);
453 } else {
454 int count;
455
456 root = catNode = new CatNode();
457 catNode._left = atom;
458
459 for(count=1; count < min; count++) {
460 atom = atom._clone(startPosition);
461
462 catNode._right = new CatNode();
463 catNode = (CatNode)catNode._right;
464 catNode._left = atom;
465 }
466
467 atom = new QuestionNode(atom._clone(startPosition));
468
469 count = max-min;
470
471 if(count == 1)
472 catNode._right = atom;
473 else {
474 catNode._right = new CatNode();
475 catNode = (CatNode)catNode._right;
476 catNode._left = atom;
477
478 while(--count > 1) {
479 atom = atom._clone(startPosition);
480
481 catNode._right = new CatNode();
482 catNode = (CatNode)catNode._right;
483 catNode._left = atom;
484 }
485
486 catNode._right = atom._clone(startPosition);
487 }
488 }
489 }
490 } else
491 throw
492 new MalformedPatternException("Parse error: unexpected character " +
493 __lookahead + " in interval at position " + __bytesRead);
494 __position = startPosition[0];
495 return root;
496 }
497
498
499 private SyntaxNode __backslashToken() throws MalformedPatternException {
500 SyntaxNode current;
501 char token;
502 int number;
503
504 __match('\\');
505
506 if(__lookahead == 'x'){
507 __match('x');
508 // Parse a hexadecimal number
509 current = _newTokenNode((char)__parseUnsignedInteger(16, 2, 2),
510 __position++);
511 } else if(__lookahead == 'c') {
512 __match('c');
513 // Create a control character
514 token = Character.toUpperCase(__lookahead);
515 token = (char)(token > 63 ? token - 64 : token + 64);
516 current = new TokenNode(token, __position++);
517 __match(__lookahead);
518 } else if(__lookahead >= '0' && __lookahead <= '9') {
519 __match(__lookahead);
520
521 if(__lookahead >= '0' && __lookahead <= '9'){
522 // We have an octal character or a multi-digit backreference.
523 // Assume octal character for now.
524 __putback();
525 number = __parseUnsignedInteger(10, 2, 3);
526 number = Integer.parseInt(Integer.toString(number), 8);
527 current = _newTokenNode((char)number, __position++);
528 } else {
529 // We have either \0, an escaped digit, or a backreference.
530 __putback();
531 if(__lookahead == '0'){
532 // \0 matches the null character
533 __match('0');
534 current = new TokenNode('\0', __position++);
535 } else {
536 // Either an escaped digit or backreference.
537 number = Character.digit(__lookahead, 10);
538 current = _newTokenNode(__lookahead, __position++);
539 __match(__lookahead);
540 }
541 }
542 } else if(__lookahead == 'b') {
543 // Inside of a character class the \b means backspace, otherwise
544 // it means a word boundary
545 //if(__inCharacterClass)
546 // \b always means backspace
547 current = new TokenNode('\b', __position++);
548 /*
549 else
550 current = new TokenNode((char)LeafNode._WORD_BOUNDARY_MARKER_TOKEN,
551 position++);
552 */
553 __match('b');
554 } /*else if(__lookahead == 'B' && !__inCharacterClass){
555 current = new TokenNode((char)LeafNode._NONWORD_BOUNDARY_MARKER_TOKEN,
556 position++);
557 __match('B');
558 } */ else {
559 CharacterClassNode characterSet;
560 token = __lookahead;
561
562 switch(__lookahead){
563 case 'n' : token = '\n'; break;
564 case 'r' : token = '\r'; break;
565 case 't' : token = '\t'; break;
566 case 'f' : token = '\f'; break;
567 }
568
569 switch(token) {
570 case 'd' :
571 characterSet = new CharacterClassNode(__position++);
572 characterSet._addTokenRange('0', '9');
573 current = characterSet;
574 break;
575 case 'D' :
576 characterSet = new NegativeCharacterClassNode(__position++);
577 characterSet._addTokenRange('0', '9');
578 current = characterSet;
579 break;
580 case 'w' :
581 characterSet = new CharacterClassNode(__position++);
582 characterSet._addTokenRange('0', '9');
583 characterSet._addTokenRange('a', 'z');
584 characterSet._addTokenRange('A', 'Z');
585 characterSet._addToken('_');
586 current = characterSet;
587 break;
588 case 'W' :
589 characterSet = new NegativeCharacterClassNode(__position++);
590 characterSet._addTokenRange('0', '9');
591 characterSet._addTokenRange('a', 'z');
592 characterSet._addTokenRange('A', 'Z');
593 characterSet._addToken('_');
594 current = characterSet;
595 break;
596 case 's' :
597 characterSet = new CharacterClassNode(__position++);
598 characterSet._addToken(' ');
599 characterSet._addToken('\f');
600 characterSet._addToken('\n');
601 characterSet._addToken('\r');
602 characterSet._addToken('\t');
603 current = characterSet;
604 break;
605 case 'S' :
606 characterSet = new NegativeCharacterClassNode(__position++);
607 characterSet._addToken(' ');
608 characterSet._addToken('\f');
609 characterSet._addToken('\n');
610 characterSet._addToken('\r');
611 characterSet._addToken('\t');
612 current = characterSet;
613 break;
614 default : current = _newTokenNode(token, __position++); break;
615 }
616
617 __match(__lookahead);
618 }
619
620 return current;
621 }
622
623 private SyntaxNode __atom() throws MalformedPatternException {
624 SyntaxNode current;
625
626 if(__lookahead == '(') {
627 __match('(');
628 ++__openParen;
629 current = __regex();
630 __match(')');
631 ++__closeParen;
632 } else if(__lookahead == '[')
633 current = __characterClass();
634 else if(__lookahead == '.') {
635 CharacterClassNode characterSet;
636
637 __match('.');
638 characterSet = new NegativeCharacterClassNode(__position++);
639 if(__multiline)
640 characterSet._addToken('\n');
641 current = characterSet;
642 } else if(__lookahead == '\\') {
643 current = __backslashToken();
644 } /*else if(__lookahead == '^') {
645 current =
646 new TokenNode((char)LeafNode._BEGIN_LINE_MARKER_TOKEN, __position++);
647 __match('^');
648 } else if(__lookahead == '$') {
649 current =
650 new TokenNode((char)LeafNode._END_LINE_MARKER_TOKEN, __position++);
651 __match('$');
652 } */ else if(!__isMetachar(__lookahead)) {
653 current = _newTokenNode(__lookahead, __position++);
654 __match(__lookahead);
655 } else
656 throw
657 new MalformedPatternException("Parse error: unexpected character " +
658 __lookahead + " at position " + __bytesRead);
659
660 return current;
661 }
662
663
664 private SyntaxNode __characterClass() throws MalformedPatternException {
665 char lastToken, token;
666 SyntaxNode node;
667 CharacterClassNode current;
668
669 __match('[');
670 __inCharacterClass = true;
671
672 if(__lookahead == '^'){
673 __match('^');
674 current = new NegativeCharacterClassNode(__position++);
675 } else
676 current = new CharacterClassNode(__position++);
677
678 while(__lookahead != ']' && __lookahead != _END_OF_INPUT) {
679
680 if(__lookahead == '\\'){
681 node = __backslashToken();
682 --__position;
683
684 // __backslashToken() (actually newTokenNode()) does not take care of
685 // case insensitivity when __inCharacterClass is true.
686 if(node instanceof TokenNode){
687 lastToken = ((TokenNode)node)._token;
688 current._addToken(lastToken);
689 if(!__caseSensitive)
690 current._addToken(_toggleCase(lastToken));
691 } else {
692 CharacterClassNode slash;
693 slash = (CharacterClassNode)node;
694 // This could be made more efficient by manipulating the
695 // characterSet elements of the CharacterClassNodes but
696 // for the moment, this is more clear.
697 for(token=0; token < LeafNode._NUM_TOKENS; token++){
698 if(slash._matches(token))
699 current._addToken(token);
700 }
701
702 // A byproduct of this act is that when a '-' occurs after
703 // a \d, \w, etc. it is not interpreted as a range and no
704 // parse exception is thrown.
705 // This is considered a feature and not a bug for now.
706 continue;
707 }
708 } else {
709 lastToken = __lookahead;
710 current._addToken(__lookahead);
711 if(!__caseSensitive)
712 current._addToken(_toggleCase(__lookahead));
713 __match(__lookahead);
714 }
715
716 // In Perl, a - is a token if it occurs at the beginning
717 // or end of the character class. Anywhere else, it indicates
718 // a range.
719 // A byproduct of this implementation is that if a '-' occurs
720 // after the end of a range, it is interpreted as a '-' and no
721 // exception is thrown. e.g., the second dash in [a-z-x]
722 // This is considered a feature and not a bug for now.
723 if(__lookahead == '-'){
724 __match('-');
725 if(__lookahead == ']'){
726 current._addToken('-');
727 break;
728 } else if(__lookahead == '\\') {
729 node = __backslashToken();
730 --__position;
731 if(node instanceof TokenNode)
732 token = ((TokenNode)node)._token;
733 else
734 throw new MalformedPatternException(
735 "Parse error: invalid range specified at position " + __bytesRead);
736 } else {
737 token = __lookahead;
738 __match(__lookahead);
739 }
740
741 if(token < lastToken)
742 throw new MalformedPatternException(
743 "Parse error: invalid range specified at position " + __bytesRead);
744 current._addTokenRange(lastToken + 1, token);
745 if(!__caseSensitive)
746 current._addTokenRange(_toggleCase((char)(lastToken + 1)),
747 _toggleCase(token));
748 }
749 }
750
751 __match(']');
752 __inCharacterClass = false;
753 return current;
754 }
755
756
757 SyntaxNode _newTokenNode(char token, int position){
758 if(!__inCharacterClass && !__caseSensitive &&
759 (_isUpperCase(token) || _isLowerCase(token))){
760 CharacterClassNode node = new CharacterClassNode(position);
761 node._addToken(token);
762 node._addToken(_toggleCase(token));
763 return node;
764 }
765
766 return new TokenNode(token, position);
767 }
768
769
770 SyntaxTree _parse(char[] expression) throws MalformedPatternException {
771 SyntaxTree tree;
772
773 __openParen = __closeParen = 0;
774 __regularExpression = expression;
775 __bytesRead = 0;
776 __expressionLength = expression.length;
777 __inCharacterClass = false;
778
779 __position = 0;
780 __match(__lookahead); // Call match to read first input.
781
782 if(__lookahead == '^') {
783 __beginAnchor = true;
784 __match(__lookahead);
785 }
786
787 if(__expressionLength > 0 && expression[__expressionLength - 1] == '$') {
788 --__expressionLength;
789 __endAnchor = true;
790 }
791
792 if(__expressionLength > 1 || (__expressionLength == 1 && !__beginAnchor)) {
793 CatNode root;
794 root = new CatNode();
795 root._left = __regex();
796 // end marker
797 root._right =
798 new TokenNode((char)LeafNode._END_MARKER_TOKEN, __position++);
799 tree = new SyntaxTree(root, __position);
800 } else
801 tree = new
802 SyntaxTree(new TokenNode((char)LeafNode._END_MARKER_TOKEN, 0), 1);
803
804 tree._computeFollowPositions();
805
806 return tree;
807 }
808
809
810 /**
811 * Compiles an Awk regular expression into an AwkPattern instance that
812 * can be used by an AwkMatcher object to perform pattern matching.
813 * <p>
814 * @param pattern An Awk regular expression to compile.
815 * @param options A set of flags giving the compiler instructions on
816 * how to treat the regular expression. Currently the
817 * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
818 * @return A Pattern instance constituting the compiled regular expression.
819 * This instance will always be an AwkPattern and can be reliably
820 * be casted to an AwkPattern.
821 * @exception MalformedPatternException If the compiled expression
822 * is not a valid Awk regular expression.
823 */
824 public Pattern compile(char[] pattern, int options)
825 throws MalformedPatternException
826 {
827 SyntaxTree tree;
828 AwkPattern regexp;
829
830 __beginAnchor = __endAnchor = false;
831 __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
832 __multiline = ((options & MULTILINE_MASK) != 0);
833 tree = _parse(pattern);
834 regexp = new AwkPattern(new String(pattern), tree);
835 regexp._options = options;
836 regexp._hasBeginAnchor = __beginAnchor;
837 regexp._hasEndAnchor = __endAnchor;
838
839 return regexp;
840 }
841
842
843 /**
844 * Compiles an Awk regular expression into an AwkPattern instance that
845 * can be used by an AwkMatcher object to perform pattern matching.
846 * <p>
847 * @param pattern An Awk regular expression to compile.
848 * @param options A set of flags giving the compiler instructions on
849 * how to treat the regular expression. Currently the
850 * only meaningful flag is AwkCompiler.CASE_INSENSITIVE_MASK.
851 * @return A Pattern instance constituting the compiled regular expression.
852 * This instance will always be an AwkPattern and can be reliably
853 * be casted to an AwkPattern.
854 * @exception MalformedPatternException If the compiled expression
855 * is not a valid Awk regular expression.
856 */
857 public Pattern compile(String pattern, int options)
858 throws MalformedPatternException
859 {
860 SyntaxTree tree;
861 AwkPattern regexp;
862
863 __beginAnchor = __endAnchor = false;
864 __caseSensitive = ((options & CASE_INSENSITIVE_MASK) == 0);
865 __multiline = ((options & MULTILINE_MASK) != 0);
866 tree = _parse(pattern.toCharArray());
867 regexp = new AwkPattern(pattern, tree);
868 regexp._options = options;
869 regexp._hasBeginAnchor = __beginAnchor;
870 regexp._hasEndAnchor = __endAnchor;
871
872 return regexp;
873 }
874
875 /**
876 * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
877 * <p>
878 * @param pattern A regular expression to compile.
879 * @return A Pattern instance constituting the compiled regular expression.
880 * This instance will always be an AwkPattern and can be reliably
881 * be casted to an AwkPattern.
882 * @exception MalformedPatternException If the compiled expression
883 * is not a valid Awk regular expression.
884 */
885 public Pattern compile(char[] pattern) throws MalformedPatternException {
886 return compile(pattern, DEFAULT_MASK);
887 }
888
889
890 /**
891 * Same as calling <b>compile(pattern, AwkCompiler.DEFAULT_MASK);</b>
892 * <p>
893 * @param pattern A regular expression to compile.
894 * @return A Pattern instance constituting the compiled regular expression.
895 * This instance will always be an AwkPattern and can be reliably
896 * be casted to an AwkPattern.
897 * @exception MalformedPatternException If the compiled expression
898 * is not a valid Awk regular expression.
899 */
900 public Pattern compile(String pattern) throws MalformedPatternException {
901 return compile(pattern, DEFAULT_MASK);
902 }
903
904 }