Source code: com/jcorporate/expresso/ext/regexp/RECompiler.java
1 /* ====================================================================
2 * The Jcorporate Apache Style Software License, Version 1.2 05-07-2002
3 *
4 * Copyright (c) 1995-2002 Jcorporate Ltd. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * 3. The end-user documentation included with the redistribution,
19 * if any, must include the following acknowledgment:
20 * "This product includes software developed by Jcorporate Ltd.
21 * (http://www.jcorporate.com/)."
22 * Alternately, this acknowledgment may appear in the software itself,
23 * if and wherever such third-party acknowledgments normally appear.
24 *
25 * 4. "Jcorporate" and product names such as "Expresso" must
26 * not be used to endorse or promote products derived from this
27 * software without prior written permission. For written permission,
28 * please contact info@jcorporate.com.
29 *
30 * 5. Products derived from this software may not be called "Expresso",
31 * or other Jcorporate product names; nor may "Expresso" or other
32 * Jcorporate product names appear in their name, without prior
33 * written permission of Jcorporate Ltd.
34 *
35 * 6. No product derived from this software may compete in the same
36 * market space, i.e. framework, without prior written permission
37 * of Jcorporate Ltd. For written permission, please contact
38 * partners@jcorporate.com.
39 *
40 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 * DISCLAIMED. IN NO EVENT SHALL JCORPORATE LTD OR ITS CONTRIBUTORS
44 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
45 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
46 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 * ====================================================================
53 *
54 * This software consists of voluntary contributions made by many
55 * individuals on behalf of the Jcorporate Ltd. Contributions back
56 * to the project(s) are encouraged when you make modifications.
57 * Please send them to support@jcorporate.com. For more information
58 * on Jcorporate Ltd. and its products, please see
59 * <http://www.jcorporate.com/>.
60 *
61 * Portions of this software are based upon other open source
62 * products and are subject to their respective licenses.
63 */
64
65 package com.jcorporate.expresso.ext.regexp;
66
67 /*
68 * ====================================================================
69 *
70 * The Apache Software License, Version 1.1
71 *
72 * Copyright (c) 1999 The Apache Software Foundation. All rights
73 * reserved.
74 *
75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions
77 * are met:
78 *
79 * 1. Redistributions of source code must retain the above copyright
80 * notice, this list of conditions and the following disclaimer.
81 *
82 * 2. Redistributions in binary form must reproduce the above copyright
83 * notice, this list of conditions and the following disclaimer in
84 * the documentation and/or other materials provided with the
85 * distribution.
86 *
87 * 3. The end-user documentation included with the redistribution, if
88 * any, must include the following acknowlegement:
89 * "This product includes software developed by the
90 * Apache Software Foundation (http://www.apache.org/)."
91 * Alternately, this acknowlegement may appear in the software itself,
92 * if and wherever such third-party acknowlegements normally appear.
93 *
94 * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
95 * Foundation" must not be used to endorse or promote products derived
96 * from this software without prior written permission. For written
97 * permission, please contact apache@apache.org.
98 *
99 * 5. Products derived from this software may not be called "Apache"
100 * nor may "Apache" appear in their names without prior written
101 * permission of the Apache Group.
102 *
103 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
104 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
105 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
106 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
107 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
108 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
109 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
110 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
111 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
112 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
113 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
114 * SUCH DAMAGE.
115 * ====================================================================
116 *
117 * This software consists of voluntary contributions made by many
118 * individuals on behalf of the Apache Software Foundation. For more
119 * information on the Apache Software Foundation, please see
120 * <http://www.apache.org/>.
121 *
122 */
123
124 import java.util.Hashtable;
125
126
127 /**
128 * A regular expression compiler class. This class compiles a pattern string into a
129 * regular expression program interpretable by the RE evaluator class. The 'recompile'
130 * command line tool uses this compiler to pre-compile regular expressions for use
131 * with RE. For a description of the syntax accepted by RECompiler and what you can
132 * do with regular expressions, see the documentation for the RE matcher class.
133 *
134 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
135 * @version $Id: RECompiler.java,v 1.7 2004/11/17 20:48:15 lhamel Exp $
136 * @see RE
137 * @deprecated since v5.6, use jakarta oro
138 */
139 public class RECompiler {
140
141 // The compiled program
142 char[] instruction; // The compiled RE 'program' instruction buffer
143 int lenInstruction; // The amount of the program buffer currently in use
144
145 // Input state for compiling regular expression
146 String pattern; // Input string
147 int len; // Length of the pattern string
148 int idx; // Current input index into ac
149 int parens; // Total number of paren pairs
150
151 // Node flags
152 static final int NODE_NORMAL = 0; // No flags (nothing special)
153 static final int NODE_NULLABLE = 1; // True if node is potentially null
154 static final int NODE_TOPLEVEL = 2; // True if top level expr
155
156 // Special types of 'escapes'
157 static final char ESC_MASK = 0xfff0; // Escape complexity mask
158 static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
159 static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
160 static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
161
162 // {m,n} stacks
163 static final int maxBrackets = 10; // Maximum number of bracket pairs
164 static int brackets = 0; // Number of bracket sets
165 static int[] bracketStart = null; // Starting point
166 static int[] bracketEnd = null; // Ending point
167 static int[] bracketMin = null; // Minimum number of matches
168 static int[] bracketOpt = null; // Additional optional matches
169 static final int bracketUnbounded = -1; // Unbounded value
170 static final int bracketFinished = -2; // Unbounded value
171
172 // Lookup table for POSIX character class names
173 static Hashtable hashPOSIX = new Hashtable();
174
175 static {
176 hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
177 hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
178 hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
179 hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
180 hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
181 hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
182 hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
183 hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
184 hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
185 hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
186 hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
187 hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
188 hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
189 hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
190 }
191
192 /**
193 * Local, nested class for maintaining character ranges for character classes.
194 */
195 class RERange {
196 int size = 16; // Capacity of current range arrays
197 int[] minRange = new int[size]; // Range minima
198 int[] maxRange = new int[size]; // Range maxima
199 int num = 0; // Number of range array elements in use
200
201 /**
202 * Deletes the range at a given index from the range lists
203 *
204 * @param index Index of range to delete from minRange and maxRange arrays.
205 */
206 void delete(int index) {
207
208 // Return if no elements left or index is out of range
209 if (num == 0 || index >= num) {
210 return;
211 }
212 // Move elements down
213 while (index++ < num) {
214 if (index - 1 >= 0) {
215 minRange[index - 1] = minRange[index];
216 maxRange[index - 1] = maxRange[index];
217 }
218 }
219
220 // One less element now
221 num--;
222 }
223
224 /**
225 * Merges a range into the range list, coalescing ranges if possible.
226 *
227 * @param min Minimum end of range
228 * @param max Maximum end of range
229 */
230 void merge(int min, int max) {
231
232 // Loop through ranges
233 for (int i = 0; i < num; i++) {
234
235 // Min-max is subsumed by minRange[i]-maxRange[i]
236 if (min >= minRange[i] && max <= maxRange[i]) {
237 return;
238 }
239
240 // Min-max subsumes minRange[i]-maxRange[i]
241 else if (min <= minRange[i] && max >= maxRange[i]) {
242 delete(i);
243 merge(min, max);
244
245 return;
246 }
247
248 // Min is in the range, but max is outside
249 else if (min >= minRange[i] && min <= maxRange[i]) {
250 delete(i);
251 min = minRange[i];
252 merge(min, max);
253
254 return;
255 }
256
257 // Max is in the range, but min is outside
258 else if (max >= minRange[i] && max <= maxRange[i]) {
259 delete(i);
260 max = maxRange[i];
261 merge(min, max);
262
263 return;
264 }
265 }
266 // Must not overlap any other ranges
267 if (num >= size) {
268 size *= 2;
269
270 int[] newMin = new int[size];
271 int[] newMax = new int[size];
272 System.arraycopy(minRange, 0, newMin, 0, num);
273 System.arraycopy(maxRange, 0, newMax, 0, num);
274 minRange = newMin;
275 maxRange = newMax;
276 }
277
278 minRange[num] = min;
279 maxRange[num] = max;
280 num++;
281 }
282
283 /**
284 * Removes a range by deleting or shrinking all other ranges
285 *
286 * @param min Minimum end of range
287 * @param max Maximum end of range
288 */
289 void remove(int min, int max) {
290
291 // Loop through ranges
292 for (int i = 0; i < num; i++) {
293
294 // minRange[i]-maxRange[i] is subsumed by min-max
295 if (minRange[i] >= min && maxRange[i] <= max) {
296 delete(i);
297 i--;
298
299 return;
300 }
301
302 // min-max is subsumed by minRange[i]-maxRange[i]
303 else if (min >= minRange[i] && max <= maxRange[i]) {
304 int minr = minRange[i];
305 int maxr = maxRange[i];
306 delete(i);
307
308 if (minr < min - 1) {
309 merge(minr, min - 1);
310 }
311 if (max + 1 < maxr) {
312 merge(max + 1, maxr);
313 }
314
315 return;
316 }
317
318 // minRange is in the range, but maxRange is outside
319 else if (minRange[i] >= min && minRange[i] <= max) {
320 minRange[i] = max + 1;
321
322 return;
323 }
324
325 // maxRange is in the range, but minRange is outside
326 else if (maxRange[i] >= min && maxRange[i] <= max) {
327 maxRange[i] = min - 1;
328
329 return;
330 }
331 }
332 }
333
334 /**
335 * Includes (or excludes) the range from min to max, inclusive.
336 *
337 * @param min Minimum end of range
338 * @param max Maximum end of range
339 * @param include True if range should be included. False otherwise.
340 */
341 void include(int min, int max, boolean include) {
342 if (include) {
343 merge(min, max);
344 } else {
345 remove(min, max);
346 }
347 }
348
349 /**
350 * Includes a range with the same min and max
351 *
352 * @param minmax Minimum and maximum end of range (inclusive)
353 * @param include True if range should be included. False otherwise.
354 */
355 void include(char minmax, boolean include) {
356 include(minmax, minmax, include);
357 }
358 }
359
360 /**
361 * Constructor. Creates (initially empty) storage for a regular expression program.
362 */
363 public RECompiler() {
364
365 // Start off with a generous, yet reasonable, initial size
366 instruction = new char[128];
367 lenInstruction = 0;
368 }
369
370 /**
371 * Allocate storage for brackets only as needed
372 */
373 void allocBrackets() {
374
375 // Allocate bracket stacks if not already done
376 if (bracketStart == null) {
377
378 // Allocate storage
379 bracketStart = new int[maxBrackets];
380 bracketEnd = new int[maxBrackets];
381 bracketMin = new int[maxBrackets];
382 bracketOpt = new int[maxBrackets];
383
384 // Initialize to invalid values
385 for (int i = 0; i < maxBrackets; i++) {
386 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
387 }
388 }
389 }
390
391 /**
392 * Absorb an atomic character string. This method is a little tricky because
393 * it can un-include the last character of string if a closure operator follows.
394 * This is correct because *+? have higher precedence than concatentation (thus
395 * ABC* means AB(C*) and NOT (ABC)*).
396 *
397 * @return Index of new atom node
398 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
399 */
400 int atom()
401 throws RESyntaxException {
402
403 // Create a string node
404 int ret = node(RE.OP_ATOM, 0);
405
406 // Length of atom
407 int lenAtom = 0;
408
409 // Loop while we've got input
410 atomLoop:
411
412 while (idx < len) {
413
414 // Is there a next char?
415 if ((idx + 1) < len) {
416 char c = pattern.charAt(idx + 1);
417
418 // If the next 'char' is an escape, look past the whole escape
419 if (pattern.charAt(idx) == '\\') {
420 int idxEscape = idx;
421 escape();
422
423 if (idx < len) {
424 c = pattern.charAt(idx);
425 }
426
427 idx = idxEscape;
428 }
429 // Switch on next char
430 switch (c) {
431 case '{':
432 case '?':
433 case '*':
434 case '+':
435
436 // If the next character is a closure operator and our atom is non-empty, the
437 // current character should bind to the closure operator rather than the atom
438 if (lenAtom != 0) {
439 break atomLoop;
440 }
441 }
442 }
443 // Switch on current char
444 switch (pattern.charAt(idx)) {
445 case ']':
446 case '^':
447 case '$':
448 case '.':
449 case '[':
450 case '(':
451 case ')':
452 case '|':
453 break atomLoop;
454
455 case '{':
456 case '?':
457 case '*':
458 case '+':
459
460 // We should have an atom by now
461 if (lenAtom == 0) {
462
463 // No atom before closure
464 syntaxError("Missing operand to closure");
465 }
466
467 break atomLoop;
468
469 case '\\':
470 {
471
472 // Get the escaped character (advances input automatically)
473 int idxBeforeEscape = idx;
474 char c = escape();
475
476 // Check if it's a simple escape (as opposed to, say, a backreference)
477 if ((c & ESC_MASK) == ESC_MASK) {
478
479 // Not a simple escape, so backup to where we were before the escape.
480 idx = idxBeforeEscape;
481 break atomLoop;
482 }
483
484 // Add escaped char to atom
485 emit(c);
486 lenAtom++;
487 }
488
489 break;
490
491 default:
492
493 // Add normal character to atom
494 emit(pattern.charAt(idx++));
495 lenAtom++;
496 break;
497 }
498 }
499 // This "shouldn't" happen
500 if (lenAtom == 0) {
501 internalError();
502 }
503
504 // Emit the atom length into the program
505 instruction[ret + RE.offsetOpdata] = (char) lenAtom;
506
507 return ret;
508 }
509
510 /**
511 * Match bracket {m,n} expression put results in bracket member variables
512 *
513 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
514 */
515 void bracket()
516 throws RESyntaxException {
517
518 // Current character must be a '{'
519 if (idx >= len || pattern.charAt(idx++) != '{') {
520 internalError();
521 }
522 // Next char must be a digit
523 if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
524 syntaxError("Expected digit");
525 }
526
527 // Get min ('m' of {m,n}) number
528 StringBuffer number = new StringBuffer();
529
530 while (idx < len && Character.isDigit(pattern.charAt(idx))) {
531 number.append(pattern.charAt(idx++));
532 }
533 try {
534 bracketMin[brackets] = Integer.parseInt(number.toString());
535 } catch (NumberFormatException e) {
536 syntaxError("Expected valid number");
537 }
538 // If out of input, fail
539 if (idx >= len) {
540 syntaxError("Expected comma or right bracket");
541 }
542 // If end of expr, optional limit is 0
543 if (pattern.charAt(idx) == '}') {
544 idx++;
545 bracketOpt[brackets] = 0;
546
547 return;
548 }
549 // Must have at least {m,} and maybe {m,n}.
550 if (idx >= len || pattern.charAt(idx++) != ',') {
551 syntaxError("Expected comma");
552 }
553 // If out of input, fail
554 if (idx >= len) {
555 syntaxError("Expected comma or right bracket");
556 }
557 // If {m,} max is unlimited
558 if (pattern.charAt(idx) == '}') {
559 idx++;
560 bracketOpt[brackets] = bracketUnbounded;
561
562 return;
563 }
564 // Next char must be a digit
565 if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
566 syntaxError("Expected digit");
567 }
568
569 // Get max number
570 number.setLength(0);
571
572 while (idx < len && Character.isDigit(pattern.charAt(idx))) {
573 number.append(pattern.charAt(idx++));
574 }
575 try {
576 bracketOpt[brackets] = Integer.parseInt(number.toString()) -
577 bracketMin[brackets];
578 } catch (NumberFormatException e) {
579 syntaxError("Expected valid number");
580 }
581 // Optional repetitions must be > 0
582 if (bracketOpt[brackets] <= 0) {
583 syntaxError("Bad range");
584 }
585 // Must have close brace
586 if (idx >= len || pattern.charAt(idx++) != '}') {
587 syntaxError("Missing close brace");
588 }
589 }
590
591 /**
592 * Compile one branch of an or operator (implements concatenation)
593 *
594 * @param flags Flags passed by reference
595 * @return Pointer to branch node
596 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
597 */
598 int branch(int[] flags)
599 throws RESyntaxException {
600
601 // Get each possibly closured piece and concat
602 int node;
603 int ret = node(RE.OP_BRANCH, 0);
604 int chain = -1;
605 int[] closureFlags = new int[1];
606 boolean nullable = true;
607
608 while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')') {
609
610 // Get new node
611 closureFlags[0] = NODE_NORMAL;
612 node = closure(closureFlags);
613
614 if (closureFlags[0] == NODE_NORMAL) {
615 nullable = false;
616 }
617 // If there's a chain, append to the end
618 if (chain != -1) {
619 setNextOfEnd(chain, node);
620 }
621
622 // Chain starts at current
623 chain = node;
624 }
625 // If we don't run loop, make a nothing node
626 if (chain == -1) {
627 node(RE.OP_NOTHING, 0);
628 }
629 // Set nullable flag for this branch
630 if (nullable) {
631 flags[0] |= NODE_NULLABLE;
632 }
633
634 return ret;
635 }
636
637 /**
638 * Compile a character class
639 *
640 * @return Index of class node
641 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
642 */
643 int characterClass()
644 throws RESyntaxException {
645
646 // Check for bad calling or empty class
647 if (pattern.charAt(idx) != '[') {
648 internalError();
649 }
650 // Check for unterminated or empty class
651 if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
652 syntaxError("Empty or unterminated class");
653 }
654 // Check for POSIX character class
655 if (idx < len && pattern.charAt(idx) == ':') {
656
657 // Skip colon
658 idx++;
659
660 // POSIX character classes are denoted with lowercase ASCII strings
661 int idxStart = idx;
662
663 while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') {
664 idx++;
665 }
666 // Should be a ":]" to terminate the POSIX character class
667 if ((idx + 1) < len && pattern.charAt(idx) == ':' &&
668 pattern.charAt(idx + 1) == ']') {
669
670 // Get character class
671 String charClass = pattern.substring(idxStart, idx);
672
673 // Select the POSIX class id
674 Character i = (Character) hashPOSIX.get(charClass);
675
676 if (i != null) {
677
678 // Move past colon and right bracket
679 idx += 2;
680
681 // Return new POSIX character class node
682 return node(RE.OP_POSIXCLASS, i.charValue());
683 }
684
685 syntaxError("Invalid POSIX character class '" + charClass +
686 "'");
687 }
688
689 syntaxError("Invalid POSIX character class syntax");
690 }
691
692 // Try to build a class. Create OP_ANYOF node
693 int ret = node(RE.OP_ANYOF, 0);
694
695 // Parse class declaration
696 char CHAR_INVALID = Character.MAX_VALUE;
697 char last = CHAR_INVALID;
698 char simpleChar = 0;
699 boolean include = true;
700 boolean definingRange = false;
701 int idxFirst = idx;
702 char rangeStart = Character.MIN_VALUE;
703 char rangeEnd;
704 RERange range = new RERange();
705
706 while (idx < len && pattern.charAt(idx) != ']') {
707 switchOnCharacter:
708
709 // Switch on character
710 switch (pattern.charAt(idx)) {
711 case '^':
712 include = !include;
713
714 if (idx == idxFirst) {
715 range.include(Character.MIN_VALUE, Character.MAX_VALUE,
716 true);
717 }
718
719 idx++;
720 continue;
721
722 case '\\':
723 {
724
725 // Escape always advances the stream
726 char c;
727
728 switch (c = escape()) {
729 case ESC_COMPLEX:
730 case ESC_BACKREF:
731
732 // Word boundaries and backrefs not allowed in a character class!
733 syntaxError("Bad character class");
734
735 case ESC_CLASS:
736
737 // Classes can't be an endpoint of a range
738 if (definingRange) {
739 syntaxError("Bad character class");
740 }
741 // Handle specific type of class (some are ok)
742 switch (pattern.charAt(idx - 1)) {
743 case RE.E_NSPACE:
744 case RE.E_NDIGIT:
745 case RE.E_NALNUM:
746 syntaxError("Bad character class");
747
748 case RE.E_SPACE:
749 range.include('\t', include);
750 range.include('\r', include);
751 range.include('\f', include);
752 range.include('\n', include);
753 range.include('\b', include);
754 range.include(' ', include);
755 break;
756
757 case RE.E_ALNUM:
758 range.include('a', 'z', include);
759 range.include('A', 'Z', include);
760 range.include('_', include);
761
762 // Fall through!
763 case RE.E_DIGIT:
764 range.include('0', '9', include);
765 break;
766 }
767
768 // Make last char invalid (can't be a range start)
769 last = CHAR_INVALID;
770 break;
771
772 default:
773
774 // Escape is simple so treat as a simple char
775 simpleChar = c;
776 break switchOnCharacter;
777 }
778 }
779
780 continue;
781
782 case '-':
783
784 // Start a range if one isn't already started
785 if (definingRange) {
786 syntaxError("Bad class range");
787 }
788
789 definingRange = true;
790
791 // If no last character, start of range is 0
792 rangeStart = (last == CHAR_INVALID ? 0 : last);
793
794 // Premature end of range. define up to Character.MAX_VALUE
795 if ((idx + 1) < len && pattern.charAt(++idx) == ']') {
796 simpleChar = Character.MAX_VALUE;
797 break;
798 }
799
800 continue;
801
802 default:
803 simpleChar = pattern.charAt(idx++);
804 break;
805 }
806 // Handle simple character simpleChar
807 if (definingRange) {
808
809 // if we are defining a range make it now
810 rangeEnd = simpleChar;
811
812 // Actually create a range if the range is ok
813 if (rangeStart >= rangeEnd) {
814 syntaxError("Bad character class");
815 }
816
817 range.include(rangeStart, rangeEnd, include);
818
819 // We are done defining the range
820 last = CHAR_INVALID;
821 definingRange = false;
822 } else {
823
824 // If simple character and not start of range, include it
825 if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-') {
826 range.include(simpleChar, include);
827 }
828
829 last = simpleChar;
830 }
831 }
832 // Shouldn't be out of input
833 if (idx == len) {
834 syntaxError("Unterminated character class");
835 }
836
837 // Absorb the ']' end of class marker
838 idx++;
839
840 // Emit character class definition
841 instruction[ret + RE.offsetOpdata] = (char) range.num;
842
843 for (int i = 0; i < range.num; i++) {
844 emit((char) range.minRange[i]);
845 emit((char) range.maxRange[i]);
846 }
847
848 return ret;
849 }
850
851 /**
852 * Compile a possibly closured terminal
853 *
854 * @param flags Flags passed by reference
855 * @return Index of closured node
856 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
857 */
858 int closure(int[] flags)
859 throws RESyntaxException {
860
861 // Before terminal
862 int idxBeforeTerminal = idx;
863
864 // Values to pass by reference to terminal()
865 int[] terminalFlags = {NODE_NORMAL};
866
867 // Get terminal symbol
868 int ret = terminal(terminalFlags);
869
870 // Or in flags from terminal symbol
871 flags[0] |= terminalFlags[0];
872
873 // Advance input, set NODE_NULLABLE flag and do sanity checks
874 if (idx >= len) {
875 return ret;
876 }
877
878 boolean greedy = true;
879 char closureType = pattern.charAt(idx);
880
881 switch (closureType) {
882 case '?':
883 case '*':
884
885 // The current node can be null
886 flags[0] |= NODE_NULLABLE;
887
888 case '+':
889
890 // Eat closure character
891 idx++;
892
893 case '{':
894
895 // Don't allow blantant stupidity
896 int opcode = instruction[ret + RE.offsetOpcode];
897
898 if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) {
899 syntaxError("Bad closure operand");
900 }
901 if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
902 syntaxError("Closure operand can't be nullable");
903 }
904
905 break;
906 }
907 // If the next character is a '?', make the closure non-greedy (reluctant)
908 if (idx < len && pattern.charAt(idx) == '?') {
909 idx++;
910 greedy = false;
911 }
912 if (greedy) {
913
914 // Actually do the closure now
915 switch (closureType) {
916 case '{':
917 {
918
919 // We look for our bracket in the list
920 boolean found = false;
921 int i;
922 allocBrackets();
923
924 for (i = 0; i < brackets; i++) {
925 if (bracketStart[i] == idx) {
926 found = true;
927 break;
928 }
929 }
930 // If its not in the list we parse the {m,n}
931 if (!found) {
932 if (brackets >= maxBrackets) {
933 syntaxError("Too many bracketed closures (limit is 10)");
934 }
935
936 bracketStart[brackets] = idx;
937 bracket();
938 bracketEnd[brackets] = idx;
939 i = brackets++;
940 }
941 // If there's a min, rewind stream and reparse
942 if (--bracketMin[i] > 0) {
943
944 // Rewind stream and run it through again
945 idx = idxBeforeTerminal;
946 break;
947 }
948 // Do the right thing for maximum ({m,})
949 if (bracketOpt[i] == bracketFinished) {
950
951 // Drop through now and closure expression.
952 // We are done with the {m,} expr, so skip rest
953 closureType = '*';
954 bracketOpt[i] = 0;
955 idx = bracketEnd[i];
956 } else if (bracketOpt[i] == bracketUnbounded) {
957 idx = idxBeforeTerminal;
958 bracketOpt[i] = bracketFinished;
959 break;
960 } else if (bracketOpt[i]-- > 0) {
961
962 // Drop through to optionally close and then 'play it again sam!'
963 idx = idxBeforeTerminal;
964 closureType = '?';
965 } else {
966
967 // We are done. skip the rest of {m,n} expr
968 idx = bracketEnd[i];
969 break;
970 }
971 }
972 // Fall through!
973 case '?':
974 case '*':
975 if (!greedy) {
976 break;
977 }
978 if (closureType == '?') {
979
980 // X? is compiled as (X|)
981 nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
982 setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // inserted branch to option
983
984 int nothing = node(RE.OP_NOTHING, 0); // which is OP_NOTHING
985 setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
986 setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
987 }
988 if (closureType == '*') {
989
990 // X* is compiled as (X{gotoX}|)
991 nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
992 setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option
993 setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
994 setNextOfEnd(ret + RE.nodeSize, ret); // the start again
995 setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
996 setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
997 }
998
999 break;
1000
1001 case '+':
1002 {
1003
1004 // X+ is compiled as X({gotoX}|)
1005 int branch;
1006 branch = node(RE.OP_BRANCH, 0); // a new branch
1007 setNextOfEnd(ret, branch); // is added to the end of X
1008 setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
1009 setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
1010 setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
1011 }
1012
1013 break;
1014 }
1015 } else {
1016
1017 // Add end after closured subexpr
1018 setNextOfEnd(ret, node(RE.OP_END, 0));
1019
1020 // Actually do the closure now
1021 switch (closureType) {
1022 case '?':
1023 nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
1024 break;
1025
1026 case '*':
1027 nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
1028 break;
1029
1030 case '+':
1031 nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
1032 break;
1033 }
1034
1035 // Point to the expr after the closure
1036 setNextOfEnd(ret, lenInstruction);
1037 }
1038
1039 return ret;
1040 }
1041
1042 /**
1043 * Compiles a regular expression pattern into a program runnable by the pattern
1044 * matcher class 'RE'.
1045 *
1046 * @param pattern Regular expression pattern to compile (see RECompiler class
1047 * for details).
1048 * @return A compiled regular expression program.
1049 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1050 * @see RECompiler
1051 * @see RE
1052 */
1053 public REProgram compile(String pattern)
1054 throws RESyntaxException {
1055
1056 // Initialize variables for compilation
1057 this.pattern = pattern; // Save pattern in instance variable
1058 len = pattern.length(); // Precompute pattern length for speed
1059 idx = 0; // Set parsing index to the first character
1060 lenInstruction = 0; // Set emitted instruction count to zero
1061 parens = 1; // Set paren level to 1 (the implicit outer parens)
1062 brackets = 0; // No bracketed closures yet
1063
1064 // Initialize pass by reference flags value
1065 int[] flags = {NODE_TOPLEVEL};
1066
1067 // Parse expression
1068 expr(flags);
1069
1070 // Should be at end of input
1071 if (idx != len) {
1072 if (pattern.charAt(idx) == ')') {
1073 syntaxError("Unmatched close paren");
1074 }
1075
1076 syntaxError("Unexpected input remains");
1077 }
1078
1079 // Return the result
1080 char[] ins = new char[lenInstruction];
1081 System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1082
1083 return new REProgram(ins);
1084 }
1085
1086 /**
1087 * Emit a single character into the program stream.
1088 *
1089 * @param c Character to add
1090 */
1091 void emit(char c) {
1092
1093 // Make room for character
1094 ensure(1);
1095
1096 // Add character
1097 instruction[lenInstruction++] = c;
1098 }
1099
1100 /**
1101 * Ensures that n more characters can fit in the program buffer.
1102 * If n more can't fit, then the size is doubled until it can.
1103 *
1104 * @param n Number of additional characters to ensure will fit.
1105 */
1106 void ensure(int n) {
1107
1108 // Get current program length
1109 int curlen = instruction.length;
1110
1111 // If the current length + n more is too much
1112 if (lenInstruction + n >= curlen) {
1113
1114 // Double the size of the program array until n more will fit
1115 while (lenInstruction + n >= curlen) {
1116 curlen *= 2;
1117 }
1118
1119 // Allocate new program array and move data into it
1120 char[] newInstruction = new char[curlen];
1121 System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
1122 instruction = newInstruction;
1123 }
1124 }
1125
1126 /**
1127 * Match an escape sequence. Handles quoted chars and octal escapes as well
1128 * as normal escape characters. Always advances the input stream by the
1129 * right amount. This code "understands" the subtle difference between an
1130 * octal escape and a backref. You can access the type of ESC_CLASS or
1131 * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
1132 *
1133 * @return ESC_* code or character if simple escape
1134 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1135 */
1136 char escape()
1137 throws RESyntaxException {
1138
1139 // "Shouldn't" happen
1140 if (pattern.charAt(idx) != '\\') {
1141 internalError();
1142 }
1143 // Escape shouldn't occur as last character in string!
1144 if (idx + 1 == len) {
1145 syntaxError("Escape terminates string");
1146 }
1147
1148 // Switch on character after backslash
1149 idx += 2;
1150
1151 char escapeChar = pattern.charAt(idx - 1);
1152
1153 switch (escapeChar) {
1154 case RE.E_BOUND:
1155 case RE.E_NBOUND:
1156 return ESC_COMPLEX;
1157
1158 case RE.E_ALNUM:
1159 case RE.E_NALNUM:
1160 case RE.E_SPACE:
1161 case RE.E_NSPACE:
1162 case RE.E_DIGIT:
1163 case RE.E_NDIGIT:
1164 return ESC_CLASS;
1165
1166 case 'u':
1167 case 'x':
1168 {
1169
1170 // Exact required hex digits for escape type
1171 int hexDigits = (escapeChar == 'u' ? 4 : 2);
1172
1173 // Parse up to hexDigits characters from input
1174 int val = 0;
1175
1176 for (; idx < len && hexDigits-- > 0; idx++) {
1177
1178 // Get char
1179 char c = pattern.charAt(idx);
1180
1181 // If it's a hexadecimal digit (0-9)
1182 if (c >= '0' && c <= '9') {
1183
1184 // Compute new value
1185 val = (val << 4) + c - '0';
1186 } else {
1187
1188 // If it's a hexadecimal letter (a-f)
1189 c = Character.toLowerCase(c);
1190
1191 if (c >= 'a' && c <= 'f') {
1192
1193 // Compute new value
1194 val = (val << 4) + (c - 'a') + 10;
1195 } else {
1196
1197 // If it's not a valid digit or hex letter, the escape must be invalid
1198 // because hexDigits of input have not been absorbed yet.
1199 syntaxError("Expected " + hexDigits +
1200 " hexadecimal digits after \\" +
1201 escapeChar);
1202 }
1203 }
1204 }
1205
1206 return (char) val;
1207 }
1208 case 't':
1209 return '\t';
1210
1211 case 'n':
1212 return '\n';
1213
1214 case 'r':
1215 return '\r';
1216
1217 case 'f':
1218 return '\f';
1219
1220 case '0':
1221 case '1':
1222 case '2':
1223 case '3':
1224 case '4':
1225 case '5':
1226 case '6':
1227 case '7':
1228 case '8':
1229 case '9':
1230
1231 // An octal escape starts with a 0 or has two digits in a row
1232 if ((idx < len && Character.isDigit(pattern.charAt(idx))) ||
1233 escapeChar == '0') {
1234
1235 // Handle \nnn octal escapes
1236 int val = escapeChar - '0';
1237
1238 if (idx < len && Character.isDigit(pattern.charAt(idx))) {
1239 val = ((val << 3) + (pattern.charAt(idx++) - '0'));
1240
1241 if (idx < len &&
1242 Character.isDigit(pattern.charAt(idx))) {
1243 val = ((val << 3) + (pattern.charAt(idx++) - '0'));
1244 }
1245 }
1246
1247 return (char) val;
1248 }
1249
1250 // It's actually a backreference (\[1-9]), not an escape
1251 return ESC_BACKREF;
1252
1253 default:
1254
1255 // Simple quoting of a character
1256 return escapeChar;
1257 }
1258 }
1259
1260 /**
1261 * Compile an expression with possible parens around it. Paren matching
1262 * is done at this level so we can tie the branch tails together.
1263 *
1264 * @param flags Flag value passed by reference
1265 * @return Node index of expression in instruction array
1266 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1267 */
1268 int expr(int[] flags)
1269 throws RESyntaxException {
1270
1271 // Create open paren node unless we were called from the top level (which has no parens)
1272 boolean paren = false;
1273 int ret = -1;
1274 int closeParens = parens;
1275
1276 if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') {
1277 idx++;
1278 paren = true;
1279 ret = node(RE.OP_OPEN, parens++);
1280 }
1281
1282 flags[0] &= ~NODE_TOPLEVEL;
1283
1284 // Create a branch node
1285 int branch = branch(flags);
1286
1287 if (ret == -1) {
1288 ret = branch;
1289 } else {
1290 setNextOfEnd(ret, branch);
1291 }
1292 // Loop through branches
1293 while (idx < len && pattern.charAt(idx) == '|') {
1294 idx++;
1295 branch = branch(flags);
1296 setNextOfEnd(ret, branch);
1297 }
1298
1299 // Create an ending node (either a close paren or an OP_END)
1300 int end;
1301
1302 if (paren) {
1303 if (idx < len && pattern.charAt(idx) == ')') {
1304 idx++;
1305 } else {
1306 syntaxError("Missing close paren");
1307 }
1308
1309 end = node(RE.OP_CLOSE, closeParens);
1310 } else {
1311 end = node(RE.OP_END, 0);
1312 }
1313
1314 // Append the ending node to the ret nodelist
1315 setNextOfEnd(ret, end);
1316
1317 // Hook the ends of each branch to the end node
1318 for (int next = -1, i = ret;
1319 next != 0;
1320 next = instruction[i + RE.offsetNext], i += next) {
1321
1322 // If branch, make the end of the branch's operand chain point to the end node.
1323 if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH) {
1324 setNextOfEnd(i + RE.nodeSize, end);
1325 }
1326 }
1327
1328 // Return the node list
1329 return ret;
1330 }
1331
1332 /**
1333 * Throws a new internal error exception
1334 *
1335 * @throws Error Thrown in the event of an internal error.
1336 */
1337 void internalError()
1338 throws Error {
1339 throw new Error("Internal error!");
1340 }
1341
1342 /**
1343 * Adds a new node
1344 *
1345 * @param opcode Opcode for node
1346 * @param opdata Opdata for node (only the low 16 bits are currently used)
1347 * @return Index of new node in program
1348 */
1349 int node(char opcode, int opdata) {
1350
1351 // Make room for a new node
1352 ensure(RE.nodeSize);
1353
1354 // Add new node at end
1355 instruction[lenInstruction + RE.offsetOpcode] = opcode;
1356 instruction[lenInstruction + RE.offsetOpdata] = (char) opdata;
1357 instruction[lenInstruction + RE.offsetNext] = 0;
1358 lenInstruction += RE.nodeSize;
1359
1360 // Return index of new node
1361 return lenInstruction - RE.nodeSize;
1362 }
1363
1364 /**
1365 * Inserts a node with a given opcode and opdata at insertAt. The node relative next
1366 * pointer is initialized to 0.
1367 *
1368 * @param opcode Opcode for new node
1369 * @param opdata Opdata for new node (only the low 16 bits are currently used)
1370 * @param insertAt Index at which to insert the new node in the program
1371 */
1372 void nodeInsert(char opcode, int opdata, int insertAt) {
1373
1374 // Make room for a new node
1375 ensure(RE.nodeSize);
1376
1377 // Move everything from insertAt to the end down nodeSize elements
1378 System.arraycopy(instruction, insertAt, instruction,
1379 insertAt + RE.nodeSize, lenInstruction - insertAt);
1380 instruction[insertAt + RE.offsetOpcode] = opcode;
1381 instruction[insertAt + RE.offsetOpdata] = (char) opdata;
1382 instruction[insertAt + RE.offsetNext] = 0;
1383 lenInstruction += RE.nodeSize;
1384 }
1385
1386 /**
1387 * Appends a node to the end of a node chain
1388 *
1389 * @param node Start of node chain to traverse
1390 * @param pointTo Node to have the tail of the chain point to
1391 */
1392 void setNextOfEnd(int node, int pointTo) {
1393
1394 // Traverse the chain until the next offset is 0
1395 int next;
1396
1397 while ((next = instruction[node + RE.offsetNext]) != 0) {
1398 node += next;
1399 }
1400
1401 // Point the last node in the chain to pointTo.
1402 instruction[node + RE.offsetNext] = (char) (short) (pointTo - node);
1403 }
1404
1405 /**
1406 * Throws a new syntax error exception
1407 *
1408 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1409 */
1410 void syntaxError(String s)
1411 throws RESyntaxException {
1412 throw new RESyntaxException(s);
1413 }
1414
1415 /**
1416 * Match a terminal node.
1417 *
1418 * @param flags Flags
1419 * @return Index of terminal node (closeable)
1420 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1421 */
1422 int terminal(int[] flags)
1423 throws RESyntaxException {
1424 switch (pattern.charAt(idx)) {
1425 case RE.OP_EOL:
1426 case RE.OP_BOL:
1427 case RE.OP_ANY:
1428 return node(pattern.charAt(idx++), 0);
1429
1430 case '[':
1431 return characterClass();
1432
1433 case '(':
1434 return expr(flags);
1435
1436 case ')':
1437 syntaxError("Unexpected close paren");
1438
1439 case '|':
1440 internalError();
1441
1442 case ']':
1443 syntaxError("Mismatched class");
1444
1445 case 0:
1446 syntaxError("Unexpected end of input");
1447
1448 case '?':
1449 case '+':
1450 case '{':
1451 case '*':
1452 syntaxError("Missing operand to closure");
1453
1454 case '\\':
1455 {
1456
1457 // Don't forget, escape() advances the input stream!
1458 int idxBeforeEscape = idx;
1459
1460 // Switch on escaped character
1461 switch (escape()) {
1462 case ESC_CLASS:
1463 case ESC_COMPLEX:
1464 flags[0] &= ~NODE_NULLABLE;
1465
1466 return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
1467
1468 case ESC_BACKREF:
1469 {
1470 char backreference = (char) (pattern.charAt(idx - 1) -
1471 '0');
1472
1473 if (parens <= backreference) {
1474 syntaxError("Bad backreference");
1475 }
1476
1477 flags[0] |= NODE_NULLABLE;
1478
1479 return node(RE.OP_BACKREF, backreference);
1480 }
1481 default:
1482
1483 // We had a simple escape and we want to have it end up in
1484 // an atom, so we back up and fall though to the default handling
1485 idx = idxBeforeEscape;
1486 flags[0] &= ~NODE_NULLABLE;
1487 break;
1488 }
1489 }
1490 }
1491
1492 // Everything above either fails or returns.
1493 // If it wasn't one of the above, it must be the start of an atom.
1494 flags[0] &= ~NODE_NULLABLE;
1495
1496 return atom();
1497 }
1498}