Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/jcorporate/expresso/ext/regexp/RECompiler.java


1   /* ====================================================================
2    * The Jcorporate Apache Style Software License, Version 1.2 05-07-2002
3    *
4    * Copyright (c) 1995-2002 Jcorporate Ltd. All rights reserved.
5    *
6    * Redistribution and use in source and binary forms, with or without
7    * modification, are permitted provided that the following conditions
8    * are met:
9    *
10   * 1. Redistributions of source code must retain the above copyright
11   *    notice, this list of conditions and the following disclaimer.
12   *
13   * 2. Redistributions in binary form must reproduce the above copyright
14   *    notice, this list of conditions and the following disclaimer in
15   *    the documentation and/or other materials provided with the
16   *    distribution.
17   *
18   * 3. The end-user documentation included with the redistribution,
19   *    if any, must include the following acknowledgment:
20   *       "This product includes software developed by Jcorporate Ltd.
21   *        (http://www.jcorporate.com/)."
22   *    Alternately, this acknowledgment may appear in the software itself,
23   *    if and wherever such third-party acknowledgments normally appear.
24   *
25   * 4. "Jcorporate" and product names such as "Expresso" must
26   *    not be used to endorse or promote products derived from this
27   *    software without prior written permission. For written permission,
28   *    please contact info@jcorporate.com.
29   *
30   * 5. Products derived from this software may not be called "Expresso",
31   *    or other Jcorporate product names; nor may "Expresso" or other
32   *    Jcorporate product names appear in their name, without prior
33   *    written permission of Jcorporate Ltd.
34   *
35   * 6. No product derived from this software may compete in the same
36   *    market space, i.e. framework, without prior written permission
37   *    of Jcorporate Ltd. For written permission, please contact
38   *    partners@jcorporate.com.
39   *
40   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43   * DISCLAIMED.  IN NO EVENT SHALL JCORPORATE LTD OR ITS CONTRIBUTORS
44   * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
45   * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
46   * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51   * SUCH DAMAGE.
52   * ====================================================================
53   *
54   * This software consists of voluntary contributions made by many
55   * individuals on behalf of the Jcorporate Ltd. Contributions back
56   * to the project(s) are encouraged when you make modifications.
57   * Please send them to support@jcorporate.com. For more information
58   * on Jcorporate Ltd. and its products, please see
59   * <http://www.jcorporate.com/>.
60   *
61   * Portions of this software are based upon other open source
62   * products and are subject to their respective licenses.
63   */
64  
65  package com.jcorporate.expresso.ext.regexp;
66  
67  /*
68   * ====================================================================
69   *
70   * The Apache Software License, Version 1.1
71   *
72   * Copyright (c) 1999 The Apache Software Foundation.  All rights
73   * reserved.
74   *
75   * Redistribution and use in source and binary forms, with or without
76   * modification, are permitted provided that the following conditions
77   * are met:
78   *
79   * 1. Redistributions of source code must retain the above copyright
80   *    notice, this list of conditions and the following disclaimer.
81   *
82   * 2. Redistributions in binary form must reproduce the above copyright
83   *    notice, this list of conditions and the following disclaimer in
84   *    the documentation and/or other materials provided with the
85   *    distribution.
86   *
87   * 3. The end-user documentation included with the redistribution, if
88   *    any, must include the following acknowlegement:
89   *       "This product includes software developed by the
90   *        Apache Software Foundation (http://www.apache.org/)."
91   *    Alternately, this acknowlegement may appear in the software itself,
92   *    if and wherever such third-party acknowlegements normally appear.
93   *
94   * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
95   *    Foundation" must not be used to endorse or promote products derived
96   *    from this software without prior written permission. For written
97   *    permission, please contact apache@apache.org.
98   *
99   * 5. Products derived from this software may not be called "Apache"
100  *    nor may "Apache" appear in their names without prior written
101  *    permission of the Apache Group.
102  *
103  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
104  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
105  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
106  * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
107  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
108  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
109  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
110  * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
111  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
112  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
113  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
114  * SUCH DAMAGE.
115  * ====================================================================
116  *
117  * This software consists of voluntary contributions made by many
118  * individuals on behalf of the Apache Software Foundation.  For more
119  * information on the Apache Software Foundation, please see
120  * <http://www.apache.org/>.
121  *
122  */
123 
124 import java.util.Hashtable;
125 
126 
127 /**
128  * A regular expression compiler class.  This class compiles a pattern string into a
129  * regular expression program interpretable by the RE evaluator class.  The 'recompile'
130  * command line tool uses this compiler to pre-compile regular expressions for use
131  * with RE.  For a description of the syntax accepted by RECompiler and what you can
132  * do with regular expressions, see the documentation for the RE matcher class.
133  *
134  * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
135  * @version $Id: RECompiler.java,v 1.7 2004/11/17 20:48:15 lhamel Exp $
136  * @see RE
137  * @deprecated since v5.6, use jakarta oro
138  */
139 public class RECompiler {
140 
141     // The compiled program
142     char[] instruction; // The compiled RE 'program' instruction buffer
143     int lenInstruction; // The amount of the program buffer currently in use
144 
145     // Input state for compiling regular expression
146     String pattern; // Input string
147     int len; // Length of the pattern string
148     int idx; // Current input index into ac
149     int parens; // Total number of paren pairs
150 
151     // Node flags
152     static final int NODE_NORMAL = 0; // No flags (nothing special)
153     static final int NODE_NULLABLE = 1; // True if node is potentially null
154     static final int NODE_TOPLEVEL = 2; // True if top level expr
155 
156     // Special types of 'escapes'
157     static final char ESC_MASK = 0xfff0; // Escape complexity mask
158     static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
159     static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
160     static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
161 
162     // {m,n} stacks
163     static final int maxBrackets = 10; // Maximum number of bracket pairs
164     static int brackets = 0; // Number of bracket sets
165     static int[] bracketStart = null; // Starting point
166     static int[] bracketEnd = null; // Ending point
167     static int[] bracketMin = null; // Minimum number of matches
168     static int[] bracketOpt = null; // Additional optional matches
169     static final int bracketUnbounded = -1; // Unbounded value
170     static final int bracketFinished = -2; // Unbounded value
171 
172     // Lookup table for POSIX character class names
173     static Hashtable hashPOSIX = new Hashtable();
174 
175     static {
176         hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
177         hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
178         hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
179         hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
180         hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
181         hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
182         hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
183         hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
184         hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
185         hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
186         hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
187         hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
188         hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
189         hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
190     }
191 
192     /**
193      * Local, nested class for maintaining character ranges for character classes.
194      */
195     class RERange {
196         int size = 16; // Capacity of current range arrays
197         int[] minRange = new int[size]; // Range minima
198         int[] maxRange = new int[size]; // Range maxima
199         int num = 0; // Number of range array elements in use
200 
201         /**
202          * Deletes the range at a given index from the range lists
203          *
204          * @param index Index of range to delete from minRange and maxRange arrays.
205          */
206         void delete(int index) {
207 
208             // Return if no elements left or index is out of range
209             if (num == 0 || index >= num) {
210                 return;
211             }
212             // Move elements down
213             while (index++ < num) {
214                 if (index - 1 >= 0) {
215                     minRange[index - 1] = minRange[index];
216                     maxRange[index - 1] = maxRange[index];
217                 }
218             }
219 
220             // One less element now
221             num--;
222         }
223 
224         /**
225          * Merges a range into the range list, coalescing ranges if possible.
226          *
227          * @param min Minimum end of range
228          * @param max Maximum end of range
229          */
230         void merge(int min, int max) {
231 
232             // Loop through ranges
233             for (int i = 0; i < num; i++) {
234 
235                 // Min-max is subsumed by minRange[i]-maxRange[i]
236                 if (min >= minRange[i] && max <= maxRange[i]) {
237                     return;
238                 }
239 
240                 // Min-max subsumes minRange[i]-maxRange[i]
241                 else if (min <= minRange[i] && max >= maxRange[i]) {
242                     delete(i);
243                     merge(min, max);
244 
245                     return;
246                 }
247 
248                 // Min is in the range, but max is outside
249                 else if (min >= minRange[i] && min <= maxRange[i]) {
250                     delete(i);
251                     min = minRange[i];
252                     merge(min, max);
253 
254                     return;
255                 }
256 
257                 // Max is in the range, but min is outside
258                 else if (max >= minRange[i] && max <= maxRange[i]) {
259                     delete(i);
260                     max = maxRange[i];
261                     merge(min, max);
262 
263                     return;
264                 }
265             }
266             // Must not overlap any other ranges
267             if (num >= size) {
268                 size *= 2;
269 
270                 int[] newMin = new int[size];
271                 int[] newMax = new int[size];
272                 System.arraycopy(minRange, 0, newMin, 0, num);
273                 System.arraycopy(maxRange, 0, newMax, 0, num);
274                 minRange = newMin;
275                 maxRange = newMax;
276             }
277 
278             minRange[num] = min;
279             maxRange[num] = max;
280             num++;
281         }
282 
283         /**
284          * Removes a range by deleting or shrinking all other ranges
285          *
286          * @param min Minimum end of range
287          * @param max Maximum end of range
288          */
289         void remove(int min, int max) {
290 
291             // Loop through ranges
292             for (int i = 0; i < num; i++) {
293 
294                 // minRange[i]-maxRange[i] is subsumed by min-max
295                 if (minRange[i] >= min && maxRange[i] <= max) {
296                     delete(i);
297                     i--;
298 
299                     return;
300                 }
301 
302                 // min-max is subsumed by minRange[i]-maxRange[i]
303                 else if (min >= minRange[i] && max <= maxRange[i]) {
304                     int minr = minRange[i];
305                     int maxr = maxRange[i];
306                     delete(i);
307 
308                     if (minr < min - 1) {
309                         merge(minr, min - 1);
310                     }
311                     if (max + 1 < maxr) {
312                         merge(max + 1, maxr);
313                     }
314 
315                     return;
316                 }
317 
318                 // minRange is in the range, but maxRange is outside
319                 else if (minRange[i] >= min && minRange[i] <= max) {
320                     minRange[i] = max + 1;
321 
322                     return;
323                 }
324 
325                 // maxRange is in the range, but minRange is outside
326                 else if (maxRange[i] >= min && maxRange[i] <= max) {
327                     maxRange[i] = min - 1;
328 
329                     return;
330                 }
331             }
332         }
333 
334         /**
335          * Includes (or excludes) the range from min to max, inclusive.
336          *
337          * @param min     Minimum end of range
338          * @param max     Maximum end of range
339          * @param include True if range should be included.  False otherwise.
340          */
341         void include(int min, int max, boolean include) {
342             if (include) {
343                 merge(min, max);
344             } else {
345                 remove(min, max);
346             }
347         }
348 
349         /**
350          * Includes a range with the same min and max
351          *
352          * @param minmax  Minimum and maximum end of range (inclusive)
353          * @param include True if range should be included.  False otherwise.
354          */
355         void include(char minmax, boolean include) {
356             include(minmax, minmax, include);
357         }
358     }
359 
360     /**
361      * Constructor.  Creates (initially empty) storage for a regular expression program.
362      */
363     public RECompiler() {
364 
365         // Start off with a generous, yet reasonable, initial size
366         instruction = new char[128];
367         lenInstruction = 0;
368     }
369 
370     /**
371      * Allocate storage for brackets only as needed
372      */
373     void allocBrackets() {
374 
375         // Allocate bracket stacks if not already done
376         if (bracketStart == null) {
377 
378             // Allocate storage
379             bracketStart = new int[maxBrackets];
380             bracketEnd = new int[maxBrackets];
381             bracketMin = new int[maxBrackets];
382             bracketOpt = new int[maxBrackets];
383 
384             // Initialize to invalid values
385             for (int i = 0; i < maxBrackets; i++) {
386                 bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
387             }
388         }
389     }
390 
391     /**
392      * Absorb an atomic character string.  This method is a little tricky because
393      * it can un-include the last character of string if a closure operator follows.
394      * This is correct because *+? have higher precedence than concatentation (thus
395      * ABC* means AB(C*) and NOT (ABC)*).
396      *
397      * @return Index of new atom node
398      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
399      */
400     int atom()
401             throws RESyntaxException {
402 
403         // Create a string node
404         int ret = node(RE.OP_ATOM, 0);
405 
406         // Length of atom
407         int lenAtom = 0;
408 
409 // Loop while we've got input
410         atomLoop:
411 
412                 while (idx < len) {
413 
414                     // Is there a next char?
415                     if ((idx + 1) < len) {
416                         char c = pattern.charAt(idx + 1);
417 
418                         // If the next 'char' is an escape, look past the whole escape
419                         if (pattern.charAt(idx) == '\\') {
420                             int idxEscape = idx;
421                             escape();
422 
423                             if (idx < len) {
424                                 c = pattern.charAt(idx);
425                             }
426 
427                             idx = idxEscape;
428                         }
429                         // Switch on next char
430                         switch (c) {
431                             case '{':
432                             case '?':
433                             case '*':
434                             case '+':
435 
436                                 // If the next character is a closure operator and our atom is non-empty, the
437                                 // current character should bind to the closure operator rather than the atom
438                                 if (lenAtom != 0) {
439                                     break atomLoop;
440                                 }
441                         }
442                     }
443                     // Switch on current char
444                     switch (pattern.charAt(idx)) {
445                         case ']':
446                         case '^':
447                         case '$':
448                         case '.':
449                         case '[':
450                         case '(':
451                         case ')':
452                         case '|':
453                             break atomLoop;
454 
455                         case '{':
456                         case '?':
457                         case '*':
458                         case '+':
459 
460                             // We should have an atom by now
461                             if (lenAtom == 0) {
462 
463                                 // No atom before closure
464                                 syntaxError("Missing operand to closure");
465                             }
466 
467                             break atomLoop;
468 
469                         case '\\':
470                             {
471 
472                                 // Get the escaped character (advances input automatically)
473                                 int idxBeforeEscape = idx;
474                                 char c = escape();
475 
476                                 // Check if it's a simple escape (as opposed to, say, a backreference)
477                                 if ((c & ESC_MASK) == ESC_MASK) {
478 
479                                     // Not a simple escape, so backup to where we were before the escape.
480                                     idx = idxBeforeEscape;
481                                     break atomLoop;
482                                 }
483 
484                                 // Add escaped char to atom
485                                 emit(c);
486                                 lenAtom++;
487                             }
488 
489                             break;
490 
491                         default:
492 
493                             // Add normal character to atom
494                             emit(pattern.charAt(idx++));
495                             lenAtom++;
496                             break;
497                     }
498                 }
499         // This "shouldn't" happen
500         if (lenAtom == 0) {
501             internalError();
502         }
503 
504         // Emit the atom length into the program
505         instruction[ret + RE.offsetOpdata] = (char) lenAtom;
506 
507         return ret;
508     }
509 
510     /**
511      * Match bracket {m,n} expression put results in bracket member variables
512      *
513      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
514      */
515     void bracket()
516             throws RESyntaxException {
517 
518         // Current character must be a '{'
519         if (idx >= len || pattern.charAt(idx++) != '{') {
520             internalError();
521         }
522         // Next char must be a digit
523         if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
524             syntaxError("Expected digit");
525         }
526 
527         // Get min ('m' of {m,n}) number
528         StringBuffer number = new StringBuffer();
529 
530         while (idx < len && Character.isDigit(pattern.charAt(idx))) {
531             number.append(pattern.charAt(idx++));
532         }
533         try {
534             bracketMin[brackets] = Integer.parseInt(number.toString());
535         } catch (NumberFormatException e) {
536             syntaxError("Expected valid number");
537         }
538         // If out of input, fail
539         if (idx >= len) {
540             syntaxError("Expected comma or right bracket");
541         }
542         // If end of expr, optional limit is 0
543         if (pattern.charAt(idx) == '}') {
544             idx++;
545             bracketOpt[brackets] = 0;
546 
547             return;
548         }
549         // Must have at least {m,} and maybe {m,n}.
550         if (idx >= len || pattern.charAt(idx++) != ',') {
551             syntaxError("Expected comma");
552         }
553         // If out of input, fail
554         if (idx >= len) {
555             syntaxError("Expected comma or right bracket");
556         }
557         // If {m,} max is unlimited
558         if (pattern.charAt(idx) == '}') {
559             idx++;
560             bracketOpt[brackets] = bracketUnbounded;
561 
562             return;
563         }
564         // Next char must be a digit
565         if (idx >= len || !Character.isDigit(pattern.charAt(idx))) {
566             syntaxError("Expected digit");
567         }
568 
569         // Get max number
570         number.setLength(0);
571 
572         while (idx < len && Character.isDigit(pattern.charAt(idx))) {
573             number.append(pattern.charAt(idx++));
574         }
575         try {
576             bracketOpt[brackets] = Integer.parseInt(number.toString()) -
577                     bracketMin[brackets];
578         } catch (NumberFormatException e) {
579             syntaxError("Expected valid number");
580         }
581         // Optional repetitions must be > 0
582         if (bracketOpt[brackets] <= 0) {
583             syntaxError("Bad range");
584         }
585         // Must have close brace
586         if (idx >= len || pattern.charAt(idx++) != '}') {
587             syntaxError("Missing close brace");
588         }
589     }
590 
591     /**
592      * Compile one branch of an or operator (implements concatenation)
593      *
594      * @param flags Flags passed by reference
595      * @return Pointer to branch node
596      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
597      */
598     int branch(int[] flags)
599             throws RESyntaxException {
600 
601         // Get each possibly closured piece and concat
602         int node;
603         int ret = node(RE.OP_BRANCH, 0);
604         int chain = -1;
605         int[] closureFlags = new int[1];
606         boolean nullable = true;
607 
608         while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')') {
609 
610             // Get new node
611             closureFlags[0] = NODE_NORMAL;
612             node = closure(closureFlags);
613 
614             if (closureFlags[0] == NODE_NORMAL) {
615                 nullable = false;
616             }
617             // If there's a chain, append to the end
618             if (chain != -1) {
619                 setNextOfEnd(chain, node);
620             }
621 
622             // Chain starts at current
623             chain = node;
624         }
625         // If we don't run loop, make a nothing node
626         if (chain == -1) {
627             node(RE.OP_NOTHING, 0);
628         }
629         // Set nullable flag for this branch
630         if (nullable) {
631             flags[0] |= NODE_NULLABLE;
632         }
633 
634         return ret;
635     }
636 
637     /**
638      * Compile a character class
639      *
640      * @return Index of class node
641      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
642      */
643     int characterClass()
644             throws RESyntaxException {
645 
646         // Check for bad calling or empty class
647         if (pattern.charAt(idx) != '[') {
648             internalError();
649         }
650         // Check for unterminated or empty class
651         if ((idx + 1) >= len || pattern.charAt(++idx) == ']') {
652             syntaxError("Empty or unterminated class");
653         }
654         // Check for POSIX character class
655         if (idx < len && pattern.charAt(idx) == ':') {
656 
657             // Skip colon
658             idx++;
659 
660             // POSIX character classes are denoted with lowercase ASCII strings
661             int idxStart = idx;
662 
663             while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z') {
664                 idx++;
665             }
666             // Should be a ":]" to terminate the POSIX character class
667             if ((idx + 1) < len && pattern.charAt(idx) == ':' &&
668                     pattern.charAt(idx + 1) == ']') {
669 
670                 // Get character class
671                 String charClass = pattern.substring(idxStart, idx);
672 
673                 // Select the POSIX class id
674                 Character i = (Character) hashPOSIX.get(charClass);
675 
676                 if (i != null) {
677 
678                     // Move past colon and right bracket
679                     idx += 2;
680 
681                     // Return new POSIX character class node
682                     return node(RE.OP_POSIXCLASS, i.charValue());
683                 }
684 
685                 syntaxError("Invalid POSIX character class '" + charClass +
686                         "'");
687             }
688 
689             syntaxError("Invalid POSIX character class syntax");
690         }
691 
692         // Try to build a class.  Create OP_ANYOF node
693         int ret = node(RE.OP_ANYOF, 0);
694 
695         // Parse class declaration
696         char CHAR_INVALID = Character.MAX_VALUE;
697         char last = CHAR_INVALID;
698         char simpleChar = 0;
699         boolean include = true;
700         boolean definingRange = false;
701         int idxFirst = idx;
702         char rangeStart = Character.MIN_VALUE;
703         char rangeEnd;
704         RERange range = new RERange();
705 
706         while (idx < len && pattern.charAt(idx) != ']') {
707             switchOnCharacter:
708 
709                         // Switch on character
710                         switch (pattern.charAt(idx)) {
711                             case '^':
712                                 include = !include;
713 
714                                 if (idx == idxFirst) {
715                                     range.include(Character.MIN_VALUE, Character.MAX_VALUE,
716                                             true);
717                                 }
718 
719                                 idx++;
720                                 continue;
721 
722                             case '\\':
723                                 {
724 
725                                     // Escape always advances the stream
726                                     char c;
727 
728                                     switch (c = escape()) {
729                                         case ESC_COMPLEX:
730                                         case ESC_BACKREF:
731 
732                                             // Word boundaries and backrefs not allowed in a character class!
733                                             syntaxError("Bad character class");
734 
735                                         case ESC_CLASS:
736 
737                                             // Classes can't be an endpoint of a range
738                                             if (definingRange) {
739                                                 syntaxError("Bad character class");
740                                             }
741                                             // Handle specific type of class (some are ok)
742                                             switch (pattern.charAt(idx - 1)) {
743                                                 case RE.E_NSPACE:
744                                                 case RE.E_NDIGIT:
745                                                 case RE.E_NALNUM:
746                                                     syntaxError("Bad character class");
747 
748                                                 case RE.E_SPACE:
749                                                     range.include('\t', include);
750                                                     range.include('\r', include);
751                                                     range.include('\f', include);
752                                                     range.include('\n', include);
753                                                     range.include('\b', include);
754                                                     range.include(' ', include);
755                                                     break;
756 
757                                                 case RE.E_ALNUM:
758                                                     range.include('a', 'z', include);
759                                                     range.include('A', 'Z', include);
760                                                     range.include('_', include);
761 
762                                                     // Fall through!
763                                                 case RE.E_DIGIT:
764                                                     range.include('0', '9', include);
765                                                     break;
766                                             }
767 
768                                             // Make last char invalid (can't be a range start)
769                                             last = CHAR_INVALID;
770                                             break;
771 
772                                         default:
773 
774                                             // Escape is simple so treat as a simple char
775                                             simpleChar = c;
776                                             break switchOnCharacter;
777                                     }
778                                 }
779 
780                                 continue;
781 
782                             case '-':
783 
784                                 // Start a range if one isn't already started
785                                 if (definingRange) {
786                                     syntaxError("Bad class range");
787                                 }
788 
789                                 definingRange = true;
790 
791                                 // If no last character, start of range is 0
792                                 rangeStart = (last == CHAR_INVALID ? 0 : last);
793 
794                                 // Premature end of range. define up to Character.MAX_VALUE
795                                 if ((idx + 1) < len && pattern.charAt(++idx) == ']') {
796                                     simpleChar = Character.MAX_VALUE;
797                                     break;
798                                 }
799 
800                                 continue;
801 
802                             default:
803                                 simpleChar = pattern.charAt(idx++);
804                                 break;
805                         }
806             // Handle simple character simpleChar
807             if (definingRange) {
808 
809                 // if we are defining a range make it now
810                 rangeEnd = simpleChar;
811 
812                 // Actually create a range if the range is ok
813                 if (rangeStart >= rangeEnd) {
814                     syntaxError("Bad character class");
815                 }
816 
817                 range.include(rangeStart, rangeEnd, include);
818 
819                 // We are done defining the range
820                 last = CHAR_INVALID;
821                 definingRange = false;
822             } else {
823 
824                 // If simple character and not start of range, include it
825                 if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-') {
826                     range.include(simpleChar, include);
827                 }
828 
829                 last = simpleChar;
830             }
831         }
832         // Shouldn't be out of input
833         if (idx == len) {
834             syntaxError("Unterminated character class");
835         }
836 
837         // Absorb the ']' end of class marker
838         idx++;
839 
840         // Emit character class definition
841         instruction[ret + RE.offsetOpdata] = (char) range.num;
842 
843         for (int i = 0; i < range.num; i++) {
844             emit((char) range.minRange[i]);
845             emit((char) range.maxRange[i]);
846         }
847 
848         return ret;
849     }
850 
851     /**
852      * Compile a possibly closured terminal
853      *
854      * @param flags Flags passed by reference
855      * @return Index of closured node
856      * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
857      */
858     int closure(int[] flags)
859             throws RESyntaxException {
860 
861         // Before terminal
862         int idxBeforeTerminal = idx;
863 
864         // Values to pass by reference to terminal()
865         int[] terminalFlags = {NODE_NORMAL};
866 
867         // Get terminal symbol
868         int ret = terminal(terminalFlags);
869 
870         // Or in flags from terminal symbol
871         flags[0] |= terminalFlags[0];
872 
873         // Advance input, set NODE_NULLABLE flag and do sanity checks
874         if (idx >= len) {
875             return ret;
876         }
877 
878         boolean greedy = true;
879         char closureType = pattern.charAt(idx);
880 
881         switch (closureType) {
882             case '?':
883             case '*':
884 
885                 // The current node can be null
886                 flags[0] |= NODE_NULLABLE;
887 
888             case '+':
889 
890                 // Eat closure character
891                 idx++;
892 
893             case '{':
894 
895                 // Don't allow blantant stupidity
896                 int opcode = instruction[ret + RE.offsetOpcode];
897 
898                 if (opcode == RE.OP_BOL || opcode == RE.OP_EOL) {
899                     syntaxError("Bad closure operand");
900                 }
901                 if ((terminalFlags[0] & NODE_NULLABLE) != 0) {
902                     syntaxError("Closure operand can't be nullable");
903                 }
904 
905                 break;
906         }
907         // If the next character is a '?', make the closure non-greedy (reluctant)
908         if (idx < len && pattern.charAt(idx) == '?') {
909             idx++;
910             greedy = false;
911         }
912         if (greedy) {
913 
914             // Actually do the closure now
915             switch (closureType) {
916                 case '{':
917                     {
918 
919                         // We look for our bracket in the list
920                         boolean found = false;
921                         int i;
922                         allocBrackets();
923 
924                         for (i = 0; i < brackets; i++) {
925                             if (bracketStart[i] == idx) {
926                                 found = true;
927                                 break;
928                             }
929                         }
930                         // If its not in the list we parse the {m,n}
931                         if (!found) {
932                             if (brackets >= maxBrackets) {
933                                 syntaxError("Too many bracketed closures (limit is 10)");
934                             }
935 
936                             bracketStart[brackets] = idx;
937                             bracket();
938                             bracketEnd[brackets] = idx;
939                             i = brackets++;
940                         }
941                         // If there's a min, rewind stream and reparse
942                         if (--bracketMin[i] > 0) {
943 
944                             // Rewind stream and run it through again
945                             idx = idxBeforeTerminal;
946                             break;
947                         }
948                         // Do the right thing for maximum ({m,})
949                         if (bracketOpt[i] == bracketFinished) {
950 
951                             // Drop through now and closure expression.
952                             // We are done with the {m,} expr, so skip rest
953                             closureType = '*';
954                             bracketOpt[i] = 0;
955                             idx = bracketEnd[i];
956                         } else if (bracketOpt[i] == bracketUnbounded) {
957                             idx = idxBeforeTerminal;
958                             bracketOpt[i] = bracketFinished;
959                             break;
960                         } else if (bracketOpt[i]-- > 0) {
961 
962                             // Drop through to optionally close and then 'play it again sam!'
963                             idx = idxBeforeTerminal;
964                             closureType = '?';
965                         } else {
966 
967                             // We are done. skip the rest of {m,n} expr
968                             idx = bracketEnd[i];
969                             break;
970                         }
971                     }
972                     // Fall through!
973                 case '?':
974                 case '*':
975                     if (!greedy) {
976                         break;
977                     }
978                     if (closureType == '?') {
979 
980                         // X? is compiled as (X|)
981                         nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
982                         setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // inserted branch to option
983 
984                         int nothing = node(RE.OP_NOTHING, 0); // which is OP_NOTHING
985                         setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
986                         setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
987                     }
988                     if (closureType == '*') {
989 
990                         // X* is compiled as (X{gotoX}|)
991                         nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
992                         setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option
993                         setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
994                         setNextOfEnd(ret + RE.nodeSize, ret); // the start again
995                         setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
996                         setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
997                     }
998 
999                     break;
1000
1001                case '+':
1002                    {
1003
1004                        // X+ is compiled as X({gotoX}|)
1005                        int branch;
1006                        branch = node(RE.OP_BRANCH, 0); // a new branch
1007                        setNextOfEnd(ret, branch); // is added to the end of X
1008                        setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
1009                        setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
1010                        setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
1011                    }
1012
1013                    break;
1014            }
1015        } else {
1016
1017            // Add end after closured subexpr
1018            setNextOfEnd(ret, node(RE.OP_END, 0));
1019
1020            // Actually do the closure now
1021            switch (closureType) {
1022                case '?':
1023                    nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
1024                    break;
1025
1026                case '*':
1027                    nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
1028                    break;
1029
1030                case '+':
1031                    nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
1032                    break;
1033            }
1034
1035            // Point to the expr after the closure
1036            setNextOfEnd(ret, lenInstruction);
1037        }
1038
1039        return ret;
1040    }
1041
1042    /**
1043     * Compiles a regular expression pattern into a program runnable by the pattern
1044     * matcher class 'RE'.
1045     *
1046     * @param pattern Regular expression pattern to compile (see RECompiler class
1047     *                for details).
1048     * @return A compiled regular expression program.
1049     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1050     * @see RECompiler
1051     * @see RE
1052     */
1053    public REProgram compile(String pattern)
1054            throws RESyntaxException {
1055
1056        // Initialize variables for compilation
1057        this.pattern = pattern; // Save pattern in instance variable
1058        len = pattern.length(); // Precompute pattern length for speed
1059        idx = 0; // Set parsing index to the first character
1060        lenInstruction = 0; // Set emitted instruction count to zero
1061        parens = 1; // Set paren level to 1 (the implicit outer parens)
1062        brackets = 0; // No bracketed closures yet
1063
1064        // Initialize pass by reference flags value
1065        int[] flags = {NODE_TOPLEVEL};
1066
1067        // Parse expression
1068        expr(flags);
1069
1070        // Should be at end of input
1071        if (idx != len) {
1072            if (pattern.charAt(idx) == ')') {
1073                syntaxError("Unmatched close paren");
1074            }
1075
1076            syntaxError("Unexpected input remains");
1077        }
1078
1079        // Return the result
1080        char[] ins = new char[lenInstruction];
1081        System.arraycopy(instruction, 0, ins, 0, lenInstruction);
1082
1083        return new REProgram(ins);
1084    }
1085
1086    /**
1087     * Emit a single character into the program stream.
1088     *
1089     * @param c Character to add
1090     */
1091    void emit(char c) {
1092
1093        // Make room for character
1094        ensure(1);
1095
1096        // Add character
1097        instruction[lenInstruction++] = c;
1098    }
1099
1100    /**
1101     * Ensures that n more characters can fit in the program buffer.
1102     * If n more can't fit, then the size is doubled until it can.
1103     *
1104     * @param n Number of additional characters to ensure will fit.
1105     */
1106    void ensure(int n) {
1107
1108        // Get current program length
1109        int curlen = instruction.length;
1110
1111        // If the current length + n more is too much
1112        if (lenInstruction + n >= curlen) {
1113
1114            // Double the size of the program array until n more will fit
1115            while (lenInstruction + n >= curlen) {
1116                curlen *= 2;
1117            }
1118
1119            // Allocate new program array and move data into it
1120            char[] newInstruction = new char[curlen];
1121            System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
1122            instruction = newInstruction;
1123        }
1124    }
1125
1126    /**
1127     * Match an escape sequence.  Handles quoted chars and octal escapes as well
1128     * as normal escape characters.  Always advances the input stream by the
1129     * right amount. This code "understands" the subtle difference between an
1130     * octal escape and a backref.  You can access the type of ESC_CLASS or
1131     * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
1132     *
1133     * @return ESC_* code or character if simple escape
1134     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1135     */
1136    char escape()
1137            throws RESyntaxException {
1138
1139        // "Shouldn't" happen
1140        if (pattern.charAt(idx) != '\\') {
1141            internalError();
1142        }
1143        // Escape shouldn't occur as last character in string!
1144        if (idx + 1 == len) {
1145            syntaxError("Escape terminates string");
1146        }
1147
1148        // Switch on character after backslash
1149        idx += 2;
1150
1151        char escapeChar = pattern.charAt(idx - 1);
1152
1153        switch (escapeChar) {
1154            case RE.E_BOUND:
1155            case RE.E_NBOUND:
1156                return ESC_COMPLEX;
1157
1158            case RE.E_ALNUM:
1159            case RE.E_NALNUM:
1160            case RE.E_SPACE:
1161            case RE.E_NSPACE:
1162            case RE.E_DIGIT:
1163            case RE.E_NDIGIT:
1164                return ESC_CLASS;
1165
1166            case 'u':
1167            case 'x':
1168                {
1169
1170                    // Exact required hex digits for escape type
1171                    int hexDigits = (escapeChar == 'u' ? 4 : 2);
1172
1173                    // Parse up to hexDigits characters from input
1174                    int val = 0;
1175
1176                    for (; idx < len && hexDigits-- > 0; idx++) {
1177
1178                        // Get char
1179                        char c = pattern.charAt(idx);
1180
1181                        // If it's a hexadecimal digit (0-9)
1182                        if (c >= '0' && c <= '9') {
1183
1184                            // Compute new value
1185                            val = (val << 4) + c - '0';
1186                        } else {
1187
1188                            // If it's a hexadecimal letter (a-f)
1189                            c = Character.toLowerCase(c);
1190
1191                            if (c >= 'a' && c <= 'f') {
1192
1193                                // Compute new value
1194                                val = (val << 4) + (c - 'a') + 10;
1195                            } else {
1196
1197                                // If it's not a valid digit or hex letter, the escape must be invalid
1198                                // because hexDigits of input have not been absorbed yet.
1199                                syntaxError("Expected " + hexDigits +
1200                                        " hexadecimal digits after \\" +
1201                                        escapeChar);
1202                            }
1203                        }
1204                    }
1205
1206                    return (char) val;
1207                }
1208            case 't':
1209                return '\t';
1210
1211            case 'n':
1212                return '\n';
1213
1214            case 'r':
1215                return '\r';
1216
1217            case 'f':
1218                return '\f';
1219
1220            case '0':
1221            case '1':
1222            case '2':
1223            case '3':
1224            case '4':
1225            case '5':
1226            case '6':
1227            case '7':
1228            case '8':
1229            case '9':
1230
1231                // An octal escape starts with a 0 or has two digits in a row
1232                if ((idx < len && Character.isDigit(pattern.charAt(idx))) ||
1233                        escapeChar == '0') {
1234
1235                    // Handle \nnn octal escapes
1236                    int val = escapeChar - '0';
1237
1238                    if (idx < len && Character.isDigit(pattern.charAt(idx))) {
1239                        val = ((val << 3) + (pattern.charAt(idx++) - '0'));
1240
1241                        if (idx < len &&
1242                                Character.isDigit(pattern.charAt(idx))) {
1243                            val = ((val << 3) + (pattern.charAt(idx++) - '0'));
1244                        }
1245                    }
1246
1247                    return (char) val;
1248                }
1249
1250                // It's actually a backreference (\[1-9]), not an escape
1251                return ESC_BACKREF;
1252
1253            default:
1254
1255                // Simple quoting of a character
1256                return escapeChar;
1257        }
1258    }
1259
1260    /**
1261     * Compile an expression with possible parens around it.  Paren matching
1262     * is done at this level so we can tie the branch tails together.
1263     *
1264     * @param flags Flag value passed by reference
1265     * @return Node index of expression in instruction array
1266     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1267     */
1268    int expr(int[] flags)
1269            throws RESyntaxException {
1270
1271        // Create open paren node unless we were called from the top level (which has no parens)
1272        boolean paren = false;
1273        int ret = -1;
1274        int closeParens = parens;
1275
1276        if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(') {
1277            idx++;
1278            paren = true;
1279            ret = node(RE.OP_OPEN, parens++);
1280        }
1281
1282        flags[0] &= ~NODE_TOPLEVEL;
1283
1284        // Create a branch node
1285        int branch = branch(flags);
1286
1287        if (ret == -1) {
1288            ret = branch;
1289        } else {
1290            setNextOfEnd(ret, branch);
1291        }
1292        // Loop through branches
1293        while (idx < len && pattern.charAt(idx) == '|') {
1294            idx++;
1295            branch = branch(flags);
1296            setNextOfEnd(ret, branch);
1297        }
1298
1299        // Create an ending node (either a close paren or an OP_END)
1300        int end;
1301
1302        if (paren) {
1303            if (idx < len && pattern.charAt(idx) == ')') {
1304                idx++;
1305            } else {
1306                syntaxError("Missing close paren");
1307            }
1308
1309            end = node(RE.OP_CLOSE, closeParens);
1310        } else {
1311            end = node(RE.OP_END, 0);
1312        }
1313
1314        // Append the ending node to the ret nodelist
1315        setNextOfEnd(ret, end);
1316
1317        // Hook the ends of each branch to the end node
1318        for (int next = -1, i = ret;
1319             next != 0;
1320             next = instruction[i + RE.offsetNext], i += next) {
1321
1322            // If branch, make the end of the branch's operand chain point to the end node.
1323            if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH) {
1324                setNextOfEnd(i + RE.nodeSize, end);
1325            }
1326        }
1327
1328        // Return the node list
1329        return ret;
1330    }
1331
1332    /**
1333     * Throws a new internal error exception
1334     *
1335     * @throws Error Thrown in the event of an internal error.
1336     */
1337    void internalError()
1338            throws Error {
1339        throw new Error("Internal error!");
1340    }
1341
1342    /**
1343     * Adds a new node
1344     *
1345     * @param opcode Opcode for node
1346     * @param opdata Opdata for node (only the low 16 bits are currently used)
1347     * @return Index of new node in program
1348     */
1349    int node(char opcode, int opdata) {
1350
1351        // Make room for a new node
1352        ensure(RE.nodeSize);
1353
1354        // Add new node at end
1355        instruction[lenInstruction + RE.offsetOpcode] = opcode;
1356        instruction[lenInstruction + RE.offsetOpdata] = (char) opdata;
1357        instruction[lenInstruction + RE.offsetNext] = 0;
1358        lenInstruction += RE.nodeSize;
1359
1360        // Return index of new node
1361        return lenInstruction - RE.nodeSize;
1362    }
1363
1364    /**
1365     * Inserts a node with a given opcode and opdata at insertAt.  The node relative next
1366     * pointer is initialized to 0.
1367     *
1368     * @param opcode   Opcode for new node
1369     * @param opdata   Opdata for new node (only the low 16 bits are currently used)
1370     * @param insertAt Index at which to insert the new node in the program
1371     */
1372    void nodeInsert(char opcode, int opdata, int insertAt) {
1373
1374        // Make room for a new node
1375        ensure(RE.nodeSize);
1376
1377        // Move everything from insertAt to the end down nodeSize elements
1378        System.arraycopy(instruction, insertAt, instruction,
1379                insertAt + RE.nodeSize, lenInstruction - insertAt);
1380        instruction[insertAt + RE.offsetOpcode] = opcode;
1381        instruction[insertAt + RE.offsetOpdata] = (char) opdata;
1382        instruction[insertAt + RE.offsetNext] = 0;
1383        lenInstruction += RE.nodeSize;
1384    }
1385
1386    /**
1387     * Appends a node to the end of a node chain
1388     *
1389     * @param node    Start of node chain to traverse
1390     * @param pointTo Node to have the tail of the chain point to
1391     */
1392    void setNextOfEnd(int node, int pointTo) {
1393
1394        // Traverse the chain until the next offset is 0
1395        int next;
1396
1397        while ((next = instruction[node + RE.offsetNext]) != 0) {
1398            node += next;
1399        }
1400
1401        // Point the last node in the chain to pointTo.
1402        instruction[node + RE.offsetNext] = (char) (short) (pointTo - node);
1403    }
1404
1405    /**
1406     * Throws a new syntax error exception
1407     *
1408     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1409     */
1410    void syntaxError(String s)
1411            throws RESyntaxException {
1412        throw new RESyntaxException(s);
1413    }
1414
1415    /**
1416     * Match a terminal node.
1417     *
1418     * @param flags Flags
1419     * @return Index of terminal node (closeable)
1420     * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
1421     */
1422    int terminal(int[] flags)
1423            throws RESyntaxException {
1424        switch (pattern.charAt(idx)) {
1425            case RE.OP_EOL:
1426            case RE.OP_BOL:
1427            case RE.OP_ANY:
1428                return node(pattern.charAt(idx++), 0);
1429
1430            case '[':
1431                return characterClass();
1432
1433            case '(':
1434                return expr(flags);
1435
1436            case ')':
1437                syntaxError("Unexpected close paren");
1438
1439            case '|':
1440                internalError();
1441
1442            case ']':
1443                syntaxError("Mismatched class");
1444
1445            case 0:
1446                syntaxError("Unexpected end of input");
1447
1448            case '?':
1449            case '+':
1450            case '{':
1451            case '*':
1452                syntaxError("Missing operand to closure");
1453
1454            case '\\':
1455                {
1456
1457                    // Don't forget, escape() advances the input stream!
1458                    int idxBeforeEscape = idx;
1459
1460                    // Switch on escaped character
1461                    switch (escape()) {
1462                        case ESC_CLASS:
1463                        case ESC_COMPLEX:
1464                            flags[0] &= ~NODE_NULLABLE;
1465
1466                            return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
1467
1468                        case ESC_BACKREF:
1469                            {
1470                                char backreference = (char) (pattern.charAt(idx - 1) -
1471                                        '0');
1472
1473                                if (parens <= backreference) {
1474                                    syntaxError("Bad backreference");
1475                                }
1476
1477                                flags[0] |= NODE_NULLABLE;
1478
1479                                return node(RE.OP_BACKREF, backreference);
1480                            }
1481                        default:
1482
1483                            // We had a simple escape and we want to have it end up in
1484                            // an atom, so we back up and fall though to the default handling
1485                            idx = idxBeforeEscape;
1486                            flags[0] &= ~NODE_NULLABLE;
1487                            break;
1488                    }
1489                }
1490        }
1491
1492        // Everything above either fails or returns.
1493        // If it wasn't one of the above, it must be the start of an atom.
1494        flags[0] &= ~NODE_NULLABLE;
1495
1496        return atom();
1497    }
1498}