Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/apache/regexp/RE.java


1   /*
2    * Copyright 1999-2004 The Apache Software Foundation.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.regexp;
18  
19  import java.io.Serializable;
20  import java.util.Vector;
21  
22  /**
23   * RE is an efficient, lightweight regular expression evaluator/matcher
24   * class. Regular expressions are pattern descriptions which enable
25   * sophisticated matching of strings.  In addition to being able to
26   * match a string against a pattern, you can also extract parts of the
27   * match.  This is especially useful in text parsing! Details on the
28   * syntax of regular expression patterns are given below.
29   *
30   * <p>
31   * To compile a regular expression (RE), you can simply construct an RE
32   * matcher object from the string specification of the pattern, like this:
33   *
34   * <pre>
35   *  RE r = new RE("a*b");
36   * </pre>
37   *
38   * <p>
39   * Once you have done this, you can call either of the RE.match methods to
40   * perform matching on a String.  For example:
41   *
42   * <pre>
43   *  boolean matched = r.match("aaaab");
44   * </pre>
45   *
46   * will cause the boolean matched to be set to true because the
47   * pattern "a*b" matches the string "aaaab".
48   *
49   * <p>
50   * If you were interested in the <i>number</i> of a's which matched the
51   * first part of our example expression, you could change the expression to
52   * "(a*)b".  Then when you compiled the expression and matched it against
53   * something like "xaaaab", you would get results like this:
54   *
55   * <pre>
56   *  RE r = new RE("(a*)b");                  // Compile expression
57   *  boolean matched = r.match("xaaaab");     // Match against "xaaaab"
58   *
59   *  String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
60   *  String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
61   *
62   *  int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
63   *  int endWholeExpr = r.getParenEnd(0);     // endWholeExpr will be index 6
64   *  int lenWholeExpr = r.getParenLength(0);  // lenWholeExpr will be 5
65   *
66   *  int startInside = r.getParenStart(1);    // startInside will be index 1
67   *  int endInside = r.getParenEnd(1);        // endInside will be index 5
68   *  int lenInside = r.getParenLength(1);     // lenInside will be 4
69   * </pre>
70   *
71   * You can also refer to the contents of a parenthesized expression
72   * within a regular expression itself.  This is called a
73   * 'backreference'.  The first backreference in a regular expression is
74   * denoted by \1, the second by \2 and so on.  So the expression:
75   *
76   * <pre>
77   *  ([0-9]+)=\1
78   * </pre>
79   *
80   * will match any string of the form n=n (like 0=0 or 2=2).
81   *
82   * <p>
83   * The full regular expression syntax accepted by RE is described here:
84   *
85   * <pre>
86   *
87   *  <b><font face=times roman>Characters</font></b>
88   *
89   *    <i>unicodeChar</i>   Matches any identical unicode character
90   *    \                    Used to quote a meta-character (like '*')
91   *    \\                   Matches a single '\' character
92   *    \0nnn                Matches a given octal character
93   *    \xhh                 Matches a given 8-bit hexadecimal character
94   *    \\uhhhh              Matches a given 16-bit hexadecimal character
95   *    \t                   Matches an ASCII tab character
96   *    \n                   Matches an ASCII newline character
97   *    \r                   Matches an ASCII return character
98   *    \f                   Matches an ASCII form feed character
99   *
100  *
101  *  <b><font face=times roman>Character Classes</font></b>
102  *
103  *    [abc]                Simple character class
104  *    [a-zA-Z]             Character class with ranges
105  *    [^abc]               Negated character class
106  * </pre>
107  *
108  * <b>NOTE:</b> Incomplete ranges will be interpreted as &quot;starts
109  * from zero&quot; or &quot;ends with last character&quot;.
110  * <br>
111  * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
112  * [-] means &quot;all characters&quot;.
113  *
114  * <pre>
115  *
116  *  <b><font face=times roman>Standard POSIX Character Classes</font></b>
117  *
118  *    [:alnum:]            Alphanumeric characters.
119  *    [:alpha:]            Alphabetic characters.
120  *    [:blank:]            Space and tab characters.
121  *    [:cntrl:]            Control characters.
122  *    [:digit:]            Numeric characters.
123  *    [:graph:]            Characters that are printable and are also visible.
124  *                         (A space is printable, but not visible, while an
125  *                         `a' is both.)
126  *    [:lower:]            Lower-case alphabetic characters.
127  *    [:print:]            Printable characters (characters that are not
128  *                         control characters.)
129  *    [:punct:]            Punctuation characters (characters that are not letter,
130  *                         digits, control characters, or space characters).
131  *    [:space:]            Space characters (such as space, tab, and formfeed,
132  *                         to name a few).
133  *    [:upper:]            Upper-case alphabetic characters.
134  *    [:xdigit:]           Characters that are hexadecimal digits.
135  *
136  *
137  *  <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
138  *
139  *    [:javastart:]        Start of a Java identifier
140  *    [:javapart:]         Part of a Java identifier
141  *
142  *
143  *  <b><font face=times roman>Predefined Classes</font></b>
144  *
145  *    .         Matches any character other than newline
146  *    \w        Matches a "word" character (alphanumeric plus "_")
147  *    \W        Matches a non-word character
148  *    \s        Matches a whitespace character
149  *    \S        Matches a non-whitespace character
150  *    \d        Matches a digit character
151  *    \D        Matches a non-digit character
152  *
153  *
154  *  <b><font face=times roman>Boundary Matchers</font></b>
155  *
156  *    ^         Matches only at the beginning of a line
157  *    $         Matches only at the end of a line
158  *    \b        Matches only at a word boundary
159  *    \B        Matches only at a non-word boundary
160  *
161  *
162  *  <b><font face=times roman>Greedy Closures</font></b>
163  *
164  *    A*        Matches A 0 or more times (greedy)
165  *    A+        Matches A 1 or more times (greedy)
166  *    A?        Matches A 1 or 0 times (greedy)
167  *    A{n}      Matches A exactly n times (greedy)
168  *    A{n,}     Matches A at least n times (greedy)
169  *    A{n,m}    Matches A at least n but not more than m times (greedy)
170  *
171  *
172  *  <b><font face=times roman>Reluctant Closures</font></b>
173  *
174  *    A*?       Matches A 0 or more times (reluctant)
175  *    A+?       Matches A 1 or more times (reluctant)
176  *    A??       Matches A 0 or 1 times (reluctant)
177  *
178  *
179  *  <b><font face=times roman>Logical Operators</font></b>
180  *
181  *    AB        Matches A followed by B
182  *    A|B       Matches either A or B
183  *    (A)       Used for subexpression grouping
184  *   (?:A)      Used for subexpression clustering (just like grouping but
185  *              no backrefs)
186  *
187  *
188  *  <b><font face=times roman>Backreferences</font></b>
189  *
190  *    \1    Backreference to 1st parenthesized subexpression
191  *    \2    Backreference to 2nd parenthesized subexpression
192  *    \3    Backreference to 3rd parenthesized subexpression
193  *    \4    Backreference to 4th parenthesized subexpression
194  *    \5    Backreference to 5th parenthesized subexpression
195  *    \6    Backreference to 6th parenthesized subexpression
196  *    \7    Backreference to 7th parenthesized subexpression
197  *    \8    Backreference to 8th parenthesized subexpression
198  *    \9    Backreference to 9th parenthesized subexpression
199  * </pre>
200  *
201  * <p>
202  * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
203  * that they match as many elements of the string as possible without
204  * causing the overall match to fail.  If you want a closure to be
205  * reluctant (non-greedy), you can simply follow it with a '?'.  A
206  * reluctant closure will match as few elements of the string as
207  * possible when finding matches.  {m,n} closures don't currently
208  * support reluctancy.
209  *
210  * <p>
211  * <b><font face="times roman">Line terminators</font></b>
212  * <br>
213  * A line terminator is a one- or two-character sequence that marks
214  * the end of a line of the input character sequence. The following
215  * are recognized as line terminators:
216  * <ul>
217  * <li>A newline (line feed) character ('\n'),</li>
218  * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
219  * <li>A standalone carriage-return character ('\r'),</li>
220  * <li>A next-line character ('\u0085'),</li>
221  * <li>A line-separator character ('\u2028'), or</li>
222  * <li>A paragraph-separator character ('\u2029).</li>
223  * </ul>
224  *
225  * <p>
226  * RE runs programs compiled by the RECompiler class.  But the RE
227  * matcher class does not include the actual regular expression compiler
228  * for reasons of efficiency.  In fact, if you want to pre-compile one
229  * or more regular expressions, the 'recompile' class can be invoked
230  * from the command line to produce compiled output like this:
231  *
232  * <pre>
233  *    // Pre-compiled regular expression "a*b"
234  *    char[] re1Instructions =
235  *    {
236  *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
237  *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
238  *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
239  *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
240  *        0x0000,
241  *    };
242  *
243  *
244  *    REProgram re1 = new REProgram(re1Instructions);
245  * </pre>
246  *
247  * You can then construct a regular expression matcher (RE) object from
248  * the pre-compiled expression re1 and thus avoid the overhead of
249  * compiling the expression at runtime. If you require more dynamic
250  * regular expressions, you can construct a single RECompiler object and
251  * re-use it to compile each expression. Similarly, you can change the
252  * program run by a given matcher object at any time. However, RE and
253  * RECompiler are not threadsafe (for efficiency reasons, and because
254  * requiring thread safety in this class is deemed to be a rare
255  * requirement), so you will need to construct a separate compiler or
256  * matcher object for each thread (unless you do thread synchronization
257  * yourself). Once expression compiled into the REProgram object, REProgram
258  * can be safely shared across multiple threads and RE objects.
259  *
260  * <br><p><br>
261  *
262  * <font color="red">
263  * <i>ISSUES:</i>
264  *
265  * <ul>
266  *  <li>com.weusours.util.re is not currently compatible with all
267  *      standard POSIX regcomp flags</li>
268  *  <li>com.weusours.util.re does not support POSIX equivalence classes
269  *      ([=foo=] syntax) (I18N/locale issue)</li>
270  *  <li>com.weusours.util.re does not support nested POSIX character
271  *      classes (definitely should, but not completely trivial)</li>
272  *  <li>com.weusours.util.re Does not support POSIX character collation
273  *      concepts ([.foo.] syntax) (I18N/locale issue)</li>
274  *  <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
275  *  <li>Should RE support character iterators (for backwards RE matching!)?</li>
276  *  <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
277  *  <li>Not *all* possibilities are considered for greediness when backreferences
278  *      are involved (as POSIX suggests should be the case).  The POSIX RE
279  *      "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
280  *      of acdacaa where \1 is "a".  This is not the case in this RE package,
281  *      and actually Perl doesn't go to this extent either!  Until someone
282  *      actually complains about this, I'm not sure it's worth "fixing".
283  *      If it ever is fixed, test #137 in RETest.txt should be updated.</li>
284  * </ul>
285  *
286  * </font>
287  *
288  * @see recompile
289  * @see RECompiler
290  *
291  * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
292  * @author <a href="mailto:ts@sch-fer.de">Tobias Sch&auml;fer</a>
293  * @version $Id: RE.java 232192 2005-08-12 03:04:07Z vgritsenko $
294  */
295 public class RE implements Serializable
296 {
297     /**
298      * Specifies normal, case-sensitive matching behaviour.
299      */
300     public static final int MATCH_NORMAL          = 0x0000;
301 
302     /**
303      * Flag to indicate that matching should be case-independent (folded)
304      */
305     public static final int MATCH_CASEINDEPENDENT = 0x0001;
306 
307     /**
308      * Newlines should match as BOL/EOL (^ and $)
309      */
310     public static final int MATCH_MULTILINE       = 0x0002;
311 
312     /**
313      * Consider all input a single body of text - newlines are matched by .
314      */
315     public static final int MATCH_SINGLELINE      = 0x0004;
316 
317     /************************************************
318      *                                              *
319      * The format of a node in a program is:        *
320      *                                              *
321      * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
322      *                                              *
323      * char OPCODE - instruction                    *
324      * char OPDATA - modifying data                 *
325      * char OPNEXT - next node (relative offset)    *
326      *                                              *
327      ************************************************/
328 
329                  //   Opcode              Char       Opdata/Operand  Meaning
330                  //   ----------          ---------- --------------- --------------------------------------------------
331     static final char OP_END              = 'E';  //                 end of program
332     static final char OP_BOL              = '^';  //                 match only if at beginning of line
333     static final char OP_EOL              = '$';  //                 match only if at end of line
334     static final char OP_ANY              = '.';  //                 match any single character except newline
335     static final char OP_ANYOF            = '[';  // count/ranges    match any char in the list of ranges
336     static final char OP_BRANCH           = '|';  // node            match this alternative or the next one
337     static final char OP_ATOM             = 'A';  // length/string   length of string followed by string itself
338     static final char OP_STAR             = '*';  // node            kleene closure
339     static final char OP_PLUS             = '+';  // node            positive closure
340     static final char OP_MAYBE            = '?';  // node            optional closure
341     static final char OP_ESCAPE           = '\\'; // escape          special escape code char class (escape is E_* code)
342     static final char OP_OPEN             = '(';  // number          nth opening paren
343     static final char OP_OPEN_CLUSTER     = '<';  //                 opening cluster
344     static final char OP_CLOSE            = ')';  // number          nth closing paren
345     static final char OP_CLOSE_CLUSTER    = '>';  //                 closing cluster
346     static final char OP_BACKREF          = '#';  // number          reference nth already matched parenthesized string
347     static final char OP_GOTO             = 'G';  //                 nothing but a (back-)pointer
348     static final char OP_NOTHING          = 'N';  //                 match null string such as in '(a|)'
349     static final char OP_RELUCTANTSTAR    = '8';  // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
350     static final char OP_RELUCTANTPLUS    = '=';  // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
351     static final char OP_RELUCTANTMAYBE   = '/';  // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
352     static final char OP_POSIXCLASS       = 'P';  // classid         one of the posix character classes
353 
354     // Escape codes
355     static final char E_ALNUM             = 'w';  // Alphanumeric
356     static final char E_NALNUM            = 'W';  // Non-alphanumeric
357     static final char E_BOUND             = 'b';  // Word boundary
358     static final char E_NBOUND            = 'B';  // Non-word boundary
359     static final char E_SPACE             = 's';  // Whitespace
360     static final char E_NSPACE            = 'S';  // Non-whitespace
361     static final char E_DIGIT             = 'd';  // Digit
362     static final char E_NDIGIT            = 'D';  // Non-digit
363 
364     // Posix character classes
365     static final char POSIX_CLASS_ALNUM   = 'w';  // Alphanumerics
366     static final char POSIX_CLASS_ALPHA   = 'a';  // Alphabetics
367     static final char POSIX_CLASS_BLANK   = 'b';  // Blanks
368     static final char POSIX_CLASS_CNTRL   = 'c';  // Control characters
369     static final char POSIX_CLASS_DIGIT   = 'd';  // Digits
370     static final char POSIX_CLASS_GRAPH   = 'g';  // Graphic characters
371     static final char POSIX_CLASS_LOWER   = 'l';  // Lowercase characters
372     static final char POSIX_CLASS_PRINT   = 'p';  // Printable characters
373     static final char POSIX_CLASS_PUNCT   = '!';  // Punctuation
374     static final char POSIX_CLASS_SPACE   = 's';  // Spaces
375     static final char POSIX_CLASS_UPPER   = 'u';  // Uppercase characters
376     static final char POSIX_CLASS_XDIGIT  = 'x';  // Hexadecimal digits
377     static final char POSIX_CLASS_JSTART  = 'j';  // Java identifier start
378     static final char POSIX_CLASS_JPART   = 'k';  // Java identifier part
379 
380     // Limits
381     static final int maxNode  = 65536;            // Maximum number of nodes in a program
382     static final int MAX_PAREN = 16;              // Number of paren pairs (only 9 can be backrefs)
383 
384     // Node layout constants
385     static final int offsetOpcode = 0;            // Opcode offset (first character)
386     static final int offsetOpdata = 1;            // Opdata offset (second char)
387     static final int offsetNext   = 2;            // Next index offset (third char)
388     static final int nodeSize     = 3;            // Node size (in chars)
389 
390     // State of current program
391     REProgram program;                            // Compiled regular expression 'program'
392     transient CharacterIterator search;           // The string being matched against
393     int matchFlags;                               // Match behaviour flags
394     int maxParen = MAX_PAREN;
395 
396     // Parenthesized subexpressions
397     transient int parenCount;                     // Number of subexpressions matched (num open parens + 1)
398     transient int start0;                         // Cache of start[0]
399     transient int end0;                           // Cache of start[0]
400     transient int start1;                         // Cache of start[1]
401     transient int end1;                           // Cache of start[1]
402     transient int start2;                         // Cache of start[2]
403     transient int end2;                           // Cache of start[2]
404     transient int[] startn;                       // Lazy-alloced array of sub-expression starts
405     transient int[] endn;                         // Lazy-alloced array of sub-expression ends
406 
407     // Backreferences
408     transient int[] startBackref;                 // Lazy-alloced array of backref starts
409     transient int[] endBackref;                   // Lazy-alloced array of backref ends
410 
411     /**
412      * Constructs a regular expression matcher from a String by compiling it
413      * using a new instance of RECompiler.  If you will be compiling many
414      * expressions, you may prefer to use a single RECompiler object instead.
415      *
416      * @param pattern The regular expression pattern to compile.
417      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
418      * @see RECompiler
419      * @see recompile
420      */
421     public RE(String pattern) throws RESyntaxException
422     {
423         this(pattern, MATCH_NORMAL);
424     }
425 
426     /**
427      * Constructs a regular expression matcher from a String by compiling it
428      * using a new instance of RECompiler.  If you will be compiling many
429      * expressions, you may prefer to use a single RECompiler object instead.
430      *
431      * @param pattern The regular expression pattern to compile.
432      * @param matchFlags The matching style
433      * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
434      * @see RECompiler
435      * @see recompile
436      */
437     public RE(String pattern, int matchFlags) throws RESyntaxException
438     {
439         this(new RECompiler().compile(pattern));
440         setMatchFlags(matchFlags);
441     }
442 
443     /**
444      * Construct a matcher for a pre-compiled regular expression from program
445      * (bytecode) data.  Permits special flags to be passed in to modify matching
446      * behaviour.
447      *
448      * @param program Compiled regular expression program (see RECompiler and/or recompile)
449      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
450      *
451      * <pre>
452      *   MATCH_NORMAL              // Normal (case-sensitive) matching
453      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
454      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
455      * </pre>
456      *
457      * @see RECompiler
458      * @see REProgram
459      * @see recompile
460      */
461     public RE(REProgram program, int matchFlags)
462     {
463         setProgram(program);
464         setMatchFlags(matchFlags);
465     }
466 
467     /**
468      * Construct a matcher for a pre-compiled regular expression from program
469      * (bytecode) data.
470      *
471      * @param program Compiled regular expression program
472      * @see RECompiler
473      * @see recompile
474      */
475     public RE(REProgram program)
476     {
477         this(program, MATCH_NORMAL);
478     }
479 
480     /**
481      * Constructs a regular expression matcher with no initial program.
482      * This is likely to be an uncommon practice, but is still supported.
483      */
484     public RE()
485     {
486         this((REProgram)null, MATCH_NORMAL);
487     }
488 
489     /**
490      * Converts a 'simplified' regular expression to a full regular expression
491      *
492      * @param pattern The pattern to convert
493      * @return The full regular expression
494      */
495     public static String simplePatternToFullRegularExpression(String pattern)
496     {
497         StringBuffer buf = new StringBuffer();
498         for (int i = 0; i < pattern.length(); i++)
499         {
500             char c = pattern.charAt(i);
501             switch (c)
502             {
503                 case '*':
504                     buf.append(".*");
505                     break;
506 
507                 case '.':
508                 case '[':
509                 case ']':
510                 case '\\':
511                 case '+':
512                 case '?':
513                 case '{':
514                 case '}':
515                 case '$':
516                 case '^':
517                 case '|':
518                 case '(':
519                 case ')':
520                     buf.append('\\');
521                 default:
522                     buf.append(c);
523                     break;
524             }
525         }
526         return buf.toString();
527     }
528 
529     /**
530      * Sets match behaviour flags which alter the way RE does matching.
531      * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
532      *
533      * <pre>
534      *   MATCH_NORMAL              // Normal (case-sensitive) matching
535      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
536      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
537      * </pre>
538      */
539     public void setMatchFlags(int matchFlags)
540     {
541         this.matchFlags = matchFlags;
542     }
543 
544     /**
545      * Returns the current match behaviour flags.
546      * @return Current match behaviour flags (RE.MATCH_*).
547      *
548      * <pre>
549      *   MATCH_NORMAL              // Normal (case-sensitive) matching
550      *   MATCH_CASEINDEPENDENT     // Case folded comparisons
551      *   MATCH_MULTILINE           // Newline matches as BOL/EOL
552      * </pre>
553      *
554      * @see #setMatchFlags
555      */
556     public int getMatchFlags()
557     {
558         return matchFlags;
559     }
560 
561     /**
562      * Sets the current regular expression program used by this matcher object.
563      *
564      * @param program Regular expression program compiled by RECompiler.
565      * @see RECompiler
566      * @see REProgram
567      * @see recompile
568      */
569     public void setProgram(REProgram program)
570     {
571         this.program = program;
572         if (program != null && program.maxParens != -1) {
573             this.maxParen = program.maxParens;
574         } else {
575             this.maxParen = MAX_PAREN;
576         }
577     }
578 
579     /**
580      * Returns the current regular expression program in use by this matcher object.
581      *
582      * @return Regular expression program
583      * @see #setProgram
584      */
585     public REProgram getProgram()
586     {
587         return program;
588     }
589 
590     /**
591      * Returns the number of parenthesized subexpressions available after a successful match.
592      *
593      * @return Number of available parenthesized subexpressions
594      */
595     public int getParenCount()
596     {
597         return parenCount;
598     }
599 
600     /**
601      * Gets the contents of a parenthesized subexpression after a successful match.
602      *
603      * @param which Nesting level of subexpression
604      * @return String
605      */
606     public String getParen(int which)
607     {
608         int start;
609         if (which < parenCount && (start = getParenStart(which)) >= 0)
610         {
611             return search.substring(start, getParenEnd(which));
612         }
613         return null;
614     }
615 
616     /**
617      * Returns the start index of a given paren level.
618      *
619      * @param which Nesting level of subexpression
620      * @return String index
621      */
622     public final int getParenStart(int which)
623     {
624         if (which < parenCount)
625         {
626             switch (which)
627             {
628                 case 0:
629                     return start0;
630 
631                 case 1:
632                     return start1;
633 
634                 case 2:
635                     return start2;
636 
637                 default:
638                     if (startn == null)
639                     {
640                         allocParens();
641                     }
642                     return startn[which];
643             }
644         }
645         return -1;
646     }
647 
648     /**
649      * Returns the end index of a given paren level.
650      *
651      * @param which Nesting level of subexpression
652      * @return String index
653      */
654     public final int getParenEnd(int which)
655     {
656         if (which < parenCount)
657         {
658             switch (which)
659             {
660                 case 0:
661                     return end0;
662 
663                 case 1:
664                     return end1;
665 
666                 case 2:
667                     return end2;
668 
669                 default:
670                     if (endn == null)
671                     {
672                         allocParens();
673                     }
674                     return endn[which];
675             }
676         }
677         return -1;
678     }
679 
680     /**
681      * Returns the length of a given paren level.
682      *
683      * @param which Nesting level of subexpression
684      * @return Number of characters in the parenthesized subexpression
685      */
686     public final int getParenLength(int which)
687     {
688         if (which < parenCount)
689         {
690             return getParenEnd(which) - getParenStart(which);
691         }
692         return -1;
693     }
694 
695     /**
696      * Sets the start of a paren level
697      *
698      * @param which Which paren level
699      * @param i Index in input array
700      */
701     protected final void setParenStart(int which, int i)
702     {
703         if (which < parenCount)
704         {
705             switch (which)
706             {
707                 case 0:
708                     start0 = i;
709                     break;
710 
711                 case 1:
712                     start1 = i;
713                     break;
714 
715                 case 2:
716                     start2 = i;
717                     break;
718 
719                 default:
720                     if (startn == null)
721                     {
722                         allocParens();
723                     }
724                     startn[which] = i;
725                     break;
726             }
727         }
728     }
729 
730     /**
731      * Sets the end of a paren level
732      *
733      * @param which Which paren level
734      * @param i Index in input array
735      */
736     protected final void setParenEnd(int which, int i)
737     {
738         if (which < parenCount)
739         {
740             switch (which)
741             {
742                 case 0:
743                     end0 = i;
744                     break;
745 
746                 case 1:
747                     end1 = i;
748                     break;
749 
750                 case 2:
751                     end2 = i;
752                     break;
753 
754                 default:
755                     if (endn == null)
756                     {
757                         allocParens();
758                     }
759                     endn[which] = i;
760                     break;
761             }
762         }
763     }
764 
765     /**
766      * Throws an Error representing an internal error condition probably resulting
767      * from a bug in the regular expression compiler (or possibly data corruption).
768      * In practice, this should be very rare.
769      *
770      * @param s Error description
771      */
772     protected void internalError(String s) throws Error
773     {
774         throw new Error("RE internal error: " + s);
775     }
776 
777     /**
778      * Performs lazy allocation of subexpression arrays
779      */
780     private final void allocParens()
781     {
782         // Allocate arrays for subexpressions
783         startn = new int[maxParen];
784         endn = new int[maxParen];
785 
786         // Set sub-expression pointers to invalid values
787         for (int i = 0; i < maxParen; i++)
788         {
789             startn[i] = -1;
790             endn[i] = -1;
791         }
792     }
793 
794     /**
795      * Try to match a string against a subset of nodes in the program
796      *
797      * @param firstNode Node to start at in program
798      * @param lastNode  Last valid node (used for matching a subexpression without
799      *                  matching the rest of the program as well).
800      * @param idxStart  Starting position in character array
801      * @return Final input array index if match succeeded.  -1 if not.
802      */
803     protected int matchNodes(int firstNode, int lastNode, int idxStart)
804     {
805         // Our current place in the string
806         int idx = idxStart;
807 
808         // Loop while node is valid
809         int next, opcode, opdata;
810         int idxNew;
811         char[] instruction = program.instruction;
812         for (int node = firstNode; node < lastNode; )
813         {
814             opcode = instruction[node + offsetOpcode];
815             next   = node + (short)instruction[node + offsetNext];
816             opdata = instruction[node + offsetOpdata];
817 
818             switch (opcode)
819             {
820                 case OP_RELUCTANTMAYBE:
821                     {
822                         int once = 0;
823                         do
824                         {
825                             // Try to match the rest without using the reluctant subexpr
826                             if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
827                             {
828                                 return idxNew;
829                             }
830                         }
831                         while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
832                         return -1;
833                     }
834 
835                 case OP_RELUCTANTPLUS:
836                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
837                     {
838                         // Try to match the rest without using the reluctant subexpr
839                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
840                         {
841                             return idxNew;
842                         }
843                     }
844                     return -1;
845 
846                 case OP_RELUCTANTSTAR:
847                     do
848                     {
849                         // Try to match the rest without using the reluctant subexpr
850                         if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
851                         {
852                             return idxNew;
853                         }
854                     }
855                     while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
856                     return -1;
857 
858                 case OP_OPEN:
859 
860                     // Match subexpression
861                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
862                     {
863                         startBackref[opdata] = idx;
864                     }
865                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
866                     {
867                         // Increase valid paren count
868                         if ((opdata + 1) > parenCount)
869                         {
870                             parenCount = opdata + 1;
871                         }
872 
873                         // Don't set paren if already set later on
874                         if (getParenStart(opdata) == -1)
875                         {
876                             setParenStart(opdata, idx);
877                         }
878                     }
879                     return idxNew;
880 
881                 case OP_CLOSE:
882 
883                     // Done matching subexpression
884                     if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
885                     {
886                         endBackref[opdata] = idx;
887                     }
888                     if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
889                     {
890                         // Increase valid paren count
891                         if ((opdata + 1) > parenCount)
892                         {
893                             parenCount = opdata + 1;
894                         }
895 
896                         // Don't set paren if already set later on
897                         if (getParenEnd(opdata) == -1)
898                         {
899                             setParenEnd(opdata, idx);
900                         }
901                     }
902                     return idxNew;
903 
904                 case OP_OPEN_CLUSTER:
905                 case OP_CLOSE_CLUSTER:
906                     // starting or ending the matching of a subexpression which has no backref.
907                     return matchNodes( next, maxNode, idx );
908 
909                 case OP_BACKREF:
910                     {
911                         // Get the start and end of the backref
912                         int s = startBackref[opdata];
913                         int e = endBackref[opdata];
914 
915                         // We don't know the backref yet
916                         if (s == -1 || e == -1)
917                         {
918                             return -1;
919                         }
920 
921                         // The backref is empty size
922                         if (s == e)
923                         {
924                             break;
925                         }
926 
927                         // Get the length of the backref
928                         int l = e - s;
929 
930                         // If there's not enough input left, give up.
931                         if (search.isEnd(idx + l - 1))
932                         {
933                             return -1;
934                         }
935 
936                         // Case fold the backref?
937                         final boolean caseFold =
938                             ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
939                         // Compare backref to input
940                         for (int i = 0; i < l; i++)
941                         {
942                             if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
943                             {
944                                 return -1;
945                             }
946                         }
947                     }
948                     break;
949 
950                 case OP_BOL:
951 
952                     // Fail if we're not at the start of the string
953                     if (idx != 0)
954                     {
955                         // If we're multiline matching, we could still be at the start of a line
956                         if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
957                         {
958                             // If not at start of line, give up
959                             if (idx <= 0 || !isNewline(idx - 1)) {
960                                 return -1;
961                             } else {
962                                 break;
963                             }
964                         }
965                         return -1;
966                     }
967                     break;
968 
969                 case OP_EOL:
970 
971                     // If we're not at the end of string
972                     if (!search.isEnd(0) && !search.isEnd(idx))
973                     {
974                         // If we're multi-line matching
975                         if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
976                         {
977                             // Give up if we're not at the end of a line
978                             if (!isNewline(idx)) {
979                                 return -1;
980                             } else {
981                                 break;
982                             }
983                         }
984                         return -1;
985                     }
986                     break;
987 
988                 case OP_ESCAPE:
989 
990                     // Which escape?
991                     switch (opdata)
992                     {
993                         // Word boundary match
994                         case E_NBOUND:
995                         case E_BOUND:
996                             {
997                                 char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
998                                 char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
999                                 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
1000                                {
1001                                    return -1;
1002                                }
1003                            }
1004                            break;
1005
1006                        // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1007                        case E_ALNUM:
1008                        case E_NALNUM:
1009                        case E_DIGIT:
1010                        case E_NDIGIT:
1011                        case E_SPACE:
1012                        case E_NSPACE:
1013
1014                            // Give up if out of input
1015                            if (search.isEnd(idx))
1016                            {
1017                                return -1;
1018                            }
1019
1020                            char c = search.charAt(idx);
1021
1022                            // Switch on escape
1023                            switch (opdata)
1024                            {
1025                                case E_ALNUM:
1026                                case E_NALNUM:
1027                                    if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
1028                                    {
1029                                        return -1;
1030                                    }
1031                                    break;
1032
1033                                case E_DIGIT:
1034                                case E_NDIGIT:
1035                                    if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
1036                                    {
1037                                        return -1;
1038                                    }
1039                                    break;
1040
1041                                case E_SPACE:
1042                                case E_NSPACE:
1043                                    if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
1044                                    {
1045                                        return -1;
1046                                    }
1047                                    break;
1048                            }
1049                            idx++;
1050                            break;
1051
1052                        default:
1053                            internalError("Unrecognized escape '" + opdata + "'");
1054                    }
1055                    break;
1056
1057                case OP_ANY:
1058
1059                    if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1060                        // Match anything
1061                        if (search.isEnd(idx))
1062                        {
1063                            return -1;
1064                        }
1065                    }
1066                    else
1067                    {
1068                        // Match anything but a newline
1069                        if (search.isEnd(idx) || isNewline(idx))
1070                        {
1071                            return -1;
1072                        }
1073                    }
1074                    idx++;
1075                    break;
1076
1077                case OP_ATOM:
1078                    {
1079                        // Match an atom value
1080                        if (search.isEnd(idx))
1081                        {
1082                            return -1;
1083                        }
1084
1085                        // Get length of atom and starting index
1086                        int lenAtom = opdata;
1087                        int startAtom = node + nodeSize;
1088
1089                        // Give up if not enough input remains to have a match
1090                        if (search.isEnd(lenAtom + idx - 1))
1091                        {
1092                            return -1;
1093                        }
1094
1095                        // Match atom differently depending on casefolding flag
1096                        final boolean caseFold =
1097                            ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
1098
1099                        for (int i = 0; i < lenAtom; i++)
1100                        {
1101                            if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
1102                            {
1103                                return -1;
1104                            }
1105                        }
1106                    }
1107                    break;
1108
1109                case OP_POSIXCLASS:
1110                    {
1111                        // Out of input?
1112                        if (search.isEnd(idx))
1113                        {
1114                            return -1;
1115                        }
1116
1117                        switch (opdata)
1118                        {
1119                            case POSIX_CLASS_ALNUM:
1120                                if (!Character.isLetterOrDigit(search.charAt(idx)))
1121                                {
1122                                    return -1;
1123                                }
1124                                break;
1125
1126                            case POSIX_CLASS_ALPHA:
1127                                if (!Character.isLetter(search.charAt(idx)))
1128                                {
1129                                    return -1;
1130                                }
1131                                break;
1132
1133                            case POSIX_CLASS_DIGIT:
1134                                if (!Character.isDigit(search.charAt(idx)))
1135                                {
1136                                    return -1;
1137                                }
1138                                break;
1139
1140                            case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1141                                if (!Character.isSpaceChar(search.charAt(idx)))
1142                                {
1143                                    return -1;
1144                                }
1145                                break;
1146
1147                            case POSIX_CLASS_SPACE:
1148                                if (!Character.isWhitespace(search.charAt(idx)))
1149                                {
1150                                    return -1;
1151                                }
1152                                break;
1153
1154                            case POSIX_CLASS_CNTRL:
1155                                if (Character.getType(search.charAt(idx)) != Character.CONTROL)
1156                                {
1157                                    return -1;
1158                                }
1159                                break;
1160
1161                            case POSIX_CLASS_GRAPH: // JWL - bugbug???
1162                                switch (Character.getType(search.charAt(idx)))
1163                                {
1164                                    case Character.MATH_SYMBOL:
1165                                    case Character.CURRENCY_SYMBOL:
1166                                    case Character.MODIFIER_SYMBOL:
1167                                    case Character.OTHER_SYMBOL:
1168                                        break;
1169
1170                                    default:
1171                                        return -1;
1172                                }
1173                                break;
1174
1175                            case POSIX_CLASS_LOWER:
1176                                if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
1177                                {
1178                                    return -1;
1179                                }
1180                                break;
1181
1182                            case POSIX_CLASS_UPPER:
1183                                if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
1184                                {
1185                                    return -1;
1186                                }
1187                                break;
1188
1189                            case POSIX_CLASS_PRINT:
1190                                if (Character.getType(search.charAt(idx)) == Character.CONTROL)
1191                                {
1192                                    return -1;
1193                                }
1194                                break;
1195
1196                            case POSIX_CLASS_PUNCT:
1197                            {
1198                                int type = Character.getType(search.charAt(idx));
1199                                switch(type)
1200                                {
1201                                    case Character.DASH_PUNCTUATION:
1202                                    case Character.START_PUNCTUATION:
1203                                    case Character.END_PUNCTUATION:
1204                                    case Character.CONNECTOR_PUNCTUATION:
1205                                    case Character.OTHER_PUNCTUATION:
1206                                        break;
1207
1208                                    default:
1209                                        return -1;
1210                                }
1211                            }
1212                            break;
1213
1214                            case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1215                            {
1216                                boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
1217                                                    (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
1218                                                    (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1219                                if (!isXDigit)
1220                                {
1221                                    return -1;
1222                                }
1223                            }
1224                            break;
1225
1226                            case POSIX_CLASS_JSTART:
1227                                if (!Character.isJavaIdentifierStart(search.charAt(idx)))
1228                                {
1229                                    return -1;
1230                                }
1231                                break;
1232
1233                            case POSIX_CLASS_JPART:
1234                                if (!Character.isJavaIdentifierPart(search.charAt(idx)))
1235                                {
1236                                    return -1;
1237                                }
1238                                break;
1239
1240                            default:
1241                                internalError("Bad posix class");
1242                                break;
1243                        }
1244
1245                        // Matched.
1246                        idx++;
1247                    }
1248                    break;
1249
1250                case OP_ANYOF:
1251                    {
1252                        // Out of input?
1253                        if (search.isEnd(idx))
1254                        {
1255                            return -1;
1256                        }
1257
1258                        // Get character to match against character class and maybe casefold
1259                        char c = search.charAt(idx);
1260                        boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1261                        // Loop through character class checking our match character
1262                        int idxRange = node + nodeSize;
1263                        int idxEnd = idxRange + (opdata * 2);
1264                        boolean match = false;
1265                        for (int i = idxRange; !match && i < idxEnd; )
1266                        {
1267                            // Get start, end and match characters
1268                            char s = instruction[i++];
1269                            char e = instruction[i++];
1270
1271                            match = ((compareChars(c, s, caseFold) >= 0)
1272                                     && (compareChars(c, e, caseFold) <= 0));
1273                        }
1274
1275                        // Fail if we didn't match the character class
1276                        if (!match)
1277                        {
1278                            return -1;
1279                        }
1280                        idx++;
1281                    }
1282                    break;
1283
1284                case OP_BRANCH:
1285                {
1286                    // Check for choices
1287                    if (instruction[next + offsetOpcode] != OP_BRANCH)
1288                    {
1289                        // If there aren't any other choices, just evaluate this branch.
1290                        node += nodeSize;
1291                        continue;
1292                    }
1293
1294                    // Try all available branches
1295                    short nextBranch;
1296                    do
1297                    {
1298                        // Try matching the branch against the string
1299                        if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
1300                        {
1301                            return idxNew;
1302                        }
1303
1304                        // Go to next branch (if any)
1305                        nextBranch = (short)instruction[node + offsetNext];
1306                        node += nextBranch;
1307                    }
1308                    while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1309
1310                    // Failed to match any branch!
1311                    return -1;
1312                }
1313
1314                case OP_NOTHING:
1315                case OP_GOTO:
1316
1317                    // Just advance to the next node without doing anything
1318                    break;
1319
1320                case OP_END:
1321
1322                    // Match has succeeded!
1323                    setParenEnd(0, idx);
1324                    return idx;
1325
1326                default:
1327
1328                    // Corrupt program
1329                    internalError("Invalid opcode '" + opcode + "'");
1330            }
1331
1332            // Advance to the next node in the program
1333            node = next;
1334        }
1335
1336        // We "should" never end up here
1337        internalError("Corrupt program");
1338        return -1;
1339    }
1340
1341    /**
1342     * Match the current regular expression program against the current
1343     * input string, starting at index i of the input string.  This method
1344     * is only meant for internal use.
1345     *
1346     * @param i The input string index to start matching at
1347     * @return True if the input matched the expression
1348     */
1349    protected boolean matchAt(int i)
1350    {
1351        // Initialize start pointer, paren cache and paren count
1352        start0 = -1;
1353        end0   = -1;
1354        start1 = -1;
1355        end1   = -1;
1356        start2 = -1;
1357        end2   = -1;
1358        startn = null;
1359        endn   = null;
1360        parenCount = 1;
1361        setParenStart(0, i);
1362
1363        // Allocate backref arrays (unless optimizations indicate otherwise)
1364        if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
1365        {
1366            startBackref = new int[maxParen];
1367            endBackref = new int[maxParen];
1368        }
1369
1370        // Match against string
1371        int idx;
1372        if ((idx = matchNodes(0, maxNode, i)) != -1)
1373        {
1374            setParenEnd(0, idx);
1375            return true;
1376        }
1377
1378        // Didn't match
1379        parenCount = 0;
1380        return false;
1381    }
1382
1383    /**
1384     * Matches the current regular expression program against a character array,
1385     * starting at a given index.
1386     *
1387     * @param search String to match against
1388     * @param i Index to start searching at
1389     * @return True if string matched
1390     */
1391    public boolean match(String search, int i)
1392    {
1393        return match(new StringCharacterIterator(search), i);
1394    }
1395
1396    /**
1397     * Matches the current regular expression program against a character array,
1398     * starting at a given index.
1399     *
1400     * @param search String to match against
1401     * @param i Index to start searching at
1402     * @return True if string matched
1403     */
1404    public boolean match(CharacterIterator search, int i)
1405    {
1406        // There is no compiled program to search with!
1407        if (program == null)
1408        {
1409            // This should be uncommon enough to be an error case rather
1410            // than an exception (which would have to be handled everywhere)
1411            internalError("No RE program to run!");
1412        }
1413
1414        // Save string to search
1415        this.search = search;
1416
1417        // Can we optimize the search by looking for new lines?
1418        if ((program.flags & REProgram.OPT_HASBOL) == REProgram.OPT_HASBOL)
1419        {
1420            // Non multi-line matching with BOL: Must match at '0' index
1421            if ((matchFlags & MATCH_MULTILINE) == 0)
1422            {
1423                return i == 0 && matchAt(i);
1424            }
1425
1426            // Multi-line matching with BOL: Seek to next line
1427            for ( ;! search.isEnd(i); i++)
1428            {
1429                // Skip if we are at the beginning of the line
1430                if (isNewline(i))
1431                {
1432                    continue;
1433                }
1434
1435                // Match at the beginning of the line
1436                if (matchAt(i))
1437                {
1438                    return true;
1439                }
1440
1441                // Skip to the end of line
1442                for ( ;! search.isEnd(i); i++)
1443                {
1444                    if (isNewline(i))
1445                    {
1446                        break;
1447                    }
1448                }
1449            }
1450
1451            return false;
1452        }
1453
1454        // Can we optimize the search by looking for a prefix string?
1455        if (program.prefix == null)
1456        {
1457            // Unprefixed matching must try for a match at each character
1458            for ( ;! search.isEnd(i - 1); i++)
1459            {
1460                // Try a match at index i
1461                if (matchAt(i))
1462                {
1463                    return true;
1464                }
1465            }
1466            return false;
1467        }
1468        else
1469        {
1470            // Prefix-anchored matching is possible
1471            boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1472            char[] prefix = program.prefix;
1473            for ( ; !search.isEnd(i + prefix.length - 1); i++)
1474            {
1475                int j = i;
1476                int k = 0;
1477
1478                boolean match;
1479                do {
1480                    // If there's a mismatch of any character in the prefix, give up
1481                    match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
1482                } while (match && k < prefix.length);
1483
1484                // See if the whole prefix string matched
1485                if (k == prefix.length)
1486                {
1487                    // We matched the full prefix at firstChar, so try it
1488                    if (matchAt(i))
1489                    {
1490                        return true;
1491                    }
1492                }
1493            }
1494            return false;
1495        }
1496    }
1497
1498    /**
1499     * Matches the current regular expression program against a String.
1500     *
1501     * @param search String to match against
1502     * @return True if string matched
1503     */
1504    public boolean match(String search)
1505    {
1506        return match(search, 0);
1507    }
1508
1509    /**
1510     * Splits a string into an array of strings on regular expression boundaries.
1511     * This function works the same way as the Perl function of the same name.
1512     * Given a regular expression of "[ab]+" and a string to split of
1513     * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1514     * "[xyzzy, yyz, 123]".
1515     *
1516     * <p>Please note that the first string in the resulting array may be an empty
1517     * string. This happens when the very first character of input string is
1518     * matched by the pattern.
1519     *
1520     * @param s String to split on this regular exression
1521     * @return Array of strings
1522     */
1523    public String[] split(String s)
1524    {
1525        // Create new vector
1526        Vector v = new Vector()