Source code: com/jcorporate/expresso/ext/regexp/RE.java
1 /* ====================================================================
2 * The Jcorporate Apache Style Software License, Version 1.2 05-07-2002
3 *
4 * Copyright (c) 1995-2002 Jcorporate Ltd. All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * 3. The end-user documentation included with the redistribution,
19 * if any, must include the following acknowledgment:
20 * "This product includes software developed by Jcorporate Ltd.
21 * (http://www.jcorporate.com/)."
22 * Alternately, this acknowledgment may appear in the software itself,
23 * if and wherever such third-party acknowledgments normally appear.
24 *
25 * 4. "Jcorporate" and product names such as "Expresso" must
26 * not be used to endorse or promote products derived from this
27 * software without prior written permission. For written permission,
28 * please contact info@jcorporate.com.
29 *
30 * 5. Products derived from this software may not be called "Expresso",
31 * or other Jcorporate product names; nor may "Expresso" or other
32 * Jcorporate product names appear in their name, without prior
33 * written permission of Jcorporate Ltd.
34 *
35 * 6. No product derived from this software may compete in the same
36 * market space, i.e. framework, without prior written permission
37 * of Jcorporate Ltd. For written permission, please contact
38 * partners@jcorporate.com.
39 *
40 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
41 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
42 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 * DISCLAIMED. IN NO EVENT SHALL JCORPORATE LTD OR ITS CONTRIBUTORS
44 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
45 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
46 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
47 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
48 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
49 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
50 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 * ====================================================================
53 *
54 * This software consists of voluntary contributions made by many
55 * individuals on behalf of the Jcorporate Ltd. Contributions back
56 * to the project(s) are encouraged when you make modifications.
57 * Please send them to support@jcorporate.com. For more information
58 * on Jcorporate Ltd. and its products, please see
59 * <http://www.jcorporate.com/>.
60 *
61 * Portions of this software are based upon other open source
62 * products and are subject to their respective licenses.
63 */
64
65 package com.jcorporate.expresso.ext.regexp;
66
67 /*
68 * ====================================================================
69 *
70 * The Apache Software License, Version 1.1
71 *
72 * Copyright (c) 1999 The Apache Software Foundation. All rights
73 * reserved.
74 *
75 * Redistribution and use in source and binary forms, with or without
76 * modification, are permitted provided that the following conditions
77 * are met:
78 *
79 * 1. Redistributions of source code must retain the above copyright
80 * notice, this list of conditions and the following disclaimer.
81 *
82 * 2. Redistributions in binary form must reproduce the above copyright
83 * notice, this list of conditions and the following disclaimer in
84 * the documentation and/or other materials provided with the
85 * distribution.
86 *
87 * 3. The end-user documentation included with the redistribution, if
88 * any, must include the following acknowlegement:
89 * "This product includes software developed by the
90 * Apache Software Foundation (http://www.apache.org/)."
91 * Alternately, this acknowlegement may appear in the software itself,
92 * if and wherever such third-party acknowlegements normally appear.
93 *
94 * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
95 * Foundation" must not be used to endorse or promote products derived
96 * from this software without prior written permission. For written
97 * permission, please contact apache@apache.org.
98 *
99 * 5. Products derived from this software may not be called "Apache"
100 * nor may "Apache" appear in their names without prior written
101 * permission of the Apache Group.
102 *
103 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
104 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
105 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
106 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
107 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
108 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
109 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
110 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
111 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
112 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
113 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
114 * SUCH DAMAGE.
115 * ====================================================================
116 *
117 * This software consists of voluntary contributions made by many
118 * individuals on behalf of the Apache Software Foundation. For more
119 * information on the Apache Software Foundation, please see
120 * <http://www.apache.org/>.
121 *
122 */
123
124 import java.util.Vector;
125
126
127 /**
128 * RE is an efficient, lightweight regular expression evaluator/matcher class.
129 * Regular expressions are pattern descriptions which enable sophisticated matching of
130 * strings. In addition to being able to match a string against a pattern, you
131 * can also extract parts of the match. This is especially useful in text parsing!
132 * Details on the syntax of regular expression patterns are given below.
133 * <p/>
134 * <p/>
135 * <p/>
136 * To compile a regular expression (RE), you can simply construct an RE matcher
137 * object from the string specification of the pattern, like this:
138 * <p/>
139 * <pre>
140 * <p/>
141 * RE r = new RE("a*b");
142 * <p/>
143 * </pre>
144 * <p/>
145 * <p/>
146 * <p/>
147 * Once you have done this, you can call either of the RE.match methods to
148 * perform matching on a String. For example:
149 * <p/>
150 * <pre>
151 * <p/>
152 * boolean matched = r.match("aaaab");
153 * <p/>
154 * </pre>
155 * <p/>
156 * will cause the boolean matched to be set to true because the
157 * pattern "a*b" matches the string "aaaab".
158 * <p/>
159 * <p/>
160 * If you were interested in the <i>number</i> of a's which matched the first
161 * part of our example expression, you could change the expression to
162 * "(a*)b". Then when you compiled the expression and matched it against
163 * something like "xaaaab", you would get results like this:
164 * <p/>
165 * <pre>
166 * <p/>
167 * RE r = new RE("(a*)b"); // Compile expression
168 * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
169 * <p/>
170 * <br>
171 * <p/>
172 * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
173 * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
174 * <p/>
175 * <br>
176 * <p/>
177 * int startWholeExpr = getParenStart(0); // startWholeExpr will be index 1
178 * int endWholeExpr = getParenEnd(0); // endWholeExpr will be index 6
179 * int lenWholeExpr = getParenLength(0); // lenWholeExpr will be 5
180 * <p/>
181 * <br>
182 * <p/>
183 * int startInside = getParenStart(1); // startInside will be index 1
184 * int endInside = getParenEnd(1); // endInside will be index 5
185 * int lenInside = getParenLength(1); // lenInside will be 4
186 * <p/>
187 * </pre>
188 * <p/>
189 * You can also refer to the contents of a parenthesized expression within
190 * a regular expression itself. This is called a 'backreference'. The first
191 * backreference in a regular expression is denoted by \1, the second by \2
192 * and so on. So the expression:
193 * <p/>
194 * <pre>
195 * <p/>
196 * ([0-9]+)=\1
197 * <p/>
198 * </pre>
199 * <p/>
200 * will match any string of the form n=n (like 0=0 or 2=2).
201 * <p/>
202 * <p/>
203 * <p/>
204 * The full regular expression syntax accepted by RE is described here:
205 * <p/>
206 * <pre>
207 * <p/>
208 * <br>
209 * <p/>
210 * <b><font face=times roman>Characters</font></b>
211 * <p/>
212 * <br>
213 * <p/>
214 * <i>unicodeChar</i> Matches any identical unicode character
215 * \ Used to quote a meta-character (like '*')
216 * \\ Matches a single '\' character
217 * \0nnn Matches a given octal character
218 * \xhh Matches a given 8-bit hexadecimal character
219 * \\uhhhh Matches a given 16-bit hexadecimal character
220 * \t Matches an ASCII tab character
221 * \n Matches an ASCII newline character
222 * \r Matches an ASCII return character
223 * \f Matches an ASCII form feed character
224 * <p/>
225 * <br>
226 * <p/>
227 * <b><font face=times roman>Character Classes</font></b>
228 * <p/>
229 * <br>
230 * <p/>
231 * [abc] Simple character class
232 * [a-zA-Z] Character class with ranges
233 * [^abc] Negated character class
234 * <p/>
235 * <br>
236 * <p/>
237 * <b><font face=times roman>Standard POSIX Character Classes</font></b>
238 * <p/>
239 * <br>
240 * <p/>
241 * [:alnum:] Alphanumeric characters.
242 * [:alpha:] Alphabetic characters.
243 * [:blank:] Space and tab characters.
244 * [:cntrl:] Control characters.
245 * [:digit:] Numeric characters.
246 * [:graph:] Characters that are printable and are also visible. (A space is printable, but not visible, while an `a' is both.)
247 * [:lower:] Lower-case alphabetic characters.
248 * [:print:] Printable characters (characters that are not control characters.)
249 * [:punct:] Punctuation characters (characters that are not letter, digits, control characters, or space characters).
250 * [:space:] Space characters (such as space, tab, and formfeed, to name a few).
251 * [:upper:] Upper-case alphabetic characters.
252 * [:xdigit:] Characters that are hexadecimal digits.
253 * <p/>
254 * <br>
255 * <p/>
256 * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
257 * <p/>
258 * <br>
259 * <p/>
260 * [:javastart:] Start of a Java identifier
261 * [:javapart:] Part of a Java identifier
262 * <p/>
263 * <br>
264 * <p/>
265 * <b><font face=times roman>Predefined Classes</font></b>
266 * <p/>
267 * <br>
268 * <p/>
269 * . Matches any character other than newline
270 * \w Matches a "word" character (alphanumeric plus "_")
271 * \W Matches a non-word character
272 * \s Matches a whitespace character
273 * \S Matches a non-whitespace character
274 * \d Matches a digit character
275 * \D Matches a non-digit character
276 * <p/>
277 * <br>
278 * <p/>
279 * <b><font face=times roman>Boundary Matchers</font></b>
280 * <p/>
281 * <br>
282 * <p/>
283 * ^ Matches only at the beginning of a line
284 * $ Matches only at the end of a line
285 * \b Matches only at a word boundary
286 * \B Matches only at a non-word boundary
287 * <p/>
288 * <br>
289 * <p/>
290 * <b><font face=times roman>Greedy Closures</font></b>
291 * <p/>
292 * <br>
293 * <p/>
294 * A* Matches A 0 or more times (greedy)
295 * A+ Matches A 1 or more times (greedy)
296 * A? Matches A 1 or 0 times (greedy)
297 * A{n} Matches A exactly n times (greedy)
298 * A{n,} Matches A at least n times (greedy)
299 * A{n,m} Matches A at least n but not more than m times (greedy)
300 * <p/>
301 * <br>
302 * <p/>
303 * <b><font face=times roman>Reluctant Closures</font></b>
304 * <p/>
305 * <br>
306 * <p/>
307 * A*? Matches A 0 or more times (reluctant)
308 * A+? Matches A 1 or more times (reluctant)
309 * A?? Matches A 0 or 1 times (reluctant)
310 * <p/>
311 * <br>
312 * <p/>
313 * <b><font face=times roman>Logical Operators</font></b>
314 * <p/>
315 * <br>
316 * <p/>
317 * AB Matches A followed by B
318 * A|B Matches either A or B
319 * (A) Used for subexpression grouping
320 * <p/>
321 * <br>
322 * <p/>
323 * <b><font face=times roman>Backreferences</font></b>
324 * <p/>
325 * <br>
326 * <p/>
327 * \1 Backreference to 1st parenthesized subexpression
328 * \2 Backreference to 2nd parenthesized subexpression
329 * \3 Backreference to 3rd parenthesized subexpression
330 * \4 Backreference to 4th parenthesized subexpression
331 * \5 Backreference to 5th parenthesized subexpression
332 * \6 Backreference to 6th parenthesized subexpression
333 * \7 Backreference to 7th parenthesized subexpression
334 * \8 Backreference to 8th parenthesized subexpression
335 * \9 Backreference to 9th parenthesized subexpression
336 * <p/>
337 * <br>
338 * <p/>
339 * </pre>
340 * <p/>
341 * <p/>
342 * <p/>
343 * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that they
344 * match as many elements of the string as possible without causing the overall
345 * match to fail. If you want a closure to be reluctant (non-greedy), you can
346 * simply follow it with a '?'. A reluctant closure will match as few elements
347 * of the string as possible when finding matches. {m,n} closures don't currently
348 * support reluctancy.
349 * <p/>
350 * <p/>
351 * <p/>
352 * RE runs programs compiled by the RECompiler class. But the RE matcher class
353 * does not include the actual regular expression compiler for reasons of
354 * efficiency. In fact, if you want to pre-compile one or more regular expressions,
355 * the 'recompile' class can be invoked from the command line to produce compiled
356 * output like this:
357 * <p/>
358 * <pre>
359 * <p/>
360 * // Pre-compiled regular expression "a*b"
361 * char[] re1Instructions =
362 * {
363 * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
364 * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
365 * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
366 * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
367 * 0x0000,
368 * };
369 * <p/>
370 * <br>
371 * <p/>
372 * REProgram re1 = new REProgram(re1Instructions);
373 * <p/>
374 * </pre>
375 * <p/>
376 * You can then construct a regular expression matcher (RE) object from the pre-compiled
377 * expression re1 and thus avoid the overhead of compiling the expression at runtime.
378 * If you require more dynamic regular expressions, you can construct a single RECompiler
379 * object and re-use it to compile each expression. Similarly, you can change the
380 * program run by a given matcher object at any time. However, RE and RECompiler are
381 * not threadsafe (for efficiency reasons, and because requiring thread safety in this
382 * class is deemed to be a rare requirement), so you will need to construct a separate
383 * compiler or matcher object for each thread (unless you do thread synchronization
384 * yourself).
385 * <p/>
386 * </pre>
387 * <br><p><br>
388 * <p/>
389 * <font color=red>
390 * <i>ISSUES:</i>
391 * <p/>
392 * <ul>
393 * <li>com.weusours.util.re is not currently compatible with all standard POSIX regcomp flags
394 * <li>com.weusours.util.re does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale issue)
395 * <li>com.weusours.util.re does not support nested POSIX character classes (definitely should, but not completely trivial)
396 * <li>com.weusours.util.re Does not support POSIX character collation concepts ([.foo.] syntax) (I18N/locale issue)
397 * <li>Should there be different matching styles (simple, POSIX, Perl etc?)
398 * <li>Should RE support character iterators (for backwards RE matching!)?
399 * <li>Should RE support reluctant {m,n} closures (does anyone care)?
400 * <li>Not *all* possibilities are considered for greediness when backreferences
401 * are involved (as POSIX suggests should be the case). The POSIX RE
402 * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
403 * of acdacaa where \1 is "a". This is not the case in this RE package,
404 * and actually Perl doesn't go to this extent either! Until someone
405 * actually complains about this, I'm not sure it's worth "fixing".
406 * If it ever is fixed, test #137 in RETest.txt should be updated.
407 * </ul>
408 * <p/>
409 * </font>
410 *
411 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
412 * @version $Id: RE.java,v 1.9 2004/11/18 02:03:28 lhamel Exp $
413 * @see RECompiler
414 * @deprecated since v5.6, use jakarta oro
415 */
416 public class RE {
417
418 /**
419 * Specifies normal, case-sensitive matching behaviour.
420 */
421 public static final int MATCH_NORMAL = 0x0000;
422
423 /**
424 * Flag to indicate that matching should be case-independent (folded)
425 */
426 public static final int MATCH_CASEINDEPENDENT = 0x0001;
427
428 /**
429 * Newlines should match as BOL/EOL (^ and $)
430 */
431 public static final int MATCH_MULTILINE = 0x0002;
432
433 /**
434 * *********************************************
435 * *
436 * The format of a node in a program is: *
437 * *
438 * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
439 * *
440 * char OPCODE - instruction *
441 * char OPDATA - modifying data *
442 * char OPNEXT - next node (relative offset) *
443 * *
444 * **********************************************
445 */
446 // Opcode Char Opdata/Operand Meaning
447 // ---------- ---------- --------------- --------------------------------------------------
448 static final char OP_END = 'E'; // end of program
449 static final char OP_BOL = '^'; // match only if at beginning of line
450 static final char OP_EOL = '$'; // match only if at end of line
451 static final char OP_ANY = '.'; // match any single character except newline
452 static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
453 static final char OP_BRANCH = '|'; // node match this alternative or the next one
454 static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
455 static final char OP_STAR = '*'; // node kleene closure
456 static final char OP_PLUS = '+'; // node positive closure
457 static final char OP_MAYBE = '?'; // node optional closure
458 static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
459 static final char OP_OPEN = '('; // number nth opening paren
460 static final char OP_CLOSE = ')'; // number nth closing paren
461 static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
462 static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
463 static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
464 static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
465 static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
466 static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
467 static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
468
469 // Escape codes
470 static final char E_ALNUM = 'w'; // Alphanumeric
471 static final char E_NALNUM = 'W'; // Non-alphanumeric
472 static final char E_BOUND = 'b'; // Word boundary
473 static final char E_NBOUND = 'B'; // Non-word boundary
474 static final char E_SPACE = 's'; // Whitespace
475 static final char E_NSPACE = 'S'; // Non-whitespace
476 static final char E_DIGIT = 'd'; // Digit
477 static final char E_NDIGIT = 'D'; // Non-digit
478
479 // Posix character classes
480 static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
481 static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
482 static final char POSIX_CLASS_BLANK = 'b'; // Blanks
483 static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
484 static final char POSIX_CLASS_DIGIT = 'd'; // Digits
485 static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
486 static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
487 static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
488 static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
489 static final char POSIX_CLASS_SPACE = 's'; // Spaces
490 static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
491 static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
492 static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
493 static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
494
495 // Limits
496 static final int maxNode = 65536; // Maximum number of nodes in a program
497 static final int maxParen = 16; // Number of paren pairs (only 9 can be backrefs)
498
499 // Node layout constants
500 static final int offsetOpcode = 0; // Opcode offset (first character)
501 static final int offsetOpdata = 1; // Opdata offset (second char)
502 static final int offsetNext = 2; // Next index offset (third char)
503 static final int nodeSize = 3; // Node size (in chars)
504
505 /**
506 * Line Separator
507 */
508 static final String NEWLINE = System.getProperty("line.separator");
509
510 // State of current program
511 REProgram program; // Compiled regular expression 'program'
512 CharacterIterator search; // The string being matched against
513 int idx; // Current index in string being searched
514 int matchFlags; // Match behaviour flags
515
516 // Parenthesized subexpressions
517 int parenCount; // Number of subexpressions matched (num open parens + 1)
518 int start0; // Cache of start[0]
519 int end0; // Cache of start[0]
520 int start1; // Cache of start[1]
521 int end1; // Cache of start[1]
522 int start2; // Cache of start[2]
523 int end2; // Cache of start[2]
524 int[] startn; // Lazy-alloced array of sub-expression starts
525 int[] endn; // Lazy-alloced array of sub-expression ends
526
527 // Backreferences
528 int[] startBackref; // Lazy-alloced array of backref starts
529 int[] endBackref; // Lazy-alloced array of backref ends
530
531 /**
532 * Flag bit that indicates that subst should replace all occurrences of this
533 * regular expression.
534 */
535 public static final int REPLACE_ALL = 0x0000;
536
537 /**
538 * Flag bit that indicates that subst should only replace the first occurrence
539 * of this regular expression.
540 */
541 public static final int REPLACE_FIRSTONLY = 0x0001;
542
543 /**
544 * Constructs a regular expression matcher with no initial program.
545 * This is likely to be an uncommon practice, but is still supported.
546 */
547 public RE() {
548 this((REProgram) null, MATCH_NORMAL);
549 }
550
551 /**
552 * Construct a matcher for a pre-compiled regular expression from program
553 * (bytecode) data.
554 *
555 * @param program Compiled regular expression program
556 * @see RECompiler
557 */
558 public RE(REProgram program) {
559 this(program, MATCH_NORMAL);
560 }
561
562 /**
563 * Construct a matcher for a pre-compiled regular expression from program
564 * (bytecode) data. Permits special flags to be passed in to modify matching
565 * behaviour.
566 *
567 * @param program Compiled regular expression program (see RECompiler and/or recompile)
568 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
569 * <p/>
570 * <pre>
571 * <p/>
572 * MATCH_NORMAL // Normal (case-sensitive) matching
573 * MATCH_CASEINDEPENDENT // Case folded comparisons
574 * MATCH_MULTILINE // Newline matches as BOL/EOL
575 * <p/>
576 * </pre>
577 * @see RECompiler
578 * @see REProgram
579 */
580 public RE(REProgram program, int matchFlags) {
581 setProgram(program);
582 setMatchFlags(matchFlags);
583 }
584
585 /**
586 * Constructs a regular expression matcher from a String by compiling it
587 * using a new instance of RECompiler. If you will be compiling many
588 * expressions, you may prefer to use a single RECompiler object instead.
589 *
590 * @param pattern The regular expression pattern to compile.
591 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
592 * @see RECompiler
593 */
594 public RE(String pattern)
595 throws RESyntaxException {
596 this(pattern, MATCH_NORMAL);
597 }
598
599 /**
600 * Constructs a regular expression matcher from a String by compiling it
601 * using a new instance of RECompiler. If you will be compiling many
602 * expressions, you may prefer to use a single RECompiler object instead.
603 *
604 * @param pattern The regular expression pattern to compile.
605 * @param matchFlags The matching style
606 * @throws RESyntaxException Thrown if the regular expression has invalid syntax.
607 * @see RECompiler
608 */
609 public RE(String pattern, int matchFlags)
610 throws RESyntaxException {
611 this(new RECompiler().compile(pattern));
612 setMatchFlags(matchFlags);
613 }
614
615 /**
616 * Performs lazy allocation of subexpression arrays
617 */
618 private final void allocParens() {
619
620 // Allocate arrays for subexpressions
621 startn = new int[maxParen];
622 endn = new int[maxParen];
623
624 // Set sub-expression pointers to invalid values
625 for (int i = 0; i < maxParen; i++) {
626 startn[i] = -1;
627 endn[i] = -1;
628 }
629 }
630
631 /**
632 * Returns the current match behaviour flags.
633 *
634 * @return Current match behaviour flags (RE.MATCH_*).
635 * <p/>
636 * <pre>
637 * <p/>
638 * MATCH_NORMAL // Normal (case-sensitive) matching
639 * MATCH_CASEINDEPENDENT // Case folded comparisons
640 * MATCH_MULTILINE // Newline matches as BOL/EOL
641 * <p/>
642 * </pre>
643 * @see #setMatchFlags
644 */
645 public int getMatchFlags() {
646 return matchFlags;
647 }
648
649 /**
650 * Gets the contents of a parenthesized subexpression after a successful match.
651 *
652 * @param which Nesting level of subexpression
653 * @return String
654 */
655 public String getParen(int which) {
656 if (which < parenCount) {
657 return search.substring(getParenStart(which), getParenEnd(which));
658 }
659
660 return null;
661 }
662
663 /**
664 * Returns the number of parenthesized subexpressions available after a successful match.
665 *
666 * @return Number of available parenthesized subexpressions
667 */
668 public int getParenCount() {
669 return parenCount;
670 }
671
672 /**
673 * Returns the end index of a given paren level.
674 *
675 * @param which Nesting level of subexpression
676 * @return String index
677 */
678 public final int getParenEnd(int which) {
679 if (which < parenCount) {
680 switch (which) {
681 case 0:
682 return end0;
683
684 case 1:
685 return end1;
686
687 case 2:
688 return end2;
689
690 default:
691
692 if (endn == null) {
693 allocParens();
694 }
695
696 return endn[which];
697 }
698 }
699
700 return -1;
701 }
702
703 /**
704 * Returns the length of a given paren level.
705 *
706 * @param which Nesting level of subexpression
707 * @return Number of characters in the parenthesized subexpression
708 */
709 public final int getParenLength(int which) {
710 if (which < parenCount) {
711 return getParenEnd(which) - getParenStart(which);
712 }
713
714 return -1;
715 }
716
717 /**
718 * Returns the start index of a given paren level.
719 *
720 * @param which Nesting level of subexpression
721 * @return String index
722 */
723 public final int getParenStart(int which) {
724 if (which < parenCount) {
725 switch (which) {
726 case 0:
727 return start0;
728
729 case 1:
730 return start1;
731
732 case 2:
733 return start2;
734
735 default:
736
737 if (startn == null) {
738 allocParens();
739 }
740
741 return startn[which];
742 }
743 }
744
745 return -1;
746 }
747
748 /**
749 * Returns the current regular expression program in use by this matcher object.
750 *
751 * @return Regular expression program
752 * @see #setProgram
753 */
754 public REProgram getProgram() {
755 return program;
756 }
757
758 /**
759 * Returns an array of Strings, whose toString representation matches a regular
760 * expression. This method works like the Perl function of the same name. Given
761 * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
762 * aaaab], the array of Strings returned by grep would be [aab, aaaab].
763 *
764 * @param search Array of Objects to search
765 * @return Array of Objects whose toString value matches this regular expression.
766 */
767 public String[] grep(Object[] search) {
768
769 // Create new vector to hold return items
770 Vector v = new Vector();
771
772 // Traverse array of objects
773 for (int i = 0; i < search.length; i++) {
774
775 // Get next object as a string
776 String s = search[i].toString();
777
778 // If it matches this regexp, add it to the list
779 if (match(s)) {
780 v.addElement(s);
781 }
782 }
783
784 // Return vector as an array of strings
785 String[] ret = new String[v.size()];
786 v.copyInto(ret);
787
788 return ret;
789 }
790
791 /**
792 * Throws an Error representing an internal error condition probably resulting
793 * from a bug in the regular expression compiler (or possibly data corruption).
794 * In practice, this should be very rare.
795 *
796 * @param s Error description
797 */
798 protected void internalError(String s)
799 throws Error {
800 throw new Error("RE internal error: " + s);
801 }
802
803 /**
804 * @return true if at the i-th position in the 'search' a newline ends
805 */
806 private boolean isNewline(int i) {
807 if (i < NEWLINE.length() - 1) {
808 return false;
809 }
810 if (search.charAt(i) == '\n') {
811 return true;
812 }
813 for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
814 if (NEWLINE.charAt(j) != search.charAt(i)) {
815 return false;
816 }
817 }
818
819 return true;
820 }
821
822 /**
823 * Matches the current regular expression program against a character array,
824 * starting at a given index.
825 *
826 * @param search String to match against
827 * @param i Index to start searching at
828 * @return True if string matched
829 */
830 public boolean match(CharacterIterator search, int i) {
831
832 // There is no compiled program to search with!
833 if (program == null) {
834
835 // This should be uncommon enough to be an error case rather
836 // than an exception (which would have to be handled everywhere)
837 internalError("No RE program to run!");
838 }
839
840 // Save string to search
841 this.search = search;
842
843 // Can we optimize the search by looking for a prefix string?
844 if (program.prefix == null) {
845
846 // Unprefixed matching must try for a match at each character
847 for (; !search.isEnd(i - 1); i++) {
848
849 // Try a match at index i
850 if (matchAt(i)) {
851 return true;
852 }
853 }
854
855 return false;
856 } else {
857
858 // Prefix-anchored matching is possible
859 char[] prefix = program.prefix;
860
861 for (; !search.isEnd(i + prefix.length - 1); i++) {
862
863 // If the first character of the prefix matches
864 if (search.charAt(i) == prefix[0]) {
865
866 // Save first character position
867 int firstChar = i++;
868 int k;
869
870 for (k = 1; k < prefix.length;) {
871
872 // If there's a mismatch of any character in the prefix, give up
873 if (search.charAt(i++) != prefix[k++]) {
874 break;
875 }
876 }
877 // See if the whole prefix string matched
878 if (k == prefix.length) {
879
880 // We matched the full prefix at firstChar, so try it
881 if (matchAt(firstChar)) {
882 return true;
883 }
884 }
885
886 // Match failed, reset i to continue the search
887 i = firstChar;
888 }
889 }
890
891 return false;
892 }
893 }
894
895 /**
896 * Matches the current regular expression program against a String.
897 *
898 * @param search String to match against
899 * @return True if string matched
900 */
901 public boolean match(String search) {
902 return match(search, 0);
903 }
904
905 /**
906 * Matches the current regular expression program against a character array,
907 * starting at a given index.
908 *
909 * @param search String to match against
910 * @param i Index to start searching at
911 * @return True if string matched
912 */
913 public boolean match(String search, int i) {
914 return match(new StringCharacterIterator(search), i);
915 }
916
917 /**
918 * Match the current regular expression program against the current
919 * input string, starting at index i of the input string. This method
920 * is only meant for internal use.
921 *
922 * @param i The input string index to start matching at
923 * @return True if the input matched the expression
924 */
925 protected boolean matchAt(int i) {
926
927 // Initialize start pointer, paren cache and paren count
928 start0 = -1;
929 end0 = -1;
930 start1 = -1;
931 end1 = -1;
932 start2 = -1;
933 end2 = -1;
934 startn = null;
935 endn = null;
936 parenCount = 1;
937 setParenStart(0, i);
938
939 // Allocate backref arrays (unless optimizations indicate otherwise)
940 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
941 startBackref = new int[maxParen];
942 endBackref = new int[maxParen];
943 }
944
945 // Match against string
946 int idx;
947
948 if ((idx = matchNodes(0, maxNode, i)) != -1) {
949 setParenEnd(0, idx);
950
951 return true;
952 }
953
954 // Didn't match
955 parenCount = 0;
956
957 return false;
958 }
959
960 /**
961 * Try to match a string against a subset of nodes in the program
962 *
963 * @param firstNode Node to start at in program
964 * @param lastNode Last valid node (used for matching a subexpression without
965 * matching the rest of the program as well).
966 * @param idxStart Starting position in character array
967 * @return Final input array index if match succeeded. -1 if not.
968 */
969 protected int matchNodes(int firstNode, int lastNode, int idxStart) {
970
971 // Our current place in the string
972 int idx = idxStart;
973
974 // Loop while node is valid
975 int next;
976 int opcode;
977 int opdata;
978 int idxNew;
979 char[] instruction = program.instruction;
980
981 for (int node = firstNode; node < lastNode;) {
982 opcode = instruction[node + offsetOpcode];
983 next = node + (short) instruction[node + offsetNext];
984 opdata = instruction[node + offsetOpdata];
985
986 switch (opcode) {
987 case OP_RELUCTANTMAYBE:
988 {
989 int once = 0;
990
991 do {
992
993 // Try to match the rest without using the reluctant subexpr
994 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
995 return idxNew;
996 }
997 } while ((once++ == 0) && (idx = matchNodes(node + nodeSize,
998 next, idx)) != -1);
999
1000 return -1;
1001 }
1002 case OP_RELUCTANTPLUS:
1003 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1) {
1004
1005 // Try to match the rest without using the reluctant subexpr
1006 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1007 return idxNew;
1008 }
1009 }
1010
1011 return -1;
1012
1013 case OP_RELUCTANTSTAR:
1014 do {
1015
1016 // Try to match the rest without using the reluctant subexpr
1017 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1018 return idxNew;
1019 }
1020 } while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
1021
1022 return -1;
1023
1024 case OP_OPEN:
1025
1026 // Match subexpression
1027 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
1028 startBackref[opdata] = idx;
1029 }
1030 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1031
1032 // Increase valid paren count
1033 if ((opdata + 1) > parenCount) {
1034 parenCount = opdata + 1;
1035 }
1036 // Don't set paren if already set later on
1037 if (getParenStart(opdata) == -1) {
1038 setParenStart(opdata, idx);
1039 }
1040 }
1041
1042 return idxNew;
1043
1044 case OP_CLOSE:
1045
1046 // Done matching subexpression
1047 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0) {
1048 endBackref[opdata] = idx;
1049 }
1050 if ((idxNew = matchNodes(next, maxNode, idx)) != -1) {
1051
1052 // Increase valid paren count
1053 if ((opdata + 1) > parenCount) {
1054 parenCount = opdata + 1;
1055 }
1056 // Don't set paren if already set later on
1057 if (getParenEnd(opdata) == -1) {
1058 setParenEnd(opdata, idx);
1059 }
1060 }
1061
1062 return idxNew;
1063
1064 case OP_BACKREF:
1065 {
1066
1067 // Get the start and end of the backref
1068 int s = startBackref[opdata];
1069 int e = endBackref[opdata];
1070
1071 // We don't know the backref yet
1072 if (s == -1 || e == -1) {
1073 return -1;
1074 }
1075 // The backref is empty size
1076 if (s == e) {
1077 break;
1078 }
1079
1080 // Get the length of the backref
1081 int l = e - s;
1082
1083 // If there's not enough input left, give up.
1084 if (search.isEnd(idx + l)) {
1085 return -1;
1086 }
1087 // Case fold the backref?
1088 if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
1089
1090 // Compare backref to input, case-folding as we go
1091 for (int i = 0; i < l; i++) {
1092 if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(
1093 s + i))) {
1094 return -1;
1095 }
1096 }
1097 } else {
1098
1099 // Compare backref to input
1100 for (int i = 0; i < l; i++) {
1101 if (search.charAt(idx++) != search.charAt(s + i)) {
1102 return -1;
1103 }
1104 }
1105 }
1106 }
1107
1108 break;
1109
1110 case OP_BOL:
1111
1112 // Fail if we're not at the start of the string
1113 if (idx != 0) {
1114
1115 // If we're multiline matching, we could still be at the start of a line
1116 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
1117
1118 // If not at start of line, give up
1119 if (idx <= 0 || !isNewline(idx - 1)) {
1120 return -1;
1121 } else {
1122 break;
1123 }
1124 }
1125
1126 return -1;
1127 }
1128
1129 break;
1130
1131 case OP_EOL:
1132
1133 // If we're not at the end of string
1134 if (!search.isEnd(0) && !search.isEnd(idx)) {
1135
1136 // If we're multi-line matching
1137 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE) {
1138
1139 // Give up if we're not at the end of a line
1140 if (!isNewline(idx)) {
1141 return -1;
1142 } else {
1143 break;
1144 }
1145 }
1146
1147 return -1;
1148 }
1149
1150 break;
1151
1152 case OP_ESCAPE:
1153
1154 // Which escape?
1155 switch (opdata) {
1156
1157 // Word boundary match
1158 case E_NBOUND:
1159 case E_BOUND:
1160 {
1161 char cLast = ((idx == getParenStart(0))
1162 ? '\n' : search.charAt(idx - 1));
1163 char cNext = ((search.isEnd(idx))
1164 ? '\n' : search.charAt(idx));
1165
1166 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND)) {
1167 return -1;
1168 }
1169 }
1170
1171 break;
1172
1173 // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1174 case E_ALNUM:
1175 case E_NALNUM:
1176 case E_DIGIT:
1177 case E_NDIGIT:
1178 case E_SPACE:
1179 case E_NSPACE:
1180
1181 // Give up if out of input
1182 if (search.isEnd(idx)) {
1183 return -1;
1184 }
1185 // Switch on escape
1186 switch (opdata) {
1187 case E_ALNUM:
1188 case E_NALNUM:
1189 if (!(Character.isLetterOrDigit(search.charAt(idx)) == (opdata == E_ALNUM))) {
1190 return -1;
1191 }
1192
1193 break;
1194
1195 case E_DIGIT:
1196 case E_NDIGIT:
1197 if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT))) {
1198 return -1;
1199 }
1200
1201 break;
1202
1203 case E_SPACE:
1204 case E_NSPACE:
1205 if (!(Character.isWhitespace(search.charAt(idx)) == (opdata == E_SPACE))) {
1206 return -1;
1207 }
1208
1209 break;
1210 }
1211
1212 idx++;
1213 break;
1214
1215 default:
1216 internalError("Unrecognized escape '" + opdata +
1217 "'");
1218 }
1219
1220 break;
1221
1222 case OP_ANY:
1223
1224 // Match anything but a newline
1225 if (search.isEnd(idx) || search.charAt(idx++) == '\n') {
1226 return -1;
1227 }
1228
1229 break;
1230
1231 case OP_ATOM:
1232 {
1233
1234 // Match an atom value
1235 if (search.isEnd(idx)) {
1236 return -1;
1237 }
1238
1239 // Get length of atom and starting index
1240 int lenAtom = opdata;
1241 int startAtom = node + nodeSize;
1242
1243 // Give up if not enough input remains to have a match
1244 if (search.isEnd(lenAtom + idx - 1)) {
1245 return -1;
1246 }
1247 // Match atom differently depending on casefolding flag
1248 if ((matchFlags & MATCH_CASEINDEPENDENT) != 0) {
1249 for (int i = 0; i < lenAtom; i++) {
1250 if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(
1251 instruction[startAtom + i])) {
1252 return -1;
1253 }
1254 }
1255 } else {
1256 for (int i = 0; i < lenAtom; i++) {
1257 if (search.charAt(idx++) != instruction[startAtom + i]) {
1258 return -1;
1259 }
1260 }
1261 }
1262 }
1263
1264 break;
1265
1266 case OP_POSIXCLASS:
1267 {
1268
1269 // Out of input?
1270 if (search.isEnd(idx)) {
1271 return -1;
1272 }
1273 switch (opdata) {
1274 case POSIX_CLASS_ALNUM:
1275 if (!Character.isLetterOrDigit(search.charAt(idx))) {
1276 return -1;
1277 }
1278
1279 break;
1280
1281 case POSIX_CLASS_ALPHA:
1282 if (!Character.isLetter(search.charAt(idx))) {
1283 return -1;
1284 }
1285
1286 break;
1287
1288 case POSIX_CLASS_DIGIT:
1289 if (!Character.isDigit(search.charAt(idx))) {
1290 return -1;
1291 }
1292
1293 break;
1294
1295 case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1296 if (!Character.isSpaceChar(search.charAt(idx))) {
1297 return -1;
1298 }
1299
1300 break;
1301
1302 case POSIX_CLASS_SPACE:
1303 if (!Character.isWhitespace(search.charAt(idx))) {
1304 return -1;
1305 }
1306
1307 break;
1308
1309 case POSIX_CLASS_CNTRL:
1310 if (Character.getType(search.charAt(idx)) != Character.CONTROL) {
1311 return -1;
1312 }
1313
1314 break;
1315
1316 case POSIX_CLASS_GRAPH: // JWL - bugbug???
1317 switch (Character.getType(search.charAt(idx))) {
1318 case Character.MATH_SYMBOL:
1319 case Character.CURRENCY_SYMBOL:
1320 case Character.MODIFIER_SYMBOL:
1321 case Character.OTHER_SYMBOL:
1322 break;
1323
1324 default:
1325 return -1;
1326 }
1327
1328 break;
1329
1330 case POSIX_CLASS_LOWER:
1331 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER) {
1332 return -1;
1333 }
1334
1335 break;
1336
1337 case POSIX_CLASS_UPPER:
1338 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER) {
1339 return -1;
1340 }
1341
1342 break;
1343
1344 case POSIX_CLASS_PRINT:
1345 if (Character.getType(search.charAt(idx)) == Character.CONTROL) {
1346 return -1;
1347 }
1348
1349 break;
1350
1351 case POSIX_CLASS_PUNCT:
1352 {
1353 int type = Character.getType(search.charAt(idx));
1354
1355 switch (type) {
1356 case Character.DASH_PUNCTUATION:
1357 case Character.START_PUNCTUATION:
1358 case Character.END_PUNCTUATION:
1359 case Character.CONNECTOR_PUNCTUATION:
1360 case Character.OTHER_PUNCTUATION:
1361 break;
1362
1363 default:
1364 return -1;
1365 }
1366 }
1367
1368 break;
1369
1370 case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1371 {
1372 boolean isXDigit = ((search.charAt(idx) >= '0' &&
1373 search.charAt(idx) <= '9') ||
1374 (search.charAt(idx) >= 'a' &&
1375 search.charAt(idx) <= 'f') ||
1376 (search.charAt(idx) >= 'A' &&
1377 search.charAt(idx) <= 'F'));
1378
1379 if (!isXDigit) {
1380 return -1;
1381 }
1382 }
1383
1384 break;
1385
1386 case POSIX_CLASS_JSTART:
1387 if (!Character.isJavaIdentifierStart(search.charAt(idx))) {
1388 return -1;
1389 }
1390
1391 break;
1392
1393 case POSIX_CLASS_JPART:
1394 if (!Character.isJavaIdentifierPart(search.charAt(idx))) {
1395 return -1;
1396 }
1397
1398 break;
1399
1400 default:
1401 internalError("Bad posix class");
1402 break;
1403 }
1404
1405 // Matched.
1406 idx++;
1407 }
1408
1409 break;
1410
1411 case OP_ANYOF:
1412 {
1413
1414 // Out of input?
1415 if (search.isEnd(idx)) {
1416 return -1;
1417 }
1418
1419 // Get character to match against character class and maybe casefold
1420 char c = search.charAt(idx);
1421 boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1422
1423 if (caseFold) {
1424 c = Character.toLowerCase(c);
1425 }
1426
1427 // Loop through character class checking our match character
1428 int idxRange = node + nodeSize;
1429 int idxEnd = idxRange + (opdata * 2);
1430 boolean match = false;
1431
1432 for (int i = idxRange; i < idxEnd;) {
1433
1434 // Get start, end and match characters
1435 char s = instruction[i++];
1436 char e = instruction[i++];
1437
1438 // Fold ends of range and match character
1439 if (caseFold) {
1440 s = Character.toLowerCase(s);
1441 e = Character.toLowerCase(e);
1442 }
1443 // If the match character is in range, break out
1444 if (c >= s && c <= e) {
1445 match = true;
1446 break;
1447 }
1448 }
1449 // Fail if we didn't match the character class
1450 if (!match) {
1451 return -1;
1452 }
1453
1454 idx++;
1455 }
1456
1457 break;
1458
1459 case OP_BRANCH:
1460 {
1461
1462 // Check for choices
1463 if (instruction[next + offsetOpcode] != OP_BRANCH) {
1464
1465 // If there aren't any other choices, just evaluate this branch.
1466 node += nodeSize;
1467 continue;
1468 }
1469
1470 // Try all available branches
1471 short nextBranch;
1472
1473 do {
1474
1475 // Try matching the branch against the string
1476 if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1) {
1477 return idxNew;
1478 }
1479
1480 // Go to next branch (if any)
1481 nextBranch = (short) instruction[node + offsetNext];
1482 node += nextBranch;
1483 } while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1484 // Failed to match any branch!
1485 return -1;
1486 }
1487 case OP_NOTHING:
1488 case OP_GOTO:
1489
1490 // Just advance to the next node without doing anything
1491 break;
1492
1493 case OP_END:
1494
1495 // Match has succeeded!
1496 setParenEnd(0, idx);
1497
1498 return idx;
1499
1500 default:
1501
1502 // Corrupt program
1503 internalError("Invalid opcode '" + opcode + "'");
1504 }
1505
1506 // Advance to the next node in the program
1507 node = next;
1508 }
1509
1510 // We "should" never end up here
1511 internalError("Corrupt program");
1512
1513 return -1;
1514 }
1515
1516 /**
1517 * Sets match behaviour flags which alter the way RE does matching.
1518 *
1519 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
1520 * <p/>
1521 * <pre>
1522 * <p/>
1523 * MATCH_NORMAL // Normal (case-sensitive) matching
1524 * MATCH_CASEINDEPENDENT // Case folded comparisons
1525 * MATCH_MULTILINE // Newline matches as BOL/EOL
1526 * <p/>
1527 * </pre>
1528 */
1529 public void setMatchFlags(int matchFlags) {
1530 this.matchFlags = m