1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.regexp;
19
20 import java.io.Serializable;
21 import java.util.Vector;
22
23 /**
24 * RE is an efficient, lightweight regular expression evaluator/matcher
25 * class. Regular expressions are pattern descriptions which enable
26 * sophisticated matching of strings. In addition to being able to
27 * match a string against a pattern, you can also extract parts of the
28 * match. This is especially useful in text parsing! Details on the
29 * syntax of regular expression patterns are given below.
30 *
31 * <p>
32 * To compile a regular expression (RE), you can simply construct an RE
33 * matcher object from the string specification of the pattern, like this:
34 *
35 * <pre>
36 * RE r = new RE("a*b");
37 * </pre>
38 *
39 * <p>
40 * Once you have done this, you can call either of the RE.match methods to
41 * perform matching on a String. For example:
42 *
43 * <pre>
44 * boolean matched = r.match("aaaab");
45 * </pre>
46 *
47 * will cause the boolean matched to be set to true because the
48 * pattern "a*b" matches the string "aaaab".
49 *
50 * <p>
51 * If you were interested in the <i>number</i> of a's which matched the
52 * first part of our example expression, you could change the expression to
53 * "(a*)b". Then when you compiled the expression and matched it against
54 * something like "xaaaab", you would get results like this:
55 *
56 * <pre>
57 * RE r = new RE("(a*)b"); // Compile expression
58 * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
59 *
60 * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
61 * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
62 *
63 * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
64 * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6
65 * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5
66 *
67 * int startInside = r.getParenStart(1); // startInside will be index 1
68 * int endInside = r.getParenEnd(1); // endInside will be index 5
69 * int lenInside = r.getParenLength(1); // lenInside will be 4
70 * </pre>
71 *
72 * You can also refer to the contents of a parenthesized expression
73 * within a regular expression itself. This is called a
74 * 'backreference'. The first backreference in a regular expression is
75 * denoted by \1, the second by \2 and so on. So the expression:
76 *
77 * <pre>
78 * ([0-9]+)=\1
79 * </pre>
80 *
81 * will match any string of the form n=n (like 0=0 or 2=2).
82 *
83 * <p>
84 * The full regular expression syntax accepted by RE is described here:
85 *
86 * <pre>
87 *
88 * <b><font face=times roman>Characters</font></b>
89 *
90 * <i>unicodeChar</i> Matches any identical unicode character
91 * \ Used to quote a meta-character (like '*')
92 * \\ Matches a single '\' character
93 * \0nnn Matches a given octal character
94 * \xhh Matches a given 8-bit hexadecimal character
95 * \\uhhhh Matches a given 16-bit hexadecimal character
96 * \t Matches an ASCII tab character
97 * \n Matches an ASCII newline character
98 * \r Matches an ASCII return character
99 * \f Matches an ASCII form feed character
100 *
101 *
102 * <b><font face=times roman>Character Classes</font></b>
103 *
104 * [abc] Simple character class
105 * [a-zA-Z] Character class with ranges
106 * [^abc] Negated character class
107 * </pre>
108 *
109 * <b>NOTE:</b> Incomplete ranges will be interpreted as "starts
110 * from zero" or "ends with last character".
111 * <br>
112 * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
113 * [-] means "all characters".
114 *
115 * <pre>
116 *
117 * <b><font face=times roman>Standard POSIX Character Classes</font></b>
118 *
119 * [:alnum:] Alphanumeric characters.
120 * [:alpha:] Alphabetic characters.
121 * [:blank:] Space and tab characters.
122 * [:cntrl:] Control characters.
123 * [:digit:] Numeric characters.
124 * [:graph:] Characters that are printable and are also visible.
125 * (A space is printable, but not visible, while an
126 * `a' is both.)
127 * [:lower:] Lower-case alphabetic characters.
128 * [:print:] Printable characters (characters that are not
129 * control characters.)
130 * [:punct:] Punctuation characters (characters that are not letter,
131 * digits, control characters, or space characters).
132 * [:space:] Space characters (such as space, tab, and formfeed,
133 * to name a few).
134 * [:upper:] Upper-case alphabetic characters.
135 * [:xdigit:] Characters that are hexadecimal digits.
136 *
137 *
138 * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
139 *
140 * [:javastart:] Start of a Java identifier
141 * [:javapart:] Part of a Java identifier
142 *
143 *
144 * <b><font face=times roman>Predefined Classes</font></b>
145 *
146 * . Matches any character other than newline
147 * \w Matches a "word" character (alphanumeric plus "_")
148 * \W Matches a non-word character
149 * \s Matches a whitespace character
150 * \S Matches a non-whitespace character
151 * \d Matches a digit character
152 * \D Matches a non-digit character
153 *
154 *
155 * <b><font face=times roman>Boundary Matchers</font></b>
156 *
157 * ^ Matches only at the beginning of a line
158 * $ Matches only at the end of a line
159 * \b Matches only at a word boundary
160 * \B Matches only at a non-word boundary
161 *
162 *
163 * <b><font face=times roman>Greedy Closures</font></b>
164 *
165 * A* Matches A 0 or more times (greedy)
166 * A+ Matches A 1 or more times (greedy)
167 * A? Matches A 1 or 0 times (greedy)
168 * A{n} Matches A exactly n times (greedy)
169 * A{n,} Matches A at least n times (greedy)
170 * A{n,m} Matches A at least n but not more than m times (greedy)
171 *
172 *
173 * <b><font face=times roman>Reluctant Closures</font></b>
174 *
175 * A*? Matches A 0 or more times (reluctant)
176 * A+? Matches A 1 or more times (reluctant)
177 * A?? Matches A 0 or 1 times (reluctant)
178 *
179 *
180 * <b><font face=times roman>Logical Operators</font></b>
181 *
182 * AB Matches A followed by B
183 * A|B Matches either A or B
184 * (A) Used for subexpression grouping
185 * (?:A) Used for subexpression clustering (just like grouping but
186 * no backrefs)
187 *
188 *
189 * <b><font face=times roman>Backreferences</font></b>
190 *
191 * \1 Backreference to 1st parenthesized subexpression
192 * \2 Backreference to 2nd parenthesized subexpression
193 * \3 Backreference to 3rd parenthesized subexpression
194 * \4 Backreference to 4th parenthesized subexpression
195 * \5 Backreference to 5th parenthesized subexpression
196 * \6 Backreference to 6th parenthesized subexpression
197 * \7 Backreference to 7th parenthesized subexpression
198 * \8 Backreference to 8th parenthesized subexpression
199 * \9 Backreference to 9th parenthesized subexpression
200 * </pre>
201 *
202 * <p>
203 * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
204 * that they match as many elements of the string as possible without
205 * causing the overall match to fail. If you want a closure to be
206 * reluctant (non-greedy), you can simply follow it with a '?'. A
207 * reluctant closure will match as few elements of the string as
208 * possible when finding matches. {m,n} closures don't currently
209 * support reluctancy.
210 *
211 * <p>
212 * <b><font face="times roman">Line terminators</font></b>
213 * <br>
214 * A line terminator is a one- or two-character sequence that marks
215 * the end of a line of the input character sequence. The following
216 * are recognized as line terminators:
217 * <ul>
218 * <li>A newline (line feed) character ('\n'),</li>
219 * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
220 * <li>A standalone carriage-return character ('\r'),</li>
221 * <li>A next-line character ('\u0085'),</li>
222 * <li>A line-separator character ('\u2028'), or</li>
223 * <li>A paragraph-separator character ('\u2029).</li>
224 * </ul>
225 *
226 * <p>
227 * RE runs programs compiled by the RECompiler class. But the RE
228 * matcher class does not include the actual regular expression compiler
229 * for reasons of efficiency. In fact, if you want to pre-compile one
230 * or more regular expressions, the 'recompile' class can be invoked
231 * from the command line to produce compiled output like this:
232 *
233 * <pre>
234 * // Pre-compiled regular expression "a*b"
235 * char[] re1Instructions =
236 * {
237 * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
238 * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
239 * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
240 * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
241 * 0x0000,
242 * };
243 *
244 *
245 * REProgram re1 = new REProgram(re1Instructions);
246 * </pre>
247 *
248 * You can then construct a regular expression matcher (RE) object from
249 * the pre-compiled expression re1 and thus avoid the overhead of
250 * compiling the expression at runtime. If you require more dynamic
251 * regular expressions, you can construct a single RECompiler object and
252 * re-use it to compile each expression. Similarly, you can change the
253 * program run by a given matcher object at any time. However, RE and
254 * RECompiler are not threadsafe (for efficiency reasons, and because
255 * requiring thread safety in this class is deemed to be a rare
256 * requirement), so you will need to construct a separate compiler or
257 * matcher object for each thread (unless you do thread synchronization
258 * yourself). Once expression compiled into the REProgram object, REProgram
259 * can be safely shared across multiple threads and RE objects.
260 *
261 * <br><p><br>
262 *
263 * <font color="red">
264 * <i>ISSUES:</i>
265 *
266 * <ul>
267 * <li>com.weusours.util.re is not currently compatible with all
268 * standard POSIX regcomp flags</li>
269 * <li>com.weusours.util.re does not support POSIX equivalence classes
270 * ([=foo=] syntax) (I18N/locale issue)</li>
271 * <li>com.weusours.util.re does not support nested POSIX character
272 * classes (definitely should, but not completely trivial)</li>
273 * <li>com.weusours.util.re Does not support POSIX character collation
274 * concepts ([.foo.] syntax) (I18N/locale issue)</li>
275 * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
276 * <li>Should RE support character iterators (for backwards RE matching!)?</li>
277 * <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
278 * <li>Not *all* possibilities are considered for greediness when backreferences
279 * are involved (as POSIX suggests should be the case). The POSIX RE
280 * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
281 * of acdacaa where \1 is "a". This is not the case in this RE package,
282 * and actually Perl doesn't go to this extent either! Until someone
283 * actually complains about this, I'm not sure it's worth "fixing".
284 * If it ever is fixed, test #137 in RETest.txt should be updated.</li>
285 * </ul>
286 *
287 * </font>
288 *
289 * @see recompile
290 * @see RECompiler
291 *
292 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
293 * @author <a href="mailto:ts@sch-fer.de">Tobias Schäfer</a>
294 * @version $Id: RE.java 518156 2007-03-14 14:31:26Z vgritsenko $
295 */
296 public class RE implements Serializable
297 {
298 /**
299 * Specifies normal, case-sensitive matching behaviour.
300 */
301 public static final int MATCH_NORMAL = 0x0000;
302
303 /**
304 * Flag to indicate that matching should be case-independent (folded)
305 */
306 public static final int MATCH_CASEINDEPENDENT = 0x0001;
307
308 /**
309 * Newlines should match as BOL/EOL (^ and $)
310 */
311 public static final int MATCH_MULTILINE = 0x0002;
312
313 /**
314 * Consider all input a single body of text - newlines are matched by .
315 */
316 public static final int MATCH_SINGLELINE = 0x0004;
317
318 /************************************************
319 * *
320 * The format of a node in a program is: *
321 * *
322 * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
323 * *
324 * char OPCODE - instruction *
325 * char OPDATA - modifying data *
326 * char OPNEXT - next node (relative offset) *
327 * *
328 ************************************************/
329
330 // Opcode Char Opdata/Operand Meaning
331 // ---------- ---------- --------------- --------------------------------------------------
332 static final char OP_END = 'E'; // end of program
333 static final char OP_BOL = '^'; // match only if at beginning of line
334 static final char OP_EOL = '$'; // match only if at end of line
335 static final char OP_ANY = '.'; // match any single character except newline
336 static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
337 static final char OP_BRANCH = '|'; // node match this alternative or the next one
338 static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
339 static final char OP_STAR = '*'; // node kleene closure
340 static final char OP_PLUS = '+'; // node positive closure
341 static final char OP_MAYBE = '?'; // node optional closure
342 static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
343 static final char OP_OPEN = '('; // number nth opening paren
344 static final char OP_OPEN_CLUSTER = '<'; // opening cluster
345 static final char OP_CLOSE = ')'; // number nth closing paren
346 static final char OP_CLOSE_CLUSTER = '>'; // closing cluster
347 static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
348 static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
349 static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
350 static final char OP_CONTINUE = 'C'; // continue to the following command (ignore next)
351 static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
352 static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
353 static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
354 static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
355
356 // Escape codes
357 static final char E_ALNUM = 'w'; // Alphanumeric
358 static final char E_NALNUM = 'W'; // Non-alphanumeric
359 static final char E_BOUND = 'b'; // Word boundary
360 static final char E_NBOUND = 'B'; // Non-word boundary
361 static final char E_SPACE = 's'; // Whitespace
362 static final char E_NSPACE = 'S'; // Non-whitespace
363 static final char E_DIGIT = 'd'; // Digit
364 static final char E_NDIGIT = 'D'; // Non-digit
365
366 // Posix character classes
367 static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
368 static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
369 static final char POSIX_CLASS_BLANK = 'b'; // Blanks
370 static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
371 static final char POSIX_CLASS_DIGIT = 'd'; // Digits
372 static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
373 static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
374 static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
375 static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
376 static final char POSIX_CLASS_SPACE = 's'; // Spaces
377 static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
378 static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
379 static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
380 static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
381
382 // Limits
383 static final int maxNode = 65536; // Maximum number of nodes in a program
384 static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs)
385
386 // Node layout constants
387 static final int offsetOpcode = 0; // Opcode offset (first character)
388 static final int offsetOpdata = 1; // Opdata offset (second char)
389 static final int offsetNext = 2; // Next index offset (third char)
390 static final int nodeSize = 3; // Node size (in chars)
391
392 // State of current program
393 REProgram program; // Compiled regular expression 'program'
394 transient CharacterIterator search; // The string being matched against
395 int matchFlags; // Match behaviour flags
396 int maxParen = MAX_PAREN;
397
398 // Parenthesized subexpressions
399 transient int parenCount; // Number of subexpressions matched (num open parens + 1)
400 transient int start0; // Cache of start[0]
401 transient int end0; // Cache of start[0]
402 transient int start1; // Cache of start[1]
403 transient int end1; // Cache of start[1]
404 transient int start2; // Cache of start[2]
405 transient int end2; // Cache of start[2]
406 transient int[] startn; // Lazy-alloced array of sub-expression starts
407 transient int[] endn; // Lazy-alloced array of sub-expression ends
408
409 // Backreferences
410 transient int[] startBackref; // Lazy-alloced array of backref starts
411 transient int[] endBackref; // Lazy-alloced array of backref ends
412
413 /**
414 * Constructs a regular expression matcher from a String by compiling it
415 * using a new instance of RECompiler. If you will be compiling many
416 * expressions, you may prefer to use a single RECompiler object instead.
417 *
418 * @param pattern The regular expression pattern to compile.
419 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
420 * @see RECompiler
421 * @see recompile
422 */
423 public RE(String pattern) throws RESyntaxException
424 {
425 this(pattern, MATCH_NORMAL);
426 }
427
428 /**
429 * Constructs a regular expression matcher from a String by compiling it
430 * using a new instance of RECompiler. If you will be compiling many
431 * expressions, you may prefer to use a single RECompiler object instead.
432 *
433 * @param pattern The regular expression pattern to compile.
434 * @param matchFlags The matching style
435 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
436 * @see RECompiler
437 * @see recompile
438 */
439 public RE(String pattern, int matchFlags) throws RESyntaxException
440 {
441 this(new RECompiler().compile(pattern), matchFlags);
442 }
443
444 /**
445 * Construct a matcher for a pre-compiled regular expression from program
446 * (bytecode) data. Permits special flags to be passed in to modify matching
447 * behaviour.
448 *
449 * @param program Compiled regular expression program (see RECompiler and/or recompile)
450 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
451 *
452 * <pre>
453 * MATCH_NORMAL // Normal (case-sensitive) matching
454 * MATCH_CASEINDEPENDENT // Case folded comparisons
455 * MATCH_MULTILINE // Newline matches as BOL/EOL
456 * </pre>
457 *
458 * @see RECompiler
459 * @see REProgram
460 * @see recompile
461 */
462 public RE(REProgram program, int matchFlags)
463 {
464 setProgram(program);
465 setMatchFlags(matchFlags);
466 }
467
468 /**
469 * Construct a matcher for a pre-compiled regular expression from program
470 * (bytecode) data.
471 *
472 * @param program Compiled regular expression program
473 * @see RECompiler
474 * @see recompile
475 */
476 public RE(REProgram program)
477 {
478 this(program, MATCH_NORMAL);
479 }
480
481 /**
482 * Constructs a regular expression matcher with no initial program.
483 * This is likely to be an uncommon practice, but is still supported.
484 */
485 public RE()
486 {
487 this((REProgram) null, MATCH_NORMAL);
488 }
489
490 /**
491 * Converts a 'simplified' regular expression to a full regular expression
492 *
493 * @param pattern The pattern to convert
494 * @return The full regular expression
495 */
496 public static String simplePatternToFullRegularExpression(String pattern)
497 {
498 StringBuffer buf = new StringBuffer();
499 for (int i = 0; i < pattern.length(); i++)
500 {
501 char c = pattern.charAt(i);
502 switch (c)
503 {
504 case '*':
505 buf.append(".*");
506 break;
507
508 case '.':
509 case '[':
510 case ']':
511 case '\\':
512 case '+':
513 case '?':
514 case '{':
515 case '}':
516 case '$':
517 case '^':
518 case '|':
519 case '(':
520 case ')':
521 buf.append('\\');
522 default:
523 buf.append(c);
524 break;
525 }
526 }
527 return buf.toString();
528 }
529
530 /**
531 * Sets match behaviour flags which alter the way RE does matching.
532 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
533 *
534 * <pre>
535 * MATCH_NORMAL // Normal (case-sensitive) matching
536 * MATCH_CASEINDEPENDENT // Case folded comparisons
537 * MATCH_MULTILINE // Newline matches as BOL/EOL
538 * </pre>
539 */
540 public void setMatchFlags(int matchFlags)
541 {
542 this.matchFlags = matchFlags;
543 }
544
545 /**
546 * Returns the current match behaviour flags.
547 * @return Current match behaviour flags (RE.MATCH_*).
548 *
549 * <pre>
550 * MATCH_NORMAL // Normal (case-sensitive) matching
551 * MATCH_CASEINDEPENDENT // Case folded comparisons
552 * MATCH_MULTILINE // Newline matches as BOL/EOL
553 * </pre>
554 *
555 * @see #setMatchFlags
556 */
557 public int getMatchFlags()
558 {
559 return matchFlags;
560 }
561
562 /**
563 * Sets the current regular expression program used by this matcher object.
564 *
565 * @param program Regular expression program compiled by RECompiler.
566 * @see RECompiler
567 * @see REProgram
568 * @see recompile
569 */
570 public void setProgram(REProgram program)
571 {
572 this.program = program;
573 if (program != null && program.maxParens != -1) {
574 this.maxParen = program.maxParens;
575 } else {
576 this.maxParen = MAX_PAREN;
577 }
578 }
579
580 /**
581 * Returns the current regular expression program in use by this matcher object.
582 *
583 * @return Regular expression program
584 * @see #setProgram
585 */
586 public REProgram getProgram()
587 {
588 return program;
589 }
590
591 /**
592 * Returns the number of parenthesized subexpressions available after a successful match.
593 *
594 * @return Number of available parenthesized subexpressions
595 */
596 public int getParenCount()
597 {
598 return parenCount;
599 }
600
601 /**
602 * Gets the contents of a parenthesized subexpression after a successful match.
603 *
604 * @param which Nesting level of subexpression
605 * @return String
606 */
607 public String getParen(int which)
608 {
609 int start;
610 if (which < parenCount && (start = getParenStart(which)) >= 0)
611 {
612 return search.substring(start, getParenEnd(which));
613 }
614 return null;
615 }
616
617 /**
618 * Returns the start index of a given paren level.
619 *
620 * @param which Nesting level of subexpression
621 * @return String index
622 */
623 public final int getParenStart(int which)
624 {
625 if (which < parenCount)
626 {
627 switch (which)
628 {
629 case 0:
630 return start0;
631
632 case 1:
633 return start1;
634
635 case 2:
636 return start2;
637
638 default:
639 if (startn == null)
640 {
641 allocParens();
642 }
643 return startn[which];
644 }
645 }
646 return -1;
647 }
648
649 /**
650 * Returns the end index of a given paren level.
651 *
652 * @param which Nesting level of subexpression
653 * @return String index
654 */
655 public final int getParenEnd(int which)
656 {
657 if (which < parenCount)
658 {
659 switch (which)
660 {
661 case 0:
662 return end0;
663
664 case 1:
665 return end1;
666
667 case 2:
668 return end2;
669
670 default:
671 if (endn == null)
672 {
673 allocParens();
674 }
675 return endn[which];
676 }
677 }
678 return -1;
679 }
680
681 /**
682 * Returns the length of a given paren level.
683 *
684 * @param which Nesting level of subexpression
685 * @return Number of characters in the parenthesized subexpression
686 */
687 public final int getParenLength(int which)
688 {
689 if (which < parenCount)
690 {
691 return getParenEnd(which) - getParenStart(which);
692 }
693 return -1;
694 }
695
696 /**
697 * Sets the start of a paren level
698 *
699 * @param which Which paren level
700 * @param i Index in input array
701 */
702 protected final void setParenStart(int which, int i)
703 {
704 if (which < parenCount)
705 {
706 switch (which)
707 {
708 case 0:
709 start0 = i;
710 break;
711
712 case 1:
713 start1 = i;
714 break;
715
716 case 2:
717 start2 = i;
718 break;
719
720 default:
721 if (startn == null)
722 {
723 allocParens();
724 }
725 startn[which] = i;
726 break;
727 }
728 }
729 }
730
731 /**
732 * Sets the end of a paren level
733 *
734 * @param which Which paren level
735 * @param i Index in input array
736 */
737 protected final void setParenEnd(int which, int i)
738 {
739 if (which < parenCount)
740 {
741 switch (which)
742 {
743 case 0:
744 end0 = i;
745 break;
746
747 case 1:
748 end1 = i;
749 break;
750
751 case 2:
752 end2 = i;
753 break;
754
755 default:
756 if (endn == null)
757 {
758 allocParens();
759 }
760 endn[which] = i;
761 break;
762 }
763 }
764 }
765
766 /**
767 * Throws an Error representing an internal error condition probably resulting
768 * from a bug in the regular expression compiler (or possibly data corruption).
769 * In practice, this should be very rare.
770 *
771 * @param s Error description
772 */
773 protected void internalError(String s) throws Error
774 {
775 throw new Error("RE internal error: " + s);
776 }
777
778 /**
779 * Performs lazy allocation of subexpression arrays
780 */
781 private void allocParens()
782 {
783 // Allocate arrays for subexpressions
784 startn = new int[maxParen];
785 endn = new int[maxParen];
786
787 // Set sub-expression pointers to invalid values
788 for (int i = 0; i < maxParen; i++)
789 {
790 startn[i] = -1;
791 endn[i] = -1;
792 }
793 }
794
795 /**
796 * Try to match a string against a subset of nodes in the program
797 *
798 * @param firstNode Node to start at in program
799 * @param lastNode Last valid node (used for matching a subexpression without
800 * matching the rest of the program as well).
801 * @param idxStart Starting position in character array
802 * @return Final input array index if match succeeded. -1 if not.
803 */
804 protected int matchNodes(int firstNode, int lastNode, int idxStart)
805 {
806 // Our current place in the string
807 int idx = idxStart;
808
809 // Loop while node is valid
810 int next, opcode, opdata;
811 int idxNew;
812 char[] instruction = program.instruction;
813 for (int node = firstNode; node < lastNode; )
814 {
815 opcode = instruction[node /* + offsetOpcode */];
816 next = node + (short) instruction[node + offsetNext];
817 opdata = instruction[node + offsetOpdata];
818
819 switch (opcode)
820 {
821 case OP_MAYBE:
822 case OP_STAR:
823 {
824 // Try to match the following subexpr. If it matches:
825 // MAYBE: Continues matching rest of the expression
826 // STAR: Points back here to repeat subexpr matching
827 if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
828 {
829 return idxNew;
830 }
831
832 // If failed, just continue with the rest of expression
833 break;
834 }
835
836 case OP_PLUS:
837 {
838 // Try to match the subexpr again (and again (and ...
839 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
840 {
841 return idxNew;
842 }
843
844 // If failed, just continue with the rest of expression
845 // Rest is located at the next pointer of the next instruction
846 // (which must be OP_CONTINUE)
847 node = next + (short) instruction[next + offsetNext];
848 continue;
849 }
850
851 case OP_RELUCTANTMAYBE:
852 case OP_RELUCTANTSTAR:
853 {
854 // Try to match the rest without using the reluctant subexpr
855 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
856 {
857 return idxNew;
858 }
859
860 // Try reluctant subexpr. If it matches:
861 // RELUCTANTMAYBE: Continues matching rest of the expression
862 // RELUCTANTSTAR: Points back here to repeat reluctant star matching
863 return matchNodes(node + nodeSize, next, idx);
864 }
865
866 case OP_RELUCTANTPLUS:
867 {
868 // Continue matching the rest without using the reluctant subexpr
869 if ((idxNew = matchNodes(next + (short) instruction[next + offsetNext], maxNode, idx)) != -1)
870 {
871 return idxNew;
872 }
873
874 // Try to match subexpression again
875 break;
876 }
877
878 case OP_OPEN:
879
880 // Match subexpression
881 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
882 {
883 startBackref[opdata] = idx;
884 }
885 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
886 {
887 // Increase valid paren count
888 if (opdata >= parenCount)
889 {
890 parenCount = opdata + 1;
891 }
892
893 // Don't set paren if already set later on
894 if (getParenStart(opdata) == -1)
895 {
896 setParenStart(opdata, idx);
897 }
898 }
899 return idxNew;
900
901 case OP_CLOSE:
902
903 // Done matching subexpression
904 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
905 {
906 endBackref[opdata] = idx;
907 }
908 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
909 {
910 // Increase valid paren count
911 if (opdata >= parenCount)
912 {
913 parenCount = opdata + 1;
914 }
915
916 // Don't set paren if already set later on
917 if (getParenEnd(opdata) == -1)
918 {
919 setParenEnd(opdata, idx);
920 }
921 }
922 return idxNew;
923
924 case OP_BACKREF:
925 {
926 // Get the start and end of the backref
927 int s = startBackref[opdata];
928 int e = endBackref[opdata];
929
930 // We don't know the backref yet
931 if (s == -1 || e == -1)
932 {
933 return -1;
934 }
935
936 // The backref is empty size
937 if (s == e)
938 {
939 break;
940 }
941
942 // Get the length of the backref
943 int l = e - s;
944
945 // If there's not enough input left, give up.
946 if (search.isEnd(idx + l - 1))
947 {
948 return -1;
949 }
950
951 // Case fold the backref?
952 final boolean caseFold =
953 ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
954 // Compare backref to input
955 for (int i = 0; i < l; i++)
956 {
957 if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
958 {
959 return -1;
960 }
961 }
962 }
963 break;
964
965 case OP_BOL:
966
967 // Fail if we're not at the start of the string
968 if (idx != 0)
969 {
970 // If we're multiline matching, we could still be at the start of a line
971 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
972 {
973 // Continue if at the start of a line
974 if (isNewline(idx - 1))
975 {
976 break;
977 }
978 }
979 return -1;
980 }
981 break;
982
983 case OP_EOL:
984
985 // If we're not at the end of string
986 if (!search.isEnd(0) && !search.isEnd(idx))
987 {
988 // If we're multi-line matching
989 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
990 {
991 // Continue if we're at the end of a line
992 if (isNewline(idx))
993 {
994 break;
995 }
996 }
997 return -1;
998 }
999 break;
1000
1001 case OP_ESCAPE:
1002
1003 // Which escape?
1004 switch (opdata)
1005 {
1006 // Word boundary match
1007 case E_NBOUND:
1008 case E_BOUND:
1009 {
1010 char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
1011 char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
1012 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
1013 {
1014 return -1;
1015 }
1016 }
1017 break;
1018
1019 // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1020 case E_ALNUM:
1021 case E_NALNUM:
1022 case E_DIGIT:
1023 case E_NDIGIT:
1024 case E_SPACE:
1025 case E_NSPACE:
1026
1027 // Give up if out of input
1028 if (search.isEnd(idx))
1029 {
1030 return -1;
1031 }
1032
1033 char c = search.charAt(idx);
1034
1035 // Switch on escape
1036 switch (opdata)
1037 {
1038 case E_ALNUM:
1039 case E_NALNUM:
1040 if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
1041 {
1042 return -1;
1043 }
1044 break;
1045
1046 case E_DIGIT:
1047 case E_NDIGIT:
1048 if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
1049 {
1050 return -1;
1051 }
1052 break;
1053
1054 case E_SPACE:
1055 case E_NSPACE:
1056 if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
1057 {
1058 return -1;
1059 }
1060 break;
1061 }
1062 idx++;
1063 break;
1064
1065 default:
1066 internalError("Unrecognized escape '" + opdata + "'");
1067 }
1068 break;
1069
1070 case OP_ANY:
1071
1072 if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1073 // Match anything
1074 if (search.isEnd(idx))
1075 {
1076 return -1;
1077 }
1078 }
1079 else
1080 {
1081 // Match anything but a newline
1082 if (search.isEnd(idx) || isNewline(idx))
1083 {
1084 return -1;
1085 }
1086 }
1087 idx++;
1088 break;
1089
1090 case OP_ATOM:
1091 {
1092 // Match an atom value
1093 if (search.isEnd(idx))
1094 {
1095 return -1;
1096 }
1097
1098 // Get length of atom and starting index
1099 // int lenAtom = opdata;
1100 int startAtom = node + nodeSize;
1101
1102 // Give up if not enough input remains to have a match
1103 if (search.isEnd(opdata + idx - 1))
1104 {
1105 return -1;
1106 }
1107
1108 // Match atom differently depending on casefolding flag
1109 final boolean caseFold =
1110 ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
1111
1112 for (int i = 0; i < opdata; i++)
1113 {
1114 if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
1115 {
1116 return -1;
1117 }
1118 }
1119 }
1120 break;
1121
1122 case OP_POSIXCLASS:
1123 {
1124 // Out of input?
1125 if (search.isEnd(idx))
1126 {
1127 return -1;
1128 }
1129
1130 switch (opdata)
1131 {
1132 case POSIX_CLASS_ALNUM:
1133 if (!Character.isLetterOrDigit(search.charAt(idx)))
1134 {
1135 return -1;
1136 }
1137 break;
1138
1139 case POSIX_CLASS_ALPHA:
1140 if (!Character.isLetter(search.charAt(idx)))
1141 {
1142 return -1;
1143 }
1144 break;
1145
1146 case POSIX_CLASS_DIGIT:
1147 if (!Character.isDigit(search.charAt(idx)))
1148 {
1149 return -1;
1150 }
1151 break;
1152
1153 case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1154 if (!Character.isSpaceChar(search.charAt(idx)))
1155 {
1156 return -1;
1157 }
1158 break;
1159
1160 case POSIX_CLASS_SPACE:
1161 if (!Character.isWhitespace(search.charAt(idx)))
1162 {
1163 return -1;
1164 }
1165 break;
1166
1167 case POSIX_CLASS_CNTRL:
1168 if (Character.getType(search.charAt(idx)) != Character.CONTROL)
1169 {
1170 return -1;
1171 }
1172 break;
1173
1174 case POSIX_CLASS_GRAPH: // JWL - bugbug???
1175 switch (Character.getType(search.charAt(idx)))
1176 {
1177 case Character.MATH_SYMBOL:
1178 case Character.CURRENCY_SYMBOL:
1179 case Character.MODIFIER_SYMBOL:
1180 case Character.OTHER_SYMBOL:
1181 break;
1182
1183 default:
1184 return -1;
1185 }
1186 break;
1187
1188 case POSIX_CLASS_LOWER:
1189 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
1190 {
1191 return -1;
1192 }
1193 break;
1194
1195 case POSIX_CLASS_UPPER:
1196 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
1197 {
1198 return -1;
1199 }
1200 break;
1201
1202 case POSIX_CLASS_PRINT:
1203 if (Character.getType(search.charAt(idx)) == Character.CONTROL)
1204 {
1205 return -1;
1206 }
1207 break;
1208
1209 case POSIX_CLASS_PUNCT:
1210 {
1211 int type = Character.getType(search.charAt(idx));
1212 switch(type)
1213 {
1214 case Character.DASH_PUNCTUATION:
1215 case Character.START_PUNCTUATION:
1216 case Character.END_PUNCTUATION:
1217 case Character.CONNECTOR_PUNCTUATION:
1218 case Character.OTHER_PUNCTUATION:
1219 break;
1220
1221 default:
1222 return -1;
1223 }
1224 }
1225 break;
1226
1227 case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1228 {
1229 boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
1230 (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
1231 (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1232 if (!isXDigit)
1233 {
1234 return -1;
1235 }
1236 }
1237 break;
1238
1239 case POSIX_CLASS_JSTART:
1240 if (!Character.isJavaIdentifierStart(search.charAt(idx)))
1241 {
1242 return -1;
1243 }
1244 break;
1245
1246 case POSIX_CLASS_JPART:
1247 if (!Character.isJavaIdentifierPart(search.charAt(idx)))
1248 {
1249 return -1;
1250 }
1251 break;
1252
1253 default:
1254 internalError("Bad posix class");
1255 break;
1256 }
1257
1258 // Matched.
1259 idx++;
1260 }
1261 break;
1262
1263 case OP_ANYOF:
1264 {
1265 // Out of input?
1266 if (search.isEnd(idx))
1267 {
1268 return -1;
1269 }
1270
1271 // Get character to match against character class and maybe casefold
1272 char c = search.charAt(idx);
1273 boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1274 // Loop through character class checking our match character
1275 int idxRange = node + nodeSize;
1276 int idxEnd = idxRange + (opdata * 2);
1277 boolean match = false;
1278 for (int i = idxRange; !match && i < idxEnd; )
1279 {
1280 // Get start, end and match characters
1281 char s = instruction[i++];
1282 char e = instruction[i++];
1283
1284 match = ((compareChars(c, s, caseFold) >= 0)
1285 && (compareChars(c, e, caseFold) <= 0));
1286 }
1287
1288 // Fail if we didn't match the character class
1289 if (!match)
1290 {
1291 return -1;
1292 }
1293 idx++;
1294 }
1295 break;
1296
1297 case OP_BRANCH:
1298 {
1299 // Check for choices
1300 // FIXME Dead code - only reason to keep is backward compat with pre-compiled exprs. Remove?
1301 if (instruction[next /* + offsetOpcode */] != OP_BRANCH)
1302 {
1303 // If there aren't any other choices, just evaluate this branch.
1304 node += nodeSize;
1305 continue;
1306 }
1307
1308 // Try all available branches
1309 int nextBranch;
1310 do
1311 {
1312 // Try matching the branch against the string
1313 if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
1314 {
1315 return idxNew;
1316 }
1317
1318 // Go to next branch (if any)
1319 nextBranch = (short) instruction[node + offsetNext];
1320 node += nextBranch;
1321 }
1322 while (nextBranch != 0 && (instruction[node /* + offsetOpcode */] == OP_BRANCH));
1323
1324 // Failed to match any branch!
1325 return -1;
1326 }
1327
1328 case OP_OPEN_CLUSTER:
1329 case OP_CLOSE_CLUSTER:
1330 // starting or ending the matching of a subexpression which has no backref.
1331
1332 case OP_NOTHING:
1333 case OP_GOTO:
1334
1335 // Just advance to the next node without doing anything
1336 break;
1337
1338 case OP_CONTINUE:
1339
1340 // Advance to the following node
1341 node += nodeSize;
1342 continue;
1343
1344 case OP_END:
1345
1346 // Match has succeeded!
1347 setParenEnd(0, idx);
1348 return idx;
1349
1350 default:
1351
1352 // Corrupt program
1353 internalError("Invalid opcode '" + opcode + "'");
1354 }
1355
1356 // Advance to the next node in the program
1357 node = next;
1358 }
1359
1360 // We "should" never end up here
1361 internalError("Corrupt program");
1362 return -1;
1363 }
1364
1365 /**
1366 * Match the current regular expression program against the current
1367 * input string, starting at index i of the input string. This method
1368 * is only meant for internal use.
1369 *
1370 * @param i The input string index to start matching at
1371 * @return True if the input matched the expression
1372 */
1373 protected boolean matchAt(int i)
1374 {
1375 // Initialize start pointer, paren cache and paren count
1376 start0 = -1;
1377 end0 = -1;
1378 start1 = -1;
1379 end1 = -1;
1380 start2 = -1;
1381 end2 = -1;
1382 startn = null;
1383 endn = null;
1384 parenCount = 1;
1385 setParenStart(0, i);
1386
1387 // Allocate backref arrays (unless optimizations indicate otherwise)
1388 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
1389 {
1390 startBackref = new int[maxParen];
1391 endBackref = new int[maxParen];
1392 }
1393
1394 // Match against string
1395 int idx;
1396 if ((idx = matchNodes(0, maxNode, i)) != -1)
1397 {
1398 setParenEnd(0, idx);
1399 return true;
1400 }
1401
1402 // Didn't match
1403 parenCount = 0;
1404 return false;
1405 }
1406
1407 /**
1408 * Matches the current regular expression program against a character array,
1409 * starting at a given index.
1410 *
1411 * @param search String to match against
1412 * @param i Index to start searching at
1413 * @return True if string matched
1414 */
1415 public boolean match(String search, int i)
1416 {
1417 return match(new StringCharacterIterator(search), i);
1418 }
1419
1420 /**
1421 * Matches the current regular expression program against a character array,
1422 * starting at a given index.
1423 *
1424 * @param search String to match against
1425 * @param i Index to start searching at
1426 * @return True if string matched
1427 */
1428 public boolean match(CharacterIterator search, int i)
1429 {
1430 // There is no compiled program to search with!
1431 if (program == null)
1432 {
1433 // This should be uncommon enough to be an error case rather
1434 // than an exception (which would have to be handled everywhere)
1435 internalError("No RE program to run!");
1436 }
1437
1438 // Save string to search
1439 this.search = search;
1440
1441 // Can we optimize the search by looking for new lines?
1442 if ((program.flags & REProgram.OPT_HASBOL) == REProgram.OPT_HASBOL)
1443 {
1444 // Non multi-line matching with BOL: Must match at '0' index
1445 if ((matchFlags & MATCH_MULTILINE) == 0)
1446 {
1447 return i == 0 && matchAt(i);
1448 }
1449
1450 // Multi-line matching with BOL: Seek to next line
1451 for ( ;! search.isEnd(i); i++)
1452 {
1453 // Skip if we are at the beginning of the line
1454 if (isNewline(i))
1455 {
1456 continue;
1457 }
1458
1459 // Match at the beginning of the line
1460 if (matchAt(i))
1461 {
1462 return true;
1463 }
1464
1465 // Skip to the end of line
1466 for ( ;! search.isEnd(i); i++)
1467 {
1468 if (isNewline(i))
1469 {
1470 break;
1471 }
1472 }
1473 }
1474
1475 return false;
1476 }
1477
1478 // Can we optimize the search by looking for a prefix string?
1479 if (program.prefix == null)
1480 {
1481 // Unprefixed matching must try for a match at each character
1482 for ( ;! search.isEnd(i - 1); i++)
1483 {
1484 // Try a match at index i
1485 if (matchAt(i))
1486 {
1487 return true;
1488 }
1489 }
1490 return false;
1491 }
1492 else
1493 {
1494 // Prefix-anchored matching is possible
1495 boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1496 char[] prefix = program.prefix;
1497 for ( ; !search.isEnd(i + prefix.length - 1); i++)
1498 {
1499 int j = i;
1500 int k = 0;
1501
1502 boolean match;
1503 do {
1504 // If there's a mismatch of any character in the prefix, give up
1505 match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
1506 } while (match && k < prefix.length);
1507
1508 // See if the whole prefix string matched
1509 if (k == prefix.length)
1510 {
1511 // We matched the full prefix at firstChar, so try it
1512 if (matchAt(i))
1513 {
1514 return true;
1515 }
1516 }
1517 }
1518 return false;
1519 }
1520 }
1521
1522 /**
1523 * Matches the current regular expression program against a String.
1524 *
1525 * @param search String to match against
1526 * @return True if string matched
1527 */
1528 public boolean match(String search)
1529 {
1530 return match(search, 0);
1531 }
1532
1533 /**
1534 * Splits a string into an array of strings on regular expression boundaries.
1535 * This function works the same way as the Perl function of the same name.
1536 * Given a regular expression of "[ab]+" and a string to split of
1537 * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1538 * "[xyzzy, yyz, 123]".
1539 *
1540 * <p>Please note that the first string in the resulting array may be an empty
1541 * string. This happens when the very first character of input string is
1542 * matched by the pattern.
1543 *
1544 * @param s String to split on this regular exression
1545 * @return Array of strings
1546 */
1547 public String[] split(String s)
1548 {
1549 // Create new vector
1550 Vector v = new Vector();
1551
1552 // Start at position 0 and search the whole string
1553 int pos = 0;
1554 int len = s.length();
1555
1556 // Try a match at each position
1557 while (pos < len && match(s, pos))
1558 {
1559 // Get start of match
1560 int start = getParenStart(0);
1561
1562 // Get end of match
1563 int newpos = getParenEnd(0);
1564
1565 // Check if no progress was made
1566 if (newpos == pos)
1567 {
1568 v.addElement(s.substring(pos, start + 1));
1569 newpos++;
1570 }
1571 else
1572 {
1573 v.addElement(s.substring(pos, start));
1574 }
1575
1576 // Move to new position
1577 pos = newpos;
1578 }
1579
1580 // Push remainder if it's not empty
1581 String remainder = s.substring(pos);
1582 if (remainder.length() != 0)
1583 {
1584 v.addElement(remainder);
1585 }
1586
1587 // Return vector as an array of strings
1588 String[] ret = new String[v.size()];
1589 v.copyInto(ret);
1590 return ret;
1591 }
1592
1593 /**
1594 * Flag bit that indicates that subst should replace all occurrences of this
1595 * regular expression.
1596 */
1597 public static final int REPLACE_ALL = 0x0000;
1598
1599 /**
1600 * Flag bit that indicates that subst should only replace the first occurrence
1601 * of this regular expression.
1602 */
1603 public static final int REPLACE_FIRSTONLY = 0x0001;
1604
1605 /**
1606 * Flag bit that indicates that subst should replace backreferences
1607 */
1608 public static final int REPLACE_BACKREFERENCES = 0x0002;
1609
1610 /**
1611 * Substitutes a string for this regular expression in another string.
1612 * This method works like the Perl function of the same name.
1613 * Given a regular expression of "a*b", a String to substituteIn of
1614 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1615 * resulting String returned by subst would be "-foo-garply-wacky-".
1616 *
1617 * @param substituteIn String to substitute within
1618 * @param substitution String to substitute for all matches of this regular expression.
1619 * @return The string substituteIn with zero or more occurrences of the current
1620 * regular expression replaced with the substitution String (if this regular
1621 * expression object doesn't match at any position, the original String is returned
1622 * unchanged).
1623 */
1624 public String subst(String substituteIn, String substitution)
1625 {
1626 return subst(substituteIn, substitution, REPLACE_ALL);
1627 }
1628
1629 /**
1630 * Substitutes a string for this regular expression in another string.
1631 * This method works like the Perl function of the same name.
1632 * Given a regular expression of "a*b", a String to substituteIn of
1633 * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
1634 * resulting String returned by subst would be "-foo-garply-wacky-".
1635 * <p>
1636 * It is also possible to reference the contents of a parenthesized expression
1637 * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
1638 * a String to substituteIn of "visit us: http://www.apache.org!" and the
1639 * substitution String "<a href=\"$0\">$0</a>", the resulting String
1640 * returned by subst would be
1641 * "visit us: <a href=\"http://www.apache.org\">http://www.apache.org</a>!".
1642 * <p>
1643 * <i>Note:</i> $0 represents the whole match.
1644 *
1645 * @param substituteIn String to substitute within
1646 * @param substitution String to substitute for matches of this regular expression
1647 * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
1648 * flag bit is set, only the first occurrence of this regular expression is replaced.
1649 * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
1650 * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
1651 * be processed.
1652 * @return The string substituteIn with zero or more occurrences of the current
1653 * regular expression replaced with the substitution String (if this regular
1654 * expression object doesn't match at any position, the original String is returned
1655 * unchanged).
1656 */
1657 public String subst(String substituteIn, String substitution, int flags)
1658 {
1659 // String to return
1660 StringBuffer ret = new StringBuffer();
1661
1662 // Start at position 0 and search the whole string
1663 int pos = 0;
1664 int len = substituteIn.length();
1665
1666 // Try a match at each position
1667 while (pos < len && match(substituteIn, pos))
1668 {
1669 // Append string before match
1670 ret.append(substituteIn.substring(pos, getParenStart(0)));
1671
1672 if ((flags & REPLACE_BACKREFERENCES) != 0)
1673 {
1674 // Process backreferences
1675 int lCurrentPosition = 0;
1676 int lLastPosition = -2;
1677 int lLength = substitution.length();
1678
1679 while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0)
1680 {
1681 if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
1682 && lCurrentPosition + 1 < lLength)
1683 {
1684 char c = substitution.charAt(lCurrentPosition + 1);
1685 if (c >= '0' && c <= '9')
1686 {
1687 // Append everything between the last and the current $ sign
1688 ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition));
1689
1690 // Append the parenthesized expression, if present
1691 String val = getParen(c - '0');
1692 if (val != null) {
1693 ret.append(val);
1694 }
1695 lLastPosition = lCurrentPosition;
1696 }
1697 }
1698
1699 // Move forward, skipping past match
1700 lCurrentPosition++;
1701 }
1702
1703 // Append everything after the last $ sign
1704 ret.append(substitution.substring(lLastPosition + 2, lLength));
1705 }
1706 else
1707 {
1708 // Append substitution without processing backreferences
1709 ret.append(substitution);
1710 }
1711
1712 // Move forward, skipping past match
1713 int newpos = getParenEnd(0);
1714
1715 // We always want to make progress!
1716 if (newpos == pos)
1717 {
1718 newpos++;
1719 }
1720
1721 // Try new position
1722 pos = newpos;
1723
1724 // Break out if we're only supposed to replace one occurrence
1725 if ((flags & REPLACE_FIRSTONLY) != 0)
1726 {
1727 break;
1728 }
1729 }
1730
1731 // If there's remaining input, append it
1732 if (pos < len)
1733 {
1734 ret.append(substituteIn.substring(pos));
1735 }
1736
1737 // Return string buffer as string
1738 return ret.toString();
1739 }
1740
1741 /**
1742 * Returns an array of Strings, whose toString representation matches a regular
1743 * expression. This method works like the Perl function of the same name. Given
1744 * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
1745 * aaaab], the array of Strings returned by grep would be [aab, aaaab].
1746 *
1747 * @param search Array of Objects to search
1748 * @return Array of Strings whose toString() value matches this regular expression.
1749 */
1750 public String[] grep(Object[] search)
1751 {
1752 // Create new vector to hold return items
1753 Vector v = new Vector();
1754
1755 // Traverse array of objects
1756 for (int i = 0; i < search.length; i++)
1757 {
1758 // Get next object as a string
1759 String s = search[i].toString();
1760
1761 // If it matches this regexp, add it to the list
1762 if (match(s))
1763 {
1764 v.addElement(s);
1765 }
1766 }
1767
1768 // Return vector as an array of strings
1769 String[] ret = new String[v.size()];
1770 v.copyInto(ret);
1771 return ret;
1772 }
1773
1774 /**
1775 * @return true if character at i-th position in the <code>search</code> string is a newline
1776 */
1777 private boolean isNewline(int i)
1778 {
1779 char nextChar = search.charAt(i);
1780
1781 return nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085' ||
1782 nextChar == '\u2028' || nextChar == '\u2029';
1783 }
1784
1785 /**
1786 * Compares two characters.
1787 *
1788 * @param c1 first character to compare.
1789 * @param c2 second character to compare.
1790 * @param caseIndependent whether comparision is case insensitive or not.
1791 * @return negative, 0, or positive integer as the first character
1792 * less than, equal to, or greater then the second.
1793 */
1794 private int compareChars(char c1, char c2, boolean caseIndependent)
1795 {
1796 if (caseIndependent)
1797 {
1798 c1 = Character.toLowerCase(c1);
1799 c2 = Character.toLowerCase(c2);
1800 }
1801 return ((int)c1 - (int)c2);
1802 }
1803 }