Source code: org/apache/regexp/RE.java
1 /*
2 * Copyright 1999-2004 The Apache Software Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 package org.apache.regexp;
18
19 import java.io.Serializable;
20 import java.util.Vector;
21
22 /**
23 * RE is an efficient, lightweight regular expression evaluator/matcher
24 * class. Regular expressions are pattern descriptions which enable
25 * sophisticated matching of strings. In addition to being able to
26 * match a string against a pattern, you can also extract parts of the
27 * match. This is especially useful in text parsing! Details on the
28 * syntax of regular expression patterns are given below.
29 *
30 * <p>
31 * To compile a regular expression (RE), you can simply construct an RE
32 * matcher object from the string specification of the pattern, like this:
33 *
34 * <pre>
35 * RE r = new RE("a*b");
36 * </pre>
37 *
38 * <p>
39 * Once you have done this, you can call either of the RE.match methods to
40 * perform matching on a String. For example:
41 *
42 * <pre>
43 * boolean matched = r.match("aaaab");
44 * </pre>
45 *
46 * will cause the boolean matched to be set to true because the
47 * pattern "a*b" matches the string "aaaab".
48 *
49 * <p>
50 * If you were interested in the <i>number</i> of a's which matched the
51 * first part of our example expression, you could change the expression to
52 * "(a*)b". Then when you compiled the expression and matched it against
53 * something like "xaaaab", you would get results like this:
54 *
55 * <pre>
56 * RE r = new RE("(a*)b"); // Compile expression
57 * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
58 *
59 * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
60 * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
61 *
62 * int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
63 * int endWholeExpr = r.getParenEnd(0); // endWholeExpr will be index 6
64 * int lenWholeExpr = r.getParenLength(0); // lenWholeExpr will be 5
65 *
66 * int startInside = r.getParenStart(1); // startInside will be index 1
67 * int endInside = r.getParenEnd(1); // endInside will be index 5
68 * int lenInside = r.getParenLength(1); // lenInside will be 4
69 * </pre>
70 *
71 * You can also refer to the contents of a parenthesized expression
72 * within a regular expression itself. This is called a
73 * 'backreference'. The first backreference in a regular expression is
74 * denoted by \1, the second by \2 and so on. So the expression:
75 *
76 * <pre>
77 * ([0-9]+)=\1
78 * </pre>
79 *
80 * will match any string of the form n=n (like 0=0 or 2=2).
81 *
82 * <p>
83 * The full regular expression syntax accepted by RE is described here:
84 *
85 * <pre>
86 *
87 * <b><font face=times roman>Characters</font></b>
88 *
89 * <i>unicodeChar</i> Matches any identical unicode character
90 * \ Used to quote a meta-character (like '*')
91 * \\ Matches a single '\' character
92 * \0nnn Matches a given octal character
93 * \xhh Matches a given 8-bit hexadecimal character
94 * \\uhhhh Matches a given 16-bit hexadecimal character
95 * \t Matches an ASCII tab character
96 * \n Matches an ASCII newline character
97 * \r Matches an ASCII return character
98 * \f Matches an ASCII form feed character
99 *
100 *
101 * <b><font face=times roman>Character Classes</font></b>
102 *
103 * [abc] Simple character class
104 * [a-zA-Z] Character class with ranges
105 * [^abc] Negated character class
106 * </pre>
107 *
108 * <b>NOTE:</b> Incomplete ranges will be interpreted as "starts
109 * from zero" or "ends with last character".
110 * <br>
111 * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
112 * [-] means "all characters".
113 *
114 * <pre>
115 *
116 * <b><font face=times roman>Standard POSIX Character Classes</font></b>
117 *
118 * [:alnum:] Alphanumeric characters.
119 * [:alpha:] Alphabetic characters.
120 * [:blank:] Space and tab characters.
121 * [:cntrl:] Control characters.
122 * [:digit:] Numeric characters.
123 * [:graph:] Characters that are printable and are also visible.
124 * (A space is printable, but not visible, while an
125 * `a' is both.)
126 * [:lower:] Lower-case alphabetic characters.
127 * [:print:] Printable characters (characters that are not
128 * control characters.)
129 * [:punct:] Punctuation characters (characters that are not letter,
130 * digits, control characters, or space characters).
131 * [:space:] Space characters (such as space, tab, and formfeed,
132 * to name a few).
133 * [:upper:] Upper-case alphabetic characters.
134 * [:xdigit:] Characters that are hexadecimal digits.
135 *
136 *
137 * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
138 *
139 * [:javastart:] Start of a Java identifier
140 * [:javapart:] Part of a Java identifier
141 *
142 *
143 * <b><font face=times roman>Predefined Classes</font></b>
144 *
145 * . Matches any character other than newline
146 * \w Matches a "word" character (alphanumeric plus "_")
147 * \W Matches a non-word character
148 * \s Matches a whitespace character
149 * \S Matches a non-whitespace character
150 * \d Matches a digit character
151 * \D Matches a non-digit character
152 *
153 *
154 * <b><font face=times roman>Boundary Matchers</font></b>
155 *
156 * ^ Matches only at the beginning of a line
157 * $ Matches only at the end of a line
158 * \b Matches only at a word boundary
159 * \B Matches only at a non-word boundary
160 *
161 *
162 * <b><font face=times roman>Greedy Closures</font></b>
163 *
164 * A* Matches A 0 or more times (greedy)
165 * A+ Matches A 1 or more times (greedy)
166 * A? Matches A 1 or 0 times (greedy)
167 * A{n} Matches A exactly n times (greedy)
168 * A{n,} Matches A at least n times (greedy)
169 * A{n,m} Matches A at least n but not more than m times (greedy)
170 *
171 *
172 * <b><font face=times roman>Reluctant Closures</font></b>
173 *
174 * A*? Matches A 0 or more times (reluctant)
175 * A+? Matches A 1 or more times (reluctant)
176 * A?? Matches A 0 or 1 times (reluctant)
177 *
178 *
179 * <b><font face=times roman>Logical Operators</font></b>
180 *
181 * AB Matches A followed by B
182 * A|B Matches either A or B
183 * (A) Used for subexpression grouping
184 * (?:A) Used for subexpression clustering (just like grouping but
185 * no backrefs)
186 *
187 *
188 * <b><font face=times roman>Backreferences</font></b>
189 *
190 * \1 Backreference to 1st parenthesized subexpression
191 * \2 Backreference to 2nd parenthesized subexpression
192 * \3 Backreference to 3rd parenthesized subexpression
193 * \4 Backreference to 4th parenthesized subexpression
194 * \5 Backreference to 5th parenthesized subexpression
195 * \6 Backreference to 6th parenthesized subexpression
196 * \7 Backreference to 7th parenthesized subexpression
197 * \8 Backreference to 8th parenthesized subexpression
198 * \9 Backreference to 9th parenthesized subexpression
199 * </pre>
200 *
201 * <p>
202 * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
203 * that they match as many elements of the string as possible without
204 * causing the overall match to fail. If you want a closure to be
205 * reluctant (non-greedy), you can simply follow it with a '?'. A
206 * reluctant closure will match as few elements of the string as
207 * possible when finding matches. {m,n} closures don't currently
208 * support reluctancy.
209 *
210 * <p>
211 * <b><font face="times roman">Line terminators</font></b>
212 * <br>
213 * A line terminator is a one- or two-character sequence that marks
214 * the end of a line of the input character sequence. The following
215 * are recognized as line terminators:
216 * <ul>
217 * <li>A newline (line feed) character ('\n'),</li>
218 * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
219 * <li>A standalone carriage-return character ('\r'),</li>
220 * <li>A next-line character ('\u0085'),</li>
221 * <li>A line-separator character ('\u2028'), or</li>
222 * <li>A paragraph-separator character ('\u2029).</li>
223 * </ul>
224 *
225 * <p>
226 * RE runs programs compiled by the RECompiler class. But the RE
227 * matcher class does not include the actual regular expression compiler
228 * for reasons of efficiency. In fact, if you want to pre-compile one
229 * or more regular expressions, the 'recompile' class can be invoked
230 * from the command line to produce compiled output like this:
231 *
232 * <pre>
233 * // Pre-compiled regular expression "a*b"
234 * char[] re1Instructions =
235 * {
236 * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
237 * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
238 * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
239 * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
240 * 0x0000,
241 * };
242 *
243 *
244 * REProgram re1 = new REProgram(re1Instructions);
245 * </pre>
246 *
247 * You can then construct a regular expression matcher (RE) object from
248 * the pre-compiled expression re1 and thus avoid the overhead of
249 * compiling the expression at runtime. If you require more dynamic
250 * regular expressions, you can construct a single RECompiler object and
251 * re-use it to compile each expression. Similarly, you can change the
252 * program run by a given matcher object at any time. However, RE and
253 * RECompiler are not threadsafe (for efficiency reasons, and because
254 * requiring thread safety in this class is deemed to be a rare
255 * requirement), so you will need to construct a separate compiler or
256 * matcher object for each thread (unless you do thread synchronization
257 * yourself). Once expression compiled into the REProgram object, REProgram
258 * can be safely shared across multiple threads and RE objects.
259 *
260 * <br><p><br>
261 *
262 * <font color="red">
263 * <i>ISSUES:</i>
264 *
265 * <ul>
266 * <li>com.weusours.util.re is not currently compatible with all
267 * standard POSIX regcomp flags</li>
268 * <li>com.weusours.util.re does not support POSIX equivalence classes
269 * ([=foo=] syntax) (I18N/locale issue)</li>
270 * <li>com.weusours.util.re does not support nested POSIX character
271 * classes (definitely should, but not completely trivial)</li>
272 * <li>com.weusours.util.re Does not support POSIX character collation
273 * concepts ([.foo.] syntax) (I18N/locale issue)</li>
274 * <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
275 * <li>Should RE support character iterators (for backwards RE matching!)?</li>
276 * <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
277 * <li>Not *all* possibilities are considered for greediness when backreferences
278 * are involved (as POSIX suggests should be the case). The POSIX RE
279 * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
280 * of acdacaa where \1 is "a". This is not the case in this RE package,
281 * and actually Perl doesn't go to this extent either! Until someone
282 * actually complains about this, I'm not sure it's worth "fixing".
283 * If it ever is fixed, test #137 in RETest.txt should be updated.</li>
284 * </ul>
285 *
286 * </font>
287 *
288 * @see recompile
289 * @see RECompiler
290 *
291 * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
292 * @author <a href="mailto:ts@sch-fer.de">Tobias Schäfer</a>
293 * @version $Id: RE.java 232192 2005-08-12 03:04:07Z vgritsenko $
294 */
295 public class RE implements Serializable
296 {
297 /**
298 * Specifies normal, case-sensitive matching behaviour.
299 */
300 public static final int MATCH_NORMAL = 0x0000;
301
302 /**
303 * Flag to indicate that matching should be case-independent (folded)
304 */
305 public static final int MATCH_CASEINDEPENDENT = 0x0001;
306
307 /**
308 * Newlines should match as BOL/EOL (^ and $)
309 */
310 public static final int MATCH_MULTILINE = 0x0002;
311
312 /**
313 * Consider all input a single body of text - newlines are matched by .
314 */
315 public static final int MATCH_SINGLELINE = 0x0004;
316
317 /************************************************
318 * *
319 * The format of a node in a program is: *
320 * *
321 * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
322 * *
323 * char OPCODE - instruction *
324 * char OPDATA - modifying data *
325 * char OPNEXT - next node (relative offset) *
326 * *
327 ************************************************/
328
329 // Opcode Char Opdata/Operand Meaning
330 // ---------- ---------- --------------- --------------------------------------------------
331 static final char OP_END = 'E'; // end of program
332 static final char OP_BOL = '^'; // match only if at beginning of line
333 static final char OP_EOL = '$'; // match only if at end of line
334 static final char OP_ANY = '.'; // match any single character except newline
335 static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
336 static final char OP_BRANCH = '|'; // node match this alternative or the next one
337 static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
338 static final char OP_STAR = '*'; // node kleene closure
339 static final char OP_PLUS = '+'; // node positive closure
340 static final char OP_MAYBE = '?'; // node optional closure
341 static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
342 static final char OP_OPEN = '('; // number nth opening paren
343 static final char OP_OPEN_CLUSTER = '<'; // opening cluster
344 static final char OP_CLOSE = ')'; // number nth closing paren
345 static final char OP_CLOSE_CLUSTER = '>'; // closing cluster
346 static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
347 static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
348 static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
349 static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
350 static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
351 static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
352 static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
353
354 // Escape codes
355 static final char E_ALNUM = 'w'; // Alphanumeric
356 static final char E_NALNUM = 'W'; // Non-alphanumeric
357 static final char E_BOUND = 'b'; // Word boundary
358 static final char E_NBOUND = 'B'; // Non-word boundary
359 static final char E_SPACE = 's'; // Whitespace
360 static final char E_NSPACE = 'S'; // Non-whitespace
361 static final char E_DIGIT = 'd'; // Digit
362 static final char E_NDIGIT = 'D'; // Non-digit
363
364 // Posix character classes
365 static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
366 static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
367 static final char POSIX_CLASS_BLANK = 'b'; // Blanks
368 static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
369 static final char POSIX_CLASS_DIGIT = 'd'; // Digits
370 static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
371 static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
372 static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
373 static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
374 static final char POSIX_CLASS_SPACE = 's'; // Spaces
375 static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
376 static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
377 static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
378 static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
379
380 // Limits
381 static final int maxNode = 65536; // Maximum number of nodes in a program
382 static final int MAX_PAREN = 16; // Number of paren pairs (only 9 can be backrefs)
383
384 // Node layout constants
385 static final int offsetOpcode = 0; // Opcode offset (first character)
386 static final int offsetOpdata = 1; // Opdata offset (second char)
387 static final int offsetNext = 2; // Next index offset (third char)
388 static final int nodeSize = 3; // Node size (in chars)
389
390 // State of current program
391 REProgram program; // Compiled regular expression 'program'
392 transient CharacterIterator search; // The string being matched against
393 int matchFlags; // Match behaviour flags
394 int maxParen = MAX_PAREN;
395
396 // Parenthesized subexpressions
397 transient int parenCount; // Number of subexpressions matched (num open parens + 1)
398 transient int start0; // Cache of start[0]
399 transient int end0; // Cache of start[0]
400 transient int start1; // Cache of start[1]
401 transient int end1; // Cache of start[1]
402 transient int start2; // Cache of start[2]
403 transient int end2; // Cache of start[2]
404 transient int[] startn; // Lazy-alloced array of sub-expression starts
405 transient int[] endn; // Lazy-alloced array of sub-expression ends
406
407 // Backreferences
408 transient int[] startBackref; // Lazy-alloced array of backref starts
409 transient int[] endBackref; // Lazy-alloced array of backref ends
410
411 /**
412 * Constructs a regular expression matcher from a String by compiling it
413 * using a new instance of RECompiler. If you will be compiling many
414 * expressions, you may prefer to use a single RECompiler object instead.
415 *
416 * @param pattern The regular expression pattern to compile.
417 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
418 * @see RECompiler
419 * @see recompile
420 */
421 public RE(String pattern) throws RESyntaxException
422 {
423 this(pattern, MATCH_NORMAL);
424 }
425
426 /**
427 * Constructs a regular expression matcher from a String by compiling it
428 * using a new instance of RECompiler. If you will be compiling many
429 * expressions, you may prefer to use a single RECompiler object instead.
430 *
431 * @param pattern The regular expression pattern to compile.
432 * @param matchFlags The matching style
433 * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
434 * @see RECompiler
435 * @see recompile
436 */
437 public RE(String pattern, int matchFlags) throws RESyntaxException
438 {
439 this(new RECompiler().compile(pattern));
440 setMatchFlags(matchFlags);
441 }
442
443 /**
444 * Construct a matcher for a pre-compiled regular expression from program
445 * (bytecode) data. Permits special flags to be passed in to modify matching
446 * behaviour.
447 *
448 * @param program Compiled regular expression program (see RECompiler and/or recompile)
449 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
450 *
451 * <pre>
452 * MATCH_NORMAL // Normal (case-sensitive) matching
453 * MATCH_CASEINDEPENDENT // Case folded comparisons
454 * MATCH_MULTILINE // Newline matches as BOL/EOL
455 * </pre>
456 *
457 * @see RECompiler
458 * @see REProgram
459 * @see recompile
460 */
461 public RE(REProgram program, int matchFlags)
462 {
463 setProgram(program);
464 setMatchFlags(matchFlags);
465 }
466
467 /**
468 * Construct a matcher for a pre-compiled regular expression from program
469 * (bytecode) data.
470 *
471 * @param program Compiled regular expression program
472 * @see RECompiler
473 * @see recompile
474 */
475 public RE(REProgram program)
476 {
477 this(program, MATCH_NORMAL);
478 }
479
480 /**
481 * Constructs a regular expression matcher with no initial program.
482 * This is likely to be an uncommon practice, but is still supported.
483 */
484 public RE()
485 {
486 this((REProgram)null, MATCH_NORMAL);
487 }
488
489 /**
490 * Converts a 'simplified' regular expression to a full regular expression
491 *
492 * @param pattern The pattern to convert
493 * @return The full regular expression
494 */
495 public static String simplePatternToFullRegularExpression(String pattern)
496 {
497 StringBuffer buf = new StringBuffer();
498 for (int i = 0; i < pattern.length(); i++)
499 {
500 char c = pattern.charAt(i);
501 switch (c)
502 {
503 case '*':
504 buf.append(".*");
505 break;
506
507 case '.':
508 case '[':
509 case ']':
510 case '\\':
511 case '+':
512 case '?':
513 case '{':
514 case '}':
515 case '$':
516 case '^':
517 case '|':
518 case '(':
519 case ')':
520 buf.append('\\');
521 default:
522 buf.append(c);
523 break;
524 }
525 }
526 return buf.toString();
527 }
528
529 /**
530 * Sets match behaviour flags which alter the way RE does matching.
531 * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
532 *
533 * <pre>
534 * MATCH_NORMAL // Normal (case-sensitive) matching
535 * MATCH_CASEINDEPENDENT // Case folded comparisons
536 * MATCH_MULTILINE // Newline matches as BOL/EOL
537 * </pre>
538 */
539 public void setMatchFlags(int matchFlags)
540 {
541 this.matchFlags = matchFlags;
542 }
543
544 /**
545 * Returns the current match behaviour flags.
546 * @return Current match behaviour flags (RE.MATCH_*).
547 *
548 * <pre>
549 * MATCH_NORMAL // Normal (case-sensitive) matching
550 * MATCH_CASEINDEPENDENT // Case folded comparisons
551 * MATCH_MULTILINE // Newline matches as BOL/EOL
552 * </pre>
553 *
554 * @see #setMatchFlags
555 */
556 public int getMatchFlags()
557 {
558 return matchFlags;
559 }
560
561 /**
562 * Sets the current regular expression program used by this matcher object.
563 *
564 * @param program Regular expression program compiled by RECompiler.
565 * @see RECompiler
566 * @see REProgram
567 * @see recompile
568 */
569 public void setProgram(REProgram program)
570 {
571 this.program = program;
572 if (program != null && program.maxParens != -1) {
573 this.maxParen = program.maxParens;
574 } else {
575 this.maxParen = MAX_PAREN;
576 }
577 }
578
579 /**
580 * Returns the current regular expression program in use by this matcher object.
581 *
582 * @return Regular expression program
583 * @see #setProgram
584 */
585 public REProgram getProgram()
586 {
587 return program;
588 }
589
590 /**
591 * Returns the number of parenthesized subexpressions available after a successful match.
592 *
593 * @return Number of available parenthesized subexpressions
594 */
595 public int getParenCount()
596 {
597 return parenCount;
598 }
599
600 /**
601 * Gets the contents of a parenthesized subexpression after a successful match.
602 *
603 * @param which Nesting level of subexpression
604 * @return String
605 */
606 public String getParen(int which)
607 {
608 int start;
609 if (which < parenCount && (start = getParenStart(which)) >= 0)
610 {
611 return search.substring(start, getParenEnd(which));
612 }
613 return null;
614 }
615
616 /**
617 * Returns the start index of a given paren level.
618 *
619 * @param which Nesting level of subexpression
620 * @return String index
621 */
622 public final int getParenStart(int which)
623 {
624 if (which < parenCount)
625 {
626 switch (which)
627 {
628 case 0:
629 return start0;
630
631 case 1:
632 return start1;
633
634 case 2:
635 return start2;
636
637 default:
638 if (startn == null)
639 {
640 allocParens();
641 }
642 return startn[which];
643 }
644 }
645 return -1;
646 }
647
648 /**
649 * Returns the end index of a given paren level.
650 *
651 * @param which Nesting level of subexpression
652 * @return String index
653 */
654 public final int getParenEnd(int which)
655 {
656 if (which < parenCount)
657 {
658 switch (which)
659 {
660 case 0:
661 return end0;
662
663 case 1:
664 return end1;
665
666 case 2:
667 return end2;
668
669 default:
670 if (endn == null)
671 {
672 allocParens();
673 }
674 return endn[which];
675 }
676 }
677 return -1;
678 }
679
680 /**
681 * Returns the length of a given paren level.
682 *
683 * @param which Nesting level of subexpression
684 * @return Number of characters in the parenthesized subexpression
685 */
686 public final int getParenLength(int which)
687 {
688 if (which < parenCount)
689 {
690 return getParenEnd(which) - getParenStart(which);
691 }
692 return -1;
693 }
694
695 /**
696 * Sets the start of a paren level
697 *
698 * @param which Which paren level
699 * @param i Index in input array
700 */
701 protected final void setParenStart(int which, int i)
702 {
703 if (which < parenCount)
704 {
705 switch (which)
706 {
707 case 0:
708 start0 = i;
709 break;
710
711 case 1:
712 start1 = i;
713 break;
714
715 case 2:
716 start2 = i;
717 break;
718
719 default:
720 if (startn == null)
721 {
722 allocParens();
723 }
724 startn[which] = i;
725 break;
726 }
727 }
728 }
729
730 /**
731 * Sets the end of a paren level
732 *
733 * @param which Which paren level
734 * @param i Index in input array
735 */
736 protected final void setParenEnd(int which, int i)
737 {
738 if (which < parenCount)
739 {
740 switch (which)
741 {
742 case 0:
743 end0 = i;
744 break;
745
746 case 1:
747 end1 = i;
748 break;
749
750 case 2:
751 end2 = i;
752 break;
753
754 default:
755 if (endn == null)
756 {
757 allocParens();
758 }
759 endn[which] = i;
760 break;
761 }
762 }
763 }
764
765 /**
766 * Throws an Error representing an internal error condition probably resulting
767 * from a bug in the regular expression compiler (or possibly data corruption).
768 * In practice, this should be very rare.
769 *
770 * @param s Error description
771 */
772 protected void internalError(String s) throws Error
773 {
774 throw new Error("RE internal error: " + s);
775 }
776
777 /**
778 * Performs lazy allocation of subexpression arrays
779 */
780 private final void allocParens()
781 {
782 // Allocate arrays for subexpressions
783 startn = new int[maxParen];
784 endn = new int[maxParen];
785
786 // Set sub-expression pointers to invalid values
787 for (int i = 0; i < maxParen; i++)
788 {
789 startn[i] = -1;
790 endn[i] = -1;
791 }
792 }
793
794 /**
795 * Try to match a string against a subset of nodes in the program
796 *
797 * @param firstNode Node to start at in program
798 * @param lastNode Last valid node (used for matching a subexpression without
799 * matching the rest of the program as well).
800 * @param idxStart Starting position in character array
801 * @return Final input array index if match succeeded. -1 if not.
802 */
803 protected int matchNodes(int firstNode, int lastNode, int idxStart)
804 {
805 // Our current place in the string
806 int idx = idxStart;
807
808 // Loop while node is valid
809 int next, opcode, opdata;
810 int idxNew;
811 char[] instruction = program.instruction;
812 for (int node = firstNode; node < lastNode; )
813 {
814 opcode = instruction[node + offsetOpcode];
815 next = node + (short)instruction[node + offsetNext];
816 opdata = instruction[node + offsetOpdata];
817
818 switch (opcode)
819 {
820 case OP_RELUCTANTMAYBE:
821 {
822 int once = 0;
823 do
824 {
825 // Try to match the rest without using the reluctant subexpr
826 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
827 {
828 return idxNew;
829 }
830 }
831 while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
832 return -1;
833 }
834
835 case OP_RELUCTANTPLUS:
836 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
837 {
838 // Try to match the rest without using the reluctant subexpr
839 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
840 {
841 return idxNew;
842 }
843 }
844 return -1;
845
846 case OP_RELUCTANTSTAR:
847 do
848 {
849 // Try to match the rest without using the reluctant subexpr
850 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
851 {
852 return idxNew;
853 }
854 }
855 while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
856 return -1;
857
858 case OP_OPEN:
859
860 // Match subexpression
861 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
862 {
863 startBackref[opdata] = idx;
864 }
865 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
866 {
867 // Increase valid paren count
868 if ((opdata + 1) > parenCount)
869 {
870 parenCount = opdata + 1;
871 }
872
873 // Don't set paren if already set later on
874 if (getParenStart(opdata) == -1)
875 {
876 setParenStart(opdata, idx);
877 }
878 }
879 return idxNew;
880
881 case OP_CLOSE:
882
883 // Done matching subexpression
884 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
885 {
886 endBackref[opdata] = idx;
887 }
888 if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
889 {
890 // Increase valid paren count
891 if ((opdata + 1) > parenCount)
892 {
893 parenCount = opdata + 1;
894 }
895
896 // Don't set paren if already set later on
897 if (getParenEnd(opdata) == -1)
898 {
899 setParenEnd(opdata, idx);
900 }
901 }
902 return idxNew;
903
904 case OP_OPEN_CLUSTER:
905 case OP_CLOSE_CLUSTER:
906 // starting or ending the matching of a subexpression which has no backref.
907 return matchNodes( next, maxNode, idx );
908
909 case OP_BACKREF:
910 {
911 // Get the start and end of the backref
912 int s = startBackref[opdata];
913 int e = endBackref[opdata];
914
915 // We don't know the backref yet
916 if (s == -1 || e == -1)
917 {
918 return -1;
919 }
920
921 // The backref is empty size
922 if (s == e)
923 {
924 break;
925 }
926
927 // Get the length of the backref
928 int l = e - s;
929
930 // If there's not enough input left, give up.
931 if (search.isEnd(idx + l - 1))
932 {
933 return -1;
934 }
935
936 // Case fold the backref?
937 final boolean caseFold =
938 ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
939 // Compare backref to input
940 for (int i = 0; i < l; i++)
941 {
942 if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
943 {
944 return -1;
945 }
946 }
947 }
948 break;
949
950 case OP_BOL:
951
952 // Fail if we're not at the start of the string
953 if (idx != 0)
954 {
955 // If we're multiline matching, we could still be at the start of a line
956 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
957 {
958 // If not at start of line, give up
959 if (idx <= 0 || !isNewline(idx - 1)) {
960 return -1;
961 } else {
962 break;
963 }
964 }
965 return -1;
966 }
967 break;
968
969 case OP_EOL:
970
971 // If we're not at the end of string
972 if (!search.isEnd(0) && !search.isEnd(idx))
973 {
974 // If we're multi-line matching
975 if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
976 {
977 // Give up if we're not at the end of a line
978 if (!isNewline(idx)) {
979 return -1;
980 } else {
981 break;
982 }
983 }
984 return -1;
985 }
986 break;
987
988 case OP_ESCAPE:
989
990 // Which escape?
991 switch (opdata)
992 {
993 // Word boundary match
994 case E_NBOUND:
995 case E_BOUND:
996 {
997 char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
998 char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
999 if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
1000 {
1001 return -1;
1002 }
1003 }
1004 break;
1005
1006 // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
1007 case E_ALNUM:
1008 case E_NALNUM:
1009 case E_DIGIT:
1010 case E_NDIGIT:
1011 case E_SPACE:
1012 case E_NSPACE:
1013
1014 // Give up if out of input
1015 if (search.isEnd(idx))
1016 {
1017 return -1;
1018 }
1019
1020 char c = search.charAt(idx);
1021
1022 // Switch on escape
1023 switch (opdata)
1024 {
1025 case E_ALNUM:
1026 case E_NALNUM:
1027 if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
1028 {
1029 return -1;
1030 }
1031 break;
1032
1033 case E_DIGIT:
1034 case E_NDIGIT:
1035 if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
1036 {
1037 return -1;
1038 }
1039 break;
1040
1041 case E_SPACE:
1042 case E_NSPACE:
1043 if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
1044 {
1045 return -1;
1046 }
1047 break;
1048 }
1049 idx++;
1050 break;
1051
1052 default:
1053 internalError("Unrecognized escape '" + opdata + "'");
1054 }
1055 break;
1056
1057 case OP_ANY:
1058
1059 if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
1060 // Match anything
1061 if (search.isEnd(idx))
1062 {
1063 return -1;
1064 }
1065 }
1066 else
1067 {
1068 // Match anything but a newline
1069 if (search.isEnd(idx) || isNewline(idx))
1070 {
1071 return -1;
1072 }
1073 }
1074 idx++;
1075 break;
1076
1077 case OP_ATOM:
1078 {
1079 // Match an atom value
1080 if (search.isEnd(idx))
1081 {
1082 return -1;
1083 }
1084
1085 // Get length of atom and starting index
1086 int lenAtom = opdata;
1087 int startAtom = node + nodeSize;
1088
1089 // Give up if not enough input remains to have a match
1090 if (search.isEnd(lenAtom + idx - 1))
1091 {
1092 return -1;
1093 }
1094
1095 // Match atom differently depending on casefolding flag
1096 final boolean caseFold =
1097 ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
1098
1099 for (int i = 0; i < lenAtom; i++)
1100 {
1101 if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
1102 {
1103 return -1;
1104 }
1105 }
1106 }
1107 break;
1108
1109 case OP_POSIXCLASS:
1110 {
1111 // Out of input?
1112 if (search.isEnd(idx))
1113 {
1114 return -1;
1115 }
1116
1117 switch (opdata)
1118 {
1119 case POSIX_CLASS_ALNUM:
1120 if (!Character.isLetterOrDigit(search.charAt(idx)))
1121 {
1122 return -1;
1123 }
1124 break;
1125
1126 case POSIX_CLASS_ALPHA:
1127 if (!Character.isLetter(search.charAt(idx)))
1128 {
1129 return -1;
1130 }
1131 break;
1132
1133 case POSIX_CLASS_DIGIT:
1134 if (!Character.isDigit(search.charAt(idx)))
1135 {
1136 return -1;
1137 }
1138 break;
1139
1140 case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
1141 if (!Character.isSpaceChar(search.charAt(idx)))
1142 {
1143 return -1;
1144 }
1145 break;
1146
1147 case POSIX_CLASS_SPACE:
1148 if (!Character.isWhitespace(search.charAt(idx)))
1149 {
1150 return -1;
1151 }
1152 break;
1153
1154 case POSIX_CLASS_CNTRL:
1155 if (Character.getType(search.charAt(idx)) != Character.CONTROL)
1156 {
1157 return -1;
1158 }
1159 break;
1160
1161 case POSIX_CLASS_GRAPH: // JWL - bugbug???
1162 switch (Character.getType(search.charAt(idx)))
1163 {
1164 case Character.MATH_SYMBOL:
1165 case Character.CURRENCY_SYMBOL:
1166 case Character.MODIFIER_SYMBOL:
1167 case Character.OTHER_SYMBOL:
1168 break;
1169
1170 default:
1171 return -1;
1172 }
1173 break;
1174
1175 case POSIX_CLASS_LOWER:
1176 if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
1177 {
1178 return -1;
1179 }
1180 break;
1181
1182 case POSIX_CLASS_UPPER:
1183 if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
1184 {
1185 return -1;
1186 }
1187 break;
1188
1189 case POSIX_CLASS_PRINT:
1190 if (Character.getType(search.charAt(idx)) == Character.CONTROL)
1191 {
1192 return -1;
1193 }
1194 break;
1195
1196 case POSIX_CLASS_PUNCT:
1197 {
1198 int type = Character.getType(search.charAt(idx));
1199 switch(type)
1200 {
1201 case Character.DASH_PUNCTUATION:
1202 case Character.START_PUNCTUATION:
1203 case Character.END_PUNCTUATION:
1204 case Character.CONNECTOR_PUNCTUATION:
1205 case Character.OTHER_PUNCTUATION:
1206 break;
1207
1208 default:
1209 return -1;
1210 }
1211 }
1212 break;
1213
1214 case POSIX_CLASS_XDIGIT: // JWL - bugbug??
1215 {
1216 boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
1217 (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
1218 (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
1219 if (!isXDigit)
1220 {
1221 return -1;
1222 }
1223 }
1224 break;
1225
1226 case POSIX_CLASS_JSTART:
1227 if (!Character.isJavaIdentifierStart(search.charAt(idx)))
1228 {
1229 return -1;
1230 }
1231 break;
1232
1233 case POSIX_CLASS_JPART:
1234 if (!Character.isJavaIdentifierPart(search.charAt(idx)))
1235 {
1236 return -1;
1237 }
1238 break;
1239
1240 default:
1241 internalError("Bad posix class");
1242 break;
1243 }
1244
1245 // Matched.
1246 idx++;
1247 }
1248 break;
1249
1250 case OP_ANYOF:
1251 {
1252 // Out of input?
1253 if (search.isEnd(idx))
1254 {
1255 return -1;
1256 }
1257
1258 // Get character to match against character class and maybe casefold
1259 char c = search.charAt(idx);
1260 boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1261 // Loop through character class checking our match character
1262 int idxRange = node + nodeSize;
1263 int idxEnd = idxRange + (opdata * 2);
1264 boolean match = false;
1265 for (int i = idxRange; !match && i < idxEnd; )
1266 {
1267 // Get start, end and match characters
1268 char s = instruction[i++];
1269 char e = instruction[i++];
1270
1271 match = ((compareChars(c, s, caseFold) >= 0)
1272 && (compareChars(c, e, caseFold) <= 0));
1273 }
1274
1275 // Fail if we didn't match the character class
1276 if (!match)
1277 {
1278 return -1;
1279 }
1280 idx++;
1281 }
1282 break;
1283
1284 case OP_BRANCH:
1285 {
1286 // Check for choices
1287 if (instruction[next + offsetOpcode] != OP_BRANCH)
1288 {
1289 // If there aren't any other choices, just evaluate this branch.
1290 node += nodeSize;
1291 continue;
1292 }
1293
1294 // Try all available branches
1295 short nextBranch;
1296 do
1297 {
1298 // Try matching the branch against the string
1299 if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
1300 {
1301 return idxNew;
1302 }
1303
1304 // Go to next branch (if any)
1305 nextBranch = (short)instruction[node + offsetNext];
1306 node += nextBranch;
1307 }
1308 while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
1309
1310 // Failed to match any branch!
1311 return -1;
1312 }
1313
1314 case OP_NOTHING:
1315 case OP_GOTO:
1316
1317 // Just advance to the next node without doing anything
1318 break;
1319
1320 case OP_END:
1321
1322 // Match has succeeded!
1323 setParenEnd(0, idx);
1324 return idx;
1325
1326 default:
1327
1328 // Corrupt program
1329 internalError("Invalid opcode '" + opcode + "'");
1330 }
1331
1332 // Advance to the next node in the program
1333 node = next;
1334 }
1335
1336 // We "should" never end up here
1337 internalError("Corrupt program");
1338 return -1;
1339 }
1340
1341 /**
1342 * Match the current regular expression program against the current
1343 * input string, starting at index i of the input string. This method
1344 * is only meant for internal use.
1345 *
1346 * @param i The input string index to start matching at
1347 * @return True if the input matched the expression
1348 */
1349 protected boolean matchAt(int i)
1350 {
1351 // Initialize start pointer, paren cache and paren count
1352 start0 = -1;
1353 end0 = -1;
1354 start1 = -1;
1355 end1 = -1;
1356 start2 = -1;
1357 end2 = -1;
1358 startn = null;
1359 endn = null;
1360 parenCount = 1;
1361 setParenStart(0, i);
1362
1363 // Allocate backref arrays (unless optimizations indicate otherwise)
1364 if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
1365 {
1366 startBackref = new int[maxParen];
1367 endBackref = new int[maxParen];
1368 }
1369
1370 // Match against string
1371 int idx;
1372 if ((idx = matchNodes(0, maxNode, i)) != -1)
1373 {
1374 setParenEnd(0, idx);
1375 return true;
1376 }
1377
1378 // Didn't match
1379 parenCount = 0;
1380 return false;
1381 }
1382
1383 /**
1384 * Matches the current regular expression program against a character array,
1385 * starting at a given index.
1386 *
1387 * @param search String to match against
1388 * @param i Index to start searching at
1389 * @return True if string matched
1390 */
1391 public boolean match(String search, int i)
1392 {
1393 return match(new StringCharacterIterator(search), i);
1394 }
1395
1396 /**
1397 * Matches the current regular expression program against a character array,
1398 * starting at a given index.
1399 *
1400 * @param search String to match against
1401 * @param i Index to start searching at
1402 * @return True if string matched
1403 */
1404 public boolean match(CharacterIterator search, int i)
1405 {
1406 // There is no compiled program to search with!
1407 if (program == null)
1408 {
1409 // This should be uncommon enough to be an error case rather
1410 // than an exception (which would have to be handled everywhere)
1411 internalError("No RE program to run!");
1412 }
1413
1414 // Save string to search
1415 this.search = search;
1416
1417 // Can we optimize the search by looking for new lines?
1418 if ((program.flags & REProgram.OPT_HASBOL) == REProgram.OPT_HASBOL)
1419 {
1420 // Non multi-line matching with BOL: Must match at '0' index
1421 if ((matchFlags & MATCH_MULTILINE) == 0)
1422 {
1423 return i == 0 && matchAt(i);
1424 }
1425
1426 // Multi-line matching with BOL: Seek to next line
1427 for ( ;! search.isEnd(i); i++)
1428 {
1429 // Skip if we are at the beginning of the line
1430 if (isNewline(i))
1431 {
1432 continue;
1433 }
1434
1435 // Match at the beginning of the line
1436 if (matchAt(i))
1437 {
1438 return true;
1439 }
1440
1441 // Skip to the end of line
1442 for ( ;! search.isEnd(i); i++)
1443 {
1444 if (isNewline(i))
1445 {
1446 break;
1447 }
1448 }
1449 }
1450
1451 return false;
1452 }
1453
1454 // Can we optimize the search by looking for a prefix string?
1455 if (program.prefix == null)
1456 {
1457 // Unprefixed matching must try for a match at each character
1458 for ( ;! search.isEnd(i - 1); i++)
1459 {
1460 // Try a match at index i
1461 if (matchAt(i))
1462 {
1463 return true;
1464 }
1465 }
1466 return false;
1467 }
1468 else
1469 {
1470 // Prefix-anchored matching is possible
1471 boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
1472 char[] prefix = program.prefix;
1473 for ( ; !search.isEnd(i + prefix.length - 1); i++)
1474 {
1475 int j = i;
1476 int k = 0;
1477
1478 boolean match;
1479 do {
1480 // If there's a mismatch of any character in the prefix, give up
1481 match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
1482 } while (match && k < prefix.length);
1483
1484 // See if the whole prefix string matched
1485 if (k == prefix.length)
1486 {
1487 // We matched the full prefix at firstChar, so try it
1488 if (matchAt(i))
1489 {
1490 return true;
1491 }
1492 }
1493 }
1494 return false;
1495 }
1496 }
1497
1498 /**
1499 * Matches the current regular expression program against a String.
1500 *
1501 * @param search String to match against
1502 * @return True if string matched
1503 */
1504 public boolean match(String search)
1505 {
1506 return match(search, 0);
1507 }
1508
1509 /**
1510 * Splits a string into an array of strings on regular expression boundaries.
1511 * This function works the same way as the Perl function of the same name.
1512 * Given a regular expression of "[ab]+" and a string to split of
1513 * "xyzzyababbayyzabbbab123", the result would be the array of Strings
1514 * "[xyzzy, yyz, 123]".
1515 *
1516 * <p>Please note that the first string in the resulting array may be an empty
1517 * string. This happens when the very first character of input string is
1518 * matched by the pattern.
1519 *
1520 * @param s String to split on this regular exression
1521 * @return Array of strings
1522 */
1523 public String[] split(String s)
1524 {
1525 // Create new vector
1526 Vector v = new Vector()