Source code: com/memoire/re/RE.java
1
2
3 package com.memoire.re;
4 import com.memoire.re.*;
5
6 /*
7 * gnu/regexp/RE.java
8 * Copyright (C) 1998 Wes Biggs
9 *
10 * This library is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU Library General Public License as published
12 * by the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This library is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Library General Public License for more details.
19 *
20 * You should have received a copy of the GNU Library General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25
26 import java.io.InputStream;
27 import java.util.Vector;
28
29 class IntPair {
30 public int first, second;
31 }
32
33 class CharUnit {
34 public char ch;
35 public boolean bk;
36 }
37
38 /**
39 * RE provides the user interface for compiling and matching regular
40 * expressions.
41 * <P>
42 * A regular expression object (class RE) is compiled by constructing it
43 * from a String, StringBuffer or character array, with optional
44 * compilation flags (below)
45 * and an optional syntax specification (see RESyntax; if not specified,
46 * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
47 * <P>
48 * Various methods attempt to match input text against a compiled
49 * regular expression. These methods are:
50 * <LI><code>isMatch</code>: returns true if the input text in its entirety
51 * matches the regular expression pattern.
52 * <LI><code>getMatch</code>: returns the first match found in the input text,
53 * or null if no match is found.
54 * <LI><code>getAllMatches</code>: returns an array of all non-overlapping
55 * matches found in the input text. If no matches are found, the array is
56 * zero-length.
57 * <LI><code>substitute</code>: substitute the first occurence of the pattern
58 * in the input text with a replacement string (which may include
59 * metacharacters $0-$9, see REMatch.substituteInto).
60 * <LI><code>substituteAll</code>: same as above, but repeat for each match
61 * before returning.
62 * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object
63 * that allows iteration over the matches (see REMatchEnumeration for some
64 * reasons why you may want to do this instead of using <code>getAllMatches</code>.
65 * <P>
66 * These methods all have similar argument lists. The input can be a
67 * String, a character array, a StringBuffer or an InputStream of some sort.
68 * Note that
69 * when using an InputStream, the stream read position cannot be guaranteed
70 * after attempting a match (this is not a bug, but a consequence of the way
71 * regular expressions work). Using an REMatchEnumeration can eliminate most
72 * positioning problems.
73 * <P>
74 * The optional index argument specifies the offset from the beginning of the
75 * text at which the search should start (see the descriptions of some of
76 * the execution flags for how this can affect positional pattern operators).
77 * For an InputStream, this means an offset from the current read position,
78 * so subsequent calls with the same index argument on an InputStream will not
79 * necessarily be accessing the same position on the stream, whereas repeated
80 * searches at a given index in a fixed string will return consistent
81 * results.
82 * <P>
83 * You can optionally affect the execution environment by using a
84 * combination of execution flags (constants listed below).
85 *
86 * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
87 * @version 1.0.8, 21 March 1999
88 */
89
90 public class RE extends REToken {
91 // This String will be returned by getVersion()
92 private static final String s_version = "1.0.8";
93
94 // These are, respectively, the first and last tokens in our linked list
95 // If there is only one token, firstToken == lastToken
96 private REToken firstToken, lastToken;
97
98 // This is the number of subexpressions in this regular expression,
99 // with a minimum value of zero. Returned by getNumSubs()
100 private int m_numSubs;
101
102 /**
103 * Compilation flag. Do not differentiate case. Subsequent
104 * searches using this RE will be case insensitive.
105 */
106 public static final int REG_ICASE = 2;
107
108 /**
109 * Compilation flag. The match-any-character operator (dot)
110 * will match a newline character. When set this overrides the syntax
111 * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
112 * the "/s" operator in Perl.
113 */
114 public static final int REG_DOT_NEWLINE = 4;
115
116 /**
117 * Compilation flag. Use multiline mode. In this mode, the ^ and $
118 * anchors will match based on newlines within the input. This is
119 * equivalent to the "/m" operator in Perl.
120 */
121 public static final int REG_MULTILINE = 8;
122
123 /**
124 * Execution flag.
125 * The match-beginning operator (^) will not match at the beginning
126 * of the input string. Useful for matching on a substring when you
127 * know the context of the input is such that position zero of the
128 * input to the match test is not actually position zero of the text.
129 * <P>
130 * This example demonstrates the results of various ways of matching on
131 * a substring.
132 * <P>
133 * <CODE>
134 * String s = "food bar fool";<BR>
135 * RE exp = new RE("^foo.");<BR>
136 * REMatch m0 = exp.getMatch(s);<BR>
137 * REMatch m1 = exp.getMatch(s.substring(8));<BR>
138 * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
139 * REMatch m3 = exp.getMatch(s,8); <BR>
140 * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX); <BR>
141 * <P>
142 * // Results:<BR>
143 * // m0 = "food"<BR>
144 * // m1 = "fool"<BR>
145 * // m2 = null<BR>
146 * // m3 = null<BR>
147 * // m4 = "fool"<BR>
148 * </CODE>
149 */
150 public static final int REG_NOTBOL = 16;
151
152 /**
153 * Execution flag.
154 * The match-end operator ($) does not match at the end
155 * of the input string. Useful for matching on substrings.
156 */
157 public static final int REG_NOTEOL = 32;
158
159 /**
160 * Execution flag.
161 * The match-beginning operator (^) matches not at position 0
162 * in the input string, but at the position the search started at
163 * (based on the index input given to the getMatch function). See
164 * the example under REG_NOTBOL.
165 */
166 public static final int REG_ANCHORINDEX = 64;
167
168 /** Returns a string representing the version of the gnu.regexp package. */
169 public static final String version() {
170 return s_version;
171 }
172
173 /**
174 * Constructs a regular expression pattern buffer without any compilation
175 * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
176 *
177 * @param pattern A regular expression pattern, in the form of a String,
178 * StringBuffer or char[].
179 * @exception REException The input pattern could not be parsed.
180 * @exception IllegalArgumentException The pattern was not a String,
181 * StringBuffer or char[].
182 * @exception NullPointerException The pattern was null.
183 */
184 public RE(Object pattern) throws REException {
185 this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
186 }
187
188 /**
189 * Constructs a regular expression pattern buffer using the specified
190 * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
191 *
192 * @param pattern A regular expression pattern, in the form of a String,
193 * StringBuffer, or char[].
194 * @param cflags The logical OR of any combination of the compilation flags listed above.
195 * @exception REException The input pattern could not be parsed.
196 * @exception IllegalArgumentException The pattern was not a String,
197 * StringBuffer or char[].
198 * @exception NullPointerException The pattern was null.
199 */
200 public RE(Object pattern, int cflags) throws REException {
201 this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
202 }
203
204 /**
205 * Constructs a regular expression pattern buffer using the specified
206 * compilation flags and regular expression syntax.
207 *
208 * @param pattern A regular expression pattern, in the form of a String,
209 * StringBuffer, or char[].
210 * @param cflags The logical OR of any combination of the compilation flags listed above.
211 * @param syntax The type of regular expression syntax to use.
212 * @exception REException The input pattern could not be parsed.
213 * @exception IllegalArgumentException The pattern was not a String,
214 * StringBuffer or char[].
215 * @exception NullPointerException The pattern was null.
216 */
217 public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
218 this(pattern,cflags,syntax,0,0);
219 }
220
221 // internal constructor used for alternation
222 private RE(REToken f_first, REToken f_last,int f_subs, int f_subIndex) {
223 super(f_subIndex); // ???
224 firstToken = f_first;
225 lastToken = f_last;
226 m_numSubs = f_subs;
227 }
228
229 // Actual constructor implementation
230 private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
231 super(myIndex); // Subexpression index of this token.
232 char[] pattern;
233 if (patternObj instanceof String) {
234 pattern = ((String) patternObj).toCharArray();
235 } else if (patternObj instanceof char[]) {
236 pattern = (char[]) patternObj;
237 } else if (patternObj instanceof StringBuffer) {
238 pattern = new char [((StringBuffer) patternObj).length()];
239 ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
240 } else throw new IllegalArgumentException("Invalid class for pattern");
241
242 int pLength = pattern.length;
243
244 m_numSubs = 0; // Number of subexpressions in this token.
245 Vector branches = null;
246
247 // linked list of tokens (sort of -- some closed loops can exist)
248 firstToken = lastToken = null;
249
250 // Precalculate these so we don't pay for the math every time we
251 // need to access them.
252 boolean insens = ((cflags & REG_ICASE) > 0);
253
254 // Parse pattern into tokens. Does anyone know if it's more efficient
255 // to use char[] than a String.charAt()? I'm assuming so.
256
257 // index tracks the position in the char array
258 int index = 0;
259
260 // this will be the current parse character (pattern[index])
261 CharUnit unit = new CharUnit();
262
263 // This is used for {x,y} calculations
264 IntPair minMax = new IntPair();
265
266 // Buffer a token so we can create a TokenRepeated, etc.
267 REToken currentToken = null;
268 char ch;
269
270 while (index < pLength) {
271 // read the next character unit (including backslash escapes)
272 index = getCharUnit(pattern,index,unit);
273
274 // ALTERNATION OPERATOR
275 // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
276 // not available if RE_LIMITED_OPS is set
277
278 // TODO: the '\n' literal here should be a test against REToken.newline,
279 // which unfortunately may be more than a single character.
280 if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk))
281 || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !unit.bk) )
282 && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
283 // make everything up to here be a branch. create vector if nec.
284 if (branches == null) branches = new Vector();
285 addToken(currentToken);
286 branches.addElement(new RE(firstToken,lastToken,m_numSubs,m_subIndex));
287 firstToken = lastToken = currentToken = null;
288 }
289
290 // INTERVAL OPERATOR:
291 // {x} | {x,} | {x,y} (RE_INTERVALS && RE_NO_BK_BRACES)
292 // \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
293 //
294 // OPEN QUESTION:
295 // what is proper interpretation of '{' at start of string?
296
297 else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
298 if (currentToken == null) throw new REException("{ without preceding token",REException.REG_EBRACE,index);
299
300 index = getMinMax(pattern,index,minMax,syntax);
301 if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE))
302 throw new REException("repeated argument may be empty",REException.REG_BADRPT,index);
303 currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);
304 }
305
306 // LIST OPERATOR:
307 // [...] | [^...]
308
309 else if ((unit.ch == '[') && !unit.bk) {
310 Vector options = new Vector();
311 boolean negative = false;
312 char lastChar = 0;
313 if (index == pLength) throw new REException("unmatched [",REException.REG_EBRACK,index);
314
315 // Check for initial caret, negation
316 if ((ch = pattern[index]) == '^') {
317 negative = true;
318 if (++index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
319 ch = pattern[index];
320 }
321
322 // Check for leading right bracket literal
323 if (ch == ']') {
324 lastChar = ch;
325 if (++index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
326 }
327
328 while ((ch = pattern[index++]) != ']') {
329 if ((ch == '-') && (lastChar != 0)) {
330 if (index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
331 if ((ch = pattern[index]) == ']') {
332 options.addElement(new RETokenChar(m_subIndex,lastChar,insens));
333 lastChar = '-';
334 } else {
335 options.addElement(new RETokenRange(m_subIndex,lastChar,ch,insens));
336 lastChar = 0;
337 index++;
338 }
339 } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
340 if (index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
341 int posixID = -1;
342 boolean negate = false;
343 if (syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
344 switch (pattern[index]) {
345 case 'D':
346 negate = true;
347 case 'd':
348 posixID = RETokenPOSIX.DIGIT;
349 break;
350 case 'S':
351 negate = true;
352 case 's':
353 posixID = RETokenPOSIX.SPACE;
354 break;
355 case 'W':
356 negate = true;
357 case 'w':
358 posixID = RETokenPOSIX.ALNUM;
359 break;
360 }
361 }
362 if (lastChar != 0) options.addElement(new RETokenChar(m_subIndex,lastChar,insens));
363
364 if (posixID != -1) {
365 options.addElement(new RETokenPOSIX(m_subIndex,posixID,insens,negate));
366 } else {
367 lastChar = pattern[index];
368 }
369 ++index;
370 } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (pattern[index] == ':')) {
371 StringBuffer posixSet = new StringBuffer();
372 index = getPosixSet(pattern,index+1,posixSet);
373 int posixId = RETokenPOSIX.intValue(posixSet.toString());
374 if (posixId != -1)
375 options.addElement(new RETokenPOSIX(m_subIndex,posixId,insens,false));
376 } else {
377 if (lastChar != 0) options.addElement(new RETokenChar(m_subIndex,lastChar,insens));
378 lastChar = ch;
379 }
380 if (index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
381 } // while in list
382 // Out of list, index is one past ']'
383
384 if (lastChar != 0) options.addElement(new RETokenChar(m_subIndex,lastChar,insens));
385
386 // Create a new RETokenOneOf
387 addToken(currentToken);
388 options.trimToSize();
389 currentToken = new RETokenOneOf(m_subIndex,options,negative);
390 }
391
392 // SUBEXPRESSIONS
393 // (...) | \(...\) depending on RE_NO_BK_PARENS
394
395 else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
396 boolean pure = false;
397 boolean comment = false;
398 if ((index+1 < pLength) && (pattern[index] == '?')) {
399 switch (pattern[index+1]) {
400 case ':':
401 if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
402 pure = true;
403 index += 2;
404 }
405 break;
406 case '#':
407 if (syntax.get(RESyntax.RE_COMMENTS)) {
408 comment = true;
409 }
410 break;
411 }
412 }
413
414 // find end of subexpression
415 int endIndex = index;
416 int nextIndex = index;
417 int nested = 0;
418
419 while ( ((nextIndex = getCharUnit(pattern,endIndex,unit)) > 0)
420 && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) )
421 if ((endIndex = nextIndex) >= pLength)
422 throw new REException("no end of subexpression",REException.REG_ESUBREG,index-1);
423 else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
424 nested++;
425 else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
426 nested--;
427
428 // endIndex is now position at a ')','\)'
429 // nextIndex is end of string or position after ')' or '\)'
430
431 if (comment) index = nextIndex;
432 else { // not a comment
433 // create RE subexpression as token.
434 addToken(currentToken);
435 if (!pure) {
436 nextSub++;
437 m_numSubs++;
438 }
439
440 int useIndex = pure ? 0 : nextSub;
441
442 currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub);
443 nextSub += ((RE) currentToken).getNumSubs();
444 m_numSubs += ((RE) currentToken).getNumSubs();
445 index = nextIndex;
446 } // not a comment
447 } // subexpression
448
449 // UNMATCHED RIGHT PAREN
450 // ) or \)? need to implement throw exception if
451 // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
452 else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
453 throw new REException("unmatched right paren",REException.REG_EPAREN,index);
454 }
455
456 // START OF LINE OPERATOR
457 // ^
458
459 else if ((unit.ch == '^') && !unit.bk) {
460 addToken(currentToken);
461 currentToken = null;
462 addToken(new RETokenStart(m_subIndex,(cflags & REG_MULTILINE) > 0));
463 }
464
465 // END OF LINE OPERATOR
466 // $
467
468 else if ((unit.ch == '$') && !unit.bk) {
469 addToken(currentToken);
470 currentToken = null;
471 addToken(new RETokenEnd(m_subIndex,(cflags & REG_MULTILINE) > 0));
472 }
473
474 // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
475 // .
476
477 else if ((unit.ch == '.') && !unit.bk) {
478 addToken(currentToken);
479 currentToken = new RETokenAny(m_subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
480 }
481
482 // ZERO-OR-MORE REPEAT OPERATOR
483 // *
484
485 else if ((unit.ch == '*') && !unit.bk) {
486 if ((currentToken == null) || (currentToken.getMinimumLength() == 0))
487 throw new REException("repeated argument may be empty",REException.REG_BADRPT,index);
488 currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
489 }
490
491 // ONE-OR-MORE REPEAT OPERATOR
492 // + | \+ depending on RE_BK_PLUS_QM
493 // not available if RE_LIMITED_OPS is set
494
495 else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
496 if ((currentToken == null) || (currentToken.getMinimumLength() == 0))
497 throw new REException("repeated argument may be empty",REException.REG_BADRPT,index);
498 currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
499 }
500
501 // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
502 // ? | \? depending on RE_BK_PLUS_QM
503 // not available if RE_LIMITED_OPS is set
504 // stingy matching if RE_STINGY_OPS is set and it follows a quantifier
505
506 else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
507 if (currentToken == null) throw new REException("? without preceding token",REException.REG_BADRPT,index);
508
509 // Check for stingy matching on RETokenRepeated
510 if ((currentToken instanceof RETokenRepeated) && (syntax.get(RESyntax.RE_STINGY_OPS)))
511 ((RETokenRepeated) currentToken).makeStingy();
512 else
513 currentToken = setRepeated(currentToken,0,1,index);
514 }
515
516 // BACKREFERENCE OPERATOR
517 // \1 \2 \3 \4 ...
518 // not available if RE_NO_BK_REFS is set
519
520 else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
521 addToken(currentToken);
522 currentToken = new RETokenBackRef(m_subIndex,Character.digit(unit.ch,10),insens);
523 }
524
525 // START OF STRING OPERATOR
526 // \A if RE_STRING_ANCHORS is set
527
528 else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
529 addToken(currentToken);
530 currentToken = new RETokenStart(m_subIndex,false);
531 }
532
533 // DIGIT OPERATOR
534 // \d if RE_CHAR_CLASS_ESCAPES is set
535
536 else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
537 addToken(currentToken);
538 currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.DIGIT,insens,false);
539 }
540
541 // NON-DIGIT OPERATOR
542 // \D
543
544 else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
545 addToken(currentToken);
546 currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.DIGIT,insens,true);
547 }
548
549 // NEWLINE ESCAPE
550 // \n
551
552 else if (unit.bk && (unit.ch == 'n')) {
553 addToken(currentToken);
554 currentToken = new RETokenChar(m_subIndex,'\n',false);
555 }
556
557 // RETURN ESCAPE
558 // \r
559
560 else if (unit.bk && (unit.ch == 'r')) {
561 addToken(currentToken);
562 currentToken = new RETokenChar(m_subIndex,'\r',false);
563 }
564
565 // WHITESPACE OPERATOR
566 // \s if RE_CHAR_CLASS_ESCAPES is set
567
568 else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
569 addToken(currentToken);
570 currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.SPACE,insens,false);
571 }
572
573 // NON-WHITESPACE OPERATOR
574 // \S
575
576 else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
577 addToken(currentToken);
578 currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.SPACE,insens,true);
579 }
580
581 // TAB ESCAPE
582 // \t
583
584 else if (unit.bk && (unit.ch == 't')) {
585 addToken(currentToken);
586 currentToken = new RETokenChar(m_subIndex,'\t',false);
587 }
588
589 // ALPHANUMERIC OPERATOR
590 // \w
591
592 else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
593 addToken(currentToken);
594 currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.ALNUM,insens,false);
595 }
596
597 // NON-ALPHANUMERIC OPERATOR
598 // \W
599
600 else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
601 addToken(currentToken);
602 currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.ALNUM,insens,true);
603 }
604
605 // END OF STRING OPERATOR
606 // \Z
607
608 else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
609 addToken(currentToken);
610 currentToken = new RETokenEnd(m_subIndex,false);
611 }
612
613 // NON-SPECIAL CHARACTER (or escape to make literal)
614 // c | \* for example
615
616 else { // not a special character
617 addToken(currentToken);
618 currentToken = new RETokenChar(m_subIndex,unit.ch,insens);
619 }
620 } // end while
621
622 // Add final buffered token if applicable
623 addToken(currentToken);
624
625 if (branches != null) {
626 branches.addElement(new RE(firstToken,lastToken,m_numSubs,m_subIndex));
627 branches.trimToSize(); // compact the Vector
628 firstToken = lastToken = new RETokenOneOf(m_subIndex,branches,false);
629 }
630 }
631
632 private static int getCharUnit(char[] input, int index, CharUnit unit) throws REException {
633 unit.ch = input[index++];
634 if (unit.bk = (unit.ch == '\\'))
635 if (index < input.length)
636 unit.ch = input[index++];
637 else throw new REException("\\ at end of pattern.",REException.REG_ESCAPE,index);
638 return index;
639 }
640
641 /**
642 * Checks if the input in its entirety is an exact match of
643 * this regular expression.
644 *
645 * @param input The input text.
646 * @exception IllegalArgumentException The input text was not a String, char[], or InputStream.
647 */
648 public boolean isMatch(Object input) {
649 return isMatch(input,0,0);
650 }
651
652 /**
653 * Checks if the input string, starting from index, is an exact match of
654 * this regular expression.
655 *
656 * @param input The input text.
657 * @param index The offset index at which the search should be begin.
658 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
659 */
660 public boolean isMatch(Object input,int index) {
661 return isMatch(input,index,0);
662 }
663
664
665 /**
666 * Checks if the input, starting from index and using the specified
667 * execution flags, is an exact match of this regular expression.
668 *
669 * @param input The input text.
670 * @param index The offset index at which the search should be begin.
671 * @param eflags The logical OR of any execution flags above.
672 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
673 */
674 public boolean isMatch(Object input,int index,int eflags) {
675 return isMatchImpl(makeRECharIndexed(input,index),index,eflags);
676 }
677
678 private boolean isMatchImpl(RECharIndexed input, int index, int eflags) {
679 if (firstToken == null) // Trivial case
680 return (input.charAt(0) == RECharIndexed.OUT_OF_BOUNDS);
681 int[] i = firstToken.match(input,0,eflags,new REMatch(m_numSubs,index));
682 return (i != null) && (input.charAt(i[0]) == RECharIndexed.OUT_OF_BOUNDS);
683 }
684
685 /**
686 * Returns the maximum number of subexpressions in this regular expression.
687 * If the expression contains branches, the value returned will be the
688 * maximum subexpressions in any of the branches.
689 */
690 public int getNumSubs() {
691 return m_numSubs;
692 }
693
694 // Overrides REToken.setUncle
695 void setUncle(REToken f_uncle) {
696 lastToken.setUncle(f_uncle);
697 }
698
699 // Overrides REToken.chain
700 boolean chain(REToken f_next) {
701 super.chain(f_next);
702 if (lastToken != null) lastToken.setUncle(f_next);
703 return true;
704 }
705
706 /**
707 * Returns the minimum number of characters that could possibly
708 * constitute a match of this regular expression.
709 */
710 public int getMinimumLength() {
711 int min = 0;
712 REToken t = firstToken;
713 if (t == null) return 0;
714 do {
715 min += t.getMinimumLength();
716 } while ((t = t.m_next) != null);
717 return min;
718 }
719
720 /**
721 * Returns an array of all matches found in the input.
722 *
723 * @param input The input text.
724 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
725 */
726 public REMatch[] getAllMatches(Object input) {
727 return getAllMatches(input,0,0);
728 }
729
730 /**
731 * Returns an array of all matches found in the input,
732 * beginning at the specified index position.
733 *
734 * @param input The input text.
735 * @param index The offset index at which the search should be begin.
736 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
737 */
738 public REMatch[] getAllMatches(Object input, int index) {
739 return getAllMatches(input,index,0);
740 }
741
742 /**
743 * Returns an array of all matches found in the input string,
744 * beginning at the specified index position and using the specified
745 * execution flags.
746 *
747 * @param input The input text.
748 * @param index The offset index at which the search should be begin.
749 * @param eflags The logical OR of any execution flags above.
750 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
751 */
752 public REMatch[] getAllMatches(Object input, int index, int eflags) {
753 return getAllMatchesImpl(makeRECharIndexed(input,index),index,eflags);
754 }
755
756 // this has been changed since 1.03 to be non-overlapping matches
757 private REMatch[] getAllMatchesImpl(RECharIndexed input, int index, int eflags) {
758 Vector all = new Vector();
759 REMatch m = null;
760 while ((m = getMatchImpl(input,index,eflags,null)) != null) {
761 all.addElement(m);
762 index = m.getEndIndex();
763 if (m.end[0] == 0) { // handle pathological case of zero-length match
764 index++;
765 input.move(1);
766 } else {
767 input.move(m.end[0]);
768 }
769 }
770 REMatch[] mset = new REMatch[all.size()];
771 all.copyInto(mset);
772 return mset;
773 }
774
775 /* Implements abstract method REToken.match() */
776 int[] match(RECharIndexed input, int index, int eflags, REMatch mymatch) {
777 if (firstToken == null) return new int[] { index }; // Trivial case
778 /*
779 if ((mymatch.start[m_subIndex] == -1)
780 || (mymatch.start[m_subIndex] > index))
781 */
782 int oldstart = mymatch.start[m_subIndex];
783 mymatch.start[m_subIndex] = index;
784 int[] newIndex = firstToken.match(input,index,eflags,mymatch);
785 if (newIndex == null) {
786 mymatch.start[m_subIndex] = oldstart;
787 } else {
788 // If this match succeeded, then whole rest of string is good,
789 // and newIndex[0] is the end of the match AT THIS LEVEL
790
791 // We need to make list of all possible nexts.
792 int[] doables = new int[0];
793 int[] thisResult;
794 for (int i = 0; i < newIndex.length; i++) {
795 thisResult = next(input,newIndex[i],eflags,mymatch);
796 if (thisResult != null) {
797 int[] temp = new int[doables.length + thisResult.length];
798 System.arraycopy(doables,0,temp,0,doables.length);
799 for (int j = 0; j < thisResult.length; j++) {
800 temp[doables.length + j] = thisResult[j];
801 }
802 doables = temp;
803 }
804 }
805 return (doables.length == 0) ? null : doables;
806 }
807 return null;
808 }
809
810 /**
811 * Returns the first match found in the input.
812 *
813 * @param input The input text.
814 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
815 */
816 public REMatch getMatch(Object input) {
817 return getMatch(input,0,0);
818 }
819
820 /**
821 * Returns the first match found in the input, beginning
822 * the search at the specified index.
823 *
824 * @param input The input text.
825 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
826 */
827 public REMatch getMatch(Object input, int index) {
828 return getMatch(input,index,0);
829 }
830
831 /**
832 * Returns the first match found in the input, beginning
833 * the search at the specified index, and using the specified
834 * execution flags. If no match is found, returns null.
835 *
836 * @param input The input text.
837 * @param index The offset index at which the search should be begin.
838 * @param eflags The logical OR of any execution flags above.
839 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
840 */
841 public REMatch getMatch(Object input, int index, int eflags) {
842 return getMatch(input,index,eflags,null);
843 }
844
845 /**
846 * Returns the first match found in the input, beginning
847 * the search at the specified index, and using the specified
848 * execution flags. If no match is found, returns null. If a StringBuffer
849 * is provided and is non-null, the contents of the input text from the index to the
850 * beginning of the match (or to the end of the input, if there is no match)
851 * are appended to the StringBuffer.
852 *
853 * @param input The input text.
854 * @param index The offset index at which the search should be begin.
855 * @param eflags The logical OR of any execution flags above.
856 * @param buffer The StringBuffer to save pre-match text in.
857 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
858 */
859 public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
860 return getMatchImpl(makeRECharIndexed(input,index),index,eflags,buffer);
861 }
862
863 REMatch getMatchImpl(RECharIndexed input, int index, int eflags, StringBuffer buffer) {
864 // check if input is at a valid position
865 if (!input.isValid()) return null;
866 REMatch mymatch = new REMatch(m_numSubs,index);
867 do {
868 int[] result = match(input,0,eflags,mymatch);
869 if (result != null) {
870 mymatch.end[0] = result[0]; // may break leftmost longest
871 mymatch.finish(input);
872 return mymatch;
873 }
874 mymatch.clear(++index);
875 if (buffer != null) buffer.append(input.charAt(0));
876 } while (input.move(1));
877
878 return null;
879 }
880
881 /**
882 * Returns an REMatchEnumeration that can be used to iterate over the
883 * matches found in the input text.
884 *
885 * @param input The input text.
886 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
887 */
888 public REMatchEnumeration getMatchEnumeration(Object input) {
889 return getMatchEnumeration(input,0,0);
890 }
891
892
893 /**
894 * Returns an REMatchEnumeration that can be used to iterate over the
895 * matches found in the input text.
896 *
897 * @param input The input text.
898 * @param index The offset index at which the search should be begin.
899 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
900 */
901 public REMatchEnumeration getMatchEnumeration(Object input, int index) {
902 return getMatchEnumeration(input,index,0);
903 }
904
905 /**
906 * Returns an REMatchEnumeration that can be used to iterate over the
907 * matches found in the input text.
908 *
909 * @param input The input text.
910 * @param index The offset index at which the search should be begin.
911 * @param eflags The logical OR of any execution flags above.
912 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
913 */
914 public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
915 return new REMatchEnumeration(this,makeRECharIndexed(input,index),index,eflags);
916 }
917
918
919 /**
920 * Substitutes the replacement text for the first match found in the input.
921 *
922 * @param input The input text.
923 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
924 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
925 */
926 public String substitute(Object input,String replace) {
927 return substitute(input,replace,0,0);
928 }
929
930 /**
931 * Substitutes the replacement text for the first match found in the input
932 * beginning at the specified index position.
933 *
934 * @param input The input text.
935 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
936 * @param index The offset index at which the search should be begin.
937 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
938 */
939 public String substitute(Object input,String replace,int index) {
940 return substitute(input,replace,index,0);
941 }
942
943 /**
944 * Substitutes the replacement text for the first match found in the input
945 * string, beginning at the specified index position and using the
946 * specified execution flags.
947 *
948 * @param input The input text.
949 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
950 * @param index The offset index at which the search should be begin.
951 * @param eflags The logical OR of any execution flags above.
952 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
953 */
954 public String substitute(Object input,String replace,int index,int eflags) {
955 return substituteImpl(makeRECharIndexed(input,index),replace,index,eflags);
956 }
957
958 private String substituteImpl(RECharIndexed input,String replace,int index,int eflags) {
959 StringBuffer buffer = new StringBuffer();
960 REMatch m = getMatchImpl(input,index,eflags,buffer);
961 if (m==null) return buffer.toString();
962 buffer.append(m.substituteInto(replace));
963 if (input.move(m.end[0])) {
964 do {
965 buffer.append(input.charAt(0));
966 } while (input.move(1));
967 }
968 return buffer.toString();
969 }
970
971 /**
972 * Substitutes the replacement text for each non-overlapping match found
973 * in the input text.
974 *
975 * @param input The input text.
976 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
977 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
978 */
979 public String substituteAll(Object input,String replace) {
980 return substituteAll(input,replace,0,0);
981 }
982
983 /**
984 * Substitutes the replacement text for each non-overlapping match found
985 * in the input text, starting at the specified index.
986 *
987 * @param input The input text.
988 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
989 * @param index The offset index at which the search should be begin.
990 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
991 */
992 public String substituteAll(Object input,String replace,int index) {
993 return substituteAll(input,replace,index,0);
994 }
995
996 /**
997 * Substitutes the replacement text for each non-overlapping match found
998 * in the input text, starting at the specified index and using the
999 * specified execution flags.
1000 *
1001 * @param input The input text.
1002 * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1003 * @param index The offset index at which the search should be begin.
1004 * @param eflags The logical OR of any execution flags above.
1005 * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1006 */
1007 public String substituteAll(Object input,String replace,int index,int eflags) {
1008 return substituteAllImpl(makeRECharIndexed(input,index),replace,index,eflags);
1009 }
1010
1011 private String substituteAllImpl(RECharIndexed input,String replace,int index,int eflags) {
1012 StringBuffer buffer = new StringBuffer();
1013 REMatch m;
1014 while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1015 buffer.append(m.substituteInto(replace));
1016 index = m.getEndIndex();
1017 if (m.end[0] == 0) {
1018 char ch = input.charAt(0);
1019 if (ch != RECharIndexed.OUT_OF_BOUNDS)
1020 buffer.append(ch);
1021 input.move(1);
1022 } else {
1023 input.move(m.end[0]);
1024 }
1025 }
1026 return buffer.toString();
1027 }
1028
1029 /* Helper function for constructor */
1030 private void addToken(REToken next) {
1031 if (next == null) return;
1032 if (firstToken == null)
1033 lastToken = firstToken = next;
1034 else
1035 // if chain returns false, it "rejected" the token due to
1036 // an optimization, and next was combined with lastToken
1037 if (lastToken.chain(next)) lastToken = next;
1038 }
1039
1040 private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
1041 if (current == null) throw new REException("repeat preceding token",REException.REG_BADRPT,index);
1042 return new RETokenRepeated(current.m_subIndex,current,min,max);
1043 }
1044
1045 private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
1046 // Precondition: pattern[index-1] == ':'
1047 // we will return pos of closing ']'.
1048 int i;
1049 for (i=index; i<(pattern.length-1); i++) {
1050 if ((pattern[i] == ':') && (pattern[i+1] == ']'))
1051 return i+2;
1052 buf.append(pattern[i]);
1053 }
1054 return index; // didn't match up
1055 }
1056
1057 private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
1058 // Precondition: input[index-1] == '{', minMax != null
1059
1060 if (index == input.length) throw new REException("no matching brace",REException.REG_EBRACE,index);
1061
1062 int min,max=0;
1063 CharUnit unit = new CharUnit();
1064 StringBuffer buf = new StringBuffer();
1065
1066 // Read string of digits
1067 while (((index = getCharUnit(input,index,unit)) != input.length)
1068 && Character.isDigit(unit.ch))
1069 buf.append(unit.ch);
1070
1071 // Check for {} tomfoolery
1072 if (buf.length() == 0) throw new REException("bad brace construct",REException.REG_EBRACE,index);
1073
1074 min = Integer.parseInt(buf.toString());
1075
1076 if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1077 max = min;
1078 else if ((unit.ch == ',') && !unit.bk) {
1079 buf = new StringBuffer();
1080 // Read string of digits
1081 while (((index = getCharUnit(input,index,unit)) != input.length)
1082 && Character.isDigit(unit.ch))
1083 buf.append(unit.ch);
1084
1085 if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1086 throw new REException("expected end of interval",REException.REG_EBRACE,index);
1087
1088 // This is the case of {x,}
1089 if (buf.length() == 0) max = Integer.MAX_VALUE;
1090 else max = Integer.parseInt(buf.toString());
1091 } else throw new REException("invalid character in brace expression",REException.REG_EBRACE,index);
1092
1093 // We know min and max now, and they are valid.
1094
1095 minMax.first = min;
1096 minMax.second = max;
1097
1098 // return the index following the '}'
1099 return index;
1100 }
1101
1102 /**
1103 * Return a human readable form of the compiled regular expression,
1104 * useful for debugging.
1105 */
1106 public String toString() {
1107 StringBuffer sb = new StringBuffer();
1108 dump(sb);
1109 return sb.toString();
1110 }
1111
1112 void dump(StringBuffer os) {
1113 os.append('(');
1114 if (m_subIndex == 0)
1115 os.append("?:");
1116 if (firstToken != null)
1117 firstToken.dumpAll(os);
1118 os.append(')');
1119 }
1120
1121 // Cast input appropriately or throw exception
1122 private static RECharIndexed makeRECharIndexed(Object input, int index) {
1123 if (input instanceof String)
1124 return new RECharIndexedString((String) input,index);
1125 else if (input instanceof char[])
1126 return new RECharIndexedCharArray((char[]) input,index);
1127 else if (input instanceof StringBuffer)
1128 return new RECharIndexedStringBuffer((StringBuffer) input,index);
1129 else if (input instanceof InputStream)
1130 return new RECharIndexedInputStream((InputStream) input,index);
1131 else throw new IllegalArgumentException("Invalid class for input text");
1132 }
1133}