Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/memoire/re/RE.java


1   
2   
3   package com.memoire.re;
4   import com.memoire.re.*;
5   
6   /*
7    *  gnu/regexp/RE.java
8    *  Copyright (C) 1998 Wes Biggs
9    *
10   *  This library is free software; you can redistribute it and/or modify
11   *  it under the terms of the GNU Library General Public License as published
12   *  by the Free Software Foundation; either version 2 of the License, or
13   *  (at your option) any later version.
14   *
15   *  This library is distributed in the hope that it will be useful,
16   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
17   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18   *  GNU Library General Public License for more details.
19   *
20   *  You should have received a copy of the GNU Library General Public License
21   *  along with this program; if not, write to the Free Software
22   *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23   */
24  
25  
26  import java.io.InputStream;
27  import java.util.Vector;
28  
29  class IntPair {
30    public int first, second;
31  }
32  
33  class CharUnit {
34    public char ch;
35    public boolean bk;
36  }
37  
38  /**
39   * RE provides the user interface for compiling and matching regular
40   * expressions.
41   * <P>
42   * A regular expression object (class RE) is compiled by constructing it
43   * from a String, StringBuffer or character array, with optional 
44   * compilation flags (below)
45   * and an optional syntax specification (see RESyntax; if not specified,
46   * <code>RESyntax.RE_SYNTAX_PERL5</code> is used).
47   * <P>
48   * Various methods attempt to match input text against a compiled
49   * regular expression.  These methods are:
50   * <LI><code>isMatch</code>: returns true if the input text in its entirety
51   * matches the regular expression pattern.
52   * <LI><code>getMatch</code>: returns the first match found in the input text,
53   * or null if no match is found.
54   * <LI><code>getAllMatches</code>: returns an array of all non-overlapping 
55   * matches found in the input text.  If no matches are found, the array is
56   * zero-length.
57   * <LI><code>substitute</code>: substitute the first occurence of the pattern
58   * in the input text with a replacement string (which may include
59   * metacharacters $0-$9, see REMatch.substituteInto).
60   * <LI><code>substituteAll</code>: same as above, but repeat for each match
61   * before returning.
62   * <LI><code>getMatchEnumeration</code>: returns an REMatchEnumeration object
63   * that allows iteration over the matches (see REMatchEnumeration for some
64   * reasons why you may want to do this instead of using <code>getAllMatches</code>.
65   * <P>
66   * These methods all have similar argument lists.  The input can be a
67   * String, a character array, a StringBuffer or an InputStream of some sort.
68   * Note that
69   * when using an InputStream, the stream read position cannot be guaranteed
70   * after attempting a match (this is not a bug, but a consequence of the way
71   * regular expressions work).  Using an REMatchEnumeration can eliminate most
72   * positioning problems.
73   * <P>
74   * The optional index argument specifies the offset from the beginning of the
75   * text at which the search should start (see the descriptions of some of
76   * the execution flags for how this can affect positional pattern operators).
77   * For an InputStream, this means an offset from the current read position,
78   * so subsequent calls with the same index argument on an InputStream will not
79   * necessarily be accessing the same position on the stream, whereas repeated
80   * searches at a given index in a fixed string will return consistent
81   * results.
82   * <P>
83   * You can optionally affect the execution environment by using a
84   * combination of execution flags (constants listed below).
85   *
86   * @author <A HREF="mailto:wes@cacas.org">Wes Biggs</A>
87   * @version 1.0.8, 21 March 1999
88   */
89  
90  public class RE extends REToken {
91    // This String will be returned by getVersion()
92    private static final String s_version = "1.0.8";
93  
94    // These are, respectively, the first and last tokens in our linked list
95    // If there is only one token, firstToken == lastToken
96    private REToken firstToken, lastToken;
97  
98    // This is the number of subexpressions in this regular expression,
99    // with a minimum value of zero.  Returned by getNumSubs()
100   private int m_numSubs;
101 
102   /**
103    * Compilation flag. Do  not  differentiate  case.   Subsequent
104    * searches  using  this  RE will be case insensitive.
105    */
106   public static final int REG_ICASE = 2;
107 
108   /**
109    * Compilation flag. The match-any-character operator (dot)
110    * will match a newline character.  When set this overrides the syntax
111    * bit RE_DOT_NEWLINE (see RESyntax for details).  This is equivalent to
112    * the "/s" operator in Perl.
113    */
114   public static final int REG_DOT_NEWLINE = 4;
115 
116   /**
117    * Compilation flag. Use multiline mode.  In this mode, the ^ and $
118    * anchors will match based on newlines within the input. This is
119    * equivalent to the "/m" operator in Perl.
120    */
121   public static final int REG_MULTILINE = 8;
122 
123   /**
124    * Execution flag.
125    * The match-beginning operator (^) will not match at the beginning
126    * of the input string. Useful for matching on a substring when you
127    * know the context of the input is such that position zero of the
128    * input to the match test is not actually position zero of the text.
129    * <P>
130    * This example demonstrates the results of various ways of matching on
131    * a substring.
132    * <P>
133    * <CODE>
134    * String s = "food bar fool";<BR>
135    * RE exp = new RE("^foo.");<BR>
136    * REMatch m0 = exp.getMatch(s);<BR>
137    * REMatch m1 = exp.getMatch(s.substring(8));<BR>
138    * REMatch m2 = exp.getMatch(s.substring(8),0,RE.REG_NOTBOL); <BR>
139    * REMatch m3 = exp.getMatch(s,8);                            <BR>
140    * REMatch m4 = exp.getMatch(s,8,RE.REG_ANCHORINDEX);         <BR>
141    * <P>
142    * // Results:<BR>
143    * //  m0 = "food"<BR>
144    * //  m1 = "fool"<BR>
145    * //  m2 = null<BR>
146    * //  m3 = null<BR>
147    * //  m4 = "fool"<BR>
148    * </CODE>
149    */
150   public static final int REG_NOTBOL = 16;
151 
152   /**
153    * Execution flag.
154    * The match-end operator ($) does not match at the end
155    * of the input string. Useful for matching on substrings.
156    */
157   public static final int REG_NOTEOL = 32;
158 
159   /**
160    * Execution flag.
161    * The match-beginning operator (^) matches not at position 0
162    * in the input string, but at the position the search started at
163    * (based on the index input given to the getMatch function).  See
164    * the example under REG_NOTBOL.
165    */
166   public static final int REG_ANCHORINDEX = 64;
167 
168   /** Returns a string representing the version of the gnu.regexp package. */
169   public static final String version() {
170     return s_version;
171   }
172 
173   /**
174    * Constructs a regular expression pattern buffer without any compilation
175    * flags set, and using the default syntax (RESyntax.RE_SYNTAX_PERL5).
176    *
177    * @param pattern A regular expression pattern, in the form of a String,
178    *   StringBuffer or char[].
179    * @exception REException The input pattern could not be parsed.
180    * @exception IllegalArgumentException The pattern was not a String, 
181    *   StringBuffer or char[].
182    * @exception NullPointerException The pattern was null.
183    */
184   public RE(Object pattern) throws REException {
185     this(pattern,0,RESyntax.RE_SYNTAX_PERL5,0,0);
186   }
187 
188   /**
189    * Constructs a regular expression pattern buffer using the specified
190    * compilation flags and the default syntax (RESyntax.RE_SYNTAX_PERL5).
191    *
192    * @param pattern A regular expression pattern, in the form of a String,
193    *   StringBuffer, or char[].
194    * @param cflags The logical OR of any combination of the compilation flags listed above.
195    * @exception REException The input pattern could not be parsed.
196    * @exception IllegalArgumentException The pattern was not a String, 
197    *   StringBuffer or char[].
198    * @exception NullPointerException The pattern was null.
199    */
200   public RE(Object pattern, int cflags) throws REException {
201     this(pattern,cflags,RESyntax.RE_SYNTAX_PERL5,0,0);
202   }
203 
204   /**
205    * Constructs a regular expression pattern buffer using the specified
206    * compilation flags and regular expression syntax.
207    *
208    * @param pattern A regular expression pattern, in the form of a String,
209    *   StringBuffer, or char[].
210    * @param cflags The logical OR of any combination of the compilation flags listed above.
211    * @param syntax The type of regular expression syntax to use.
212    * @exception REException The input pattern could not be parsed.
213    * @exception IllegalArgumentException The pattern was not a String, 
214    *   StringBuffer or char[].
215    * @exception NullPointerException The pattern was null.
216    */
217   public RE(Object pattern, int cflags, RESyntax syntax) throws REException {
218     this(pattern,cflags,syntax,0,0);
219   }
220 
221   // internal constructor used for alternation
222   private RE(REToken f_first, REToken f_last,int f_subs, int f_subIndex) {
223     super(f_subIndex); // ???
224     firstToken = f_first;
225     lastToken = f_last;
226     m_numSubs = f_subs;
227   }
228 
229   // Actual constructor implementation
230   private RE(Object patternObj, int cflags, RESyntax syntax, int myIndex, int nextSub) throws REException {
231     super(myIndex); // Subexpression index of this token.
232     char[] pattern;
233     if (patternObj instanceof String) {
234       pattern = ((String) patternObj).toCharArray();
235     } else if (patternObj instanceof char[]) {
236       pattern = (char[]) patternObj;
237     } else if (patternObj instanceof StringBuffer) {
238       pattern = new char [((StringBuffer) patternObj).length()];
239       ((StringBuffer) patternObj).getChars(0,pattern.length,pattern,0);
240     } else throw new IllegalArgumentException("Invalid class for pattern");
241 
242     int pLength = pattern.length;
243 
244     m_numSubs = 0; // Number of subexpressions in this token.
245     Vector branches = null;
246 
247     // linked list of tokens (sort of -- some closed loops can exist)
248     firstToken = lastToken = null;
249 
250     // Precalculate these so we don't pay for the math every time we
251     // need to access them.
252     boolean insens = ((cflags & REG_ICASE) > 0);
253 
254     // Parse pattern into tokens.  Does anyone know if it's more efficient
255     // to use char[] than a String.charAt()?  I'm assuming so.
256 
257     // index tracks the position in the char array
258     int index = 0;
259 
260     // this will be the current parse character (pattern[index])
261     CharUnit unit = new CharUnit();
262 
263     // This is used for {x,y} calculations
264     IntPair minMax = new IntPair();
265 
266     // Buffer a token so we can create a TokenRepeated, etc.
267     REToken currentToken = null;
268     char ch;
269 
270     while (index < pLength) {
271       // read the next character unit (including backslash escapes)
272       index = getCharUnit(pattern,index,unit);
273 
274       // ALTERNATION OPERATOR
275       //  \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT)
276       //  not available if RE_LIMITED_OPS is set
277 
278       // TODO: the '\n' literal here should be a test against REToken.newline,
279       // which unfortunately may be more than a single character.
280       if ( ( (unit.ch == '|' && (syntax.get(RESyntax.RE_NO_BK_VBAR) ^ unit.bk))
281        || (syntax.get(RESyntax.RE_NEWLINE_ALT) && (unit.ch == '\n') && !unit.bk) )
282      && !syntax.get(RESyntax.RE_LIMITED_OPS)) {
283   // make everything up to here be a branch. create vector if nec.
284   if (branches == null) branches = new Vector();
285   addToken(currentToken);
286   branches.addElement(new RE(firstToken,lastToken,m_numSubs,m_subIndex));
287   firstToken = lastToken = currentToken = null;
288       }
289       
290       // INTERVAL OPERATOR:
291       //  {x} | {x,} | {x,y}  (RE_INTERVALS && RE_NO_BK_BRACES)
292       //  \{x\} | \{x,\} | \{x,y\} (RE_INTERVALS && !RE_NO_BK_BRACES)
293       //
294       // OPEN QUESTION: 
295       //  what is proper interpretation of '{' at start of string?
296 
297       else if ((unit.ch == '{') && syntax.get(RESyntax.RE_INTERVALS) && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)) {
298   if (currentToken == null) throw new REException("{ without preceding token",REException.REG_EBRACE,index);
299     
300   index = getMinMax(pattern,index,minMax,syntax);
301   if ((currentToken.getMinimumLength() == 0) && (minMax.second == Integer.MAX_VALUE))
302     throw new REException("repeated argument may be empty",REException.REG_BADRPT,index);
303   currentToken = setRepeated(currentToken,minMax.first,minMax.second,index);  
304       }
305       
306       // LIST OPERATOR:
307       //  [...] | [^...]
308 
309       else if ((unit.ch == '[') && !unit.bk) {
310   Vector options = new Vector();
311   boolean negative = false;
312   char lastChar = 0;
313   if (index == pLength) throw new REException("unmatched [",REException.REG_EBRACK,index);
314   
315   // Check for initial caret, negation
316   if ((ch = pattern[index]) == '^') {
317     negative = true;
318     if (++index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
319     ch = pattern[index];
320   }
321 
322   // Check for leading right bracket literal
323   if (ch == ']') {
324     lastChar = ch;
325     if (++index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
326   }
327 
328   while ((ch = pattern[index++]) != ']') {
329     if ((ch == '-') && (lastChar != 0)) {
330       if (index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
331       if ((ch = pattern[index]) == ']') {
332         options.addElement(new RETokenChar(m_subIndex,lastChar,insens));
333         lastChar = '-';
334       } else {
335         options.addElement(new RETokenRange(m_subIndex,lastChar,ch,insens));
336         lastChar = 0;
337         index++;
338       }
339           } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
340             if (index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
341       int posixID = -1;
342       boolean negate = false;
343       if (syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
344         switch (pattern[index]) {
345         case 'D':
346     negate = true;
347         case 'd':
348     posixID = RETokenPOSIX.DIGIT;
349     break;
350         case 'S':
351     negate = true;
352         case 's':
353     posixID = RETokenPOSIX.SPACE;
354     break;
355         case 'W':
356     negate = true;
357         case 'w':
358     posixID = RETokenPOSIX.ALNUM;
359     break;
360         }
361       }
362       if (lastChar != 0) options.addElement(new RETokenChar(m_subIndex,lastChar,insens));
363       
364       if (posixID != -1) {
365         options.addElement(new RETokenPOSIX(m_subIndex,posixID,insens,negate));
366       } else {
367         lastChar = pattern[index];
368       }
369       ++index;
370     } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (pattern[index] == ':')) {
371       StringBuffer posixSet = new StringBuffer();
372       index = getPosixSet(pattern,index+1,posixSet);
373       int posixId = RETokenPOSIX.intValue(posixSet.toString());
374       if (posixId != -1)
375         options.addElement(new RETokenPOSIX(m_subIndex,posixId,insens,false));
376     } else {
377       if (lastChar != 0) options.addElement(new RETokenChar(m_subIndex,lastChar,insens));
378       lastChar = ch;
379     }
380     if (index == pLength) throw new REException("no end of list",REException.REG_EBRACK,index);
381   } // while in list
382   // Out of list, index is one past ']'
383       
384   if (lastChar != 0) options.addElement(new RETokenChar(m_subIndex,lastChar,insens));
385       
386   // Create a new RETokenOneOf
387   addToken(currentToken);
388   options.trimToSize();
389   currentToken = new RETokenOneOf(m_subIndex,options,negative);
390       }
391 
392       // SUBEXPRESSIONS
393       //  (...) | \(...\) depending on RE_NO_BK_PARENS
394 
395       else if ((unit.ch == '(') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) {
396   boolean pure = false;
397   boolean comment = false;
398   if ((index+1 < pLength) && (pattern[index] == '?')) {
399     switch (pattern[index+1]) {
400     case ':':
401       if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
402         pure = true;
403         index += 2;
404       }
405       break;
406     case '#':
407       if (syntax.get(RESyntax.RE_COMMENTS)) {
408         comment = true;
409       }
410       break;
411     }
412   }
413 
414   // find end of subexpression
415   int endIndex = index;
416   int nextIndex = index;
417   int nested = 0;
418 
419   while ( ((nextIndex = getCharUnit(pattern,endIndex,unit)) > 0)
420     && !(nested == 0 && (unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk)) )
421     if ((endIndex = nextIndex) >= pLength)
422       throw new REException("no end of subexpression",REException.REG_ESUBREG,index-1);
423     else if (unit.ch == '(' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
424       nested++;
425     else if (unit.ch == ')' && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))
426       nested--;
427 
428   // endIndex is now position at a ')','\)' 
429   // nextIndex is end of string or position after ')' or '\)'
430 
431   if (comment) index = nextIndex;
432   else { // not a comment
433     // create RE subexpression as token.
434     addToken(currentToken);
435     if (!pure) {
436       nextSub++;
437       m_numSubs++;
438     }
439 
440     int useIndex = pure ? 0 : nextSub;
441 
442     currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub);
443     nextSub += ((RE) currentToken).getNumSubs();
444     m_numSubs += ((RE) currentToken).getNumSubs();
445     index = nextIndex;
446   } // not a comment
447       } // subexpression
448     
449       // UNMATCHED RIGHT PAREN
450       // ) or \)?  need to implement throw exception if
451       // !syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD)
452       else if (!syntax.get(RESyntax.RE_UNMATCHED_RIGHT_PAREN_ORD) && ((unit.ch == ')') && (syntax.get(RESyntax.RE_NO_BK_PARENS) ^ unit.bk))) {
453   throw new REException("unmatched right paren",REException.REG_EPAREN,index);
454       }
455 
456       // START OF LINE OPERATOR
457       //  ^
458 
459       else if ((unit.ch == '^') && !unit.bk) {
460   addToken(currentToken);
461   currentToken = null;
462   addToken(new RETokenStart(m_subIndex,(cflags & REG_MULTILINE) > 0));
463       }
464 
465       // END OF LINE OPERATOR
466       //  $
467 
468       else if ((unit.ch == '$') && !unit.bk) {
469   addToken(currentToken);
470   currentToken = null;
471   addToken(new RETokenEnd(m_subIndex,(cflags & REG_MULTILINE) > 0));
472       }
473 
474       // MATCH-ANY-CHARACTER OPERATOR (except possibly newline and null)
475       //  .
476 
477       else if ((unit.ch == '.') && !unit.bk) {
478   addToken(currentToken);
479   currentToken = new RETokenAny(m_subIndex,syntax.get(RESyntax.RE_DOT_NEWLINE) || ((cflags & REG_DOT_NEWLINE) > 0),syntax.get(RESyntax.RE_DOT_NOT_NULL));
480       }
481 
482       // ZERO-OR-MORE REPEAT OPERATOR
483       //  *
484 
485       else if ((unit.ch == '*') && !unit.bk) {
486   if ((currentToken == null) || (currentToken.getMinimumLength() == 0))
487     throw new REException("repeated argument may be empty",REException.REG_BADRPT,index);
488   currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
489       }
490 
491       // ONE-OR-MORE REPEAT OPERATOR
492       //  + | \+ depending on RE_BK_PLUS_QM
493       //  not available if RE_LIMITED_OPS is set
494 
495       else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
496   if ((currentToken == null) || (currentToken.getMinimumLength() == 0))
497     throw new REException("repeated argument may be empty",REException.REG_BADRPT,index);
498   currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
499       }
500 
501       // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
502       //  ? | \? depending on RE_BK_PLUS_QM
503       //  not available if RE_LIMITED_OPS is set
504       //  stingy matching if RE_STINGY_OPS is set and it follows a quantifier
505 
506       else if ((unit.ch == '?') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ unit.bk)) {
507   if (currentToken == null) throw new REException("? without preceding token",REException.REG_BADRPT,index);
508 
509   // Check for stingy matching on RETokenRepeated
510   if ((currentToken instanceof RETokenRepeated) && (syntax.get(RESyntax.RE_STINGY_OPS)))
511     ((RETokenRepeated) currentToken).makeStingy();
512   else
513     currentToken = setRepeated(currentToken,0,1,index);
514       }
515   
516       // BACKREFERENCE OPERATOR
517       //  \1 \2 \3 \4 ...
518       // not available if RE_NO_BK_REFS is set
519 
520       else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
521   addToken(currentToken);
522   currentToken = new RETokenBackRef(m_subIndex,Character.digit(unit.ch,10),insens);
523       }
524 
525       // START OF STRING OPERATOR
526       //  \A if RE_STRING_ANCHORS is set
527       
528       else if (unit.bk && (unit.ch == 'A') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
529   addToken(currentToken);
530   currentToken = new RETokenStart(m_subIndex,false);
531       }
532       
533       // DIGIT OPERATOR
534       //  \d if RE_CHAR_CLASS_ESCAPES is set
535       
536       else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
537   addToken(currentToken);
538   currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.DIGIT,insens,false);
539       }
540 
541       // NON-DIGIT OPERATOR
542       //  \D
543 
544   else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
545     addToken(currentToken);
546     currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.DIGIT,insens,true);
547   }
548 
549   // NEWLINE ESCAPE
550         //  \n
551 
552   else if (unit.bk && (unit.ch == 'n')) {
553     addToken(currentToken);
554     currentToken = new RETokenChar(m_subIndex,'\n',false);
555   }
556 
557   // RETURN ESCAPE
558         //  \r
559 
560   else if (unit.bk && (unit.ch == 'r')) {
561     addToken(currentToken);
562     currentToken = new RETokenChar(m_subIndex,'\r',false);
563   }
564 
565   // WHITESPACE OPERATOR
566         //  \s if RE_CHAR_CLASS_ESCAPES is set
567 
568   else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
569     addToken(currentToken);
570     currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.SPACE,insens,false);
571   }
572 
573   // NON-WHITESPACE OPERATOR
574         //  \S
575 
576   else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
577     addToken(currentToken);
578     currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.SPACE,insens,true);
579   }
580 
581   // TAB ESCAPE
582         //  \t
583 
584   else if (unit.bk && (unit.ch == 't')) {
585     addToken(currentToken);
586     currentToken = new RETokenChar(m_subIndex,'\t',false);
587   }
588 
589   // ALPHANUMERIC OPERATOR
590         //  \w
591 
592   else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
593     addToken(currentToken);
594     currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.ALNUM,insens,false);
595   }
596 
597   // NON-ALPHANUMERIC OPERATOR
598         //  \W
599 
600   else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) {
601     addToken(currentToken);
602     currentToken = new RETokenPOSIX(m_subIndex,RETokenPOSIX.ALNUM,insens,true);
603   }
604 
605   // END OF STRING OPERATOR
606         //  \Z
607 
608   else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) {
609     addToken(currentToken);
610     currentToken = new RETokenEnd(m_subIndex,false);
611   }
612 
613   // NON-SPECIAL CHARACTER (or escape to make literal)
614         //  c | \* for example
615 
616   else {  // not a special character
617     addToken(currentToken);
618     currentToken = new RETokenChar(m_subIndex,unit.ch,insens);
619   } 
620       } // end while
621 
622     // Add final buffered token if applicable
623     addToken(currentToken);
624       
625     if (branches != null) {
626       branches.addElement(new RE(firstToken,lastToken,m_numSubs,m_subIndex));
627       branches.trimToSize(); // compact the Vector
628       firstToken = lastToken = new RETokenOneOf(m_subIndex,branches,false);
629     }
630   }
631 
632   private static int getCharUnit(char[] input, int index, CharUnit unit) throws REException {
633     unit.ch = input[index++];
634     if (unit.bk = (unit.ch == '\\'))
635       if (index < input.length)
636   unit.ch = input[index++];
637       else throw new REException("\\ at end of pattern.",REException.REG_ESCAPE,index);
638     return index;
639   }
640 
641   /**
642    * Checks if the input in its entirety is an exact match of
643    * this regular expression.
644    *
645    * @param input The input text.
646    * @exception IllegalArgumentException The input text was not a String, char[], or InputStream.
647    */
648   public boolean isMatch(Object input) {
649     return isMatch(input,0,0);
650   }
651   
652   /**
653    * Checks if the input string, starting from index, is an exact match of
654    * this regular expression.
655    *
656    * @param input The input text.
657    * @param index The offset index at which the search should be begin.
658    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
659    */
660   public boolean isMatch(Object input,int index) {
661     return isMatch(input,index,0);
662   }
663   
664 
665   /**
666    * Checks if the input, starting from index and using the specified
667    * execution flags, is an exact match of this regular expression.
668    *
669    * @param input The input text.
670    * @param index The offset index at which the search should be begin.
671    * @param eflags The logical OR of any execution flags above.
672    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
673    */
674   public boolean isMatch(Object input,int index,int eflags) {
675     return isMatchImpl(makeRECharIndexed(input,index),index,eflags);
676   }
677 
678   private boolean isMatchImpl(RECharIndexed input, int index, int eflags) {
679     if (firstToken == null)  // Trivial case
680       return (input.charAt(0) == RECharIndexed.OUT_OF_BOUNDS);
681     int[] i = firstToken.match(input,0,eflags,new REMatch(m_numSubs,index));
682     return (i != null) && (input.charAt(i[0]) == RECharIndexed.OUT_OF_BOUNDS);
683   }
684     
685   /**
686    * Returns the maximum number of subexpressions in this regular expression.
687    * If the expression contains branches, the value returned will be the
688    * maximum subexpressions in any of the branches.
689    */
690   public int getNumSubs() {
691     return m_numSubs;
692   }
693 
694   // Overrides REToken.setUncle
695   void setUncle(REToken f_uncle) {
696     lastToken.setUncle(f_uncle);
697   }
698 
699   // Overrides REToken.chain
700   boolean chain(REToken f_next) {
701     super.chain(f_next);
702     if (lastToken != null) lastToken.setUncle(f_next);
703     return true;
704   }
705     
706   /**
707    * Returns the minimum number of characters that could possibly
708    * constitute a match of this regular expression.
709    */
710   public int getMinimumLength() {
711     int min = 0;
712     REToken t = firstToken;
713     if (t == null) return 0;
714     do {
715       min += t.getMinimumLength();
716     } while ((t = t.m_next) != null);
717     return min;
718   }
719 
720   /**
721    * Returns an array of all matches found in the input.
722    *
723    * @param input The input text.
724    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
725    */
726   public REMatch[] getAllMatches(Object input) {
727     return getAllMatches(input,0,0);
728   }
729 
730   /**
731    * Returns an array of all matches found in the input,
732    * beginning at the specified index position.
733    *
734    * @param input The input text.
735    * @param index The offset index at which the search should be begin.
736    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
737    */
738   public REMatch[] getAllMatches(Object input, int index) {
739     return getAllMatches(input,index,0);
740   }
741 
742   /**
743    * Returns an array of all matches found in the input string,
744    * beginning at the specified index position and using the specified
745    * execution flags.
746    *
747    * @param input The input text.
748    * @param index The offset index at which the search should be begin.
749    * @param eflags The logical OR of any execution flags above.
750    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
751    */
752   public REMatch[] getAllMatches(Object input, int index, int eflags) {
753     return getAllMatchesImpl(makeRECharIndexed(input,index),index,eflags);
754   }
755 
756   // this has been changed since 1.03 to be non-overlapping matches
757   private REMatch[] getAllMatchesImpl(RECharIndexed input, int index, int eflags) {
758     Vector all = new Vector();
759     REMatch m = null;
760     while ((m = getMatchImpl(input,index,eflags,null)) != null) {
761       all.addElement(m);
762       index = m.getEndIndex();
763       if (m.end[0] == 0) {   // handle pathological case of zero-length match
764   index++;
765   input.move(1);
766       } else {
767   input.move(m.end[0]);
768       }
769     }
770     REMatch[] mset = new REMatch[all.size()];
771     all.copyInto(mset);
772     return mset;
773   }
774   
775   /* Implements abstract method REToken.match() */
776   int[] match(RECharIndexed input, int index, int eflags, REMatch mymatch) { 
777     if (firstToken == null) return new int[] { index }; // Trivial case
778     /*
779     if ((mymatch.start[m_subIndex] == -1) 
780          || (mymatch.start[m_subIndex] > index))
781     */
782     int oldstart = mymatch.start[m_subIndex];
783     mymatch.start[m_subIndex] = index;
784     int[] newIndex = firstToken.match(input,index,eflags,mymatch);
785     if (newIndex == null) { 
786   mymatch.start[m_subIndex] = oldstart;
787     } else {
788       // If this match succeeded, then whole rest of string is good,
789       // and newIndex[0] is the end of the match AT THIS LEVEL
790 
791       // We need to make list of all possible nexts.
792       int[] doables = new int[0];
793       int[] thisResult;
794       for (int i = 0; i < newIndex.length; i++) {
795   thisResult = next(input,newIndex[i],eflags,mymatch);
796   if (thisResult != null) {
797     int[] temp = new int[doables.length + thisResult.length];
798     System.arraycopy(doables,0,temp,0,doables.length);
799     for (int j = 0; j < thisResult.length; j++) {
800       temp[doables.length + j] = thisResult[j];
801     }
802     doables = temp;
803   }
804       }
805       return (doables.length == 0) ? null : doables;
806     }
807     return null;
808   }
809   
810   /**
811    * Returns the first match found in the input.
812    *
813    * @param input The input text.
814    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
815    */
816   public REMatch getMatch(Object input) {
817     return getMatch(input,0,0);
818   }
819   
820   /**
821    * Returns the first match found in the input, beginning
822    * the search at the specified index.
823    *
824    * @param input The input text.
825    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
826    */
827   public REMatch getMatch(Object input, int index) {
828     return getMatch(input,index,0);
829   }
830   
831   /**
832    * Returns the first match found in the input, beginning
833    * the search at the specified index, and using the specified
834    * execution flags.  If no match is found, returns null.
835    *
836    * @param input The input text.
837    * @param index The offset index at which the search should be begin.
838    * @param eflags The logical OR of any execution flags above.
839    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
840    */
841   public REMatch getMatch(Object input, int index, int eflags) {
842     return getMatch(input,index,eflags,null);
843   }
844 
845   /**
846    * Returns the first match found in the input, beginning
847    * the search at the specified index, and using the specified
848    * execution flags.  If no match is found, returns null.  If a StringBuffer
849    * is provided and is non-null, the contents of the input text from the index to the
850    * beginning of the match (or to the end of the input, if there is no match)
851    * are appended to the StringBuffer.
852    *
853    * @param input The input text.
854    * @param index The offset index at which the search should be begin.
855    * @param eflags The logical OR of any execution flags above.
856    * @param buffer The StringBuffer to save pre-match text in.
857    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
858    */
859   public REMatch getMatch(Object input, int index, int eflags, StringBuffer buffer) {
860     return getMatchImpl(makeRECharIndexed(input,index),index,eflags,buffer);
861   }
862 
863   REMatch getMatchImpl(RECharIndexed input, int index, int eflags, StringBuffer buffer) {
864     // check if input is at a valid position
865     if (!input.isValid()) return null;
866     REMatch mymatch = new REMatch(m_numSubs,index);
867     do {
868       int[] result = match(input,0,eflags,mymatch);
869       if (result != null) {
870   mymatch.end[0] = result[0]; // may break leftmost longest
871   mymatch.finish(input);
872   return mymatch;
873       }
874       mymatch.clear(++index);
875       if (buffer != null) buffer.append(input.charAt(0));
876     } while (input.move(1));
877 
878     return null;
879   }
880 
881   /**
882    * Returns an REMatchEnumeration that can be used to iterate over the
883    * matches found in the input text.
884    *
885    * @param input The input text.
886    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
887    */
888   public REMatchEnumeration getMatchEnumeration(Object input) {
889     return getMatchEnumeration(input,0,0);
890   }
891 
892 
893   /**
894    * Returns an REMatchEnumeration that can be used to iterate over the
895    * matches found in the input text.
896    *
897    * @param input The input text.
898    * @param index The offset index at which the search should be begin.
899    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
900    */
901   public REMatchEnumeration getMatchEnumeration(Object input, int index) {
902     return getMatchEnumeration(input,index,0);
903   }
904 
905   /**
906    * Returns an REMatchEnumeration that can be used to iterate over the
907    * matches found in the input text.
908    *
909    * @param input The input text.
910    * @param index The offset index at which the search should be begin.
911    * @param eflags The logical OR of any execution flags above.
912    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
913    */
914   public REMatchEnumeration getMatchEnumeration(Object input, int index, int eflags) {
915     return new REMatchEnumeration(this,makeRECharIndexed(input,index),index,eflags);
916   }
917 
918 
919   /**
920    * Substitutes the replacement text for the first match found in the input.
921    *
922    * @param input The input text.
923    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
924    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
925    */
926   public String substitute(Object input,String replace) {
927     return substitute(input,replace,0,0);
928   }
929 
930   /**
931    * Substitutes the replacement text for the first match found in the input
932    * beginning at the specified index position.
933    *
934    * @param input The input text.
935    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
936    * @param index The offset index at which the search should be begin.
937    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
938    */
939   public String substitute(Object input,String replace,int index) {
940     return substitute(input,replace,index,0);
941   }
942 
943   /**
944    * Substitutes the replacement text for the first match found in the input
945    * string, beginning at the specified index position and using the
946    * specified execution flags.
947    *
948    * @param input The input text.
949    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
950    * @param index The offset index at which the search should be begin.
951    * @param eflags The logical OR of any execution flags above.
952    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
953    */
954   public String substitute(Object input,String replace,int index,int eflags) {
955     return substituteImpl(makeRECharIndexed(input,index),replace,index,eflags);
956   }
957 
958   private String substituteImpl(RECharIndexed input,String replace,int index,int eflags) {
959     StringBuffer buffer = new StringBuffer();
960     REMatch m = getMatchImpl(input,index,eflags,buffer);
961     if (m==null) return buffer.toString();
962     buffer.append(m.substituteInto(replace));
963     if (input.move(m.end[0])) {
964       do {
965   buffer.append(input.charAt(0));
966       } while (input.move(1));
967     }
968     return buffer.toString();
969   }
970   
971   /**
972    * Substitutes the replacement text for each non-overlapping match found 
973    * in the input text.
974    *
975    * @param input The input text.
976    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
977    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
978    */
979   public String substituteAll(Object input,String replace) {
980     return substituteAll(input,replace,0,0);
981   }
982 
983   /**
984    * Substitutes the replacement text for each non-overlapping match found 
985    * in the input text, starting at the specified index.
986    *
987    * @param input The input text.
988    * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
989    * @param index The offset index at which the search should be begin.
990    * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
991    */
992   public String substituteAll(Object input,String replace,int index) {
993     return substituteAll(input,replace,index,0);
994   }
995  
996   /**
997    * Substitutes the replacement text for each non-overlapping match found 
998    * in the input text, starting at the specified index and using the
999    * specified execution flags.
1000   *
1001   * @param input The input text.
1002   * @param replace The replacement text, which may contain $x metacharacters (see REMatch.substituteInto).
1003   * @param index The offset index at which the search should be begin.
1004   * @param eflags The logical OR of any execution flags above.
1005   * @exception IllegalArgumentException The input text was not a String, char[], StringBuffer or InputStream.
1006   */
1007  public String substituteAll(Object input,String replace,int index,int eflags) {
1008    return substituteAllImpl(makeRECharIndexed(input,index),replace,index,eflags);
1009  }
1010
1011  private String substituteAllImpl(RECharIndexed input,String replace,int index,int eflags) {
1012    StringBuffer buffer = new StringBuffer();
1013    REMatch m;
1014    while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
1015      buffer.append(m.substituteInto(replace));
1016      index = m.getEndIndex();
1017      if (m.end[0] == 0) {
1018  char ch = input.charAt(0);
1019  if (ch != RECharIndexed.OUT_OF_BOUNDS) 
1020    buffer.append(ch);
1021  input.move(1);
1022      } else {
1023  input.move(m.end[0]);
1024      }
1025    }
1026    return buffer.toString();
1027  }
1028  
1029  /* Helper function for constructor */
1030  private void addToken(REToken next) {
1031    if (next == null) return;
1032    if (firstToken == null)
1033      lastToken = firstToken = next;
1034    else
1035      // if chain returns false, it "rejected" the token due to
1036      // an optimization, and next was combined with lastToken
1037      if (lastToken.chain(next)) lastToken = next;
1038  }
1039
1040  private static REToken setRepeated(REToken current, int min, int max, int index) throws REException {
1041    if (current == null) throw new REException("repeat preceding token",REException.REG_BADRPT,index);
1042    return new RETokenRepeated(current.m_subIndex,current,min,max);
1043  }
1044
1045  private static int getPosixSet(char[] pattern,int index,StringBuffer buf) {
1046    // Precondition: pattern[index-1] == ':'
1047    // we will return pos of closing ']'.
1048    int i;
1049    for (i=index; i<(pattern.length-1); i++) {
1050      if ((pattern[i] == ':') && (pattern[i+1] == ']'))
1051  return i+2;
1052      buf.append(pattern[i]);
1053    }
1054    return index; // didn't match up
1055  }
1056
1057  private int getMinMax(char[] input,int index,IntPair minMax,RESyntax syntax) throws REException {
1058    // Precondition: input[index-1] == '{', minMax != null
1059
1060    if (index == input.length) throw new REException("no matching brace",REException.REG_EBRACE,index);
1061  
1062    int min,max=0;
1063    CharUnit unit = new CharUnit();
1064    StringBuffer buf = new StringBuffer();
1065    
1066    // Read string of digits
1067    while (((index = getCharUnit(input,index,unit)) != input.length)
1068     && Character.isDigit(unit.ch))
1069      buf.append(unit.ch);
1070
1071    // Check for {} tomfoolery
1072    if (buf.length() == 0) throw new REException("bad brace construct",REException.REG_EBRACE,index);
1073
1074    min = Integer.parseInt(buf.toString());
1075  
1076    if ((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk))
1077      max = min;
1078    else if ((unit.ch == ',') && !unit.bk) {
1079      buf = new StringBuffer();
1080      // Read string of digits
1081      while (((index = getCharUnit(input,index,unit)) != input.length)
1082       && Character.isDigit(unit.ch))
1083  buf.append(unit.ch);
1084
1085      if (!((unit.ch == '}') && (syntax.get(RESyntax.RE_NO_BK_BRACES) ^ unit.bk)))
1086  throw new REException("expected end of interval",REException.REG_EBRACE,index);
1087
1088      // This is the case of {x,}
1089      if (buf.length() == 0) max = Integer.MAX_VALUE;
1090      else max = Integer.parseInt(buf.toString());
1091    } else throw new REException("invalid character in brace expression",REException.REG_EBRACE,index);
1092
1093    // We know min and max now, and they are valid.
1094
1095    minMax.first = min;
1096    minMax.second = max;
1097
1098    // return the index following the '}'
1099    return index;
1100  }
1101
1102   /**
1103    * Return a human readable form of the compiled regular expression,
1104    * useful for debugging.
1105    */
1106   public String toString() {
1107     StringBuffer sb = new StringBuffer();
1108     dump(sb);
1109     return sb.toString();
1110   }
1111
1112  void dump(StringBuffer os) {
1113    os.append('(');
1114    if (m_subIndex == 0)
1115      os.append("?:");
1116    if (firstToken != null)
1117      firstToken.dumpAll(os);
1118    os.append(')');
1119  }
1120
1121  // Cast input appropriately or throw exception
1122  private static RECharIndexed makeRECharIndexed(Object input, int index) {
1123    if (input instanceof String)
1124      return new RECharIndexedString((String) input,index);
1125    else if (input instanceof char[])
1126      return new RECharIndexedCharArray((char[]) input,index);
1127    else if (input instanceof StringBuffer)
1128      return new RECharIndexedStringBuffer((StringBuffer) input,index);
1129    else if (input instanceof InputStream)
1130      return new RECharIndexedInputStream((InputStream) input,index);
1131    else throw new IllegalArgumentException("Invalid class for input text");
1132  }
1133}