Save This Page
Home » lucene-2.4.1-src » org.apache » regexp » [javadoc | source]
    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.regexp;
   19   
   20   import java.io.Serializable;
   21   import java.util.Vector;
   22   
   23   /**
   24    * RE is an efficient, lightweight regular expression evaluator/matcher
   25    * class. Regular expressions are pattern descriptions which enable
   26    * sophisticated matching of strings.  In addition to being able to
   27    * match a string against a pattern, you can also extract parts of the
   28    * match.  This is especially useful in text parsing! Details on the
   29    * syntax of regular expression patterns are given below.
   30    *
   31    * <p>
   32    * To compile a regular expression (RE), you can simply construct an RE
   33    * matcher object from the string specification of the pattern, like this:
   34    *
   35    * <pre>
   36    *  RE r = new RE("a*b");
   37    * </pre>
   38    *
   39    * <p>
   40    * Once you have done this, you can call either of the RE.match methods to
   41    * perform matching on a String.  For example:
   42    *
   43    * <pre>
   44    *  boolean matched = r.match("aaaab");
   45    * </pre>
   46    *
   47    * will cause the boolean matched to be set to true because the
   48    * pattern "a*b" matches the string "aaaab".
   49    *
   50    * <p>
   51    * If you were interested in the <i>number</i> of a's which matched the
   52    * first part of our example expression, you could change the expression to
   53    * "(a*)b".  Then when you compiled the expression and matched it against
   54    * something like "xaaaab", you would get results like this:
   55    *
   56    * <pre>
   57    *  RE r = new RE("(a*)b");                  // Compile expression
   58    *  boolean matched = r.match("xaaaab");     // Match against "xaaaab"
   59    *
   60    *  String wholeExpr = r.getParen(0);        // wholeExpr will be 'aaaab'
   61    *  String insideParens = r.getParen(1);     // insideParens will be 'aaaa'
   62    *
   63    *  int startWholeExpr = r.getParenStart(0); // startWholeExpr will be index 1
   64    *  int endWholeExpr = r.getParenEnd(0);     // endWholeExpr will be index 6
   65    *  int lenWholeExpr = r.getParenLength(0);  // lenWholeExpr will be 5
   66    *
   67    *  int startInside = r.getParenStart(1);    // startInside will be index 1
   68    *  int endInside = r.getParenEnd(1);        // endInside will be index 5
   69    *  int lenInside = r.getParenLength(1);     // lenInside will be 4
   70    * </pre>
   71    *
   72    * You can also refer to the contents of a parenthesized expression
   73    * within a regular expression itself.  This is called a
   74    * 'backreference'.  The first backreference in a regular expression is
   75    * denoted by \1, the second by \2 and so on.  So the expression:
   76    *
   77    * <pre>
   78    *  ([0-9]+)=\1
   79    * </pre>
   80    *
   81    * will match any string of the form n=n (like 0=0 or 2=2).
   82    *
   83    * <p>
   84    * The full regular expression syntax accepted by RE is described here:
   85    *
   86    * <pre>
   87    *
   88    *  <b><font face=times roman>Characters</font></b>
   89    *
   90    *    <i>unicodeChar</i>   Matches any identical unicode character
   91    *    \                    Used to quote a meta-character (like '*')
   92    *    \\                   Matches a single '\' character
   93    *    \0nnn                Matches a given octal character
   94    *    \xhh                 Matches a given 8-bit hexadecimal character
   95    *    \\uhhhh              Matches a given 16-bit hexadecimal character
   96    *    \t                   Matches an ASCII tab character
   97    *    \n                   Matches an ASCII newline character
   98    *    \r                   Matches an ASCII return character
   99    *    \f                   Matches an ASCII form feed character
  100    *
  101    *
  102    *  <b><font face=times roman>Character Classes</font></b>
  103    *
  104    *    [abc]                Simple character class
  105    *    [a-zA-Z]             Character class with ranges
  106    *    [^abc]               Negated character class
  107    * </pre>
  108    *
  109    * <b>NOTE:</b> Incomplete ranges will be interpreted as &quot;starts
  110    * from zero&quot; or &quot;ends with last character&quot;.
  111    * <br>
  112    * I.e. [-a] is the same as [\\u0000-a], and [a-] is the same as [a-\\uFFFF],
  113    * [-] means &quot;all characters&quot;.
  114    *
  115    * <pre>
  116    *
  117    *  <b><font face=times roman>Standard POSIX Character Classes</font></b>
  118    *
  119    *    [:alnum:]            Alphanumeric characters.
  120    *    [:alpha:]            Alphabetic characters.
  121    *    [:blank:]            Space and tab characters.
  122    *    [:cntrl:]            Control characters.
  123    *    [:digit:]            Numeric characters.
  124    *    [:graph:]            Characters that are printable and are also visible.
  125    *                         (A space is printable, but not visible, while an
  126    *                         `a' is both.)
  127    *    [:lower:]            Lower-case alphabetic characters.
  128    *    [:print:]            Printable characters (characters that are not
  129    *                         control characters.)
  130    *    [:punct:]            Punctuation characters (characters that are not letter,
  131    *                         digits, control characters, or space characters).
  132    *    [:space:]            Space characters (such as space, tab, and formfeed,
  133    *                         to name a few).
  134    *    [:upper:]            Upper-case alphabetic characters.
  135    *    [:xdigit:]           Characters that are hexadecimal digits.
  136    *
  137    *
  138    *  <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
  139    *
  140    *    [:javastart:]        Start of a Java identifier
  141    *    [:javapart:]         Part of a Java identifier
  142    *
  143    *
  144    *  <b><font face=times roman>Predefined Classes</font></b>
  145    *
  146    *    .         Matches any character other than newline
  147    *    \w        Matches a "word" character (alphanumeric plus "_")
  148    *    \W        Matches a non-word character
  149    *    \s        Matches a whitespace character
  150    *    \S        Matches a non-whitespace character
  151    *    \d        Matches a digit character
  152    *    \D        Matches a non-digit character
  153    *
  154    *
  155    *  <b><font face=times roman>Boundary Matchers</font></b>
  156    *
  157    *    ^         Matches only at the beginning of a line
  158    *    $         Matches only at the end of a line
  159    *    \b        Matches only at a word boundary
  160    *    \B        Matches only at a non-word boundary
  161    *
  162    *
  163    *  <b><font face=times roman>Greedy Closures</font></b>
  164    *
  165    *    A*        Matches A 0 or more times (greedy)
  166    *    A+        Matches A 1 or more times (greedy)
  167    *    A?        Matches A 1 or 0 times (greedy)
  168    *    A{n}      Matches A exactly n times (greedy)
  169    *    A{n,}     Matches A at least n times (greedy)
  170    *    A{n,m}    Matches A at least n but not more than m times (greedy)
  171    *
  172    *
  173    *  <b><font face=times roman>Reluctant Closures</font></b>
  174    *
  175    *    A*?       Matches A 0 or more times (reluctant)
  176    *    A+?       Matches A 1 or more times (reluctant)
  177    *    A??       Matches A 0 or 1 times (reluctant)
  178    *
  179    *
  180    *  <b><font face=times roman>Logical Operators</font></b>
  181    *
  182    *    AB        Matches A followed by B
  183    *    A|B       Matches either A or B
  184    *    (A)       Used for subexpression grouping
  185    *   (?:A)      Used for subexpression clustering (just like grouping but
  186    *              no backrefs)
  187    *
  188    *
  189    *  <b><font face=times roman>Backreferences</font></b>
  190    *
  191    *    \1    Backreference to 1st parenthesized subexpression
  192    *    \2    Backreference to 2nd parenthesized subexpression
  193    *    \3    Backreference to 3rd parenthesized subexpression
  194    *    \4    Backreference to 4th parenthesized subexpression
  195    *    \5    Backreference to 5th parenthesized subexpression
  196    *    \6    Backreference to 6th parenthesized subexpression
  197    *    \7    Backreference to 7th parenthesized subexpression
  198    *    \8    Backreference to 8th parenthesized subexpression
  199    *    \9    Backreference to 9th parenthesized subexpression
  200    * </pre>
  201    *
  202    * <p>
  203    * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning
  204    * that they match as many elements of the string as possible without
  205    * causing the overall match to fail.  If you want a closure to be
  206    * reluctant (non-greedy), you can simply follow it with a '?'.  A
  207    * reluctant closure will match as few elements of the string as
  208    * possible when finding matches.  {m,n} closures don't currently
  209    * support reluctancy.
  210    *
  211    * <p>
  212    * <b><font face="times roman">Line terminators</font></b>
  213    * <br>
  214    * A line terminator is a one- or two-character sequence that marks
  215    * the end of a line of the input character sequence. The following
  216    * are recognized as line terminators:
  217    * <ul>
  218    * <li>A newline (line feed) character ('\n'),</li>
  219    * <li>A carriage-return character followed immediately by a newline character ("\r\n"),</li>
  220    * <li>A standalone carriage-return character ('\r'),</li>
  221    * <li>A next-line character ('\u0085'),</li>
  222    * <li>A line-separator character ('\u2028'), or</li>
  223    * <li>A paragraph-separator character ('\u2029).</li>
  224    * </ul>
  225    *
  226    * <p>
  227    * RE runs programs compiled by the RECompiler class.  But the RE
  228    * matcher class does not include the actual regular expression compiler
  229    * for reasons of efficiency.  In fact, if you want to pre-compile one
  230    * or more regular expressions, the 'recompile' class can be invoked
  231    * from the command line to produce compiled output like this:
  232    *
  233    * <pre>
  234    *    // Pre-compiled regular expression "a*b"
  235    *    char[] re1Instructions =
  236    *    {
  237    *        0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
  238    *        0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
  239    *        0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
  240    *        0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
  241    *        0x0000,
  242    *    };
  243    *
  244    *
  245    *    REProgram re1 = new REProgram(re1Instructions);
  246    * </pre>
  247    *
  248    * You can then construct a regular expression matcher (RE) object from
  249    * the pre-compiled expression re1 and thus avoid the overhead of
  250    * compiling the expression at runtime. If you require more dynamic
  251    * regular expressions, you can construct a single RECompiler object and
  252    * re-use it to compile each expression. Similarly, you can change the
  253    * program run by a given matcher object at any time. However, RE and
  254    * RECompiler are not threadsafe (for efficiency reasons, and because
  255    * requiring thread safety in this class is deemed to be a rare
  256    * requirement), so you will need to construct a separate compiler or
  257    * matcher object for each thread (unless you do thread synchronization
  258    * yourself). Once expression compiled into the REProgram object, REProgram
  259    * can be safely shared across multiple threads and RE objects.
  260    *
  261    * <br><p><br>
  262    *
  263    * <font color="red">
  264    * <i>ISSUES:</i>
  265    *
  266    * <ul>
  267    *  <li>com.weusours.util.re is not currently compatible with all
  268    *      standard POSIX regcomp flags</li>
  269    *  <li>com.weusours.util.re does not support POSIX equivalence classes
  270    *      ([=foo=] syntax) (I18N/locale issue)</li>
  271    *  <li>com.weusours.util.re does not support nested POSIX character
  272    *      classes (definitely should, but not completely trivial)</li>
  273    *  <li>com.weusours.util.re Does not support POSIX character collation
  274    *      concepts ([.foo.] syntax) (I18N/locale issue)</li>
  275    *  <li>Should there be different matching styles (simple, POSIX, Perl etc?)</li>
  276    *  <li>Should RE support character iterators (for backwards RE matching!)?</li>
  277    *  <li>Should RE support reluctant {m,n} closures (does anyone care)?</li>
  278    *  <li>Not *all* possibilities are considered for greediness when backreferences
  279    *      are involved (as POSIX suggests should be the case).  The POSIX RE
  280    *      "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
  281    *      of acdacaa where \1 is "a".  This is not the case in this RE package,
  282    *      and actually Perl doesn't go to this extent either!  Until someone
  283    *      actually complains about this, I'm not sure it's worth "fixing".
  284    *      If it ever is fixed, test #137 in RETest.txt should be updated.</li>
  285    * </ul>
  286    *
  287    * </font>
  288    *
  289    * @see recompile
  290    * @see RECompiler
  291    *
  292    * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
  293    * @author <a href="mailto:ts@sch-fer.de">Tobias Sch&auml;fer</a>
  294    * @version $Id: RE.java 518156 2007-03-14 14:31:26Z vgritsenko $
  295    */
  296   public class RE implements Serializable
  297   {
  298       /**
  299        * Specifies normal, case-sensitive matching behaviour.
  300        */
  301       public static final int MATCH_NORMAL          = 0x0000;
  302   
  303       /**
  304        * Flag to indicate that matching should be case-independent (folded)
  305        */
  306       public static final int MATCH_CASEINDEPENDENT = 0x0001;
  307   
  308       /**
  309        * Newlines should match as BOL/EOL (^ and $)
  310        */
  311       public static final int MATCH_MULTILINE       = 0x0002;
  312   
  313       /**
  314        * Consider all input a single body of text - newlines are matched by .
  315        */
  316       public static final int MATCH_SINGLELINE      = 0x0004;
  317   
  318       /************************************************
  319        *                                              *
  320        * The format of a node in a program is:        *
  321        *                                              *
  322        * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
  323        *                                              *
  324        * char OPCODE - instruction                    *
  325        * char OPDATA - modifying data                 *
  326        * char OPNEXT - next node (relative offset)    *
  327        *                                              *
  328        ************************************************/
  329   
  330                    //   Opcode              Char       Opdata/Operand  Meaning
  331                    //   ----------          ---------- --------------- --------------------------------------------------
  332       static final char OP_END              = 'E';  //                 end of program
  333       static final char OP_BOL              = '^';  //                 match only if at beginning of line
  334       static final char OP_EOL              = '$';  //                 match only if at end of line
  335       static final char OP_ANY              = '.';  //                 match any single character except newline
  336       static final char OP_ANYOF            = '[';  // count/ranges    match any char in the list of ranges
  337       static final char OP_BRANCH           = '|';  // node            match this alternative or the next one
  338       static final char OP_ATOM             = 'A';  // length/string   length of string followed by string itself
  339       static final char OP_STAR             = '*';  // node            kleene closure
  340       static final char OP_PLUS             = '+';  // node            positive closure
  341       static final char OP_MAYBE            = '?';  // node            optional closure
  342       static final char OP_ESCAPE           = '\\'; // escape          special escape code char class (escape is E_* code)
  343       static final char OP_OPEN             = '(';  // number          nth opening paren
  344       static final char OP_OPEN_CLUSTER     = '<';  //                 opening cluster
  345       static final char OP_CLOSE            = ')';  // number          nth closing paren
  346       static final char OP_CLOSE_CLUSTER    = '>';  //                 closing cluster
  347       static final char OP_BACKREF          = '#';  // number          reference nth already matched parenthesized string
  348       static final char OP_GOTO             = 'G';  //                 nothing but a (back-)pointer
  349       static final char OP_NOTHING          = 'N';  //                 match null string such as in '(a|)'
  350       static final char OP_CONTINUE         = 'C';  //                 continue to the following command (ignore next)
  351       static final char OP_RELUCTANTSTAR    = '8';  // none/expr       reluctant '*' (mnemonic for char is unshifted '*')
  352       static final char OP_RELUCTANTPLUS    = '=';  // none/expr       reluctant '+' (mnemonic for char is unshifted '+')
  353       static final char OP_RELUCTANTMAYBE   = '/';  // none/expr       reluctant '?' (mnemonic for char is unshifted '?')
  354       static final char OP_POSIXCLASS       = 'P';  // classid         one of the posix character classes
  355   
  356       // Escape codes
  357       static final char E_ALNUM             = 'w';  // Alphanumeric
  358       static final char E_NALNUM            = 'W';  // Non-alphanumeric
  359       static final char E_BOUND             = 'b';  // Word boundary
  360       static final char E_NBOUND            = 'B';  // Non-word boundary
  361       static final char E_SPACE             = 's';  // Whitespace
  362       static final char E_NSPACE            = 'S';  // Non-whitespace
  363       static final char E_DIGIT             = 'd';  // Digit
  364       static final char E_NDIGIT            = 'D';  // Non-digit
  365   
  366       // Posix character classes
  367       static final char POSIX_CLASS_ALNUM   = 'w';  // Alphanumerics
  368       static final char POSIX_CLASS_ALPHA   = 'a';  // Alphabetics
  369       static final char POSIX_CLASS_BLANK   = 'b';  // Blanks
  370       static final char POSIX_CLASS_CNTRL   = 'c';  // Control characters
  371       static final char POSIX_CLASS_DIGIT   = 'd';  // Digits
  372       static final char POSIX_CLASS_GRAPH   = 'g';  // Graphic characters
  373       static final char POSIX_CLASS_LOWER   = 'l';  // Lowercase characters
  374       static final char POSIX_CLASS_PRINT   = 'p';  // Printable characters
  375       static final char POSIX_CLASS_PUNCT   = '!';  // Punctuation
  376       static final char POSIX_CLASS_SPACE   = 's';  // Spaces
  377       static final char POSIX_CLASS_UPPER   = 'u';  // Uppercase characters
  378       static final char POSIX_CLASS_XDIGIT  = 'x';  // Hexadecimal digits
  379       static final char POSIX_CLASS_JSTART  = 'j';  // Java identifier start
  380       static final char POSIX_CLASS_JPART   = 'k';  // Java identifier part
  381   
  382       // Limits
  383       static final int maxNode  = 65536;            // Maximum number of nodes in a program
  384       static final int MAX_PAREN = 16;              // Number of paren pairs (only 9 can be backrefs)
  385   
  386       // Node layout constants
  387       static final int offsetOpcode = 0;            // Opcode offset (first character)
  388       static final int offsetOpdata = 1;            // Opdata offset (second char)
  389       static final int offsetNext   = 2;            // Next index offset (third char)
  390       static final int nodeSize     = 3;            // Node size (in chars)
  391   
  392       // State of current program
  393       REProgram program;                            // Compiled regular expression 'program'
  394       transient CharacterIterator search;           // The string being matched against
  395       int matchFlags;                               // Match behaviour flags
  396       int maxParen = MAX_PAREN;
  397   
  398       // Parenthesized subexpressions
  399       transient int parenCount;                     // Number of subexpressions matched (num open parens + 1)
  400       transient int start0;                         // Cache of start[0]
  401       transient int end0;                           // Cache of start[0]
  402       transient int start1;                         // Cache of start[1]
  403       transient int end1;                           // Cache of start[1]
  404       transient int start2;                         // Cache of start[2]
  405       transient int end2;                           // Cache of start[2]
  406       transient int[] startn;                       // Lazy-alloced array of sub-expression starts
  407       transient int[] endn;                         // Lazy-alloced array of sub-expression ends
  408   
  409       // Backreferences
  410       transient int[] startBackref;                 // Lazy-alloced array of backref starts
  411       transient int[] endBackref;                   // Lazy-alloced array of backref ends
  412   
  413       /**
  414        * Constructs a regular expression matcher from a String by compiling it
  415        * using a new instance of RECompiler.  If you will be compiling many
  416        * expressions, you may prefer to use a single RECompiler object instead.
  417        *
  418        * @param pattern The regular expression pattern to compile.
  419        * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  420        * @see RECompiler
  421        * @see recompile
  422        */
  423       public RE(String pattern) throws RESyntaxException
  424       {
  425           this(pattern, MATCH_NORMAL);
  426       }
  427   
  428       /**
  429        * Constructs a regular expression matcher from a String by compiling it
  430        * using a new instance of RECompiler.  If you will be compiling many
  431        * expressions, you may prefer to use a single RECompiler object instead.
  432        *
  433        * @param pattern The regular expression pattern to compile.
  434        * @param matchFlags The matching style
  435        * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  436        * @see RECompiler
  437        * @see recompile
  438        */
  439       public RE(String pattern, int matchFlags) throws RESyntaxException
  440       {
  441           this(new RECompiler().compile(pattern), matchFlags);
  442       }
  443   
  444       /**
  445        * Construct a matcher for a pre-compiled regular expression from program
  446        * (bytecode) data.  Permits special flags to be passed in to modify matching
  447        * behaviour.
  448        *
  449        * @param program Compiled regular expression program (see RECompiler and/or recompile)
  450        * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
  451        *
  452        * <pre>
  453        *   MATCH_NORMAL              // Normal (case-sensitive) matching
  454        *   MATCH_CASEINDEPENDENT     // Case folded comparisons
  455        *   MATCH_MULTILINE           // Newline matches as BOL/EOL
  456        * </pre>
  457        *
  458        * @see RECompiler
  459        * @see REProgram
  460        * @see recompile
  461        */
  462       public RE(REProgram program, int matchFlags)
  463       {
  464           setProgram(program);
  465           setMatchFlags(matchFlags);
  466       }
  467   
  468       /**
  469        * Construct a matcher for a pre-compiled regular expression from program
  470        * (bytecode) data.
  471        *
  472        * @param program Compiled regular expression program
  473        * @see RECompiler
  474        * @see recompile
  475        */
  476       public RE(REProgram program)
  477       {
  478           this(program, MATCH_NORMAL);
  479       }
  480   
  481       /**
  482        * Constructs a regular expression matcher with no initial program.
  483        * This is likely to be an uncommon practice, but is still supported.
  484        */
  485       public RE()
  486       {
  487           this((REProgram) null, MATCH_NORMAL);
  488       }
  489   
  490       /**
  491        * Converts a 'simplified' regular expression to a full regular expression
  492        *
  493        * @param pattern The pattern to convert
  494        * @return The full regular expression
  495        */
  496       public static String simplePatternToFullRegularExpression(String pattern)
  497       {
  498           StringBuffer buf = new StringBuffer();
  499           for (int i = 0; i < pattern.length(); i++)
  500           {
  501               char c = pattern.charAt(i);
  502               switch (c)
  503               {
  504                   case '*':
  505                       buf.append(".*");
  506                       break;
  507   
  508                   case '.':
  509                   case '[':
  510                   case ']':
  511                   case '\\':
  512                   case '+':
  513                   case '?':
  514                   case '{':
  515                   case '}':
  516                   case '$':
  517                   case '^':
  518                   case '|':
  519                   case '(':
  520                   case ')':
  521                       buf.append('\\');
  522                   default:
  523                       buf.append(c);
  524                       break;
  525               }
  526           }
  527           return buf.toString();
  528       }
  529   
  530       /**
  531        * Sets match behaviour flags which alter the way RE does matching.
  532        * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
  533        *
  534        * <pre>
  535        *   MATCH_NORMAL              // Normal (case-sensitive) matching
  536        *   MATCH_CASEINDEPENDENT     // Case folded comparisons
  537        *   MATCH_MULTILINE           // Newline matches as BOL/EOL
  538        * </pre>
  539        */
  540       public void setMatchFlags(int matchFlags)
  541       {
  542           this.matchFlags = matchFlags;
  543       }
  544   
  545       /**
  546        * Returns the current match behaviour flags.
  547        * @return Current match behaviour flags (RE.MATCH_*).
  548        *
  549        * <pre>
  550        *   MATCH_NORMAL              // Normal (case-sensitive) matching
  551        *   MATCH_CASEINDEPENDENT     // Case folded comparisons
  552        *   MATCH_MULTILINE           // Newline matches as BOL/EOL
  553        * </pre>
  554        *
  555        * @see #setMatchFlags
  556        */
  557       public int getMatchFlags()
  558       {
  559           return matchFlags;
  560       }
  561   
  562       /**
  563        * Sets the current regular expression program used by this matcher object.
  564        *
  565        * @param program Regular expression program compiled by RECompiler.
  566        * @see RECompiler
  567        * @see REProgram
  568        * @see recompile
  569        */
  570       public void setProgram(REProgram program)
  571       {
  572           this.program = program;
  573           if (program != null && program.maxParens != -1) {
  574               this.maxParen = program.maxParens;
  575           } else {
  576               this.maxParen = MAX_PAREN;
  577           }
  578       }
  579   
  580       /**
  581        * Returns the current regular expression program in use by this matcher object.
  582        *
  583        * @return Regular expression program
  584        * @see #setProgram
  585        */
  586       public REProgram getProgram()
  587       {
  588           return program;
  589       }
  590   
  591       /**
  592        * Returns the number of parenthesized subexpressions available after a successful match.
  593        *
  594        * @return Number of available parenthesized subexpressions
  595        */
  596       public int getParenCount()
  597       {
  598           return parenCount;
  599       }
  600   
  601       /**
  602        * Gets the contents of a parenthesized subexpression after a successful match.
  603        *
  604        * @param which Nesting level of subexpression
  605        * @return String
  606        */
  607       public String getParen(int which)
  608       {
  609           int start;
  610           if (which < parenCount && (start = getParenStart(which)) >= 0)
  611           {
  612               return search.substring(start, getParenEnd(which));
  613           }
  614           return null;
  615       }
  616   
  617       /**
  618        * Returns the start index of a given paren level.
  619        *
  620        * @param which Nesting level of subexpression
  621        * @return String index
  622        */
  623       public final int getParenStart(int which)
  624       {
  625           if (which < parenCount)
  626           {
  627               switch (which)
  628               {
  629                   case 0:
  630                       return start0;
  631   
  632                   case 1:
  633                       return start1;
  634   
  635                   case 2:
  636                       return start2;
  637   
  638                   default:
  639                       if (startn == null)
  640                       {
  641                           allocParens();
  642                       }
  643                       return startn[which];
  644               }
  645           }
  646           return -1;
  647       }
  648   
  649       /**
  650        * Returns the end index of a given paren level.
  651        *
  652        * @param which Nesting level of subexpression
  653        * @return String index
  654        */
  655       public final int getParenEnd(int which)
  656       {
  657           if (which < parenCount)
  658           {
  659               switch (which)
  660               {
  661                   case 0:
  662                       return end0;
  663   
  664                   case 1:
  665                       return end1;
  666   
  667                   case 2:
  668                       return end2;
  669   
  670                   default:
  671                       if (endn == null)
  672                       {
  673                           allocParens();
  674                       }
  675                       return endn[which];
  676               }
  677           }
  678           return -1;
  679       }
  680   
  681       /**
  682        * Returns the length of a given paren level.
  683        *
  684        * @param which Nesting level of subexpression
  685        * @return Number of characters in the parenthesized subexpression
  686        */
  687       public final int getParenLength(int which)
  688       {
  689           if (which < parenCount)
  690           {
  691               return getParenEnd(which) - getParenStart(which);
  692           }
  693           return -1;
  694       }
  695   
  696       /**
  697        * Sets the start of a paren level
  698        *
  699        * @param which Which paren level
  700        * @param i Index in input array
  701        */
  702       protected final void setParenStart(int which, int i)
  703       {
  704           if (which < parenCount)
  705           {
  706               switch (which)
  707               {
  708                   case 0:
  709                       start0 = i;
  710                       break;
  711   
  712                   case 1:
  713                       start1 = i;
  714                       break;
  715   
  716                   case 2:
  717                       start2 = i;
  718                       break;
  719   
  720                   default:
  721                       if (startn == null)
  722                       {
  723                           allocParens();
  724                       }
  725                       startn[which] = i;
  726                       break;
  727               }
  728           }
  729       }
  730   
  731       /**
  732        * Sets the end of a paren level
  733        *
  734        * @param which Which paren level
  735        * @param i Index in input array
  736        */
  737       protected final void setParenEnd(int which, int i)
  738       {
  739           if (which < parenCount)
  740           {
  741               switch (which)
  742               {
  743                   case 0:
  744                       end0 = i;
  745                       break;
  746   
  747                   case 1:
  748                       end1 = i;
  749                       break;
  750   
  751                   case 2:
  752                       end2 = i;
  753                       break;
  754   
  755                   default:
  756                       if (endn == null)
  757                       {
  758                           allocParens();
  759                       }
  760                       endn[which] = i;
  761                       break;
  762               }
  763           }
  764       }
  765   
  766       /**
  767        * Throws an Error representing an internal error condition probably resulting
  768        * from a bug in the regular expression compiler (or possibly data corruption).
  769        * In practice, this should be very rare.
  770        *
  771        * @param s Error description
  772        */
  773       protected void internalError(String s) throws Error
  774       {
  775           throw new Error("RE internal error: " + s);
  776       }
  777   
  778       /**
  779        * Performs lazy allocation of subexpression arrays
  780        */
  781       private void allocParens()
  782       {
  783           // Allocate arrays for subexpressions
  784           startn = new int[maxParen];
  785           endn = new int[maxParen];
  786   
  787           // Set sub-expression pointers to invalid values
  788           for (int i = 0; i < maxParen; i++)
  789           {
  790               startn[i] = -1;
  791               endn[i] = -1;
  792           }
  793       }
  794   
  795       /**
  796        * Try to match a string against a subset of nodes in the program
  797        *
  798        * @param firstNode Node to start at in program
  799        * @param lastNode  Last valid node (used for matching a subexpression without
  800        *                  matching the rest of the program as well).
  801        * @param idxStart  Starting position in character array
  802        * @return Final input array index if match succeeded.  -1 if not.
  803        */
  804       protected int matchNodes(int firstNode, int lastNode, int idxStart)
  805       {
  806           // Our current place in the string
  807           int idx = idxStart;
  808   
  809           // Loop while node is valid
  810           int next, opcode, opdata;
  811           int idxNew;
  812           char[] instruction = program.instruction;
  813           for (int node = firstNode; node < lastNode; )
  814           {
  815               opcode = instruction[node /* + offsetOpcode */];
  816               next   = node + (short) instruction[node + offsetNext];
  817               opdata = instruction[node + offsetOpdata];
  818   
  819               switch (opcode)
  820               {
  821                   case OP_MAYBE:
  822                   case OP_STAR:
  823                       {
  824                           // Try to match the following subexpr. If it matches:
  825                           //   MAYBE:  Continues matching rest of the expression
  826                           //    STAR:  Points back here to repeat subexpr matching
  827                           if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
  828                           {
  829                               return idxNew;
  830                           }
  831   
  832                           // If failed, just continue with the rest of expression
  833                           break;
  834                       }
  835   
  836                   case OP_PLUS:
  837                       {
  838                           // Try to match the subexpr again (and again (and ...
  839                           if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
  840                           {
  841                               return idxNew;
  842                           }
  843   
  844                           // If failed, just continue with the rest of expression
  845                           // Rest is located at the next pointer of the next instruction
  846                           // (which must be OP_CONTINUE)
  847                           node = next + (short) instruction[next + offsetNext];
  848                           continue;
  849                       }
  850   
  851                   case OP_RELUCTANTMAYBE:
  852                   case OP_RELUCTANTSTAR:
  853                       {
  854                           // Try to match the rest without using the reluctant subexpr
  855                           if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
  856                           {
  857                               return idxNew;
  858                           }
  859   
  860                           // Try reluctant subexpr. If it matches:
  861                           //   RELUCTANTMAYBE: Continues matching rest of the expression
  862                           //    RELUCTANTSTAR: Points back here to repeat reluctant star matching
  863                           return matchNodes(node + nodeSize, next, idx);
  864                       }
  865   
  866                   case OP_RELUCTANTPLUS:
  867                       {
  868                           // Continue matching the rest without using the reluctant subexpr
  869                           if ((idxNew = matchNodes(next + (short) instruction[next + offsetNext], maxNode, idx)) != -1)
  870                           {
  871                               return idxNew;
  872                           }
  873   
  874                           // Try to match subexpression again
  875                           break;
  876                       }
  877   
  878                   case OP_OPEN:
  879   
  880                       // Match subexpression
  881                       if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
  882                       {
  883                           startBackref[opdata] = idx;
  884                       }
  885                       if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
  886                       {
  887                           // Increase valid paren count
  888                           if (opdata >= parenCount)
  889                           {
  890                               parenCount = opdata + 1;
  891                           }
  892   
  893                           // Don't set paren if already set later on
  894                           if (getParenStart(opdata) == -1)
  895                           {
  896                               setParenStart(opdata, idx);
  897                           }
  898                       }
  899                       return idxNew;
  900   
  901                   case OP_CLOSE:
  902   
  903                       // Done matching subexpression
  904                       if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
  905                       {
  906                           endBackref[opdata] = idx;
  907                       }
  908                       if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
  909                       {
  910                           // Increase valid paren count
  911                           if (opdata  >= parenCount)
  912                           {
  913                               parenCount = opdata + 1;
  914                           }
  915   
  916                           // Don't set paren if already set later on
  917                           if (getParenEnd(opdata) == -1)
  918                           {
  919                               setParenEnd(opdata, idx);
  920                           }
  921                       }
  922                       return idxNew;
  923   
  924                   case OP_BACKREF:
  925                       {
  926                           // Get the start and end of the backref
  927                           int s = startBackref[opdata];
  928                           int e = endBackref[opdata];
  929   
  930                           // We don't know the backref yet
  931                           if (s == -1 || e == -1)
  932                           {
  933                               return -1;
  934                           }
  935   
  936                           // The backref is empty size
  937                           if (s == e)
  938                           {
  939                               break;
  940                           }
  941   
  942                           // Get the length of the backref
  943                           int l = e - s;
  944   
  945                           // If there's not enough input left, give up.
  946                           if (search.isEnd(idx + l - 1))
  947                           {
  948                               return -1;
  949                           }
  950   
  951                           // Case fold the backref?
  952                           final boolean caseFold =
  953                               ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
  954                           // Compare backref to input
  955                           for (int i = 0; i < l; i++)
  956                           {
  957                               if (compareChars(search.charAt(idx++), search.charAt(s + i), caseFold) != 0)
  958                               {
  959                                   return -1;
  960                               }
  961                           }
  962                       }
  963                       break;
  964   
  965                   case OP_BOL:
  966   
  967                       // Fail if we're not at the start of the string
  968                       if (idx != 0)
  969                       {
  970                           // If we're multiline matching, we could still be at the start of a line
  971                           if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
  972                           {
  973                               // Continue if at the start of a line
  974                               if (isNewline(idx - 1))
  975                               {
  976                                   break;
  977                               }
  978                           }
  979                           return -1;
  980                       }
  981                       break;
  982   
  983                   case OP_EOL:
  984   
  985                       // If we're not at the end of string
  986                       if (!search.isEnd(0) && !search.isEnd(idx))
  987                       {
  988                           // If we're multi-line matching
  989                           if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
  990                           {
  991                               // Continue if we're at the end of a line
  992                               if (isNewline(idx))
  993                               {
  994                                   break;
  995                               }
  996                           }
  997                           return -1;
  998                       }
  999                       break;
 1000   
 1001                   case OP_ESCAPE:
 1002   
 1003                       // Which escape?
 1004                       switch (opdata)
 1005                       {
 1006                           // Word boundary match
 1007                           case E_NBOUND:
 1008                           case E_BOUND:
 1009                               {
 1010                                   char cLast = ((idx == 0) ? '\n' : search.charAt(idx - 1));
 1011                                   char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
 1012                                   if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
 1013                                   {
 1014                                       return -1;
 1015                                   }
 1016                               }
 1017                               break;
 1018   
 1019                           // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
 1020                           case E_ALNUM:
 1021                           case E_NALNUM:
 1022                           case E_DIGIT:
 1023                           case E_NDIGIT:
 1024                           case E_SPACE:
 1025                           case E_NSPACE:
 1026   
 1027                               // Give up if out of input
 1028                               if (search.isEnd(idx))
 1029                               {
 1030                                   return -1;
 1031                               }
 1032   
 1033                               char c = search.charAt(idx);
 1034   
 1035                               // Switch on escape
 1036                               switch (opdata)
 1037                               {
 1038                                   case E_ALNUM:
 1039                                   case E_NALNUM:
 1040                                       if (!((Character.isLetterOrDigit(c) || c == '_') == (opdata == E_ALNUM)))
 1041                                       {
 1042                                           return -1;
 1043                                       }
 1044                                       break;
 1045   
 1046                                   case E_DIGIT:
 1047                                   case E_NDIGIT:
 1048                                       if (!(Character.isDigit(c) == (opdata == E_DIGIT)))
 1049                                       {
 1050                                           return -1;
 1051                                       }
 1052                                       break;
 1053   
 1054                                   case E_SPACE:
 1055                                   case E_NSPACE:
 1056                                       if (!(Character.isWhitespace(c) == (opdata == E_SPACE)))
 1057                                       {
 1058                                           return -1;
 1059                                       }
 1060                                       break;
 1061                               }
 1062                               idx++;
 1063                               break;
 1064   
 1065                           default:
 1066                               internalError("Unrecognized escape '" + opdata + "'");
 1067                       }
 1068                       break;
 1069   
 1070                   case OP_ANY:
 1071   
 1072                       if ((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
 1073                           // Match anything
 1074                           if (search.isEnd(idx))
 1075                           {
 1076                               return -1;
 1077                           }
 1078                       }
 1079                       else
 1080                       {
 1081                           // Match anything but a newline
 1082                           if (search.isEnd(idx) || isNewline(idx))
 1083                           {
 1084                               return -1;
 1085                           }
 1086                       }
 1087                       idx++;
 1088                       break;
 1089   
 1090                   case OP_ATOM:
 1091                       {
 1092                           // Match an atom value
 1093                           if (search.isEnd(idx))
 1094                           {
 1095                               return -1;
 1096                           }
 1097   
 1098                           // Get length of atom and starting index
 1099                           // int lenAtom = opdata;
 1100                           int startAtom = node + nodeSize;
 1101   
 1102                           // Give up if not enough input remains to have a match
 1103                           if (search.isEnd(opdata + idx - 1))
 1104                           {
 1105                               return -1;
 1106                           }
 1107   
 1108                           // Match atom differently depending on casefolding flag
 1109                           final boolean caseFold =
 1110                               ((matchFlags & MATCH_CASEINDEPENDENT) != 0);
 1111   
 1112                           for (int i = 0; i < opdata; i++)
 1113                           {
 1114                               if (compareChars(search.charAt(idx++), instruction[startAtom + i], caseFold) != 0)
 1115                               {
 1116                                   return -1;
 1117                               }
 1118                           }
 1119                       }
 1120                       break;
 1121   
 1122                   case OP_POSIXCLASS:
 1123                       {
 1124                           // Out of input?
 1125                           if (search.isEnd(idx))
 1126                           {
 1127                               return -1;
 1128                           }
 1129   
 1130                           switch (opdata)
 1131                           {
 1132                               case POSIX_CLASS_ALNUM:
 1133                                   if (!Character.isLetterOrDigit(search.charAt(idx)))
 1134                                   {
 1135                                       return -1;
 1136                                   }
 1137                                   break;
 1138   
 1139                               case POSIX_CLASS_ALPHA:
 1140                                   if (!Character.isLetter(search.charAt(idx)))
 1141                                   {
 1142                                       return -1;
 1143                                   }
 1144                                   break;
 1145   
 1146                               case POSIX_CLASS_DIGIT:
 1147                                   if (!Character.isDigit(search.charAt(idx)))
 1148                                   {
 1149                                       return -1;
 1150                                   }
 1151                                   break;
 1152   
 1153                               case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
 1154                                   if (!Character.isSpaceChar(search.charAt(idx)))
 1155                                   {
 1156                                       return -1;
 1157                                   }
 1158                                   break;
 1159   
 1160                               case POSIX_CLASS_SPACE:
 1161                                   if (!Character.isWhitespace(search.charAt(idx)))
 1162                                   {
 1163                                       return -1;
 1164                                   }
 1165                                   break;
 1166   
 1167                               case POSIX_CLASS_CNTRL:
 1168                                   if (Character.getType(search.charAt(idx)) != Character.CONTROL)
 1169                                   {
 1170                                       return -1;
 1171                                   }
 1172                                   break;
 1173   
 1174                               case POSIX_CLASS_GRAPH: // JWL - bugbug???
 1175                                   switch (Character.getType(search.charAt(idx)))
 1176                                   {
 1177                                       case Character.MATH_SYMBOL:
 1178                                       case Character.CURRENCY_SYMBOL:
 1179                                       case Character.MODIFIER_SYMBOL:
 1180                                       case Character.OTHER_SYMBOL:
 1181                                           break;
 1182   
 1183                                       default:
 1184                                           return -1;
 1185                                   }
 1186                                   break;
 1187   
 1188                               case POSIX_CLASS_LOWER:
 1189                                   if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
 1190                                   {
 1191                                       return -1;
 1192                                   }
 1193                                   break;
 1194   
 1195                               case POSIX_CLASS_UPPER:
 1196                                   if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
 1197                                   {
 1198                                       return -1;
 1199                                   }
 1200                                   break;
 1201   
 1202                               case POSIX_CLASS_PRINT:
 1203                                   if (Character.getType(search.charAt(idx)) == Character.CONTROL)
 1204                                   {
 1205                                       return -1;
 1206                                   }
 1207                                   break;
 1208   
 1209                               case POSIX_CLASS_PUNCT:
 1210                               {
 1211                                   int type = Character.getType(search.charAt(idx));
 1212                                   switch(type)
 1213                                   {
 1214                                       case Character.DASH_PUNCTUATION:
 1215                                       case Character.START_PUNCTUATION:
 1216                                       case Character.END_PUNCTUATION:
 1217                                       case Character.CONNECTOR_PUNCTUATION:
 1218                                       case Character.OTHER_PUNCTUATION:
 1219                                           break;
 1220   
 1221                                       default:
 1222                                           return -1;
 1223                                   }
 1224                               }
 1225                               break;
 1226   
 1227                               case POSIX_CLASS_XDIGIT: // JWL - bugbug??
 1228                               {
 1229                                   boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
 1230                                                       (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
 1231                                                       (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
 1232                                   if (!isXDigit)
 1233                                   {
 1234                                       return -1;
 1235                                   }
 1236                               }
 1237                               break;
 1238   
 1239                               case POSIX_CLASS_JSTART:
 1240                                   if (!Character.isJavaIdentifierStart(search.charAt(idx)))
 1241                                   {
 1242                                       return -1;
 1243                                   }
 1244                                   break;
 1245   
 1246                               case POSIX_CLASS_JPART:
 1247                                   if (!Character.isJavaIdentifierPart(search.charAt(idx)))
 1248                                   {
 1249                                       return -1;
 1250                                   }
 1251                                   break;
 1252   
 1253                               default:
 1254                                   internalError("Bad posix class");
 1255                                   break;
 1256                           }
 1257   
 1258                           // Matched.
 1259                           idx++;
 1260                       }
 1261                       break;
 1262   
 1263                   case OP_ANYOF:
 1264                       {
 1265                           // Out of input?
 1266                           if (search.isEnd(idx))
 1267                           {
 1268                               return -1;
 1269                           }
 1270   
 1271                           // Get character to match against character class and maybe casefold
 1272                           char c = search.charAt(idx);
 1273                           boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
 1274                           // Loop through character class checking our match character
 1275                           int idxRange = node + nodeSize;
 1276                           int idxEnd = idxRange + (opdata * 2);
 1277                           boolean match = false;
 1278                           for (int i = idxRange; !match && i < idxEnd; )
 1279                           {
 1280                               // Get start, end and match characters
 1281                               char s = instruction[i++];
 1282                               char e = instruction[i++];
 1283   
 1284                               match = ((compareChars(c, s, caseFold) >= 0)
 1285                                        && (compareChars(c, e, caseFold) <= 0));
 1286                           }
 1287   
 1288                           // Fail if we didn't match the character class
 1289                           if (!match)
 1290                           {
 1291                               return -1;
 1292                           }
 1293                           idx++;
 1294                       }
 1295                       break;
 1296   
 1297                   case OP_BRANCH:
 1298                   {
 1299                       // Check for choices
 1300                       // FIXME Dead code - only reason to keep is backward compat with pre-compiled exprs. Remove?
 1301                       if (instruction[next /* + offsetOpcode */] != OP_BRANCH)
 1302                       {
 1303                           // If there aren't any other choices, just evaluate this branch.
 1304                           node += nodeSize;
 1305                           continue;
 1306                       }
 1307   
 1308                       // Try all available branches
 1309                       int nextBranch;
 1310                       do
 1311                       {
 1312                           // Try matching the branch against the string
 1313                           if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
 1314                           {
 1315                               return idxNew;
 1316                           }
 1317   
 1318                           // Go to next branch (if any)
 1319                           nextBranch = (short) instruction[node + offsetNext];
 1320                           node += nextBranch;
 1321                       }
 1322                       while (nextBranch != 0 && (instruction[node /* + offsetOpcode */] == OP_BRANCH));
 1323   
 1324                       // Failed to match any branch!
 1325                       return -1;
 1326                   }
 1327   
 1328                   case OP_OPEN_CLUSTER:
 1329                   case OP_CLOSE_CLUSTER:
 1330                       // starting or ending the matching of a subexpression which has no backref.
 1331   
 1332                   case OP_NOTHING:
 1333                   case OP_GOTO:
 1334   
 1335                       // Just advance to the next node without doing anything
 1336                       break;
 1337   
 1338                   case OP_CONTINUE:
 1339   
 1340                       // Advance to the following node
 1341                       node += nodeSize;
 1342                       continue;
 1343   
 1344                   case OP_END:
 1345   
 1346                       // Match has succeeded!
 1347                       setParenEnd(0, idx);
 1348                       return idx;
 1349   
 1350                   default:
 1351   
 1352                       // Corrupt program
 1353                       internalError("Invalid opcode '" + opcode + "'");
 1354               }
 1355   
 1356               // Advance to the next node in the program
 1357               node = next;
 1358           }
 1359   
 1360           // We "should" never end up here
 1361           internalError("Corrupt program");
 1362           return -1;
 1363       }
 1364   
 1365       /**
 1366        * Match the current regular expression program against the current
 1367        * input string, starting at index i of the input string.  This method
 1368        * is only meant for internal use.
 1369        *
 1370        * @param i The input string index to start matching at
 1371        * @return True if the input matched the expression
 1372        */
 1373       protected boolean matchAt(int i)
 1374       {
 1375           // Initialize start pointer, paren cache and paren count
 1376           start0 = -1;
 1377           end0   = -1;
 1378           start1 = -1;
 1379           end1   = -1;
 1380           start2 = -1;
 1381           end2   = -1;
 1382           startn = null;
 1383           endn   = null;
 1384           parenCount = 1;
 1385           setParenStart(0, i);
 1386   
 1387           // Allocate backref arrays (unless optimizations indicate otherwise)
 1388           if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
 1389           {
 1390               startBackref = new int[maxParen];
 1391               endBackref = new int[maxParen];
 1392           }
 1393   
 1394           // Match against string
 1395           int idx;
 1396           if ((idx = matchNodes(0, maxNode, i)) != -1)
 1397           {
 1398               setParenEnd(0, idx);
 1399               return true;
 1400           }
 1401   
 1402           // Didn't match
 1403           parenCount = 0;
 1404           return false;
 1405       }
 1406   
 1407       /**
 1408        * Matches the current regular expression program against a character array,
 1409        * starting at a given index.
 1410        *
 1411        * @param search String to match against
 1412        * @param i Index to start searching at
 1413        * @return True if string matched
 1414        */
 1415       public boolean match(String search, int i)
 1416       {
 1417           return match(new StringCharacterIterator(search), i);
 1418       }
 1419   
 1420       /**
 1421        * Matches the current regular expression program against a character array,
 1422        * starting at a given index.
 1423        *
 1424        * @param search String to match against
 1425        * @param i Index to start searching at
 1426        * @return True if string matched
 1427        */
 1428       public boolean match(CharacterIterator search, int i)
 1429       {
 1430           // There is no compiled program to search with!
 1431           if (program == null)
 1432           {
 1433               // This should be uncommon enough to be an error case rather
 1434               // than an exception (which would have to be handled everywhere)
 1435               internalError("No RE program to run!");
 1436           }
 1437   
 1438           // Save string to search
 1439           this.search = search;
 1440   
 1441           // Can we optimize the search by looking for new lines?
 1442           if ((program.flags & REProgram.OPT_HASBOL) == REProgram.OPT_HASBOL)
 1443           {
 1444               // Non multi-line matching with BOL: Must match at '0' index
 1445               if ((matchFlags & MATCH_MULTILINE) == 0)
 1446               {
 1447                   return i == 0 && matchAt(i);
 1448               }
 1449   
 1450               // Multi-line matching with BOL: Seek to next line
 1451               for ( ;! search.isEnd(i); i++)
 1452               {
 1453                   // Skip if we are at the beginning of the line
 1454                   if (isNewline(i))
 1455                   {
 1456                       continue;
 1457                   }
 1458   
 1459                   // Match at the beginning of the line
 1460                   if (matchAt(i))
 1461                   {
 1462                       return true;
 1463                   }
 1464   
 1465                   // Skip to the end of line
 1466                   for ( ;! search.isEnd(i); i++)
 1467                   {
 1468                       if (isNewline(i))
 1469                       {
 1470                           break;
 1471                       }
 1472                   }
 1473               }
 1474   
 1475               return false;
 1476           }
 1477   
 1478           // Can we optimize the search by looking for a prefix string?
 1479           if (program.prefix == null)
 1480           {
 1481               // Unprefixed matching must try for a match at each character
 1482               for ( ;! search.isEnd(i - 1); i++)
 1483               {
 1484                   // Try a match at index i
 1485                   if (matchAt(i))
 1486                   {
 1487                       return true;
 1488                   }
 1489               }
 1490               return false;
 1491           }
 1492           else
 1493           {
 1494               // Prefix-anchored matching is possible
 1495               boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
 1496               char[] prefix = program.prefix;
 1497               for ( ; !search.isEnd(i + prefix.length - 1); i++)
 1498               {
 1499                   int j = i;
 1500                   int k = 0;
 1501   
 1502                   boolean match;
 1503                   do {
 1504                       // If there's a mismatch of any character in the prefix, give up
 1505                       match = (compareChars(search.charAt(j++), prefix[k++], caseIndependent) == 0);
 1506                   } while (match && k < prefix.length);
 1507   
 1508                   // See if the whole prefix string matched
 1509                   if (k == prefix.length)
 1510                   {
 1511                       // We matched the full prefix at firstChar, so try it
 1512                       if (matchAt(i))
 1513                       {
 1514                           return true;
 1515                       }
 1516                   }
 1517               }
 1518               return false;
 1519           }
 1520       }
 1521   
 1522       /**
 1523        * Matches the current regular expression program against a String.
 1524        *
 1525        * @param search String to match against
 1526        * @return True if string matched
 1527        */
 1528       public boolean match(String search)
 1529       {
 1530           return match(search, 0);
 1531       }
 1532   
 1533       /**
 1534        * Splits a string into an array of strings on regular expression boundaries.
 1535        * This function works the same way as the Perl function of the same name.
 1536        * Given a regular expression of "[ab]+" and a string to split of
 1537        * "xyzzyababbayyzabbbab123", the result would be the array of Strings
 1538        * "[xyzzy, yyz, 123]".
 1539        *
 1540        * <p>Please note that the first string in the resulting array may be an empty
 1541        * string. This happens when the very first character of input string is
 1542        * matched by the pattern.
 1543        *
 1544        * @param s String to split on this regular exression
 1545        * @return Array of strings
 1546        */
 1547       public String[] split(String s)
 1548       {
 1549           // Create new vector
 1550           Vector v = new Vector();
 1551   
 1552           // Start at position 0 and search the whole string
 1553           int pos = 0;
 1554           int len = s.length();
 1555   
 1556           // Try a match at each position
 1557           while (pos < len && match(s, pos))
 1558           {
 1559               // Get start of match
 1560               int start = getParenStart(0);
 1561   
 1562               // Get end of match
 1563               int newpos = getParenEnd(0);
 1564   
 1565               // Check if no progress was made
 1566               if (newpos == pos)
 1567               {
 1568                   v.addElement(s.substring(pos, start + 1));
 1569                   newpos++;
 1570               }
 1571               else
 1572               {
 1573                   v.addElement(s.substring(pos, start));
 1574               }
 1575   
 1576               // Move to new position
 1577               pos = newpos;
 1578           }
 1579   
 1580           // Push remainder if it's not empty
 1581           String remainder = s.substring(pos);
 1582           if (remainder.length() != 0)
 1583           {
 1584               v.addElement(remainder);
 1585           }
 1586   
 1587           // Return vector as an array of strings
 1588           String[] ret = new String[v.size()];
 1589           v.copyInto(ret);
 1590           return ret;
 1591       }
 1592   
 1593       /**
 1594        * Flag bit that indicates that subst should replace all occurrences of this
 1595        * regular expression.
 1596        */
 1597       public static final int REPLACE_ALL            = 0x0000;
 1598   
 1599       /**
 1600        * Flag bit that indicates that subst should only replace the first occurrence
 1601        * of this regular expression.
 1602        */
 1603       public static final int REPLACE_FIRSTONLY      = 0x0001;
 1604   
 1605       /**
 1606        * Flag bit that indicates that subst should replace backreferences
 1607        */
 1608       public static final int REPLACE_BACKREFERENCES = 0x0002;
 1609   
 1610       /**
 1611        * Substitutes a string for this regular expression in another string.
 1612        * This method works like the Perl function of the same name.
 1613        * Given a regular expression of "a*b", a String to substituteIn of
 1614        * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
 1615        * resulting String returned by subst would be "-foo-garply-wacky-".
 1616        *
 1617        * @param substituteIn String to substitute within
 1618        * @param substitution String to substitute for all matches of this regular expression.
 1619        * @return The string substituteIn with zero or more occurrences of the current
 1620        * regular expression replaced with the substitution String (if this regular
 1621        * expression object doesn't match at any position, the original String is returned
 1622        * unchanged).
 1623        */
 1624       public String subst(String substituteIn, String substitution)
 1625       {
 1626           return subst(substituteIn, substitution, REPLACE_ALL);
 1627       }
 1628   
 1629       /**
 1630        * Substitutes a string for this regular expression in another string.
 1631        * This method works like the Perl function of the same name.
 1632        * Given a regular expression of "a*b", a String to substituteIn of
 1633        * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
 1634        * resulting String returned by subst would be "-foo-garply-wacky-".
 1635        * <p>
 1636        * It is also possible to reference the contents of a parenthesized expression
 1637        * with $0, $1, ... $9. A regular expression of "http://[\\.\\w\\-\\?/~_@&=%]+",
 1638        * a String to substituteIn of "visit us: http://www.apache.org!" and the
 1639        * substitution String "&lt;a href=\"$0\"&gt;$0&lt;/a&gt;", the resulting String
 1640        * returned by subst would be
 1641        * "visit us: &lt;a href=\"http://www.apache.org\"&gt;http://www.apache.org&lt;/a&gt;!".
 1642        * <p>
 1643        * <i>Note:</i> $0 represents the whole match.
 1644        *
 1645        * @param substituteIn String to substitute within
 1646        * @param substitution String to substitute for matches of this regular expression
 1647        * @param flags One or more bitwise flags from REPLACE_*.  If the REPLACE_FIRSTONLY
 1648        * flag bit is set, only the first occurrence of this regular expression is replaced.
 1649        * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
 1650        * replaced. If the flag REPLACE_BACKREFERENCES is set, all backreferences will
 1651        * be processed.
 1652        * @return The string substituteIn with zero or more occurrences of the current
 1653        * regular expression replaced with the substitution String (if this regular
 1654        * expression object doesn't match at any position, the original String is returned
 1655        * unchanged).
 1656        */
 1657       public String subst(String substituteIn, String substitution, int flags)
 1658       {
 1659           // String to return
 1660           StringBuffer ret = new StringBuffer();
 1661   
 1662           // Start at position 0 and search the whole string
 1663           int pos = 0;
 1664           int len = substituteIn.length();
 1665   
 1666           // Try a match at each position
 1667           while (pos < len && match(substituteIn, pos))
 1668           {
 1669               // Append string before match
 1670               ret.append(substituteIn.substring(pos, getParenStart(0)));
 1671   
 1672               if ((flags & REPLACE_BACKREFERENCES) != 0)
 1673               {
 1674                   // Process backreferences
 1675                   int lCurrentPosition = 0;
 1676                   int lLastPosition = -2;
 1677                   int lLength = substitution.length();
 1678   
 1679                   while ((lCurrentPosition = substitution.indexOf("$", lCurrentPosition)) >= 0)
 1680                   {
 1681                       if ((lCurrentPosition == 0 || substitution.charAt(lCurrentPosition - 1) != '\\')
 1682                           && lCurrentPosition + 1 < lLength)
 1683                       {
 1684                           char c = substitution.charAt(lCurrentPosition + 1);
 1685                           if (c >= '0' && c <= '9')
 1686                           {
 1687                               // Append everything between the last and the current $ sign
 1688                               ret.append(substitution.substring(lLastPosition + 2, lCurrentPosition));
 1689   
 1690                               // Append the parenthesized expression, if present
 1691                               String val = getParen(c - '0');
 1692                               if (val != null) {
 1693                                   ret.append(val);
 1694                               }
 1695                               lLastPosition = lCurrentPosition;
 1696                           }
 1697                       }
 1698   
 1699                       // Move forward, skipping past match
 1700                       lCurrentPosition++;
 1701                   }
 1702   
 1703                   // Append everything after the last $ sign
 1704                   ret.append(substitution.substring(lLastPosition + 2, lLength));
 1705               }
 1706               else
 1707               {
 1708                   // Append substitution without processing backreferences
 1709                   ret.append(substitution);
 1710               }
 1711   
 1712               // Move forward, skipping past match
 1713               int newpos = getParenEnd(0);
 1714   
 1715               // We always want to make progress!
 1716               if (newpos == pos)
 1717               {
 1718                   newpos++;
 1719               }
 1720   
 1721               // Try new position
 1722               pos = newpos;
 1723   
 1724               // Break out if we're only supposed to replace one occurrence
 1725               if ((flags & REPLACE_FIRSTONLY) != 0)
 1726               {
 1727                   break;
 1728               }
 1729           }
 1730   
 1731           // If there's remaining input, append it
 1732           if (pos < len)
 1733           {
 1734               ret.append(substituteIn.substring(pos));
 1735           }
 1736   
 1737           // Return string buffer as string
 1738           return ret.toString();
 1739       }
 1740   
 1741       /**
 1742        * Returns an array of Strings, whose toString representation matches a regular
 1743        * expression. This method works like the Perl function of the same name.  Given
 1744        * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
 1745        * aaaab], the array of Strings returned by grep would be [aab, aaaab].
 1746        *
 1747        * @param search Array of Objects to search
 1748        * @return Array of Strings whose toString() value matches this regular expression.
 1749        */
 1750       public String[] grep(Object[] search)
 1751       {
 1752           // Create new vector to hold return items
 1753           Vector v = new Vector();
 1754   
 1755           // Traverse array of objects
 1756           for (int i = 0; i < search.length; i++)
 1757           {
 1758               // Get next object as a string
 1759               String s = search[i].toString();
 1760   
 1761               // If it matches this regexp, add it to the list
 1762               if (match(s))
 1763               {
 1764                   v.addElement(s);
 1765               }
 1766           }
 1767   
 1768           // Return vector as an array of strings
 1769           String[] ret = new String[v.size()];
 1770           v.copyInto(ret);
 1771           return ret;
 1772       }
 1773   
 1774       /**
 1775        * @return true if character at i-th position in the <code>search</code> string is a newline
 1776        */
 1777       private boolean isNewline(int i)
 1778       {
 1779           char nextChar = search.charAt(i);
 1780   
 1781           return nextChar == '\n' || nextChar == '\r' || nextChar == '\u0085' ||
 1782                  nextChar == '\u2028' || nextChar == '\u2029';
 1783       }
 1784   
 1785       /**
 1786        * Compares two characters.
 1787        *
 1788        * @param c1 first character to compare.
 1789        * @param c2 second character to compare.
 1790        * @param caseIndependent whether comparision is case insensitive or not.
 1791        * @return negative, 0, or positive integer as the first character
 1792        *         less than, equal to, or greater then the second.
 1793        */
 1794       private int compareChars(char c1, char c2, boolean caseIndependent)
 1795       {
 1796           if (caseIndependent)
 1797           {
 1798               c1 = Character.toLowerCase(c1);
 1799               c2 = Character.toLowerCase(c2);
 1800           }
 1801           return ((int)c1 - (int)c2);
 1802       }
 1803   }

Save This Page
Home » lucene-2.4.1-src » org.apache » regexp » [javadoc | source]