Save This Page
Home » openjdk-7 » java » util » regex » [javadoc | source]
    1   /*
    2    * Copyright 1999-2006 Sun Microsystems, Inc.  All Rights Reserved.
    3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    4    *
    5    * This code is free software; you can redistribute it and/or modify it
    6    * under the terms of the GNU General Public License version 2 only, as
    7    * published by the Free Software Foundation.  Sun designates this
    8    * particular file as subject to the "Classpath" exception as provided
    9    * by Sun in the LICENSE file that accompanied this code.
   10    *
   11    * This code is distributed in the hope that it will be useful, but WITHOUT
   12    * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   13    * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   14    * version 2 for more details (a copy is included in the LICENSE file that
   15    * accompanied this code).
   16    *
   17    * You should have received a copy of the GNU General Public License version
   18    * 2 along with this work; if not, write to the Free Software Foundation,
   19    * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   20    *
   21    * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
   22    * CA 95054 USA or visit www.sun.com if you need additional information or
   23    * have any questions.
   24    */
   25   
   26   package java.util.regex;
   27   
   28   
   29   /**
   30    * An engine that performs match operations on a {@link java.lang.CharSequence
   31    * </code>character sequence<code>} by interpreting a {@link Pattern}.
   32    *
   33    * <p> A matcher is created from a pattern by invoking the pattern's {@link
   34    * Pattern#matcher matcher} method.  Once created, a matcher can be used to
   35    * perform three different kinds of match operations:
   36    *
   37    * <ul>
   38    *
   39    *   <li><p> The {@link #matches matches} method attempts to match the entire
   40    *   input sequence against the pattern.  </p></li>
   41    *
   42    *   <li><p> The {@link #lookingAt lookingAt} method attempts to match the
   43    *   input sequence, starting at the beginning, against the pattern.  </p></li>
   44    *
   45    *   <li><p> The {@link #find find} method scans the input sequence looking for
   46    *   the next subsequence that matches the pattern.  </p></li>
   47    *
   48    * </ul>
   49    *
   50    * <p> Each of these methods returns a boolean indicating success or failure.
   51    * More information about a successful match can be obtained by querying the
   52    * state of the matcher.
   53    *
   54    * <p> A matcher finds matches in a subset of its input called the
   55    * <i>region</i>. By default, the region contains all of the matcher's input.
   56    * The region can be modified via the {@link #region region} method and queried
   57    * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
   58    * methods. The way that the region boundaries interact with some pattern
   59    * constructs can be changed. See {@link #useAnchoringBounds
   60    * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
   61    * for more details.
   62    *
   63    * <p> This class also defines methods for replacing matched subsequences with
   64    * new strings whose contents can, if desired, be computed from the match
   65    * result.  The {@link #appendReplacement appendReplacement} and {@link
   66    * #appendTail appendTail} methods can be used in tandem in order to collect
   67    * the result into an existing string buffer, or the more convenient {@link
   68    * #replaceAll replaceAll} method can be used to create a string in which every
   69    * matching subsequence in the input sequence is replaced.
   70    *
   71    * <p> The explicit state of a matcher includes the start and end indices of
   72    * the most recent successful match.  It also includes the start and end
   73    * indices of the input subsequence captured by each <a
   74    * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
   75    * count of such subsequences.  As a convenience, methods are also provided for
   76    * returning these captured subsequences in string form.
   77    *
   78    * <p> The explicit state of a matcher is initially undefined; attempting to
   79    * query any part of it before a successful match will cause an {@link
   80    * IllegalStateException} to be thrown.  The explicit state of a matcher is
   81    * recomputed by every match operation.
   82    *
   83    * <p> The implicit state of a matcher includes the input character sequence as
   84    * well as the <i>append position</i>, which is initially zero and is updated
   85    * by the {@link #appendReplacement appendReplacement} method.
   86    *
   87    * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
   88    * method or, if a new input sequence is desired, its {@link
   89    * #reset(java.lang.CharSequence) reset(CharSequence)} method.  Resetting a
   90    * matcher discards its explicit state information and sets the append position
   91    * to zero.
   92    *
   93    * <p> Instances of this class are not safe for use by multiple concurrent
   94    * threads. </p>
   95    *
   96    *
   97    * @author      Mike McCloskey
   98    * @author      Mark Reinhold
   99    * @author      JSR-51 Expert Group
  100    * @since       1.4
  101    * @spec        JSR-51
  102    */
  103   
  104   public final class Matcher implements MatchResult {
  105   
  106       /**
  107        * The Pattern object that created this Matcher.
  108        */
  109       Pattern parentPattern;
  110   
  111       /**
  112        * The storage used by groups. They may contain invalid values if
  113        * a group was skipped during the matching.
  114        */
  115       int[] groups;
  116   
  117       /**
  118        * The range within the sequence that is to be matched. Anchors
  119        * will match at these "hard" boundaries. Changing the region
  120        * changes these values.
  121        */
  122       int from, to;
  123   
  124       /**
  125        * Lookbehind uses this value to ensure that the subexpression
  126        * match ends at the point where the lookbehind was encountered.
  127        */
  128       int lookbehindTo;
  129   
  130       /**
  131        * The original string being matched.
  132        */
  133       CharSequence text;
  134   
  135       /**
  136        * Matcher state used by the last node. NOANCHOR is used when a
  137        * match does not have to consume all of the input. ENDANCHOR is
  138        * the mode used for matching all the input.
  139        */
  140       static final int ENDANCHOR = 1;
  141       static final int NOANCHOR = 0;
  142       int acceptMode = NOANCHOR;
  143   
  144       /**
  145        * The range of string that last matched the pattern. If the last
  146        * match failed then first is -1; last initially holds 0 then it
  147        * holds the index of the end of the last match (which is where the
  148        * next search starts).
  149        */
  150       int first = -1, last = 0;
  151   
  152       /**
  153        * The end index of what matched in the last match operation.
  154        */
  155       int oldLast = -1;
  156   
  157       /**
  158        * The index of the last position appended in a substitution.
  159        */
  160       int lastAppendPosition = 0;
  161   
  162       /**
  163        * Storage used by nodes to tell what repetition they are on in
  164        * a pattern, and where groups begin. The nodes themselves are stateless,
  165        * so they rely on this field to hold state during a match.
  166        */
  167       int[] locals;
  168   
  169       /**
  170        * Boolean indicating whether or not more input could change
  171        * the results of the last match.
  172        *
  173        * If hitEnd is true, and a match was found, then more input
  174        * might cause a different match to be found.
  175        * If hitEnd is true and a match was not found, then more
  176        * input could cause a match to be found.
  177        * If hitEnd is false and a match was found, then more input
  178        * will not change the match.
  179        * If hitEnd is false and a match was not found, then more
  180        * input will not cause a match to be found.
  181        */
  182       boolean hitEnd;
  183   
  184       /**
  185        * Boolean indicating whether or not more input could change
  186        * a positive match into a negative one.
  187        *
  188        * If requireEnd is true, and a match was found, then more
  189        * input could cause the match to be lost.
  190        * If requireEnd is false and a match was found, then more
  191        * input might change the match but the match won't be lost.
  192        * If a match was not found, then requireEnd has no meaning.
  193        */
  194       boolean requireEnd;
  195   
  196       /**
  197        * If transparentBounds is true then the boundaries of this
  198        * matcher's region are transparent to lookahead, lookbehind,
  199        * and boundary matching constructs that try to see beyond them.
  200        */
  201       boolean transparentBounds = false;
  202   
  203       /**
  204        * If anchoringBounds is true then the boundaries of this
  205        * matcher's region match anchors such as ^ and $.
  206        */
  207       boolean anchoringBounds = true;
  208   
  209       /**
  210        * No default constructor.
  211        */
  212       Matcher() {
  213       }
  214   
  215       /**
  216        * All matchers have the state used by Pattern during a match.
  217        */
  218       Matcher(Pattern parent, CharSequence text) {
  219           this.parentPattern = parent;
  220           this.text = text;
  221   
  222           // Allocate state storage
  223           int parentGroupCount = Math.max(parent.capturingGroupCount, 10);
  224           groups = new int[parentGroupCount * 2];
  225           locals = new int[parent.localCount];
  226   
  227           // Put fields into initial states
  228           reset();
  229       }
  230   
  231       /**
  232        * Returns the pattern that is interpreted by this matcher.
  233        *
  234        * @return  The pattern for which this matcher was created
  235        */
  236       public Pattern pattern() {
  237           return parentPattern;
  238       }
  239   
  240       /**
  241        * Returns the match state of this matcher as a {@link MatchResult}.
  242        * The result is unaffected by subsequent operations performed upon this
  243        * matcher.
  244        *
  245        * @return  a <code>MatchResult</code> with the state of this matcher
  246        * @since 1.5
  247        */
  248       public MatchResult toMatchResult() {
  249           Matcher result = new Matcher(this.parentPattern, text.toString());
  250           result.first = this.first;
  251           result.last = this.last;
  252           result.groups = this.groups.clone();
  253           return result;
  254       }
  255   
  256       /**
  257         * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to
  258         * find matches with.
  259         *
  260         * <p> This method causes this matcher to lose information
  261         * about the groups of the last match that occurred. The
  262         * matcher's position in the input is maintained and its
  263         * last append position is unaffected.</p>
  264         *
  265         * @param  newPattern
  266         *         The new pattern used by this matcher
  267         * @return  This matcher
  268         * @throws  IllegalArgumentException
  269         *          If newPattern is <tt>null</tt>
  270         * @since 1.5
  271         */
  272       public Matcher usePattern(Pattern newPattern) {
  273           if (newPattern == null)
  274               throw new IllegalArgumentException("Pattern cannot be null");
  275           parentPattern = newPattern;
  276   
  277           // Reallocate state storage
  278           int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10);
  279           groups = new int[parentGroupCount * 2];
  280           locals = new int[newPattern.localCount];
  281           for (int i = 0; i < groups.length; i++)
  282               groups[i] = -1;
  283           for (int i = 0; i < locals.length; i++)
  284               locals[i] = -1;
  285           return this;
  286       }
  287   
  288       /**
  289        * Resets this matcher.
  290        *
  291        * <p> Resetting a matcher discards all of its explicit state information
  292        * and sets its append position to zero. The matcher's region is set to the
  293        * default region, which is its entire character sequence. The anchoring
  294        * and transparency of this matcher's region boundaries are unaffected.
  295        *
  296        * @return  This matcher
  297        */
  298       public Matcher reset() {
  299           first = -1;
  300           last = 0;
  301           oldLast = -1;
  302           for(int i=0; i<groups.length; i++)
  303               groups[i] = -1;
  304           for(int i=0; i<locals.length; i++)
  305               locals[i] = -1;
  306           lastAppendPosition = 0;
  307           from = 0;
  308           to = getTextLength();
  309           return this;
  310       }
  311   
  312       /**
  313        * Resets this matcher with a new input sequence.
  314        *
  315        * <p> Resetting a matcher discards all of its explicit state information
  316        * and sets its append position to zero.  The matcher's region is set to
  317        * the default region, which is its entire character sequence.  The
  318        * anchoring and transparency of this matcher's region boundaries are
  319        * unaffected.
  320        *
  321        * @param  input
  322        *         The new input character sequence
  323        *
  324        * @return  This matcher
  325        */
  326       public Matcher reset(CharSequence input) {
  327           text = input;
  328           return reset();
  329       }
  330   
  331       /**
  332        * Returns the start index of the previous match.  </p>
  333        *
  334        * @return  The index of the first character matched
  335        *
  336        * @throws  IllegalStateException
  337        *          If no match has yet been attempted,
  338        *          or if the previous match operation failed
  339        */
  340       public int start() {
  341           if (first < 0)
  342               throw new IllegalStateException("No match available");
  343           return first;
  344       }
  345   
  346       /**
  347        * Returns the start index of the subsequence captured by the given group
  348        * during the previous match operation.
  349        *
  350        * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
  351        * to right, starting at one.  Group zero denotes the entire pattern, so
  352        * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
  353        * <i>m.</i><tt>start()</tt>.  </p>
  354        *
  355        * @param  group
  356        *         The index of a capturing group in this matcher's pattern
  357        *
  358        * @return  The index of the first character captured by the group,
  359        *          or <tt>-1</tt> if the match was successful but the group
  360        *          itself did not match anything
  361        *
  362        * @throws  IllegalStateException
  363        *          If no match has yet been attempted,
  364        *          or if the previous match operation failed
  365        *
  366        * @throws  IndexOutOfBoundsException
  367        *          If there is no capturing group in the pattern
  368        *          with the given index
  369        */
  370       public int start(int group) {
  371           if (first < 0)
  372               throw new IllegalStateException("No match available");
  373           if (group > groupCount())
  374               throw new IndexOutOfBoundsException("No group " + group);
  375           return groups[group * 2];
  376       }
  377   
  378       /**
  379        * Returns the offset after the last character matched.  </p>
  380        *
  381        * @return  The offset after the last character matched
  382        *
  383        * @throws  IllegalStateException
  384        *          If no match has yet been attempted,
  385        *          or if the previous match operation failed
  386        */
  387       public int end() {
  388           if (first < 0)
  389               throw new IllegalStateException("No match available");
  390           return last;
  391       }
  392   
  393       /**
  394        * Returns the offset after the last character of the subsequence
  395        * captured by the given group during the previous match operation.
  396        *
  397        * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
  398        * to right, starting at one.  Group zero denotes the entire pattern, so
  399        * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
  400        * <i>m.</i><tt>end()</tt>.  </p>
  401        *
  402        * @param  group
  403        *         The index of a capturing group in this matcher's pattern
  404        *
  405        * @return  The offset after the last character captured by the group,
  406        *          or <tt>-1</tt> if the match was successful
  407        *          but the group itself did not match anything
  408        *
  409        * @throws  IllegalStateException
  410        *          If no match has yet been attempted,
  411        *          or if the previous match operation failed
  412        *
  413        * @throws  IndexOutOfBoundsException
  414        *          If there is no capturing group in the pattern
  415        *          with the given index
  416        */
  417       public int end(int group) {
  418           if (first < 0)
  419               throw new IllegalStateException("No match available");
  420           if (group > groupCount())
  421               throw new IndexOutOfBoundsException("No group " + group);
  422           return groups[group * 2 + 1];
  423       }
  424   
  425       /**
  426        * Returns the input subsequence matched by the previous match.
  427        *
  428        * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
  429        * the expressions <i>m.</i><tt>group()</tt> and
  430        * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt>&nbsp;<i>m.</i><tt>end())</tt>
  431        * are equivalent.  </p>
  432        *
  433        * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
  434        * string.  This method will return the empty string when the pattern
  435        * successfully matches the empty string in the input.  </p>
  436        *
  437        * @return The (possibly empty) subsequence matched by the previous match,
  438        *         in string form
  439        *
  440        * @throws  IllegalStateException
  441        *          If no match has yet been attempted,
  442        *          or if the previous match operation failed
  443        */
  444       public String group() {
  445           return group(0);
  446       }
  447   
  448       /**
  449        * Returns the input subsequence captured by the given group during the
  450        * previous match operation.
  451        *
  452        * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
  453        * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
  454        * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt>&nbsp;<i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
  455        * are equivalent.  </p>
  456        *
  457        * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
  458        * to right, starting at one.  Group zero denotes the entire pattern, so
  459        * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
  460        * </p>
  461        *
  462        * <p> If the match was successful but the group specified failed to match
  463        * any part of the input sequence, then <tt>null</tt> is returned. Note
  464        * that some groups, for example <tt>(a*)</tt>, match the empty string.
  465        * This method will return the empty string when such a group successfully
  466        * matches the empty string in the input.  </p>
  467        *
  468        * @param  group
  469        *         The index of a capturing group in this matcher's pattern
  470        *
  471        * @return  The (possibly empty) subsequence captured by the group
  472        *          during the previous match, or <tt>null</tt> if the group
  473        *          failed to match part of the input
  474        *
  475        * @throws  IllegalStateException
  476        *          If no match has yet been attempted,
  477        *          or if the previous match operation failed
  478        *
  479        * @throws  IndexOutOfBoundsException
  480        *          If there is no capturing group in the pattern
  481        *          with the given index
  482        */
  483       public String group(int group) {
  484           if (first < 0)
  485               throw new IllegalStateException("No match found");
  486           if (group < 0 || group > groupCount())
  487               throw new IndexOutOfBoundsException("No group " + group);
  488           if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
  489               return null;
  490           return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
  491       }
  492   
  493       /**
  494        * Returns the number of capturing groups in this matcher's pattern.
  495        *
  496        * <p> Group zero denotes the entire pattern by convention. It is not
  497        * included in this count.
  498        *
  499        * <p> Any non-negative integer smaller than or equal to the value
  500        * returned by this method is guaranteed to be a valid group index for
  501        * this matcher.  </p>
  502        *
  503        * @return The number of capturing groups in this matcher's pattern
  504        */
  505       public int groupCount() {
  506           return parentPattern.capturingGroupCount - 1;
  507       }
  508   
  509       /**
  510        * Attempts to match the entire region against the pattern.
  511        *
  512        * <p> If the match succeeds then more information can be obtained via the
  513        * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
  514        *
  515        * @return  <tt>true</tt> if, and only if, the entire region sequence
  516        *          matches this matcher's pattern
  517        */
  518       public boolean matches() {
  519           return match(from, ENDANCHOR);
  520       }
  521   
  522       /**
  523        * Attempts to find the next subsequence of the input sequence that matches
  524        * the pattern.
  525        *
  526        * <p> This method starts at the beginning of this matcher's region, or, if
  527        * a previous invocation of the method was successful and the matcher has
  528        * not since been reset, at the first character not matched by the previous
  529        * match.
  530        *
  531        * <p> If the match succeeds then more information can be obtained via the
  532        * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
  533        *
  534        * @return  <tt>true</tt> if, and only if, a subsequence of the input
  535        *          sequence matches this matcher's pattern
  536        */
  537       public boolean find() {
  538           int nextSearchIndex = last;
  539           if (nextSearchIndex == first)
  540               nextSearchIndex++;
  541   
  542           // If next search starts before region, start it at region
  543           if (nextSearchIndex < from)
  544               nextSearchIndex = from;
  545   
  546           // If next search starts beyond region then it fails
  547           if (nextSearchIndex > to) {
  548               for (int i = 0; i < groups.length; i++)
  549                   groups[i] = -1;
  550               return false;
  551           }
  552           return search(nextSearchIndex);
  553       }
  554   
  555       /**
  556        * Resets this matcher and then attempts to find the next subsequence of
  557        * the input sequence that matches the pattern, starting at the specified
  558        * index.
  559        *
  560        * <p> If the match succeeds then more information can be obtained via the
  561        * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent
  562        * invocations of the {@link #find()} method will start at the first
  563        * character not matched by this match.  </p>
  564        *
  565        * @throws  IndexOutOfBoundsException
  566        *          If start is less than zero or if start is greater than the
  567        *          length of the input sequence.
  568        *
  569        * @return  <tt>true</tt> if, and only if, a subsequence of the input
  570        *          sequence starting at the given index matches this matcher's
  571        *          pattern
  572        */
  573       public boolean find(int start) {
  574           int limit = getTextLength();
  575           if ((start < 0) || (start > limit))
  576               throw new IndexOutOfBoundsException("Illegal start index");
  577           reset();
  578           return search(start);
  579       }
  580   
  581       /**
  582        * Attempts to match the input sequence, starting at the beginning of the
  583        * region, against the pattern.
  584        *
  585        * <p> Like the {@link #matches matches} method, this method always starts
  586        * at the beginning of the region; unlike that method, it does not
  587        * require that the entire region be matched.
  588        *
  589        * <p> If the match succeeds then more information can be obtained via the
  590        * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods.  </p>
  591        *
  592        * @return  <tt>true</tt> if, and only if, a prefix of the input
  593        *          sequence matches this matcher's pattern
  594        */
  595       public boolean lookingAt() {
  596           return match(from, NOANCHOR);
  597       }
  598   
  599       /**
  600        * Returns a literal replacement <code>String</code> for the specified
  601        * <code>String</code>.
  602        *
  603        * This method produces a <code>String</code> that will work
  604        * as a literal replacement <code>s</code> in the
  605        * <code>appendReplacement</code> method of the {@link Matcher} class.
  606        * The <code>String</code> produced will match the sequence of characters
  607        * in <code>s</code> treated as a literal sequence. Slashes ('\') and
  608        * dollar signs ('$') will be given no special meaning.
  609        *
  610        * @param  s The string to be literalized
  611        * @return  A literal string replacement
  612        * @since 1.5
  613        */
  614       public static String quoteReplacement(String s) {
  615           if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
  616               return s;
  617           StringBuilder sb = new StringBuilder();
  618           for (int i=0; i<s.length(); i++) {
  619               char c = s.charAt(i);
  620               if (c == '\\' || c == '$') {
  621                   sb.append('\\');
  622               }
  623               sb.append(c);
  624           }
  625           return sb.toString();
  626       }
  627   
  628       /**
  629        * Implements a non-terminal append-and-replace step.
  630        *
  631        * <p> This method performs the following actions: </p>
  632        *
  633        * <ol>
  634        *
  635        *   <li><p> It reads characters from the input sequence, starting at the
  636        *   append position, and appends them to the given string buffer.  It
  637        *   stops after reading the last character preceding the previous match,
  638        *   that is, the character at index {@link
  639        *   #start()}&nbsp;<tt>-</tt>&nbsp;<tt>1</tt>.  </p></li>
  640        *
  641        *   <li><p> It appends the given replacement string to the string buffer.
  642        *   </p></li>
  643        *
  644        *   <li><p> It sets the append position of this matcher to the index of
  645        *   the last character matched, plus one, that is, to {@link #end()}.
  646        *   </p></li>
  647        *
  648        * </ol>
  649        *
  650        * <p> The replacement string may contain references to subsequences
  651        * captured during the previous match: Each occurrence of
  652        * <tt>$</tt><i>g</i><tt></tt> will be replaced by the result of
  653        * evaluating {@link #group(int) group}<tt>(</tt><i>g</i><tt>)</tt>.
  654        * The first number after the <tt>$</tt> is always treated as part of
  655        * the group reference. Subsequent numbers are incorporated into g if
  656        * they would form a legal group reference. Only the numerals '0'
  657        * through '9' are considered as potential components of the group
  658        * reference. If the second group matched the string <tt>"foo"</tt>, for
  659        * example, then passing the replacement string <tt>"$2bar"</tt> would
  660        * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
  661        * sign (<tt>$</tt>) may be included as a literal in the replacement
  662        * string by preceding it with a backslash (<tt>\$</tt>).
  663        *
  664        * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
  665        * the replacement string may cause the results to be different than if it
  666        * were being treated as a literal replacement string. Dollar signs may be
  667        * treated as references to captured subsequences as described above, and
  668        * backslashes are used to escape literal characters in the replacement
  669        * string.
  670        *
  671        * <p> This method is intended to be used in a loop together with the
  672        * {@link #appendTail appendTail} and {@link #find find} methods.  The
  673        * following code, for example, writes <tt>one dog two dogs in the
  674        * yard</tt> to the standard-output stream: </p>
  675        *
  676        * <blockquote><pre>
  677        * Pattern p = Pattern.compile("cat");
  678        * Matcher m = p.matcher("one cat two cats in the yard");
  679        * StringBuffer sb = new StringBuffer();
  680        * while (m.find()) {
  681        *     m.appendReplacement(sb, "dog");
  682        * }
  683        * m.appendTail(sb);
  684        * System.out.println(sb.toString());</pre></blockquote>
  685        *
  686        * @param  sb
  687        *         The target string buffer
  688        *
  689        * @param  replacement
  690        *         The replacement string
  691        *
  692        * @return  This matcher
  693        *
  694        * @throws  IllegalStateException
  695        *          If no match has yet been attempted,
  696        *          or if the previous match operation failed
  697        *
  698        * @throws  IndexOutOfBoundsException
  699        *          If the replacement string refers to a capturing group
  700        *          that does not exist in the pattern
  701        */
  702       public Matcher appendReplacement(StringBuffer sb, String replacement) {
  703   
  704           // If no match, return error
  705           if (first < 0)
  706               throw new IllegalStateException("No match available");
  707   
  708           // Process substitution string to replace group references with groups
  709           int cursor = 0;
  710           StringBuilder result = new StringBuilder();
  711   
  712           while (cursor < replacement.length()) {
  713               char nextChar = replacement.charAt(cursor);
  714               if (nextChar == '\\') {
  715                   cursor++;
  716                   nextChar = replacement.charAt(cursor);
  717                   result.append(nextChar);
  718                   cursor++;
  719               } else if (nextChar == '$') {
  720                   // Skip past $
  721                   cursor++;
  722                   // The first number is always a group
  723                   int refNum = (int)replacement.charAt(cursor) - '0';
  724                   if ((refNum < 0)||(refNum > 9))
  725                       throw new IllegalArgumentException(
  726                           "Illegal group reference");
  727                   cursor++;
  728   
  729                   // Capture the largest legal group string
  730                   boolean done = false;
  731                   while (!done) {
  732                       if (cursor >= replacement.length()) {
  733                           break;
  734                       }
  735                       int nextDigit = replacement.charAt(cursor) - '0';
  736                       if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
  737                           break;
  738                       }
  739                       int newRefNum = (refNum * 10) + nextDigit;
  740                       if (groupCount() < newRefNum) {
  741                           done = true;
  742                       } else {
  743                           refNum = newRefNum;
  744                           cursor++;
  745                       }
  746                   }
  747                   // Append group
  748                   if (start(refNum) != -1 && end(refNum) != -1)
  749                       result.append(text, start(refNum), end(refNum));
  750               } else {
  751                   result.append(nextChar);
  752                   cursor++;
  753               }
  754           }
  755           // Append the intervening text
  756           sb.append(text, lastAppendPosition, first);
  757           // Append the match substitution
  758           sb.append(result);
  759   
  760           lastAppendPosition = last;
  761           return this;
  762       }
  763   
  764       /**
  765        * Implements a terminal append-and-replace step.
  766        *
  767        * <p> This method reads characters from the input sequence, starting at
  768        * the append position, and appends them to the given string buffer.  It is
  769        * intended to be invoked after one or more invocations of the {@link
  770        * #appendReplacement appendReplacement} method in order to copy the
  771        * remainder of the input sequence.  </p>
  772        *
  773        * @param  sb
  774        *         The target string buffer
  775        *
  776        * @return  The target string buffer
  777        */
  778       public StringBuffer appendTail(StringBuffer sb) {
  779           sb.append(text, lastAppendPosition, getTextLength());
  780           return sb;
  781       }
  782   
  783       /**
  784        * Replaces every subsequence of the input sequence that matches the
  785        * pattern with the given replacement string.
  786        *
  787        * <p> This method first resets this matcher.  It then scans the input
  788        * sequence looking for matches of the pattern.  Characters that are not
  789        * part of any match are appended directly to the result string; each match
  790        * is replaced in the result by the replacement string.  The replacement
  791        * string may contain references to captured subsequences as in the {@link
  792        * #appendReplacement appendReplacement} method.
  793        *
  794        * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
  795        * the replacement string may cause the results to be different than if it
  796        * were being treated as a literal replacement string. Dollar signs may be
  797        * treated as references to captured subsequences as described above, and
  798        * backslashes are used to escape literal characters in the replacement
  799        * string.
  800        *
  801        * <p> Given the regular expression <tt>a*b</tt>, the input
  802        * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
  803        * <tt>"-"</tt>, an invocation of this method on a matcher for that
  804        * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
  805        *
  806        * <p> Invoking this method changes this matcher's state.  If the matcher
  807        * is to be used in further matching operations then it should first be
  808        * reset.  </p>
  809        *
  810        * @param  replacement
  811        *         The replacement string
  812        *
  813        * @return  The string constructed by replacing each matching subsequence
  814        *          by the replacement string, substituting captured subsequences
  815        *          as needed
  816        */
  817       public String replaceAll(String replacement) {
  818           reset();
  819           boolean result = find();
  820           if (result) {
  821               StringBuffer sb = new StringBuffer();
  822               do {
  823                   appendReplacement(sb, replacement);
  824                   result = find();
  825               } while (result);
  826               appendTail(sb);
  827               return sb.toString();
  828           }
  829           return text.toString();
  830       }
  831   
  832       /**
  833        * Replaces the first subsequence of the input sequence that matches the
  834        * pattern with the given replacement string.
  835        *
  836        * <p> This method first resets this matcher.  It then scans the input
  837        * sequence looking for a match of the pattern.  Characters that are not
  838        * part of the match are appended directly to the result string; the match
  839        * is replaced in the result by the replacement string.  The replacement
  840        * string may contain references to captured subsequences as in the {@link
  841        * #appendReplacement appendReplacement} method.
  842        *
  843        * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
  844        * the replacement string may cause the results to be different than if it
  845        * were being treated as a literal replacement string. Dollar signs may be
  846        * treated as references to captured subsequences as described above, and
  847        * backslashes are used to escape literal characters in the replacement
  848        * string.
  849        *
  850        * <p> Given the regular expression <tt>dog</tt>, the input
  851        * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
  852        * <tt>"cat"</tt>, an invocation of this method on a matcher for that
  853        * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>.  </p>
  854        *
  855        * <p> Invoking this method changes this matcher's state.  If the matcher
  856        * is to be used in further matching operations then it should first be
  857        * reset.  </p>
  858        *
  859        * @param  replacement
  860        *         The replacement string
  861        * @return  The string constructed by replacing the first matching
  862        *          subsequence by the replacement string, substituting captured
  863        *          subsequences as needed
  864        */
  865       public String replaceFirst(String replacement) {
  866           if (replacement == null)
  867               throw new NullPointerException("replacement");
  868           reset();
  869           if (!find())
  870               return text.toString();
  871           StringBuffer sb = new StringBuffer();
  872           appendReplacement(sb, replacement);
  873           appendTail(sb);
  874           return sb.toString();
  875       }
  876   
  877       /**
  878        * Sets the limits of this matcher's region. The region is the part of the
  879        * input sequence that will be searched to find a match. Invoking this
  880        * method resets the matcher, and then sets the region to start at the
  881        * index specified by the <code>start</code> parameter and end at the
  882        * index specified by the <code>end</code> parameter.
  883        *
  884        * <p>Depending on the transparency and anchoring being used (see
  885        * {@link #useTransparentBounds useTransparentBounds} and
  886        * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
  887        * as anchors may behave differently at or around the boundaries of the
  888        * region.
  889        *
  890        * @param  start
  891        *         The index to start searching at (inclusive)
  892        * @param  end
  893        *         The index to end searching at (exclusive)
  894        * @throws  IndexOutOfBoundsException
  895        *          If start or end is less than zero, if
  896        *          start is greater than the length of the input sequence, if
  897        *          end is greater than the length of the input sequence, or if
  898        *          start is greater than end.
  899        * @return  this matcher
  900        * @since 1.5
  901        */
  902       public Matcher region(int start, int end) {
  903           if ((start < 0) || (start > getTextLength()))
  904               throw new IndexOutOfBoundsException("start");
  905           if ((end < 0) || (end > getTextLength()))
  906               throw new IndexOutOfBoundsException("end");
  907           if (start > end)
  908               throw new IndexOutOfBoundsException("start > end");
  909           reset();
  910           from = start;
  911           to = end;
  912           return this;
  913       }
  914   
  915       /**
  916        * Reports the start index of this matcher's region. The
  917        * searches this matcher conducts are limited to finding matches
  918        * within {@link #regionStart regionStart} (inclusive) and
  919        * {@link #regionEnd regionEnd} (exclusive).
  920        *
  921        * @return  The starting point of this matcher's region
  922        * @since 1.5
  923        */
  924       public int regionStart() {
  925           return from;
  926       }
  927   
  928       /**
  929        * Reports the end index (exclusive) of this matcher's region.
  930        * The searches this matcher conducts are limited to finding matches
  931        * within {@link #regionStart regionStart} (inclusive) and
  932        * {@link #regionEnd regionEnd} (exclusive).
  933        *
  934        * @return  the ending point of this matcher's region
  935        * @since 1.5
  936        */
  937       public int regionEnd() {
  938           return to;
  939       }
  940   
  941       /**
  942        * Queries the transparency of region bounds for this matcher.
  943        *
  944        * <p> This method returns <tt>true</tt> if this matcher uses
  945        * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i>
  946        * bounds.
  947        *
  948        * <p> See {@link #useTransparentBounds useTransparentBounds} for a
  949        * description of transparent and opaque bounds.
  950        *
  951        * <p> By default, a matcher uses opaque region boundaries.
  952        *
  953        * @return <tt>true</tt> iff this matcher is using transparent bounds,
  954        *         <tt>false</tt> otherwise.
  955        * @see java.util.regex.Matcher#useTransparentBounds(boolean)
  956        * @since 1.5
  957        */
  958       public boolean hasTransparentBounds() {
  959           return transparentBounds;
  960       }
  961   
  962       /**
  963        * Sets the transparency of region bounds for this matcher.
  964        *
  965        * <p> Invoking this method with an argument of <tt>true</tt> will set this
  966        * matcher to use <i>transparent</i> bounds. If the boolean
  967        * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used.
  968        *
  969        * <p> Using transparent bounds, the boundaries of this
  970        * matcher's region are transparent to lookahead, lookbehind,
  971        * and boundary matching constructs. Those constructs can see beyond the
  972        * boundaries of the region to see if a match is appropriate.
  973        *
  974        * <p> Using opaque bounds, the boundaries of this matcher's
  975        * region are opaque to lookahead, lookbehind, and boundary matching
  976        * constructs that may try to see beyond them. Those constructs cannot
  977        * look past the boundaries so they will fail to match anything outside
  978        * of the region.
  979        *
  980        * <p> By default, a matcher uses opaque bounds.
  981        *
  982        * @param  b a boolean indicating whether to use opaque or transparent
  983        *         regions
  984        * @return this matcher
  985        * @see java.util.regex.Matcher#hasTransparentBounds
  986        * @since 1.5
  987        */
  988       public Matcher useTransparentBounds(boolean b) {
  989           transparentBounds = b;
  990           return this;
  991       }
  992   
  993       /**
  994        * Queries the anchoring of region bounds for this matcher.
  995        *
  996        * <p> This method returns <tt>true</tt> if this matcher uses
  997        * <i>anchoring</i> bounds, <tt>false</tt> otherwise.
  998        *
  999        * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a
 1000        * description of anchoring bounds.
 1001        *
 1002        * <p> By default, a matcher uses anchoring region boundaries.
 1003        *
 1004        * @return <tt>true</tt> iff this matcher is using anchoring bounds,
 1005        *         <tt>false</tt> otherwise.
 1006        * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
 1007        * @since 1.5
 1008        */
 1009       public boolean hasAnchoringBounds() {
 1010           return anchoringBounds;
 1011       }
 1012   
 1013       /**
 1014        * Sets the anchoring of region bounds for this matcher.
 1015        *
 1016        * <p> Invoking this method with an argument of <tt>true</tt> will set this
 1017        * matcher to use <i>anchoring</i> bounds. If the boolean
 1018        * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be
 1019        * used.
 1020        *
 1021        * <p> Using anchoring bounds, the boundaries of this
 1022        * matcher's region match anchors such as ^ and $.
 1023        *
 1024        * <p> Without anchoring bounds, the boundaries of this
 1025        * matcher's region will not match anchors such as ^ and $.
 1026        *
 1027        * <p> By default, a matcher uses anchoring region boundaries.
 1028        *
 1029        * @param  b a boolean indicating whether or not to use anchoring bounds.
 1030        * @return this matcher
 1031        * @see java.util.regex.Matcher#hasAnchoringBounds
 1032        * @since 1.5
 1033        */
 1034       public Matcher useAnchoringBounds(boolean b) {
 1035           anchoringBounds = b;
 1036           return this;
 1037       }
 1038   
 1039       /**
 1040        * <p>Returns the string representation of this matcher. The
 1041        * string representation of a <code>Matcher</code> contains information
 1042        * that may be useful for debugging. The exact format is unspecified.
 1043        *
 1044        * @return  The string representation of this matcher
 1045        * @since 1.5
 1046        */
 1047       public String toString() {
 1048           StringBuilder sb = new StringBuilder();
 1049           sb.append("java.util.regex.Matcher");
 1050           sb.append("[pattern=" + pattern());
 1051           sb.append(" region=");
 1052           sb.append(regionStart() + "," + regionEnd());
 1053           sb.append(" lastmatch=");
 1054           if ((first >= 0) && (group() != null)) {
 1055               sb.append(group());
 1056           }
 1057           sb.append("]");
 1058           return sb.toString();
 1059       }
 1060   
 1061       /**
 1062        * <p>Returns true if the end of input was hit by the search engine in
 1063        * the last match operation performed by this matcher.
 1064        *
 1065        * <p>When this method returns true, then it is possible that more input
 1066        * would have changed the result of the last search.
 1067        *
 1068        * @return  true iff the end of input was hit in the last match; false
 1069        *          otherwise
 1070        * @since 1.5
 1071        */
 1072       public boolean hitEnd() {
 1073           return hitEnd;
 1074       }
 1075   
 1076       /**
 1077        * <p>Returns true if more input could change a positive match into a
 1078        * negative one.
 1079        *
 1080        * <p>If this method returns true, and a match was found, then more
 1081        * input could cause the match to be lost. If this method returns false
 1082        * and a match was found, then more input might change the match but the
 1083        * match won't be lost. If a match was not found, then requireEnd has no
 1084        * meaning.
 1085        *
 1086        * @return  true iff more input could change a positive match into a
 1087        *          negative one.
 1088        * @since 1.5
 1089        */
 1090       public boolean requireEnd() {
 1091           return requireEnd;
 1092       }
 1093   
 1094       /**
 1095        * Initiates a search to find a Pattern within the given bounds.
 1096        * The groups are filled with default values and the match of the root
 1097        * of the state machine is called. The state machine will hold the state
 1098        * of the match as it proceeds in this matcher.
 1099        *
 1100        * Matcher.from is not set here, because it is the "hard" boundary
 1101        * of the start of the search, to which anchors are tied. The from param
 1102        * is the "soft" boundary of the start of the search, meaning that the
 1103        * regex tries to match at that index but ^ won't match there. Subsequent
 1104        * calls to the search methods start at a new "soft" boundary which is
 1105        * the end of the previous match.
 1106        */
 1107       boolean search(int from) {
 1108           this.hitEnd = false;
 1109           this.requireEnd = false;
 1110           from        = from < 0 ? 0 : from;
 1111           this.first  = from;
 1112           this.oldLast = oldLast < 0 ? from : oldLast;
 1113           for (int i = 0; i < groups.length; i++)
 1114               groups[i] = -1;
 1115           acceptMode = NOANCHOR;
 1116           boolean result = parentPattern.root.match(this, from, text);
 1117           if (!result)
 1118               this.first = -1;
 1119           this.oldLast = this.last;
 1120           return result;
 1121       }
 1122   
 1123       /**
 1124        * Initiates a search for an anchored match to a Pattern within the given
 1125        * bounds. The groups are filled with default values and the match of the
 1126        * root of the state machine is called. The state machine will hold the
 1127        * state of the match as it proceeds in this matcher.
 1128        */
 1129       boolean match(int from, int anchor) {
 1130           this.hitEnd = false;
 1131           this.requireEnd = false;
 1132           from        = from < 0 ? 0 : from;
 1133           this.first  = from;
 1134           this.oldLast = oldLast < 0 ? from : oldLast;
 1135           for (int i = 0; i < groups.length; i++)
 1136               groups[i] = -1;
 1137           acceptMode = anchor;
 1138           boolean result = parentPattern.matchRoot.match(this, from, text);
 1139           if (!result)
 1140               this.first = -1;
 1141           this.oldLast = this.last;
 1142           return result;
 1143       }
 1144   
 1145       /**
 1146        * Returns the end index of the text.
 1147        *
 1148        * @return the index after the last character in the text
 1149        */
 1150       int getTextLength() {
 1151           return text.length();
 1152       }
 1153   
 1154       /**
 1155        * Generates a String from this Matcher's input in the specified range.
 1156        *
 1157        * @param  beginIndex   the beginning index, inclusive
 1158        * @param  endIndex     the ending index, exclusive
 1159        * @return A String generated from this Matcher's input
 1160        */
 1161       CharSequence getSubSequence(int beginIndex, int endIndex) {
 1162           return text.subSequence(beginIndex, endIndex);
 1163       }
 1164   
 1165       /**
 1166        * Returns this Matcher's input character at index i.
 1167        *
 1168        * @return A char from the specified index
 1169        */
 1170       char charAt(int i) {
 1171           return text.charAt(i);
 1172       }
 1173   
 1174   }

Save This Page
Home » openjdk-7 » java » util » regex » [javadoc | source]