Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/apache/oro/text/perl/Perl5Util.java


1   /*
2    * $Id: Perl5Util.java,v 1.19 2003/11/07 20:16:25 dfs Exp $
3    *
4    * ====================================================================
5    * The Apache Software License, Version 1.1
6    *
7    * Copyright (c) 2000 The Apache Software Foundation.  All rights
8    * reserved.
9    *
10   * Redistribution and use in source and binary forms, with or without
11   * modification, are permitted provided that the following conditions
12   * are met:
13   *
14   * 1. Redistributions of source code must retain the above copyright
15   *    notice, this list of conditions and the following disclaimer.
16   *
17   * 2. Redistributions in binary form must reproduce the above copyright
18   *    notice, this list of conditions and the following disclaimer in
19   *    the documentation and/or other materials provided with the
20   *    distribution.
21   *
22   * 3. The end-user documentation included with the redistribution,
23   *    if any, must include the following acknowledgment:
24   *       "This product includes software developed by the
25   *        Apache Software Foundation (http://www.apache.org/)."
26   *    Alternately, this acknowledgment may appear in the software itself,
27   *    if and wherever such third-party acknowledgments normally appear.
28   *
29   * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
30   *    must not be used to endorse or promote products derived from this
31   *    software without prior written permission. For written
32   *    permission, please contact apache@apache.org.
33   *
34   * 5. Products derived from this software may not be called "Apache" 
35   *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
36   *    name, without prior written permission of the Apache Software Foundation.
37   *
38   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49   * SUCH DAMAGE.
50   * ====================================================================
51   *
52   * This software consists of voluntary contributions made by many
53   * individuals on behalf of the Apache Software Foundation.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  
59  package org.apache.oro.text.perl;
60  
61  import java.util.*;
62  
63  import org.apache.oro.text.*;
64  import org.apache.oro.text.regex.*;
65  import org.apache.oro.util.*;
66  
67  /**
68   * This is a utility class implementing the 3 most common Perl5 operations
69   * involving regular expressions:
70   * <ul>
71   * <li> [m]/pattern/[i][m][s][x],
72   * <li> s/pattern/replacement/[g][i][m][o][s][x],
73   * <li> and split().
74   * </ul>
75   * As with Perl, any non-alphanumeric character can be used in lieu of
76   * the slashes.
77   *  <p>
78   * The objective of the class is to minimize the amount of code a Java
79   * programmer using Jakarta-ORO
80   * has to write to achieve the same results as Perl by 
81   * transparently handling regular expression compilation, caching, and
82   * matching.  A second objective is to use the same Perl pattern matching
83   * syntax to ease the task of Perl programmers transitioning to Java
84   * (this also reduces the number of parameters to a method).
85   * All the state affecting methods are synchronized to avoid
86   * the maintenance of explicit locks in multithreaded programs.  This
87   * philosophy differs from the
88   * {@link org.apache.oro.text.regex} package, where
89   * you are expected to either maintain explicit locks, or more preferably
90   * create separate compiler and matcher instances for each thread.
91   * <p>
92   * To use this class, first create an instance using the default constructor
93   * or initialize the instance with a PatternCache of your choosing using
94   * the alternate constructor.  The default cache used by Perl5Util is a
95   * PatternCacheLRU of capacity GenericPatternCache.DEFAULT_CAPACITY.  You may
96   * want to create a cache with a different capacity, a different
97   * cache replacement policy, or even devise your own PatternCache
98   * implementation.  The PatternCacheLRU is probably the best general purpose
99   * pattern cache, but your specific application may be better served by
100  * a different cache replacement policy.  You should remember that you can
101  * front-load a cache with all the patterns you will be using before
102  * initializing a Perl5Util instance, or you can just let Perl5Util
103  * fill the cache as you use it.
104  * <p>
105  * You might use the class as follows:
106  * <pre>
107  * Perl5Util util = new Perl5Util();
108  * String line;
109  * DataInputStream input;
110  * PrintStream output;
111  * 
112  * // Initialization of input and output omitted
113  * while((line = input.readLine()) != null) {
114  *     // First find the line with the string we want to substitute because
115  *     // it is cheaper than blindly substituting each line.
116  *     if(util.match("/HREF=\"description1.html\"/", line)) {
117  *        line = util.substitute("s/description1\\.html/about1.html/", line);
118  *     }
119  *    output.println(line);
120  * }
121  * </pre>
122  * <p>
123  * A couple of things to remember when using this class are that the
124  * {@link #match match()} methods have the same meaning as
125  * {@link org.apache.oro.text.regex.Perl5Matcher#contains
126  *  Perl5Matcher.contains()}
127  * and <code>=~ m/pattern/</code> in Perl.  The methods are named match
128  * to more closely associate them with Perl and to differentiate them
129  * from {@link org.apache.oro.text.regex.Perl5Matcher#matches
130  * Perl5Matcher.matches()}.
131  * A further thing to keep in mind is that the
132  * {@link MalformedPerl5PatternException} class is derived from
133  * RuntimeException which means you DON'T have to catch it.  The reasoning
134  * behind this is that you will detect your regular expression mistakes
135  * as you write and debug your program when a MalformedPerl5PatternException
136  * is thrown during a test run.  However, we STRONGLY recommend that you
137  * ALWAYS catch MalformedPerl5PatternException whenever you deal with a
138  * DYNAMICALLY created pattern.  Relying on a fatal
139  * MalformedPerl5PatternException being thrown to detect errors while
140  * debugging is only useful for dealing with static patterns, that is, actual
141  * pregenerated strings present in your program.  Patterns created from user
142  * input or some other dynamic method CANNOT be relied upon to be correct
143  * and MUST be handled by catching MalformedPerl5PatternException for your
144  * programs to be robust.
145  * <p>
146  * Finally, as a convenience Perl5Util implements 
147  * the {@link org.apache.oro.text.regex.MatchResult MatchResult} interface.
148  * The methods are merely wrappers which call the corresponding method of
149  * the last {@link org.apache.oro.text.regex.MatchResult MatchResult}
150  * found (which can be accessed with {@link #getMatch()}) by a match or
151  * substitution (or even a split, but this isn't particularly useful).
152  * At the moment, the
153  * {@link org.apache.oro.text.regex.MatchResult MatchResult} returned
154  * by {@link #getMatch()} is not stored in a thread-local variable.  Therefore
155  * concurrent calls to {@link #getMatch()} will produce unpredictable
156  * results.  So if your concurrent program requires the match results,
157  * you must protect the matching and the result retrieval in a critical
158  * section.  If you do not need match results, you don't need to do anything
159  * special.  If you feel the J2SE implementation of {@link #getMatch()}
160  * should use a thread-local variable and obviate the need for a critical
161  * section, please express your views on the oro-dev mailing list.
162  *
163  * @version @version@
164  * @since 1.0
165  * @see MalformedPerl5PatternException
166  * @see org.apache.oro.text.PatternCache
167  * @see org.apache.oro.text.PatternCacheLRU
168  * @see org.apache.oro.text.regex.MatchResult
169  */
170 public final class Perl5Util implements MatchResult {
171   /** Parses [m]/pattern/[imsx]: group 1 delimiter, 2 regex, 3 options. */
172   private static final String __matchExpression = "m?(\\W)(.*)\\1([imsx]*)";
173 
174   /** The pattern cache used to compile and store regular expressions. */
175   private PatternCache __patternCache;
176   /** Caches parsed expressions (Patterns and substitution entries). */
177   private Cache __expressionCache;
178   /** The matcher shared by the matching and substitution methods. */
179   private Perl5Matcher __matcher;
180   /** The compiled form of __matchExpression. */
181   private Pattern __matchPattern;
182   /** The match most recently stored by a match or substitute call. */
183   private MatchResult __lastMatch;
184   /**
185    * A container for temporarily holding the results of a split before
186    * deleting trailing empty fields.
187    */
188   private ArrayList __splitList;
189 
190   /**
191    * Keeps track of the original input for the postMatch() and preMatch()
192    * methods.  This will be discarded if the preMatch() and postMatch()
193    * methods are moved into the MatchResult interface.
194    */
195   private Object __originalInput;
196 
197   /**
198    * Keeps track of the begin and end offsets of the original input for
199    * the postMatch() and preMatch() methods.
200    */
201   private int __inputBeginOffset, __inputEndOffset;
202 
203   /** Used as the default return value of preMatch() and postMatch(). */
204   private static final String __nullString = "";
205 
206   /**
207    * A constant passed to the {@link #split split()} methods indicating
208    * that all occurrences of a pattern should be used to split a string.
209    */
210   public static final int SPLIT_ALL = Util.SPLIT_ALL;
211 
212   /**
213    * A secondary constructor for Perl5Util.  It initializes the Perl5Matcher
214    * used by the class to perform matching operations, but requires the
215    * programmer to provide a PatternCache instance for the class
216    * to use to compile and store regular expressions.  You would want to
217    * use this constructor if you want to change the capacity or policy 
218    * of the cache used.  Example uses might be:
219    * <pre>
220    * // We know we're going to use close to 50 expressions a whole lot, so
221    * // we create a cache of the proper size.
222    * util = new Perl5Util(new PatternCacheLRU(50));
223    * </pre>
224    * or
225    * <pre>
226    * // We're only going to use a few expressions and know that second-chance
227    * // fifo is best suited to the order in which we are using the patterns.
228    * util = new Perl5Util(new PatternCacheFIFO2(10));
229    * </pre>
230    */
231   public Perl5Util(PatternCache cache) {
232     this.__patternCache    = cache;
233     this.__expressionCache = new CacheLRU(cache.capacity());  // same size
234     this.__matcher         = new Perl5Matcher();
235     this.__splitList       = new ArrayList();
236     this.__compilePatterns();  // prepares __matchPattern for parsing
237   }
238 
239   /**
240    * Default constructor for Perl5Util.  This initializes the Perl5Matcher
241    * used by the class to perform matching operations and creates a
242    * default PatternCacheLRU instance to use to compile and cache regular
243    * expressions.  The size of this cache is 
244    * GenericPatternCache.DEFAULT_CAPACITY.
245    */
246   public Perl5Util() {
247     this(new PatternCacheLRU());  // delegate with a default-constructed LRU cache
248   }
249 
250   /**
251    * Compiles the patterns (currently only the match expression) used to
252    * parse Perl5 expressions.  Right now it initializes __matchPattern.
253    */
254   private void __compilePatterns() {
255     Perl5Compiler compiler = new Perl5Compiler();
256 
257     try {
258       __matchPattern =
259         compiler.compile(__matchExpression, Perl5Compiler.SINGLELINE_MASK);
260     } catch(MalformedPatternException e) {
261       // __matchExpression is a compile-time constant, so this should only
262       // happen during debugging; chain the cause so the trace is kept.
263       throw new RuntimeException(e.getMessage(), e);
264     }
265   }
266 
267   /**
268    * Parses a match expression and returns a compiled pattern.
269    * First checks the expression cache and if the pattern is not found,
270    * then parses the expression and fetches a compiled pattern from the
271    * pattern cache.  Otherwise, just uses the pattern found in the
272    * expression cache.  __matchPattern is used to parse the expression.
273    * <p>
274    * @param pattern  The Perl5 match expression to parse.
275    * @exception MalformedPerl5PatternException If there is an error parsing
276    *            the expression.
277    */
278   private Pattern __parseMatchExpression(String pattern)
279        throws MalformedPerl5PatternException
280   {
281     int index, compileOptions;
282     String options, regex;
283     MatchResult result;
284     Object obj;
285     Pattern ret;
286 
287     obj = __expressionCache.getElement(pattern);
288 
289     // The expression cache is shared with substitute(), so a hit may be a
290     // substitution entry if an s/// expression was passed by mistake.
291     // Test the type explicitly instead of catching ClassCastException;
292     // anything that is not a Pattern falls through to the parse below.
293     if(obj instanceof Pattern)
294       return (Pattern)obj;
295 
296     // Not cached: split the expression with __matchPattern, whose groups
297     // are 1 = delimiter, 2 = regular expression, 3 = option letters.
298 
299     if(!__matcher.matches(pattern, __matchPattern))
300       throw new
301         MalformedPerl5PatternException("Invalid expression: " +
302                                        pattern);
303 
304     result = __matcher.getMatch();
305 
306     regex = result.group(2);
307     compileOptions = Perl5Compiler.DEFAULT_MASK;
308 
309     options = result.group(3);
310 
311     if(options != null) {
312       index = options.length();
313 
314       while(index-- > 0) {
315         switch(options.charAt(index)) {
316         case 'i' :
317           compileOptions |= Perl5Compiler.CASE_INSENSITIVE_MASK;
318           break;
319         case 'm' : compileOptions |= Perl5Compiler.MULTILINE_MASK; break;
320         case 's' : compileOptions |= Perl5Compiler.SINGLELINE_MASK; break;
321         case 'x' : compileOptions |= Perl5Compiler.EXTENDED_MASK; break;
322         default  :
323           throw new
324             MalformedPerl5PatternException("Invalid options: " + options);
325         }
326       }
327     }
328 
329     ret = __patternCache.getPattern(regex, compileOptions);
330     __expressionCache.addElement(pattern, ret);  // keyed by full expression
331 
332     return ret;
333   }
334 
335   /**
336    * Searches for the first pattern match somewhere in a character array
337    * taking a pattern specified in Perl5 native format:
338    * <blockquote><pre>
339    * [m]/pattern/[i][m][s][x]
340    * </pre></blockquote>
341    * The <code>m</code> prefix is optional and the meaning of the optional
342    * trailing options are:
343    * <dl compact> 
344    * <dt> i <dd> case insensitive match
345    * <dt> m <dd> treat the input as consisting of multiple lines
346    * <dt> s <dd> treat the input as consisting of a single line
347    * <dt> x <dd> enable extended expression syntax incorporating whitespace
348    *             and comments
349    * </dl>
350    * As with Perl, any non-alphanumeric character can be used in lieu of
351    * the slashes.
352    * <p>
353    * If the input contains the pattern, the org.apache.oro.text.regex.MatchResult
354    * can be obtained by calling {@link #getMatch()}.
355    * However, Perl5Util implements the MatchResult interface as a wrapper
356    * around the last MatchResult found, so you can call its methods to
357    * access match information.
358    * <p>
359    * @param pattern  The pattern to search for.
360    * @param input    The char[] input to search.
361    * @return True if the input contains the pattern, false otherwise.
362    * @exception MalformedPerl5PatternException  If there is an error in
363    *            the pattern.  You are not forced to catch this exception
364    *            because it is derived from RuntimeException.
365    */
366   public synchronized boolean match(String pattern, char[] input) 
367        throws MalformedPerl5PatternException
368   {
369     boolean result;
370     Pattern compiled = __parseMatchExpression(pattern);  // parse only once
371 
372     result = __matcher.contains(input, compiled);
373     // A failed search leaves the previously stored match untouched.
374     if(result) {
375       __lastMatch        = __matcher.getMatch();
376       __originalInput    = input;
377       __inputBeginOffset = 0;             // the whole array was searched
378       __inputEndOffset   = input.length;
379     }
380 
381     return result;
382   }
383 
384 
385   /**
386    * Searches for the first pattern match in a String taking
387    * a pattern specified in Perl5 native format:
388    * <blockquote><pre>
389    * [m]/pattern/[i][m][s][x]
390    * </pre></blockquote>
391    * The <code>m</code> prefix is optional and the meaning of the optional
392    * trailing options are:
393    * <dl compact> 
394    * <dt> i <dd> case insensitive match
395    * <dt> m <dd> treat the input as consisting of multiple lines
396    * <dt> s <dd> treat the input as consisting of a single line
397    * <dt> x <dd> enable extended expression syntax incorporating whitespace
398    *             and comments
399    * </dl>
400    * As with Perl, any non-alphanumeric character can be used in lieu of
401    * the slashes.
402    * <p>
403    * If the input contains the pattern, the
404    * {@link org.apache.oro.text.regex.MatchResult MatchResult}
405    * can be obtained by calling {@link #getMatch()}.
406    * However, Perl5Util implements the MatchResult interface as a wrapper
407    * around the last MatchResult found, so you can call its methods to
408    * access match information.
409    * <p>
410    * @param pattern  The pattern to search for.
411    * @param input    The String input to search.
412    * @return True if the input contains the pattern, false otherwise.
413    * @exception MalformedPerl5PatternException  If there is an error in
414    *            the pattern.  You are not forced to catch this exception
415    *            because it is derived from RuntimeException.
416    */
417   public synchronized boolean match(String pattern, String input)
418        throws MalformedPerl5PatternException {
419     final char[] characters = input.toCharArray();
420     return match(pattern, characters);  // char[] overload does the work
421   }
422 
423 
424   /**
425    * Searches for the next pattern match somewhere in a
426    * org.apache.oro.text.regex.PatternMatcherInput instance, taking
427    * a pattern specified in Perl5 native format:
428    * <blockquote><pre>
429    * [m]/pattern/[i][m][s][x]
430    * </pre></blockquote>
431    * The <code>m</code> prefix is optional and the meaning of the optional
432    * trailing options are:
433    * <dl compact> 
434    * <dt> i <dd> case insensitive match
435    * <dt> m <dd> treat the input as consisting of multiple lines
436    * <dt> s <dd> treat the input as consisting of a single line
437    * <dt> x <dd> enable extended expression syntax incorporating whitespace
438    *             and comments
439    * </dl>
440    * As with Perl, any non-alphanumeric character can be used in lieu of
441    * the slashes.
442    * <p>
443    * If the input contains the pattern, the
444    * {@link org.apache.oro.text.regex.MatchResult MatchResult}
445    * can be obtained by calling {@link #getMatch()}.
446    * However, Perl5Util implements the MatchResult interface as a wrapper
447    * around the last MatchResult found, so you can call its methods to
448    * access match information.
449    * After the call to this method, the PatternMatcherInput current offset
450    * is advanced to the end of the match, so you can use it to repeatedly
451    * search for expressions in the entire input using a while loop as
452    * explained in the {@link org.apache.oro.text.regex.PatternMatcherInput
453    * PatternMatcherInput} documentation.
454    * <p>
455    * @param pattern  The pattern to search for.
456    * @param input    The PatternMatcherInput to search.
457    * @return True if the input contains the pattern, false otherwise.
458    * @exception MalformedPerl5PatternException  If there is an error in
459    *            the pattern.  You are not forced to catch this exception
460    *            because it is derived from RuntimeException.
461    */
462   public synchronized boolean match(String pattern, PatternMatcherInput input)
463        throws MalformedPerl5PatternException
464   {
465     Pattern compiled = __parseMatchExpression(pattern);
466 
467     // A failed search leaves the previously stored match untouched.
468     if(!__matcher.contains(input, compiled))
469       return false;
470 
471     // Remember the match and the region of the input that was searched.
472     __lastMatch        = __matcher.getMatch();
473     __originalInput    = input.getInput();
474     __inputBeginOffset = input.getBeginOffset();
475     __inputEndOffset   = input.getEndOffset();
476     return true;
477   }
478 
479 
480   /**
481    * Returns the last match found by a call to a match(), substitute(), or
482    * split() method.  This method is only intended for retrieving the match
483    * found by the most recent call to a match() method.  This method should
484    * be used when you want to save MatchResult instances.  Otherwise, for
485    * simply accessing match information, it is more convenient to use the
486    * Perl5Util methods implementing the MatchResult interface.
487    * <p>
488    * @return The org.apache.oro.text.regex.MatchResult instance containing the
489    *         last match found.
490    */
491   public synchronized MatchResult getMatch() {
492     return __lastMatch;  // still null if no call has stored a match yet
493   }
494 
495 
496   /**
497    * Substitutes a pattern in a given input with a replacement string.
498    * The substitution expression is specified in Perl5 native format:
499    * <blockquote><pre>
500    * s/pattern/replacement/[g][i][m][o][s][x]
501    * </pre></blockquote>
502    * The <code>s</code> prefix is mandatory and the meaning of the optional
503    * trailing options are:
504    * <dl compact> 
505    * <dt> g <dd> Substitute all occurrences of pattern with replacement.
506    *             The default is to replace only the first occurrence.
507    * <dt> i <dd> perform a case insensitive match
508    * <dt> m <dd> treat the input as consisting of multiple lines
509    * <dt> o <dd> If variable interpolation is used, only evaluate the
510    *             interpolation once (the first time).  This is equivalent
511    *             to using a numInterpolations argument of 1 in
512    * {@link org.apache.oro.text.regex.Util#substitute Util.substitute()}.
513    *             The default is to compute each interpolation independently.
514    *             See
515    * {@link org.apache.oro.text.regex.Util#substitute Util.substitute()}
516    * and {@link org.apache.oro.text.regex.Perl5Substitution Perl5Substitution}
517    *             for more details on variable interpolation in
518    *             substitutions.
519    * <dt> s <dd> treat the input as consisting of a single line
520    * <dt> x <dd> enable extended expression syntax incorporating whitespace
521    *             and comments
522    * </dl>
523    * As with Perl, any non-alphanumeric character can be used in lieu of
524    * the slashes.  This is helpful to avoid backslashing.  For example,
525    * using slashes you would have to do:
526    * <blockquote><pre>
527    * numSubs = util.substitute(result, "s/foo\\/bar/goo\\/\\/baz/", input);
528    * </pre></blockquote>
529    * when you could more easily write:
530    * <blockquote><pre>
531    * numSubs = util.substitute(result, "s#foo/bar#goo//baz#", input);
532    * </pre></blockquote>
533    * where the hashmarks are used instead of slashes.
534    * <p>
535    * There is a special case of backslashing that you need to pay attention
536    * to.  As demonstrated above, to denote a delimiter in the substituted
537    * string it must be backslashed.  However, this can be a problem
538    * when you want to denote a backslash at the end of the substituted
539    * string.  As of PerlTools 1.3, a new means of handling this
540    * situation has been implemented.
541    * In previous versions, the behavior was that
542    * <blockquote>
543    * "... a double backslash (quadrupled in the Java String) always
544    * represents two backslashes unless the second backslash is followed
545    * by the delimiter, in which case it represents a single backslash."
546    * </blockquote>
547    * <p>
548    * The new behavior is that a backslash is always a backslash
549    * in the substitution portion of the expression unless it is used to
550    * escape a delimiter.  A backslash is considered to escape a delimiter
551    * if an even number of contiguous backslashes precede the backslash
552    * and the delimiter following the backslash is not the FINAL delimiter
553    * in the expression.  Therefore, backslashes preceding final delimiters
554    * are never considered to escape the delimiter.  The following, which
555    * used to be an invalid expression and require a special-case extra
556    * backslash, will now replace all instances of / with \:
557    * <blockquote><pre>
558    * numSubs = util.substitute(result, "s#/#\\#g", input);
559    * </pre></blockquote>
560    * <p>
561    * @param result     The StringBuffer in which to store the result of the
562    *                   substitutions. The buffer is only appended to.
563    * @param expression The Perl5 substitution regular expression.
564    * @param input      The input on which to perform substitutions.
565    * @return The number of substitutions made.
566    * @exception MalformedPerl5PatternException  If there is an error in
567    *            the expression.  You are not forced to catch this exception
568    *            because it is derived from RuntimeException.
569    * @since 2.0.6
570    */
571   // Expression parsing will have to be moved into a separate method if
572   // there are going to be variations of this method.
573   public synchronized int substitute(StringBuffer result, String expression,
574                                      String input)
575        throws MalformedPerl5PatternException
576   {
577     boolean backslash, finalDelimiter;
578     int index, compileOptions, numSubstitutions, numInterpolations;
579     int firstOffset, secondOffset, thirdOffset, subCount;
580     StringBuffer replacement;
581     Pattern compiledPattern;
582     char exp[], delimiter;
583     ParsedSubstitutionEntry entry;
584     Perl5Substitution substitution;
585     Object obj;
586 
587     obj = __expressionCache.getElement(expression);
588 
589     // Fast path: the expression was parsed by an earlier call.  The cache
590     // is shared with the match methods, so a hit may instead be a Pattern
591     // if an m// expression was passed by mistake; that case is tested for
592     // explicitly (no ClassCastException, no labeled break) and falls
593     // through to the parser below.
594     if(obj instanceof ParsedSubstitutionEntry) {
595       entry = (ParsedSubstitutionEntry)obj;
596 
597       subCount =
598         Util.substitute(result, __matcher, entry._pattern,
599                         entry._substitution, input,
600                         entry._numSubstitutions);
601 
602       // NOTE(review): only __lastMatch is refreshed here; __originalInput
603       // and the input offsets keep whatever the last match() call stored.
604       // Confirm this is what preMatch()/postMatch() expect.
605       __lastMatch = __matcher.getMatch();
606 
607       return subCount;
608     }
609 
610 
611     exp = expression.toCharArray();
612 
613     // Make sure basic conditions for a valid substitution expression hold.
614     if(exp.length < 4 || exp[0] != 's' || Character.isLetterOrDigit(exp[1])
615        || exp[1] == '-')
616       throw new
617         MalformedPerl5PatternException("Invalid expression: " + expression);
618     delimiter    = exp[1];
619     firstOffset  = 2;
620     secondOffset = thirdOffset = -1;
621     backslash    = false;
622 
623     // Parse pattern: find the first delimiter not preceded by a backslash.
624     for(index = firstOffset; index < exp.length; index++) {
625       if(exp[index] == '\\')
626         backslash = !backslash;
627       else if(exp[index] == delimiter && !backslash) {
628         secondOffset = index;
629         break;
630       } else if(backslash)
631         backslash = !backslash;  // the escaped character has been consumed
632     }
633 
634     if(secondOffset == -1 || secondOffset == exp.length - 1)
635       throw new
636         MalformedPerl5PatternException("Invalid expression: " + expression);
637 
638     // Parse replacement string, copying it into replacement minus any
639     // backslash that escapes a non-final delimiter.
640     backslash = false;
641     finalDelimiter = true;
642     replacement = new StringBuffer(exp.length - secondOffset);
643     for(index = secondOffset + 1; index < exp.length; index++) {
644       if(exp[index] == '\\') {
645         backslash = !backslash;
646 
647         // 05/05/99 dfs
648         // We unbackslash backslashed delimiters in the replacement string
649         // only if we're on an odd backslash and there is another occurrence
650         // of a delimiter later in the string.
651         if(backslash && index + 1 < exp.length && exp[index + 1] == delimiter
652            && expression.lastIndexOf(delimiter, exp.length - 1) != (index + 1))
653         {
654           finalDelimiter = false;  // next delimiter is literal, not the end
655           continue;                // skip the escaping backslash itself
656         }
657       } else if(exp[index] == delimiter && finalDelimiter) {
658         thirdOffset = index;
659         break;
660       } else {
661         backslash      = false;
662         finalDelimiter = true;
663       }
664 
665       replacement.append(exp[index]);
666     }
667 
668     if(thirdOffset == -1)
669       throw new
670         MalformedPerl5PatternException("Invalid expression: " + expression);
671 
672     compileOptions    = Perl5Compiler.DEFAULT_MASK;
673     numSubstitutions  = 1;
674 
675     // Single quotes cause no interpolations to be performed in replacement
676     if(delimiter != '\'')
677       numInterpolations = Perl5Substitution.INTERPOLATE_ALL;
678     else
679       numInterpolations = Perl5Substitution.INTERPOLATE_NONE;
680 
681     // Parse options: everything after the final delimiter.
682     for(index = thirdOffset + 1; index < exp.length; index++) {
683       switch(exp[index]) {
684       case 'i' :
685         compileOptions |= Perl5Compiler.CASE_INSENSITIVE_MASK;
686         break;
687       case 'm' : compileOptions |= Perl5Compiler.MULTILINE_MASK; break;
688       case 's' : compileOptions |= Perl5Compiler.SINGLELINE_MASK; break;
689       case 'x' : compileOptions |= Perl5Compiler.EXTENDED_MASK; break;
690       case 'g' : numSubstitutions = Util.SUBSTITUTE_ALL; break;
691       case 'o' : numInterpolations = 1; break;
692       default  :
693         throw new
694           MalformedPerl5PatternException("Invalid option: " + exp[index]);
695       }
696     }
697 
698     compiledPattern =
699       __patternCache.getPattern(new String(exp, firstOffset,
700                                            secondOffset - firstOffset),
701                                 compileOptions);
702     substitution =
703       new Perl5Substitution(replacement.toString(), numInterpolations);
704     entry = new ParsedSubstitutionEntry(compiledPattern, substitution,
705                                         numSubstitutions);
706     __expressionCache.addElement(expression, entry);  // reused by fast path
707 
708     subCount =
709       Util.substitute(result, __matcher, compiledPattern, substitution,
710                       input, numSubstitutions);
711 
712     __lastMatch = __matcher.getMatch();
713 
714     return subCount;
715   }
716 
717   /**
718    * Substitutes a pattern in a given input with a replacement string.
719    * The substitution expression is specified in Perl5 native format.
720    * <dl compact>
721    *   <dt>Calling this method is the same as:</dt>
722    *   <dd>
723    *     <blockquote><pre>
724    *      String result;
725    *      StringBuffer buffer = new StringBuffer();
726    *      perl.substitute(buffer, expression, input);
727    *      result = buffer.toString();
728    *     </pre></blockquote>
729    *   </dd>
730    * </dl>
731    * @param expression The Perl5 substitution regular expression.
732    * @param input      The input on which to perform substitutions.
733    * @return  The input as a String after substitutions have been performed.
734    * @exception MalformedPerl5PatternException  If there is an error in
735    *            the expression.  You are not forced to catch this exception
736    *            because it is derived from RuntimeException.
737    * @since 1.0
738    * @see #substitute
739    */
740   public synchronized String substitute(String expression, String input)
741     throws MalformedPerl5PatternException
742   {
743     StringBuffer result = new StringBuffer();
744     substitute(result, expression, input);
745     return result.toString();
746   }
747  
748   /**
749    * Splits a String into strings that are appended to a List, but no more
750    * than a specified limit.  The String is split using a regular expression
751    * as the delimiter.  The regular expression is a pattern specified
752    * in Perl5 native format:
753    * <blockquote><pre>
754    * [m]/pattern/[i][m][s][x]
755    * </pre></blockquote>
756    * The <code>m</code> prefix is optional and the meaning of the optional
757    * trailing options are:
758    * <dl compact> 
759    * <dt> i <dd> case insensitive match
760    * <dt> m <dd> treat the input as consisting of multiple lines
761    * <dt> s <dd> treat the input as consisting of a single line
762    * <dt> x <dd> enable extended expression syntax incorporating whitespace
763    *             and comments
764    * </dl>
765    * As with Perl, any non-alphanumeric character can be used in lieu of
766    * the slashes.
767    * <p>
768    * The limit parameter causes the string to be split on at most the first
769    * <b>limit - 1</b> number of pattern occurrences.
770    * <p>
771    * Of special note is that this split method performs EXACTLY the same
772    * as the Perl split() function.  In other words, if the split pattern
773    * contains parentheses, additional Vector elements are created from
774    * each of the matching subgroups in the pattern.  Using an example
775    * similar to the one from the Camel book:
776    * <blockquote><pre>
777    * split(list, "/([,-])/", "8-12,15,18")
778    * </pre></blockquote>
779    * produces the Vector containing:
780    * <blockquote><pre>
781    * { "8", "-", "12", ",", "15", ",", "18" }
782    * </pre></blockquote>
783    * Furthermore, the following Perl behavior is observed: "leading empty
784    * fields are preserved, and empty trailing one are deleted."  This
785    * has the effect that a split on a zero length string returns an empty
786    * list.
787    * The {@link org.apache.oro.text.regex.Util#split Util.split()} method
788    * does NOT implement these behaviors because it is intended to
789    * be a general self-consistent and predictable split function usable
790    * with Pattern instances other than Perl5Pattern.
791    * <p>
792    * @param results 
793    *    A <code> Collection </code> to which the substrings of the input
794    *    that occur between the regular expression delimiter occurrences
795    *    are appended. The input will not be split into any more substrings
796    *    than the specified 
797    *    limit. A way of thinking of this is that only the first
798    *    <b>limit - 1</b>
799    *    matches of the delimiting regular expression will be used to split the
800    *    input.  The Collection must support the
801    *    <code>addAll(Collection)</code> operation.
802    * @param pattern The regular expression to use as a split delimiter.
803    * @param input The String to split.
804    * @param limit The limit on the size of the returned <code>Vector</code>.
805    *   Values <= 0 produce the same behavior as the SPLIT_ALL constant which
806    *   causes the limit to be ignored and splits to be performed on all
807    *   occurrences of the pattern.  You should use the SPLIT_ALL constant
808    *   to achieve this behavior instead of relying on the default behavior
809    *   associated with non-positive limit values.
810    * @exception MalformedPerl5PatternException  If there is an error in
811    *            the expression.  You are not forced to catch this exception
812    *            because it is derived from RuntimeException.
813    */
814   public synchronized void split(Collection results, String pattern,
815          String input, int limit)
816        throws MalformedPerl5PatternException 
817   {
818     int beginOffset, groups, index;
819     String group;
820     MatchResult currentResult = null;
821     PatternMatcherInput pinput;
822     Pattern compiledPattern;
823 
824     compiledPattern = __parseMatchExpression(pattern);
825 
826     pinput = new PatternMatcherInput(input);
827     beginOffset = 0;
828 
829     while(--limit != 0 && __matcher.contains(pinput, compiledPattern)) {
830       currentResult = __matcher.getMatch();
831 
832       __splitList.add(input.substring(beginOffset,
833               currentResult.beginOffset(0)));
834 
835       if((groups = currentResult.groups()) > 1) {
836   for(index = 1; index < groups; ++index) {
837     group = currentResult.group(index);
838     if(group != null && group.length() > 0)
839       __splitList.add(group);
840   }
841       }
842 
843       beginOffset = currentResult.endOffset(0);
844     }
845 
846     __splitList.add(input.substring(beginOffset, input.length()));
847 
848     // Remove all trailing empty fields.
849     for(int i = __splitList.size() - 1; i >= 0; --i) {
850       String str;
851 
852       str = (String)__splitList.get(i);
853       if(str.length() == 0)
854   __splitList.remove(i);
855       else
856   break;
857     }
858 
859     results.addAll(__splitList);
860     __splitList.clear();
861 
862     // Just for the sake of completeness
863     __lastMatch = currentResult;
864   }
865 
866   /**
867    * This method is identical to calling:
868    * <blockquote><pre>
869    * split(results, pattern, input, SPLIT_ALL);
870    * </pre></blockquote>
871    */
872   public synchronized void split(Collection results, String pattern,
873          String input)
874        throws MalformedPerl5PatternException 
875   {
876     split(results, pattern, input, SPLIT_ALL);
877   }
878 
879   /**
880    * Splits input in the default Perl manner, splitting on all whitespace.
881    * This method is identical to calling:
882    * <blockquote><pre>
883    * split(results, "/\\s+/", input);
884    * </pre></blockquote>
885    */
886   public synchronized void split(Collection results, String input)
887        throws MalformedPerl5PatternException
888   {
889     split(results, "/\\s+/", input);
890   }
891 
892   /**
893    * Splits a String into strings contained in a Vector of size no greater
894    * than a specified limit.  The String is split using a regular expression
895    * as the delimiter.  The regular expression is a pattern specified
896    * in Perl5 native format:
897    * <blockquote><pre>
898    * [m]/pattern/[i][m][s][x]
899    * </pre></blockquote>
900    * The <code>m</code> prefix is optional and the meaning of the optional
901    * trailing options are:
902    * <dl compact> 
903    * <dt> i <dd> case insensitive match
904    * <dt> m <dd> treat the input as consisting of multiple lines
905    * <dt> s <dd> treat the input as consisting of a single line
906    * <dt> x <dd> enable extended expression syntax incorporating whitespace
907    *             and comments
908    * </dl>
909    * As with Perl, any non-alphanumeric character can be used in lieu of
910    * the slashes.
911    * <p>
912    * The limit parameter causes the string to be split on at most the first
913    * <b>limit - 1</b> number of pattern occurrences.
914    * <p>
915    * Of special note is that this split method performs EXACTLY the same
916    * as the Perl split() function.  In other words, if the split pattern
917    * contains parentheses, additional Vector elements are created from
918    * each of the matching subgroups in the pattern.  Using an example
919    * similar to the one from the Camel book:
920    * <blockquote><pre>
921    * split("/([,-])/", "8-12,15,18")
922    * </pre></blockquote>
923    * produces the Vector containing:
924    * <blockquote><pre>
925    * { "8", "-", "12", ",", "15", ",", "18" }
926    * </pre></blockquote>
927    * The {@link org.apache.oro.text.regex.Util#split Util.split()} method
928    * does NOT implement this particular behavior because it is intended to
929    * be usable with Pattern instances other than Perl5Pattern.
930    * <p>
931    * @deprecated Use
932    * {@link #split(Collection results, String pattern, String input, int limit)}
933    *  instead.
934    * @param pattern The regular expression to use as a split delimiter.
935    * @param input The String to split.
936    * @param limit The limit on the size of the returned <code>Vector</code>.
937    *   Values <= 0 produce the same behavior as the SPLIT_ALL constant which
938    *   causes the limit to be ignored and splits to be performed on all
939    *   occurrences of the pattern.  You should use the SPLIT_ALL constant
940    *   to achieve this behavior instead of relying on the default behavior
941    *   associated with non-positive limit values.
942    * @return A <code> Vector </code> containing the substrings of the input
943    *    that occur between the regular expression delimiter occurrences. The
944    *    input will not be split into any more substrings than the specified 
945    *    limit. A way of thinking of this is that only the first
946    *    <b>limit - 1</b>
947    *    matches of the delimiting regular expression will be used to split the
948    *    input. 
949    * @exception MalformedPerl5PatternException  If there is an error in
950    *            the expression.  You are not forced to catch this exception
951    *            because it is derived from RuntimeException.
952    */
953   public synchronized Vector split(String pattern, String input, int limit)
954        throws MalformedPerl5PatternException 
955   {
956     Vector results = new Vector(20);
957     split(results, pattern, input, limit);
958     return results;
959   }
960 
961   /**
962    * This method is identical to calling:
963    * <blockquote><pre>
964    * split(pattern, input, SPLIT_ALL);
965    * </pre></blockquote>
966    * @deprecated Use
967    * {@link #split(Collection results, String pattern, String input)} instead.
968    */
969   public synchronized Vector split(String pattern, String input)
970        throws MalformedPerl5PatternException 
971   {
972     return split(pattern, input, SPLIT_ALL);
973   }
974 
975   /**
976    * Splits input in the default Perl manner, splitting on all whitespace.
977    * This method is identical to calling:
978    * <blockquote><pre>
979    * split("/\\s+/", input);
980    * </pre></blockquote>
981    * @deprecated Use
982    * {@link #split(Collection results, String input)} instead.
983    */
984   public synchronized Vector split(String input)
985        throws MalformedPerl5PatternException 
986   {
987     return split("/\\s+/", input);
988   }
989 
990   //
991   // MatchResult interface methods.
992   //
993 
994   /**
995    * Returns the length of the last match found.
996    * <p>
997    * @return The length of the last match found.
998    */
999   public synchronized int length() {
1000    return __lastMatch.length();
1001  }
1002
1003  /**
1004   * @return The number of groups contained in the last match found.
1005   *         This number includes the 0th group.  In other words, the
1006   *         result refers to the number of parenthesized subgroups plus
1007   *         the entire match itself.          
1008   */
1009  public synchronized int groups() {
1010    return __lastMatch.groups();
1011  }
1012
1013
1014  /**
1015   * Returns the contents of the parenthesized subgroups of the last match
1016   * found according to the behavior dictated by the MatchResult interface.
1017   * <p>
1018   * @param group The pattern subgroup to return.
1019   * @return A string containing the indicated pattern subgroup.  Group
1020   *         0 always refers to the entire match.  If a group was never
1021   *         matched, it returns null.  This is not to be confused with
1022   *         a group matching the null string, which will return a String
1023   *         of length 0.
1024   */                       
1025  public synchronized String group(int group) {
1026    return __lastMatch.group(group);
1027  }
1028
1029  /**
1030   * Returns the begin offset of the subgroup of the last match found 
1031   * relative to the beginning of the match.
1032   * <p>
1033   * @param group The pattern subgroup.
1034   * @return The offset into group 0 of the first token in the indicated
1035   *         pattern subgroup.  If a group was never matched or does
1036   *         not exist, returns -1.  Be aware that a group that matches
1037   *         the null string at the end of a match will have an offset
1038   *         equal to the length of the string, so you shouldn't blindly
1039   *         use the offset to index an array or String.
1040   */                                                                 
1041  public synchronized int begin(int group) {
1042    return __lastMatch.begin(group);
1043  }
1044
1045
1046  /**
1047   * Returns the end offset of the subgroup of the last match found 
1048   * relative to the beginning of the match.
1049   * <p>
1050   * @param group The pattern subgroup.
1051   * @return Returns one plus the offset into group 0 of the last token in
1052   *         the indicated pattern subgroup.  If a group was never matched
1053   *         or does not exist, returns -1.  A group matching the null
1054   *         string will return its start offset.
1055   */
1056  public synchronized int end(int group) {
1057    return __lastMatch.end(group);
1058  }
1059
1060
1061  /**
1062   * Returns an offset marking the beginning of the last pattern match
1063   * found relative to the beginning of the input from which the match
1064   * was extracted.
1065   * <p>
1066   * @param group The pattern subgroup.
1067   * @return The offset of the first token in the indicated
1068   *         pattern subgroup.  If a group was never matched or does
1069   *         not exist, returns -1.          
1070   */
1071  public synchronized int beginOffset(int group) {
1072    return __lastMatch.beginOffset(group);
1073  }
1074
1075  /**
1076   * Returns an offset marking the end of the last pattern match found
1077   * relative to the beginning of the input from which the match was
1078   * extracted.
1079   * <p>
1080   * @param group The pattern subgroup.
1081   * @return Returns one plus the offset of the last token in
1082   *         the indicated pattern subgroup.  If a group was never matched
1083   *         or does not exist, returns -1.  A group matching the null
1084   *         string will return its start offset.
1085   */                   
1086  public synchronized int endOffset(int group) {
1087    return __lastMatch.endOffset(group);
1088  }
1089
1090  /**
1091   * Returns the same as group(0).
1092   * <p>
1093   * @return A string containing the entire match.
1094   */  
1095  public synchronized String toString() {
1096    if(__lastMatch == null)
1097      return null;
1098    return __lastMatch.toString();
1099  }
1100
1101
1102  /**
1103   * Returns the part of the input preceding the last match found.
1104   * <p>
1105   * @return The part of the input preceding the last match found.
1106   */
1107  public synchronized String preMatch() {
1108    int begin;
1109
1110    if(__originalInput == null)
1111      return __nullString;
1112
1113    begin = __lastMatch.beginOffset(0);
1114
1115    if(begin <= 0)
1116      return __nullString;
1117
1118    if(__originalInput instanceof char[]) {
1119      char[] input;
1120
1121      input = (char[])__originalInput;
1122
1123      // Just in case we make sure begin offset is in bounds.  It should
1124      // be but we're paranoid.
1125      if(begin > input.length)
1126  begin = input.length;
1127
1128      return new String(input, __inputBeginOffset, begin);
1129    } else if(__originalInput instanceof String) {
1130      String input;
1131
1132      input = (String)__originalInput;
1133
1134      // Just in case we make sure begin offset is in bounds.  It should
1135      // be but we're paranoid.
1136      if(begin > input.length())
1137  begin = input.length();
1138
1139      return input.substring(__inputBeginOffset, begin);
1140    }
1141
1142    return __nullString;
1143  }
1144
1145
1146  /**
1147   * Returns the part of the input following the last match found.
1148   * <p>
1149   * @return The part of the input following the last match found.
1150   */
1151  public synchronized String postMatch() {
1152    int end;
1153
1154    if(__originalInput == null)
1155      return __nullString;
1156
1157    end = __lastMatch.endOffset(0);
1158
1159    if(end < 0)
1160      return __nullString;
1161
1162    if(__originalInput instanceof char[]) {
1163      char[] input;
1164
1165      input = (char[])__originalInput;
1166      // Just in case we make sure begin offset is in bounds.  It should
1167      // be but we're paranoid.
1168      if(end >= input.length)
1169  return __nullString;
1170
1171      return new String(input, end, __inputEndOffset - end);
1172    } else if(__originalInput instanceof String) {
1173      String input;
1174
1175      input = (String)__originalInput;
1176
1177      // Just in case we make sure begin offset is in bounds.  It should
1178      // be but we're paranoid.
1179      if(end >= input.length())
1180  return __nullString;
1181
1182      return input.substring(end, __inputEndOffset);
1183    }
1184
1185    return __nullString;
1186  }
1187
1188
1189  /**
1190   * Returns the part of the input preceding the last match found as a
1191   * char array.  This method eliminates the extra
1192   * buffer copying caused by preMatch().toCharArray().
1193   * <p>
1194   * @return The part of the input preceding the last match found as a char[].
1195   *         If the result is of zero length, returns null instead of a zero
1196   *         length array.
1197   */
1198  public synchronized char[] preMatchCharArray() {
1199    int begin;
1200    char[] result = null;
1201
1202    if(__originalInput == null)
1203      return null;
1204
1205    begin = __lastMatch.beginOffset(0);
1206
1207    if(begin <= 0)
1208      return null;
1209
1210    if(__originalInput instanceof char[]) {
1211      char[] input;
1212
1213      input = (char[])__originalInput;
1214
1215      // Just in case we make sure begin offset is in bounds.  It should
1216      // be but we're paranoid.
1217      if(begin >= input.length)
1218  begin = input.length;
1219
1220      result = new char[begin - __inputBeginOffset];
1221      System.arraycopy(input, __inputBeginOffset, result, 0, result.length);
1222    } else if(__originalInput instanceof String) {
1223      String input;
1224
1225      input = (String)__originalInput;
1226
1227      // Just in case we make sure begin offset is in bounds.  It should
1228      // be but we're paranoid.
1229      if(begin >= input.length())
1230  begin = input.length();
1231
1232      result = new char[begin - __inputBeginOffset];
1233      input.getChars(__inputBeginOffset, begin, result, 0);
1234    }
1235
1236    return result;
1237  }
1238
1239
1240  /**
1241   * Returns the part of the input following the last match found as a char
1242   * array.  This method eliminates the extra buffer copying caused by
1243   * postMatch().toCharArray().
1244   * <p>
1245   * @return The part of the input following the last match found as a char[].
1246   *         If the result is of zero length, returns null instead of a zero
1247   *         length array.
1248   */
1249  public synchronized char[] postMatchCharArray() {
1250    int end;
1251    char[] result = null;
1252
1253    if(__originalInput == null)
1254      return null;
1255
1256    end = __lastMatch.endOffset(0);
1257
1258    if(end < 0)
1259      return null;
1260
1261    if(__originalInput instanceof char[]) {
1262      int length;
1263      char[] input;
1264
1265      input = (char[])__originalInput;
1266      // Just in case we make sure begin offset is in bounds.  It should
1267      // be but we're paranoid.
1268      if(end >= input.length)
1269  return null;
1270
1271      length = __inputEndOffset - end;
1272      result = new char[length];
1273      System.arraycopy(input, end, result, 0, length);
1274    } else if(__originalInput instanceof String) {
1275      String input;
1276
1277      input = (String)__originalInput;
1278
1279      // Just in case we make sure begin offset is in bounds.  It should
1280      // be but we're paranoid.
1281      if(end >= __inputEndOffset)
1282  return null;
1283
1284      result = new char[__inputEndOffset - end];
1285      input.getChars(end, __inputEndOffset, result, 0);
1286    }
1287
1288    return result;
1289  }
1290
1291}
1292