Source code: org/apache/oro/text/perl/Perl5Util.java
1 /*
2 * $Id: Perl5Util.java,v 1.19 2003/11/07 20:16:25 dfs Exp $
3 *
4 * ====================================================================
5 * The Apache Software License, Version 1.1
6 *
7 * Copyright (c) 2000 The Apache Software Foundation. All rights
8 * reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in
19 * the documentation and/or other materials provided with the
20 * distribution.
21 *
22 * 3. The end-user documentation included with the redistribution,
23 * if any, must include the following acknowledgment:
24 * "This product includes software developed by the
25 * Apache Software Foundation (http://www.apache.org/)."
26 * Alternately, this acknowledgment may appear in the software itself,
27 * if and wherever such third-party acknowledgments normally appear.
28 *
29 * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
30 * must not be used to endorse or promote products derived from this
31 * software without prior written permission. For written
32 * permission, please contact apache@apache.org.
33 *
34 * 5. Products derived from this software may not be called "Apache"
35 * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
36 * name, without prior written permission of the Apache Software Foundation.
37 *
38 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49 * SUCH DAMAGE.
50 * ====================================================================
51 *
52 * This software consists of voluntary contributions made by many
53 * individuals on behalf of the Apache Software Foundation. For more
54 * information on the Apache Software Foundation, please see
55 * <http://www.apache.org/>.
56 */
57
58
59 package org.apache.oro.text.perl;
60
61 import java.util.*;
62
63 import org.apache.oro.text.*;
64 import org.apache.oro.text.regex.*;
65 import org.apache.oro.util.*;
66
67 /**
68 * This is a utility class implementing the 3 most common Perl5 operations
69 * involving regular expressions:
70 * <ul>
71 * <li> [m]/pattern/[i][m][s][x],
72 * <li> s/pattern/replacement/[g][i][m][o][s][x],
73 * <li> and split().
74 * </ul>
75 * As with Perl, any non-alphanumeric character can be used in lieu of
76 * the slashes.
77 * <p>
78 * The objective of the class is to minimize the amount of code a Java
79 * programmer using Jakarta-ORO
80 * has to write to achieve the same results as Perl by
81 * transparently handling regular expression compilation, caching, and
82 * matching. A second objective is to use the same Perl pattern matching
83 * syntax to ease the task of Perl programmers transitioning to Java
84 * (this also reduces the number of parameters to a method).
85 * All the state affecting methods are synchronized to avoid
86 * the maintenance of explicit locks in multithreaded programs. This
87 * philosophy differs from the
88 * {@link org.apache.oro.text.regex} package, where
89 * you are expected to either maintain explicit locks, or more preferably
90 * create separate compiler and matcher instances for each thread.
91 * <p>
92 * To use this class, first create an instance using the default constructor
93 * or initialize the instance with a PatternCache of your choosing using
94 * the alternate constructor. The default cache used by Perl5Util is a
95 * PatternCacheLRU of capacity GenericPatternCache.DEFAULT_CAPACITY. You may
96 * want to create a cache with a different capacity, a different
97 * cache replacement policy, or even devise your own PatternCache
98 * implementation. The PatternCacheLRU is probably the best general purpose
99 * pattern cache, but your specific application may be better served by
100 * a different cache replacement policy. You should remember that you can
101 * front-load a cache with all the patterns you will be using before
102 * initializing a Perl5Util instance, or you can just let Perl5Util
103 * fill the cache as you use it.
104 * <p>
105 * You might use the class as follows:
106 * <pre>
107 * Perl5Util util = new Perl5Util();
108 * String line;
109 * DataInputStream input;
110 * PrintStream output;
111 *
112 * // Initialization of input and output omitted
113 * while((line = input.readLine()) != null) {
114 * // First find the line with the string we want to substitute because
115 * // it is cheaper than blindly substituting each line.
116 * if(util.match("/HREF=\"description1.html\"/", line)) {
117 * line = util.substitute("s/description1\\.html/about1.html/", line);
118 * }
119 * output.println(line);
120 * }
121 * </pre>
122 * <p>
123 * A couple of things to remember when using this class are that the
124 * {@link #match match()} methods have the same meaning as
125 * {@link org.apache.oro.text.regex.Perl5Matcher#contains
126 * Perl5Matcher.contains()}
127 * and <code>=~ m/pattern/</code> in Perl. The methods are named match
128 * to more closely associate them with Perl and to differentiate them
129 * from {@link org.apache.oro.text.regex.Perl5Matcher#matches
130 * Perl5Matcher.matches()}.
131 * A further thing to keep in mind is that the
132 * {@link MalformedPerl5PatternException} class is derived from
133 * RuntimeException which means you DON'T have to catch it. The reasoning
134 * behind this is that you will detect your regular expression mistakes
135 * as you write and debug your program when a MalformedPerl5PatternException
136 * is thrown during a test run. However, we STRONGLY recommend that you
137 * ALWAYS catch MalformedPerl5PatternException whenever you deal with a
138 * DYNAMICALLY created pattern. Relying on a fatal
139 * MalformedPerl5PatternException being thrown to detect errors while
140 * debugging is only useful for dealing with static patterns, that is, actual
141 * pregenerated strings present in your program. Patterns created from user
142 * input or some other dynamic method CANNOT be relied upon to be correct
143 * and MUST be handled by catching MalformedPerl5PatternException for your
144 * programs to be robust.
145 * <p>
146 * Finally, as a convenience Perl5Util implements
147 * the {@link org.apache.oro.text.regex.MatchResult MatchResult} interface.
148 * The methods are merely wrappers which call the corresponding method of
149 * the last {@link org.apache.oro.text.regex.MatchResult MatchResult}
150 * found (which can be accessed with {@link #getMatch()}) by a match or
151 * substitution (or even a split, but this isn't particularly useful).
152 * At the moment, the
153 * {@link org.apache.oro.text.regex.MatchResult MatchResult} returned
154 * by {@link #getMatch()} is not stored in a thread-local variable. Therefore
155 * concurrent calls to {@link #getMatch()} will produce unpredictable
156 * results. So if your concurrent program requires the match results,
157 * you must protect the matching and the result retrieval in a critical
158 * section. If you do not need match results, you don't need to do anything
159 * special. If you feel the J2SE implementation of {@link #getMatch()}
160 * should use a thread-local variable and obviate the need for a critical
161 * section, please express your views on the oro-dev mailing list.
162 *
163 * @version @version@
164 * @since 1.0
165 * @see MalformedPerl5PatternException
166 * @see org.apache.oro.text.PatternCache
167 * @see org.apache.oro.text.PatternCacheLRU
168 * @see org.apache.oro.text.regex.MatchResult
169 */
170 public final class Perl5Util implements MatchResult {
171 /** The regular expression to use to parse match expression. */
172 private static final String __matchExpression = "m?(\\W)(.*)\\1([imsx]*)";
173
174 /** The pattern cache to compile and store patterns */
175 private PatternCache __patternCache;
176 /** The hashtable to cache higher-level expressions */
177 private Cache __expressionCache;
178 /** The pattern matcher to perform matching operations. */
179 private Perl5Matcher __matcher;
180 /** The compiled match expression parsing regular expression. */
181 private Pattern __matchPattern;
182 /** The last match from a successful call to a matching method. */
183 private MatchResult __lastMatch;
184 /**
185 * A container for temporarily holding the results of a split before
186 * deleting trailing empty fields.
187 */
188 private ArrayList __splitList;
189
190 /**
191 * Keeps track of the original input (for postMatch() and preMatch())
192 * methods. This will be discarded if the preMatch() and postMatch()
193 * methods are moved into the MatchResult interface.
194 */
195 private Object __originalInput;
196
197 /**
198 * Keeps track of the begin and end offsets of the original input for
199 * the postMatch() and preMatch() methods.
200 */
201 private int __inputBeginOffset, __inputEndOffset;
202
203 /** Used for default return value of post and pre Match() */
204 private static final String __nullString = "";
205
206 /**
207 * A constant passed to the {@link #split split()} methods indicating
208 * that all occurrences of a pattern should be used to split a string.
209 */
210 public static final int SPLIT_ALL = Util.SPLIT_ALL;
211
212 /**
213 * A secondary constructor for Perl5Util. It initializes the Perl5Matcher
214 * used by the class to perform matching operations, but requires the
215 * programmer to provide a PatternCache instance for the class
216 * to use to compile and store regular expressions. You would want to
217 * use this constructor if you want to change the capacity or policy
218 * of the cache used. Example uses might be:
219 * <pre>
220 * // We know we're going to use close to 50 expressions a whole lot, so
221 * // we create a cache of the proper size.
222 * util = new Perl5Util(new PatternCacheLRU(50));
223 * </pre>
224 * or
225 * <pre>
226 * // We're only going to use a few expressions and know that second-chance
227 * // fifo is best suited to the order in which we are using the patterns.
228 * util = new Perl5Util(new PatternCacheFIFO2(10));
229 * </pre>
230 */
public Perl5Util(PatternCache cache) {
  // Keep the caller's pattern cache and give the expression cache the
  // same capacity.
  __patternCache = cache;
  __expressionCache = new CacheLRU(cache.capacity());

  // Working objects used by the matching and splitting methods.
  __matcher = new Perl5Matcher();
  __splitList = new ArrayList();

  // Precompile the pattern that parses match expressions.
  __compilePatterns();
}
238
239 /**
240 * Default constructor for Perl5Util. This initializes the Perl5Matcher
241 * used by the class to perform matching operations and creates a
242 * default PatternCacheLRU instance to use to compile and cache regular
243 * expressions. The size of this cache is
244 * GenericPatternCache.DEFAULT_CAPACITY.
245 */
public Perl5Util() {
  // Delegate to the cache-accepting constructor with a default
  // PatternCacheLRU.
  this(new PatternCacheLRU());
}
249
250 /**
251 * Compiles the patterns (currently only the match expression) used to
252 * parse Perl5 expressions. Right now it initializes __matchPattern.
253 */
254 private void __compilePatterns() {
255 Perl5Compiler compiler = new Perl5Compiler();
256
257 try {
258 __matchPattern =
259 compiler.compile(__matchExpression, Perl5Compiler.SINGLELINE_MASK);
260 } catch(MalformedPatternException e) {
261 // This should only happen during debugging.
262 //e.printStackTrace();
263 throw new RuntimeException(e.getMessage());
264 }
265 }
266
267 /**
268 * Parses a match expression and returns a compiled pattern.
269 * First checks the expression cache and if the pattern is not found,
270 * then parses the expression and fetches a compiled pattern from the
271 * pattern cache. Otherwise, just uses the pattern found in the
272 * expression cache. __matchPattern is used to parse the expression.
273 * <p>
274 * @param pattern The Perl5 match expression to parse.
275 * @exception MalformedPerl5PatternException If there is an error parsing
276 * the expression.
277 */
278 private Pattern __parseMatchExpression(String pattern)
279 throws MalformedPerl5PatternException
280 {
281 int index, compileOptions;
282 String options, regex;
283 MatchResult result;
284 Object obj;
285 Pattern ret;
286
287 obj = __expressionCache.getElement(pattern);
288
289 // Must catch ClassCastException because someone might incorrectly
290 // pass an s/// expression. try block is cheaper than checking
291 // instanceof
292 try {
293 if(obj != null)
294 return (Pattern)obj;
295 } catch(ClassCastException e) {
296 // Fall through and parse expression
297 }
298
299 if(!__matcher.matches(pattern, __matchPattern))
300 throw new
301 MalformedPerl5PatternException("Invalid expression: " +
302 pattern);
303
304 result = __matcher.getMatch();
305
306 regex = result.group(2);
307 compileOptions = Perl5Compiler.DEFAULT_MASK;
308
309 options = result.group(3);
310
311 if(options != null) {
312 index = options.length();
313
314 while(index-- > 0) {
315 switch(options.charAt(index)) {
316 case 'i' :
317 compileOptions |= Perl5Compiler.CASE_INSENSITIVE_MASK;
318 break;
319 case 'm' : compileOptions |= Perl5Compiler.MULTILINE_MASK; break;
320 case 's' : compileOptions |= Perl5Compiler.SINGLELINE_MASK; break;
321 case 'x' : compileOptions |= Perl5Compiler.EXTENDED_MASK; break;
322 default :
323 throw new
324 MalformedPerl5PatternException("Invalid options: " + options);
325 }
326 }
327 }
328
329 ret = __patternCache.getPattern(regex, compileOptions);
330 __expressionCache.addElement(pattern, ret);
331
332 return ret;
333 }
334
335 /**
336 * Searches for the first pattern match somewhere in a character array
337 * taking a pattern specified in Perl5 native format:
338 * <blockquote><pre>
339 * [m]/pattern/[i][m][s][x]
340 * </pre></blockquote>
341 * The <code>m</code> prefix is optional and the meaning of the optional
342 * trailing options are:
343 * <dl compact>
344 * <dt> i <dd> case insensitive match
345 * <dt> m <dd> treat the input as consisting of multiple lines
346 * <dt> s <dd> treat the input as consisting of a single line
347 * <dt> x <dd> enable extended expression syntax incorporating whitespace
348 * and comments
349 * </dl>
350 * As with Perl, any non-alphanumeric character can be used in lieu of
351 * the slashes.
352 * <p>
353 * If the input contains the pattern, the org.apache.oro.text.regex.MatchResult
354 * can be obtained by calling {@link #getMatch()}.
355 * However, Perl5Util implements the MatchResult interface as a wrapper
356 * around the last MatchResult found, so you can call its methods to
357 * access match information.
358 * <p>
359 * @param pattern The pattern to search for.
360 * @param input The char[] input to search.
361 * @return True if the input contains the pattern, false otherwise.
362 * @exception MalformedPerl5PatternException If there is an error in
363 * the pattern. You are not forced to catch this exception
364 * because it is derived from RuntimeException.
365 */
366 public synchronized boolean match(String pattern, char[] input)
367 throws MalformedPerl5PatternException
368 {
369 boolean result;
370 __parseMatchExpression(pattern);
371
372 result = __matcher.contains(input, __parseMatchExpression(pattern));
373
374 if(result) {
375 __lastMatch = __matcher.getMatch();
376 __originalInput = input;
377 __inputBeginOffset = 0;
378 __inputEndOffset = input.length;
379 }
380
381 return result;
382 }
383
384
385 /**
386 * Searches for the first pattern match in a String taking
387 * a pattern specified in Perl5 native format:
388 * <blockquote><pre>
389 * [m]/pattern/[i][m][s][x]
390 * </pre></blockquote>
391 * The <code>m</code> prefix is optional and the meaning of the optional
392 * trailing options are:
393 * <dl compact>
394 * <dt> i <dd> case insensitive match
395 * <dt> m <dd> treat the input as consisting of multiple lines
396 * <dt> s <dd> treat the input as consisting of a single line
397 * <dt> x <dd> enable extended expression syntax incorporating whitespace
398 * and comments
399 * </dl>
400 * As with Perl, any non-alphanumeric character can be used in lieu of
401 * the slashes.
402 * <p>
403 * If the input contains the pattern, the
404 * {@link org.apache.oro.text.regex.MatchResult MatchResult}
405 * can be obtained by calling {@link #getMatch()}.
406 * However, Perl5Util implements the MatchResult interface as a wrapper
407 * around the last MatchResult found, so you can call its methods to
408 * access match information.
409 * <p>
410 * @param pattern The pattern to search for.
411 * @param input The String input to search.
412 * @return True if the input contains the pattern, false otherwise.
413 * @exception MalformedPerl5PatternException If there is an error in
414 * the pattern. You are not forced to catch this exception
415 * because it is derived from RuntimeException.
416 */
417 public synchronized boolean match(String pattern, String input)
418 throws MalformedPerl5PatternException
419 {
420 return match(pattern, input.toCharArray());
421 }
422
423
424 /**
425 * Searches for the next pattern match somewhere in a
426 * org.apache.oro.text.regex.PatternMatcherInput instance, taking
427 * a pattern specified in Perl5 native format:
428 * <blockquote><pre>
429 * [m]/pattern/[i][m][s][x]
430 * </pre></blockquote>
431 * The <code>m</code> prefix is optional and the meaning of the optional
432 * trailing options are:
433 * <dl compact>
434 * <dt> i <dd> case insensitive match
435 * <dt> m <dd> treat the input as consisting of multiple lines
436 * <dt> s <dd> treat the input as consisting of a single line
437 * <dt> x <dd> enable extended expression syntax incorporating whitespace
438 * and comments
439 * </dl>
440 * As with Perl, any non-alphanumeric character can be used in lieu of
441 * the slashes.
442 * <p>
443 * If the input contains the pattern, the
444 * {@link org.apache.oro.text.regex.MatchResult MatchResult}
445 * can be obtained by calling {@link #getMatch()}.
446 * However, Perl5Util implements the MatchResult interface as a wrapper
447 * around the last MatchResult found, so you can call its methods to
448 * access match information.
449 * After the call to this method, the PatternMatcherInput current offset
450 * is advanced to the end of the match, so you can use it to repeatedly
451 * search for expressions in the entire input using a while loop as
452 * explained in the {@link org.apache.oro.text.regex.PatternMatcherInput
453 * PatternMatcherInput} documentation.
454 * <p>
455 * @param pattern The pattern to search for.
456 * @param input The PatternMatcherInput to search.
457 * @return True if the input contains the pattern, false otherwise.
458 * @exception MalformedPerl5PatternException If there is an error in
459 * the pattern. You are not forced to catch this exception
460 * because it is derived from RuntimeException.
461 */
462 public synchronized boolean match(String pattern, PatternMatcherInput input)
463 throws MalformedPerl5PatternException
464 {
465 boolean result;
466
467 result = __matcher.contains(input, __parseMatchExpression(pattern));
468
469 if(result) {
470 __lastMatch = __matcher.getMatch();
471 __originalInput = input.getInput();
472 __inputBeginOffset = input.getBeginOffset();
473 __inputEndOffset = input.getEndOffset();
474 }
475
476 return result;
477 }
478
479
480 /**
481 * Returns the last match found by a call to a match(), substitute(), or
482 * split() method. This method is only intended to retrieve the match
483 * found by the most recent call to a match() method. This method should
484 * be used when you want to save MatchResult instances. Otherwise, for
485 * simply accessing match information, it is more convenient to use the
486 * Perl5Util methods implementing the MatchResult interface.
487 * <p>
488 * @return The org.apache.oro.text.regex.MatchResult instance containing the
489 * last match found.
490 */
491 public synchronized MatchResult getMatch() {
492 return __lastMatch;
493 }
494
495
496 /**
497 * Substitutes a pattern in a given input with a replacement string.
498 * The substitution expression is specified in Perl5 native format:
499 * <blockquote><pre>
500 * s/pattern/replacement/[g][i][m][o][s][x]
501 * </pre></blockquote>
502 * The <code>s</code> prefix is mandatory and the meaning of the optional
503 * trailing options are:
504 * <dl compact>
505 * <dt> g <dd> Substitute all occurrences of pattern with replacement.
506 * The default is to replace only the first occurrence.
507 * <dt> i <dd> perform a case insensitive match
508 * <dt> m <dd> treat the input as consisting of multiple lines
509 * <dt> o <dd> If variable interpolation is used, only evaluate the
510 * interpolation once (the first time). This is equivalent
511 * to using a numInterpolations argument of 1 in
512 * {@link org.apache.oro.text.regex.Util#substitute Util.substitute()}.
513 * The default is to compute each interpolation independently.
514 * See
515 * {@link org.apache.oro.text.regex.Util#substitute Util.substitute()}
516 * and {@link org.apache.oro.text.regex.Perl5Substitution Perl5Substitution}
517 * for more details on variable interpolation in
518 * substitutions.
519 * <dt> s <dd> treat the input as consisting of a single line
520 * <dt> x <dd> enable extended expression syntax incorporating whitespace
521 * and comments
522 * </dl>
523 * As with Perl, any non-alphanumeric character can be used in lieu of
524 * the slashes. This is helpful to avoid backslashing. For example,
525 * using slashes you would have to do:
526 * <blockquote><pre>
527 * numSubs = util.substitute(result, "s/foo\\/bar/goo\\/\\/baz/", input);
528 * </pre></blockquote>
529 * when you could more easily write:
530 * <blockquote><pre>
531 * numSubs = util.substitute(result, "s#foo/bar#goo//baz#", input);
532 * </pre></blockquote>
533 * where the hashmarks are used instead of slashes.
534 * <p>
535 * There is a special case of backslashing that you need to pay attention
536 * to. As demonstrated above, to denote a delimiter in the substituted
537 * string it must be backslashed. However, this can be a problem
538 * when you want to denote a backslash at the end of the substituted
539 * string. As of PerlTools 1.3, a new means of handling this
540 * situation has been implemented.
541 * In previous versions, the behavior was that
542 * <blockquote>
543 * "... a double backslash (quadrupled in the Java String) always
544 * represents two backslashes unless the second backslash is followed
545 * by the delimiter, in which case it represents a single backslash."
546 * </blockquote>
547 * <p>
548 * The new behavior is that a backslash is always a backslash
549 * in the substitution portion of the expression unless it is used to
550 * escape a delimiter. A backslash is considered to escape a delimiter
551 * if an even number of contiguous backslashes precede the backslash
552 * and the delimiter following the backslash is not the FINAL delimiter
553 * in the expression. Therefore, backslashes preceding final delimiters
554 * are never considered to escape the delimiter. The following, which
555 * used to be an invalid expression and require a special-case extra
556 * backslash, will now replace all instances of / with \:
557 * <blockquote><pre>
558 * numSubs = util.substitute(result, "s#/#\\#g", input);
559 * </pre></blockquote>
560 * <p>
561 * @param result The StringBuffer in which to store the result of the
562 * substitutions. The buffer is only appended to.
563 * @param expression The Perl5 substitution regular expression.
564 * @param input The input on which to perform substitutions.
565 * @return The number of substitutions made.
566 * @exception MalformedPerl5PatternException If there is an error in
567 * the expression. You are not forced to catch this exception
568 * because it is derived from RuntimeException.
569 * @since 2.0.6
570 */
571 // Expression parsing will have to be moved into a separate method if
572 // there are going to be variations of this method.
573 public synchronized int substitute(StringBuffer result, String expression,
574 String input)
575 throws MalformedPerl5PatternException
576 {
577 boolean backslash, finalDelimiter;
578 int index, compileOptions, numSubstitutions, numInterpolations;
579 int firstOffset, secondOffset, thirdOffset, subCount;
580 StringBuffer replacement;
581 Pattern compiledPattern;
582 char exp[], delimiter;
583 ParsedSubstitutionEntry entry;
584 Perl5Substitution substitution;
585 Object obj;
586
587 obj = __expressionCache.getElement(expression);
588
589 __nullTest:
590 if(obj != null) {
591 // Must catch ClassCastException because someone might incorrectly
592 // pass an m// expression. try block is cheaper than checking
593 // instanceof. We want to go ahead with parsing just in case so
594 // we break.
595 try {
596 entry = (ParsedSubstitutionEntry)obj;
597 } catch(ClassCastException e) {
598 break __nullTest;
599 }
600
601
602 subCount =
603 Util.substitute(result, __matcher, entry._pattern, entry._substitution,
604 input, entry._numSubstitutions);
605
606 __lastMatch = __matcher.getMatch();
607
608 return subCount;
609 }
610
611 exp = expression.toCharArray();
612
613 // Make sure basic conditions for a valid substitution expression hold.
614 if(exp.length < 4 || exp[0] != 's' || Character.isLetterOrDigit(exp[1])
615 || exp[1] == '-')
616 throw new
617 MalformedPerl5PatternException("Invalid expression: " + expression);
618 delimiter = exp[1];
619 firstOffset = 2;
620 secondOffset = thirdOffset = -1;
621 backslash = false;
622
623 // Parse pattern
624 for(index = firstOffset; index < exp.length; index++) {
625 if(exp[index] == '\\')
626 backslash = !backslash;
627 else if(exp[index] == delimiter && !backslash) {
628 secondOffset = index;
629 break;
630 } else if(backslash)
631 backslash = !backslash;
632 }
633
634 if(secondOffset == -1 || secondOffset == exp.length - 1)
635 throw new
636 MalformedPerl5PatternException("Invalid expression: " + expression);
637
638 // Parse replacement string
639
640 backslash = false;
641 finalDelimiter = true;
642 replacement = new StringBuffer(exp.length - secondOffset);
643 for(index = secondOffset + 1; index < exp.length; index++) {
644 if(exp[index] == '\\') {
645 backslash = !backslash;
646
647 // 05/05/99 dfs
648 // We unbackslash backslashed delimiters in the replacement string
649 // only if we're on an odd backslash and there is another occurrence
650 // of a delimiter later in the string.
651 if(backslash && index + 1 < exp.length && exp[index + 1] == delimiter
652 && expression.lastIndexOf(delimiter, exp.length - 1) != (index + 1))
653 {
654 finalDelimiter = false;
655 continue;
656 }
657 } else if(exp[index] == delimiter && finalDelimiter) {
658 thirdOffset = index;
659 break;
660 } else {
661 backslash = false;
662 finalDelimiter = true;
663 }
664
665 replacement.append(exp[index]);
666 }
667
668 if(thirdOffset == -1)
669 throw new
670 MalformedPerl5PatternException("Invalid expression: " + expression);
671
672 compileOptions = Perl5Compiler.DEFAULT_MASK;
673 numSubstitutions = 1;
674
675 // Single quotes cause no interpolations to be performed in replacement
676 if(delimiter != '\'')
677 numInterpolations = Perl5Substitution.INTERPOLATE_ALL;
678 else
679 numInterpolations = Perl5Substitution.INTERPOLATE_NONE;
680
681 // Parse options
682 for(index = thirdOffset + 1; index < exp.length; index++) {
683 switch(exp[index]) {
684 case 'i' :
685 compileOptions |= Perl5Compiler.CASE_INSENSITIVE_MASK;
686 break;
687 case 'm' : compileOptions |= Perl5Compiler.MULTILINE_MASK; break;
688 case 's' : compileOptions |= Perl5Compiler.SINGLELINE_MASK; break;
689 case 'x' : compileOptions |= Perl5Compiler.EXTENDED_MASK; break;
690 case 'g' : numSubstitutions = Util.SUBSTITUTE_ALL; break;
691 case 'o' : numInterpolations = 1; break;
692 default :
693 throw new
694 MalformedPerl5PatternException("Invalid option: " + exp[index]);
695 }
696 }
697
698 compiledPattern =
699 __patternCache.getPattern(new String(exp, firstOffset,
700 secondOffset - firstOffset),
701 compileOptions);
702 substitution =
703 new Perl5Substitution(replacement.toString(), numInterpolations);
704 entry = new ParsedSubstitutionEntry(compiledPattern, substitution,
705 numSubstitutions);
706 __expressionCache.addElement(expression, entry);
707
708 subCount =
709 Util.substitute(result, __matcher, compiledPattern, substitution,
710 input, numSubstitutions);
711
712 __lastMatch = __matcher.getMatch();
713
714 return subCount;
715 }
716
717 /**
718 * Substitutes a pattern in a given input with a replacement string.
719 * The substitution expression is specified in Perl5 native format.
720 * <dl compact>
721 * <dt>Calling this method is the same as:</dt>
722 * <dd>
723 * <blockquote><pre>
724 * String result;
725 * StringBuffer buffer = new StringBuffer();
726 * perl.substitute(buffer, expression, input);
727 * result = buffer.toString();
728 * </pre></blockquote>
729 * </dd>
730 * </dl>
731 * @param expression The Perl5 substitution regular expression.
732 * @param input The input on which to perform substitutions.
733 * @return The input as a String after substitutions have been performed.
734 * @exception MalformedPerl5PatternException If there is an error in
735 * the expression. You are not forced to catch this exception
736 * because it is derived from RuntimeException.
737 * @since 1.0
738 * @see #substitute
739 */
740 public synchronized String substitute(String expression, String input)
741 throws MalformedPerl5PatternException
742 {
743 StringBuffer result = new StringBuffer();
744 substitute(result, expression, input);
745 return result.toString();
746 }
747
748 /**
749 * Splits a String into strings that are appended to a List, but no more
750 * than a specified limit. The String is split using a regular expression
751 * as the delimiter. The regular expression is a pattern specified
752 * in Perl5 native format:
753 * <blockquote><pre>
754 * [m]/pattern/[i][m][s][x]
755 * </pre></blockquote>
756 * The <code>m</code> prefix is optional and the meaning of the optional
757 * trailing options are:
758 * <dl compact>
759 * <dt> i <dd> case insensitive match
760 * <dt> m <dd> treat the input as consisting of multiple lines
761 * <dt> s <dd> treat the input as consisting of a single line
762 * <dt> x <dd> enable extended expression syntax incorporating whitespace
763 * and comments
764 * </dl>
765 * As with Perl, any non-alphanumeric character can be used in lieu of
766 * the slashes.
767 * <p>
768 * The limit parameter causes the string to be split on at most the first
769 * <b>limit - 1</b> number of pattern occurrences.
770 * <p>
771 * Of special note is that this split method performs EXACTLY the same
772 * as the Perl split() function. In other words, if the split pattern
773 * contains parentheses, additional list elements are created from
774 * each of the matching subgroups in the pattern. Using an example
775 * similar to the one from the Camel book:
776 * <blockquote><pre>
777 * split(list, "/([,-])/", "8-12,15,18")
778 * </pre></blockquote>
779 * produces the list containing:
780 * <blockquote><pre>
781 * { "8", "-", "12", ",", "15", ",", "18" }
782 * </pre></blockquote>
783 * Furthermore, the following Perl behavior is observed: "leading empty
784 * fields are preserved, and empty trailing ones are deleted." This
785 * has the effect that a split on a zero length string returns an empty
786 * list.
787 * The {@link org.apache.oro.text.regex.Util#split Util.split()} method
788 * does NOT implement these behaviors because it is intended to
789 * be a general self-consistent and predictable split function usable
790 * with Pattern instances other than Perl5Pattern.
791 * <p>
792 * @param results
793 * A <code> Collection </code> to which the substrings of the input
794 * that occur between the regular expression delimiter occurences
795 * are appended. The input will not be split into any more substrings
796 * than the specified
797 * limit. A way of thinking of this is that only the first
798 * <b>limit - 1</b>
799 * matches of the delimiting regular expression will be used to split the
800 * input. The Collection must support the
801 * <code>addAll(Collection)</code> operation.
802 * @param pattern The regular expression to use as a split delimiter.
803 * @param input The String to split.
804 * @param limit The limit on the size of the returned <code>Vector</code>.
805 * Values <= 0 produce the same behavior as the SPLIT_ALL constant which
806 * causes the limit to be ignored and splits to be performed on all
807 * occurrences of the pattern. You should use the SPLIT_ALL constant
808 * to achieve this behavior instead of relying on the default behavior
809 * associated with non-positive limit values.
810 * @exception MalformedPerl5PatternException If there is an error in
811 * the expression. You are not forced to catch this exception
812 * because it is derived from RuntimeException.
813 */
814 public synchronized void split(Collection results, String pattern,
815 String input, int limit)
816 throws MalformedPerl5PatternException
817 {
818 int beginOffset, groups, index;
819 String group;
820 MatchResult currentResult = null;
821 PatternMatcherInput pinput;
822 Pattern compiledPattern;
823
824 compiledPattern = __parseMatchExpression(pattern);
825
826 pinput = new PatternMatcherInput(input);
827 beginOffset = 0;
828
829 while(--limit != 0 && __matcher.contains(pinput, compiledPattern)) {
830 currentResult = __matcher.getMatch();
831
832 __splitList.add(input.substring(beginOffset,
833 currentResult.beginOffset(0)));
834
835 if((groups = currentResult.groups()) > 1) {
836 for(index = 1; index < groups; ++index) {
837 group = currentResult.group(index);
838 if(group != null && group.length() > 0)
839 __splitList.add(group);
840 }
841 }
842
843 beginOffset = currentResult.endOffset(0);
844 }
845
846 __splitList.add(input.substring(beginOffset, input.length()));
847
848 // Remove all trailing empty fields.
849 for(int i = __splitList.size() - 1; i >= 0; --i) {
850 String str;
851
852 str = (String)__splitList.get(i);
853 if(str.length() == 0)
854 __splitList.remove(i);
855 else
856 break;
857 }
858
859 results.addAll(__splitList);
860 __splitList.clear();
861
862 // Just for the sake of completeness
863 __lastMatch = currentResult;
864 }
865
866 /**
867 * This method is identical to calling:
868 * <blockquote><pre>
869 * split(results, pattern, input, SPLIT_ALL);
870 * </pre></blockquote>
871 */
872 public synchronized void split(Collection results, String pattern,
873 String input)
874 throws MalformedPerl5PatternException
875 {
876 split(results, pattern, input, SPLIT_ALL);
877 }
878
879 /**
880 * Splits input in the default Perl manner, splitting on all whitespace.
881 * This method is identical to calling:
882 * <blockquote><pre>
883 * split(results, "/\\s+/", input);
884 * </pre></blockquote>
885 */
886 public synchronized void split(Collection results, String input)
887 throws MalformedPerl5PatternException
888 {
889 split(results, "/\\s+/", input);
890 }
891
892 /**
893 * Splits a String into strings contained in a Vector of size no greater
894 * than a specified limit. The String is split using a regular expression
895 * as the delimiter. The regular expression is a pattern specified
896 * in Perl5 native format:
897 * <blockquote><pre>
898 * [m]/pattern/[i][m][s][x]
899 * </pre></blockquote>
900 * The <code>m</code> prefix is optional and the meaning of the optional
901 * trailing options are:
902 * <dl compact>
903 * <dt> i <dd> case insensitive match
904 * <dt> m <dd> treat the input as consisting of multiple lines
905 * <dt> s <dd> treat the input as consisting of a single line
906 * <dt> x <dd> enable extended expression syntax incorporating whitespace
907 * and comments
908 * </dl>
909 * As with Perl, any non-alphanumeric character can be used in lieu of
910 * the slashes.
911 * <p>
912 * The limit parameter causes the string to be split on at most the first
913 * <b>limit - 1</b> number of pattern occurences.
914 * <p>
915 * Of special note is that this split method performs EXACTLY the same
916 * as the Perl split() function. In other words, if the split pattern
917 * contains parentheses, additional Vector elements are created from
918 * each of the matching subgroups in the pattern. Using an example
919 * similar to the one from the Camel book:
920 * <blockquote><pre>
921 * split("/([,-])/", "8-12,15,18")
922 * </pre></blockquote>
923 * produces the Vector containing:
924 * <blockquote><pre>
925 * { "8", "-", "12", ",", "15", ",", "18" }
926 * </pre></blockquote>
927 * The {@link org.apache.oro.text.regex.Util#split Util.split()} method
928 * does NOT implement this particular behavior because it is intended to
929 * be usable with Pattern instances other than Perl5Pattern.
930 * <p>
931 * @deprecated Use
932 * {@link #split(Collection results, String pattern, String input, int limit)}
933 * instead.
934 * @param pattern The regular expression to use as a split delimiter.
935 * @param input The String to split.
936 * @param limit The limit on the size of the returned <code>Vector</code>.
937 * Values <= 0 produce the same behavior as the SPLIT_ALL constant which
938 * causes the limit to be ignored and splits to be performed on all
939 * occurrences of the pattern. You should use the SPLIT_ALL constant
940 * to achieve this behavior instead of relying on the default behavior
941 * associated with non-positive limit values.
942 * @return A <code> Vector </code> containing the substrings of the input
943 * that occur between the regular expression delimiter occurences. The
944 * input will not be split into any more substrings than the specified
945 * limit. A way of thinking of this is that only the first
946 * <b>limit - 1</b>
947 * matches of the delimiting regular expression will be used to split the
948 * input.
949 * @exception MalformedPerl5PatternException If there is an error in
950 * the expression. You are not forced to catch this exception
951 * because it is derived from RuntimeException.
952 */
953 public synchronized Vector split(String pattern, String input, int limit)
954 throws MalformedPerl5PatternException
955 {
956 Vector results = new Vector(20);
957 split(results, pattern, input, limit);
958 return results;
959 }
960
961 /**
962 * This method is identical to calling:
963 * <blockquote><pre>
964 * split(pattern, input, SPLIT_ALL);
965 * </pre></blockquote>
966 * @deprecated Use
967 * {@link #split(Collection results, String pattern, String input)} instead.
968 */
969 public synchronized Vector split(String pattern, String input)
970 throws MalformedPerl5PatternException
971 {
972 return split(pattern, input, SPLIT_ALL);
973 }
974
975 /**
976 * Splits input in the default Perl manner, splitting on all whitespace.
977 * This method is identical to calling:
978 * <blockquote><pre>
979 * split("/\\s+/", input);
980 * </pre></blockquote>
981 * @deprecated Use
982 * {@link #split(Collection results, String input)} instead.
983 */
984 public synchronized Vector split(String input)
985 throws MalformedPerl5PatternException
986 {
987 return split("/\\s+/", input);
988 }
989
990 //
991 // MatchResult interface methods.
992 //
993
994 /**
995 * Returns the length of the last match found.
996 * <p>
997 * @return The length of the last match found.
998 */
999 public synchronized int length() {
1000 return __lastMatch.length();
1001 }
1002
1003 /**
1004 * @return The number of groups contained in the last match found.
1005 * This number includes the 0th group. In other words, the
1006 * result refers to the number of parenthesized subgroups plus
1007 * the entire match itself.
1008 */
1009 public synchronized int groups() {
1010 return __lastMatch.groups();
1011 }
1012
1013
1014 /**
1015 * Returns the contents of the parenthesized subgroups of the last match
1016 * found according to the behavior dictated by the MatchResult interface.
1017 * <p>
1018 * @param group The pattern subgroup to return.
1019 * @return A string containing the indicated pattern subgroup. Group
1020 * 0 always refers to the entire match. If a group was never
1021 * matched, it returns null. This is not to be confused with
1022 * a group matching the null string, which will return a String
1023 * of length 0.
1024 */
1025 public synchronized String group(int group) {
1026 return __lastMatch.group(group);
1027 }
1028
1029 /**
1030 * Returns the begin offset of the subgroup of the last match found
1031 * relative the beginning of the match.
1032 * <p>
1033 * @param group The pattern subgroup.
1034 * @return The offset into group 0 of the first token in the indicated
1035 * pattern subgroup. If a group was never matched or does
1036 * not exist, returns -1. Be aware that a group that matches
1037 * the null string at the end of a match will have an offset
1038 * equal to the length of the string, so you shouldn't blindly
1039 * use the offset to index an array or String.
1040 */
1041 public synchronized int begin(int group) {
1042 return __lastMatch.begin(group);
1043 }
1044
1045
1046 /**
1047 * Returns the end offset of the subgroup of the last match found
1048 * relative the beginning of the match.
1049 * <p>
1050 * @param group The pattern subgroup.
1051 * @return Returns one plus the offset into group 0 of the last token in
1052 * the indicated pattern subgroup. If a group was never matched
1053 * or does not exist, returns -1. A group matching the null
1054 * string will return its start offset.
1055 */
1056 public synchronized int end(int group) {
1057 return __lastMatch.end(group);
1058 }
1059
1060
1061 /**
1062 * Returns an offset marking the beginning of the last pattern match
1063 * found relative to the beginning of the input from which the match
1064 * was extracted.
1065 * <p>
1066 * @param group The pattern subgroup.
1067 * @return The offset of the first token in the indicated
1068 * pattern subgroup. If a group was never matched or does
1069 * not exist, returns -1.
1070 */
1071 public synchronized int beginOffset(int group) {
1072 return __lastMatch.beginOffset(group);
1073 }
1074
1075 /**
1076 * Returns an offset marking the end of the last pattern match found
1077 * relative to the beginning of the input from which the match was
1078 * extracted.
1079 * <p>
1080 * @param group The pattern subgroup.
1081 * @return Returns one plus the offset of the last token in
1082 * the indicated pattern subgroup. If a group was never matched
1083 * or does not exist, returns -1. A group matching the null
1084 * string will return its start offset.
1085 */
1086 public synchronized int endOffset(int group) {
1087 return __lastMatch.endOffset(group);
1088 }
1089
1090 /**
1091 * Returns the same as group(0).
1092 * <p>
1093 * @return A string containing the entire match.
1094 */
1095 public synchronized String toString() {
1096 if(__lastMatch == null)
1097 return null;
1098 return __lastMatch.toString();
1099 }
1100
1101
1102 /**
1103 * Returns the part of the input preceding the last match found.
1104 * <p>
1105 * @return The part of the input following the last match found.
1106 */
1107 public synchronized String preMatch() {
1108 int begin;
1109
1110 if(__originalInput == null)
1111 return __nullString;
1112
1113 begin = __lastMatch.beginOffset(0);
1114
1115 if(begin <= 0)
1116 return __nullString;
1117
1118 if(__originalInput instanceof char[]) {
1119 char[] input;
1120
1121 input = (char[])__originalInput;
1122
1123 // Just in case we make sure begin offset is in bounds. It should
1124 // be but we're paranoid.
1125 if(begin > input.length)
1126 begin = input.length;
1127
1128 return new String(input, __inputBeginOffset, begin);
1129 } else if(__originalInput instanceof String) {
1130 String input;
1131
1132 input = (String)__originalInput;
1133
1134 // Just in case we make sure begin offset is in bounds. It should
1135 // be but we're paranoid.
1136 if(begin > input.length())
1137 begin = input.length();
1138
1139 return input.substring(__inputBeginOffset, begin);
1140 }
1141
1142 return __nullString;
1143 }
1144
1145
1146 /**
1147 * Returns the part of the input following the last match found.
1148 * <p>
1149 * @return The part of the input following the last match found.
1150 */
1151 public synchronized String postMatch() {
1152 int end;
1153
1154 if(__originalInput == null)
1155 return __nullString;
1156
1157 end = __lastMatch.endOffset(0);
1158
1159 if(end < 0)
1160 return __nullString;
1161
1162 if(__originalInput instanceof char[]) {
1163 char[] input;
1164
1165 input = (char[])__originalInput;
1166 // Just in case we make sure begin offset is in bounds. It should
1167 // be but we're paranoid.
1168 if(end >= input.length)
1169 return __nullString;
1170
1171 return new String(input, end, __inputEndOffset - end);
1172 } else if(__originalInput instanceof String) {
1173 String input;
1174
1175 input = (String)__originalInput;
1176
1177 // Just in case we make sure begin offset is in bounds. It should
1178 // be but we're paranoid.
1179 if(end >= input.length())
1180 return __nullString;
1181
1182 return input.substring(end, __inputEndOffset);
1183 }
1184
1185 return __nullString;
1186 }
1187
1188
1189 /**
1190 * Returns the part of the input preceding the last match found as a
1191 * char array. This method eliminates the extra
1192 * buffer copying caused by preMatch().toCharArray().
1193 * <p>
1194 * @return The part of the input preceding the last match found as a char[].
1195 * If the result is of zero length, returns null instead of a zero
1196 * length array.
1197 */
1198 public synchronized char[] preMatchCharArray() {
1199 int begin;
1200 char[] result = null;
1201
1202 if(__originalInput == null)
1203 return null;
1204
1205 begin = __lastMatch.beginOffset(0);
1206
1207 if(begin <= 0)
1208 return null;
1209
1210 if(__originalInput instanceof char[]) {
1211 char[] input;
1212
1213 input = (char[])__originalInput;
1214
1215 // Just in case we make sure begin offset is in bounds. It should
1216 // be but we're paranoid.
1217 if(begin >= input.length)
1218 begin = input.length;
1219
1220 result = new char[begin - __inputBeginOffset];
1221 System.arraycopy(input, __inputBeginOffset, result, 0, result.length);
1222 } else if(__originalInput instanceof String) {
1223 String input;
1224
1225 input = (String)__originalInput;
1226
1227 // Just in case we make sure begin offset is in bounds. It should
1228 // be but we're paranoid.
1229 if(begin >= input.length())
1230 begin = input.length();
1231
1232 result = new char[begin - __inputBeginOffset];
1233 input.getChars(__inputBeginOffset, begin, result, 0);
1234 }
1235
1236 return result;
1237 }
1238
1239
1240 /**
1241 * Returns the part of the input following the last match found as a char
1242 * array. This method eliminates the extra buffer copying caused by
1243 * preMatch().toCharArray().
1244 * <p>
1245 * @return The part of the input following the last match found as a char[].
1246 * If the result is of zero length, returns null instead of a zero
1247 * length array.
1248 */
1249 public synchronized char[] postMatchCharArray() {
1250 int end;
1251 char[] result = null;
1252
1253 if(__originalInput == null)
1254 return null;
1255
1256 end = __lastMatch.endOffset(0);
1257
1258 if(end < 0)
1259 return null;
1260
1261 if(__originalInput instanceof char[]) {
1262 int length;
1263 char[] input;
1264
1265 input = (char[])__originalInput;
1266 // Just in case we make sure begin offset is in bounds. It should
1267 // be but we're paranoid.
1268 if(end >= input.length)
1269 return null;
1270
1271 length = __inputEndOffset - end;
1272 result = new char[length];
1273 System.arraycopy(input, end, result, 0, length);
1274 } else if(__originalInput instanceof String) {
1275 String input;
1276
1277 input = (String)__originalInput;
1278
1279 // Just in case we make sure begin offset is in bounds. It should
1280 // be but we're paranoid.
1281 if(end >= __inputEndOffset)
1282 return null;
1283
1284 result = new char[__inputEndOffset - end];
1285 input.getChars(end, __inputEndOffset, result, 0);
1286 }
1287
1288 return result;
1289 }
1290
1291}
1292