Source code: org/apache/oro/text/regex/Perl5Matcher.java
1 /*
2 * $Id: Perl5Matcher.java,v 1.27 2003/11/07 20:16:25 dfs Exp $
3 *
4 * ====================================================================
5 * The Apache Software License, Version 1.1
6 *
7 * Copyright (c) 2000 The Apache Software Foundation. All rights
8 * reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 *
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 *
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in
19 * the documentation and/or other materials provided with the
20 * distribution.
21 *
22 * 3. The end-user documentation included with the redistribution,
23 * if any, must include the following acknowledgment:
24 * "This product includes software developed by the
25 * Apache Software Foundation (http://www.apache.org/)."
26 * Alternately, this acknowledgment may appear in the software itself,
27 * if and wherever such third-party acknowledgments normally appear.
28 *
29 * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro"
30 * must not be used to endorse or promote products derived from this
31 * software without prior written permission. For written
32 * permission, please contact apache@apache.org.
33 *
34 * 5. Products derived from this software may not be called "Apache"
35 * or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their
36 * name, without prior written permission of the Apache Software Foundation.
37 *
38 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49 * SUCH DAMAGE.
50 * ====================================================================
51 *
52 * This software consists of voluntary contributions made by many
53 * individuals on behalf of the Apache Software Foundation. For more
54 * information on the Apache Software Foundation, please see
55 * <http://www.apache.org/>.
56 */
57
58
59 package org.apache.oro.text.regex;
60
61 import java.util.*;
62
63 /**
64 * The Perl5Matcher class is used to match regular expressions
65 * (conforming to the Perl5 regular expression syntax) generated by
66 * Perl5Compiler.
67 * <p>
68 * Perl5Compiler and Perl5Matcher are designed with the intent that
69 * you use a separate instance of each per thread to avoid the overhead
70 * of both synchronization and concurrent access (e.g., a match that takes
71 * a long time in one thread will block the progress of another thread with
72 * a shorter match). If you want to use a single instance of each
73 * in a concurrent program, you must appropriately protect access to
74 * the instances with critical sections. If you want to share Perl5Pattern
75 * instances between concurrently executing instances of Perl5Matcher, you
76 * must compile the patterns with {@link Perl5Compiler#READ_ONLY_MASK}.
77 *
78 * @version @version@
79 * @since 1.0
80 * @see PatternMatcher
81 * @see Perl5Compiler
82 */
83 public final class Perl5Matcher implements PatternMatcher {
84 private static final char __EOS = Character.MAX_VALUE;
85 private static final int __INITIAL_NUM_OFFSETS = 20;
86
87 private boolean __multiline = false, __lastSuccess = false;
88 private boolean __caseInsensitive = false;
89 private char __previousChar, __input[], __originalInput[];
90 private Perl5Repetition __currentRep;
91 private int __numParentheses, __bol, __eol, __currentOffset, __endOffset;
92
93 private char[] __program;
94 private int __expSize, __inputOffset, __lastParen;
95 private int[] __beginMatchOffsets, __endMatchOffsets;
96 private Stack __stack = new Stack();
97 private Perl5MatchResult __lastMatchResult = null;
98
99 private static boolean
100 __compare(char[] s1, int s1Offs, char[] s2, int s2Offs, int n)
101 {
102 int cnt;
103
104 for(cnt = 0; cnt < n; cnt++, s1Offs++, s2Offs++) {
105 if(s1Offs >= s1.length)
106 return false;
107 if(s2Offs >= s2.length)
108 return false;
109 if(s1[s1Offs] != s2[s2Offs])
110 return false;
111 }
112
113 return true;
114 }
115
116 private static int __findFirst(char[] input, int current, int endOffset,
117 char[] mustString)
118 {
119 int count, saveCurrent;
120 char ch;
121
122
123 if(input.length == 0)
124 return endOffset;
125
126 ch = mustString[0];
127 // Find the offset of the first character of the must string
128 while(current < endOffset) {
129 if(ch == input[current]){
130 saveCurrent = current;
131 count = 0;
132
133 while(current < endOffset && count < mustString.length) {
134 if(mustString[count] != input[current])
135 break;
136 ++count;
137 ++current;
138 }
139
140 current = saveCurrent;
141
142 if(count >= mustString.length)
143 break;
144 }
145 ++current;
146 }
147
148 return current;
149 }
150
151
152 private void __pushState(int parenFloor) {
153 int[] state;
154 int stateEntries, paren;
155
156 stateEntries = 3*(__expSize - parenFloor);
157 if(stateEntries <= 0)
158 state = new int[3];
159 else
160 state = new int[stateEntries + 3];
161
162 state[0] = __expSize;
163 state[1] = __lastParen;
164 state[2] = __inputOffset;
165
166 for(paren = __expSize; paren > parenFloor; --paren, stateEntries-=3) {
167 state[stateEntries] = __endMatchOffsets[paren];
168 state[stateEntries + 1] = __beginMatchOffsets[paren];
169 state[stateEntries + 2] = paren;
170 }
171
172 __stack.push(state);
173 }
174
175
176 private void __popState() {
177 int[] state;
178 int entry, paren;
179
180 state = (int[])__stack.pop();
181
182 __expSize = state[0];
183 __lastParen = state[1];
184 __inputOffset = state[2];
185
186 for(entry = 3; entry < state.length; entry+=3) {
187 paren = state[entry + 2];
188 __beginMatchOffsets[paren] = state[entry + 1];
189
190 if(paren <= __lastParen)
191 __endMatchOffsets[paren] = state[entry];
192 }
193
194 for(paren = __lastParen + 1; paren <= __numParentheses; paren++) {
195 if(paren > __expSize)
196 __beginMatchOffsets[paren] = OpCode._NULL_OFFSET;
197 __endMatchOffsets[paren] = OpCode._NULL_OFFSET;
198 }
199 }
200
201
202 // Initialize globals needed before calling __tryExpression for first time
203 private void __initInterpreterGlobals(Perl5Pattern expression, char[] input,
204 int beginOffset, int endOffset,
205 int currentOffset)
206 {
207 // Remove this hack after more efficient case-folding and unicode
208 // character classes are implemented
209 __caseInsensitive = expression._isCaseInsensitive;
210 __input = input;
211 __endOffset = endOffset;
212 __currentRep = new Perl5Repetition();
213 __currentRep._numInstances = 0;
214 __currentRep._lastRepetition = null;
215 __program = expression._program;
216 __stack.setSize(0);
217
218 // currentOffset should always be >= beginOffset and should
219 // always be equal to zero when beginOffset equals 0, but we
220 // make a weak attempt to protect against a violation of this
221 // precondition
222 if(currentOffset == beginOffset || currentOffset <= 0)
223 __previousChar = '\n';
224 else {
225 __previousChar = input[currentOffset - 1];
226 if(!__multiline && __previousChar == '\n')
227 __previousChar = '\0';
228 }
229
230 __numParentheses = expression._numParentheses;
231 __currentOffset = currentOffset;
232
233 __bol = beginOffset;
234 __eol = endOffset;
235
236 // Ok, here we're using endOffset as a temporary variable.
237 endOffset = __numParentheses + 1;
238 if(__beginMatchOffsets == null || endOffset > __beginMatchOffsets.length) {
239 if(endOffset < __INITIAL_NUM_OFFSETS)
240 endOffset = __INITIAL_NUM_OFFSETS;
241 __beginMatchOffsets = new int[endOffset];
242 __endMatchOffsets = new int[endOffset];
243 }
244 }
245
246 // Set the match result information. Only call this if we successfully
247 // matched.
248 private void __setLastMatchResult() {
249 int offs, maxEndOffs = 0;
250
251 //endOffset+=dontTry;
252
253 __lastMatchResult = new Perl5MatchResult(__numParentheses + 1);
254
255 // This can happen when using Perl5StreamInput
256 if(__endMatchOffsets[0] > __originalInput.length)
257 throw new ArrayIndexOutOfBoundsException();
258
259 __lastMatchResult._matchBeginOffset = __beginMatchOffsets[0];
260
261 while(__numParentheses >= 0) {
262 offs = __beginMatchOffsets[__numParentheses];
263
264 if(offs >= 0)
265 __lastMatchResult._beginGroupOffset[__numParentheses] =
266 offs - __lastMatchResult._matchBeginOffset;
267 else
268 __lastMatchResult._beginGroupOffset[__numParentheses] =
269 OpCode._NULL_OFFSET;
270
271 offs = __endMatchOffsets[__numParentheses];
272
273 if(offs >= 0) {
274 __lastMatchResult._endGroupOffset[__numParentheses] =
275 offs - __lastMatchResult._matchBeginOffset;
276 if(offs > maxEndOffs && offs <= __originalInput.length)
277 maxEndOffs = offs;
278 } else
279 __lastMatchResult._endGroupOffset[__numParentheses] =
280 OpCode._NULL_OFFSET;
281
282 --__numParentheses;
283 }
284
285 __lastMatchResult._match =
286 new String(__originalInput, __beginMatchOffsets[0],
287 maxEndOffs - __beginMatchOffsets[0]);
288
289 // Free up for garbage collection
290 __originalInput = null;
291 }
292
293
294
295 // Expects to receive a valid regular expression program. No checking
296 // is done to ensure validity.
297 // __originalInput must be set before calling this method for
298 // __lastMatchResult to be set correctly.
299 // beginOffset marks the beginning of the string
300 // currentOffset marks where to start the pattern search
301 private boolean __interpret(Perl5Pattern expression, char[] input,
302 int beginOffset, int endOffset,
303 int currentOffset)
304 {
305 boolean success;
306 int minLength = 0, dontTry = 0, offset;
307 char ch, mustString[];
308
309 __initInterpreterGlobals(expression, input, beginOffset, endOffset,
310 currentOffset);
311
312 success = false;
313 mustString = expression._mustString;
314
315 _mainLoop:
316 while(true) {
317
318 if(mustString != null &&
319 ((expression._anchor & Perl5Pattern._OPT_ANCH) == 0 ||
320 ((__multiline ||
321 (expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0)
322 && expression._back >= 0)))
323 {
324
325 __currentOffset =
326 __findFirst(__input, __currentOffset, endOffset, mustString);
327
328 if(__currentOffset >= endOffset) {
329 if((expression._options & Perl5Compiler.READ_ONLY_MASK) == 0)
330 expression._mustUtility++;
331 success = false;
332 break _mainLoop;
333 } else if(expression._back >= 0) {
334 __currentOffset-=expression._back;
335 if(__currentOffset < currentOffset)
336 __currentOffset = currentOffset;
337 minLength = expression._back + mustString.length;
338 } else if(!expression._isExpensive &&
339 (expression._options & Perl5Compiler.READ_ONLY_MASK) == 0 &&
340 (--expression._mustUtility < 0)) {
341 // Be careful! The preceding logical expression is constructed
342 // so that mustUtility is only decremented if the expression is
343 // compiled without READ_ONLY_MASK.
344 mustString = expression._mustString = null;
345 __currentOffset = currentOffset;
346 } else {
347 __currentOffset = currentOffset;
348 minLength = mustString.length;
349 }
350 }
351
352 if((expression._anchor & Perl5Pattern._OPT_ANCH) != 0) {
353 if(__currentOffset == beginOffset && __tryExpression(beginOffset)) {
354 success = true;
355 break _mainLoop;
356 } else if(__multiline
357 || (expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0
358 || (expression._anchor & Perl5Pattern._OPT_IMPLICIT) != 0)
359 {
360 if(minLength > 0)
361 dontTry = minLength - 1;
362 endOffset-=dontTry;
363
364 if(__currentOffset > currentOffset)
365 --__currentOffset;
366
367 while(__currentOffset < endOffset) {
368 if(__input[__currentOffset++] == '\n') {
369 if(__currentOffset < endOffset &&
370 __tryExpression(__currentOffset)) {
371 success = true;
372 break _mainLoop;
373 }
374 }
375 }
376 }
377
378 break _mainLoop;
379 }
380
381
382 if(expression._startString != null) {
383 mustString = expression._startString;
384 if((expression._anchor & Perl5Pattern._OPT_SKIP) != 0) {
385 ch = mustString[0];
386
387 while(__currentOffset < endOffset) {
388 if(ch == __input[__currentOffset]) {
389 if(__tryExpression(__currentOffset)){
390 success = true;
391 break _mainLoop;
392 }
393 ++__currentOffset;
394 while(__currentOffset < endOffset &&
395 __input[__currentOffset] == ch)
396 ++__currentOffset;
397 }
398 ++__currentOffset;
399 }
400 } else {
401
402 while((__currentOffset =
403 __findFirst(__input, __currentOffset, endOffset, mustString))
404 < endOffset){
405 if(__tryExpression(__currentOffset)) {
406 success = true;
407 break _mainLoop;
408 }
409 ++__currentOffset;
410 }
411 }
412
413 break _mainLoop;
414 }
415
416 if((offset = expression._startClassOffset) != OpCode._NULL_OFFSET) {
417 boolean doEvery, tmp;
418 char op;
419
420 doEvery = ((expression._anchor & Perl5Pattern._OPT_SKIP) == 0);
421
422 if(minLength > 0)
423 dontTry = minLength - 1;
424 endOffset -= dontTry;
425 tmp = true;
426
427 switch(op = __program[offset]) {
428 case OpCode._ANYOF:
429 offset = OpCode._getOperand(offset);
430 while(__currentOffset < endOffset) {
431 ch = __input[__currentOffset];
432
433 if(ch < 256 &&
434 (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
435 if(tmp && __tryExpression(__currentOffset)) {
436 success = true;
437 break _mainLoop;
438 } else
439 tmp = doEvery;
440 } else
441 tmp = true;
442 ++__currentOffset;
443 }
444
445 break;
446
447 case OpCode._ANYOFUN:
448 case OpCode._NANYOFUN:
449 offset = OpCode._getOperand(offset);
450 while(__currentOffset < endOffset) {
451 ch = __input[__currentOffset];
452
453 if(__matchUnicodeClass(ch, __program, offset, op)) {
454 if(tmp && __tryExpression(__currentOffset)) {
455 success = true;
456 break _mainLoop;
457 } else
458 tmp = doEvery;
459 } else
460 tmp = true;
461 ++__currentOffset;
462 }
463
464 break;
465
466 case OpCode._BOUND:
467 if(minLength > 0) {
468 ++dontTry;
469 --endOffset;
470 }
471
472 if(__currentOffset != beginOffset) {
473 ch = __input[__currentOffset - 1];
474 tmp = OpCode._isWordCharacter(ch);
475 } else
476 tmp = OpCode._isWordCharacter(__previousChar);
477
478 while(__currentOffset < endOffset) {
479 ch = __input[__currentOffset];
480 if(tmp != OpCode._isWordCharacter(ch)){
481 tmp = !tmp;
482 if(__tryExpression(__currentOffset)) {
483 success = true;
484 break _mainLoop;
485 }
486 }
487 ++__currentOffset;
488 }
489
490 if((minLength > 0 || tmp) &&
491 __tryExpression(__currentOffset)) {
492 success = true;
493 break _mainLoop;
494 }
495 break;
496
497 case OpCode._NBOUND:
498 if(minLength > 0) {
499 ++dontTry;
500 --endOffset;
501 }
502
503 if(__currentOffset != beginOffset) {
504 ch = __input[__currentOffset - 1];
505 tmp = OpCode._isWordCharacter(ch);
506 } else
507 tmp = OpCode._isWordCharacter(__previousChar);
508
509 while(__currentOffset < endOffset) {
510 ch = __input[__currentOffset];
511 if(tmp != OpCode._isWordCharacter(ch))
512 tmp = !tmp;
513 else if(__tryExpression(__currentOffset)) {
514 success = true;
515 break _mainLoop;
516 }
517
518 ++__currentOffset;
519 }
520
521 if((minLength > 0 || !tmp) &&
522 __tryExpression(__currentOffset)) {
523 success = true;
524 break _mainLoop;
525 }
526 break;
527
528 case OpCode._ALNUM:
529 while(__currentOffset < endOffset) {
530 ch = __input[__currentOffset];
531 if(OpCode._isWordCharacter(ch)) {
532 if(tmp && __tryExpression(__currentOffset)) {
533 success = true;
534 break _mainLoop;
535 } else
536 tmp = doEvery;
537 } else
538 tmp = true;
539 ++__currentOffset;
540 }
541 break;
542
543 case OpCode._NALNUM:
544 while(__currentOffset < endOffset) {
545 ch = __input[__currentOffset];
546 if(!OpCode._isWordCharacter(ch)) {
547 if(tmp && __tryExpression(__currentOffset)) {
548 success = true;
549 break _mainLoop;
550 } else
551 tmp = doEvery;
552 } else
553 tmp = true;
554 ++__currentOffset;
555 }
556 break;
557
558 case OpCode._SPACE:
559 while(__currentOffset < endOffset) {
560 if(Character.isWhitespace(__input[__currentOffset])) {
561 if(tmp && __tryExpression(__currentOffset)) {
562 success = true;
563 break _mainLoop;
564 } else
565 tmp = doEvery;
566 } else
567 tmp = true;
568 ++__currentOffset;
569 }
570 break;
571
572 case OpCode._NSPACE:
573 while(__currentOffset < endOffset) {
574 if(!Character.isWhitespace(__input[__currentOffset])) {
575 if(tmp && __tryExpression(__currentOffset)) {
576 success = true;
577 break _mainLoop;
578 } else
579 tmp = doEvery;
580 } else
581 tmp = true;
582 ++__currentOffset;
583 }
584 break;
585
586 case OpCode._DIGIT:
587 while(__currentOffset < endOffset) {
588 if(Character.isDigit(__input[__currentOffset])) {
589 if(tmp && __tryExpression(__currentOffset)) {
590 success = true;
591 break _mainLoop;
592 } else
593 tmp = doEvery;
594 } else
595 tmp = true;
596 ++__currentOffset;
597 }
598 break;
599
600
601 case OpCode._NDIGIT:
602 while(__currentOffset < endOffset) {
603 if(!Character.isDigit(__input[__currentOffset])) {
604 if(tmp && __tryExpression(__currentOffset)) {
605 success = true;
606 break _mainLoop;
607 } else
608 tmp = doEvery;
609 } else
610 tmp = true;
611 ++__currentOffset;
612 }
613 break;
614 } // end switch
615
616 } else {
617 if(minLength > 0)
618 dontTry = minLength - 1;
619 endOffset-=dontTry;
620
621 do {
622 if(__tryExpression(__currentOffset)) {
623 success = true;
624 break _mainLoop;
625 }
626 } while(__currentOffset++ < endOffset);
627
628 }
629
630
631 break _mainLoop;
632 } // end while
633
634 __lastSuccess = success;
635 __lastMatchResult = null;
636
637 return success;
638 }
639
640 private boolean __matchUnicodeClass(char code, char __program[],
641 int offset ,char opcode)
642 {
643 boolean isANYOF = ( opcode == OpCode._ANYOFUN );
644
645 while( __program[offset] != OpCode._END ){
646 if( __program[offset] == OpCode._RANGE ){
647 offset++;
648 if((code >= __program[offset]) && (code <= __program[offset+1])){
649 return isANYOF;
650 } else {
651 offset+=2;
652 }
653
654 } else if(__program[offset] == OpCode._ONECHAR) {
655 offset++;
656 if(__program[offset++] == code) return isANYOF;
657
658 } else {
659 isANYOF = (__program[offset] == OpCode._OPCODE)
660 ? isANYOF : !isANYOF;
661
662 offset++;
663 switch ( __program[offset++] ) {
664 case OpCode._ALNUM:
665 if(OpCode._isWordCharacter(code)) return isANYOF;
666 break;
667 case OpCode._NALNUM:
668 if(!OpCode._isWordCharacter(code)) return isANYOF;
669 break;
670 case OpCode._SPACE:
671 if(Character.isWhitespace(code)) return isANYOF;
672 break;
673 case OpCode._NSPACE:
674 if(!Character.isWhitespace(code)) return isANYOF;
675 break;
676 case OpCode._DIGIT:
677 if(Character.isDigit(code)) return isANYOF;
678 break;
679 case OpCode._NDIGIT:
680 if(!Character.isDigit(code)) return isANYOF;
681 break;
682 case OpCode._ALNUMC:
683 if(Character.isLetterOrDigit(code)) return isANYOF;
684 break;
685 case OpCode._ALPHA:
686 if(Character.isLetter(code)) return isANYOF;
687 break;
688 case OpCode._BLANK:
689 if(Character.isSpaceChar(code)) return isANYOF;
690 break;
691 case OpCode._CNTRL:
692 if(Character.isISOControl(code)) return isANYOF;
693 break;
694 case OpCode._LOWER:
695 if(Character.isLowerCase(code)) return isANYOF;
696 // Remove this hack after more efficient case-folding and unicode
697 // character classes are implemented
698 if(__caseInsensitive && Character.isUpperCase(code))
699 return isANYOF;
700 break;
701 case OpCode._UPPER:
702 if(Character.isUpperCase(code)) return isANYOF;
703 // Remove this hack after more efficient case-folding and unicode
704 // character classes are implemented
705 if(__caseInsensitive && Character.isLowerCase(code))
706 return isANYOF;
707 break;
708 case OpCode._PRINT:
709 if(Character.isSpaceChar(code)) return isANYOF;
710 // Fall through to check if the character is alphanumeric,
711 // or a punctuation mark. Printable characters are either
712 // alphanumeric, punctuation marks, or spaces.
713 case OpCode._GRAPH:
714 if(Character.isLetterOrDigit(code)) return isANYOF;
715 // Fall through to check if the character is a punctuation mark.
716 // Graph characters are either alphanumeric or punctuation.
717 case OpCode._PUNCT:
718 switch ( Character.getType(code) ) {
719 case Character.DASH_PUNCTUATION:
720 case Character.START_PUNCTUATION:
721 case Character.END_PUNCTUATION:
722 case Character.CONNECTOR_PUNCTUATION:
723 case Character.OTHER_PUNCTUATION:
724 case Character.MATH_SYMBOL:
725 case Character.CURRENCY_SYMBOL:
726 case Character.MODIFIER_SYMBOL:
727 return isANYOF;
728 default:
729 break;
730 }
731 break;
732 case OpCode._XDIGIT:
733 if( (code >= '0' && code <= '9') ||
734 (code >= 'a' && code <= 'f') ||
735 (code >= 'A' && code <= 'F')) return isANYOF;
736 break;
737 case OpCode._ASCII:
738 if(code < 0x80)return isANYOF;
739 }
740 }
741 }
742 return !isANYOF;
743 }
744
745 private boolean __tryExpression(int offset) {
746 int count;
747
748 __inputOffset = offset;
749 __lastParen = 0;
750 __expSize = 0;
751
752 if(__numParentheses > 0) {
753 for(count=0; count <= __numParentheses; count++) {
754 __beginMatchOffsets[count] = OpCode._NULL_OFFSET;
755 __endMatchOffsets[count] = OpCode._NULL_OFFSET;
756 }
757 }
758
759 if(__match(1)){
760 __beginMatchOffsets[0] = offset;
761 __endMatchOffsets[0] = __inputOffset;
762 return true;
763 }
764
765 return false;
766 }
767
768
769 private int __repeat(int offset, int max) {
770 int scan, eol, operand, ret;
771 char ch;
772 char op;
773
774 scan = __inputOffset;
775 eol = __eol;
776
777 if(max != Character.MAX_VALUE && max < eol - scan)
778 eol = scan + max;
779
780 operand = OpCode._getOperand(offset);
781
782 switch(op = __program[offset]) {
783
784 case OpCode._ANY:
785 while(scan < eol && __input[scan] != '\n')
786 ++scan;
787 break;
788
789 case OpCode._SANY:
790 scan = eol;
791 break;
792
793 case OpCode._EXACTLY:
794 ++operand;
795 while(scan < eol && __program[operand] == __input[scan])
796 ++scan;
797 break;
798
799 case OpCode._ANYOF:
800 if(scan < eol && (ch = __input[scan]) < 256) {
801 while((ch < 256 ) && (__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
802 if(++scan < eol)
803 ch = __input[scan];
804 else
805 break;
806 }
807 }
808 break;
809
810 case OpCode._ANYOFUN:
811 case OpCode._NANYOFUN:
812 if(scan < eol) {
813 ch = __input[scan];
814 while(__matchUnicodeClass(ch, __program, operand, op)){
815 if(++scan < eol)
816 ch = __input[scan];
817 else
818 break;
819 }
820 }
821 break;
822
823 case OpCode._ALNUM:
824 while(scan < eol && OpCode._isWordCharacter(__input[scan]))
825 ++scan;
826 break;
827
828 case OpCode._NALNUM:
829 while(scan < eol && !OpCode._isWordCharacter(__input[scan]))
830 ++scan;
831 break;
832
833 case OpCode._SPACE:
834 while(scan < eol && Character.isWhitespace(__input[scan]))
835 ++scan;
836 break;
837
838 case OpCode._NSPACE:
839 while(scan < eol && !Character.isWhitespace(__input[scan]))
840 ++scan;
841 break;
842
843 case OpCode._DIGIT:
844 while(scan < eol && Character.isDigit(__input[scan]))
845 ++scan;
846 break;
847
848 case OpCode._NDIGIT:
849 while(scan < eol && !Character.isDigit(__input[scan]))
850 ++scan;
851 break;
852
853 default:
854 break;
855
856 }
857
858 ret = scan - __inputOffset;
859 __inputOffset = scan;
860
861 return ret;
862 }
863
864
865 private boolean __match(int offset) {
866 char nextChar, op;
867 int scan, next, input, maxScan, current, line, arg;
868 boolean inputRemains = true, minMod = false;
869 Perl5Repetition rep;
870
871
872 input = __inputOffset;
873 inputRemains = (input < __endOffset);
874 nextChar = (inputRemains ? __input[input] : __EOS);
875
876 scan = offset;
877 maxScan = __program.length;
878
879 while(scan < maxScan /*&& scan > 0*/){
880 next = OpCode._getNext(__program, scan);
881
882 switch(op = __program[scan]) {
883
884 case OpCode._BOL:
885 if(input == __bol ? __previousChar == '\n' :
886 (__multiline && (inputRemains || input < __eol) &&
887 __input[input - 1] == '\n'))
888 break;
889 return false;
890
891 case OpCode._MBOL:
892 if(input == __bol ? __previousChar == '\n' :
893 ((inputRemains || input < __eol) && __input[input - 1] == '\n'))
894 break;
895 return false;
896
897 case OpCode._SBOL:
898 if(input == __bol && __previousChar == '\n')
899 break;
900 return false;
901
902 case OpCode._GBOL:
903 if(input == __bol)
904 break;
905 return true;
906
907 case OpCode._EOL :
908 if((inputRemains || input < __eol) && nextChar != '\n')
909 return false;
910 if(!__multiline && __eol - input > 1)
911 return false;
912 break;
913
914 case OpCode._MEOL:
915 if((inputRemains || input < __eol) && nextChar != '\n')
916 return false;
917 break;
918
919 case OpCode._SEOL:
920 if((inputRemains || input < __eol) && nextChar != '\n')
921 return false;
922 if(__eol - input > 1)
923 return false;
924 break;
925
926 case OpCode._SANY:
927 if(!inputRemains && input >= __eol)
928 return false;
929 inputRemains = (++input < __endOffset);
930 nextChar = (inputRemains ? __input[input] : __EOS);
931 break;
932
933 case OpCode._ANY:
934 if((!inputRemains && input >= __eol) || nextChar == '\n')
935 return false;
936 inputRemains = (++input < __endOffset);
937 nextChar = (inputRemains ? __input[input] : __EOS);
938 break;
939
940 case OpCode._EXACTLY:
941 current = OpCode._getOperand(scan);
942 line = __program[current++];
943
944 if(__program[current] != nextChar)
945 return false;
946 if(__eol - input < line)
947 return false;
948
949 if(line > 1 && !__compare(__program, current, __input, input, line))
950 return false;
951
952 input+=line;
953 inputRemains = (input < __endOffset);
954 nextChar = (inputRemains ? __input[input] : __EOS);
955 break;
956
957 case OpCode._ANYOF:
958 current = OpCode._getOperand(scan);
959
960 if(nextChar == __EOS && inputRemains)
961 nextChar = __input[input];
962
963 if(nextChar >= 256 || (__program[current + (nextChar >> 4)] &
964 (1 << (nextChar & 0xf))) != 0)
965 return false;
966
967 if(!inputRemains && input >= __eol)
968 return false;
969
970 inputRemains = (++input < __endOffset);
971 nextChar = (inputRemains ? __input[input] : __EOS);
972 break;
973
974 case OpCode._ANYOFUN:
975 case OpCode._NANYOFUN:
976 current = OpCode._getOperand(scan);
977
978 if(nextChar == __EOS && inputRemains)
979 nextChar = __input[input];
980
981 if(!__matchUnicodeClass(nextChar, __program, current, op))
982 return false;
983
984 if(!inputRemains && input >= __eol)
985 return false;
986
987 inputRemains = (++input < __endOffset);
988 nextChar = (inputRemains ? __input[input] : __EOS);
989 break;
990
991 case OpCode._ALNUM:
992 if(!inputRemains)
993 return false;
994 if(!OpCode._isWordCharacter(nextChar))
995 return false;
996 inputRemains = (++input < __endOffset);
997 nextChar = (inputRemains ? __input[input] : __EOS);
998 break;
999
1000 case OpCode._NALNUM:
1001 if(!inputRemains && input >= __eol)
1002 return false;
1003 if(OpCode._isWordCharacter(nextChar))
1004 return false;
1005 inputRemains = (++input < __endOffset);
1006 nextChar = (inputRemains ? __input[input] : __EOS);
1007 break;
1008
1009
1010 case OpCode._NBOUND:
1011 case OpCode._BOUND:
1012 boolean a, b;
1013
1014 if(input == __bol)
1015 a = OpCode._isWordCharacter(__previousChar);
1016 else
1017 a = OpCode._isWordCharacter(__input[input - 1]);
1018
1019 b = OpCode._isWordCharacter(nextChar);
1020
1021 if((a == b) == (__program[scan] == OpCode._BOUND))
1022 return false;
1023 break;
1024
1025 case OpCode._SPACE:
1026 if(!inputRemains && input >= __eol)
1027 return false;
1028 if(!Character.isWhitespace(nextChar))
1029 return false;
1030 inputRemains = (++input < __endOffset);
1031 nextChar = (inputRemains ? __input[input] : __EOS);
1032 break;
1033
1034
1035 case OpCode._NSPACE:
1036 if(!inputRemains)
1037 return false;
1038 if(Character.isWhitespace(nextChar))
1039 return false;
1040 inputRemains = (++input < __endOffset);
1041 nextChar = (inputRemains ? __input[input] : __EOS);
1042 break;
1043
1044 case OpCode._DIGIT:
1045 if(!Character.isDigit(nextChar))
1046 return false;
1047 inputRemains = (++input < __endOffset);
1048 nextChar = (inputRemains ? __input[input] : __EOS);
1049 break;
1050
1051 case OpCode._NDIGIT:
1052 if(!inputRemains && input >= __eol)
1053 return false;
1054 if(Character.isDigit(nextChar))
1055 return false;
1056 inputRemains = (++input < __endOffset);
1057 nextChar = (inputRemains ? __input[input] : __EOS);
1058 break;
1059
1060 case OpCode._REF:
1061 arg = OpCode._getArg1(__program, scan);
1062 current = __beginMatchOffsets[arg];
1063
1064 if(current == OpCode._NULL_OFFSET)
1065 return false;
1066
1067 if(__endMatchOffsets[arg] == OpCode._NULL_OFFSET)
1068 return false;
1069
1070 if(current == __endMatchOffsets[arg])
1071 break;
1072
1073 if(__input[current] != nextChar)
1074 return false;
1075
1076 line = __endMatchOffsets[arg] - current;
1077
1078 if(input + line > __eol)
1079 return false;
1080
1081 if(line > 1 && !__compare(__input, current, __input, input, line))
1082 return false;
1083
1084 input+=line;
1085 inputRemains = (input < __endOffset);
1086 nextChar = (inputRemains ? __input[input] : __EOS);
1087 break;
1088
1089 case OpCode._NOTHING:
1090 break;
1091
1092 case OpCode._BACK:
1093 break;
1094
1095 case OpCode._OPEN:
1096 arg = OpCode._getArg1(__program, scan);
1097 __beginMatchOffsets[arg] = input;
1098
1099 if(arg > __expSize)
1100 __expSize = arg;
1101 break;
1102
1103 case OpCode._CLOSE:
1104 arg = OpCode._getArg1(__program, scan);
1105 __endMatchOffsets[arg] = input;
1106
1107 if(arg > __lastParen)
1108 __lastParen = arg;
1109 break;
1110
1111 case OpCode._CURLYX:
1112 rep = new Perl5Repetition();
1113 rep._lastRepetition = __currentRep;
1114 __currentRep = rep;
1115
1116 rep._parenFloor = __lastParen;
1117 rep._numInstances = -1;
1118 rep._min = OpCode._getArg1(__program, scan);
1119 rep._max = OpCode._getArg2(__program, scan);
1120 rep._scan = OpCode._getNextOperator(scan) + 2;
1121 rep._next = next;
1122 rep._minMod = minMod;
1123 // Must initialize to -1 because if we initialize to 0 and are
1124 // at the beginning of the input the OpCode._WHILEM case will
1125 // not work right.
1126 rep._lastLocation = -1;
1127 __inputOffset = input;
1128
1129 // use minMod as temporary
1130 minMod = __match(OpCode._getPrevOperator(next));
1131
1132 // leave scope call not pertinent?
1133 __currentRep = rep._lastRepetition;
1134 return minMod;
1135
1136 case OpCode._WHILEM:
1137 rep = __currentRep;
1138
1139 arg = rep._numInstances + 1;
1140 __inputOffset = input;
1141
1142 if(input == rep._lastLocation) {
1143 __currentRep = rep._lastRepetition;
1144 line = __currentRep._numInstances;
1145 if(__match(rep._next))
1146 return true;
1147 __currentRep._numInstances = line;
1148 __currentRep = rep;
1149 return false;
1150 }
1151
1152 if(arg < rep._min) {
1153 rep._numInstances = arg;
1154 rep._lastLocation = input;
1155 if(__match(rep._scan))
1156 return true;
1157 rep._numInstances = arg - 1;
1158 return false;
1159 }
1160
1161 if(rep._minMod) {
1162 __currentRep = rep._lastRepetition;
1163 line = __currentRep._numInstances;
1164 if(__match(rep._next))
1165 return true;
1166 __currentRep._numInstances = line;
1167 __currentRep = rep;
1168
1169 if(arg >= rep._max)
1170 return false;
1171
1172 __inputOffset = input;
1173 rep._numInstances = arg;
1174 rep._lastLocation = input;
1175
1176 if(__match(rep._scan))
1177 return true;
1178
1179 rep._numInstances = arg - 1;
1180 return false;
1181 }
1182
1183 if(arg < rep._max) {
1184 __pushState(rep._parenFloor);
1185 rep._numInstances = arg;
1186 rep._lastLocation = input;
1187 if(__match(rep._scan))
1188 return true;
1189 __popState();
1190 __inputOffset = input;
1191 }
1192
1193 __currentRep = rep._lastRepetition;
1194 line = __currentRep._numInstances;
1195 if(__match(rep._next))
1196 return true;
1197
1198 rep._numInstances = line;
1199 __currentRep = rep;
1200 rep._numInstances = arg - 1;
1201 return false;
1202
1203 case OpCode._BRANCH:
1204 if(__program[next] != OpCode._BRANCH)
1205 next = OpCode._getNextOperator(scan);
1206 else {
1207 int lastParen;
1208
1209 lastParen = __lastParen;
1210
1211 do {
1212
1213 __inputOffset = input;
1214
1215 if(__match(OpCode._getNextOperator(scan)))
1216 return true;
1217
1218 for(arg = __lastParen; arg > lastParen; --arg)
1219 //__endMatchOffsets[arg] = 0;
1220 __endMatchOffsets[arg] = OpCode._NULL_OFFSET;
1221 __lastParen = arg;
1222
1223 scan = OpCode._getNext(__program, scan);
1224 } while(scan != OpCode._NULL_OFFSET &&
1225 __program[scan] == OpCode._BRANCH);
1226 return false;
1227 }
1228
1229 break;
1230
1231 case OpCode._MINMOD:
1232 minMod = true;
1233 break;
1234
1235
1236 case OpCode._CURLY:
1237 case OpCode._STAR:
1238 case OpCode._PLUS:
1239 if(op == OpCode._CURLY) {
1240 line = OpCode._getArg1(__program, scan);
1241 arg = OpCode._getArg2(__program, scan);
1242 scan = OpCode._getNextOperator(scan) + 2;
1243 } else if(op == OpCode._STAR) {
1244 line = 0;
1245 arg = Character.MAX_VALUE;
1246 scan = OpCode._getNextOperator(scan);
1247 } else {
1248 line = 1;
1249 arg = Character.MAX_VALUE;
1250 scan = OpCode._getNextOperator(scan);
1251 }
1252
1253 if(__program[next] == OpCode._EXACTLY) {
1254 nextChar = __program[OpCode._getOperand(next) + 1];
1255 current = 0;
1256 } else {
1257 nextChar = __EOS;
1258 current = -1000;
1259 }
1260 __inputOffset = input;
1261
1262 if(minMod) {
1263 minMod = false;
1264
1265 if(line > 0 && __repeat(scan, line) < line)
1266 return false;
1267
1268
1269 while(arg >= line || (arg == Character.MAX_VALUE && line > 0)) {
1270 // there may be a bug here with respect to
1271 // __inputOffset >= __endOffset, but it seems to be right for
1272 // now. the issue is with __inputOffset being reset later.
1273 // is this test really supposed to happen here?
1274 if(current == -1000 || __inputOffset >= __endOffset ||
1275 __input[__inputOffset] == nextChar) {
1276 if(__match(next))
1277 return true;
1278 }
1279
1280 __inputOffset = input + line;
1281
1282 if(__repeat(scan, 1) != 0) {
1283 ++line;
1284 __inputOffset = input + line;
1285 } else
1286 return false;
1287 }
1288
1289 } else {
1290 arg = __repeat(scan, arg);
1291
1292 if(line < arg && OpCode._opType[__program[next]] == OpCode._EOL &&
1293 ((!__multiline && __program[next] != OpCode._MEOL) ||
1294 __program[next] == OpCode._SEOL))
1295 line = arg;
1296
1297 while(arg >= line) {
1298 // there may be a bug here with respect to
1299 // __inputOffset >= __endOffset, but it seems to be right for
1300 // now. the issue is with __inputOffset being reset later.
1301 // is this test really supposed to happen here?
1302 if(current == -1000 || __inputOffset >= __endOffset ||
1303 __input[__inputOffset] == nextChar) {
1304 if(__match(next))
1305 return true;
1306 }
1307
1308 --arg;
1309 __inputOffset = input + arg;
1310 }
1311 }
1312
1313 return false;
1314
1315 case OpCode._SUCCEED:
1316 case OpCode._END:
1317 __inputOffset = input;
1318 // This enforces the rule that two consecutive matches cannot have
1319 // the same end offset.
1320 if(__inputOffset == __lastMatchInputEndOffset)
1321 return false;
1322 return true;
1323
1324 case OpCode._IFMATCH:
1325 __inputOffset = input;
1326 scan = OpCode._getNextOperator(scan);
1327 if(!__match(scan))
1328 return false;
1329 break;
1330
1331 case OpCode._UNLESSM:
1332 __inputOffset = input;
1333 scan = OpCode._getNextOperator(scan);
1334 if(__match(scan))
1335 return false;
1336 break;
1337
1338
1339 default:
1340 // todo: Need to throw an exception here.
1341
1342 } // end switch
1343
1344 //scan = (next > 0 ? next : 0);
1345 scan = next;
1346 } // end while scan
1347
1348
1349
1350 return false;
1351 }
1352
1353
1354 /**
1355 * Set whether or not subsequent calls to {@link #matches matches()}
1356 * or {@link #contains contains()} should treat the input as
1357 * consisting of multiple lines. The default behavior is for
1358 * input to be treated as consisting of multiple lines. This method
1359 * should only be called if the Perl5Pattern used for a match was
1360 * compiled without either of the Perl5Compiler.MULTILINE_MASK or
1361 * Perl5Compiler.SINGLELINE_MASK flags, and you want to alter the
1362 * behavior of how the <b>^</b>, <b>$</b>, and <b>.</b> metacharacters are
1363 * interpreted on the fly. The compilation options used when compiling
1364 * a pattern ALWAYS override the behavior specified by setMultiline(). See
1365 * {@link Perl5Compiler} for more details.
1366 * <p>
1367 * @param multiline If set to true treats the input as consisting of
1368 * multiple lines with respect to the <b>^</b> and <b>$</b>
1369 * metacharacters. If set to false treats the input as consisting
1370 * of a single line with respect to the <b>^</b> and <b>$</b>
1371 * metacharacters.
1372 */
1373 public void setMultiline(boolean multiline) { __multiline = multiline; }
1374
1375
1376 /**
1377 * @return True if the matcher is treating input as consisting of multiple
1378 * lines with respect to the <b>^</b> and <b>$</b> metacharacters,
1379 * false otherwise.
1380 */
1381 public boolean isMultiline() { return __multiline; }
1382
1383 char[] _toLower(char[] input) {
1384 int current;
1385 char[] inp;
1386 // todo:
1387 // Certainly not the best way to do case insensitive matching.
1388 // Must definitely change this in some way, but for now we
1389 // do what Perl does and make a copy of the input, converting
1390 // it all to lowercase. This is truly better handled in the
1391 // compilation phase.
1392 inp = new char[input.length];
1393 System.arraycopy(input, 0, inp, 0, input.length);
1394 input = inp;
1395
1396 // todo: Need to inline toLowerCase()
1397 for(current = 0; current < input.length; current++)
1398 if(Character.isUpperCase(input[current]))
1399 input[current] = Character.toLowerCase(input[current]);
1400
1401 return input;
1402 }
1403
1404
1405 /**
1406 * Determines if a prefix of a string (represented as a char[])
1407 * matches a given pattern, starting from a given offset into the string.
1408 * If a prefix of the string matches the pattern, a MatchResult instance
1409 * representing the match is made accesible via
1410 * {@link #getMatch()}.
1411 * <p>
1412 * This method is useful for certain common token identification tasks
1413 * that are made more difficult without this functionality.
1414 * <p>
1415 * @param input The char[] to test for a prefix match.
1416 * @param pattern The Pattern to be matched.
1417 * @param offset The offset at which to start searching for the prefix.
1418 * @return True if input matches pattern, false otherwise.
1419 */
1420 public boolean matchesPrefix(char[] input, Pattern pattern, int offset) {
1421 Perl5Pattern expression;
1422
1423 expression = (Perl5Pattern)pattern;
1424 __originalInput = input;
1425 if(expression._isCaseInsensitive)
1426 input = _toLower(input);
1427
1428 __initInterpreterGlobals(expression, input, 0, input.length, offset);
1429
1430 __lastSuccess = __tryExpression(offset);
1431 __lastMatchResult = null;
1432
1433 return __lastSuccess;
1434 }
1435
1436
1437 /**
1438 * Determines if a prefix of a string (represented as a char[])
1439 * matches a given pattern.
1440 * If a prefix of the string matches the pattern, a MatchResult instance
1441 * representing the match is made accesible via
1442 * {@link #getMatch()}.
1443 * <p>
1444 * This method is useful for certain common token identification tasks
1445 * that are made more difficult without this functionality.
1446 * <p>
1447 * @param input The char[] to test for a prefix match.
1448 * @param pattern The Pattern to be matched.
1449 * @return True if input matches pattern, false otherwise.
1450 */
1451 public boolean matchesPrefix(char[] input, Pattern pattern) {
1452 return matchesPrefix(input, pattern, 0);
1453 }
1454
1455
1456 /**
1457 * Determines if a prefix of a string matches a given pattern.
1458 * If a prefix of the string matches the pattern, a MatchResult instance
1459 * representing the match is made accesible via
1460 * {@link #getMatch()}.
1461 * <p>
1462 * This method is useful for certain common token identification tasks
1463 * that are made more difficult without this functionality.
1464 * <p>
1465 * @param input The String to test for a prefix match.
1466 * @param pattern The Pattern to be matched.
1467 * @return True if input matches pattern, false otherwise.
1468 */
1469 public boolean matchesPrefix(String input, Pattern pattern) {
1470 return matchesPrefix(input.toCharArray(), pattern, 0);
1471 }
1472
1473 /**
1474 * Determines if a prefix of a PatternMatcherInput instance
1475 * matches a given pattern. If there is a match, a MatchResult instance
1476 * representing the match is made accesible via
1477 * {@link #getMatch()}. Unlike the
1478 * {@link #contains(PatternMatcherInput, Pattern)}
1479 * method, the current offset of the PatternMatcherInput argument
1480 * is not updated. However, unlike the
1481 * {@link #matches matches(PatternMatcherInput, Pattern)} method,
1482 * matchesPrefix() will start its search from the current offset
1483 * rather than the begin offset of the PatternMatcherInput.
1484 * <p>
1485 * This method is useful for certain common token identification tasks
1486 * that are made more difficult without this functionality.
1487 * <p>
1488 * @param input The PatternMatcherInput to test for a prefix match.
1489 * @param pattern The Pattern to be matched.
1490 * @return True if input matches pattern, false otherwise.
1491 */
1492 public boolean matchesPrefix(PatternMatcherInput input, Pattern pattern) {
1493 char[] inp;
1494 Perl5Pattern expression;
1495
1496 expression = (Perl5Pattern)pattern;
1497
1498 __originalInput = input._originalBuffer;
1499 if(expression._isCaseInsensitive) {
1500 if(input._toLowerBuffer == null)
1501 input._toLowerBuffer = _toLower(__originalInput);
1502 inp = input._toLowerBuffer;
1503 } else
1504 inp = __originalInput;
1505
1506 __initInterpreterGlobals(expression, inp, input._beginOffset,
1507 input._endOffset, input._currentOffset);
1508 __lastSuccess = __tryExpression(input._currentOffset);
1509 __lastMatchResult = null;
1510
1511 return __lastSuccess;
1512 }
1513
1514
1515
1516 /**
1517 * Determines if a string (repres