Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/apache/oro/text/regex/Perl5Matcher.java


1   /*
2    * $Id: Perl5Matcher.java,v 1.27 2003/11/07 20:16:25 dfs Exp $
3    *
4    * ====================================================================
5    * The Apache Software License, Version 1.1
6    *
7    * Copyright (c) 2000 The Apache Software Foundation.  All rights
8    * reserved.
9    *
10   * Redistribution and use in source and binary forms, with or without
11   * modification, are permitted provided that the following conditions
12   * are met:
13   *
14   * 1. Redistributions of source code must retain the above copyright
15   *    notice, this list of conditions and the following disclaimer.
16   *
17   * 2. Redistributions in binary form must reproduce the above copyright
18   *    notice, this list of conditions and the following disclaimer in
19   *    the documentation and/or other materials provided with the
20   *    distribution.
21   *
22   * 3. The end-user documentation included with the redistribution,
23   *    if any, must include the following acknowledgment:
24   *       "This product includes software developed by the
25   *        Apache Software Foundation (http://www.apache.org/)."
26   *    Alternately, this acknowledgment may appear in the software itself,
27   *    if and wherever such third-party acknowledgments normally appear.
28   *
29   * 4. The names "Apache" and "Apache Software Foundation", "Jakarta-Oro" 
30   *    must not be used to endorse or promote products derived from this
31   *    software without prior written permission. For written
32   *    permission, please contact apache@apache.org.
33   *
34   * 5. Products derived from this software may not be called "Apache" 
35   *    or "Jakarta-Oro", nor may "Apache" or "Jakarta-Oro" appear in their 
36   *    name, without prior written permission of the Apache Software Foundation.
37   *
38   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
39   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
40   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
42   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
43   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
44   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
45   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
46   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
47   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
48   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
49   * SUCH DAMAGE.
50   * ====================================================================
51   *
52   * This software consists of voluntary contributions made by many
53   * individuals on behalf of the Apache Software Foundation.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  
59  package org.apache.oro.text.regex;
60  
61  import java.util.*;
62  
63  /**
64   * The Perl5Matcher class is used to match regular expressions
65   * (conforming to the Perl5 regular expression syntax) generated by
66   * Perl5Compiler.
67   * <p>
68   * Perl5Compiler and Perl5Matcher are designed with the intent that
69   * you use a separate instance of each per thread to avoid the overhead
70   * of both synchronization and concurrent access (e.g., a match that takes
71   * a long time in one thread will block the progress of another thread with
72   * a shorter match).  If you want to use a single instance of each
73   * in a concurrent program, you must appropriately protect access to
74   * the instances with critical sections.  If you want to share Perl5Pattern
75   * instances between concurrently executing instances of Perl5Matcher, you
76   * must compile the patterns with {@link Perl5Compiler#READ_ONLY_MASK}.
77   *
78   * @version @version@
79   * @since 1.0
80   * @see PatternMatcher
81   * @see Perl5Compiler
82   */
83  public final class Perl5Matcher implements PatternMatcher {
84    private static final char __EOS = Character.MAX_VALUE;
85    private static final int __INITIAL_NUM_OFFSETS = 20;
86  
87    private boolean __multiline = false, __lastSuccess = false;
88    private boolean __caseInsensitive = false;
89    private char __previousChar, __input[], __originalInput[];
90    private Perl5Repetition __currentRep;
91    private int __numParentheses, __bol, __eol, __currentOffset, __endOffset;
92  
93    private char[] __program;
94    private int __expSize, __inputOffset, __lastParen;
95    private int[] __beginMatchOffsets, __endMatchOffsets;
96    private Stack __stack = new Stack();
97    private Perl5MatchResult __lastMatchResult = null;
98  
99    private static boolean
100     __compare(char[] s1, int s1Offs, char[] s2, int s2Offs, int n)
101   {
102     int cnt;
103 
104     for(cnt = 0; cnt < n; cnt++, s1Offs++, s2Offs++) {
105       if(s1Offs >= s1.length)
106   return false;
107       if(s2Offs >= s2.length)
108   return false;
109       if(s1[s1Offs] != s2[s2Offs])
110   return false;
111     }
112 
113     return true;
114   }
115 
116   private static int __findFirst(char[] input, int current, int endOffset,
117          char[] mustString)
118   {
119     int count, saveCurrent;
120     char ch;
121 
122 
123     if(input.length == 0)
124       return endOffset;
125 
126     ch = mustString[0];
127     // Find the offset of the first character of the must string
128     while(current < endOffset) {
129       if(ch == input[current]){
130   saveCurrent = current;
131   count = 0;
132 
133   while(current < endOffset && count < mustString.length) {
134     if(mustString[count] != input[current])
135       break;
136     ++count;
137     ++current;
138   }
139 
140   current = saveCurrent;
141 
142   if(count >= mustString.length)
143     break;
144       }
145       ++current;
146     }
147 
148     return current;
149   }
150 
151 
152   private void __pushState(int parenFloor) {
153     int[] state;
154     int stateEntries, paren;
155 
156     stateEntries = 3*(__expSize - parenFloor);
157     if(stateEntries <= 0)
158       state = new int[3];
159     else
160       state = new int[stateEntries + 3];
161 
162     state[0] = __expSize;
163     state[1] = __lastParen;
164     state[2] = __inputOffset;
165 
166     for(paren = __expSize; paren > parenFloor; --paren, stateEntries-=3) {
167       state[stateEntries]     = __endMatchOffsets[paren];
168       state[stateEntries + 1] = __beginMatchOffsets[paren];
169       state[stateEntries + 2] = paren;
170     }
171 
172     __stack.push(state);
173   }
174 
175 
176   private void __popState() {
177     int[] state;
178     int entry, paren;
179 
180     state = (int[])__stack.pop();
181 
182     __expSize     = state[0];
183     __lastParen   = state[1];
184     __inputOffset = state[2];
185 
186     for(entry = 3; entry < state.length; entry+=3) {
187       paren = state[entry + 2];
188       __beginMatchOffsets[paren] = state[entry + 1];
189 
190       if(paren <= __lastParen)
191   __endMatchOffsets[paren] = state[entry];
192     }
193 
194     for(paren = __lastParen + 1; paren <= __numParentheses; paren++) {
195       if(paren > __expSize)
196   __beginMatchOffsets[paren] = OpCode._NULL_OFFSET;
197       __endMatchOffsets[paren] = OpCode._NULL_OFFSET;
198     }
199   }
200 
201 
202   // Initialize globals needed before calling __tryExpression for first time
203   private void __initInterpreterGlobals(Perl5Pattern expression, char[] input,
204           int beginOffset, int endOffset,
205           int currentOffset)
206   {
207     // Remove this hack after more efficient case-folding and unicode
208     // character classes are implemented
209     __caseInsensitive            = expression._isCaseInsensitive;
210     __input                      = input;
211     __endOffset                  = endOffset;
212     __currentRep                 = new Perl5Repetition();
213     __currentRep._numInstances   = 0;
214     __currentRep._lastRepetition = null;
215     __program                    = expression._program;
216     __stack.setSize(0);
217 
218     // currentOffset should always be >= beginOffset and should
219     // always be equal to zero when beginOffset equals 0, but we
220     // make a weak attempt to protect against a violation of this
221     // precondition
222     if(currentOffset == beginOffset || currentOffset <= 0)
223       __previousChar = '\n';
224     else {
225       __previousChar = input[currentOffset - 1];
226       if(!__multiline && __previousChar == '\n')
227   __previousChar = '\0';
228     }
229 
230     __numParentheses    = expression._numParentheses;
231     __currentOffset     = currentOffset;
232 
233     __bol = beginOffset;
234     __eol = endOffset;
235 
236     // Ok, here we're using endOffset as a temporary variable.
237     endOffset = __numParentheses + 1;
238     if(__beginMatchOffsets == null || endOffset > __beginMatchOffsets.length) {
239       if(endOffset < __INITIAL_NUM_OFFSETS)
240   endOffset = __INITIAL_NUM_OFFSETS;
241       __beginMatchOffsets = new int[endOffset];
242       __endMatchOffsets   = new int[endOffset];      
243     }
244   }
245 
246   // Set the match result information.  Only call this if we successfully
247   // matched.
248   private void __setLastMatchResult() {
249     int offs, maxEndOffs = 0;
250 
251     //endOffset+=dontTry;
252 
253     __lastMatchResult = new Perl5MatchResult(__numParentheses + 1);
254 
255     // This can happen when using Perl5StreamInput
256     if(__endMatchOffsets[0] > __originalInput.length)
257       throw new ArrayIndexOutOfBoundsException();
258 
259     __lastMatchResult._matchBeginOffset = __beginMatchOffsets[0];
260 
261     while(__numParentheses >= 0) {
262       offs = __beginMatchOffsets[__numParentheses];
263 
264       if(offs >= 0)
265   __lastMatchResult._beginGroupOffset[__numParentheses] =
266     offs - __lastMatchResult._matchBeginOffset;
267       else
268   __lastMatchResult._beginGroupOffset[__numParentheses] =
269     OpCode._NULL_OFFSET;
270 
271       offs = __endMatchOffsets[__numParentheses];
272 
273       if(offs >= 0) {
274   __lastMatchResult._endGroupOffset[__numParentheses] =
275     offs - __lastMatchResult._matchBeginOffset;
276   if(offs > maxEndOffs && offs <= __originalInput.length)
277     maxEndOffs = offs;
278       } else
279   __lastMatchResult._endGroupOffset[__numParentheses] =
280     OpCode._NULL_OFFSET;
281 
282       --__numParentheses;
283     }
284 
285     __lastMatchResult._match =
286       new String(__originalInput, __beginMatchOffsets[0],
287      maxEndOffs - __beginMatchOffsets[0]);
288 
289     // Free up for garbage collection
290     __originalInput = null;
291   }
292 
293 
294 
295   // Expects to receive a valid regular expression program.  No checking
296   // is done to ensure validity.
297   // __originalInput must be set before calling this method for
298   // __lastMatchResult to be set correctly.
299   // beginOffset marks the beginning of the string
300   // currentOffset marks where to start the pattern search
301   private boolean __interpret(Perl5Pattern expression, char[] input,
302             int beginOffset, int endOffset,
303             int currentOffset)
304   {
305     boolean success;
306     int minLength = 0, dontTry = 0, offset;
307     char ch, mustString[];
308 
309     __initInterpreterGlobals(expression, input, beginOffset, endOffset,
310            currentOffset);
311 
312     success = false;
313     mustString = expression._mustString;
314 
315   _mainLoop:
316     while(true) {
317 
318       if(mustString != null &&
319    ((expression._anchor & Perl5Pattern._OPT_ANCH) == 0 ||
320     ((__multiline || 
321      (expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0)
322      && expression._back >= 0)))
323   {
324 
325   __currentOffset =
326     __findFirst(__input, __currentOffset, endOffset, mustString);
327   
328   if(__currentOffset >= endOffset) {
329     if((expression._options & Perl5Compiler.READ_ONLY_MASK) == 0)
330       expression._mustUtility++;
331     success = false;
332     break _mainLoop;
333   } else if(expression._back >= 0) {
334     __currentOffset-=expression._back;
335     if(__currentOffset < currentOffset)
336       __currentOffset = currentOffset;
337     minLength = expression._back + mustString.length;
338   } else if(!expression._isExpensive &&
339       (expression._options & Perl5Compiler.READ_ONLY_MASK) == 0 &&
340       (--expression._mustUtility < 0)) {
341     // Be careful!  The preceding logical expression is constructed
342     // so that mustUtility is only decremented if the expression is
343     // compiled without READ_ONLY_MASK.
344     mustString = expression._mustString = null;
345     __currentOffset = currentOffset;
346   } else {
347     __currentOffset = currentOffset;
348     minLength = mustString.length;
349   }
350       }
351 
352       if((expression._anchor & Perl5Pattern._OPT_ANCH) != 0) {
353   if(__currentOffset == beginOffset && __tryExpression(beginOffset)) {
354     success = true;
355     break _mainLoop;
356   } else if(__multiline
357       || (expression._anchor & Perl5Pattern._OPT_ANCH_MBOL) != 0
358       || (expression._anchor & Perl5Pattern._OPT_IMPLICIT) != 0)
359     {
360       if(minLength > 0)
361         dontTry = minLength - 1;
362       endOffset-=dontTry;
363 
364       if(__currentOffset > currentOffset)
365         --__currentOffset;
366 
367       while(__currentOffset < endOffset) {
368         if(__input[__currentOffset++] == '\n') {
369     if(__currentOffset < endOffset &&
370        __tryExpression(__currentOffset)) {
371       success = true;
372       break _mainLoop;
373     }
374         }
375       }
376     }
377 
378   break _mainLoop;
379       }
380 
381 
382       if(expression._startString != null) {
383   mustString = expression._startString;
384   if((expression._anchor & Perl5Pattern._OPT_SKIP) != 0) {
385     ch = mustString[0];
386 
387     while(__currentOffset < endOffset) {
388       if(ch == __input[__currentOffset]) {
389         if(__tryExpression(__currentOffset)){
390     success = true;
391     break _mainLoop;
392         }
393         ++__currentOffset;
394         while(__currentOffset < endOffset &&
395         __input[__currentOffset] == ch)
396         ++__currentOffset;
397       }
398       ++__currentOffset;
399     }
400   } else {
401 
402     while((__currentOffset =
403      __findFirst(__input, __currentOffset, endOffset, mustString))
404     < endOffset){
405       if(__tryExpression(__currentOffset)) {
406         success = true;
407         break _mainLoop;
408       }
409       ++__currentOffset;
410     }
411   }
412 
413   break _mainLoop;
414       }
415 
416       if((offset = expression._startClassOffset) != OpCode._NULL_OFFSET) {
417   boolean doEvery, tmp;
418   char op;
419 
420   doEvery = ((expression._anchor & Perl5Pattern._OPT_SKIP) == 0);
421 
422   if(minLength > 0)
423     dontTry = minLength - 1;
424   endOffset -= dontTry;
425   tmp = true;
426 
427   switch(op = __program[offset]) {
428   case OpCode._ANYOF:
429     offset = OpCode._getOperand(offset);
430     while(__currentOffset < endOffset) {
431       ch = __input[__currentOffset];
432 
433       if(ch < 256 &&
434          (__program[offset + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
435         if(tmp && __tryExpression(__currentOffset)) {
436     success = true;
437     break _mainLoop;
438         } else
439     tmp = doEvery;
440       } else
441         tmp = true;
442       ++__currentOffset;
443     }
444 
445     break;
446 
447   case OpCode._ANYOFUN:
448   case OpCode._NANYOFUN:
449     offset = OpCode._getOperand(offset);
450     while(__currentOffset < endOffset) {
451       ch = __input[__currentOffset];
452 
453       if(__matchUnicodeClass(ch, __program, offset, op)) {
454         if(tmp && __tryExpression(__currentOffset)) {
455     success = true;
456     break _mainLoop;
457         } else
458     tmp = doEvery;
459       } else
460         tmp = true;
461       ++__currentOffset;
462     }
463 
464     break;
465 
466   case OpCode._BOUND:
467     if(minLength > 0) {
468       ++dontTry;
469       --endOffset;
470     }
471 
472     if(__currentOffset != beginOffset) {
473       ch = __input[__currentOffset - 1];
474       tmp = OpCode._isWordCharacter(ch);
475     } else
476       tmp = OpCode._isWordCharacter(__previousChar);
477 
478     while(__currentOffset < endOffset) {
479       ch = __input[__currentOffset];
480       if(tmp != OpCode._isWordCharacter(ch)){
481         tmp = !tmp;
482         if(__tryExpression(__currentOffset)) {
483     success = true;
484     break _mainLoop;
485         }
486       }
487       ++__currentOffset;
488     }
489 
490     if((minLength > 0 || tmp) && 
491        __tryExpression(__currentOffset)) {
492       success = true;
493       break _mainLoop;
494     }
495     break;
496 
497   case OpCode._NBOUND:
498     if(minLength > 0) {
499       ++dontTry;
500       --endOffset;
501     }
502 
503     if(__currentOffset != beginOffset) {
504       ch = __input[__currentOffset - 1];
505       tmp = OpCode._isWordCharacter(ch);
506     } else
507       tmp = OpCode._isWordCharacter(__previousChar);
508 
509     while(__currentOffset < endOffset) {
510       ch = __input[__currentOffset];
511       if(tmp != OpCode._isWordCharacter(ch))
512         tmp = !tmp;
513       else if(__tryExpression(__currentOffset)) {
514         success = true;
515         break _mainLoop;
516       }
517 
518       ++__currentOffset;
519     }
520 
521     if((minLength > 0 || !tmp) &&
522        __tryExpression(__currentOffset)) {
523       success = true;
524       break _mainLoop;
525     }
526     break;
527 
528   case OpCode._ALNUM:
529     while(__currentOffset < endOffset) {
530       ch = __input[__currentOffset];
531       if(OpCode._isWordCharacter(ch)) {
532         if(tmp && __tryExpression(__currentOffset)) {
533     success = true;
534     break _mainLoop;
535         } else
536     tmp = doEvery;
537       } else
538         tmp = true;
539       ++__currentOffset;
540     }
541     break;
542 
543   case OpCode._NALNUM:
544     while(__currentOffset < endOffset) {
545       ch = __input[__currentOffset];
546       if(!OpCode._isWordCharacter(ch)) {
547         if(tmp && __tryExpression(__currentOffset)) {
548     success = true;
549     break _mainLoop;
550         } else
551     tmp = doEvery;
552       } else
553         tmp = true;
554       ++__currentOffset;
555     }
556     break;
557 
558   case OpCode._SPACE:
559     while(__currentOffset < endOffset) {
560       if(Character.isWhitespace(__input[__currentOffset])) {
561         if(tmp && __tryExpression(__currentOffset)) {
562     success = true;
563     break _mainLoop;
564         } else
565     tmp = doEvery;
566       } else
567         tmp = true;
568       ++__currentOffset;
569     }
570     break;
571 
572   case OpCode._NSPACE:
573     while(__currentOffset < endOffset) {
574       if(!Character.isWhitespace(__input[__currentOffset])) {
575         if(tmp && __tryExpression(__currentOffset)) {
576     success = true;
577     break _mainLoop;
578         } else
579     tmp = doEvery;
580       } else
581         tmp = true;
582       ++__currentOffset;
583     }
584     break;
585 
586   case OpCode._DIGIT:
587     while(__currentOffset < endOffset) {
588       if(Character.isDigit(__input[__currentOffset])) {
589         if(tmp && __tryExpression(__currentOffset)) {
590     success = true;
591     break _mainLoop;
592         } else
593     tmp = doEvery;
594       } else
595         tmp = true;
596       ++__currentOffset;
597     }
598     break;
599 
600 
601   case OpCode._NDIGIT:
602     while(__currentOffset < endOffset) {
603       if(!Character.isDigit(__input[__currentOffset])) {
604         if(tmp && __tryExpression(__currentOffset)) {
605     success = true;
606     break _mainLoop;
607         } else
608     tmp = doEvery;
609       } else
610         tmp = true;
611       ++__currentOffset;
612     }
613     break;
614   } // end switch
615 
616       } else {
617   if(minLength > 0)
618     dontTry = minLength - 1;
619   endOffset-=dontTry;
620 
621   do {
622     if(__tryExpression(__currentOffset)) {
623       success = true;
624       break _mainLoop;
625     }
626   } while(__currentOffset++ < endOffset);
627 
628       }
629 
630 
631       break _mainLoop;
632     } // end while
633 
634     __lastSuccess = success;
635     __lastMatchResult = null;
636 
637     return success;
638   }
639   
640   private boolean __matchUnicodeClass(char code, char __program[], 
641            int offset ,char opcode)
642   {
643     boolean isANYOF = ( opcode == OpCode._ANYOFUN );
644 
645     while( __program[offset] != OpCode._END ){
646       if( __program[offset] == OpCode._RANGE ){
647   offset++;
648   if((code >= __program[offset]) && (code <= __program[offset+1])){
649     return isANYOF;
650   } else {
651     offset+=2;
652   }
653 
654       } else if(__program[offset] == OpCode._ONECHAR) {
655          offset++;
656   if(__program[offset++] == code) return isANYOF;
657 
658       } else {
659   isANYOF = (__program[offset] == OpCode._OPCODE) 
660     ? isANYOF : !isANYOF;
661 
662   offset++;
663   switch ( __program[offset++] ) {
664   case OpCode._ALNUM:
665     if(OpCode._isWordCharacter(code)) return isANYOF;
666     break;
667   case OpCode._NALNUM:
668     if(!OpCode._isWordCharacter(code)) return isANYOF;
669     break;
670   case OpCode._SPACE:
671     if(Character.isWhitespace(code)) return isANYOF;
672     break;
673   case OpCode._NSPACE:
674     if(!Character.isWhitespace(code)) return isANYOF;
675     break;
676   case OpCode._DIGIT:
677     if(Character.isDigit(code)) return isANYOF;
678     break;
679   case OpCode._NDIGIT:
680     if(!Character.isDigit(code)) return isANYOF;
681     break;
682   case OpCode._ALNUMC:
683     if(Character.isLetterOrDigit(code)) return isANYOF;
684     break;
685   case OpCode._ALPHA:
686     if(Character.isLetter(code)) return isANYOF;
687     break;
688   case OpCode._BLANK:
689     if(Character.isSpaceChar(code)) return isANYOF;
690     break;
691   case OpCode._CNTRL:
692     if(Character.isISOControl(code)) return isANYOF;
693     break;
694   case OpCode._LOWER:
695     if(Character.isLowerCase(code)) return isANYOF;
696     // Remove this hack after more efficient case-folding and unicode
697     // character classes are implemented
698     if(__caseInsensitive && Character.isUpperCase(code))
699       return isANYOF;
700     break;
701   case OpCode._UPPER:
702     if(Character.isUpperCase(code)) return isANYOF;
703     // Remove this hack after more efficient case-folding and unicode
704     // character classes are implemented
705     if(__caseInsensitive && Character.isLowerCase(code))
706       return isANYOF;
707     break;
708   case OpCode._PRINT:
709     if(Character.isSpaceChar(code)) return isANYOF;
710           // Fall through to check if the character is alphanumeric,
711     // or a punctuation mark.  Printable characters are either
712     // alphanumeric, punctuation marks, or spaces.
713   case OpCode._GRAPH:
714     if(Character.isLetterOrDigit(code)) return isANYOF;
715           // Fall through to check if the character is a punctuation mark.
716           // Graph characters are either alphanumeric or punctuation.
717   case OpCode._PUNCT:
718     switch ( Character.getType(code) ) {
719       case Character.DASH_PUNCTUATION:
720       case Character.START_PUNCTUATION:
721       case Character.END_PUNCTUATION:
722       case Character.CONNECTOR_PUNCTUATION:
723       case Character.OTHER_PUNCTUATION:
724             case Character.MATH_SYMBOL:
725             case Character.CURRENCY_SYMBOL:
726             case Character.MODIFIER_SYMBOL:
727         return isANYOF;
728       default:
729         break;
730       }
731     break;
732   case OpCode._XDIGIT:
733     if( (code >= '0' && code <= '9') ||
734         (code >= 'a' && code <= 'f') ||
735         (code >= 'A' && code <= 'F')) return isANYOF;
736     break;
737   case OpCode._ASCII:
738     if(code < 0x80)return isANYOF;
739   }
740       } 
741     }
742     return !isANYOF;
743   }
744   
745   private boolean __tryExpression(int offset) {
746     int count;
747     
748     __inputOffset = offset;
749     __lastParen   = 0;
750     __expSize     = 0;
751 
752     if(__numParentheses > 0) {
753       for(count=0; count <= __numParentheses; count++) {
754   __beginMatchOffsets[count] = OpCode._NULL_OFFSET;
755   __endMatchOffsets[count]   = OpCode._NULL_OFFSET;
756       }
757     }
758 
759     if(__match(1)){
760       __beginMatchOffsets[0] = offset;
761       __endMatchOffsets[0]   = __inputOffset;
762       return true;
763     }
764 
765     return false;
766   }
767     
768 
769   private int __repeat(int offset, int max) {
770     int scan, eol, operand, ret;
771     char ch;
772     char op;
773 
774     scan = __inputOffset;
775     eol  = __eol;
776 
777     if(max != Character.MAX_VALUE && max < eol - scan)
778       eol = scan + max;
779 
780     operand = OpCode._getOperand(offset);
781 
782     switch(op = __program[offset]) {
783 
784     case OpCode._ANY:
785       while(scan < eol && __input[scan] != '\n')
786   ++scan;
787       break;
788 
789     case OpCode._SANY:
790       scan = eol;
791       break;
792 
793     case OpCode._EXACTLY:
794       ++operand;
795       while(scan < eol && __program[operand] == __input[scan])
796   ++scan;
797       break;
798 
799     case OpCode._ANYOF:
800       if(scan < eol && (ch = __input[scan]) < 256) {
801   while((ch < 256  ) && (__program[operand + (ch >> 4)] & (1 << (ch & 0xf))) == 0) {
802     if(++scan < eol)
803       ch = __input[scan];
804     else
805       break;
806   }
807       }
808       break;
809 
810     case OpCode._ANYOFUN:
811     case OpCode._NANYOFUN:
812       if(scan < eol) {
813   ch = __input[scan];
814   while(__matchUnicodeClass(ch, __program, operand, op)){
815     if(++scan < eol)
816       ch = __input[scan];
817     else
818       break;
819   }
820       }
821       break;
822 
823     case OpCode._ALNUM:
824       while(scan < eol && OpCode._isWordCharacter(__input[scan]))
825   ++scan;
826       break;
827 
828     case OpCode._NALNUM:
829       while(scan < eol && !OpCode._isWordCharacter(__input[scan]))
830   ++scan;
831       break;
832 
833     case OpCode._SPACE:
834       while(scan < eol && Character.isWhitespace(__input[scan]))
835   ++scan;
836       break;
837 
838     case OpCode._NSPACE:
839       while(scan < eol && !Character.isWhitespace(__input[scan]))
840   ++scan;
841       break;
842 
843     case OpCode._DIGIT:
844       while(scan < eol && Character.isDigit(__input[scan]))
845   ++scan;
846       break;
847 
848     case OpCode._NDIGIT:
849       while(scan < eol && !Character.isDigit(__input[scan]))
850   ++scan;
851       break;
852 
853     default:
854       break;
855 
856     }
857 
858     ret = scan - __inputOffset;
859     __inputOffset = scan;
860 
861     return ret;
862   }
863 
864 
865   private boolean __match(int offset) {
866     char nextChar, op;
867     int scan, next, input, maxScan, current, line, arg;
868     boolean inputRemains = true, minMod = false;
869     Perl5Repetition rep;
870 
871 
872     input    = __inputOffset;
873     inputRemains = (input < __endOffset);
874     nextChar = (inputRemains ? __input[input] : __EOS);
875 
876     scan     = offset;
877     maxScan  = __program.length;
878 
879     while(scan < maxScan /*&& scan > 0*/){
880       next = OpCode._getNext(__program, scan);
881 
882       switch(op = __program[scan]) {
883 
884       case OpCode._BOL:
885   if(input == __bol ? __previousChar == '\n' :
886      (__multiline && (inputRemains || input < __eol) && 
887       __input[input - 1] == '\n'))
888     break;
889   return false;
890 
891       case OpCode._MBOL:
892   if(input == __bol ? __previousChar == '\n' :
893      ((inputRemains || input < __eol) && __input[input - 1] == '\n'))
894     break;
895   return false;
896 
897       case OpCode._SBOL:
898   if(input == __bol && __previousChar == '\n')
899     break;
900   return false;
901 
902       case OpCode._GBOL:
903   if(input == __bol)
904     break;
905   return true;
906 
907       case OpCode._EOL :
908   if((inputRemains || input < __eol) && nextChar != '\n')
909     return false;
910   if(!__multiline && __eol - input > 1)
911     return false;
912   break;
913 
914       case OpCode._MEOL:
915   if((inputRemains || input < __eol) && nextChar != '\n')
916     return false;
917   break;
918 
919       case OpCode._SEOL:
920   if((inputRemains || input < __eol) && nextChar != '\n')
921     return false;
922   if(__eol - input > 1)
923     return false;
924   break;
925 
926       case OpCode._SANY:
927   if(!inputRemains && input >= __eol)
928     return false;
929   inputRemains = (++input < __endOffset);
930   nextChar = (inputRemains ? __input[input] : __EOS);
931   break;
932 
933       case OpCode._ANY:
934   if((!inputRemains && input >= __eol) || nextChar == '\n')
935     return false;
936   inputRemains = (++input < __endOffset);
937   nextChar = (inputRemains ? __input[input] : __EOS);
938   break;
939 
940       case OpCode._EXACTLY:
941   current = OpCode._getOperand(scan);
942   line = __program[current++];
943 
944   if(__program[current] != nextChar)
945     return false;
946   if(__eol - input < line)
947     return false;
948 
949   if(line > 1 && !__compare(__program, current, __input, input, line))
950     return false;
951 
952   input+=line;
953   inputRemains = (input < __endOffset);
954   nextChar = (inputRemains ? __input[input] : __EOS);
955   break;
956 
957       case OpCode._ANYOF:
958   current = OpCode._getOperand(scan);
959 
960   if(nextChar == __EOS && inputRemains)
961     nextChar = __input[input];
962 
963   if(nextChar >= 256 || (__program[current + (nextChar >> 4)] &
964       (1 << (nextChar & 0xf))) != 0)
965     return false;
966 
967   if(!inputRemains && input >= __eol)
968     return false;
969 
970   inputRemains = (++input < __endOffset);
971   nextChar = (inputRemains ? __input[input] : __EOS);
972   break;
973 
974       case OpCode._ANYOFUN:
975       case OpCode._NANYOFUN:
976   current = OpCode._getOperand(scan);
977 
978   if(nextChar == __EOS && inputRemains)
979     nextChar = __input[input];
980 
981   if(!__matchUnicodeClass(nextChar, __program, current, op))
982     return false;
983 
984   if(!inputRemains && input >= __eol)
985     return false;
986 
987   inputRemains = (++input < __endOffset);
988   nextChar = (inputRemains ? __input[input] : __EOS);
989   break;
990 
991       case OpCode._ALNUM:
992   if(!inputRemains)
993     return false;
994   if(!OpCode._isWordCharacter(nextChar))
995     return false;
996   inputRemains = (++input < __endOffset);
997   nextChar = (inputRemains ? __input[input] : __EOS);
998   break;
999 
1000      case OpCode._NALNUM:
1001  if(!inputRemains && input >= __eol)
1002    return false;
1003  if(OpCode._isWordCharacter(nextChar))
1004    return false;
1005  inputRemains = (++input < __endOffset);
1006  nextChar = (inputRemains ? __input[input] : __EOS);
1007  break;
1008
1009
1010      case OpCode._NBOUND:
1011      case OpCode._BOUND:
1012  boolean a, b;
1013
1014  if(input == __bol)
1015    a = OpCode._isWordCharacter(__previousChar);
1016  else
1017    a = OpCode._isWordCharacter(__input[input - 1]);
1018
1019  b = OpCode._isWordCharacter(nextChar);
1020
1021  if((a == b) == (__program[scan] == OpCode._BOUND))
1022    return false;
1023  break;
1024
1025      case OpCode._SPACE:
1026  if(!inputRemains && input >= __eol)
1027    return false;
1028  if(!Character.isWhitespace(nextChar))
1029    return false;
1030  inputRemains = (++input < __endOffset);
1031  nextChar = (inputRemains ? __input[input] : __EOS);
1032  break;
1033
1034
1035      case OpCode._NSPACE:
1036  if(!inputRemains)
1037    return false;
1038  if(Character.isWhitespace(nextChar))
1039    return false;
1040  inputRemains = (++input < __endOffset);
1041  nextChar = (inputRemains ? __input[input] : __EOS);
1042  break;
1043
1044      case OpCode._DIGIT:
1045  if(!Character.isDigit(nextChar))
1046    return false;
1047  inputRemains = (++input < __endOffset);
1048  nextChar = (inputRemains ? __input[input] : __EOS);
1049  break;
1050
1051      case OpCode._NDIGIT:
1052  if(!inputRemains && input >= __eol)
1053    return false;
1054  if(Character.isDigit(nextChar))
1055    return false;
1056  inputRemains = (++input < __endOffset);
1057  nextChar = (inputRemains ? __input[input] : __EOS);
1058  break;
1059
1060      case OpCode._REF:
1061  arg = OpCode._getArg1(__program, scan);
1062  current = __beginMatchOffsets[arg];
1063
1064  if(current == OpCode._NULL_OFFSET)
1065    return false;
1066
1067  if(__endMatchOffsets[arg] == OpCode._NULL_OFFSET)
1068    return false;
1069
1070  if(current == __endMatchOffsets[arg])
1071    break;
1072
1073  if(__input[current] != nextChar)
1074    return false;
1075
1076  line = __endMatchOffsets[arg] - current;
1077
1078  if(input + line > __eol)
1079    return false;
1080
1081  if(line > 1 && !__compare(__input, current, __input, input, line))
1082    return false;
1083
1084  input+=line;
1085  inputRemains = (input < __endOffset);
1086  nextChar     = (inputRemains ? __input[input] : __EOS);
1087  break;
1088
1089      case OpCode._NOTHING:
1090  break;
1091
1092      case OpCode._BACK:
1093  break;
1094
1095      case OpCode._OPEN:
1096  arg = OpCode._getArg1(__program, scan);
1097  __beginMatchOffsets[arg] = input;
1098
1099  if(arg > __expSize)
1100    __expSize = arg;
1101  break;
1102
1103      case OpCode._CLOSE:
1104  arg = OpCode._getArg1(__program, scan);
1105  __endMatchOffsets[arg] = input;
1106
1107  if(arg > __lastParen)
1108    __lastParen = arg;
1109  break;
1110
1111      case OpCode._CURLYX:
1112  rep = new Perl5Repetition();
1113  rep._lastRepetition = __currentRep;
1114  __currentRep = rep;
1115
1116  rep._parenFloor = __lastParen;
1117  rep._numInstances = -1;
1118  rep._min    = OpCode._getArg1(__program, scan);
1119  rep._max    = OpCode._getArg2(__program, scan);
1120  rep._scan   = OpCode._getNextOperator(scan) + 2;
1121  rep._next   = next;
1122  rep._minMod = minMod;
1123  // Must initialize to -1 because if we initialize to 0 and are
1124  // at the beginning of the input the OpCode._WHILEM case will
1125  // not work right.
1126  rep._lastLocation = -1;
1127  __inputOffset = input;
1128
1129  // use minMod as temporary
1130  minMod = __match(OpCode._getPrevOperator(next));
1131
1132  // leave scope call not pertinent?
1133  __currentRep = rep._lastRepetition;
1134  return minMod;
1135
1136      case OpCode._WHILEM:
1137  rep = __currentRep;
1138
1139  arg = rep._numInstances + 1;
1140  __inputOffset = input;
1141
1142  if(input == rep._lastLocation) {
1143    __currentRep = rep._lastRepetition;
1144    line = __currentRep._numInstances;
1145    if(__match(rep._next))
1146      return true;
1147    __currentRep._numInstances = line;
1148    __currentRep = rep;
1149    return false;
1150  }
1151
1152  if(arg < rep._min) {
1153    rep._numInstances = arg;
1154    rep._lastLocation = input;
1155    if(__match(rep._scan))
1156      return true;
1157    rep._numInstances = arg - 1;
1158    return false;
1159  }
1160
1161  if(rep._minMod) {
1162    __currentRep = rep._lastRepetition;
1163    line = __currentRep._numInstances;
1164    if(__match(rep._next))
1165      return true;
1166    __currentRep._numInstances = line;
1167    __currentRep = rep;
1168
1169    if(arg >= rep._max)
1170      return false;
1171
1172    __inputOffset = input;
1173    rep._numInstances = arg;
1174    rep._lastLocation = input;
1175
1176    if(__match(rep._scan))
1177      return true;
1178
1179    rep._numInstances = arg - 1;
1180    return false;
1181  }
1182
1183  if(arg < rep._max) {
1184    __pushState(rep._parenFloor);
1185    rep._numInstances = arg;
1186    rep._lastLocation = input;
1187    if(__match(rep._scan))
1188      return true;
1189    __popState();
1190    __inputOffset = input;
1191  }
1192
1193  __currentRep = rep._lastRepetition;
1194  line = __currentRep._numInstances;
1195  if(__match(rep._next))
1196    return true;
1197
1198  rep._numInstances = line;
1199  __currentRep = rep;
1200  rep._numInstances = arg - 1;
1201  return false;
1202
1203      case OpCode._BRANCH:
1204  if(__program[next] != OpCode._BRANCH)
1205    next = OpCode._getNextOperator(scan);
1206  else {
1207    int lastParen;
1208
1209    lastParen = __lastParen;
1210
1211    do {
1212
1213      __inputOffset = input;
1214
1215      if(__match(OpCode._getNextOperator(scan)))
1216        return true;
1217
1218      for(arg = __lastParen; arg > lastParen; --arg)
1219        //__endMatchOffsets[arg] = 0;
1220        __endMatchOffsets[arg] = OpCode._NULL_OFFSET;
1221      __lastParen = arg;
1222
1223      scan = OpCode._getNext(__program, scan);
1224    } while(scan != OpCode._NULL_OFFSET &&
1225      __program[scan] == OpCode._BRANCH);
1226    return false;
1227  }
1228
1229  break;
1230
1231      case OpCode._MINMOD:
1232  minMod = true;
1233  break;
1234
1235
1236      case OpCode._CURLY:
1237      case OpCode._STAR:
1238      case OpCode._PLUS:
1239  if(op == OpCode._CURLY) {
1240    line = OpCode._getArg1(__program, scan);
1241    arg  = OpCode._getArg2(__program, scan);
1242    scan = OpCode._getNextOperator(scan) + 2;
1243  } else if(op == OpCode._STAR) {
1244    line = 0;
1245    arg  = Character.MAX_VALUE;
1246    scan = OpCode._getNextOperator(scan);
1247  } else {
1248    line = 1;
1249    arg  = Character.MAX_VALUE;
1250    scan = OpCode._getNextOperator(scan);
1251  }
1252
1253  if(__program[next] == OpCode._EXACTLY) {
1254    nextChar = __program[OpCode._getOperand(next) + 1];
1255    current  = 0;
1256  } else {
1257    nextChar = __EOS;
1258    current  = -1000;
1259  }
1260  __inputOffset = input;
1261
1262  if(minMod) {
1263    minMod = false;
1264
1265    if(line > 0 && __repeat(scan, line) < line)
1266      return false;
1267
1268
1269    while(arg >= line || (arg == Character.MAX_VALUE && line > 0)) {
1270      // there may be a bug here with respect to
1271      // __inputOffset >= __endOffset, but it seems to be right for
1272      // now.  the issue is with __inputOffset being reset later.
1273      // is this test really supposed to happen here?
1274      if(current == -1000 || __inputOffset >= __endOffset ||
1275         __input[__inputOffset] == nextChar) {
1276        if(__match(next))
1277    return true;
1278      }
1279
1280      __inputOffset = input + line;
1281
1282      if(__repeat(scan, 1) != 0) {
1283        ++line;
1284        __inputOffset = input + line;
1285      } else
1286        return false;
1287    }
1288
1289  } else {
1290    arg = __repeat(scan, arg);
1291
1292    if(line < arg && OpCode._opType[__program[next]] == OpCode._EOL &&
1293       ((!__multiline && __program[next] != OpCode._MEOL) ||
1294              __program[next] == OpCode._SEOL))
1295      line = arg;
1296
1297    while(arg >= line) {
1298      // there may be a bug here with respect to
1299      // __inputOffset >= __endOffset, but it seems to be right for
1300      // now.  the issue is with __inputOffset being reset later.
1301      // is this test really supposed to happen here?
1302      if(current == -1000 || __inputOffset >= __endOffset ||
1303         __input[__inputOffset] == nextChar) {
1304        if(__match(next))
1305    return true;
1306      }
1307
1308      --arg;
1309      __inputOffset = input + arg;
1310    }
1311  }
1312
1313  return false;
1314
1315      case OpCode._SUCCEED:
1316      case OpCode._END:
1317  __inputOffset = input;
1318  // This enforces the rule that two consecutive matches cannot have
1319  // the same end offset.
1320  if(__inputOffset == __lastMatchInputEndOffset)
1321    return false;
1322  return true;
1323
1324      case OpCode._IFMATCH:
1325  __inputOffset = input;
1326  scan = OpCode._getNextOperator(scan);
1327  if(!__match(scan))
1328    return false;
1329  break;
1330
1331      case OpCode._UNLESSM:
1332  __inputOffset = input;
1333  scan = OpCode._getNextOperator(scan);
1334  if(__match(scan))
1335    return false;
1336  break;
1337
1338
1339      default:
1340  // todo: Need to throw an exception here.
1341
1342      } // end switch
1343
1344      //scan = (next > 0 ? next : 0);
1345      scan = next;
1346    } // end while scan
1347
1348
1349
1350    return false;
1351  }
1352
1353
1354  /**
1355   * Set whether or not subsequent calls to {@link #matches matches()}
1356   * or {@link #contains contains()} should treat the input as
1357   * consisting of multiple lines.  The default behavior is for 
1358   * input to be treated as consisting of multiple lines.  This method
1359   * should only be called if the Perl5Pattern used for a match was
1360   * compiled without either of the Perl5Compiler.MULTILINE_MASK or
1361   * Perl5Compiler.SINGLELINE_MASK flags, and you want to alter the
1362   * behavior of how the <b>^</b>, <b>$</b>, and <b>.</b> metacharacters are
1363   * interpreted on the fly.  The compilation options used when compiling
1364   * a pattern ALWAYS override the behavior specified by setMultiline().  See
1365   * {@link Perl5Compiler} for more details.
1366   * <p>
1367   * @param multiline  If set to true treats the input as consisting of
1368   *        multiple lines with respect to the <b>^</b> and <b>$</b> 
1369   *        metacharacters.  If set to false treats the input as consisting
1370   *        of a single line with respect to the <b>^</b> and <b>$</b> 
1371   *        metacharacters.
1372   */
1373  public void setMultiline(boolean multiline) { __multiline = multiline; }
1374
1375
1376  /**
1377   * @return True if the matcher is treating input as consisting of multiple
1378   *         lines with respect to the <b>^</b> and <b>$</b> metacharacters,
1379   *         false otherwise.
1380   */
1381  public boolean isMultiline() { return __multiline; }
1382
1383  char[] _toLower(char[] input) {
1384    int current;
1385    char[] inp;
1386    // todo:
1387    // Certainly not the best way to do case insensitive matching.
1388    // Must definitely change this in some way, but for now we
1389    // do what Perl does and make a copy of the input, converting
1390    // it all to lowercase.  This is truly better handled in the
1391    // compilation phase.
1392    inp = new char[input.length];
1393    System.arraycopy(input, 0, inp, 0, input.length);
1394    input = inp;
1395
1396    // todo: Need to inline toLowerCase()
1397    for(current = 0; current < input.length; current++)
1398      if(Character.isUpperCase(input[current]))
1399  input[current] = Character.toLowerCase(input[current]);
1400
1401    return input;
1402  }
1403
1404
1405  /**
1406   * Determines if a prefix of a string (represented as a char[])
1407   * matches a given pattern, starting from a given offset into the string.
1408   * If a prefix of the string matches the pattern, a MatchResult instance
1409   * representing the match is made accesible via
1410   * {@link #getMatch()}.
1411   * <p>
1412   * This method is useful for certain common token identification tasks
1413   * that are made more difficult without this functionality.
1414   * <p>
1415   * @param input  The char[] to test for a prefix match.
1416   * @param pattern  The Pattern to be matched.
1417   * @param offset The offset at which to start searching for the prefix.
1418   * @return True if input matches pattern, false otherwise.
1419   */
1420  public boolean matchesPrefix(char[] input, Pattern pattern, int offset) {
1421    Perl5Pattern expression;
1422
1423    expression = (Perl5Pattern)pattern;
1424    __originalInput = input;
1425    if(expression._isCaseInsensitive)
1426      input = _toLower(input);
1427
1428    __initInterpreterGlobals(expression, input, 0, input.length, offset);
1429
1430    __lastSuccess = __tryExpression(offset);
1431    __lastMatchResult = null;
1432
1433    return __lastSuccess;
1434  }
1435
1436
1437  /**
1438   * Determines if a prefix of a string (represented as a char[])
1439   * matches a given pattern.
1440   * If a prefix of the string matches the pattern, a MatchResult instance
1441   * representing the match is made accesible via
1442   * {@link #getMatch()}.
1443   * <p>
1444   * This method is useful for certain common token identification tasks
1445   * that are made more difficult without this functionality.
1446   * <p>
1447   * @param input  The char[] to test for a prefix match.
1448   * @param pattern  The Pattern to be matched.
1449   * @return True if input matches pattern, false otherwise.
1450   */
1451  public boolean matchesPrefix(char[] input, Pattern pattern) {
1452    return matchesPrefix(input, pattern, 0);
1453  }
1454
1455
1456  /**
1457   * Determines if a prefix of a string matches a given pattern.
1458   * If a prefix of the string matches the pattern, a MatchResult instance
1459   * representing the match is made accesible via
1460   * {@link #getMatch()}.
1461   * <p>
1462   * This method is useful for certain common token identification tasks
1463   * that are made more difficult without this functionality.
1464   * <p>
1465   * @param input  The String to test for a prefix match.
1466   * @param pattern  The Pattern to be matched.
1467   * @return True if input matches pattern, false otherwise.
1468   */
1469  public boolean matchesPrefix(String input, Pattern pattern) {
1470    return matchesPrefix(input.toCharArray(), pattern, 0);
1471  }
1472
1473  /**
1474   * Determines if a prefix of a PatternMatcherInput instance
1475   * matches a given pattern.  If there is a match, a MatchResult instance
1476   * representing the match is made accesible via
1477   * {@link #getMatch()}.  Unlike the
1478   * {@link #contains(PatternMatcherInput, Pattern)}
1479   * method, the current offset of the PatternMatcherInput argument
1480   * is not updated.  However, unlike the
1481   * {@link #matches matches(PatternMatcherInput, Pattern)} method,
1482   * matchesPrefix() will start its search from the current offset
1483   * rather than the begin offset of the PatternMatcherInput.
1484   * <p>
1485   * This method is useful for certain common token identification tasks
1486   * that are made more difficult without this functionality.
1487   * <p>
1488   * @param input  The PatternMatcherInput to test for a prefix match.
1489   * @param pattern  The Pattern to be matched.
1490   * @return True if input matches pattern, false otherwise.
1491   */
1492  public boolean matchesPrefix(PatternMatcherInput input, Pattern pattern) {
1493    char[] inp;
1494    Perl5Pattern expression;
1495
1496    expression = (Perl5Pattern)pattern;
1497
1498    __originalInput = input._originalBuffer;
1499    if(expression._isCaseInsensitive) {
1500      if(input._toLowerBuffer == null)
1501  input._toLowerBuffer = _toLower(__originalInput);
1502      inp = input._toLowerBuffer;
1503    } else
1504      inp = __originalInput;
1505
1506    __initInterpreterGlobals(expression, inp, input._beginOffset,
1507           input._endOffset, input._currentOffset);
1508    __lastSuccess = __tryExpression(input._currentOffset);
1509    __lastMatchResult = null;
1510
1511    return __lastSuccess;
1512  }
1513
1514
1515
1516  /**
1517   * Determines if a string (repres