Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/apache/xerces/utils/regex/RegexParser.java


1   /*
2    * The Apache Software License, Version 1.1
3    *
4    *
5    * Copyright (c) 1999,2000 The Apache Software Foundation.  All rights 
6    * reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution,
21   *    if any, must include the following acknowledgment:  
22   *       "This product includes software developed by the
23   *        Apache Software Foundation (http://www.apache.org/)."
24   *    Alternately, this acknowledgment may appear in the software itself,
25   *    if and wherever such third-party acknowledgments normally appear.
26   *
27   * 4. The names "Xerces" and "Apache Software Foundation" must
28   *    not be used to endorse or promote products derived from this
29   *    software without prior written permission. For written 
30   *    permission, please contact apache@apache.org.
31   *
32   * 5. Products derived from this software may not be called "Apache",
33   *    nor may "Apache" appear in their name, without prior written
34   *    permission of the Apache Software Foundation.
35   *
36   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47   * SUCH DAMAGE.
48   * ====================================================================
49   *
50   * This software consists of voluntary contributions made by many
51   * individuals on behalf of the Apache Software Foundation and was
52   * originally based on software copyright (c) 1999, International
53   * Business Machines, Inc., http://www.apache.org.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  package org.apache.xerces.utils.regex;
59  
60  
61  import java.util.Locale;
62  import java.util.MissingResourceException;
63  import java.util.ResourceBundle;
64  import java.util.Vector;
65  
66  /**
67   * A Regular Expression Parser.
68   */
69  class RegexParser {
70      static final int T_CHAR = 0;
71      static final int T_EOF = 1;
72      static final int T_OR = 2;                  // '|'
73      static final int T_STAR = 3;                // '*'
74      static final int T_PLUS = 4;                // '+'
75      static final int T_QUESTION = 5;            // '?'
76      static final int T_LPAREN = 6;              // '('
77      static final int T_RPAREN = 7;              // ')'
78      static final int T_DOT = 8;                 // '.'
79      static final int T_LBRACKET = 9;            // '['
80      static final int T_BACKSOLIDUS = 10;        // '\'
81      static final int T_CARET = 11;              // '^'
82      static final int T_DOLLAR = 12;             // '$'
83      static final int T_LPAREN2 = 13;            // '(?:'
84      static final int T_LOOKAHEAD = 14;          // '(?='
85      static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
86      static final int T_LOOKBEHIND = 16;         // '(?<='
87      static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
88      static final int T_INDEPENDENT = 18;        // '(?>'
89      static final int T_SET_OPERATIONS = 19;     // '(?['
90      static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
91      static final int T_COMMENT = 21;            // '(?#'
92      static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
93      static final int T_CONDITION = 23;          // '(?('
94      static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
95  
96      static class ReferencePosition {
97          int refNumber;
98          int position;
99          ReferencePosition(int n, int pos) {
100             this.refNumber = n;
101             this.position = pos;
102         }
103     }
104 
105     int offset;
106     String regex;
107     int regexlen;
108     int options;
109     ResourceBundle resources;
110     int chardata;
111     int nexttoken;
112     static protected final int S_NORMAL = 0;
113     static protected final int S_INBRACKETS = 1;
114     static protected final int S_INXBRACKETS = 2;
115     int context = S_NORMAL;
116     int parennumber = 1;
117     boolean hasBackReferences;
118     Vector references = null;
119 
120     public RegexParser() {
121         //TODO IBM-JR this.setLocale(Locale.getDefault());
122     }
123     public RegexParser(Locale locale) {
124         //TODO IBM-JR this.setLocale(locale);
125     }
126 
127     public void setLocale(Locale locale) {
128         try {
129             this.resources = ResourceBundle.getBundle("org.apache.xerces.utils.regex.message", locale);
130         } catch (MissingResourceException mre) {
131             throw new RuntimeException("Installation Problem???  Couldn't load messages: "
132                                        +mre.getMessage());
133         }
134     }
135 
136     final ParseException ex(String key, int loc) {
137         return new ParseException(this.resources.getString(key), loc);
138     }
139 
140     private final boolean isSet(int flag) {
141         return (this.options & flag) == flag;
142     }
143 
144     synchronized Token parse(String regex, int options) throws ParseException {
145         this.options = options;
146         this.offset = 0;
147         this.setContext(S_NORMAL);
148         this.parennumber = 1;
149         this.hasBackReferences = false;
150         this.regex = regex;
151         if (this.isSet(RegularExpression.EXTENDED_COMMENT))
152             this.regex = REUtil.stripExtendedComment(this.regex);
153         this.regexlen = this.regex.length();
154 
155 
156         this.next();
157         Token ret = this.parseRegex();
158         if (this.offset != this.regexlen)
159             throw ex("parser.parse.1", this.offset);
160         if (this.references != null) {
161             for (int i = 0;  i < this.references.size();  i ++) {
162                 ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
163                 if (this.parennumber <= position.refNumber)
164                     throw ex("parser.parse.2", position.position);
165             }
166             this.references.removeAllElements();
167         }
168         return ret;
169     }
170 
171     /*
172     public RegularExpression createRegex(String regex, int options) throws ParseException {
173         Token tok = this.parse(regex, options);
174         return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
175     }
176     */
177 
178     protected final void setContext(int con) {
179         this.context = con;
180     }
181 
182     final int read() {
183         return this.nexttoken;
184     }
185 
186     final void next() {
187         if (this.offset >= this.regexlen) {
188             this.chardata = -1;
189             this.nexttoken = T_EOF;
190             return;
191         }
192 
193         int ret;
194         int ch = this.regex.charAt(this.offset++);
195         this.chardata = ch;
196 
197         if (this.context == S_INBRACKETS) {
198             // In a character class, this.chardata has one character, that is to say,
199             // a pair of surrogates is composed and stored to this.chardata.
200             switch (ch) {
201               case '\\':
202                 ret = T_BACKSOLIDUS;
203                 if (this.offset >= this.regexlen)
204                     throw ex("parser.next.1", this.offset-1);
205                 this.chardata = this.regex.charAt(this.offset++);
206                 break;
207 
208               case '-':
209                 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
210                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
211                     this.offset++;
212                     ret = T_XMLSCHEMA_CC_SUBTRACTION;
213                 } else
214                     ret = T_CHAR;
215                 break;
216 
217               case '[':
218                 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
219                     && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
220                     this.offset++;
221                     ret = T_POSIX_CHARCLASS_START;
222                     break;
223                 } // Through down
224               default:
225                 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
226                     int low = this.regex.charAt(this.offset);
227                     if (REUtil.isLowSurrogate(low)) {
228                         this.chardata = REUtil.composeFromSurrogates(ch, low);
229                         this.offset ++;
230                     }
231                 }
232                 ret = T_CHAR;
233             }
234             this.nexttoken = ret;
235             return;
236         }
237 
238         switch (ch) {
239           case '|': ret = T_OR;             break;
240           case '*': ret = T_STAR;           break;
241           case '+': ret = T_PLUS;           break;
242           case '?': ret = T_QUESTION;       break;
243           case ')': ret = T_RPAREN;         break;
244           case '.': ret = T_DOT;            break;
245           case '[': ret = T_LBRACKET;       break;
246           case '^': ret = T_CARET;          break;
247           case '$': ret = T_DOLLAR;         break;
248           case '(':
249             ret = T_LPAREN;
250             if (this.offset >= this.regexlen)
251                 break;
252             if (this.regex.charAt(this.offset) != '?')
253                 break;
254             if (++this.offset >= this.regexlen)
255                 throw ex("parser.next.2", this.offset-1);
256             ch = this.regex.charAt(this.offset++);
257             switch (ch) {
258               case ':':  ret = T_LPAREN2;            break;
259               case '=':  ret = T_LOOKAHEAD;          break;
260               case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
261               case '[':  ret = T_SET_OPERATIONS;     break;
262               case '>':  ret = T_INDEPENDENT;        break;
263               case '<':
264                 if (this.offset >= this.regexlen)
265                     throw ex("parser.next.2", this.offset-3);
266                 ch = this.regex.charAt(this.offset++);
267                 if (ch == '=') {
268                     ret = T_LOOKBEHIND;
269                 } else if (ch == '!') {
270                     ret = T_NEGATIVELOOKBEHIND;
271                 } else
272                     throw ex("parser.next.3", this.offset-3);
273                 break;
274               case '#':
275                 while (this.offset < this.regexlen) {
276                     ch = this.regex.charAt(this.offset++);
277                     if (ch == ')')  break;
278                 }
279                 if (ch != ')')
280                     throw ex("parser.next.4", this.offset-1);
281                 ret = T_COMMENT;
282                 break;
283               default:
284                 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
285                     this.offset --;
286                     ret = T_MODIFIERS;
287                     break;
288                 } else if (ch == '(') {         // conditional
289                     ret = T_CONDITION;          // this.offsets points the next of '('.
290                     break;
291                 }
292                 throw ex("parser.next.2", this.offset-2);
293             }
294             break;
295             
296           case '\\':
297             ret = T_BACKSOLIDUS;
298             if (this.offset >= this.regexlen)
299                 throw ex("parser.next.1", this.offset-1);
300             this.chardata = this.regex.charAt(this.offset++);
301             break;
302 
303           default:
304             ret = T_CHAR;
305             if (REUtil.isHighSurrogate(this.chardata) && this.offset < this.regexlen)
306                 this.chardata = REUtil.composeFromSurrogates(this.chardata,
307                                                              this.regex.charAt(this.offset++));
308         }
309         this.nexttoken = ret;
310     }
311 
312     /**
313      * regex ::= term (`|` term)*
314      * term ::= factor+
315      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
316      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
317      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
318      * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
319      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block 
320      */
321     Token parseRegex() throws ParseException {
322         Token tok = this.parseTerm();
323         Token parent = null;
324         while (this.read() == T_OR) {
325             this.next();                    // '|'
326             if (parent == null) {
327                 parent = Token.createUnion();
328                 parent.addChild(tok);
329                 tok = parent;
330             }
331             tok.addChild(this.parseTerm());
332         }
333         return tok;
334     }
335 
336     /**
337      * term ::= factor+
338      */
339     Token parseTerm() throws ParseException {
340         int ch = this.read();
341         if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
342             return Token.createEmpty();
343         } else {
344             Token tok = this.parseFactor();
345             Token concat = null;
346             while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
347                 if (concat == null) {
348                     concat = Token.createConcat();
349                     concat.addChild(tok);
350                     tok = concat;
351                 }
352                 concat.addChild(this.parseFactor());
353                 //tok = Token.createConcat(tok, this.parseFactor());
354             }
355             return tok;
356         }
357     }
358 
359     // ----------------------------------------------------------------
360 
361     Token processCaret() throws ParseException {
362         this.next();
363         return Token.token_linebeginning;
364     }
365     Token processDollar() throws ParseException {
366         this.next();
367         return Token.token_lineend;
368     }
369     Token processLookahead() throws ParseException {
370         this.next();
371         Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
372         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
373         this.next();                            // ')'
374         return tok;
375     }
376     Token processNegativelookahead() throws ParseException {
377         this.next();
378         Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
379         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
380         this.next();                            // ')'
381         return tok;
382     }
383     Token processLookbehind() throws ParseException {
384         this.next();
385         Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
386         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
387         this.next();                            // ')'
388         return tok;
389     }
390     Token processNegativelookbehind() throws ParseException {
391         this.next();
392         Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
393         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
394         this.next();                    // ')'
395         return tok;
396     }
397     Token processBacksolidus_A() throws ParseException {
398         this.next();
399         return Token.token_stringbeginning;
400     }
401     Token processBacksolidus_Z() throws ParseException {
402         this.next();
403         return Token.token_stringend2;
404     }
405     Token processBacksolidus_z() throws ParseException {
406         this.next();
407         return Token.token_stringend;
408     }
409     Token processBacksolidus_b() throws ParseException {
410         this.next();
411         return Token.token_wordedge;
412     }
413     Token processBacksolidus_B() throws ParseException {
414         this.next();
415         return Token.token_not_wordedge;
416     }
417     Token processBacksolidus_lt() throws ParseException {
418         this.next();
419         return Token.token_wordbeginning;
420     }
421     Token processBacksolidus_gt() throws ParseException {
422         this.next();
423         return Token.token_wordend;
424     }
425     Token processStar(Token tok) throws ParseException {
426         this.next();
427         if (this.read() == T_QUESTION) {
428             this.next();
429             return Token.createNGClosure(tok);
430         } else
431             return Token.createClosure(tok);
432     }
433     Token processPlus(Token tok) throws ParseException {
434         // X+ -> XX*
435         this.next();
436         if (this.read() == T_QUESTION) {
437             this.next();
438             return Token.createConcat(tok, Token.createNGClosure(tok));
439         } else
440             return Token.createConcat(tok, Token.createClosure(tok));
441     }
442     Token processQuestion(Token tok) throws ParseException {
443         // X? -> X|
444         this.next();
445         Token par = Token.createUnion();
446         if (this.read() == T_QUESTION) {
447             this.next();
448             par.addChild(Token.createEmpty());
449             par.addChild(tok);
450         } else {
451             par.addChild(tok);
452             par.addChild(Token.createEmpty());
453         }
454         return par;
455     }
456     boolean checkQuestion(int off) {
457         return off < this.regexlen && this.regex.charAt(off) == '?';
458     }
459     Token processParen() throws ParseException {
460         this.next();
461         int p = this.parennumber++;
462         Token tok = Token.createParen(this.parseRegex(), p);
463         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
464         this.next();                            // Skips ')'
465         return tok;
466     }
467     Token processParen2() throws ParseException {
468         this.next();
469         Token tok = Token.createParen(this.parseRegex(), 0);
470         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
471         this.next();                            // Skips ')'
472         return tok;
473     }
474     Token processCondition() throws ParseException {
475                                                 // this.offset points the next of '('
476         if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
477                                                 // Parses a condition.
478         int refno = -1;
479         Token condition = null;
480         int ch = this.regex.charAt(this.offset);
481         if ('1' <= ch && ch <= '9') {
482             refno = ch-'0';
483             this.hasBackReferences = true;
484             if (this.references == null)  this.references = new Vector();
485             this.references.addElement(new ReferencePosition(refno, this.offset));
486             this.offset ++;
487             if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
488             this.offset ++;
489         } else {
490             if (ch == '?')  this.offset --; // Points '('.
491             this.next();
492             condition = this.parseFactor();
493             switch (condition.type) {
494               case Token.LOOKAHEAD:
495               case Token.NEGATIVELOOKAHEAD:
496               case Token.LOOKBEHIND:
497               case Token.NEGATIVELOOKBEHIND:
498                 break;
499               case Token.ANCHOR:
500                 if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
501                 break;
502               default:
503                 throw ex("parser.factor.5", this.offset);
504             }
505         }
506                                                 // Parses yes/no-patterns.
507         this.next();
508         Token yesPattern = this.parseRegex();
509         Token noPattern = null;
510         if (yesPattern.type == Token.UNION) {
511             if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
512             noPattern = yesPattern.getChild(1);
513             yesPattern = yesPattern.getChild(0);
514         }
515         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
516         this.next();
517         return Token.createCondition(refno, condition, yesPattern, noPattern);
518     }
519     Token processModifiers() throws ParseException {
520                                                 // this.offset points the next of '?'.
521                                                 // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
522         int add = 0, mask = 0, ch = -1;
523         while (this.offset < this.regexlen) {
524             ch = this.regex.charAt(this.offset);
525             int v = REUtil.getOptionValue(ch);
526             if (v == 0)  break;                 // '-' or ':'?
527             add |= v;
528             this.offset ++;
529         }
530         if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
531         if (ch == '-') {
532             this.offset ++;
533             while (this.offset < this.regexlen) {
534                 ch = this.regex.charAt(this.offset);
535                 int v = REUtil.getOptionValue(ch);
536                 if (v == 0)  break;             // ':'?
537                 mask |= v;
538                 this.offset ++;
539             }
540             if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
541         }
542         Token tok;
543         if (ch == ':') {
544             this.offset ++;
545             this.next();
546             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
547             if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
548             this.next();
549         } else if (ch == ')') {                 // such as (?-i)
550             this.offset ++;
551             this.next();
552             tok = Token.createModifierGroup(this.parseRegex(), add, mask);
553         } else
554             throw ex("parser.factor.3", this.offset);
555 
556         return tok;
557     }
558     Token processIndependent() throws ParseException {
559         this.next();
560         Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
561         if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
562         this.next();                            // Skips ')'
563         return tok;
564     }
565     Token processBacksolidus_c() throws ParseException {
566         int ch2;                                // Must be in 0x0040-0x005f
567         if (this.offset >= this.regexlen
568             || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
569             throw ex("parser.atom.1", this.offset-1);
570         this.next();
571         return Token.createChar(ch2-0x40);
572     }
573     Token processBacksolidus_C() throws ParseException {
574         throw ex("parser.process.1", this.offset);
575     }
576     Token processBacksolidus_i() throws ParseException {
577         Token tok = Token.createChar('i');
578         this.next();
579         return tok;
580     }
581     Token processBacksolidus_I() throws ParseException {
582         throw ex("parser.process.1", this.offset);
583     }
584     Token processBacksolidus_g() throws ParseException {
585         this.next();
586         return Token.getGraphemePattern();
587     }
588     Token processBacksolidus_X() throws ParseException {
589         this.next();
590         return Token.getCombiningCharacterSequence();
591     }
592     Token processBackreference() throws ParseException {
593         int refnum = this.chardata-'0';
594         Token tok = Token.createBackReference(refnum);
595         this.hasBackReferences = true;
596         if (this.references == null)  this.references = new Vector();
597         this.references.addElement(new ReferencePosition(refnum, this.offset-2));
598         this.next();
599         return tok;
600     }
601 
602     // ----------------------------------------------------------------
603 
604     /**
605      * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
606      *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
607      *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
608      *            | '(?#' [^)]* ')'
609      * minmax ::= '{' min (',' max?)? '}'
610      * min ::= [0-9]+
611      * max ::= [0-9]+
612      */
613     Token parseFactor() throws ParseException {        
614         int ch = this.read();
615         Token tok;
616         switch (ch) {
617           case T_CARET:         return this.processCaret();
618           case T_DOLLAR:        return this.processDollar();
619           case T_LOOKAHEAD:     return this.processLookahead();
620           case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
621           case T_LOOKBEHIND:    return this.processLookbehind();
622           case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
623 
624           case T_COMMENT:
625             this.next();
626             return Token.createEmpty();
627 
628           case T_BACKSOLIDUS:
629             switch (this.chardata) {
630               case 'A': return this.processBacksolidus_A();
631               case 'Z': return this.processBacksolidus_Z();
632               case 'z': return this.processBacksolidus_z();
633               case 'b': return this.processBacksolidus_b();
634               case 'B': return this.processBacksolidus_B();
635               case '<': return this.processBacksolidus_lt();
636               case '>': return this.processBacksolidus_gt();
637             }
638                                                 // through down
639         }
640         tok = this.parseAtom();
641         ch = this.read();
642         switch (ch) {
643           case T_STAR:  return this.processStar(tok);
644           case T_PLUS:  return this.processPlus(tok);
645           case T_QUESTION: return this.processQuestion(tok);
646           case T_CHAR:
647             if (this.chardata == '{') {
648                                                 // this.offset -> next of '{'
649                 int off = this.offset;
650                 int min = 0, max = -1;
651                 if (off >= this.regexlen)  break;
652                 ch = this.regex.charAt(off++);
653                 if (ch != ',' && (ch < '0' || ch > '9'))  break;
654                 if (ch != ',') {                // 0-9
655                     min = ch-'0';
656                     while (off < this.regexlen
657                            && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
658                         min = min*10 +ch-'0';
659                         ch = -1;
660                     }
661                     if (ch < 0)  break;
662                 }
663                 //if (off >= this.regexlen)  break;
664                 max = min;
665                 if (ch == ',') {
666                     if (off >= this.regexlen
667                         || ((ch = this.regex.charAt(off++)) < '0' || ch > '9')
668                         && ch != '}')
669                         break;
670                     if (ch == '}') {
671                         max = -1;           // {min,}
672                     } else {
673                         max = ch-'0';       // {min,max}
674                         while (off < this.regexlen
675                                && (ch = this.regex.charAt(off++)) >= '0'
676                                && ch <= '9') {
677                             max = max*10 +ch-'0';
678                             ch = -1;
679                         }
680                         if (ch < 0)  break;
681                         //if (min > max)
682                         //    throw new ParseException("parseFactor(): min > max: "+min+", "+max);
683                     }
684                 }
685                 if (ch != '}')  break;
686                                                 // off -> next of '}'
687                 if (this.checkQuestion(off)) {
688                     tok = Token.createNGClosure(tok);
689                     this.offset = off+1;
690                 } else {
691                     tok = Token.createClosure(tok);
692                     this.offset = off;
693                 }
694                 tok.setMin(min);
695                 tok.setMax(max);
696                 //System.err.println("CLOSURE: "+min+", "+max);
697                 this.next();
698             }
699         }
700         return tok;
701     }
702 
703     /**
704      * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
705      *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
706      *          | '(?>' regex ')'
707      * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
708      */
709     Token parseAtom() throws ParseException {
710         int ch = this.read();
711         Token tok = null;
712         switch (ch) {
713           case T_LPAREN:        return this.processParen();
714           case T_LPAREN2:       return this.processParen2(); // '(?:'
715           case T_CONDITION:     return this.processCondition(); // '(?('
716           case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
717           case T_INDEPENDENT:   return this.processIndependent();
718           case T_DOT:
719             this.next();                    // Skips '.'
720             tok = Token.token_dot;
721             break;
722 
723             /**
724              * char-class ::= '[' ( '^'? range ','?)+ ']'
725              * range ::= '\d' | '\w' | '\s' | category-block | range-char
726              *           | range-char '-' range-char
727              * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
728              * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
729              */
730           case T_LBRACKET:      return this.parseCharacterClass(true);
731           case T_SET_OPERATIONS: return this.parseSetOperations();
732 
733           case T_BACKSOLIDUS:
734             switch (this.chardata) {
735               case 'd':  case 'D':
736               case 'w':  case 'W':
737               case 's':  case 'S':
738                 tok = this.getTokenForShorthand(this.chardata);
739                 this.next();
740                 return tok;
741 
742               case 'e':  case 'f':  case 'n':  case 'r':
743               case 't':  case 'u':  case 'v':  case 'x':
744                 {
745                     int ch2 = this.decodeEscaped();
746                     if (ch2 < 0x10000) {
747                         tok = Token.createChar(ch2);
748                     } else {
749                         tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
750                     }
751                 }
752                 break;
753 
754               case 'c': return this.processBacksolidus_c();
755               case 'C': return this.processBacksolidus_C();
756               case 'i': return this.processBacksolidus_i();
757               case 'I': return this.processBacksolidus_I();
758               case 'g': return this.processBacksolidus_g();
759               case 'X': return this.processBacksolidus_X();
760               case '1':  case '2':  case '3':  case '4':
761               case '5':  case '6':  case '7':  case '8':  case '9':
762                 return this.processBackreference();
763 
764               case 'P':
765               case 'p':
766                 int pstart = this.offset;
767                 tok = processBacksolidus_pP(this.chardata);
768                 if (tok == null)  throw this.ex("parser.atom.5", pstart);
769                 break;
770 
771               default:
772                 tok = Token.createChar(this.chardata);
773             }
774             this.next();
775             break;
776 
777           case T_CHAR:
778             tok = Token.createChar(this.chardata);
779             this.next();
780             break;
781 
782           default:
783             throw this.ex("parser.atom.4", this.offset-1);
784         }
785         return tok;
786     }
787 
788     protected RangeToken processBacksolidus_pP(int c) throws ParseException {
789         boolean positive = c == 'p';
790         this.next();
791         if (this.read() != T_CHAR)  throw this.ex("parser.atom.2", this.offset-1);
792         RangeToken tok;
793         switch (this.chardata) {
794           case 'L':                             // Letter
795             tok = Token.getRange("L", positive);  break;
796           case 'M':                             // Mark
797             tok = Token.getRange("M", positive);  break;
798           case 'N':                             // Number
799             tok = Token.getRange("N", positive);  break;
800           case 'Z':                             // Separator
801             tok = Token.getRange("Z", positive);  break;
802           case 'C':                             // Other
803             tok = Token.getRange("C", positive);  break;
804           case 'P':                             // Punctuation
805             tok = Token.getRange("P", positive);  break;
806           case 'S':                             // Symbol
807             tok = Token.getRange("S", positive);  break;
808           case '{':
809             // this.offset points the next of '{'.
810             //pstart = this.offset;
811             int namestart = this.offset;
812             int nameend = this.regex.indexOf('}', namestart);
813             if (nameend < 0)  throw this.ex("parser.atom.3", this.offset);
814             this.offset = nameend+1;
815             tok = Token.getRange(this.regex.substring(namestart, nameend), positive);
816             /*
817               if (this.isSet(RegularExpression.IGNORE_CASE))
818               tok = RangeToken.createCaseInsensitiveToken(tok);
819             */
820             break;
821 
822           default:
823             throw this.ex("parser.atom.2", this.offset-1);
824         }
825         return tok;
826     }
827 
828     int processCIinCharacterClass(RangeToken tok, int c) {
829         return this.decodeEscaped();
830     }
831 
832     /**
833      * char-class ::= '[' ( '^'? range ','?)+ ']'
834      * range ::= '\d' | '\w' | '\s' | category-block | range-char
835      *           | range-char '-' range-char
836      * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
837      * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
838      */
839     protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
840         this.setContext(S_INBRACKETS);
841         this.next();                            // '['
842         boolean nrange = false;
843         RangeToken base = null;
844         RangeToken tok;
845         if (this.read() == T_CHAR && this.chardata == '^') {
846             nrange = true;
847             this.next();                        // '^'
848             if (useNrange) {
849                 tok = Token.createNRange();
850             } else {
851                 base = Token.createRange();
852                 base.addRange(0, Token.UTF16_MAX);
853                 tok = Token.createRange();
854             }
855         } else {
856             tok = Token.createRange();
857         }
858         int type;
859         boolean firstloop = true;
860         while ((type = this.read()) != T_EOF) {
861             if (type == T_CHAR && this.chardata == ']' && !firstloop)
862                 break;
863             firstloop = false;
864             int c = this.chardata;
865             boolean end = false;
866             if (type == T_BACKSOLIDUS) {
867                 switch (c) {
868                   case 'd':  case 'D':
869                   case 'w':  case 'W':
870                   case 's':  case 'S':
871                     tok.mergeRanges(this.getTokenForShorthand(c));
872                     end = true;
873                     break;
874 
875                   case 'i':  case 'I':
876                   case 'c':  case 'C':
877                     c = this.processCIinCharacterClass(tok, c);
878                     if (c < 0)  end = true;
879                     break;
880                     
881                   case 'p':
882                   case 'P':
883                     int pstart = this.offset;
884                     RangeToken tok2 = this.processBacksolidus_pP(c);
885                     if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
886                     tok.mergeRanges(tok2);
887                     end = true;
888                     break;
889 
890                   default:
891                     c = this.decodeEscaped();
892                 } // \ + c
893             } // backsolidus
894                                                 // POSIX Character class such as [:alnum:]
895             else if (type == T_POSIX_CHARCLASS_START) {
896                 int nameend = this.regex.indexOf(':', this.offset);
897                 if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
898                 boolean positive = true;
899                 if (this.regex.charAt(this.offset) == '^') {
900                     this.offset ++;
901                     positive = false;
902                 }
903                 String name = this.regex.substring(this.offset, nameend);
904                 RangeToken range = Token.getRange(name, positive);
905                 if (range == null)  throw this.ex("parser.cc.3", this.offset);
906                 tok.mergeRanges(range);
907                 end = true;
908                 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
909                     throw this.ex("parser.cc.1", nameend);
910                 this.offset = nameend+2;
911             }
912             this.next();
913             if (!end) {                         // if not shorthands...
914                 if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
915                     tok.addRange(c, c);
916                 } else {
917                     this.next(); // Skips '-'
918                     if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
919                     if (type == T_CHAR && this.chardata == ']') {
920                         tok.addRange(c, c);
921                         tok.addRange('-', '-');
922                     } else {
923                         int rangeend = this.chardata;
924                         if (type == T_BACKSOLIDUS)
925                             rangeend = this.decodeEscaped();
926                         this.next();
927                         tok.addRange(c, rangeend);
928                     }
929                 }
930             }
931             if (this.isSet(RegularExpression.SPECIAL_COMMA)
932                 && this.read() == T_CHAR && this.chardata == ',')
933                 this.next();
934         }
935         if (this.read() == T_EOF)
936             throw this.ex("parser.cc.2", this.offset);
937         if (!useNrange && nrange) {
938             base.subtractRanges(tok);
939             tok = base;
940         }
941         tok.sortRanges();
942         tok.compactRanges();
943         //tok.dumpRanges();
944         /*
945         if (this.isSet(RegularExpression.IGNORE_CASE))
946             tok = RangeToken.createCaseInsensitiveToken(tok);
947         */
948         this.setContext(S_NORMAL);
949         this.next();                    // Skips ']'
950 
951         return tok;
952     }
953     private RangeToken parseCharacterClass_old(boolean useNrange) throws ParseException {
954         this.setContext(S_INBRACKETS);
955         this.next();                            // '['
956         boolean nrange = false;
957         RangeToken base = null;
958         RangeToken tok;
959         if (this.read() == T_CHAR && this.chardata == '^') {
960             nrange = true;
961             this.next();                        // '^'
962             if (useNrange) {
963                 tok = Token.createNRange();
964             } else {
965                 base = Token.createRange();
966                 base.addRange(0, Token.UTF16_MAX);
967                 tok = Token.createRange();
968             }
969         } else {
970             tok = Token.createRange();
971         }
972         int type;
973         while ((type = this.read()) != T_EOF
974                && !(type == T_CHAR && this.chardata == ']')) {
975             int c = this.chardata;
976             /*
977             if (type == T_CHAR && c == '^') {
978                 this.next();
979                 type = this.read();
980                 c = this.chardata;
981                 if (type == T_EOF)  break;
982 
983                 nrange = !nrange;
984                 if (nrange)
985                     tok = Token.createRange();
986                 else {
987                     base.subtractRanges(tok);
988                     tok = base;
989                 }
990             }
991             */
992             boolean end = false;
993             if (type == T_BACKSOLIDUS) {
994                 switch (c) {
995                   case 'd':  case 'D':
996                   case 'w':  case 'W':
997                   case 's':  case 'S':
998                     tok.mergeRanges(this.getTokenForShorthand(c));
999                     end = true;
1000                    break;
1001
1002                  case 'i':  case 'I':
1003                  case 'c':  case 'C':
1004                    c = this.processCIinCharacterClass(tok, c);
1005                    if (c < 0)  end = true;
1006                    break;
1007                    
1008                  case 'p':
1009                  case 'P':
1010                    boolean positive = c ==  'p';
1011                    int pstart = this.offset;
1012                    this.next();
1013                    if (this.read() != T_CHAR)  throw ex("parser.atom.2", this.offset-1);
1014                    RangeToken tok2 = null;
1015                    switch (this.chardata) {
1016                      case 'L':                 // Letter
1017                        tok2 = Token.getRange("L", positive);  break;
1018                      case 'M':                 // Mark
1019                        tok2 = Token.getRange("M", positive);  break;
1020                      case 'N':                 // Number
1021                        tok2 = Token.getRange("N", positive);  break;
1022                      case 'Z':                 // Separator
1023                        tok2 = Token.getRange("Z", positive);  break;
1024                      case 'C':                 // Other
1025                        tok2 = Token.getRange("C", positive);  break;
1026                      case 'P':                 // Punctuation
1027                        tok2 = Token.getRange("P", positive);  break;
1028                      case 'S':                 // Symbol
1029                        tok2 = Token.getRange("S", positive);  break;
1030                      case '{':
1031                        // this.offset points the next of '{'.
1032                        pstart = this.offset;
1033                        int namestart = this.offset;
1034                        int nameend = this.regex.indexOf('}', namestart);
1035                        if (nameend < 0)  throw ex("parser.atom.3", this.offset);
1036                        this.offset = nameend+1;
1037                        tok2 = Token.getRange(this.regex.substring(namestart, nameend), positive);
1038                        break;
1039
1040                      default:
1041                        throw ex("parser.atom.2", this.offset-1);
1042                    }
1043                    if (tok2 == null)  throw ex("parser.atom.5", pstart);
1044                    tok.mergeRanges(tok2);
1045                    end = true;
1046                    break;
1047
1048                  default:
1049                    c = this.decodeEscaped();
1050                } // \ + c
1051            }