Docjar: A Java Source and Docuemnt Enginecom.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/apache/xerces/utils/regex/Token.java


1   /*
2    * The Apache Software License, Version 1.1
3    *
4    *
5    * Copyright (c) 1999,2000 The Apache Software Foundation.  All rights 
6    * reserved.
7    *
8    * Redistribution and use in source and binary forms, with or without
9    * modification, are permitted provided that the following conditions
10   * are met:
11   *
12   * 1. Redistributions of source code must retain the above copyright
13   *    notice, this list of conditions and the following disclaimer. 
14   *
15   * 2. Redistributions in binary form must reproduce the above copyright
16   *    notice, this list of conditions and the following disclaimer in
17   *    the documentation and/or other materials provided with the
18   *    distribution.
19   *
20   * 3. The end-user documentation included with the redistribution,
21   *    if any, must include the following acknowledgment:  
22   *       "This product includes software developed by the
23   *        Apache Software Foundation (http://www.apache.org/)."
24   *    Alternately, this acknowledgment may appear in the software itself,
25   *    if and wherever such third-party acknowledgments normally appear.
26   *
27   * 4. The names "Xerces" and "Apache Software Foundation" must
28   *    not be used to endorse or promote products derived from this
29   *    software without prior written permission. For written 
30   *    permission, please contact apache@apache.org.
31   *
32   * 5. Products derived from this software may not be called "Apache",
33   *    nor may "Apache" appear in their name, without prior written
34   *    permission of the Apache Software Foundation.
35   *
36   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47   * SUCH DAMAGE.
48   * ====================================================================
49   *
50   * This software consists of voluntary contributions made by many
51   * individuals on behalf of the Apache Software Foundation and was
52   * originally based on software copyright (c) 1999, International
53   * Business Machines, Inc., http://www.apache.org.  For more
54   * information on the Apache Software Foundation, please see
55   * <http://www.apache.org/>.
56   */
57  
58  package org.apache.xerces.utils.regex;
59  
60  import java.util.Vector;
61  import java.util.Hashtable;
62  
63  /**
64   * This class represents a node in parse tree.
65   */
66  class Token implements java.io.Serializable {
67      static final boolean COUNTTOKENS = true;
68      static int tokens = 0;
69  
70      static final int CHAR = 0;                  // Literal char
71      static final int DOT = 11;                  // .
72      static final int CONCAT = 1;                // XY
73      static final int UNION = 2;                 // X|Y|Z
74      static final int CLOSURE = 3;               // X*
75      static final int RANGE = 4;                 // [a-zA-Z] etc.
76      static final int NRANGE = 5;                // [^a-zA-Z] etc.
77      static final int PAREN = 6;                 // (X) or (?:X)
78      static final int EMPTY = 7;                 //
79      static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
80      static final int NONGREEDYCLOSURE = 9;      // *? +?
81      static final int STRING = 10;               // strings
82      static final int BACKREFERENCE = 12;        // back references
83      static final int LOOKAHEAD = 20;            // (?=...)
84      static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
85      static final int LOOKBEHIND = 22;           // (?<=...)
86      static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
87      static final int INDEPENDENT = 24;          // (?>...)
88      static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
89      static final int CONDITION = 26;            // (?(...)yes|no)
90  
91      static final int UTF16_MAX = 0x10ffff;
92  
93      int type;
94  
95      static protected Token token_dot;
96      static protected Token token_0to9;
97      static protected Token token_wordchars;
98      static protected Token token_not_0to9;
99      static protected Token token_not_wordchars;
100     static protected Token token_spaces;
101     static protected Token token_not_spaces;
102     static protected Token token_empty;
103     static protected Token token_linebeginning;
104     static protected Token token_linebeginning2;
105     static protected Token token_lineend;
106     static protected Token token_stringbeginning;
107     static protected Token token_stringend;
108     static protected Token token_stringend2;
109     static protected Token token_wordedge;
110     static protected Token token_not_wordedge;
111     static protected Token token_wordbeginning;
112     static protected Token token_wordend;
113     static {
114         Token.token_empty = new Token(Token.EMPTY);
115 
116         Token.token_linebeginning = Token.createAnchor('^');
117         Token.token_linebeginning2 = Token.createAnchor('@');
118         Token.token_lineend = Token.createAnchor('$');
119         Token.token_stringbeginning = Token.createAnchor('A');
120         Token.token_stringend = Token.createAnchor('z');
121         Token.token_stringend2 = Token.createAnchor('Z');
122         Token.token_wordedge = Token.createAnchor('b');
123         Token.token_not_wordedge = Token.createAnchor('B');
124         Token.token_wordbeginning = Token.createAnchor('<');
125         Token.token_wordend = Token.createAnchor('>');
126 
127         Token.token_dot = new Token(Token.DOT);
128 
129         Token.token_0to9 = Token.createRange();
130         Token.token_0to9.addRange('0', '9');
131         Token.token_wordchars = Token.createRange();
132         Token.token_wordchars.addRange('0', '9');
133         Token.token_wordchars.addRange('A', 'Z');
134         Token.token_wordchars.addRange('_', '_');
135         Token.token_wordchars.addRange('a', 'z');
136         Token.token_spaces = Token.createRange();
137         Token.token_spaces.addRange('\t', '\t');
138         Token.token_spaces.addRange('\n', '\n');
139         Token.token_spaces.addRange('\f', '\f');
140         Token.token_spaces.addRange('\r', '\r');
141         Token.token_spaces.addRange(' ', ' ');
142 
143         Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
144         Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
145         Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
146     }
147 
148     static Token.ParenToken createLook(int type, Token child) {
149         if (COUNTTOKENS)  Token.tokens ++;
150         return new Token.ParenToken(type, child, 0);
151     }
152     static Token.ParenToken createParen(Token child, int pnumber) {
153         if (COUNTTOKENS)  Token.tokens ++;
154         return new Token.ParenToken(Token.PAREN, child, pnumber);
155     }
156     static Token.ClosureToken createClosure(Token tok) {
157         if (COUNTTOKENS)  Token.tokens ++;
158         return new Token.ClosureToken(Token.CLOSURE, tok);
159     }
160     static Token.ClosureToken createNGClosure(Token tok) {
161         if (COUNTTOKENS)  Token.tokens ++;
162         return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
163     }
164     static Token.ConcatToken createConcat(Token tok1, Token tok2) {
165         if (COUNTTOKENS)  Token.tokens ++;
166         return new Token.ConcatToken(tok1, tok2);
167     }
168     static Token.UnionToken createConcat() {
169         if (COUNTTOKENS)  Token.tokens ++;
170         return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
171     }
172     static Token.UnionToken createUnion() {
173         if (COUNTTOKENS)  Token.tokens ++;
174         return new Token.UnionToken(Token.UNION);
175     }
176     static Token createEmpty() {
177         return Token.token_empty;
178     }
179     static RangeToken createRange() {
180         if (COUNTTOKENS)  Token.tokens ++;
181         return new RangeToken(Token.RANGE);
182     }
183     static RangeToken createNRange() {
184         if (COUNTTOKENS)  Token.tokens ++;
185         return new RangeToken(Token.NRANGE);
186     }
187     static Token.CharToken createChar(int ch) {
188         if (COUNTTOKENS)  Token.tokens ++;
189         return new Token.CharToken(Token.CHAR, ch);
190     }
191     static private Token.CharToken createAnchor(int ch) {
192         if (COUNTTOKENS)  Token.tokens ++;
193         return new Token.CharToken(Token.ANCHOR, ch);
194     }
195     static Token.StringToken createBackReference(int refno) {
196         if (COUNTTOKENS)  Token.tokens ++;
197         return new Token.StringToken(Token.BACKREFERENCE, null, refno);
198     }
199     static Token.StringToken createString(String str) {
200         if (COUNTTOKENS)  Token.tokens ++;
201         return new Token.StringToken(Token.STRING, str, 0);
202     }
203     static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
204         if (COUNTTOKENS)  Token.tokens ++;
205         return new Token.ModifierToken(child, add, mask);
206     }
207     static Token.ConditionToken createCondition(int refno, Token condition,
208                                                 Token yespat, Token nopat) {
209         if (COUNTTOKENS)  Token.tokens ++;
210         return new Token.ConditionToken(refno, condition, yespat, nopat);
211     }
212 
213     protected Token(int type) {
214         this.type = type;
215     }
216 
217     /**
218      * A number of children.
219      */
220     int size() {
221         return 0;
222     }
223     Token getChild(int index) {
224         return null;
225     }
226     void addChild(Token tok) {
227         throw new RuntimeException("Not supported.");
228     }
229 
230                                                 // for RANGE or NRANGE
231     protected void addRange(int start, int end) {
232         throw new RuntimeException("Not supported.");
233     }
234     protected void sortRanges() {
235         throw new RuntimeException("Not supported.");
236     }
237     protected void compactRanges() {
238         throw new RuntimeException("Not supported.");
239     }
240     protected void mergeRanges(Token tok) {
241         throw new RuntimeException("Not supported.");
242     }
243     protected void subtractRanges(Token tok) {
244         throw new RuntimeException("Not supported.");
245     }
246     protected void intersectRanges(Token tok) {
247         throw new RuntimeException("Not supported.");
248     }
249     static Token complementRanges(Token tok) {
250         return RangeToken.complementRanges(tok);
251     }
252 
253 
254     void setMin(int min) {                      // for CLOSURE
255     }
256     void setMax(int max) {                      // for CLOSURE
257     }
258     int getMin() {                              // for CLOSURE
259         return -1;
260     }
261     int getMax() {                              // for CLOSURE
262         return -1;
263     }
264     int getReferenceNumber() {                  // for STRING
265         return 0;
266     }
267     String getString() {                        // for STRING
268         return null;
269     }
270 
271     int getParenNumber() {
272         return 0;
273     }
274     int getChar() {
275         return -1;
276     }
277 
278     public String toString() {
279         return this.toString(0);
280     }
281     public String toString(int options) {
282         return this.type == Token.DOT ? "." : "";
283     }
284 
285     /**
286      * How many characters are needed?
287      */
288     final int getMinLength() {
289         switch (this.type) {
290           case CONCAT:
291             int sum = 0;
292             for (int i = 0;  i < this.size();  i ++)
293                 sum += this.getChild(i).getMinLength();
294             return sum;
295 
296           case CONDITION:
297           case UNION:
298             if (this.size() == 0)
299                 return 0;
300             int ret = this.getChild(0).getMinLength();
301             for (int i = 1;  i < this.size();  i ++) {
302                 int min = this.getChild(i).getMinLength();
303                 if (min < ret)  ret = min;
304             }
305             return ret;
306 
307           case CLOSURE:
308           case NONGREEDYCLOSURE:
309             if (this.getMin() >= 0)
310                 return this.getMin() * this.getChild(0).getMinLength();
311             return 0;
312 
313           case EMPTY:
314           case ANCHOR:
315             return 0;
316 
317           case DOT:
318           case CHAR:
319           case RANGE:
320           case NRANGE:
321             return 1;
322 
323           case INDEPENDENT:
324           case PAREN:
325           case MODIFIERGROUP:
326             return this.getChild(0).getMinLength();
327 
328           case BACKREFERENCE:
329             return 0;                           // *******
330 
331           case STRING:
332             return this.getString().length();
333 
334           case LOOKAHEAD:
335           case NEGATIVELOOKAHEAD:
336           case LOOKBEHIND:
337           case NEGATIVELOOKBEHIND:
338             return 0;                           // ***** Really?
339 
340           default:
341             throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
342         }
343     }
344 
345     final int getMaxLength() {
346         switch (this.type) {
347           case CONCAT:
348             int sum = 0;
349             for (int i = 0;  i < this.size();  i ++) {
350                 int d = this.getChild(i).getMaxLength();
351                 if (d < 0)  return -1;
352                 sum += d;
353             }
354             return sum;
355 
356           case CONDITION:
357           case UNION:
358             if (this.size() == 0)
359                 return 0;
360             int ret = this.getChild(0).getMaxLength();
361             for (int i = 1;  ret >= 0 && i < this.size();  i ++) {
362                 int max = this.getChild(i).getMaxLength();
363                 if (max < 0) {                  // infinity
364                     ret = -1;
365                     break;
366                 }
367                 if (max > ret)  ret = max;
368             }
369             return ret;
370 
371           case CLOSURE:
372           case NONGREEDYCLOSURE:
373             if (this.getMax() >= 0)
374                                                 // When this.child.getMaxLength() < 0,
375                                                 // this returns minus value
376                 return this.getMax() * this.getChild(0).getMaxLength();
377             return -1;
378 
379           case EMPTY:
380           case ANCHOR:
381             return 0;
382 
383           case CHAR:
384             return 1;
385           case DOT:
386           case RANGE:
387           case NRANGE:
388             return 2;
389 
390           case INDEPENDENT:
391           case PAREN:
392           case MODIFIERGROUP:
393             return this.getChild(0).getMaxLength();
394 
395           case BACKREFERENCE:
396             return -1;                          // ******
397 
398           case STRING:
399             return this.getString().length();
400 
401           case LOOKAHEAD:
402           case NEGATIVELOOKAHEAD:
403           case LOOKBEHIND:
404           case NEGATIVELOOKBEHIND:
405             return 0;                           // ***** Really?
406 
407           default:
408             throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
409         }
410     }
411 
412     static final int FC_CONTINUE = 0;
413     static final int FC_TERMINAL = 1;
414     static final int FC_ANY = 2;
415     private static final boolean isSet(int options, int flag) {
416         return (options & flag) == flag;
417     }
418     final int analyzeFirstCharacter(RangeToken result, int options) {
419         switch (this.type) {
420           case CONCAT:
421             int ret = FC_CONTINUE;
422             for (int i = 0;  i < this.size();  i ++)
423                 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
424                     break;
425             return ret;
426 
427           case UNION:
428             if (this.size() == 0)
429                 return FC_CONTINUE;
430             /*
431              *  a|b|c -> FC_TERMINAL
432              *  a|.|c -> FC_ANY
433              *  a|b|  -> FC_CONTINUE
434              */
435             int ret2 = FC_CONTINUE;
436             boolean hasEmpty = false;
437             for (int i = 0;  i < this.size();  i ++) {
438                 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
439                 if (ret2 == FC_ANY)
440                     break;
441                 else if (ret2 == FC_CONTINUE)
442                     hasEmpty = true;
443             }
444             return hasEmpty ? FC_CONTINUE : ret2;
445 
446           case CONDITION:
447             int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
448             if (this.size() == 1)  return FC_CONTINUE;
449             if (ret3 == FC_ANY)  return ret3;
450             int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
451             if (ret4 == FC_ANY)  return ret4;
452             return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
453 
454           case CLOSURE:
455           case NONGREEDYCLOSURE:
456             this.getChild(0).analyzeFirstCharacter(result, options);
457             return FC_CONTINUE;
458 
459           case EMPTY:
460           case ANCHOR:
461             return FC_CONTINUE;
462 
463           case CHAR:
464             int ch = this.getChar();
465             result.addRange(ch, ch);
466             if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
467                 ch = Character.toUpperCase((char)ch);
468                 result.addRange(ch, ch);
469                 ch = Character.toLowerCase((char)ch);
470                 result.addRange(ch, ch);
471             }
472             return FC_TERMINAL;
473 
474           case DOT:                             // ****
475             if (isSet(options, RegularExpression.SINGLE_LINE)) {
476                 return FC_CONTINUE;             // **** We can not optimize.
477             } else {
478                 return FC_CONTINUE;
479                 /*
480                 result.addRange(0, RegularExpression.LINE_FEED-1);
481                 result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
482                 result.addRange(RegularExpression.CARRIAGE_RETURN+1,
483                                 RegularExpression.LINE_SEPARATOR-1);
484                 result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
485                 return 1;
486                 */
487             }
488 
489           case RANGE:
490             if (isSet(options, RegularExpression.IGNORE_CASE)) {
491                 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
492             } else {
493                 result.mergeRanges(this);
494             }
495             return FC_TERMINAL;
496 
497           case NRANGE:                          // ****
498             if (isSet(options, RegularExpression.IGNORE_CASE)) {
499                 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
500             } else {
501                 result.mergeRanges(Token.complementRanges(this));
502             }
503             return FC_TERMINAL;
504 
505           case INDEPENDENT:
506           case PAREN:
507             return this.getChild(0).analyzeFirstCharacter(result, options);
508 
509           case MODIFIERGROUP:
510             options |= ((ModifierToken)this).getOptions();
511             options &= ~((ModifierToken)this).getOptionsMask();
512             return this.getChild(0).analyzeFirstCharacter(result, options);
513 
514           case BACKREFERENCE:
515             result.addRange(0, UTF16_MAX);  // **** We can not optimize.
516             return FC_ANY;
517 
518           case STRING:
519             int cha = this.getString().charAt(0);
520             int ch2;
521             if (REUtil.isHighSurrogate(cha)
522                 && this.getString().length() >= 2
523                 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
524                 cha = REUtil.composeFromSurrogates(cha, ch2);
525             result.addRange(cha, cha);
526             if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
527                 cha = Character.toUpperCase((char)cha);
528                 result.addRange(cha, cha);
529                 cha = Character.toLowerCase((char)cha);
530                 result.addRange(cha, cha);
531             }
532             return FC_TERMINAL;
533 
534           case LOOKAHEAD:
535           case NEGATIVELOOKAHEAD:
536           case LOOKBEHIND:
537           case NEGATIVELOOKBEHIND:
538             return FC_CONTINUE;
539 
540           default:
541             throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
542         }
543     }
544 
545     private final boolean isShorterThan(Token tok) {
546         if (tok == null)  return false;
547         /*
548         int mylength;
549         if (this.type == STRING)  mylength = this.getString().length();
550         else if (this.type == CHAR)  mylength = this.getChar() >= 0x10000 ? 2 : 1;
551         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
552         int otherlength;
553         if (tok.type == STRING)  otherlength = tok.getString().length();
554         else if (tok.type == CHAR)  otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
555         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
556         */
557         int mylength;
558         if (this.type == STRING)  mylength = this.getString().length();
559         else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
560         int otherlength;
561         if (tok.type == STRING)  otherlength = tok.getString().length();
562         else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
563         return mylength < otherlength;
564     }
565 
566     static class FixedStringContainer {
567         Token token = null;
568         int options = 0;
569         FixedStringContainer() {
570         }
571     }
572 
573     final void findFixedString(FixedStringContainer container, int options) {
574         switch (this.type) {
575           case CONCAT:
576             Token prevToken = null;
577             int prevOptions = 0;
578             for (int i = 0;  i < this.size();  i ++) {
579                 this.getChild(i).findFixedString(container, options);
580                 if (prevToken == null || prevToken.isShorterThan(container.token)) {
581                     prevToken = container.token;
582                     prevOptions = container.options;
583                 }
584             }
585             container.token = prevToken;
586             container.options = prevOptions;
587             return;
588 
589           case UNION:
590           case CLOSURE:
591           case NONGREEDYCLOSURE:
592           case EMPTY:
593           case ANCHOR:
594           case RANGE:
595           case DOT:
596           case NRANGE:
597           case BACKREFERENCE:
598           case LOOKAHEAD:
599           case NEGATIVELOOKAHEAD:
600           case LOOKBEHIND:
601           case NEGATIVELOOKBEHIND:
602           case CONDITION:
603             container.token = null;
604             return;
605 
606           case CHAR:                            // Ignore CHAR tokens.
607             container.token = null;             // **
608             return;                             // **
609 
610           case STRING:
611             container.token = this;
612             container.options = options;
613             return;
614 
615           case INDEPENDENT:
616           case PAREN:
617             this.getChild(0).findFixedString(container, options);
618             return;
619 
620           case MODIFIERGROUP:
621             options |= ((ModifierToken)this).getOptions();
622             options &= ~((ModifierToken)this).getOptionsMask();
623             this.getChild(0).findFixedString(container, options);
624             return;
625 
626           default:
627             throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
628         }
629     }
630 
631     boolean match(int ch) {
632         throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
633     }
634 
635     // ------------------------------------------------------
636     static protected Hashtable categories = new Hashtable();
637     static protected Hashtable categories2 = null;
638     static final String[] categoryNames = {
639         "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
640         "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
641         "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
642         "L", "M", "N", "Z", "C", "P", "S",      // 29-35
643     };
644     static final int CHAR_LETTER = 29;
645     static final int CHAR_MARK = 30;
646     static final int CHAR_NUMBER = 31;
647     static final int CHAR_SEPARATOR = 32;
648     static final int CHAR_OTHER = 33;
649     static final int CHAR_PUNCTUATION = 34;
650     static final int CHAR_SYMBOL = 35;
651     static final String[] blockNames = {
652         "Basic Latin",                          // 0
653         "Latin-1 Supplement",
654         "Latin Extended-A",
655         "Latin Extended-B",
656         "IPA Extensions",
657         "Spacing Modifier Letters",
658         "Combining Diacritical Marks",
659         "Greek",
660         "Cyrillic",                             // 8
661         "Armenian",
662         "Hebrew",
663         "Arabic",
664         "Devanagari",
665         "Bengali",
666         "Gurmukhi",
667         "Gujarati",
668         "Oriya",                                // 16
669         "Tamil",
670         "Telugu",
671         "Kannada",
672         "Malayalam",
673         "Thai",
674         "Lao",
675         "Tibetan",
676         "Georgian",                             // 24
677         "Hangul Jamo",
678         "Latin Extended Additional",
679         "Greek Extended",
680         "General Punctuation",
681         "Superscripts and Subscripts",
682         "Currency Symbols",
683         "Combining Marks for Symbols",
684         "Letterlike Symbols",                   // 32
685         "Number Forms",
686         "Arrows",
687         "Mathematical Operators",
688         "Miscellaneous Technical",
689         "Control Pictures",
690         "Optical Character Recognition",
691         "Enclosed Alphanumerics",
692         "Box Drawing",                          // 40
693         "Block Elements",
694         "Geometric Shapes",
695         "Miscellaneous Symbols",
696         "Dingbats",
697         "CJK Symbols and Punctuation",
698         "Hiragana",
699         "Katakana",
700         "Bopomofo",                             // 48
701         "Hangul Compatibility Jamo",
702         "Kanbun",
703         "Enclosed CJK Letters and Months",
704         "CJK Compatibility",
705         "CJK Unified Ideographs",
706         "Hangul Syllables",
707         "High Surrogates",
708         "High Private Use Surrogates",          // 56
709         "Low Surrogates",
710         "Private Use",
711         "CJK Compatibility Ideographs",
712         "Alphabetic Presentation Forms",
713         "Arabic Presentation Forms-A",
714         "Combining Half Marks",
715         "CJK Compatibility Forms",
716         "Small Form Variants",                  // 64
717         "Arabic Presentation Forms-B",
718         "Specials",
719         "Halfwidth and Fullwidth Forms",        // 67
720     };
721     static final String blockRanges =
722     "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF"
723     +"\u0300\u036F\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF"
724     +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF"
725     +"\u0C00\u0C7F\u0C80\u0CFF\u0D00\u0D7F\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FBF"
726     +"\u10A0\u10FF\u1100\u11FF\u1E00\u1EFF\u1F00\u1FFF\u2000\u206F\u2070\u209F"
727     +"\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
728     +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F"
729     +"\u25A0\u25FF\u2600\u26FF\u2700\u27BF\u3000\u303F\u3040\u309F\u30A0\u30FF"
730     +"\u3100\u312F\u3130\u318F\u3190\u319F\u3200\u32FF\u3300\u33FF\u4E00\u9FFF"
731     +"\uAC00\uD7A3\uD800\uDB7F\uDB80\uDBFF\uDC00\uDFFF\uE000\uF8FF\uF900\uFAFF"
732     +"\uFB00\uFB4F\uFB50\uFDFF\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE"
733     +"\uFEFF\uFEFF\uFF00\uFFEF";
734 
735     static protected RangeToken getRange(String name, boolean positive) {
736         if (Token.categories.size() == 0) {
737             synchronized (Token.categories) {
738                 Token[] ranges = new Token[Token.categoryNames.length];
739                 for (int i = 0;  i < ranges.length;  i ++) {
740                     ranges[i] = Token.createRange();
741                 }
742                 for (int i = 0;  i < 0x10000;  i ++) {
743                     int type = Character.getType((char)i);
744                     ranges[type].addRange(i, i);
745                     switch (type) {
746                       case Character.UPPERCASE_LETTER:
747                       case Character.LOWERCASE_LETTER:
748                       case Character.TITLECASE_LETTER:
749                       case Character.MODIFIER_LETTER:
750                       case Character.OTHER_LETTER:
751                         type = CHAR_LETTER;
752                         break;
753                       case Character.NON_SPACING_MARK:
754                       case Character.COMBINING_SPACING_MARK:
755                       case Character.ENCLOSING_MARK:
756                         type = CHAR_MARK;
757                         break;
758                       case Character.DECIMAL_DIGIT_NUMBER:
759                       case Character.LETTER_NUMBER:
760                       case Character.OTHER_NUMBER:
761                         type = CHAR_NUMBER;
762                         break;
763                       case Character.SPACE_SEPARATOR:
764                       case Character.LINE_SEPARATOR:
765                       case Character.PARAGRAPH_SEPARATOR:
766                         type = CHAR_SEPARATOR;
767                         break;
768                       case Character.CONTROL:
769                       case Character.FORMAT:
770                       case Character.SURROGATE:
771                       case Character.PRIVATE_USE:
772                       case Character.UNASSIGNED:
773                         type = CHAR_OTHER;
774                         break;
775                       case Character.CONNECTOR_PUNCTUATION:
776                       case Character.DASH_PUNCTUATION:
777                       case Character.START_PUNCTUATION:
778                       case Character.END_PUNCTUATION:
779                       case Character.OTHER_PUNCTUATION:
780                         type = CHAR_PUNCTUATION;
781                         break;
782                       case Character.MATH_SYMBOL:
783                       case Character.CURRENCY_SYMBOL:
784                       case Character.MODIFIER_SYMBOL:
785                       case Character.OTHER_SYMBOL:
786                         type = CHAR_SYMBOL;
787                         break;
788                       default:
789                         throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
790                     }
791                     ranges[type].addRange(i, i);
792                 } // for all characters
793                 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
794 
795                 Token.categories2 = new Hashtable();
796                 for (int i = 0;  i < ranges.length;  i ++) {
797                     if (Token.categoryNames[i] != null) {
798                         if (i == Character.UNASSIGNED) { // Unassigned
799                             ranges[i].addRange(0x10000, Token.UTF16_MAX);
800                         }
801                         Token.categories.put(Token.categoryNames[i], ranges[i]);
802                         Token.categories2.put(Token.categoryNames[i],
803                                               Token.complementRanges(ranges[i]));
804                     }
805                 }
806                 for (int i = 0;  i < Token.blockNames.length;  i ++) {
807                     Token r1 = Token.createRange();
808                     int rstart = Token.blockRanges.charAt(i*2);
809                     int rend = Token.blockRanges.charAt(i*2+1);
810                     String n = Token.blockNames[i];
811                     r1.addRange(rstart, rend);
812                     if (n.equals("Specials"))
813                         r1.addRange(0xfff0, 0xfffd);
814                     Token.categories.put(n, r1);
815                     Token.categories2.put(n, Token.complementRanges(r1));
816                     if (n.indexOf(' ') >= 0) {
817                         StringBuffer buffer = new StringBuffer(n.length()+2);
818                         buffer.append("Is");
819                         for (int ci = 0;  ci < n.length();  ci ++)
820                             if (n.charAt(ci) != ' ')  buffer.append((char)n.charAt(ci));
821                         Token.setAlias(new String(buffer), n, true);
822                     }
823                 }
824 
825                                                 // TR#18 1.2
826                 Token.setAlias("ASSIGNED", "Cn", false);
827                 Token.setAlias("UNASSIGNED", "Cn", true);
828                 Token all = Token.createRange();
829                 all.addRange(0, Token.UTF16_MAX);
830                 Token.categories.put("ALL", all);
831                 Token.categories2.put("ALL", Token.complementRanges(all));
832 
833                 Token isalpha = Token.createRange();
834                 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
835                 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
836                 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
837                 Token.categories.put("IsAlpha", isalpha);
838                 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
839 
840                 Token isalnum = Token.createRange();
841                 isalnum.mergeRanges(isalpha);   // Lu Ll Lo
842                 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
843                 Token.categories.put("IsAlnum", isalnum);
844                 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
845 
846                 Token isspace = Token.createRange();
847                 isspace.mergeRanges(Token.token_spaces);
848                 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
849                 Token.categories.put("IsSpace", isspace);
850                 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
851 
852                 Token isword = Token.createRange();
853                 isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
854                 isword.addRange('_', '_');
855                 Token.categories.put("IsWord", isword);
856                 Token.categories2.put("IsWord", Token.complementRanges(isword));
857 
858                 Token isascii = Token.createRange();
859                 isascii.addRange(0, 127);
860                 Token.categories.put("IsASCII", isascii);
861                 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
862 
863                 Token isnotgraph = Token.createRange();
864                 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
865                 isnotgraph.addRange(' ', ' ');
866                 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
867                 Token.categories2.put("IsGraph", isnotgraph);
868 
869                 Token isxdigit = Token.createRange();
870                 isxdigit.addRange('0', '9');
871                 isxdigit.addRange('A', 'F');
872                 isxdigit.addRange('a', 'f');
873                 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
874                 Token.categories2.put("IsXDigit", isxdigit);
875                 
876                 Token.setAlias("IsDigit", "Nd", true);
877                 Token.setAlias("IsUpper", "Lu", true);
878                 Token.setAlias("IsLower", "Ll", true);
879                 Token.setAlias("IsCntrl", "C", true);
880                 Token.setAlias("IsPrint", "C", false);
881                 Token.setAlias("IsPunct", "P", true);
882 
883                 Token.setAlias("alpha", "IsAlpha", true);
884                 Token.setAlias("alnum", "IsAlnum", true);
885                 Token.setAlias("ascii", "IsASCII", true);
886                 Token.setAlias("cntrl", "IsCntrl", true);
887                 Token.setAlias("digit", "IsDigit", true);
888                 Token.setAlias("graph", "IsGraph", true);
889                 Token.setAlias("lower", "IsLower", true);
890                 Token.setAlias("print", "IsPrint", true);
891                 Token.setAlias("punct", "IsPunct", true);
892                 Token.setAlias("space", "IsSpace", true);
893                 Token.setAlias("upper", "IsUpper", true);
894                 Token.setAlias("word", "IsWord", true); // Perl extension
895                 Token.setAlias("xdigit", "IsXDigit", true);
896 
897             } // synchronized
898         } // if null
899         RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
900             : (RangeToken)Token.categories2.get(name);
901         return tok;
902     }
903 
904     private static void setAlias(String newName, String name, boolean positive) {
905         Token t1 = (Token)Token.categories.get(name);
906         Token t2 = (Token)Token.categories2.get(name);
907         if (positive) {
908             Token.categories.put(newName, t1);
909             Token.categories2.put(newName, t2);
910         } else {
911             Token.categories2.put(newName, t1);
912             Token.categories.put(newName, t2);
913         }
914     }
915 
916     // ------------------------------------------------------
917 
918     static final String viramaString =
919     "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
920     +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
921     +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
922     +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
923     +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
924     +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
925     +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
926     +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
927     +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
928     +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
929     +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
930 
931     static private Token token_grapheme = null;
932     static synchronized protected Token getGraphemePattern() {
933         if (Token.token_grapheme != null)
934             return Token.token_grapheme;
935 
936         Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
937         base_char.mergeRanges(Token.getRange("ASSIGNED", true));
938         base_char.subtractRanges(Token.getRange("M", true));
939         base_char.subtractRanges(Token.getRange("C", true));
940 
941         Token virama = Token.createRange();
942         for (int i = 0;  i < Token.viramaString.length();  i ++) {
943             int ch = viramaString.charAt(i);
944             virama.addRange(i, i);
945         }
946 
947         Token combiner_wo_virama = Token.createRange();
948         combiner_wo_virama.mergeRanges(Token.getRange("M", true));
949         combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
950         combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
951 
952         Token left = Token.createUnion();       // base_char?
953         left.addChild(base_char);
954         left.addChild(Token.token_empty);
955 
956         Token foo = Token.createUnion();
957         foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
958         foo.addChild(combiner_wo_virama);
959 
960         foo = Token.createClosure(foo);
961 
962         foo = Token.createConcat(left, foo);
963 
964         Token.token_grapheme = foo;
965         return Token.token_grapheme;
966     }
967 
968     /**
969      * Combing Character Sequence in Perl 5.6.
970      */
971     static private Token token_ccs = null;
972     static synchronized protected Token getCombiningCharacterSequence() {
973         if (Token.token_ccs != null)
974             return Token.token_ccs;
975 
976         Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
977         foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
978         Token.token_ccs = foo;
979         return Token.token_ccs;
980     }
981 
982     // ------------------------------------------------------
983 
984     // ------------------------------------------------------
985     /**
986      * This class represents a node in parse tree.
987      */
988     static class StringToken extends Token implements java.io.Serializable {
989         String string;
990         int refNumber;
991 
992         StringToken(int type, String str, int n) {
993             super(type);
994             this.string = str;
995             this.refNumber = n;
996         }
997 
998         int getReferenceNumber() {              // for STRING
999             return this.refNumber;
1000        }
1001        String getString() {                    // for STRING
1002            return this.string;
1003        }
1004        
1005        public String toString(int options) {
1006            if (this.type == BACKREFERENCE)
1007                return "\\"+this.refNumber;
1008            else
1009                return REUtil.quoteMeta(this.string);
1010        }
1011    }
1012
1013    /**
1014     * This class represents a node in parse tree.
1015     */
1016    static class ConcatToken extends Token implements java.io.Serializable {
1017        Token child;
1018        Token child2;
1019        
1020        ConcatToken(Token t1, Token t2) {
1021            super(Token.CONCAT);
1022            this.child = t1;
1023            this.child2 = t2;
1024        }
1025
1026        int size() {
1027            return 2;
1028