Save This Page
Home » xmlbeans-2.4.0-src » org.apache.xmlbeans.impl.regex » [javadoc | source]
    1   /*   Copyright 2004 The Apache Software Foundation
    2    *
    3    *   Licensed under the Apache License, Version 2.0 (the "License");
    4    *   you may not use this file except in compliance with the License.
    5    *   You may obtain a copy of the License at
    6    *
    7    *       http://www.apache.org/licenses/LICENSE-2.0
    8    *
    9    *   Unless required by applicable law or agreed to in writing, software
   10    *   distributed under the License is distributed on an "AS IS" BASIS,
   11    *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   12    *   See the License for the specific language governing permissions and
   13    *  limitations under the License.
   14    */
   15   
   16   package org.apache.xmlbeans.impl.regex;
   17   
   18   import java.util.Vector;
   19   import java.util.Hashtable;
   20   
   21   /**
   22    * This class represents a node in parse tree.
   23    */
   24   class Token implements java.io.Serializable {
   25       static final boolean COUNTTOKENS = true;
   26       static int tokens = 0;
   27   
   28       static final int CHAR = 0;                  // Literal char
   29       static final int DOT = 11;                  // .
   30       static final int CONCAT = 1;                // XY
   31       static final int UNION = 2;                 // X|Y|Z
   32       static final int CLOSURE = 3;               // X*
   33       static final int RANGE = 4;                 // [a-zA-Z] etc.
   34       static final int NRANGE = 5;                // [^a-zA-Z] etc.
   35       static final int PAREN = 6;                 // (X) or (?:X)
   36       static final int EMPTY = 7;                 //
   37       static final int ANCHOR = 8;                // ^ $ \b \B \< \> \A \Z \z
   38       static final int NONGREEDYCLOSURE = 9;      // *? +?
   39       static final int STRING = 10;               // strings
   40       static final int BACKREFERENCE = 12;        // back references
   41       static final int LOOKAHEAD = 20;            // (?=...)
   42       static final int NEGATIVELOOKAHEAD = 21;    // (?!...)
   43       static final int LOOKBEHIND = 22;           // (?<=...)
   44       static final int NEGATIVELOOKBEHIND = 23;   // (?<!...)
   45       static final int INDEPENDENT = 24;          // (?>...)
   46       static final int MODIFIERGROUP = 25;        // (?ims-ims:...)
   47       static final int CONDITION = 26;            // (?(...)yes|no)
   48   
   49       static final int UTF16_MAX = 0x10ffff;
   50   
   51       int type;
   52   
   53       static Token token_dot;
   54       static Token token_0to9;
   55       static Token token_wordchars;
   56       static Token token_not_0to9;
   57       static Token token_not_wordchars;
   58       static Token token_spaces;
   59       static Token token_not_spaces;
   60       static Token token_empty;
   61       static Token token_linebeginning;
   62       static Token token_linebeginning2;
   63       static Token token_lineend;
   64       static Token token_stringbeginning;
   65       static Token token_stringend;
   66       static Token token_stringend2;
   67       static Token token_wordedge;
   68       static Token token_not_wordedge;
   69       static Token token_wordbeginning;
   70       static Token token_wordend;
   71       static {
   72           Token.token_empty = new Token(Token.EMPTY);
   73   
   74           Token.token_linebeginning = Token.createAnchor('^');
   75           Token.token_linebeginning2 = Token.createAnchor('@');
   76           Token.token_lineend = Token.createAnchor('$');
   77           Token.token_stringbeginning = Token.createAnchor('A');
   78           Token.token_stringend = Token.createAnchor('z');
   79           Token.token_stringend2 = Token.createAnchor('Z');
   80           Token.token_wordedge = Token.createAnchor('b');
   81           Token.token_not_wordedge = Token.createAnchor('B');
   82           Token.token_wordbeginning = Token.createAnchor('<');
   83           Token.token_wordend = Token.createAnchor('>');
   84   
   85           Token.token_dot = new Token(Token.DOT);
   86   
   87           Token.token_0to9 = Token.createRange();
   88           Token.token_0to9.addRange('0', '9');
   89           Token.token_wordchars = Token.createRange();
   90           Token.token_wordchars.addRange('0', '9');
   91           Token.token_wordchars.addRange('A', 'Z');
   92           Token.token_wordchars.addRange('_', '_');
   93           Token.token_wordchars.addRange('a', 'z');
   94           Token.token_spaces = Token.createRange();
   95           Token.token_spaces.addRange('\t', '\t');
   96           Token.token_spaces.addRange('\n', '\n');
   97           Token.token_spaces.addRange('\f', '\f');
   98           Token.token_spaces.addRange('\r', '\r');
   99           Token.token_spaces.addRange(' ', ' ');
  100   
  101           Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
  102           Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
  103           Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
  104       }
  105   
  106       static Token.ParenToken createLook(int type, Token child) {
  107           if (COUNTTOKENS)  Token.tokens ++;
  108           return new Token.ParenToken(type, child, 0);
  109       }
  110       static Token.ParenToken createParen(Token child, int pnumber) {
  111           if (COUNTTOKENS)  Token.tokens ++;
  112           return new Token.ParenToken(Token.PAREN, child, pnumber);
  113       }
  114       static Token.ClosureToken createClosure(Token tok) {
  115           if (COUNTTOKENS)  Token.tokens ++;
  116           return new Token.ClosureToken(Token.CLOSURE, tok);
  117       }
  118       static Token.ClosureToken createNGClosure(Token tok) {
  119           if (COUNTTOKENS)  Token.tokens ++;
  120           return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
  121       }
  122       static Token.ConcatToken createConcat(Token tok1, Token tok2) {
  123           if (COUNTTOKENS)  Token.tokens ++;
  124           return new Token.ConcatToken(tok1, tok2);
  125       }
  126       static Token.UnionToken createConcat() {
  127           if (COUNTTOKENS)  Token.tokens ++;
  128           return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
  129       }
  130       static Token.UnionToken createUnion() {
  131           if (COUNTTOKENS)  Token.tokens ++;
  132           return new Token.UnionToken(Token.UNION);
  133       }
  134       static Token createEmpty() {
  135           return Token.token_empty;
  136       }
  137       static RangeToken createRange() {
  138           if (COUNTTOKENS)  Token.tokens ++;
  139           return new RangeToken(Token.RANGE);
  140       }
  141       static RangeToken createNRange() {
  142           if (COUNTTOKENS)  Token.tokens ++;
  143           return new RangeToken(Token.NRANGE);
  144       }
  145       static Token.CharToken createChar(int ch) {
  146           if (COUNTTOKENS)  Token.tokens ++;
  147           return new Token.CharToken(Token.CHAR, ch);
  148       }
  149       static private Token.CharToken createAnchor(int ch) {
  150           if (COUNTTOKENS)  Token.tokens ++;
  151           return new Token.CharToken(Token.ANCHOR, ch);
  152       }
  153       static Token.StringToken createBackReference(int refno) {
  154           if (COUNTTOKENS)  Token.tokens ++;
  155           return new Token.StringToken(Token.BACKREFERENCE, null, refno);
  156       }
  157       static Token.StringToken createString(String str) {
  158           if (COUNTTOKENS)  Token.tokens ++;
  159           return new Token.StringToken(Token.STRING, str, 0);
  160       }
  161       static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
  162           if (COUNTTOKENS)  Token.tokens ++;
  163           return new Token.ModifierToken(child, add, mask);
  164       }
  165       static Token.ConditionToken createCondition(int refno, Token condition,
  166                                                   Token yespat, Token nopat) {
  167           if (COUNTTOKENS)  Token.tokens ++;
  168           return new Token.ConditionToken(refno, condition, yespat, nopat);
  169       }
  170   
  /**
   * Initializes a token of the given kind. The token counter is not touched
   * here; counting happens in the static factory methods.
   *
   * @param type the node kind to store in {@code this.type}
   */
  protected Token(int type) {
      super();
      this.type = type;
  }
  174   
  175       /**
  176        * A number of children.
  177        */
  178       int size() {
  179           return 0;
  180       }
  181       Token getChild(int index) {
  182           return null;
  183       }
  184       void addChild(Token tok) {
  185           throw new RuntimeException("Not supported.");
  186       }
  187   
  188                                                   // for RANGE or NRANGE
  189       protected void addRange(int start, int end) {
  190           throw new RuntimeException("Not supported.");
  191       }
  192       protected void sortRanges() {
  193           throw new RuntimeException("Not supported.");
  194       }
  195       protected void compactRanges() {
  196           throw new RuntimeException("Not supported.");
  197       }
  198       protected void mergeRanges(Token tok) {
  199           throw new RuntimeException("Not supported.");
  200       }
  201       protected void subtractRanges(Token tok) {
  202           throw new RuntimeException("Not supported.");
  203       }
  204       protected void intersectRanges(Token tok) {
  205           throw new RuntimeException("Not supported.");
  206       }
  207       static Token complementRanges(Token tok) {
  208           return RangeToken.complementRanges(tok);
  209       }
  210   
  211   
  212       void setMin(int min) {                      // for CLOSURE
  213       }
  214       void setMax(int max) {                      // for CLOSURE
  215       }
  216       int getMin() {                              // for CLOSURE
  217           return -1;
  218       }
  219       int getMax() {                              // for CLOSURE
  220           return -1;
  221       }
  222       int getReferenceNumber() {                  // for STRING
  223           return 0;
  224       }
  225       String getString() {                        // for STRING
  226           return null;
  227       }
  228   
  229       int getParenNumber() {
  230           return 0;
  231       }
  232       int getChar() {
  233           return -1;
  234       }
  235   
  236       public String toString() {
  237           return this.toString(0);
  238       }
  239       public String toString(int options) {
  240           return this.type == Token.DOT ? "." : "";
  241       }
  242   
  243       /**
  244        * How many characters are needed?
  245        */
  246       final int getMinLength() {
  247           switch (this.type) {
  248             case CONCAT:
  249               int sum = 0;
  250               for (int i = 0;  i < this.size();  i ++)
  251                   sum += this.getChild(i).getMinLength();
  252               return sum;
  253   
  254             case CONDITION:
  255             case UNION:
  256               if (this.size() == 0)
  257                   return 0;
  258               int ret = this.getChild(0).getMinLength();
  259               for (int i = 1;  i < this.size();  i ++) {
  260                   int min = this.getChild(i).getMinLength();
  261                   if (min < ret)  ret = min;
  262               }
  263               return ret;
  264   
  265             case CLOSURE:
  266             case NONGREEDYCLOSURE:
  267               if (this.getMin() >= 0)
  268                   return this.getMin() * this.getChild(0).getMinLength();
  269               return 0;
  270   
  271             case EMPTY:
  272             case ANCHOR:
  273               return 0;
  274   
  275             case DOT:
  276             case CHAR:
  277             case RANGE:
  278             case NRANGE:
  279               return 1;
  280   
  281             case INDEPENDENT:
  282             case PAREN:
  283             case MODIFIERGROUP:
  284               return this.getChild(0).getMinLength();
  285   
  286             case BACKREFERENCE:
  287               return 0;                           // *******
  288   
  289             case STRING:
  290               return this.getString().length();
  291   
  292             case LOOKAHEAD:
  293             case NEGATIVELOOKAHEAD:
  294             case LOOKBEHIND:
  295             case NEGATIVELOOKBEHIND:
  296               return 0;                           // ***** Really?
  297   
  298             default:
  299               throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
  300           }
  301       }
  302   
  303       final int getMaxLength() {
  304           switch (this.type) {
  305             case CONCAT:
  306               int sum = 0;
  307               for (int i = 0;  i < this.size();  i ++) {
  308                   int d = this.getChild(i).getMaxLength();
  309                   if (d < 0)  return -1;
  310                   sum += d;
  311               }
  312               return sum;
  313   
  314             case CONDITION:
  315             case UNION:
  316               if (this.size() == 0)
  317                   return 0;
  318               int ret = this.getChild(0).getMaxLength();
  319               for (int i = 1;  ret >= 0 && i < this.size();  i ++) {
  320                   int max = this.getChild(i).getMaxLength();
  321                   if (max < 0) {                  // infinity
  322                       ret = -1;
  323                       break;
  324                   }
  325                   if (max > ret)  ret = max;
  326               }
  327               return ret;
  328   
  329             case CLOSURE:
  330             case NONGREEDYCLOSURE:
  331               if (this.getMax() >= 0)
  332                                                   // When this.child.getMaxLength() < 0,
  333                                                   // this returns minus value
  334                   return this.getMax() * this.getChild(0).getMaxLength();
  335               return -1;
  336   
  337             case EMPTY:
  338             case ANCHOR:
  339               return 0;
  340   
  341             case CHAR:
  342               return 1;
  343             case DOT:
  344             case RANGE:
  345             case NRANGE:
  346               return 2;
  347   
  348             case INDEPENDENT:
  349             case PAREN:
  350             case MODIFIERGROUP:
  351               return this.getChild(0).getMaxLength();
  352   
  353             case BACKREFERENCE:
  354               return -1;                          // ******
  355   
  356             case STRING:
  357               return this.getString().length();
  358   
  359             case LOOKAHEAD:
  360             case NEGATIVELOOKAHEAD:
  361             case LOOKBEHIND:
  362             case NEGATIVELOOKBEHIND:
  363               return 0;                           // ***** Really?
  364   
  365             default:
  366               throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
  367           }
  368       }
  369   
  370       static final int FC_CONTINUE = 0;
  371       static final int FC_TERMINAL = 1;
  372       static final int FC_ANY = 2;
  373       private static final boolean isSet(int options, int flag) {
  374           return (options & flag) == flag;
  375       }
  376       final int analyzeFirstCharacter(RangeToken result, int options) {
  377           switch (this.type) {
  378             case CONCAT:
  379               int ret = FC_CONTINUE;
  380               for (int i = 0;  i < this.size();  i ++)
  381                   if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
  382                       break;
  383               return ret;
  384   
  385             case UNION:
  386               if (this.size() == 0)
  387                   return FC_CONTINUE;
  388               /*
  389                *  a|b|c -> FC_TERMINAL
  390                *  a|.|c -> FC_ANY
  391                *  a|b|  -> FC_CONTINUE
  392                */
  393               int ret2 = FC_CONTINUE;
  394               boolean hasEmpty = false;
  395               for (int i = 0;  i < this.size();  i ++) {
  396                   ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
  397                   if (ret2 == FC_ANY)
  398                       break;
  399                   else if (ret2 == FC_CONTINUE)
  400                       hasEmpty = true;
  401               }
  402               return hasEmpty ? FC_CONTINUE : ret2;
  403   
  404             case CONDITION:
  405               int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
  406               if (this.size() == 1)  return FC_CONTINUE;
  407               if (ret3 == FC_ANY)  return ret3;
  408               int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
  409               if (ret4 == FC_ANY)  return ret4;
  410               return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
  411   
  412             case CLOSURE:
  413             case NONGREEDYCLOSURE:
  414               this.getChild(0).analyzeFirstCharacter(result, options);
  415               return FC_CONTINUE;
  416   
  417             case EMPTY:
  418             case ANCHOR:
  419               return FC_CONTINUE;
  420   
  421             case CHAR:
  422               int ch = this.getChar();
  423               result.addRange(ch, ch);
  424               if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
  425                   ch = Character.toUpperCase((char)ch);
  426                   result.addRange(ch, ch);
  427                   ch = Character.toLowerCase((char)ch);
  428                   result.addRange(ch, ch);
  429               }
  430               return FC_TERMINAL;
  431   
  432             case DOT:                             // ****
  433               if (isSet(options, RegularExpression.SINGLE_LINE)) {
  434                   return FC_CONTINUE;             // **** We can not optimize.
  435               } else {
  436                   return FC_CONTINUE;
  437                   /*
  438                   result.addRange(0, RegularExpression.LINE_FEED-1);
  439                   result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
  440                   result.addRange(RegularExpression.CARRIAGE_RETURN+1,
  441                                   RegularExpression.LINE_SEPARATOR-1);
  442                   result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
  443                   return 1;
  444                   */
  445               }
  446   
  447             case RANGE:
  448               if (isSet(options, RegularExpression.IGNORE_CASE)) {
  449                   result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
  450               } else {
  451                   result.mergeRanges(this);
  452               }
  453               return FC_TERMINAL;
  454   
  455             case NRANGE:                          // ****
  456               if (isSet(options, RegularExpression.IGNORE_CASE)) {
  457                   result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
  458               } else {
  459                   result.mergeRanges(Token.complementRanges(this));
  460               }
  461               return FC_TERMINAL;
  462   
  463             case INDEPENDENT:
  464             case PAREN:
  465               return this.getChild(0).analyzeFirstCharacter(result, options);
  466   
  467             case MODIFIERGROUP:
  468               options |= ((ModifierToken)this).getOptions();
  469               options &= ~((ModifierToken)this).getOptionsMask();
  470               return this.getChild(0).analyzeFirstCharacter(result, options);
  471   
  472             case BACKREFERENCE:
  473               result.addRange(0, UTF16_MAX);  // **** We can not optimize.
  474               return FC_ANY;
  475   
  476             case STRING:
  477               int cha = this.getString().charAt(0);
  478               int ch2;
  479               if (REUtil.isHighSurrogate(cha)
  480                   && this.getString().length() >= 2
  481                   && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
  482                   cha = REUtil.composeFromSurrogates(cha, ch2);
  483               result.addRange(cha, cha);
  484               if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
  485                   cha = Character.toUpperCase((char)cha);
  486                   result.addRange(cha, cha);
  487                   cha = Character.toLowerCase((char)cha);
  488                   result.addRange(cha, cha);
  489               }
  490               return FC_TERMINAL;
  491   
  492             case LOOKAHEAD:
  493             case NEGATIVELOOKAHEAD:
  494             case LOOKBEHIND:
  495             case NEGATIVELOOKBEHIND:
  496               return FC_CONTINUE;
  497   
  498             default:
  499               throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
  500           }
  501       }
  502   
  503       private final boolean isShorterThan(Token tok) {
  504           if (tok == null)  return false;
  505           /*
  506           int mylength;
  507           if (this.type == STRING)  mylength = this.getString().length();
  508           else if (this.type == CHAR)  mylength = this.getChar() >= 0x10000 ? 2 : 1;
  509           else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
  510           int otherlength;
  511           if (tok.type == STRING)  otherlength = tok.getString().length();
  512           else if (tok.type == CHAR)  otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
  513           else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
  514           */
  515           int mylength;
  516           if (this.type == STRING)  mylength = this.getString().length();
  517           else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
  518           int otherlength;
  519           if (tok.type == STRING)  otherlength = tok.getString().length();
  520           else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
  521           return mylength < otherlength;
  522       }
  523   
  524       static class FixedStringContainer {
  525           Token token = null;
  526           int options = 0;
  527           FixedStringContainer() {
  528           }
  529       }
  530   
  531       final void findFixedString(FixedStringContainer container, int options) {
  532           switch (this.type) {
  533             case CONCAT:
  534               Token prevToken = null;
  535               int prevOptions = 0;
  536               for (int i = 0;  i < this.size();  i ++) {
  537                   this.getChild(i).findFixedString(container, options);
  538                   if (prevToken == null || prevToken.isShorterThan(container.token)) {
  539                       prevToken = container.token;
  540                       prevOptions = container.options;
  541                   }
  542               }
  543               container.token = prevToken;
  544               container.options = prevOptions;
  545               return;
  546   
  547             case UNION:
  548             case CLOSURE:
  549             case NONGREEDYCLOSURE:
  550             case EMPTY:
  551             case ANCHOR:
  552             case RANGE:
  553             case DOT:
  554             case NRANGE:
  555             case BACKREFERENCE:
  556             case LOOKAHEAD:
  557             case NEGATIVELOOKAHEAD:
  558             case LOOKBEHIND:
  559             case NEGATIVELOOKBEHIND:
  560             case CONDITION:
  561               container.token = null;
  562               return;
  563   
  564             case CHAR:                            // Ignore CHAR tokens.
  565               container.token = null;             // **
  566               return;                             // **
  567   
  568             case STRING:
  569               container.token = this;
  570               container.options = options;
  571               return;
  572   
  573             case INDEPENDENT:
  574             case PAREN:
  575               this.getChild(0).findFixedString(container, options);
  576               return;
  577   
  578             case MODIFIERGROUP:
  579               options |= ((ModifierToken)this).getOptions();
  580               options &= ~((ModifierToken)this).getOptionsMask();
  581               this.getChild(0).findFixedString(container, options);
  582               return;
  583   
  584             default:
  585               throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
  586           }
  587       }
  588   
  589       boolean match(int ch) {
  590           throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
  591       }
  592   
  593       // ------------------------------------------------------
  594       private final static Hashtable categories = new Hashtable();
  595       private final static Hashtable categories2 = new Hashtable();
  596       private static final String[] categoryNames = {
  597           "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
  598           "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
  599           "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
  600           "Pi", "Pf",  // 29, 30
  601           "L", "M", "N", "Z", "C", "P", "S",      // 31-37
  602       };
  603   
  604       // Schema Rec. {Datatypes} - Punctuation 
  605       static final int CHAR_INIT_QUOTE  = 29;     // Pi - initial quote
  606       static final int CHAR_FINAL_QUOTE = 30;     // Pf - final quote
  607       static final int CHAR_LETTER = 31;
  608       static final int CHAR_MARK = 32;
  609       static final int CHAR_NUMBER = 33;
  610       static final int CHAR_SEPARATOR = 34;
  611       static final int CHAR_OTHER = 35;
  612       static final int CHAR_PUNCTUATION = 36;
  613       static final int CHAR_SYMBOL = 37;
  614       
  615       //blockNames in UNICODE 3.1 that supported by XML Schema REC             
  616       private static final String[] blockNames = {
  617           /*0000..007F;*/ "Basic Latin",
  618           /*0080..00FF;*/ "Latin-1 Supplement",
  619           /*0100..017F;*/ "Latin Extended-A",
  620           /*0180..024F;*/ "Latin Extended-B",
  621           /*0250..02AF;*/ "IPA Extensions",
  622           /*02B0..02FF;*/ "Spacing Modifier Letters",
  623           /*0300..036F;*/ "Combining Diacritical Marks",
  624           /*0370..03FF;*/ "Greek",
  625           /*0400..04FF;*/ "Cyrillic",
  626           /*0530..058F;*/ "Armenian",
  627           /*0590..05FF;*/ "Hebrew",
  628           /*0600..06FF;*/ "Arabic",
  629           /*0700..074F;*/ "Syriac",  
  630           /*0780..07BF;*/ "Thaana",
  631           /*0900..097F;*/ "Devanagari",
  632           /*0980..09FF;*/ "Bengali",
  633           /*0A00..0A7F;*/ "Gurmukhi",
  634           /*0A80..0AFF;*/ "Gujarati",
  635           /*0B00..0B7F;*/ "Oriya",
  636           /*0B80..0BFF;*/ "Tamil",
  637           /*0C00..0C7F;*/ "Telugu",
  638           /*0C80..0CFF;*/ "Kannada",
  639           /*0D00..0D7F;*/ "Malayalam",
  640           /*0D80..0DFF;*/ "Sinhala",
  641           /*0E00..0E7F;*/ "Thai",
  642           /*0E80..0EFF;*/ "Lao",
  643           /*0F00..0FFF;*/ "Tibetan",
  644           /*1000..109F;*/ "Myanmar", 
  645           /*10A0..10FF;*/ "Georgian",
  646           /*1100..11FF;*/ "Hangul Jamo",
  647           /*1200..137F;*/ "Ethiopic",
  648           /*13A0..13FF;*/ "Cherokee",
  649           /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
  650           /*1680..169F;*/ "Ogham",
  651           /*16A0..16FF;*/ "Runic",
  652           /*1780..17FF;*/ "Khmer",
  653           /*1800..18AF;*/ "Mongolian",
  654           /*1E00..1EFF;*/ "Latin Extended Additional",
  655           /*1F00..1FFF;*/ "Greek Extended",
  656           /*2000..206F;*/ "General Punctuation",
  657           /*2070..209F;*/ "Superscripts and Subscripts",
  658           /*20A0..20CF;*/ "Currency Symbols",
  659           /*20D0..20FF;*/ "Combining Marks for Symbols",
  660           /*2100..214F;*/ "Letterlike Symbols",
  661           /*2150..218F;*/ "Number Forms",
  662           /*2190..21FF;*/ "Arrows",
  663           /*2200..22FF;*/ "Mathematical Operators",
  664           /*2300..23FF;*/ "Miscellaneous Technical",
  665           /*2400..243F;*/ "Control Pictures",
  666           /*2440..245F;*/ "Optical Character Recognition",
  667           /*2460..24FF;*/ "Enclosed Alphanumerics",
  668           /*2500..257F;*/ "Box Drawing",
  669           /*2580..259F;*/ "Block Elements",
  670           /*25A0..25FF;*/ "Geometric Shapes",
  671           /*2600..26FF;*/ "Miscellaneous Symbols",
  672           /*2700..27BF;*/ "Dingbats",
  673           /*2800..28FF;*/ "Braille Patterns",
  674           /*2E80..2EFF;*/ "CJK Radicals Supplement",
  675           /*2F00..2FDF;*/ "Kangxi Radicals",
  676           /*2FF0..2FFF;*/ "Ideographic Description Characters",
  677           /*3000..303F;*/ "CJK Symbols and Punctuation",
  678           /*3040..309F;*/ "Hiragana",
  679           /*30A0..30FF;*/ "Katakana",
  680           /*3100..312F;*/ "Bopomofo",
  681           /*3130..318F;*/ "Hangul Compatibility Jamo",
  682           /*3190..319F;*/ "Kanbun",
  683           /*31A0..31BF;*/ "Bopomofo Extended",
  684           /*3200..32FF;*/ "Enclosed CJK Letters and Months",
  685           /*3300..33FF;*/ "CJK Compatibility",
  686           /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
  687           /*4E00..9FFF;*/ "CJK Unified Ideographs",
  688           /*A000..A48F;*/ "Yi Syllables",
  689           /*A490..A4CF;*/ "Yi Radicals",
  690           /*AC00..D7A3;*/ "Hangul Syllables",
  691           /*E000..F8FF;*/ "Private Use",
  692           /*F900..FAFF;*/ "CJK Compatibility Ideographs",
  693           /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
  694           /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
  695           /*FE20..FE2F;*/ "Combining Half Marks",
  696           /*FE30..FE4F;*/ "CJK Compatibility Forms",
  697           /*FE50..FE6F;*/ "Small Form Variants",
  698           /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
  699           /*FEFF..FEFF;*/ "Specials",
  700           /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
  701            //missing Specials add manually
  702           /*10300..1032F;*/ "Old Italic",		// 84
  703           /*10330..1034F;*/ "Gothic",
  704           /*10400..1044F;*/ "Deseret",
  705           /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
  706           /*1D100..1D1FF;*/ "Musical Symbols",
  707           /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
  708           /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
  709           /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
  710           /*E0000..E007F;*/ "Tags",
  711           //missing 2 private use add manually
  712   
  713       };
    // The following ranges are not listed in the tables below;
    // getRange() adds them by hand to the named blocks:
    //   F0000..FFFFD    "Private Use"
    //   100000..10FFFD  "Private Use"
    //   FFF0..FFFD      "Specials"
  718       static final String blockRanges = 
  719          "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
  720           +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
  721           +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
  722           +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
  723           +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
  724           +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
  725           +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
  726           +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
  727           +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
  728           +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
  729           +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
  730       static final int[] nonBMPBlockRanges = {
  731           0x10300, 0x1032F,       // 84
  732           0x10330, 0x1034F,
  733           0x10400, 0x1044F,
  734           0x1D000, 0x1D0FF,
  735           0x1D100, 0x1D1FF,
  736           0x1D400, 0x1D7FF,
  737           0x20000, 0x2A6D6,
  738           0x2F800, 0x2FA1F,
  739           0xE0000, 0xE007F
  740       };
  741       private static final int NONBMP_BLOCK_START = 84;
  742   
  743       static protected RangeToken getRange(String name, boolean positive) {
  744           if (Token.categories.size() == 0) {
  745               synchronized (Token.categories) {
  746                   Token[] ranges = new Token[Token.categoryNames.length];
  747                   for (int i = 0;  i < ranges.length;  i ++) {
  748                       ranges[i] = Token.createRange();
  749                   }
  750                   int type;
  751                   for (int i = 0;  i < 0x10000;  i ++) {
  752                       type = Character.getType((char)i);
  753                       if (type == Character.START_PUNCTUATION || 
  754                           type == Character.END_PUNCTUATION) {
  755                           //build table of Pi values
  756                           if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
  757                               i == 0x201F || i == 0x2039) {
  758                               type = CHAR_INIT_QUOTE;
  759                           }
  760                           //build table of Pf values
  761                           if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
  762                               type = CHAR_FINAL_QUOTE;
  763                           }
  764                       }
  765                       ranges[type].addRange(i, i);
  766                       switch (type) {
  767                         case Character.UPPERCASE_LETTER:
  768                         case Character.LOWERCASE_LETTER:
  769                         case Character.TITLECASE_LETTER:
  770                         case Character.MODIFIER_LETTER:
  771                         case Character.OTHER_LETTER:
  772                           type = CHAR_LETTER;
  773                           break;
  774                         case Character.NON_SPACING_MARK:
  775                         case Character.COMBINING_SPACING_MARK:
  776                         case Character.ENCLOSING_MARK:
  777                           type = CHAR_MARK;
  778                           break;
  779                         case Character.DECIMAL_DIGIT_NUMBER:
  780                         case Character.LETTER_NUMBER:
  781                         case Character.OTHER_NUMBER:
  782                           type = CHAR_NUMBER;
  783                           break;
  784                         case Character.SPACE_SEPARATOR:
  785                         case Character.LINE_SEPARATOR:
  786                         case Character.PARAGRAPH_SEPARATOR:
  787                           type = CHAR_SEPARATOR;
  788                           break;
  789                         case Character.CONTROL:
  790                         case Character.FORMAT:
  791                         case Character.SURROGATE:
  792                         case Character.PRIVATE_USE:
  793                         case Character.UNASSIGNED:
  794                           type = CHAR_OTHER;
  795                           break;
  796                         case Character.CONNECTOR_PUNCTUATION:
  797                         case Character.DASH_PUNCTUATION:
  798                         case Character.START_PUNCTUATION:
  799                         case Character.END_PUNCTUATION:
  800                         case CHAR_INIT_QUOTE:
  801                         case CHAR_FINAL_QUOTE:
  802                         case Character.OTHER_PUNCTUATION:
  803                           type = CHAR_PUNCTUATION;
  804                           break;
  805                         case Character.MATH_SYMBOL:
  806                         case Character.CURRENCY_SYMBOL:
  807                         case Character.MODIFIER_SYMBOL:
  808                         case Character.OTHER_SYMBOL:
  809                           type = CHAR_SYMBOL;
  810                           break;
  811                         default:
  812                           throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
  813                       }
  814                       ranges[type].addRange(i, i);
  815                   } // for all characters
  816                   ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
  817   
  818                   for (int i = 0;  i < ranges.length;  i ++) {
  819                       if (Token.categoryNames[i] != null) {
  820                           if (i == Character.UNASSIGNED) { // Unassigned
  821                               ranges[i].addRange(0x10000, Token.UTF16_MAX);
  822                           }
  823                           Token.categories.put(Token.categoryNames[i], ranges[i]);
  824                           Token.categories2.put(Token.categoryNames[i],
  825                                                 Token.complementRanges(ranges[i]));
  826                       }
  827                   }
  828                   //REVISIT: do we really need to support block names as in Unicode 3.1
  829                   //         or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
  830                   //
  831                   StringBuffer buffer = new StringBuffer(50);
  832                   for (int i = 0;  i < Token.blockNames.length;  i ++) {
  833                       Token r1 = Token.createRange();
  834                       int location;
  835                       if (i < NONBMP_BLOCK_START) {
  836                           location = i*2;
  837                           int rstart = Token.blockRanges.charAt(location);
  838                           int rend = Token.blockRanges.charAt(location+1);
  839                           //DEBUGING
  840                           //System.out.println(n+" " +Integer.toHexString(rstart)
  841                           //                     +"-"+ Integer.toHexString(rend));
  842                           r1.addRange(rstart, rend);
  843                       } else {
  844                           location = (i - NONBMP_BLOCK_START) * 2;
  845                           r1.addRange(Token.nonBMPBlockRanges[location],
  846                                       Token.nonBMPBlockRanges[location + 1]);
  847                       }
  848                       String n = Token.blockNames[i];
  849                       if (n.equals("Specials"))
  850                           r1.addRange(0xfff0, 0xfffd);
  851                       if (n.equals("Private Use")) {
  852                           r1.addRange(0xF0000,0xFFFFD);
  853                           r1.addRange(0x100000,0x10FFFD);
  854                       }
  855                       Token.categories.put(n, r1);
  856                       Token.categories2.put(n, Token.complementRanges(r1));
  857                       buffer.setLength(0);
  858                       buffer.append("Is");
  859                       if (n.indexOf(' ') >= 0) {
  860                           for (int ci = 0;  ci < n.length();  ci ++)
  861                               if (n.charAt(ci) != ' ')  buffer.append((char)n.charAt(ci));
  862                       }
  863                       else {
  864                           buffer.append(n);
  865                       }
  866                       Token.setAlias(buffer.toString(), n, true);
  867                   }
  868   
  869                   // TR#18 1.2
  870                   Token.setAlias("ASSIGNED", "Cn", false);
  871                   Token.setAlias("UNASSIGNED", "Cn", true);
  872                   Token all = Token.createRange();
  873                   all.addRange(0, Token.UTF16_MAX);
  874                   Token.categories.put("ALL", all);
  875                   Token.categories2.put("ALL", Token.complementRanges(all));
  876                   Token.registerNonXS("ASSIGNED");
  877                   Token.registerNonXS("UNASSIGNED");
  878                   Token.registerNonXS("ALL");
  879   
  880                   Token isalpha = Token.createRange();
  881                   isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
  882                   isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
  883                   isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
  884                   Token.categories.put("IsAlpha", isalpha);
  885                   Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
  886                   Token.registerNonXS("IsAlpha");
  887   
  888                   Token isalnum = Token.createRange();
  889                   isalnum.mergeRanges(isalpha);   // Lu Ll Lo
  890                   isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
  891                   Token.categories.put("IsAlnum", isalnum);
  892                   Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
  893                   Token.registerNonXS("IsAlnum");
  894   
  895                   Token isspace = Token.createRange();
  896                   isspace.mergeRanges(Token.token_spaces);
  897                   isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
  898                   Token.categories.put("IsSpace", isspace);
  899                   Token.categories2.put("IsSpace", Token.complementRanges(isspace));
  900                   Token.registerNonXS("IsSpace");
  901   
  902                   Token isword = Token.createRange();
  903                   isword.mergeRanges(isalnum);     // Lu Ll Lo Nd
  904                   isword.addRange('_', '_');
  905                   Token.categories.put("IsWord", isword);
  906                   Token.categories2.put("IsWord", Token.complementRanges(isword));
  907                   Token.registerNonXS("IsWord");
  908   
  909                   Token isascii = Token.createRange();
  910                   isascii.addRange(0, 127);
  911                   Token.categories.put("IsASCII", isascii);
  912                   Token.categories2.put("IsASCII", Token.complementRanges(isascii));
  913                   Token.registerNonXS("IsASCII");
  914   
  915                   Token isnotgraph = Token.createRange();
  916                   isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
  917                   isnotgraph.addRange(' ', ' ');
  918                   Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
  919                   Token.categories2.put("IsGraph", isnotgraph);
  920                   Token.registerNonXS("IsGraph");
  921   
  922                   Token isxdigit = Token.createRange();
  923                   isxdigit.addRange('0', '9');
  924                   isxdigit.addRange('A', 'F');
  925                   isxdigit.addRange('a', 'f');
  926                   Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
  927                   Token.categories2.put("IsXDigit", isxdigit);
  928                   Token.registerNonXS("IsXDigit");
  929   
  930                   Token.setAlias("IsDigit", "Nd", true);
  931                   Token.setAlias("IsUpper", "Lu", true);
  932                   Token.setAlias("IsLower", "Ll", true);
  933                   Token.setAlias("IsCntrl", "C", true);
  934                   Token.setAlias("IsPrint", "C", false);
  935                   Token.setAlias("IsPunct", "P", true);
  936                   Token.registerNonXS("IsDigit");
  937                   Token.registerNonXS("IsUpper");
  938                   Token.registerNonXS("IsLower");
  939                   Token.registerNonXS("IsCntrl");
  940                   Token.registerNonXS("IsPrint");
  941                   Token.registerNonXS("IsPunct");
  942   
  943                   Token.setAlias("alpha", "IsAlpha", true);
  944                   Token.setAlias("alnum", "IsAlnum", true);
  945                   Token.setAlias("ascii", "IsASCII", true);
  946                   Token.setAlias("cntrl", "IsCntrl", true);
  947                   Token.setAlias("digit", "IsDigit", true);
  948                   Token.setAlias("graph", "IsGraph", true);
  949                   Token.setAlias("lower", "IsLower", true);
  950                   Token.setAlias("print", "IsPrint", true);
  951                   Token.setAlias("punct", "IsPunct", true);
  952                   Token.setAlias("space", "IsSpace", true);
  953                   Token.setAlias("upper", "IsUpper", true);
  954                   Token.setAlias("word", "IsWord", true); // Perl extension
  955                   Token.setAlias("xdigit", "IsXDigit", true);
  956                   Token.registerNonXS("alpha");
  957                   Token.registerNonXS("alnum");
  958                   Token.registerNonXS("ascii");
  959                   Token.registerNonXS("cntrl");
  960                   Token.registerNonXS("digit");
  961                   Token.registerNonXS("graph");
  962                   Token.registerNonXS("lower");
  963                   Token.registerNonXS("print");
  964                   Token.registerNonXS("punct");
  965                   Token.registerNonXS("space");
  966                   Token.registerNonXS("upper");
  967                   Token.registerNonXS("word");
  968                   Token.registerNonXS("xdigit");
  969               } // synchronized
  970           } // if null
  971           RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
  972               : (RangeToken)Token.categories2.get(name);
  973           //if (tok == null) System.out.println(name);
  974           return tok;
  975       }
  976       static protected RangeToken getRange(String name, boolean positive, boolean xs) {
  977           RangeToken range = Token.getRange(name, positive);
  978           if (xs && range != null && Token.isRegisterNonXS(name))
  979               range = null;
  980           return range;
  981       }
  982   
  983       static Hashtable nonxs = null;
  984       /**
  985        * This method is called by only getRange().
  986        * So this method need not MT-safe.
  987        */
  988       static protected void registerNonXS(String name) {
  989           if (Token.nonxs == null)
  990               Token.nonxs = new Hashtable();
  991           Token.nonxs.put(name, name);
  992       }
  993       static protected boolean isRegisterNonXS(String name) {
  994           if (Token.nonxs == null)
  995               return false;
  996           //DEBUG
  997           //System.err.println("isRegisterNonXS: "+name);
  998           return Token.nonxs.containsKey(name);
  999       }
 1000   
 1001       private static void setAlias(String newName, String name, boolean positive) {
 1002           Token t1 = (Token)Token.categories.get(name);
 1003           Token t2 = (Token)Token.categories2.get(name);
 1004           if (positive) {
 1005               Token.categories.put(newName, t1);
 1006               Token.categories2.put(newName, t2);
 1007           } else {
 1008               Token.categories2.put(newName, t1);
 1009               Token.categories.put(newName, t2);
 1010           }
 1011       }
 1012   
 1013       // ------------------------------------------------------
 1014   
 1015       static final String viramaString =
 1016       "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1017       +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1018       +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1019       +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1020       +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1021       +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1022       +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1023       +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1024       +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
 1025       +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
 1026       +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
 1027   
 1028       static private Token token_grapheme = null;
 1029       static synchronized Token getGraphemePattern() {
 1030           if (Token.token_grapheme != null)
 1031               return Token.token_grapheme;
 1032   
 1033           Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
 1034           base_char.mergeRanges(Token.getRange("ASSIGNED", true));
 1035           base_char.subtractRanges(Token.getRange("M", true));
 1036           base_char.subtractRanges(Token.getRange("C", true));
 1037   
 1038           Token virama = Token.createRange();
 1039           for (int i = 0;  i < Token.viramaString.length();  i ++) {
 1040               int ch = viramaString.charAt(i);
 1041               virama.addRange(i, i);
 1042           }
 1043   
 1044           Token combiner_wo_virama = Token.createRange();
 1045           combiner_wo_virama.mergeRanges(Token.getRange("M", true));
 1046           combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
 1047           combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
 1048   
 1049           Token left = Token.createUnion();       // base_char?
 1050           left.addChild(base_char);
 1051           left.addChild(Token.token_empty);
 1052   
 1053           Token foo = Token.createUnion();
 1054           foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
 1055           foo.addChild(combiner_wo_virama);
 1056   
 1057           foo = Token.createClosure(foo);
 1058   
 1059           foo = Token.createConcat(left, foo);
 1060   
 1061           Token.token_grapheme = foo;
 1062           return Token.token_grapheme;
 1063       }
 1064   
 1065       /**
 1066        * Combing Character Sequence in Perl 5.6.
 1067        */
 1068       static private Token token_ccs = null;
 1069       static synchronized Token getCombiningCharacterSequence() {
 1070           if (Token.token_ccs != null)
 1071               return Token.token_ccs;
 1072   
 1073           Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
 1074           foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
 1075           Token.token_ccs = foo;
 1076           return Token.token_ccs;
 1077       }
 1078   
 1079       // ------------------------------------------------------
 1080   
 1081       // ------------------------------------------------------
 1082       /**
 1083        * This class represents a node in parse tree.
 1084        */
 1085       static class StringToken extends Token implements java.io.Serializable {
 1086           String string;
 1087           int refNumber;
 1088   
 1089           StringToken(int type, String str, int n) {
 1090               super(type);
 1091               this.string = str;
 1092               this.refNumber = n;
 1093           }
 1094   
 1095           int getReferenceNumber() {              // for STRING
 1096               return this.refNumber;
 1097           }
 1098           String getString() {                    // for STRING
 1099               return this.string;
 1100           }
 1101           
 1102           public String toString(int options) {
 1103               if (this.type == BACKREFERENCE)
 1104                   return "\\"+this.refNumber;
 1105               else
 1106                   return REUtil.quoteMeta(this.string);
 1107           }
 1108       }
 1109   
 1110       /**
 1111        * This class represents a node in parse tree.
 1112        */
 1113       static class ConcatToken extends Token implements java.io.Serializable {
 1114           Token child;
 1115           Token child2;
 1116           
 1117           ConcatToken(Token t1, Token t2) {
 1118               super(Token.CONCAT);
 1119               this.child = t1;
 1120               this.child2 = t2;
 1121           }
 1122   
 1123           int size() {
 1124               return 2;
 1125           }
 1126           Token getChild(int index) {
 1127               return index == 0 ? this.child : this.child2;
 1128           }
 1129   
 1130           public String toString(int options) {
 1131               String ret;
 1132               if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
 1133                   ret = this.child.toString(options)+"+";
 1134               } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
 1135                   ret = this.child.toString(options)+"+?";
 1136               } else
 1137                   ret = this.child.toString(options)+this.child2.toString(options);
 1138               return ret;
 1139           }
 1140       }
 1141   
 1142       /**
 1143        * This class represents a node in parse tree.
 1144        */
 1145       static class CharToken extends Token implements java.io.Serializable {
 1146           int chardata;
 1147   
 1148           CharToken(int type, int ch) {
 1149               super(type);
 1150               this.chardata = ch;
 1151           }
 1152   
 1153           int getChar() {
 1154               return this.chardata;
 1155           }
 1156   
 1157           public String toString(int options) {
 1158               String ret;
 1159               switch (this.type) {
 1160                 case CHAR:
 1161                   switch (this.chardata) {
 1162                     case '|':  case '*':  case '+':  case '?':
 1163                     case '(':  case ')':  case '.':  case '[':
 1164                     case '{':  case '\\':
 1165                       ret = "\\"+(char)this.chardata;
 1166                       break;
 1167                     case '\f':  ret = "\\f";  break;
 1168                     case '\n':  ret = "\\n";  break;
 1169                     case '\r':  ret = "\\r";  break;
 1170                     case '\t':  ret = "\\t";  break;
 1171                     case 0x1b:  ret = "\\e";  break;
 1172                       //case 0x0b:  ret = "\\v";  break;
 1173                     default:
 1174                       if (this.chardata >= 0x10000) {
 1175                           String pre = "0"+Integer.toHexString(this.chardata);
 1176                           ret = "\\v"+pre.substring(pre.length()-6, pre.length());
 1177                       } else
 1178                           ret = ""+(char)this.chardata;
 1179                   }
 1180                   break;
 1181   
 1182                 case ANCHOR:
 1183                   if (this == Token.token_linebeginning || this == Token.token_lineend)
 1184                       ret = ""+(char)this.chardata;
 1185                   else 
 1186                       ret = "\\"+(char)this.chardata;
 1187                   break;
 1188   
 1189                 default:
 1190                   ret = null;
 1191               }
 1192               return ret;
 1193           }
 1194   
 1195           boolean match(int ch) {
 1196               if (this.type == CHAR) {
 1197                   return ch == this.chardata;
 1198               } else
 1199                   throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
 1200           }
 1201       }
 1202   
 1203       /**
 1204        * This class represents a node in parse tree.
 1205        */
 1206       static class ClosureToken extends Token implements java.io.Serializable {
 1207           int min;
 1208           int max;
 1209           Token child;
 1210   
 1211           ClosureToken(int type, Token tok) {
 1212               super(type);
 1213               this.child = tok;
 1214               this.setMin(-1);
 1215               this.setMax(-1);
 1216           }
 1217   
 1218           int size() {
 1219               return 1;
 1220           }
 1221           Token getChild(int index) {
 1222               return this.child;
 1223           }
 1224   
 1225           final void setMin(int min) {
 1226               this.min = min;
 1227           }
 1228           final void setMax(int max) {
 1229               this.max = max;
 1230           }
 1231           final int getMin() {
 1232               return this.min;
 1233           }
 1234           final int getMax() {
 1235               return this.max;
 1236           }
 1237   
 1238           public String toString(int options) {
 1239               String ret;
 1240               if (this.type == CLOSURE) {
 1241                   if (this.getMin() < 0 && this.getMax() < 0) {
 1242                       ret = this.child.toString(options)+"*";
 1243                   } else if (this.getMin() == this.getMax()) {
 1244                       ret = this.child.toString(options)+"{"+this.getMin()+"}";
 1245                   } else if (this.getMin() >= 0 && this.getMax() >= 0) {
 1246                       ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
 1247                   } else if (this.getMin() >= 0 && this.getMax() < 0) {
 1248                       ret = this.child.toString(options)+"{"+this.getMin()+",}";
 1249                   } else
 1250                       throw new RuntimeException("Token#toString(): CLOSURE "
 1251                                                  +this.getMin()+", "+this.getMax());
 1252               } else {
 1253                   if (this.getMin() < 0 && this.getMax() < 0) {
 1254                       ret = this.child.toString(options)+"*?";
 1255                   } else if (this.getMin() == this.getMax()) {
 1256                       ret = this.child.toString(options)+"{"+this.getMin()+"}?";
 1257                   } else if (this.getMin() >= 0 && this.getMax() >= 0) {
 1258                       ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
 1259                   } else if (this.getMin() >= 0 && this.getMax() < 0) {
 1260                       ret = this.child.toString(options)+"{"+this.getMin()+",}?";
 1261                   } else
 1262                       throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
 1263                                                  +this.getMin()+", "+this.getMax());
 1264               }
 1265               return ret;
 1266           }
 1267       }
 1268   
 1269       /**
 1270        * This class represents a node in parse tree.
 1271        */
 1272       static class ParenToken extends Token implements java.io.Serializable {
 1273           Token child;
 1274           int parennumber;
 1275   
 1276           ParenToken(int type, Token tok, int paren) {
 1277               super(type);
 1278               this.child = tok;
 1279               this.parennumber = paren;
 1280           }
 1281   
 1282           int size() {
 1283               return 1;
 1284           }
 1285           Token getChild(int index) {
 1286               return this.child;
 1287           }
 1288   
 1289           int getParenNumber() {
 1290               return this.parennumber;
 1291           }
 1292   
 1293           public String toString(int options) {
 1294               String ret = null;
 1295               switch (this.type) {
 1296                 case PAREN:
 1297                   if (this.parennumber == 0) {
 1298                       ret = "(?:"+this.child.toString(options)+")";
 1299                   } else {
 1300                       ret = "("+this.child.toString(options)+")";
 1301                   }
 1302                   break;
 1303   
 1304                 case LOOKAHEAD:
 1305                   ret = "(?="+this.child.toString(options)+")";
 1306                   break;
 1307                 case NEGATIVELOOKAHEAD:
 1308                   ret = "(?!"+this.child.toString(options)+")";
 1309                   break;
 1310                 case LOOKBEHIND:
 1311                   ret = "(?<="+this.child.toString(options)+")";
 1312                   break;
 1313                 case NEGATIVELOOKBEHIND:
 1314                   ret = "(?<!"+this.child.toString(options)+")";
 1315                   break;
 1316                 case INDEPENDENT:
 1317                   ret = "(?>"+this.child.toString(options)+")";
 1318                   break;
 1319               }
 1320               return ret;
 1321           }
 1322       }
 1323   
 1324       /**
 1325        * (?(condition)yes-pattern|no-pattern)
 1326        */
 1327       static class ConditionToken extends Token implements java.io.Serializable {
 1328           int refNumber;
 1329           Token condition;
 1330           Token yes;
 1331           Token no;
 1332           ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
 1333               super(Token.CONDITION);
 1334               this.refNumber = refno;
 1335               this.condition = cond;
 1336               this.yes = yespat;
 1337               this.no = nopat;
 1338           }
 1339           int size() {
 1340               return this.no == null ? 1 : 2;
 1341           }
 1342           Token getChild(int index) {
 1343               if (index == 0)  return this.yes;
 1344               if (index == 1)  return this.no;
 1345               throw new RuntimeException("Internal Error: "+index);
 1346           }
 1347   
 1348           public String toString(int options) {
 1349               String ret;
 1350               if (refNumber > 0) {
 1351                   ret = "(?("+refNumber+")";
 1352               } else if (this.condition.type == Token.ANCHOR) {
 1353                   ret = "(?("+this.condition+")";
 1354               } else {
 1355                   ret = "(?"+this.condition;
 1356               }
 1357   
 1358               if (this.no == null) {
 1359                   ret += this.yes+")";
 1360               } else {
 1361                   ret += this.yes+"|"+this.no+")";
 1362               }
 1363               return ret;
 1364           }
 1365       }
 1366   
 1367       /**
 1368        * (ims-ims: .... )
 1369        */
 1370       static class ModifierToken extends Token implements java.io.Serializable {
 1371           Token child;
 1372           int add;
 1373           int mask;
 1374   
 1375           ModifierToken(Token tok, int add, int mask) {
 1376               super(Token.MODIFIERGROUP);
 1377               this.child = tok;
 1378               this.add = add;
 1379               this.mask = mask;
 1380           }
 1381   
 1382           int size() {
 1383               return 1;
 1384           }
 1385           Token getChild(int index) {
 1386               return this.child;
 1387           }
 1388   
 1389           int getOptions() {
 1390               return this.add;
 1391           }
 1392           int getOptionsMask() {
 1393               return this.mask;
 1394           }
 1395   
 1396           public String toString(int options) {
 1397               return "(?"
 1398                   +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
 1399                   +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
 1400                   +":"
 1401                   +this.child.toString(options)
 1402                   +")";
 1403           }
 1404       }
 1405   
 1406       /**
 1407        * This class represents a node in parse tree.
 1408        * for UNION or CONCAT.
 1409        */
 1410       static class UnionToken extends Token implements java.io.Serializable {
 1411           Vector children;
 1412   
 1413           UnionToken(int type) {
 1414               super(type);
 1415           }
 1416   
 1417           void addChild(Token tok) {
 1418               if (tok == null)  return;
 1419               if (this.children == null)  this.children = new Vector();
 1420               if (this.type == UNION) {
 1421                   this.children.addElement(tok);
 1422                   return;
 1423               }
 1424                                                   // This is CONCAT, and new child is CONCAT.
 1425               if (tok.type == CONCAT) {
 1426                   for (int i = 0;  i < tok.size();  i ++)
 1427                       this.addChild(tok.getChild(i)); // Recursion
 1428                   return;
 1429               }
 1430               int size = this.children.size();
 1431               if (size == 0) {
 1432                   this.children.addElement(tok);
 1433                   return;
 1434               }
 1435               Token previous = (Token)this.children.elementAt(size-1);
 1436               if (!((previous.type == CHAR || previous.type == STRING)
 1437                     && (tok.type == CHAR || tok.type == STRING))) {
 1438                   this.children.addElement(tok);
 1439                   return;
 1440               }
 1441               
 1442               //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
 1443   
 1444               StringBuffer buffer;
 1445               int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
 1446               if (previous.type == CHAR) {        // Replace previous token by STRING
 1447                   buffer = new StringBuffer(2 + nextMaxLength);
 1448                   int ch = previous.getChar();
 1449                   if (ch >= 0x10000)
 1450                       buffer.append(REUtil.decomposeToSurrogates(ch));
 1451                   else
 1452                       buffer.append((char)ch);
 1453                   previous = Token.createString(null);
 1454                   this.children.setElementAt(previous, size-1);
 1455               } else {                            // STRING
 1456                   buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
 1457                   buffer.append(previous.getString());
 1458               }
 1459   
 1460               if (tok.type == CHAR) {
 1461                   int ch = tok.getChar();
 1462                   if (ch >= 0x10000)
 1463                       buffer.append(REUtil.decomposeToSurrogates(ch));
 1464                   else
 1465                       buffer.append((char)ch);
 1466               } else {
 1467                   buffer.append(tok.getString());
 1468               }
 1469   
 1470               ((StringToken)previous).string = new String(buffer);
 1471           }
 1472   
 1473           int size() {
 1474               return this.children == null ? 0 : this.children.size();
 1475           }
 1476           Token getChild(int index) {
 1477               return (Token)this.children.elementAt(index);
 1478           }
 1479   
 1480           public String toString(int options) {
 1481               String ret;
 1482               if (this.type == CONCAT) {
 1483                   if (this.children.size() == 2) {
 1484                       Token ch = this.getChild(0);
 1485                       Token ch2 = this.getChild(1);
 1486                       if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
 1487                           ret = ch.toString(options)+"+";
 1488                       } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
 1489                           ret = ch.toString(options)+"+?";
 1490                       } else
 1491                           ret = ch.toString(options)+ch2.toString(options);
 1492                   } else {
 1493                       StringBuffer sb = new StringBuffer();
 1494                       for (int i = 0;  i < this.children.size();  i ++) {
 1495                           sb.append(((Token)this.children.elementAt(i)).toString(options));
 1496                       }
 1497                       ret = new String(sb);
 1498                   }
 1499                   return ret;
 1500               }
 1501               if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
 1502                   ret = this.getChild(0).toString(options)+"?";
 1503               } else if (this.children.size() == 2
 1504                          && this.getChild(0).type == EMPTY) {
 1505                   ret = this.getChild(1).toString(options)+"??";
 1506               } else {
 1507                   StringBuffer sb = new StringBuffer();
 1508                   sb.append(((Token)this.children.elementAt(0)).toString(options));
 1509                   for (int i = 1;  i < this.children.size();  i ++) {
 1510                       sb.append((char)'|');
 1511                       sb.append(((Token)this.children.elementAt(i)).toString(options));
 1512                   }
 1513                   ret = new String(sb);
 1514               }
 1515               return ret;
 1516           }
 1517       }
 1518   }

Save This Page
Home » xmlbeans-2.4.0-src » org.apache.xmlbeans.impl.regex » [javadoc | source]