Save This Page
Home » xmlbeans-2.4.0-src » org.apache.xmlbeans.impl.regex » [javadoc | source]
    1   /*   Copyright 2004 The Apache Software Foundation
    2    *
    3    *   Licensed under the Apache License, Version 2.0 (the "License");
    4    *   you may not use this file except in compliance with the License.
    5    *   You may obtain a copy of the License at
    6    *
    7    *       http://www.apache.org/licenses/LICENSE-2.0
    8    *
    9    *   Unless required by applicable law or agreed to in writing, software
   10    *   distributed under the License is distributed on an "AS IS" BASIS,
   11    *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   12    *   See the License for the specific language governing permissions and
   13    *  limitations under the License.
   14    */
   15   
   16   package org.apache.xmlbeans.impl.regex;
   17   
   18   /**
   19    * This class represents a character class such as [a-z] or a period.
   20    */
   21   final class RangeToken extends Token implements java.io.Serializable {
   22   
   23       int[] ranges;
   24       boolean sorted;
   25       boolean compacted;
   26       RangeToken icaseCache = null;
   27       int[] map = null;
   28       int nonMapIndex;
   29   
   30       RangeToken(int type) {
   31           super(type);
   32           this.setSorted(false);
   33       }
   34   
   35                                                   // for RANGE or NRANGE
   36       protected void addRange(int start, int end) {
   37           this.icaseCache = null;
   38           //System.err.println("Token#addRange(): "+start+" "+end);
   39           int r1, r2;
   40           if (start <= end) {
   41               r1 = start;
   42               r2 = end;
   43           } else {
   44               r1 = end;
   45               r2 = start;
   46           }
   47   
   48           int pos = 0;
   49           if (this.ranges == null) {
   50               this.ranges = new int[2];
   51               this.ranges[0] = r1;
   52               this.ranges[1] = r2;
   53               this.setSorted(true);
   54           } else {
   55               pos = this.ranges.length;
   56               if (this.ranges[pos-1]+1 == r1) {
   57                   this.ranges[pos-1] = r2;
   58                   return;
   59               }
   60               int[] temp = new int[pos+2];
   61               System.arraycopy(this.ranges, 0, temp, 0, pos);
   62               this.ranges = temp;
   63               if (this.ranges[pos-1] >= r1)
   64                   this.setSorted(false);
   65               this.ranges[pos++] = r1;
   66               this.ranges[pos] = r2;
   67               if (!this.sorted)
   68                   this.sortRanges();
   69           }
   70       }
   71   
   72       private final boolean isSorted() {
   73           return this.sorted;
   74       }
   75       private final void setSorted(boolean sort) {
   76           this.sorted = sort;
   77           if (!sort)  this.compacted = false;
   78       }
   79       private final boolean isCompacted() {
   80           return this.compacted;
   81       }
   82       private final void setCompacted() {
   83           this.compacted = true;
   84       }
   85   
   86       protected void sortRanges() {
   87           if (this.isSorted())
   88               return;
   89           if (this.ranges == null)
   90               return;
   91           //System.err.println("Do sorting: "+this.ranges.length);
   92   
   93                                                   // Bubble sort
   94                                                   // Why? -- In many cases,
   95                                                   //         this.ranges has few elements.
   96           for (int i = this.ranges.length-4;  i >= 0;  i -= 2) {
   97               for (int j = 0;  j <= i;  j += 2) {
   98                   if (this.ranges[j] > this.ranges[j+2]
   99                       || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
  100                       int tmp;
  101                       tmp = this.ranges[j+2];
  102                       this.ranges[j+2] = this.ranges[j];
  103                       this.ranges[j] = tmp;
  104                       tmp = this.ranges[j+3];
  105                       this.ranges[j+3] = this.ranges[j+1];
  106                       this.ranges[j+1] = tmp;
  107                   }
  108               }
  109           }
  110           this.setSorted(true);
  111       }
  112   
  113       /**
  114        * this.ranges is sorted.
  115        */
  116       protected void compactRanges() {
  117           boolean DEBUG = false;
  118           if (this.ranges == null || this.ranges.length <= 2)
  119               return;
  120           if (this.isCompacted())
  121               return;
  122           int base = 0;                           // Index of writing point
  123           int target = 0;                         // Index of processing point
  124   
  125           while (target < this.ranges.length) {
  126               if (base != target) {
  127                   this.ranges[base] = this.ranges[target++];
  128                   this.ranges[base+1] = this.ranges[target++];
  129               } else
  130                   target += 2;
  131               int baseend = this.ranges[base+1];
  132               while (target < this.ranges.length) {
  133                   if (baseend+1 < this.ranges[target])
  134                       break;
  135                   if (baseend+1 == this.ranges[target]) {
  136                       if (DEBUG)
  137                           System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
  138                                              +", "+this.ranges[base+1]
  139                                              +"], ["+this.ranges[target]
  140                                              +", "+this.ranges[target+1]
  141                                              +"] -> ["+this.ranges[base]
  142                                              +", "+this.ranges[target+1]
  143                                              +"]");
  144                       this.ranges[base+1] = this.ranges[target+1];
  145                       baseend = this.ranges[base+1];
  146                       target += 2;
  147                   } else if (baseend >= this.ranges[target+1]) {
  148                       if (DEBUG)
  149                           System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
  150                                              +", "+this.ranges[base+1]
  151                                              +"], ["+this.ranges[target]
  152                                              +", "+this.ranges[target+1]
  153                                              +"] -> ["+this.ranges[base]
  154                                              +", "+this.ranges[base+1]
  155                                              +"]");
  156                       target += 2;
  157                   } else if (baseend < this.ranges[target+1]) {
  158                       if (DEBUG)
  159                           System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
  160                                              +", "+this.ranges[base+1]
  161                                              +"], ["+this.ranges[target]
  162                                              +", "+this.ranges[target+1]
  163                                              +"] -> ["+this.ranges[base]
  164                                              +", "+this.ranges[target+1]
  165                                              +"]");
  166                       this.ranges[base+1] = this.ranges[target+1];
  167                       baseend = this.ranges[base+1];
  168                       target += 2;
  169                   } else {
  170                       throw new RuntimeException("Token#compactRanges(): Internel Error: ["
  171                                                  +this.ranges[base]
  172                                                  +","+this.ranges[base+1]
  173                                                  +"] ["+this.ranges[target]
  174                                                  +","+this.ranges[target+1]+"]");
  175                   }
  176               } // while
  177               base += 2;
  178           }
  179   
  180           if (base != this.ranges.length) {
  181               int[] result = new int[base];
  182               System.arraycopy(this.ranges, 0, result, 0, base);
  183               this.ranges = result;
  184           }
  185           this.setCompacted();
  186       }
  187   
  188       protected void mergeRanges(Token token) {
  189           RangeToken tok = (RangeToken)token;
  190           this.sortRanges();
  191           tok.sortRanges();
  192           if (tok.ranges == null)
  193               return;
  194           this.icaseCache = null;
  195           this.setSorted(true);
  196           if (this.ranges == null) {
  197               this.ranges = new int[tok.ranges.length];
  198               System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
  199               return;
  200           }
  201           int[] result = new int[this.ranges.length+tok.ranges.length];
  202           for (int i = 0, j = 0, k = 0;  i < this.ranges.length || j < tok.ranges.length;) {
  203               if (i >= this.ranges.length) {
  204                   result[k++] = tok.ranges[j++];
  205                   result[k++] = tok.ranges[j++];
  206               } else if (j >= tok.ranges.length) {
  207                   result[k++] = this.ranges[i++];
  208                   result[k++] = this.ranges[i++];
  209               } else if (tok.ranges[j] < this.ranges[i]
  210                          || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
  211                   result[k++] = tok.ranges[j++];
  212                   result[k++] = tok.ranges[j++];
  213               } else {
  214                   result[k++] = this.ranges[i++];
  215                   result[k++] = this.ranges[i++];
  216               }
  217           }
  218           this.ranges = result;
  219       }
  220   
  221       protected void subtractRanges(Token token) {
  222           if (token.type == NRANGE) {
  223               this.intersectRanges(token);
  224               return;
  225           }
  226           RangeToken tok = (RangeToken)token;
  227           if (tok.ranges == null || this.ranges == null)
  228               return;
  229           this.icaseCache = null;
  230           this.sortRanges();
  231           this.compactRanges();
  232           tok.sortRanges();
  233           tok.compactRanges();
  234   
  235           //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
  236   
  237           int[] result = new int[this.ranges.length+tok.ranges.length];
  238           int wp = 0, src = 0, sub = 0;
  239           while (src < this.ranges.length && sub < tok.ranges.length) {
  240               int srcbegin = this.ranges[src];
  241               int srcend = this.ranges[src+1];
  242               int subbegin = tok.ranges[sub];
  243               int subend = tok.ranges[sub+1];
  244               if (srcend < subbegin) {            // Not overlapped
  245                                                   // src: o-----o
  246                                                   // sub:         o-----o
  247                                                   // res: o-----o
  248                                                   // Reuse sub
  249                   result[wp++] = this.ranges[src++];
  250                   result[wp++] = this.ranges[src++];
  251               } else if (srcend >= subbegin
  252                          && srcbegin <= subend) { // Overlapped
  253                                                   // src:    o--------o
  254                                                   // sub:  o----o
  255                                                   // sub:      o----o
  256                                                   // sub:          o----o
  257                                                   // sub:  o------------o
  258                   if (subbegin <= srcbegin && srcend <= subend) {
  259                                                   // src:    o--------o
  260                                                   // sub:  o------------o
  261                                                   // res: empty
  262                                                   // Reuse sub
  263                       src += 2;
  264                   } else if (subbegin <= srcbegin) {
  265                                                   // src:    o--------o
  266                                                   // sub:  o----o
  267                                                   // res:       o-----o
  268                                                   // Reuse src(=res)
  269                       this.ranges[src] = subend+1;
  270                       sub += 2;
  271                   } else if (srcend <= subend) {
  272                                                   // src:    o--------o
  273                                                   // sub:          o----o
  274                                                   // res:    o-----o
  275                                                   // Reuse sub
  276                       result[wp++] = srcbegin;
  277                       result[wp++] = subbegin-1;
  278                       src += 2;
  279                   } else {
  280                                                   // src:    o--------o
  281                                                   // sub:      o----o
  282                                                   // res:    o-o    o-o
  283                                                   // Reuse src(=right res)
  284                       result[wp++] = srcbegin;
  285                       result[wp++] = subbegin-1;
  286                       this.ranges[src] = subend+1;
  287                       sub += 2;
  288                   }
  289               } else if (subend < srcbegin) {
  290                                                   // Not overlapped
  291                                                   // src:          o-----o
  292                                                   // sub: o----o
  293                   sub += 2;
  294               } else {
  295                   throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
  296                                              +","+this.ranges[src+1]
  297                                              +"] - ["+tok.ranges[sub]
  298                                              +","+tok.ranges[sub+1]
  299                                              +"]");
  300               }
  301           }
  302           while (src < this.ranges.length) {
  303               result[wp++] = this.ranges[src++];
  304               result[wp++] = this.ranges[src++];
  305           }
  306           this.ranges = new int[wp];
  307           System.arraycopy(result, 0, this.ranges, 0, wp);
  308                                                   // this.ranges is sorted and compacted.
  309       }
  310   
  311       /**
  312        * @param tok Ignore whether it is NRANGE or not.
  313        */
  314       protected void intersectRanges(Token token) {
  315           RangeToken tok = (RangeToken)token;
  316           if (tok.ranges == null || this.ranges == null)
  317               return;
  318           this.icaseCache = null;
  319           this.sortRanges();
  320           this.compactRanges();
  321           tok.sortRanges();
  322           tok.compactRanges();
  323   
  324           int[] result = new int[this.ranges.length+tok.ranges.length];
  325           int wp = 0, src1 = 0, src2 = 0;
  326           while (src1 < this.ranges.length && src2 < tok.ranges.length) {
  327               int src1begin = this.ranges[src1];
  328               int src1end = this.ranges[src1+1];
  329               int src2begin = tok.ranges[src2];
  330               int src2end = tok.ranges[src2+1];
  331               if (src1end < src2begin) {          // Not overlapped
  332                                                   // src1: o-----o
  333                                                   // src2:         o-----o
  334                                                   // res:  empty
  335                                                   // Reuse src2
  336                   src1 += 2;
  337               } else if (src1end >= src2begin
  338                          && src1begin <= src2end) { // Overlapped
  339                                                   // src1:    o--------o
  340                                                   // src2:  o----o
  341                                                   // src2:      o----o
  342                                                   // src2:          o----o
  343                                                   // src2:  o------------o
  344                   if (src2begin <= src2begin && src1end <= src2end) {
  345                                                   // src1:    o--------o
  346                                                   // src2:  o------------o
  347                                                   // res:     o--------o
  348                                                   // Reuse src2
  349                       result[wp++] = src1begin;
  350                       result[wp++] = src1end;
  351                       src1 += 2;
  352                   } else if (src2begin <= src1begin) {
  353                                                   // src1:    o--------o
  354                                                   // src2:  o----o
  355                                                   // res:     o--o
  356                                                   // Reuse the rest of src1
  357                       result[wp++] = src1begin;
  358                       result[wp++] = src2end;
  359                       this.ranges[src1] = src2end+1;
  360                       src2 += 2;
  361                   } else if (src1end <= src2end) {
  362                                                   // src1:    o--------o
  363                                                   // src2:          o----o
  364                                                   // res:           o--o
  365                                                   // Reuse src2
  366                       result[wp++] = src2begin;
  367                       result[wp++] = src1end;
  368                       src1 += 2;
  369                   } else {
  370                                                   // src1:    o--------o
  371                                                   // src2:      o----o
  372                                                   // res:       o----o
  373                                                   // Reuse the rest of src1
  374                       result[wp++] = src2begin;
  375                       result[wp++] = src2end;
  376                       this.ranges[src1] = src2end+1;
  377                   }
  378               } else if (src2end < src1begin) {
  379                                                   // Not overlapped
  380                                                   // src1:          o-----o
  381                                                   // src2: o----o
  382                   src2 += 2;
  383               } else {
  384                   throw new RuntimeException("Token#intersectRanges(): Internal Error: ["
  385                                              +this.ranges[src1]
  386                                              +","+this.ranges[src1+1]
  387                                              +"] & ["+tok.ranges[src2]
  388                                              +","+tok.ranges[src2+1]
  389                                              +"]");
  390               }
  391           }
  392           while (src1 < this.ranges.length) {
  393               result[wp++] = this.ranges[src1++];
  394               result[wp++] = this.ranges[src1++];
  395           }
  396           this.ranges = new int[wp];
  397           System.arraycopy(result, 0, this.ranges, 0, wp);
  398                                                   // this.ranges is sorted and compacted.
  399       }
  400   
  401       /**
  402        * for RANGE: Creates complement.
  403        * for NRANGE: Creates the same meaning RANGE.
  404        */
  405       static Token complementRanges(Token token) {
  406           if (token.type != RANGE && token.type != NRANGE)
  407               throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type);
  408           RangeToken tok = (RangeToken)token;
  409           tok.sortRanges();
  410           tok.compactRanges();
  411           int len = tok.ranges.length+2;
  412           if (tok.ranges[0] == 0)
  413               len -= 2;
  414           int last = tok.ranges[tok.ranges.length-1];
  415           if (last == UTF16_MAX)
  416               len -= 2;
  417           RangeToken ret = Token.createRange();
  418           ret.ranges = new int[len];
  419           int wp = 0;
  420           if (tok.ranges[0] > 0) {
  421               ret.ranges[wp++] = 0;
  422               ret.ranges[wp++] = tok.ranges[0]-1;
  423           }
  424           for (int i = 1;  i < tok.ranges.length-2;  i += 2) {
  425               ret.ranges[wp++] = tok.ranges[i]+1;
  426               ret.ranges[wp++] = tok.ranges[i+1]-1;
  427           }
  428           if (last != UTF16_MAX) {
  429               ret.ranges[wp++] = last+1;
  430               ret.ranges[wp] = UTF16_MAX;
  431           }
  432           ret.setCompacted();
  433           return ret;
  434       }
  435   
  436       synchronized RangeToken getCaseInsensitiveToken() {
  437           if (this.icaseCache != null)
  438               return this.icaseCache;
  439               
  440           RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
  441           for (int i = 0;  i < this.ranges.length;  i += 2) {
  442               for (int ch = this.ranges[i];  ch <= this.ranges[i+1];  ch ++) {
  443                   if (ch > 0xffff)
  444                       uppers.addRange(ch, ch);
  445                   else {
  446                       char uch = Character.toUpperCase((char)ch);
  447                       uppers.addRange(uch, uch);
  448                   }
  449               }
  450           }
  451           RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
  452           for (int i = 0;  i < uppers.ranges.length;  i += 2) {
  453               for (int ch = uppers.ranges[i];  ch <= uppers.ranges[i+1];  ch ++) {
  454                   if (ch > 0xffff)
  455                       lowers.addRange(ch, ch);
  456                   else {
  457                       char uch = Character.toUpperCase((char)ch);
  458                       lowers.addRange(uch, uch);
  459                   }
  460               }
  461           }
  462           lowers.mergeRanges(uppers);
  463           lowers.mergeRanges(this);
  464           lowers.compactRanges();
  465   
  466           this.icaseCache = lowers;
  467           return lowers;
  468       }
  469   
  470       void dumpRanges() {
  471           System.err.print("RANGE: ");
  472           if (this.ranges == null)
  473               System.err.println(" NULL");
  474           for (int i = 0;  i < this.ranges.length;  i += 2) {
  475               System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
  476           }
  477           System.err.println("");
  478       }
  479   
  480       boolean match(int ch) {
  481           if (this.map == null)  this.createMap();
  482           boolean ret;
  483           if (this.type == RANGE) {
  484               if (ch < MAPSIZE)
  485                   return (this.map[ch/32] & (1<<(ch&0x1f))) != 0;
  486               ret = false;
  487               for (int i = this.nonMapIndex;  i < this.ranges.length;  i += 2) {
  488                   if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
  489                       return true;
  490               }
  491           } else {
  492               if (ch < MAPSIZE)
  493                   return (this.map[ch/32] & (1<<(ch&0x1f))) == 0;
  494               ret = true;
  495               for (int i = this.nonMapIndex;  i < this.ranges.length;  i += 2) {
  496                   if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
  497                       return false;
  498               }
  499           }
  500           return ret;
  501       }
  502   
  503       private static final int MAPSIZE = 256;
  504       private void createMap() {
  505           int asize = MAPSIZE/32;                 // 32 is the number of bits in `int'.
  506           // CHANGE(radup) we need a new map, since this is not synchronized
  507           // and if we init the instance map with 0's it's going to be trouble
  508           // -this.map = new int[asize];
  509           // -this.nonMapIndex = this.ranges.length;
  510           // -for (int i = 0; i < asize; i++) this.map[i] = 0;
  511           int[] localmap = new int[asize]; // +
  512           int localnonMapIndex = this.ranges.length; // +
  513           for (int i = 0;  i < asize;  i ++)  localmap[i] = 0; // + redundant
  514           for (int i = 0;  i < this.ranges.length;  i += 2) {
  515               int s = this.ranges[i];
  516               int e = this.ranges[i+1];
  517               if (s < MAPSIZE) {
  518                   for (int j = s;  j <= e && j < MAPSIZE;  j ++)
  519                       localmap[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31
  520               } else {
  521                   localnonMapIndex = i;
  522                   break;
  523               }
  524               if (e >= MAPSIZE) {
  525                   localnonMapIndex = i;
  526                   break;
  527               }
  528           }
  529           this.nonMapIndex = localnonMapIndex; // +
  530           this.map = localmap; // +
  531           //for (int i = 0;  i < asize;  i ++)  System.err.println("Map: "+Integer.toString(this.map[i], 16));
  532       }
  533   
  534       public String toString(int options) {
  535           String ret;
  536           if (this.type == RANGE) {
  537               if (this == Token.token_dot)
  538                   ret = ".";
  539               else if (this == Token.token_0to9)
  540                   ret = "\\d";
  541               else if (this == Token.token_wordchars)
  542                   ret = "\\w";
  543               else if (this == Token.token_spaces)
  544                   ret = "\\s";
  545               else {
  546                   StringBuffer sb = new StringBuffer();
  547                   sb.append("[");
  548                   for (int i = 0;  i < this.ranges.length;  i += 2) {
  549                       if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0)  sb.append(",");
  550                       if (this.ranges[i] == this.ranges[i+1]) {
  551                           sb.append(escapeCharInCharClass(this.ranges[i]));
  552                       } else {
  553                           sb.append(escapeCharInCharClass(this.ranges[i]));
  554                           sb.append((char)'-');
  555                           sb.append(escapeCharInCharClass(this.ranges[i+1]));
  556                       }
  557                   }
  558                   sb.append("]");
  559                   ret = sb.toString();
  560               }
  561           } else {
  562               if (this == Token.token_not_0to9)
  563                   ret = "\\D";
  564               else if (this == Token.token_not_wordchars)
  565                   ret = "\\W";
  566               else if (this == Token.token_not_spaces)
  567                   ret = "\\S";
  568               else {
  569                   StringBuffer sb = new StringBuffer();
  570                   sb.append("[^");
  571                   for (int i = 0;  i < this.ranges.length;  i += 2) {
  572                       if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0)  sb.append(",");
  573                       if (this.ranges[i] == this.ranges[i+1]) {
  574                           sb.append(escapeCharInCharClass(this.ranges[i]));
  575                       } else {
  576                           sb.append(escapeCharInCharClass(this.ranges[i]));
  577                           sb.append('-');
  578                           sb.append(escapeCharInCharClass(this.ranges[i+1]));
  579                       }
  580                   }
  581                   sb.append("]");
  582                   ret = sb.toString();
  583               }
  584           }
  585           return ret;
  586       }
  587   
  588       private static String escapeCharInCharClass(int ch) {
  589           String ret;
  590           switch (ch) {
  591             case '[':  case ']':  case '-':  case '^':
  592             case ',':  case '\\':
  593               ret = "\\"+(char)ch;
  594               break;
  595             case '\f':  ret = "\\f";  break;
  596             case '\n':  ret = "\\n";  break;
  597             case '\r':  ret = "\\r";  break;
  598             case '\t':  ret = "\\t";  break;
  599             case 0x1b:  ret = "\\e";  break;
  600             //case 0x0b:  ret = "\\v";  break;
  601             default:
  602               if (ch < 0x20) {
  603                   String pre = "0"+Integer.toHexString(ch);
  604                   ret = "\\x"+pre.substring(pre.length()-2, pre.length());
  605               } else if (ch >= 0x10000) {
  606                   String pre = "0"+Integer.toHexString(ch);
  607                   ret = "\\v"+pre.substring(pre.length()-6, pre.length());
  608               } else
  609                   ret = ""+(char)ch;
  610           }
  611           return ret;
  612       }
  613   
  614   }

Save This Page
Home » xmlbeans-2.4.0-src » org.apache.xmlbeans.impl.regex » [javadoc | source]