Save This Page
Home » xmlbeans-2.4.0-src » org.apache.xmlbeans.impl.regex » [javadoc | source]
    1   /*   Copyright 2004 The Apache Software Foundation
    2    *
    3    *   Licensed under the Apache License, Version 2.0 (the "License");
    4    *   you may not use this file except in compliance with the License.
    5    *   You may obtain a copy of the License at
    6    *
    7    *       http://www.apache.org/licenses/LICENSE-2.0
    8    *
    9    *   Unless required by applicable law or agreed to in writing, software
   10    *   distributed under the License is distributed on an "AS IS" BASIS,
   11    *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   12    *   See the License for the specific language governing permissions and
   13    *  limitations under the License.
   14    */
   15   
   16   package org.apache.xmlbeans.impl.regex;
   17   
   18   import java.util.Locale;
   19   import java.util.MissingResourceException;
   20   import java.util.ResourceBundle;
   21   import java.util.Vector;
   22   
   23   /**
   24    * A Regular Expression Parser.
   25    */
   26   class RegexParser {
   27       static final int T_CHAR = 0;
   28       static final int T_EOF = 1;
   29       static final int T_OR = 2;                  // '|'
   30       static final int T_STAR = 3;                // '*'
   31       static final int T_PLUS = 4;                // '+'
   32       static final int T_QUESTION = 5;            // '?'
   33       static final int T_LPAREN = 6;              // '('
   34       static final int T_RPAREN = 7;              // ')'
   35       static final int T_DOT = 8;                 // '.'
   36       static final int T_LBRACKET = 9;            // '['
   37       static final int T_BACKSOLIDUS = 10;        // '\'
   38       static final int T_CARET = 11;              // '^'
   39       static final int T_DOLLAR = 12;             // '$'
   40       static final int T_LPAREN2 = 13;            // '(?:'
   41       static final int T_LOOKAHEAD = 14;          // '(?='
   42       static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
   43       static final int T_LOOKBEHIND = 16;         // '(?<='
   44       static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
   45       static final int T_INDEPENDENT = 18;        // '(?>'
   46       static final int T_SET_OPERATIONS = 19;     // '(?['
   47       static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
   48       static final int T_COMMENT = 21;            // '(?#'
   49       static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
   50       static final int T_CONDITION = 23;          // '(?('
   51       static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
   52   
   53       static class ReferencePosition {
   54           int refNumber;
   55           int position;
   56           ReferencePosition(int n, int pos) {
   57               this.refNumber = n;
   58               this.position = pos;
   59           }
   60       }
   61   
   62       int offset;
   63       String regex;
   64       int regexlen;
   65       int options;
   66       ResourceBundle resources;
   67       int chardata;
   68       int nexttoken;
   69       static protected final int S_NORMAL = 0;
   70       static protected final int S_INBRACKETS = 1;
   71       static protected final int S_INXBRACKETS = 2;
   72       int context = S_NORMAL;
   73       int parennumber = 1;
   74       boolean hasBackReferences;
   75       Vector references = null;
   76   
   77       public RegexParser() {
   78           this.setLocale(Locale.getDefault());
   79       }
   80       public RegexParser(Locale locale) {
   81           this.setLocale(locale);
   82       }
   83   
   84       public void setLocale(Locale locale) {
   85           try {
   86               this.resources = ResourceBundle.getBundle("org.apache.xmlbeans.impl.regex.message", locale);
   87           } catch (MissingResourceException mre) {
   88               throw new RuntimeException("Installation Problem???  Couldn't load messages: "
   89                                          +mre.getMessage());
   90           }
   91       }
   92   
   93       final ParseException ex(String key, int loc) {
   94           return new ParseException(this.resources.getString(key), loc);
   95       }
   96   
   97       private final boolean isSet(int flag) {
   98           return (this.options & flag) == flag;
   99       }
  100   
  101       synchronized Token parse(String regex, int options) throws ParseException {
  102           this.options = options;
  103           this.offset = 0;
  104           this.setContext(S_NORMAL);
  105           this.parennumber = 1;
  106           this.hasBackReferences = false;
  107           this.regex = regex;
  108           if (this.isSet(RegularExpression.EXTENDED_COMMENT))
  109               this.regex = REUtil.stripExtendedComment(this.regex);
  110           this.regexlen = this.regex.length();
  111   
  112   
  113           this.next();
  114           Token ret = this.parseRegex();
  115           if (this.offset != this.regexlen)
  116               throw ex("parser.parse.1", this.offset);
  117           if (this.references != null) {
  118               for (int i = 0;  i < this.references.size();  i ++) {
  119                   ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
  120                   if (this.parennumber <= position.refNumber)
  121                       throw ex("parser.parse.2", position.position);
  122               }
  123               this.references.removeAllElements();
  124           }
  125           return ret;
  126       }
  127   
  128       /*
  129       public RegularExpression createRegex(String regex, int options) throws ParseException {
  130           Token tok = this.parse(regex, options);
  131           return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
  132       }
  133       */
  134   
  135       protected final void setContext(int con) {
  136           this.context = con;
  137       }
  138   
  139       final int read() {
  140           return this.nexttoken;
  141       }
  142   
  143       final void next() {
  144           if (this.offset >= this.regexlen) {
  145               this.chardata = -1;
  146               this.nexttoken = T_EOF;
  147               return;
  148           }
  149   
  150           int ret;
  151           int ch = this.regex.charAt(this.offset++);
  152           this.chardata = ch;
  153   
  154           if (this.context == S_INBRACKETS) {
  155               // In a character class, this.chardata has one character, that is to say,
  156               // a pair of surrogates is composed and stored to this.chardata.
  157               switch (ch) {
  158                 case '\\':
  159                   ret = T_BACKSOLIDUS;
  160                   if (this.offset >= this.regexlen)
  161                       throw ex("parser.next.1", this.offset-1);
  162                   this.chardata = this.regex.charAt(this.offset++);
  163                   break;
  164   
  165                 case '-':
  166                   if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
  167                       && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
  168                       this.offset++;
  169                       ret = T_XMLSCHEMA_CC_SUBTRACTION;
  170                   } else
  171                       ret = T_CHAR;
  172                   break;
  173   
  174                 case '[':
  175                   if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
  176                       && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
  177                       this.offset++;
  178                       ret = T_POSIX_CHARCLASS_START;
  179                       break;
  180                   } // Through down
  181                 default:
  182                   if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
  183                       int low = this.regex.charAt(this.offset);
  184                       if (REUtil.isLowSurrogate(low)) {
  185                           this.chardata = REUtil.composeFromSurrogates(ch, low);
  186                           this.offset ++;
  187                       }
  188                   }
  189                   ret = T_CHAR;
  190               }
  191               this.nexttoken = ret;
  192               return;
  193           }
  194   
  195           switch (ch) {
  196             case '|': ret = T_OR;             break;
  197             case '*': ret = T_STAR;           break;
  198             case '+': ret = T_PLUS;           break;
  199             case '?': ret = T_QUESTION;       break;
  200             case ')': ret = T_RPAREN;         break;
  201             case '.': ret = T_DOT;            break;
  202             case '[': ret = T_LBRACKET;       break;
  203             case '^': ret = T_CARET;          break;
  204             case '$': ret = T_DOLLAR;         break;
  205             case '(':
  206               ret = T_LPAREN;
  207               if (this.offset >= this.regexlen)
  208                   break;
  209               if (this.regex.charAt(this.offset) != '?')
  210                   break;
  211               if (++this.offset >= this.regexlen)
  212                   throw ex("parser.next.2", this.offset-1);
  213               ch = this.regex.charAt(this.offset++);
  214               switch (ch) {
  215                 case ':':  ret = T_LPAREN2;            break;
  216                 case '=':  ret = T_LOOKAHEAD;          break;
  217                 case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
  218                 case '[':  ret = T_SET_OPERATIONS;     break;
  219                 case '>':  ret = T_INDEPENDENT;        break;
  220                 case '<':
  221                   if (this.offset >= this.regexlen)
  222                       throw ex("parser.next.2", this.offset-3);
  223                   ch = this.regex.charAt(this.offset++);
  224                   if (ch == '=') {
  225                       ret = T_LOOKBEHIND;
  226                   } else if (ch == '!') {
  227                       ret = T_NEGATIVELOOKBEHIND;
  228                   } else
  229                       throw ex("parser.next.3", this.offset-3);
  230                   break;
  231                 case '#':
  232                   while (this.offset < this.regexlen) {
  233                       ch = this.regex.charAt(this.offset++);
  234                       if (ch == ')')  break;
  235                   }
  236                   if (ch != ')')
  237                       throw ex("parser.next.4", this.offset-1);
  238                   ret = T_COMMENT;
  239                   break;
  240                 default:
  241                   if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
  242                       this.offset --;
  243                       ret = T_MODIFIERS;
  244                       break;
  245                   } else if (ch == '(') {         // conditional
  246                       ret = T_CONDITION;          // this.offsets points the next of '('.
  247                       break;
  248                   }
  249                   throw ex("parser.next.2", this.offset-2);
  250               }
  251               break;
  252               
  253             case '\\':
  254               ret = T_BACKSOLIDUS;
  255               if (this.offset >= this.regexlen)
  256                   throw ex("parser.next.1", this.offset-1);
  257               this.chardata = this.regex.charAt(this.offset++);
  258               break;
  259   
  260             default:
  261               ret = T_CHAR;
  262           }
  263           this.nexttoken = ret;
  264       }
  265   
  266       /**
  267        * regex ::= term (`|` term)*
  268        * term ::= factor+
  269        * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
  270        *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
  271        *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
  272        * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
  273        *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block 
  274        */
  275       Token parseRegex() throws ParseException {
  276           Token tok = this.parseTerm();
  277           Token parent = null;
  278           while (this.read() == T_OR) {
  279               this.next();                    // '|'
  280               if (parent == null) {
  281                   parent = Token.createUnion();
  282                   parent.addChild(tok);
  283                   tok = parent;
  284               }
  285               tok.addChild(this.parseTerm());
  286           }
  287           return tok;
  288       }
  289   
  290       /**
  291        * term ::= factor+
  292        */
  293       Token parseTerm() throws ParseException {
  294           int ch = this.read();
  295           if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
  296               return Token.createEmpty();
  297           } else {
  298               Token tok = this.parseFactor();
  299               Token concat = null;
  300               while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
  301                   if (concat == null) {
  302                       concat = Token.createConcat();
  303                       concat.addChild(tok);
  304                       tok = concat;
  305                   }
  306                   concat.addChild(this.parseFactor());
  307                   //tok = Token.createConcat(tok, this.parseFactor());
  308               }
  309               return tok;
  310           }
  311       }
  312   
  313       // ----------------------------------------------------------------
  314   
  315       Token processCaret() throws ParseException {
  316           this.next();
  317           return Token.token_linebeginning;
  318       }
  319       Token processDollar() throws ParseException {
  320           this.next();
  321           return Token.token_lineend;
  322       }
  323       Token processLookahead() throws ParseException {
  324           this.next();
  325           Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
  326           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  327           this.next();                            // ')'
  328           return tok;
  329       }
  330       Token processNegativelookahead() throws ParseException {
  331           this.next();
  332           Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
  333           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  334           this.next();                            // ')'
  335           return tok;
  336       }
  337       Token processLookbehind() throws ParseException {
  338           this.next();
  339           Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
  340           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  341           this.next();                            // ')'
  342           return tok;
  343       }
  344       Token processNegativelookbehind() throws ParseException {
  345           this.next();
  346           Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
  347           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  348           this.next();                    // ')'
  349           return tok;
  350       }
  351       Token processBacksolidus_A() throws ParseException {
  352           this.next();
  353           return Token.token_stringbeginning;
  354       }
  355       Token processBacksolidus_Z() throws ParseException {
  356           this.next();
  357           return Token.token_stringend2;
  358       }
  359       Token processBacksolidus_z() throws ParseException {
  360           this.next();
  361           return Token.token_stringend;
  362       }
  363       Token processBacksolidus_b() throws ParseException {
  364           this.next();
  365           return Token.token_wordedge;
  366       }
  367       Token processBacksolidus_B() throws ParseException {
  368           this.next();
  369           return Token.token_not_wordedge;
  370       }
  371       Token processBacksolidus_lt() throws ParseException {
  372           this.next();
  373           return Token.token_wordbeginning;
  374       }
  375       Token processBacksolidus_gt() throws ParseException {
  376           this.next();
  377           return Token.token_wordend;
  378       }
  379       Token processStar(Token tok) throws ParseException {
  380           this.next();
  381           if (this.read() == T_QUESTION) {
  382               this.next();
  383               return Token.createNGClosure(tok);
  384           } else
  385               return Token.createClosure(tok);
  386       }
  387       Token processPlus(Token tok) throws ParseException {
  388           // X+ -> XX*
  389           this.next();
  390           if (this.read() == T_QUESTION) {
  391               this.next();
  392               return Token.createConcat(tok, Token.createNGClosure(tok));
  393           } else
  394               return Token.createConcat(tok, Token.createClosure(tok));
  395       }
  396       Token processQuestion(Token tok) throws ParseException {
  397           // X? -> X|
  398           this.next();
  399           Token par = Token.createUnion();
  400           if (this.read() == T_QUESTION) {
  401               this.next();
  402               par.addChild(Token.createEmpty());
  403               par.addChild(tok);
  404           } else {
  405               par.addChild(tok);
  406               par.addChild(Token.createEmpty());
  407           }
  408           return par;
  409       }
  410       boolean checkQuestion(int off) {
  411           return off < this.regexlen && this.regex.charAt(off) == '?';
  412       }
  413       Token processParen() throws ParseException {
  414           this.next();
  415           int p = this.parennumber++;
  416           Token tok = Token.createParen(this.parseRegex(), p);
  417           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  418           this.next();                            // Skips ')'
  419           return tok;
  420       }
  421       Token processParen2() throws ParseException {
  422           this.next();
  423           Token tok = Token.createParen(this.parseRegex(), 0);
  424           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  425           this.next();                            // Skips ')'
  426           return tok;
  427       }
  428       Token processCondition() throws ParseException {
  429                                                   // this.offset points the next of '('
  430           if (this.offset+1 >= this.regexlen)  throw ex("parser.factor.4", this.offset);
  431                                                   // Parses a condition.
  432           int refno = -1;
  433           Token condition = null;
  434           int ch = this.regex.charAt(this.offset);
  435           if ('1' <= ch && ch <= '9') {
  436               refno = ch-'0';
  437               this.hasBackReferences = true;
  438               if (this.references == null)  this.references = new Vector();
  439               this.references.addElement(new ReferencePosition(refno, this.offset));
  440               this.offset ++;
  441               if (this.regex.charAt(this.offset) != ')')  throw ex("parser.factor.1", this.offset);
  442               this.offset ++;
  443           } else {
  444               if (ch == '?')  this.offset --; // Points '('.
  445               this.next();
  446               condition = this.parseFactor();
  447               switch (condition.type) {
  448                 case Token.LOOKAHEAD:
  449                 case Token.NEGATIVELOOKAHEAD:
  450                 case Token.LOOKBEHIND:
  451                 case Token.NEGATIVELOOKBEHIND:
  452                   break;
  453                 case Token.ANCHOR:
  454                   if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  455                   break;
  456                 default:
  457                   throw ex("parser.factor.5", this.offset);
  458               }
  459           }
  460                                                   // Parses yes/no-patterns.
  461           this.next();
  462           Token yesPattern = this.parseRegex();
  463           Token noPattern = null;
  464           if (yesPattern.type == Token.UNION) {
  465               if (yesPattern.size() != 2)  throw ex("parser.factor.6", this.offset);
  466               noPattern = yesPattern.getChild(1);
  467               yesPattern = yesPattern.getChild(0);
  468           }
  469           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  470           this.next();
  471           return Token.createCondition(refno, condition, yesPattern, noPattern);
  472       }
  473       Token processModifiers() throws ParseException {
  474                                                   // this.offset points the next of '?'.
  475                                                   // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
  476           int add = 0, mask = 0, ch = -1;
  477           while (this.offset < this.regexlen) {
  478               ch = this.regex.charAt(this.offset);
  479               int v = REUtil.getOptionValue(ch);
  480               if (v == 0)  break;                 // '-' or ':'?
  481               add |= v;
  482               this.offset ++;
  483           }
  484           if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
  485           if (ch == '-') {
  486               this.offset ++;
  487               while (this.offset < this.regexlen) {
  488                   ch = this.regex.charAt(this.offset);
  489                   int v = REUtil.getOptionValue(ch);
  490                   if (v == 0)  break;             // ':'?
  491                   mask |= v;
  492                   this.offset ++;
  493               }
  494               if (this.offset >= this.regexlen)  throw ex("parser.factor.2", this.offset-1);
  495           }
  496           Token tok;
  497           if (ch == ':') {
  498               this.offset ++;
  499               this.next();
  500               tok = Token.createModifierGroup(this.parseRegex(), add, mask);
  501               if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  502               this.next();
  503           } else if (ch == ')') {                 // such as (?-i)
  504               this.offset ++;
  505               this.next();
  506               tok = Token.createModifierGroup(this.parseRegex(), add, mask);
  507           } else
  508               throw ex("parser.factor.3", this.offset);
  509   
  510           return tok;
  511       }
  512       Token processIndependent() throws ParseException {
  513           this.next();
  514           Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
  515           if (this.read() != T_RPAREN)  throw ex("parser.factor.1", this.offset-1);
  516           this.next();                            // Skips ')'
  517           return tok;
  518       }
  519       Token processBacksolidus_c() throws ParseException {
  520           int ch2;                                // Must be in 0x0040-0x005f
  521           if (this.offset >= this.regexlen
  522               || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
  523               throw ex("parser.atom.1", this.offset-1);
  524           this.next();
  525           return Token.createChar(ch2-0x40);
  526       }
  527       Token processBacksolidus_C() throws ParseException {
  528           throw ex("parser.process.1", this.offset);
  529       }
  530       Token processBacksolidus_i() throws ParseException {
  531           Token tok = Token.createChar('i');
  532           this.next();
  533           return tok;
  534       }
  535       Token processBacksolidus_I() throws ParseException {
  536           throw ex("parser.process.1", this.offset);
  537       }
  538       Token processBacksolidus_g() throws ParseException {
  539           this.next();
  540           return Token.getGraphemePattern();
  541       }
  542       Token processBacksolidus_X() throws ParseException {
  543           this.next();
  544           return Token.getCombiningCharacterSequence();
  545       }
  546       Token processBackreference() throws ParseException {
  547           int refnum = this.chardata-'0';
  548           Token tok = Token.createBackReference(refnum);
  549           this.hasBackReferences = true;
  550           if (this.references == null)  this.references = new Vector();
  551           this.references.addElement(new ReferencePosition(refnum, this.offset-2));
  552           this.next();
  553           return tok;
  554       }
  555   
  556       // ----------------------------------------------------------------
  557   
  558       /**
  559        * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
  560        *            | atom (('*' | '+' | '?' | minmax ) '?'? )?)
  561        *            | '(?=' regex ')'  | '(?!' regex ')'  | '(?&lt;=' regex ')'  | '(?&lt;!' regex ')'
  562        *            | '(?#' [^)]* ')'
  563        * minmax ::= '{' min (',' max?)? '}'
  564        * min ::= [0-9]+
  565        * max ::= [0-9]+
  566        */
  567       Token parseFactor() throws ParseException {        
  568           int ch = this.read();
  569           Token tok;
  570           switch (ch) {
  571             case T_CARET:         return this.processCaret();
  572             case T_DOLLAR:        return this.processDollar();
  573             case T_LOOKAHEAD:     return this.processLookahead();
  574             case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
  575             case T_LOOKBEHIND:    return this.processLookbehind();
  576             case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
  577   
  578             case T_COMMENT:
  579               this.next();
  580               return Token.createEmpty();
  581   
  582             case T_BACKSOLIDUS:
  583               switch (this.chardata) {
  584                 case 'A': return this.processBacksolidus_A();
  585                 case 'Z': return this.processBacksolidus_Z();
  586                 case 'z': return this.processBacksolidus_z();
  587                 case 'b': return this.processBacksolidus_b();
  588                 case 'B': return this.processBacksolidus_B();
  589                 case '<': return this.processBacksolidus_lt();
  590                 case '>': return this.processBacksolidus_gt();
  591               }
  592                                                   // through down
  593           }
  594           tok = this.parseAtom();
  595           ch = this.read();
  596           switch (ch) {
  597             case T_STAR:  return this.processStar(tok);
  598             case T_PLUS:  return this.processPlus(tok);
  599             case T_QUESTION: return this.processQuestion(tok);
  600             case T_CHAR:
  601               if (this.chardata == '{' && this.offset < this.regexlen) {
  602   
  603                   int off = this.offset;          // this.offset -> next of '{'
  604                   int min = 0, max = -1;
  605   
  606                   if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
  607   
  608                       min = ch -'0';
  609                       while (off < this.regexlen
  610                              && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
  611                           min = min*10 +ch-'0';
  612                           if (min < 0)
  613                               throw ex("parser.quantifier.5", this.offset);
  614                       }
  615                   }
  616                   else {
  617                       throw ex("parser.quantifier.1", this.offset);
  618                   }
  619   
  620                   max = min;
  621                   if (ch == ',') {
  622   
  623                      if (off >= this.regexlen) {
  624                          throw ex("parser.quantifier.3", this.offset);
  625                      }
  626                      else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {                       
  627   
  628                           max = ch -'0';       // {min,max}
  629                           while (off < this.regexlen
  630                                  && (ch = this.regex.charAt(off++)) >= '0'
  631                                  && ch <= '9') {
  632                               max = max*10 +ch-'0';
  633                               if (max < 0)
  634                                   throw ex("parser.quantifier.5", this.offset);
  635                           }
  636   
  637                           if (min > max)
  638                               throw ex("parser.quantifier.4", this.offset);
  639                      }
  640                      else { // assume {min,}
  641                           max = -1;           
  642                       }
  643                   }
  644   
  645                  if (ch != '}')
  646                      throw ex("parser.quantifier.2", this.offset);
  647   
  648                  if (this.checkQuestion(off)) {  // off -> next of '}'
  649                       tok = Token.createNGClosure(tok);
  650                       this.offset = off+1;
  651                   } else {
  652                       tok = Token.createClosure(tok);
  653                       this.offset = off;
  654                   }
  655   
  656                   tok.setMin(min);
  657                   tok.setMax(max);
  658                   //System.err.println("CLOSURE: "+min+", "+max);
  659                   this.next();
  660               }
  661           }
  662           return tok;
  663       }
  664   
  665       /**
  666        * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
  667        *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
  668        *          | '(?>' regex ')'
  669        * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
  670        */
  671       Token parseAtom() throws ParseException {
  672           int ch = this.read();
  673           Token tok = null;
  674           switch (ch) {
  675             case T_LPAREN:        return this.processParen();
  676             case T_LPAREN2:       return this.processParen2(); // '(?:'
  677             case T_CONDITION:     return this.processCondition(); // '(?('
  678             case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
  679             case T_INDEPENDENT:   return this.processIndependent();
  680             case T_DOT:
  681               this.next();                    // Skips '.'
  682               tok = Token.token_dot;
  683               break;
  684   
  685               /**
  686                * char-class ::= '[' ( '^'? range ','?)+ ']'
  687                * range ::= '\d' | '\w' | '\s' | category-block | range-char
  688                *           | range-char '-' range-char
  689                * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
  690                * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
  691                */
  692             case T_LBRACKET:      return this.parseCharacterClass(true);
  693             case T_SET_OPERATIONS: return this.parseSetOperations();
  694   
  695             case T_BACKSOLIDUS:
  696               switch (this.chardata) {
  697                 case 'd':  case 'D':
  698                 case 'w':  case 'W':
  699                 case 's':  case 'S':
  700                   tok = this.getTokenForShorthand(this.chardata);
  701                   this.next();
  702                   return tok;
  703   
  704                 case 'e':  case 'f':  case 'n':  case 'r':
  705                 case 't':  case 'u':  case 'v':  case 'x':
  706                   {
  707                       int ch2 = this.decodeEscaped();
  708                       if (ch2 < 0x10000) {
  709                           tok = Token.createChar(ch2);
  710                       } else {
  711                           tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
  712                       }
  713                   }
  714                   break;
  715   
  716                 case 'c': return this.processBacksolidus_c();
  717                 case 'C': return this.processBacksolidus_C();
  718                 case 'i': return this.processBacksolidus_i();
  719                 case 'I': return this.processBacksolidus_I();
  720                 case 'g': return this.processBacksolidus_g();
  721                 case 'X': return this.processBacksolidus_X();
  722                 case '1':  case '2':  case '3':  case '4':
  723                 case '5':  case '6':  case '7':  case '8':  case '9':
  724                   return this.processBackreference();
  725   
  726                 case 'P':
  727                 case 'p':
  728                   int pstart = this.offset;
  729                   tok = processBacksolidus_pP(this.chardata);
  730                   if (tok == null)  throw this.ex("parser.atom.5", pstart);
  731                   break;
  732   
  733                 default:
  734                   tok = Token.createChar(this.chardata);
  735               }
  736               this.next();
  737               break;
  738   
  739             case T_CHAR:
  740               if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
  741                   throw this.ex("parser.atom.4", this.offset-1);
  742               tok = Token.createChar(this.chardata);
  743               int high = this.chardata;
  744               this.next();
  745               if (REUtil.isHighSurrogate(high)
  746                   && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
  747                   char[] sur = new char[2];
  748                   sur[0] = (char)high;
  749                   sur[1] = (char)this.chardata;
  750                   tok = Token.createParen(Token.createString(new String(sur)), 0);
  751                   this.next();
  752               }
  753               break;
  754   
  755             default:
  756               throw this.ex("parser.atom.4", this.offset-1);
  757           }
  758           return tok;
  759       }
  760   
  761       protected RangeToken processBacksolidus_pP(int c) throws ParseException {
  762   
  763           this.next();
  764           if (this.read() != T_CHAR || this.chardata != '{')
  765               throw this.ex("parser.atom.2", this.offset-1);
  766   
  767           // handle category escape
  768           boolean positive = c == 'p';
  769           int namestart = this.offset;
  770           int nameend = this.regex.indexOf('}', namestart);
  771   
  772           if (nameend < 0)
  773               throw this.ex("parser.atom.3", this.offset);
  774   
  775           String pname = this.regex.substring(namestart, nameend);
  776           this.offset = nameend+1;
  777   
  778           return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
  779       }
  780   
  781       int processCIinCharacterClass(RangeToken tok, int c) {
  782           return this.decodeEscaped();
  783       }
  784   
  785       /**
  786        * char-class ::= '[' ( '^'? range ','?)+ ']'
  787        * range ::= '\d' | '\w' | '\s' | category-block | range-char
  788        *           | range-char '-' range-char
  789        * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
  790        * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
  791        */
  792       protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
  793           this.setContext(S_INBRACKETS);
  794           this.next();                            // '['
  795           boolean nrange = false;
  796           RangeToken base = null;
  797           RangeToken tok;
  798           if (this.read() == T_CHAR && this.chardata == '^') {
  799               nrange = true;
  800               this.next();                        // '^'
  801               if (useNrange) {
  802                   tok = Token.createNRange();
  803               } else {
  804                   base = Token.createRange();
  805                   base.addRange(0, Token.UTF16_MAX);
  806                   tok = Token.createRange();
  807               }
  808           } else {
  809               tok = Token.createRange();
  810           }
  811           int type;
  812           boolean firstloop = true;
  813           while ((type = this.read()) != T_EOF) {
  814               if (type == T_CHAR && this.chardata == ']' && !firstloop)
  815                   break;
  816               firstloop = false;
  817               int c = this.chardata;
  818               boolean end = false;
  819               if (type == T_BACKSOLIDUS) {
  820                   switch (c) {
  821                     case 'd':  case 'D':
  822                     case 'w':  case 'W':
  823                     case 's':  case 'S':
  824                       tok.mergeRanges(this.getTokenForShorthand(c));
  825                       end = true;
  826                       break;
  827   
  828                     case 'i':  case 'I':
  829                     case 'c':  case 'C':
  830                       c = this.processCIinCharacterClass(tok, c);
  831                       if (c < 0)  end = true;
  832                       break;
  833                       
  834                     case 'p':
  835                     case 'P':
  836                       int pstart = this.offset;
  837                       RangeToken tok2 = this.processBacksolidus_pP(c);
  838                       if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
  839                       tok.mergeRanges(tok2);
  840                       end = true;
  841                       break;
  842   
  843                     default:
  844                       c = this.decodeEscaped();
  845                   } // \ + c
  846               } // backsolidus
  847                                                   // POSIX Character class such as [:alnum:]
  848               else if (type == T_POSIX_CHARCLASS_START) {
  849                   int nameend = this.regex.indexOf(':', this.offset);
  850                   if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
  851                   boolean positive = true;
  852                   if (this.regex.charAt(this.offset) == '^') {
  853                       this.offset ++;
  854                       positive = false;
  855                   }
  856                   String name = this.regex.substring(this.offset, nameend);
  857                   RangeToken range = Token.getRange(name, positive,
  858                                                     this.isSet(RegularExpression.XMLSCHEMA_MODE));
  859                   if (range == null)  throw this.ex("parser.cc.3", this.offset);
  860                   tok.mergeRanges(range);
  861                   end = true;
  862                   if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
  863                       throw this.ex("parser.cc.1", nameend);
  864                   this.offset = nameend+2;
  865               }
  866               this.next();
  867               if (!end) {                         // if not shorthands...
  868                   if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
  869                       tok.addRange(c, c);
  870                   } else {
  871                       this.next(); // Skips '-'
  872                       if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
  873                       if (type == T_CHAR && this.chardata == ']') {
  874                           tok.addRange(c, c);
  875                           tok.addRange('-', '-');
  876                       } else {
  877                           int rangeend = this.chardata;
  878                           if (type == T_BACKSOLIDUS)
  879                               rangeend = this.decodeEscaped();
  880                           this.next();
  881                           tok.addRange(c, rangeend);
  882                       }
  883                   }
  884               }
  885               if (this.isSet(RegularExpression.SPECIAL_COMMA)
  886                   && this.read() == T_CHAR && this.chardata == ',')
  887                   this.next();
  888           }
  889           if (this.read() == T_EOF)
  890               throw this.ex("parser.cc.2", this.offset);
  891           if (!useNrange && nrange) {
  892               base.subtractRanges(tok);
  893               tok = base;
  894           }
  895           tok.sortRanges();
  896           tok.compactRanges();
  897           //tok.dumpRanges();
  898           /*
  899           if (this.isSet(RegularExpression.IGNORE_CASE))
  900               tok = RangeToken.createCaseInsensitiveToken(tok);
  901           */
  902           this.setContext(S_NORMAL);
  903           this.next();                    // Skips ']'
  904   
  905           return tok;
  906       }
  907   
  908       /**
  909        * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
  910        */
  911       protected RangeToken parseSetOperations() throws ParseException {
  912           RangeToken tok = this.parseCharacterClass(false);
  913           int type;
  914           while ((type = this.read()) != T_RPAREN) {
  915               int ch = this.chardata;
  916               if (type == T_CHAR && (ch == '-' || ch == '&')
  917                   || type == T_PLUS) {
  918                   this.next();
  919                   if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
  920                   RangeToken t2 = this.parseCharacterClass(false);
  921                   if (type == T_PLUS)
  922                       tok.mergeRanges(t2);
  923                   else if (ch == '-')
  924                       tok.subtractRanges(t2);
  925                   else if (ch == '&')
  926                       tok.intersectRanges(t2);
  927                   else
  928                       throw new RuntimeException("ASSERT");
  929               } else {
  930                   throw ex("parser.ope.2", this.offset-1);
  931               }
  932           }
  933           this.next();
  934           return tok;
  935       }
  936   
  937       Token getTokenForShorthand(int ch) {
  938           Token tok;
  939           switch (ch) {
  940             case 'd':
  941               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  942                   ? Token.getRange("Nd", true) : Token.token_0to9;
  943               break;
  944             case 'D':
  945               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  946                   ? Token.getRange("Nd", false) : Token.token_not_0to9;
  947               break;
  948             case 'w':
  949               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  950                   ? Token.getRange("IsWord", true) : Token.token_wordchars;
  951               break;
  952             case 'W':
  953               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  954                   ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
  955               break;
  956             case 's':
  957               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  958                   ? Token.getRange("IsSpace", true) : Token.token_spaces;
  959               break;
  960             case 'S':
  961               tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  962                   ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
  963               break;
  964   
  965             default:
  966               throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
  967           }
  968           return tok;
  969       }
  970   
  971       /**
  972        */
  973       int decodeEscaped() throws ParseException {
  974           if (this.read() != T_BACKSOLIDUS)  throw ex("parser.next.1", this.offset-1);
  975           int c = this.chardata;
  976           switch (c) {
  977             case 'e':  c = 0x1b;  break; // ESCAPE U+001B
  978             case 'f':  c = '\f';  break; // FORM FEED U+000C
  979             case 'n':  c = '\n';  break; // LINE FEED U+000A
  980             case 'r':  c = '\r';  break; // CRRIAGE RETURN U+000D
  981             case 't':  c = '\t';  break; // HORIZONTAL TABULATION U+0009
  982             //case 'v':  c = 0x0b;  break; // VERTICAL TABULATION U+000B
  983             case 'x':
  984               this.next();
  985               if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
  986               if (this.chardata == '{') {
  987                   int v1 = 0;
  988                   int uv = 0;
  989                   do {
  990                       this.next();
  991                       if (this.read() != T_CHAR)  throw ex("parser.descape.1", this.offset-1);
  992                       if ((v1 = hexChar(this.chardata)) < 0)
  993                           break;
  994                       if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
  995                       uv = uv*16+v1;
  996                   } while (true);
  997                   if (this.chardata != '}')  throw ex("parser.descape.3", this.offset-1);
  998                   if (uv > Token.UTF16_MAX)  throw ex("parser.descape.4", this.offset-1);
  999                   c = uv;
 1000               } else {
 1001                   int v1 = 0;
 1002                   if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1003                       throw ex("parser.descape.1", this.offset-1);
 1004                   int uv = v1;
 1005                   this.next();
 1006                   if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1007                       throw ex("parser.descape.1", this.offset-1);
 1008                   uv = uv*16+v1;
 1009                   c = uv;
 1010               }
 1011               break;
 1012   
 1013             case 'u':
 1014               int v1 = 0;
 1015               this.next();
 1016               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1017                   throw ex("parser.descape.1", this.offset-1);
 1018               int uv = v1;
 1019               this.next();
 1020               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1021                   throw ex("parser.descape.1", this.offset-1);
 1022               uv = uv*16+v1;
 1023               this.next();
 1024               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1025                   throw ex("parser.descape.1", this.offset-1);
 1026               uv = uv*16+v1;
 1027               this.next();
 1028               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1029                   throw ex("parser.descape.1", this.offset-1);
 1030               uv = uv*16+v1;
 1031               c = uv;
 1032               break;
 1033   
 1034             case 'v':
 1035               this.next();
 1036               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1037                   throw ex("parser.descape.1", this.offset-1);
 1038               uv = v1;
 1039               this.next();
 1040               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1041                   throw ex("parser.descape.1", this.offset-1);
 1042               uv = uv*16+v1;
 1043               this.next();
 1044               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1045                   throw ex("parser.descape.1", this.offset-1);
 1046               uv = uv*16+v1;
 1047               this.next();
 1048               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1049                   throw ex("parser.descape.1", this.offset-1);
 1050               uv = uv*16+v1;
 1051               this.next();
 1052               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1053                   throw ex("parser.descape.1", this.offset-1);
 1054               uv = uv*16+v1;
 1055               this.next();
 1056               if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
 1057                   throw ex("parser.descape.1", this.offset-1);
 1058               uv = uv*16+v1;
 1059               if (uv > Token.UTF16_MAX)  throw ex("parser.descappe.4", this.offset-1);
 1060               c = uv;
 1061               break;
 1062             case 'A':
 1063             case 'Z':
 1064             case 'z':
 1065               throw ex("parser.descape.5", this.offset-2);
 1066             default:
 1067           }
 1068           return c;
 1069       }
 1070   
 1071       static private final int hexChar(int ch) {
 1072           if (ch < '0')  return -1;
 1073           if (ch > 'f')  return -1;
 1074           if (ch <= '9')  return ch-'0';
 1075           if (ch < 'A')  return -1;
 1076           if (ch <= 'F')  return ch-'A'+10;
 1077           if (ch < 'a')  return -1;
 1078           return ch-'a'+10;
 1079       }
 1080   }

Save This Page
Home » xmlbeans-2.4.0-src » org.apache.xmlbeans.impl.regex » [javadoc | source]