1 /* Copyright 2004 The Apache Software Foundation
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package org.apache.xmlbeans.impl.regex;
17
18 import java.util.Locale;
19 import java.util.MissingResourceException;
20 import java.util.ResourceBundle;
21 import java.util.Vector;
22
23 /**
24 * A Regular Expression Parser.
25 */
26 class RegexParser {
// Token types: next() stores one of these in nexttoken and read() returns it.
// The trailing comment on each constant shows the pattern text that yields it.
static final int T_CHAR = 0;
static final int T_EOF = 1;
static final int T_OR = 2;                  // '|'
static final int T_STAR = 3;                // '*'
static final int T_PLUS = 4;                // '+'
static final int T_QUESTION = 5;            // '?'
static final int T_LPAREN = 6;              // '('
static final int T_RPAREN = 7;              // ')'
static final int T_DOT = 8;                 // '.'
static final int T_LBRACKET = 9;            // '['
static final int T_BACKSOLIDUS = 10;        // '\'
static final int T_CARET = 11;              // '^'
static final int T_DOLLAR = 12;             // '$'
static final int T_LPAREN2 = 13;            // '(?:'
static final int T_LOOKAHEAD = 14;          // '(?='
static final int T_NEGATIVELOOKAHEAD = 15;  // '(?!'
static final int T_LOOKBEHIND = 16;         // '(?<='
static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
static final int T_INDEPENDENT = 18;        // '(?>'
static final int T_SET_OPERATIONS = 19;     // '(?['
static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
static final int T_COMMENT = 21;            // '(?#'
static final int T_MODIFIERS = 22;          // '(?' [\-,a-z,A-Z]
static final int T_CONDITION = 23;          // '(?('
static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
52
53 static class ReferencePosition {
54 int refNumber;
55 int position;
56 ReferencePosition(int n, int pos) {
57 this.refNumber = n;
58 this.position = pos;
59 }
60 }
61
// Index in regex of the next character that next() will consume.
int offset;
// Pattern text currently being parsed.
String regex;
// Cached regex.length().
int regexlen;
// Option bit mask handed to parse(); tested through isSet().
int options;
// Message bundle from which ex() fetches error texts.
ResourceBundle resources;
// Character value attached to the current token; -1 once the input is exhausted.
int chardata;
// Type (one of the T_* constants) of the current token, as returned by read().
int nexttoken;
// Tokenizer contexts accepted by setContext(); next() only distinguishes S_INBRACKETS.
static protected final int S_NORMAL = 0;
static protected final int S_INBRACKETS = 1;
static protected final int S_INXBRACKETS = 2;
int context = S_NORMAL;
// Number the next capturing group will receive; parse() resets it to 1.
int parennumber = 1;
// Set when a back reference or a numbered condition has been parsed.
boolean hasBackReferences;
// ReferencePosition entries collected during a parse; created lazily.
Vector references = null;
76
/** Creates a parser whose error messages are looked up for the default locale. */
public RegexParser() {
    this(Locale.getDefault());
}

/**
 * Creates a parser whose error messages are looked up for the given locale.
 *
 * @param locale locale passed on to {@link #setLocale(Locale)}
 */
public RegexParser(Locale locale) {
    setLocale(locale);
}
83
84 public void setLocale(Locale locale) {
85 try {
86 this.resources = ResourceBundle.getBundle("org.apache.xmlbeans.impl.regex.message", locale);
87 } catch (MissingResourceException mre) {
88 throw new RuntimeException("Installation Problem??? Couldn't load messages: "
89 +mre.getMessage());
90 }
91 }
92
93 final ParseException ex(String key, int loc) {
94 return new ParseException(this.resources.getString(key), loc);
95 }
96
97 private final boolean isSet(int flag) {
98 return (this.options & flag) == flag;
99 }
100
101 synchronized Token parse(String regex, int options) throws ParseException {
102 this.options = options;
103 this.offset = 0;
104 this.setContext(S_NORMAL);
105 this.parennumber = 1;
106 this.hasBackReferences = false;
107 this.regex = regex;
108 if (this.isSet(RegularExpression.EXTENDED_COMMENT))
109 this.regex = REUtil.stripExtendedComment(this.regex);
110 this.regexlen = this.regex.length();
111
112
113 this.next();
114 Token ret = this.parseRegex();
115 if (this.offset != this.regexlen)
116 throw ex("parser.parse.1", this.offset);
117 if (this.references != null) {
118 for (int i = 0; i < this.references.size(); i ++) {
119 ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
120 if (this.parennumber <= position.refNumber)
121 throw ex("parser.parse.2", position.position);
122 }
123 this.references.removeAllElements();
124 }
125 return ret;
126 }
127
128 /*
129 public RegularExpression createRegex(String regex, int options) throws ParseException {
130 Token tok = this.parse(regex, options);
131 return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
132 }
133 */
134
135 protected final void setContext(int con) {
136 this.context = con;
137 }
138
139 final int read() {
140 return this.nexttoken;
141 }
142
/**
 * Advances to the next token, storing its type in nexttoken and its
 * character value in chardata.
 *
 * At end of input the token is T_EOF and chardata is -1. For T_BACKSOLIDUS,
 * chardata holds the character following the backslash. Inside a character
 * class (context S_INBRACKETS) a surrogate pair is combined into one code
 * point in chardata. Outside a character class, '(' followed by '?' is
 * resolved into the specific group token.
 *
 * Throws the exception built by ex() for a trailing backslash
 * ("parser.next.1"), an incomplete or unknown '(?' form ("parser.next.2",
 * "parser.next.3") or an unterminated '(?#' comment ("parser.next.4").
 */
final void next() {
    if (this.offset >= this.regexlen) {
        this.chardata = -1;
        this.nexttoken = T_EOF;
        return;
    }

    int ret;
    int ch = this.regex.charAt(this.offset++);
    this.chardata = ch;

    if (this.context == S_INBRACKETS) {
        // In a character class, this.chardata has one character, that is to say,
        // a pair of surrogates is composed and stored to this.chardata.
        switch (ch) {
          case '\\':
            ret = T_BACKSOLIDUS;
            if (this.offset >= this.regexlen)
                throw ex("parser.next.1", this.offset-1);
            // chardata becomes the escaped character, not the backslash.
            this.chardata = this.regex.charAt(this.offset++);
            break;

          case '-':
            // "-[" is a subtraction operator only in XML Schema mode.
            if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
                && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
                this.offset++;
                ret = T_XMLSCHEMA_CC_SUBTRACTION;
            } else
                ret = T_CHAR;
            break;

          case '[':
            // "[:" opens a POSIX class name, except in XML Schema mode.
            if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
                && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
                this.offset++;
                ret = T_POSIX_CHARCLASS_START;
                break;
            } // Falls through to the default case.
          default:
            if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
                int low = this.regex.charAt(this.offset);
                if (REUtil.isLowSurrogate(low)) {
                    this.chardata = REUtil.composeFromSurrogates(ch, low);
                    this.offset ++;
                }
            }
            ret = T_CHAR;
        }
        this.nexttoken = ret;
        return;
    }

    switch (ch) {
      case '|': ret = T_OR;             break;
      case '*': ret = T_STAR;           break;
      case '+': ret = T_PLUS;           break;
      case '?': ret = T_QUESTION;       break;
      case ')': ret = T_RPAREN;         break;
      case '.': ret = T_DOT;            break;
      case '[': ret = T_LBRACKET;       break;
      case '^': ret = T_CARET;          break;
      case '$': ret = T_DOLLAR;         break;
      case '(':
        ret = T_LPAREN;
        // A plain '(' unless it is immediately followed by '?'.
        if (this.offset >= this.regexlen)
            break;
        if (this.regex.charAt(this.offset) != '?')
            break;
        if (++this.offset >= this.regexlen)
            throw ex("parser.next.2", this.offset-1);
        ch = this.regex.charAt(this.offset++);
        switch (ch) {
          case ':':  ret = T_LPAREN2;            break;
          case '=':  ret = T_LOOKAHEAD;          break;
          case '!':  ret = T_NEGATIVELOOKAHEAD;  break;
          case '[':  ret = T_SET_OPERATIONS;     break;
          case '>':  ret = T_INDEPENDENT;        break;
          case '<':
            if (this.offset >= this.regexlen)
                throw ex("parser.next.2", this.offset-3);
            ch = this.regex.charAt(this.offset++);
            if (ch == '=') {
                ret = T_LOOKBEHIND;
            } else if (ch == '!') {
                ret = T_NEGATIVELOOKBEHIND;
            } else
                throw ex("parser.next.3", this.offset-3);
            break;
          case '#':
            // Skip the comment text up to and including the closing ')'.
            while (this.offset < this.regexlen) {
                ch = this.regex.charAt(this.offset++);
                if (ch == ')')  break;
            }
            if (ch != ')')
                throw ex("parser.next.4", this.offset-1);
            ret = T_COMMENT;
            break;
          default:
            if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
                // Step back so that offset points at the first modifier character.
                this.offset --;
                ret = T_MODIFIERS;
                break;
            } else if (ch == '(') {         // conditional
                ret = T_CONDITION;          // this.offset points just past this '('.
                break;
            }
            throw ex("parser.next.2", this.offset-2);
        }
        break;

      case '\\':
        ret = T_BACKSOLIDUS;
        if (this.offset >= this.regexlen)
            throw ex("parser.next.1", this.offset-1);
        // chardata becomes the escaped character, not the backslash.
        this.chardata = this.regex.charAt(this.offset++);
        break;

      default:
        ret = T_CHAR;
    }
    this.nexttoken = ret;
}
265
266 /**
267 * regex ::= term (`|` term)*
268 * term ::= factor+
269 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
270 * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
271 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
272 * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
273 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
274 */
275 Token parseRegex() throws ParseException {
276 Token tok = this.parseTerm();
277 Token parent = null;
278 while (this.read() == T_OR) {
279 this.next(); // '|'
280 if (parent == null) {
281 parent = Token.createUnion();
282 parent.addChild(tok);
283 tok = parent;
284 }
285 tok.addChild(this.parseTerm());
286 }
287 return tok;
288 }
289
290 /**
291 * term ::= factor+
292 */
293 Token parseTerm() throws ParseException {
294 int ch = this.read();
295 if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
296 return Token.createEmpty();
297 } else {
298 Token tok = this.parseFactor();
299 Token concat = null;
300 while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
301 if (concat == null) {
302 concat = Token.createConcat();
303 concat.addChild(tok);
304 tok = concat;
305 }
306 concat.addChild(this.parseFactor());
307 //tok = Token.createConcat(tok, this.parseFactor());
308 }
309 return tok;
310 }
311 }
312
313 // ----------------------------------------------------------------
314
315 Token processCaret() throws ParseException {
316 this.next();
317 return Token.token_linebeginning;
318 }
319 Token processDollar() throws ParseException {
320 this.next();
321 return Token.token_lineend;
322 }
323 Token processLookahead() throws ParseException {
324 this.next();
325 Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
326 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
327 this.next(); // ')'
328 return tok;
329 }
330 Token processNegativelookahead() throws ParseException {
331 this.next();
332 Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
333 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
334 this.next(); // ')'
335 return tok;
336 }
337 Token processLookbehind() throws ParseException {
338 this.next();
339 Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
340 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
341 this.next(); // ')'
342 return tok;
343 }
344 Token processNegativelookbehind() throws ParseException {
345 this.next();
346 Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
347 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
348 this.next(); // ')'
349 return tok;
350 }
351 Token processBacksolidus_A() throws ParseException {
352 this.next();
353 return Token.token_stringbeginning;
354 }
355 Token processBacksolidus_Z() throws ParseException {
356 this.next();
357 return Token.token_stringend2;
358 }
359 Token processBacksolidus_z() throws ParseException {
360 this.next();
361 return Token.token_stringend;
362 }
363 Token processBacksolidus_b() throws ParseException {
364 this.next();
365 return Token.token_wordedge;
366 }
367 Token processBacksolidus_B() throws ParseException {
368 this.next();
369 return Token.token_not_wordedge;
370 }
371 Token processBacksolidus_lt() throws ParseException {
372 this.next();
373 return Token.token_wordbeginning;
374 }
375 Token processBacksolidus_gt() throws ParseException {
376 this.next();
377 return Token.token_wordend;
378 }
379 Token processStar(Token tok) throws ParseException {
380 this.next();
381 if (this.read() == T_QUESTION) {
382 this.next();
383 return Token.createNGClosure(tok);
384 } else
385 return Token.createClosure(tok);
386 }
387 Token processPlus(Token tok) throws ParseException {
388 // X+ -> XX*
389 this.next();
390 if (this.read() == T_QUESTION) {
391 this.next();
392 return Token.createConcat(tok, Token.createNGClosure(tok));
393 } else
394 return Token.createConcat(tok, Token.createClosure(tok));
395 }
396 Token processQuestion(Token tok) throws ParseException {
397 // X? -> X|
398 this.next();
399 Token par = Token.createUnion();
400 if (this.read() == T_QUESTION) {
401 this.next();
402 par.addChild(Token.createEmpty());
403 par.addChild(tok);
404 } else {
405 par.addChild(tok);
406 par.addChild(Token.createEmpty());
407 }
408 return par;
409 }
410 boolean checkQuestion(int off) {
411 return off < this.regexlen && this.regex.charAt(off) == '?';
412 }
413 Token processParen() throws ParseException {
414 this.next();
415 int p = this.parennumber++;
416 Token tok = Token.createParen(this.parseRegex(), p);
417 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
418 this.next(); // Skips ')'
419 return tok;
420 }
421 Token processParen2() throws ParseException {
422 this.next();
423 Token tok = Token.createParen(this.parseRegex(), 0);
424 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
425 this.next(); // Skips ')'
426 return tok;
427 }
428 Token processCondition() throws ParseException {
429 // this.offset points the next of '('
430 if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset);
431 // Parses a condition.
432 int refno = -1;
433 Token condition = null;
434 int ch = this.regex.charAt(this.offset);
435 if ('1' <= ch && ch <= '9') {
436 refno = ch-'0';
437 this.hasBackReferences = true;
438 if (this.references == null) this.references = new Vector();
439 this.references.addElement(new ReferencePosition(refno, this.offset));
440 this.offset ++;
441 if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset);
442 this.offset ++;
443 } else {
444 if (ch == '?') this.offset --; // Points '('.
445 this.next();
446 condition = this.parseFactor();
447 switch (condition.type) {
448 case Token.LOOKAHEAD:
449 case Token.NEGATIVELOOKAHEAD:
450 case Token.LOOKBEHIND:
451 case Token.NEGATIVELOOKBEHIND:
452 break;
453 case Token.ANCHOR:
454 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
455 break;
456 default:
457 throw ex("parser.factor.5", this.offset);
458 }
459 }
460 // Parses yes/no-patterns.
461 this.next();
462 Token yesPattern = this.parseRegex();
463 Token noPattern = null;
464 if (yesPattern.type == Token.UNION) {
465 if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset);
466 noPattern = yesPattern.getChild(1);
467 yesPattern = yesPattern.getChild(0);
468 }
469 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
470 this.next();
471 return Token.createCondition(refno, condition, yesPattern, noPattern);
472 }
473 Token processModifiers() throws ParseException {
474 // this.offset points the next of '?'.
475 // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
476 int add = 0, mask = 0, ch = -1;
477 while (this.offset < this.regexlen) {
478 ch = this.regex.charAt(this.offset);
479 int v = REUtil.getOptionValue(ch);
480 if (v == 0) break; // '-' or ':'?
481 add |= v;
482 this.offset ++;
483 }
484 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
485 if (ch == '-') {
486 this.offset ++;
487 while (this.offset < this.regexlen) {
488 ch = this.regex.charAt(this.offset);
489 int v = REUtil.getOptionValue(ch);
490 if (v == 0) break; // ':'?
491 mask |= v;
492 this.offset ++;
493 }
494 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
495 }
496 Token tok;
497 if (ch == ':') {
498 this.offset ++;
499 this.next();
500 tok = Token.createModifierGroup(this.parseRegex(), add, mask);
501 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
502 this.next();
503 } else if (ch == ')') { // such as (?-i)
504 this.offset ++;
505 this.next();
506 tok = Token.createModifierGroup(this.parseRegex(), add, mask);
507 } else
508 throw ex("parser.factor.3", this.offset);
509
510 return tok;
511 }
512 Token processIndependent() throws ParseException {
513 this.next();
514 Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
515 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
516 this.next(); // Skips ')'
517 return tok;
518 }
519 Token processBacksolidus_c() throws ParseException {
520 int ch2; // Must be in 0x0040-0x005f
521 if (this.offset >= this.regexlen
522 || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
523 throw ex("parser.atom.1", this.offset-1);
524 this.next();
525 return Token.createChar(ch2-0x40);
526 }
527 Token processBacksolidus_C() throws ParseException {
528 throw ex("parser.process.1", this.offset);
529 }
530 Token processBacksolidus_i() throws ParseException {
531 Token tok = Token.createChar('i');
532 this.next();
533 return tok;
534 }
535 Token processBacksolidus_I() throws ParseException {
536 throw ex("parser.process.1", this.offset);
537 }
538 Token processBacksolidus_g() throws ParseException {
539 this.next();
540 return Token.getGraphemePattern();
541 }
542 Token processBacksolidus_X() throws ParseException {
543 this.next();
544 return Token.getCombiningCharacterSequence();
545 }
546 Token processBackreference() throws ParseException {
547 int refnum = this.chardata-'0';
548 Token tok = Token.createBackReference(refnum);
549 this.hasBackReferences = true;
550 if (this.references == null) this.references = new Vector();
551 this.references.addElement(new ReferencePosition(refnum, this.offset-2));
552 this.next();
553 return tok;
554 }
555
556 // ----------------------------------------------------------------
557
558 /**
559 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
560 * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
561 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
562 * | '(?#' [^)]* ')'
563 * minmax ::= '{' min (',' max?)? '}'
564 * min ::= [0-9]+
565 * max ::= [0-9]+
566 */
567 Token parseFactor() throws ParseException {
568 int ch = this.read();
569 Token tok;
570 switch (ch) {
571 case T_CARET: return this.processCaret();
572 case T_DOLLAR: return this.processDollar();
573 case T_LOOKAHEAD: return this.processLookahead();
574 case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
575 case T_LOOKBEHIND: return this.processLookbehind();
576 case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
577
578 case T_COMMENT:
579 this.next();
580 return Token.createEmpty();
581
582 case T_BACKSOLIDUS:
583 switch (this.chardata) {
584 case 'A': return this.processBacksolidus_A();
585 case 'Z': return this.processBacksolidus_Z();
586 case 'z': return this.processBacksolidus_z();
587 case 'b': return this.processBacksolidus_b();
588 case 'B': return this.processBacksolidus_B();
589 case '<': return this.processBacksolidus_lt();
590 case '>': return this.processBacksolidus_gt();
591 }
592 // through down
593 }
594 tok = this.parseAtom();
595 ch = this.read();
596 switch (ch) {
597 case T_STAR: return this.processStar(tok);
598 case T_PLUS: return this.processPlus(tok);
599 case T_QUESTION: return this.processQuestion(tok);
600 case T_CHAR:
601 if (this.chardata == '{' && this.offset < this.regexlen) {
602
603 int off = this.offset; // this.offset -> next of '{'
604 int min = 0, max = -1;
605
606 if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
607
608 min = ch -'0';
609 while (off < this.regexlen
610 && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
611 min = min*10 +ch-'0';
612 if (min < 0)
613 throw ex("parser.quantifier.5", this.offset);
614 }
615 }
616 else {
617 throw ex("parser.quantifier.1", this.offset);
618 }
619
620 max = min;
621 if (ch == ',') {
622
623 if (off >= this.regexlen) {
624 throw ex("parser.quantifier.3", this.offset);
625 }
626 else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
627
628 max = ch -'0'; // {min,max}
629 while (off < this.regexlen
630 && (ch = this.regex.charAt(off++)) >= '0'
631 && ch <= '9') {
632 max = max*10 +ch-'0';
633 if (max < 0)
634 throw ex("parser.quantifier.5", this.offset);
635 }
636
637 if (min > max)
638 throw ex("parser.quantifier.4", this.offset);
639 }
640 else { // assume {min,}
641 max = -1;
642 }
643 }
644
645 if (ch != '}')
646 throw ex("parser.quantifier.2", this.offset);
647
648 if (this.checkQuestion(off)) { // off -> next of '}'
649 tok = Token.createNGClosure(tok);
650 this.offset = off+1;
651 } else {
652 tok = Token.createClosure(tok);
653 this.offset = off;
654 }
655
656 tok.setMin(min);
657 tok.setMax(max);
658 //System.err.println("CLOSURE: "+min+", "+max);
659 this.next();
660 }
661 }
662 return tok;
663 }
664
/**
 * Parses one atom: a group, '.', a character class, an escape, or a
 * literal character.
 *
 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
 *          | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
 *          | '(?>' regex ')'
 * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
 *
 * @return the token for the atom, with the tokenizer advanced past it
 * @throws ParseException "parser.atom.4" for a bare ']', '{' or '}' or for a
 *         token that cannot start an atom, "parser.atom.5" when a \p or \P
 *         name yields no range; also any error raised by the delegates
 */
Token parseAtom() throws ParseException {
    int ch = this.read();
    Token tok = null;
    switch (ch) {
      case T_LPAREN:        return this.processParen();
      case T_LPAREN2:       return this.processParen2(); // '(?:'
      case T_CONDITION:     return this.processCondition(); // '(?('
      case T_MODIFIERS:     return this.processModifiers(); // (?modifiers ... )
      case T_INDEPENDENT:   return this.processIndependent();
      case T_DOT:
        this.next();                    // Skips '.'
        tok = Token.token_dot;
        break;

        /**
         * char-class ::= '[' ( '^'? range ','?)+ ']'
         * range ::= '\d' | '\w' | '\s' | category-block | range-char
         *           | range-char '-' range-char
         * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
         * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
         */
      case T_LBRACKET:      return this.parseCharacterClass(true);
      case T_SET_OPERATIONS: return this.parseSetOperations();

      case T_BACKSOLIDUS:
        switch (this.chardata) {
          case 'd':  case 'D':
          case 'w':  case 'W':
          case 's':  case 'S':
            tok = this.getTokenForShorthand(this.chardata);
            this.next();
            return tok;

          case 'e':  case 'f':  case 'n':  case 'r':
          case 't':  case 'u':  case 'v':  case 'x':
            {
                int ch2 = this.decodeEscaped();
                // Values of 0x10000 and above become a surrogate-pair string.
                if (ch2 < 0x10000) {
                    tok = Token.createChar(ch2);
                } else {
                    tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
                }
            }
            break;

          case 'c': return this.processBacksolidus_c();
          case 'C': return this.processBacksolidus_C();
          case 'i': return this.processBacksolidus_i();
          case 'I': return this.processBacksolidus_I();
          case 'g': return this.processBacksolidus_g();
          case 'X': return this.processBacksolidus_X();
          case '1':  case '2':  case '3':  case '4':
          case '5':  case '6':  case '7':  case '8':  case '9':
            return this.processBackreference();

          case 'P':
          case 'p':
            // Remember where the name starts for the error position.
            int pstart = this.offset;
            tok = processBacksolidus_pP(this.chardata);
            if (tok == null)  throw this.ex("parser.atom.5", pstart);
            break;

          default:
            // Any other escaped character stands for itself.
            tok = Token.createChar(this.chardata);
        }
        // Shared by the escape cases above that exit with break.
        this.next();
        break;

      case T_CHAR:
        if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
            throw this.ex("parser.atom.4", this.offset-1);
        tok = Token.createChar(this.chardata);
        int high = this.chardata;
        this.next();
        // A high surrogate followed by a low surrogate is kept together as
        // one non-capturing two-char string.
        if (REUtil.isHighSurrogate(high)
            && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
            char[] sur = new char[2];
            sur[0] = (char)high;
            sur[1] = (char)this.chardata;
            tok = Token.createParen(Token.createString(new String(sur)), 0);
            this.next();
        }
        break;

      default:
        throw this.ex("parser.atom.4", this.offset-1);
    }
    return tok;
}
760
761 protected RangeToken processBacksolidus_pP(int c) throws ParseException {
762
763 this.next();
764 if (this.read() != T_CHAR || this.chardata != '{')
765 throw this.ex("parser.atom.2", this.offset-1);
766
767 // handle category escape
768 boolean positive = c == 'p';
769 int namestart = this.offset;
770 int nameend = this.regex.indexOf('}', namestart);
771
772 if (nameend < 0)
773 throw this.ex("parser.atom.3", this.offset);
774
775 String pname = this.regex.substring(namestart, nameend);
776 this.offset = nameend+1;
777
778 return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
779 }
780
781 int processCIinCharacterClass(RangeToken tok, int c) {
782 return this.decodeEscaped();
783 }
784
/**
 * Parses a character class. The current token is the one that opened the
 * class; on return the tokenizer is back in S_NORMAL context and positioned
 * after the closing ']'.
 *
 * char-class ::= '[' ( '^'? range ','?)+ ']'
 * range ::= '\d' | '\w' | '\s' | category-block | range-char
 *           | range-char '-' range-char
 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
 * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
 *
 * @param useNrange when true a leading '^' produces a token from
 *        Token.createNRange(); when false the listed ranges are instead
 *        subtracted from the full range 0..Token.UTF16_MAX
 * @return the sorted and compacted range token
 * @throws ParseException "parser.cc.1" for a malformed POSIX class,
 *         "parser.cc.2" if the input ends inside the class, "parser.cc.3"
 *         for an unknown POSIX class name, "parser.atom.5" for an unknown
 *         \p or \P name; also any error raised while decoding escapes
 */
protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
    this.setContext(S_INBRACKETS);
    this.next();                            // '['
    boolean nrange = false;
    RangeToken base = null;
    RangeToken tok;
    if (this.read() == T_CHAR && this.chardata == '^') {
        nrange = true;
        this.next();                        // '^'
        if (useNrange) {
            tok = Token.createNRange();
        } else {
            // Collect the listed ranges in tok and subtract them from base at the end.
            base = Token.createRange();
            base.addRange(0, Token.UTF16_MAX);
            tok = Token.createRange();
        }
    } else {
        tok = Token.createRange();
    }
    int type;
    boolean firstloop = true;
    while ((type = this.read()) != T_EOF) {
        // A ']' in the very first position is a literal, not the terminator.
        if (type == T_CHAR && this.chardata == ']' && !firstloop)
            break;
        firstloop = false;
        int c = this.chardata;
        // end == true means this item was merged as a whole set and cannot
        // start a 'c-d' range.
        boolean end = false;
        if (type == T_BACKSOLIDUS) {
            switch (c) {
              case 'd':  case 'D':
              case 'w':  case 'W':
              case 's':  case 'S':
                tok.mergeRanges(this.getTokenForShorthand(c));
                end = true;
                break;

              case 'i':  case 'I':
              case 'c':  case 'C':
                c = this.processCIinCharacterClass(tok, c);
                // A negative result means there is no single character to add.
                if (c < 0)  end = true;
                break;

              case 'p':
              case 'P':
                int pstart = this.offset;
                RangeToken tok2 = this.processBacksolidus_pP(c);
                if (tok2 == null)  throw this.ex("parser.atom.5", pstart);
                tok.mergeRanges(tok2);
                end = true;
                break;

              default:
                c = this.decodeEscaped();
            } // \ + c
        } // backsolidus
                                            // POSIX Character class such as [:alnum:]
        else if (type == T_POSIX_CHARCLASS_START) {
            int nameend = this.regex.indexOf(':', this.offset);
            if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
            boolean positive = true;
            if (this.regex.charAt(this.offset) == '^') {
                this.offset ++;
                positive = false;
            }
            String name = this.regex.substring(this.offset, nameend);
            RangeToken range = Token.getRange(name, positive,
                                              this.isSet(RegularExpression.XMLSCHEMA_MODE));
            if (range == null)  throw this.ex("parser.cc.3", this.offset);
            tok.mergeRanges(range);
            end = true;
            // The name must be closed by ":]".
            if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
                throw this.ex("parser.cc.1", nameend);
            this.offset = nameend+2;
        }
        this.next();
        if (!end) {                         // if not shorthands...
            if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
                tok.addRange(c, c);
            } else {
                this.next(); // Skips '-'
                if ((type = this.read()) == T_EOF)  throw this.ex("parser.cc.2", this.offset);
                if (type == T_CHAR && this.chardata == ']') {
                    // 'c-' directly before ']': both c and '-' are literals.
                    tok.addRange(c, c);
                    tok.addRange('-', '-');
                } else {
                    int rangeend = this.chardata;
                    if (type == T_BACKSOLIDUS)
                        rangeend = this.decodeEscaped();
                    this.next();
                    tok.addRange(c, rangeend);
                }
            }
        }
        // With SPECIAL_COMMA set, a ',' between items is skipped.
        if (this.isSet(RegularExpression.SPECIAL_COMMA)
            && this.read() == T_CHAR && this.chardata == ',')
            this.next();
    }
    if (this.read() == T_EOF)
        throw this.ex("parser.cc.2", this.offset);
    if (!useNrange && nrange) {
        base.subtractRanges(tok);
        tok = base;
    }
    tok.sortRanges();
    tok.compactRanges();
    //tok.dumpRanges();
    /*
    if (this.isSet(RegularExpression.IGNORE_CASE))
        tok = RangeToken.createCaseInsensitiveToken(tok);
    */
    this.setContext(S_NORMAL);
    this.next();                    // Skips ']'

    return tok;
}
907
908 /**
909 * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
910 */
911 protected RangeToken parseSetOperations() throws ParseException {
912 RangeToken tok = this.parseCharacterClass(false);
913 int type;
914 while ((type = this.read()) != T_RPAREN) {
915 int ch = this.chardata;
916 if (type == T_CHAR && (ch == '-' || ch == '&')
917 || type == T_PLUS) {
918 this.next();
919 if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
920 RangeToken t2 = this.parseCharacterClass(false);
921 if (type == T_PLUS)
922 tok.mergeRanges(t2);
923 else if (ch == '-')
924 tok.subtractRanges(t2);
925 else if (ch == '&')
926 tok.intersectRanges(t2);
927 else
928 throw new RuntimeException("ASSERT");
929 } else {
930 throw ex("parser.ope.2", this.offset-1);
931 }
932 }
933 this.next();
934 return tok;
935 }
936
937 Token getTokenForShorthand(int ch) {
938 Token tok;
939 switch (ch) {
940 case 'd':
941 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
942 ? Token.getRange("Nd", true) : Token.token_0to9;
943 break;
944 case 'D':
945 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
946 ? Token.getRange("Nd", false) : Token.token_not_0to9;
947 break;
948 case 'w':
949 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
950 ? Token.getRange("IsWord", true) : Token.token_wordchars;
951 break;
952 case 'W':
953 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
954 ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
955 break;
956 case 's':
957 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
958 ? Token.getRange("IsSpace", true) : Token.token_spaces;
959 break;
960 case 'S':
961 tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
962 ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
963 break;
964
965 default:
966 throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
967 }
968 return tok;
969 }
970
971 /**
972 */
973 int decodeEscaped() throws ParseException {
974 if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1);
975 int c = this.chardata;
976 switch (c) {
977 case 'e': c = 0x1b; break; // ESCAPE U+001B
978 case 'f': c = '\f'; break; // FORM FEED U+000C
979 case 'n': c = '\n'; break; // LINE FEED U+000A
980 case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D
981 case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009
982 //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
983 case 'x':
984 this.next();
985 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
986 if (this.chardata == '{') {
987 int v1 = 0;
988 int uv = 0;
989 do {
990 this.next();
991 if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
992 if ((v1 = hexChar(this.chardata)) < 0)
993 break;
994 if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
995 uv = uv*16+v1;
996 } while (true);
997 if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1);
998 if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1);
999 c = uv;
1000 } else {
1001 int v1 = 0;
1002 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1003 throw ex("parser.descape.1", this.offset-1);
1004 int uv = v1;
1005 this.next();
1006 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1007 throw ex("parser.descape.1", this.offset-1);
1008 uv = uv*16+v1;
1009 c = uv;
1010 }
1011 break;
1012
1013 case 'u':
1014 int v1 = 0;
1015 this.next();
1016 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1017 throw ex("parser.descape.1", this.offset-1);
1018 int uv = v1;
1019 this.next();
1020 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1021 throw ex("parser.descape.1", this.offset-1);
1022 uv = uv*16+v1;
1023 this.next();
1024 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1025 throw ex("parser.descape.1", this.offset-1);
1026 uv = uv*16+v1;
1027 this.next();
1028 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1029 throw ex("parser.descape.1", this.offset-1);
1030 uv = uv*16+v1;
1031 c = uv;
1032 break;
1033
1034 case 'v':
1035 this.next();
1036 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1037 throw ex("parser.descape.1", this.offset-1);
1038 uv = v1;
1039 this.next();
1040 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1041 throw ex("parser.descape.1", this.offset-1);
1042 uv = uv*16+v1;
1043 this.next();
1044 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1045 throw ex("parser.descape.1", this.offset-1);
1046 uv = uv*16+v1;
1047 this.next();
1048 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1049 throw ex("parser.descape.1", this.offset-1);
1050 uv = uv*16+v1;
1051 this.next();
1052 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1053 throw ex("parser.descape.1", this.offset-1);
1054 uv = uv*16+v1;
1055 this.next();
1056 if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
1057 throw ex("parser.descape.1", this.offset-1);
1058 uv = uv*16+v1;
1059 if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1);
1060 c = uv;
1061 break;
1062 case 'A':
1063 case 'Z':
1064 case 'z':
1065 throw ex("parser.descape.5", this.offset-2);
1066 default:
1067 }
1068 return c;
1069 }
1070
1071 static private final int hexChar(int ch) {
1072 if (ch < '0') return -1;
1073 if (ch > 'f') return -1;
1074 if (ch <= '9') return ch-'0';
1075 if (ch < 'A') return -1;
1076 if (ch <= 'F') return ch-'A'+10;
1077 if (ch < 'a') return -1;
1078 return ch-'a'+10;
1079 }
1080 }