Source code: org/apache/xerces/utils/regex/RegexParser.java
1 /*
2 * The Apache Software License, Version 1.1
3 *
4 *
5 * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
6 * reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution,
21 * if any, must include the following acknowledgment:
22 * "This product includes software developed by the
23 * Apache Software Foundation (http://www.apache.org/)."
24 * Alternately, this acknowledgment may appear in the software itself,
25 * if and wherever such third-party acknowledgments normally appear.
26 *
27 * 4. The names "Xerces" and "Apache Software Foundation" must
28 * not be used to endorse or promote products derived from this
29 * software without prior written permission. For written
30 * permission, please contact apache@apache.org.
31 *
32 * 5. Products derived from this software may not be called "Apache",
33 * nor may "Apache" appear in their name, without prior written
34 * permission of the Apache Software Foundation.
35 *
36 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47 * SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This software consists of voluntary contributions made by many
51 * individuals on behalf of the Apache Software Foundation and was
52 * originally based on software copyright (c) 1999, International
53 * Business Machines, Inc., http://www.apache.org. For more
54 * information on the Apache Software Foundation, please see
55 * <http://www.apache.org/>.
56 */
57
58 package org.apache.xerces.utils.regex;
59
60
61 import java.util.Locale;
62 import java.util.MissingResourceException;
63 import java.util.ResourceBundle;
64 import java.util.Vector;
65
66 /**
67 * A Regular Expression Parser.
68 */
69 class RegexParser {
// Token types produced by next() and returned by read().
// The trailing comment on each constant shows the pattern text that yields it.
70 static final int T_CHAR = 0; // any character with no special meaning in the current context
71 static final int T_EOF = 1; // end of the pattern was reached
72 static final int T_OR = 2; // '|'
73 static final int T_STAR = 3; // '*'
74 static final int T_PLUS = 4; // '+'
75 static final int T_QUESTION = 5; // '?'
76 static final int T_LPAREN = 6; // '('
77 static final int T_RPAREN = 7; // ')'
78 static final int T_DOT = 8; // '.'
79 static final int T_LBRACKET = 9; // '['
80 static final int T_BACKSOLIDUS = 10; // '\'
81 static final int T_CARET = 11; // '^'
82 static final int T_DOLLAR = 12; // '$'
83 static final int T_LPAREN2 = 13; // '(?:'
84 static final int T_LOOKAHEAD = 14; // '(?='
85 static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
86 static final int T_LOOKBEHIND = 16; // '(?<='
87 static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
88 static final int T_INDEPENDENT = 18; // '(?>'
89 static final int T_SET_OPERATIONS = 19; // '(?['
90 static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
91 static final int T_COMMENT = 21; // '(?#'
92 static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z]
93 static final int T_CONDITION = 23; // '(?('
94 static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
95
96 static class ReferencePosition {
97 int refNumber;
98 int position;
99 ReferencePosition(int n, int pos) {
100 this.refNumber = n;
101 this.position = pos;
102 }
103 }
104
// ---- Parser state; reset by parse() and advanced by next() ----
// Index in 'regex' of the next character next() will read.
105 int offset;
// Pattern being parsed (after extended-comment stripping when that option is set).
106 String regex;
// Cached regex.length().
107 int regexlen;
// Option flags given to parse(); tested through isSet().
108 int options;
// Message bundle used by ex(); assigned only by setLocale().
109 ResourceBundle resources;
// Character value belonging to the current token; -1 at end of input.
110 int chardata;
// Type (one of the T_* constants) of the current token; returned by read().
111 int nexttoken;
// Tokenizer contexts. Only S_NORMAL and S_INBRACKETS are used by the code visible here.
112 static protected final int S_NORMAL = 0;
113 static protected final int S_INBRACKETS = 1;
114 static protected final int S_INXBRACKETS = 2;
// Current tokenizer context; next() applies character-class rules when it is S_INBRACKETS.
115 int context = S_NORMAL;
// Number that processParen() will give to the next capturing group (starts at 1).
116 int parennumber = 1;
// Set when a back reference or a numbered condition has been parsed.
117 boolean hasBackReferences;
// ReferencePosition entries collected during a parse; created lazily, checked at the end of parse().
118 Vector references = null;
119
/**
 * Creates a parser. The call that would load the message bundle for the
 * default locale is commented out, so this constructor leaves
 * {@code resources} unset until {@link #setLocale(Locale)} is called.
 */
120 public RegexParser() {
121 //TODO IBM-JR this.setLocale(Locale.getDefault());
122 }
/**
 * Creates a parser. The {@code locale} argument is currently ignored because
 * the call to {@link #setLocale(Locale)} is commented out; {@code resources}
 * stays unset until {@code setLocale} is called explicitly.
 *
 * @param locale locale for error messages (not used at present)
 */
123 public RegexParser(Locale locale) {
124 //TODO IBM-JR this.setLocale(locale);
125 }
126
127 public void setLocale(Locale locale) {
128 try {
129 this.resources = ResourceBundle.getBundle("org.apache.xerces.utils.regex.message", locale);
130 } catch (MissingResourceException mre) {
131 throw new RuntimeException("Installation Problem??? Couldn't load messages: "
132 +mre.getMessage());
133 }
134 }
135
136 final ParseException ex(String key, int loc) {
137 return new ParseException(this.resources.getString(key), loc);
138 }
139
140 private final boolean isSet(int flag) {
141 return (this.options & flag) == flag;
142 }
143
144 synchronized Token parse(String regex, int options) throws ParseException {
145 this.options = options;
146 this.offset = 0;
147 this.setContext(S_NORMAL);
148 this.parennumber = 1;
149 this.hasBackReferences = false;
150 this.regex = regex;
151 if (this.isSet(RegularExpression.EXTENDED_COMMENT))
152 this.regex = REUtil.stripExtendedComment(this.regex);
153 this.regexlen = this.regex.length();
154
155
156 this.next();
157 Token ret = this.parseRegex();
158 if (this.offset != this.regexlen)
159 throw ex("parser.parse.1", this.offset);
160 if (this.references != null) {
161 for (int i = 0; i < this.references.size(); i ++) {
162 ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
163 if (this.parennumber <= position.refNumber)
164 throw ex("parser.parse.2", position.position);
165 }
166 this.references.removeAllElements();
167 }
168 return ret;
169 }
170
171 /*
172 public RegularExpression createRegex(String regex, int options) throws ParseException {
173 Token tok = this.parse(regex, options);
174 return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
175 }
176 */
177
178 protected final void setContext(int con) {
179 this.context = con;
180 }
181
182 final int read() {
183 return this.nexttoken;
184 }
185
186 final void next() {
187 if (this.offset >= this.regexlen) {
188 this.chardata = -1;
189 this.nexttoken = T_EOF;
190 return;
191 }
192
193 int ret;
194 int ch = this.regex.charAt(this.offset++);
195 this.chardata = ch;
196
197 if (this.context == S_INBRACKETS) {
198 // In a character class, this.chardata has one character, that is to say,
199 // a pair of surrogates is composed and stored to this.chardata.
200 switch (ch) {
201 case '\\':
202 ret = T_BACKSOLIDUS;
203 if (this.offset >= this.regexlen)
204 throw ex("parser.next.1", this.offset-1);
205 this.chardata = this.regex.charAt(this.offset++);
206 break;
207
208 case '-':
209 if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
210 && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
211 this.offset++;
212 ret = T_XMLSCHEMA_CC_SUBTRACTION;
213 } else
214 ret = T_CHAR;
215 break;
216
217 case '[':
218 if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
219 && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
220 this.offset++;
221 ret = T_POSIX_CHARCLASS_START;
222 break;
223 } // Through down
224 default:
225 if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
226 int low = this.regex.charAt(this.offset);
227 if (REUtil.isLowSurrogate(low)) {
228 this.chardata = REUtil.composeFromSurrogates(ch, low);
229 this.offset ++;
230 }
231 }
232 ret = T_CHAR;
233 }
234 this.nexttoken = ret;
235 return;
236 }
237
238 switch (ch) {
239 case '|': ret = T_OR; break;
240 case '*': ret = T_STAR; break;
241 case '+': ret = T_PLUS; break;
242 case '?': ret = T_QUESTION; break;
243 case ')': ret = T_RPAREN; break;
244 case '.': ret = T_DOT; break;
245 case '[': ret = T_LBRACKET; break;
246 case '^': ret = T_CARET; break;
247 case '$': ret = T_DOLLAR; break;
248 case '(':
249 ret = T_LPAREN;
250 if (this.offset >= this.regexlen)
251 break;
252 if (this.regex.charAt(this.offset) != '?')
253 break;
254 if (++this.offset >= this.regexlen)
255 throw ex("parser.next.2", this.offset-1);
256 ch = this.regex.charAt(this.offset++);
257 switch (ch) {
258 case ':': ret = T_LPAREN2; break;
259 case '=': ret = T_LOOKAHEAD; break;
260 case '!': ret = T_NEGATIVELOOKAHEAD; break;
261 case '[': ret = T_SET_OPERATIONS; break;
262 case '>': ret = T_INDEPENDENT; break;
263 case '<':
264 if (this.offset >= this.regexlen)
265 throw ex("parser.next.2", this.offset-3);
266 ch = this.regex.charAt(this.offset++);
267 if (ch == '=') {
268 ret = T_LOOKBEHIND;
269 } else if (ch == '!') {
270 ret = T_NEGATIVELOOKBEHIND;
271 } else
272 throw ex("parser.next.3", this.offset-3);
273 break;
274 case '#':
275 while (this.offset < this.regexlen) {
276 ch = this.regex.charAt(this.offset++);
277 if (ch == ')') break;
278 }
279 if (ch != ')')
280 throw ex("parser.next.4", this.offset-1);
281 ret = T_COMMENT;
282 break;
283 default:
284 if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
285 this.offset --;
286 ret = T_MODIFIERS;
287 break;
288 } else if (ch == '(') { // conditional
289 ret = T_CONDITION; // this.offsets points the next of '('.
290 break;
291 }
292 throw ex("parser.next.2", this.offset-2);
293 }
294 break;
295
296 case '\\':
297 ret = T_BACKSOLIDUS;
298 if (this.offset >= this.regexlen)
299 throw ex("parser.next.1", this.offset-1);
300 this.chardata = this.regex.charAt(this.offset++);
301 break;
302
303 default:
304 ret = T_CHAR;
305 if (REUtil.isHighSurrogate(this.chardata) && this.offset < this.regexlen)
306 this.chardata = REUtil.composeFromSurrogates(this.chardata,
307 this.regex.charAt(this.offset++));
308 }
309 this.nexttoken = ret;
310 }
311
312 /**
313 * regex ::= term (`|` term)*
314 * term ::= factor+
315 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
316 * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
317 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
318 * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
319 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
320 */
321 Token parseRegex() throws ParseException {
322 Token tok = this.parseTerm();
323 Token parent = null;
324 while (this.read() == T_OR) {
325 this.next(); // '|'
326 if (parent == null) {
327 parent = Token.createUnion();
328 parent.addChild(tok);
329 tok = parent;
330 }
331 tok.addChild(this.parseTerm());
332 }
333 return tok;
334 }
335
336 /**
337 * term ::= factor+
338 */
339 Token parseTerm() throws ParseException {
340 int ch = this.read();
341 if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
342 return Token.createEmpty();
343 } else {
344 Token tok = this.parseFactor();
345 Token concat = null;
346 while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
347 if (concat == null) {
348 concat = Token.createConcat();
349 concat.addChild(tok);
350 tok = concat;
351 }
352 concat.addChild(this.parseFactor());
353 //tok = Token.createConcat(tok, this.parseFactor());
354 }
355 return tok;
356 }
357 }
358
359 // ----------------------------------------------------------------
360
361 Token processCaret() throws ParseException {
362 this.next();
363 return Token.token_linebeginning;
364 }
365 Token processDollar() throws ParseException {
366 this.next();
367 return Token.token_lineend;
368 }
369 Token processLookahead() throws ParseException {
370 this.next();
371 Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
372 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
373 this.next(); // ')'
374 return tok;
375 }
376 Token processNegativelookahead() throws ParseException {
377 this.next();
378 Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
379 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
380 this.next(); // ')'
381 return tok;
382 }
383 Token processLookbehind() throws ParseException {
384 this.next();
385 Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
386 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
387 this.next(); // ')'
388 return tok;
389 }
390 Token processNegativelookbehind() throws ParseException {
391 this.next();
392 Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
393 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
394 this.next(); // ')'
395 return tok;
396 }
397 Token processBacksolidus_A() throws ParseException {
398 this.next();
399 return Token.token_stringbeginning;
400 }
401 Token processBacksolidus_Z() throws ParseException {
402 this.next();
403 return Token.token_stringend2;
404 }
405 Token processBacksolidus_z() throws ParseException {
406 this.next();
407 return Token.token_stringend;
408 }
409 Token processBacksolidus_b() throws ParseException {
410 this.next();
411 return Token.token_wordedge;
412 }
413 Token processBacksolidus_B() throws ParseException {
414 this.next();
415 return Token.token_not_wordedge;
416 }
417 Token processBacksolidus_lt() throws ParseException {
418 this.next();
419 return Token.token_wordbeginning;
420 }
421 Token processBacksolidus_gt() throws ParseException {
422 this.next();
423 return Token.token_wordend;
424 }
425 Token processStar(Token tok) throws ParseException {
426 this.next();
427 if (this.read() == T_QUESTION) {
428 this.next();
429 return Token.createNGClosure(tok);
430 } else
431 return Token.createClosure(tok);
432 }
433 Token processPlus(Token tok) throws ParseException {
434 // X+ -> XX*
435 this.next();
436 if (this.read() == T_QUESTION) {
437 this.next();
438 return Token.createConcat(tok, Token.createNGClosure(tok));
439 } else
440 return Token.createConcat(tok, Token.createClosure(tok));
441 }
442 Token processQuestion(Token tok) throws ParseException {
443 // X? -> X|
444 this.next();
445 Token par = Token.createUnion();
446 if (this.read() == T_QUESTION) {
447 this.next();
448 par.addChild(Token.createEmpty());
449 par.addChild(tok);
450 } else {
451 par.addChild(tok);
452 par.addChild(Token.createEmpty());
453 }
454 return par;
455 }
456 boolean checkQuestion(int off) {
457 return off < this.regexlen && this.regex.charAt(off) == '?';
458 }
459 Token processParen() throws ParseException {
460 this.next();
461 int p = this.parennumber++;
462 Token tok = Token.createParen(this.parseRegex(), p);
463 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
464 this.next(); // Skips ')'
465 return tok;
466 }
467 Token processParen2() throws ParseException {
468 this.next();
469 Token tok = Token.createParen(this.parseRegex(), 0);
470 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
471 this.next(); // Skips ')'
472 return tok;
473 }
474 Token processCondition() throws ParseException {
475 // this.offset points the next of '('
476 if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset);
477 // Parses a condition.
478 int refno = -1;
479 Token condition = null;
480 int ch = this.regex.charAt(this.offset);
481 if ('1' <= ch && ch <= '9') {
482 refno = ch-'0';
483 this.hasBackReferences = true;
484 if (this.references == null) this.references = new Vector();
485 this.references.addElement(new ReferencePosition(refno, this.offset));
486 this.offset ++;
487 if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset);
488 this.offset ++;
489 } else {
490 if (ch == '?') this.offset --; // Points '('.
491 this.next();
492 condition = this.parseFactor();
493 switch (condition.type) {
494 case Token.LOOKAHEAD:
495 case Token.NEGATIVELOOKAHEAD:
496 case Token.LOOKBEHIND:
497 case Token.NEGATIVELOOKBEHIND:
498 break;
499 case Token.ANCHOR:
500 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
501 break;
502 default:
503 throw ex("parser.factor.5", this.offset);
504 }
505 }
506 // Parses yes/no-patterns.
507 this.next();
508 Token yesPattern = this.parseRegex();
509 Token noPattern = null;
510 if (yesPattern.type == Token.UNION) {
511 if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset);
512 noPattern = yesPattern.getChild(1);
513 yesPattern = yesPattern.getChild(0);
514 }
515 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
516 this.next();
517 return Token.createCondition(refno, condition, yesPattern, noPattern);
518 }
519 Token processModifiers() throws ParseException {
520 // this.offset points the next of '?'.
521 // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
522 int add = 0, mask = 0, ch = -1;
523 while (this.offset < this.regexlen) {
524 ch = this.regex.charAt(this.offset);
525 int v = REUtil.getOptionValue(ch);
526 if (v == 0) break; // '-' or ':'?
527 add |= v;
528 this.offset ++;
529 }
530 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
531 if (ch == '-') {
532 this.offset ++;
533 while (this.offset < this.regexlen) {
534 ch = this.regex.charAt(this.offset);
535 int v = REUtil.getOptionValue(ch);
536 if (v == 0) break; // ':'?
537 mask |= v;
538 this.offset ++;
539 }
540 if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
541 }
542 Token tok;
543 if (ch == ':') {
544 this.offset ++;
545 this.next();
546 tok = Token.createModifierGroup(this.parseRegex(), add, mask);
547 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
548 this.next();
549 } else if (ch == ')') { // such as (?-i)
550 this.offset ++;
551 this.next();
552 tok = Token.createModifierGroup(this.parseRegex(), add, mask);
553 } else
554 throw ex("parser.factor.3", this.offset);
555
556 return tok;
557 }
558 Token processIndependent() throws ParseException {
559 this.next();
560 Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
561 if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
562 this.next(); // Skips ')'
563 return tok;
564 }
565 Token processBacksolidus_c() throws ParseException {
566 int ch2; // Must be in 0x0040-0x005f
567 if (this.offset >= this.regexlen
568 || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
569 throw ex("parser.atom.1", this.offset-1);
570 this.next();
571 return Token.createChar(ch2-0x40);
572 }
573 Token processBacksolidus_C() throws ParseException {
574 throw ex("parser.process.1", this.offset);
575 }
576 Token processBacksolidus_i() throws ParseException {
577 Token tok = Token.createChar('i');
578 this.next();
579 return tok;
580 }
581 Token processBacksolidus_I() throws ParseException {
582 throw ex("parser.process.1", this.offset);
583 }
584 Token processBacksolidus_g() throws ParseException {
585 this.next();
586 return Token.getGraphemePattern();
587 }
588 Token processBacksolidus_X() throws ParseException {
589 this.next();
590 return Token.getCombiningCharacterSequence();
591 }
592 Token processBackreference() throws ParseException {
593 int refnum = this.chardata-'0';
594 Token tok = Token.createBackReference(refnum);
595 this.hasBackReferences = true;
596 if (this.references == null) this.references = new Vector();
597 this.references.addElement(new ReferencePosition(refnum, this.offset-2));
598 this.next();
599 return tok;
600 }
601
602 // ----------------------------------------------------------------
603
604 /**
605 * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
606 * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
607 * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
608 * | '(?#' [^)]* ')'
609 * minmax ::= '{' min (',' max?)? '}'
610 * min ::= [0-9]+
611 * max ::= [0-9]+
612 */
613 Token parseFactor() throws ParseException {
// Parses one factor starting at the current token. Anchors, look-around
// groups and '(?#...)' comments are returned at once and never take a
// quantifier; anything else is parsed as an atom and then checked for a
// following '*', '+', '?' or '{min,max}' quantifier.
614 int ch = this.read();
615 Token tok;
616 switch (ch) {
617 case T_CARET: return this.processCaret();
618 case T_DOLLAR: return this.processDollar();
619 case T_LOOKAHEAD: return this.processLookahead();
620 case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
621 case T_LOOKBEHIND: return this.processLookbehind();
622 case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
623
624 case T_COMMENT:
// next() already skipped the whole '(?#...)' text; it contributes an empty token.
625 this.next();
626 return Token.createEmpty();
627
628 case T_BACKSOLIDUS:
// Escapes that denote anchors are handled here; any other escape is an atom.
629 switch (this.chardata) {
630 case 'A': return this.processBacksolidus_A();
631 case 'Z': return this.processBacksolidus_Z();
632 case 'z': return this.processBacksolidus_z();
633 case 'b': return this.processBacksolidus_b();
634 case 'B': return this.processBacksolidus_B();
635 case '<': return this.processBacksolidus_lt();
636 case '>': return this.processBacksolidus_gt();
637 }
638 // falls through to the atom parsing below
639 }
640 tok = this.parseAtom();
641 ch = this.read();
642 switch (ch) {
643 case T_STAR: return this.processStar(tok);
644 case T_PLUS: return this.processPlus(tok);
645 case T_QUESTION: return this.processQuestion(tok);
646 case T_CHAR:
647 if (this.chardata == '{') {
// Tries to read '{min}', '{min,}', '{min,max}' or '{,max}' by scanning ahead
// with the private cursor 'off'. this.offset is moved only after the whole
// quantifier has been recognized, so every 'break' below leaves the '{'
// token unconsumed and tok is returned without a quantifier.
648 // this.offset -> next of '{'
649 int off = this.offset;
650 int min = 0, max = -1;
651 if (off >= this.regexlen) break;
652 ch = this.regex.charAt(off++);
653 if (ch != ',' && (ch < '0' || ch > '9')) break;
654 if (ch != ',') { // 0-9
655 min = ch-'0';
656 while (off < this.regexlen
657 && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
658 min = min*10 +ch-'0';
// Marks that the last character consumed was a digit; if the loop now ends
// because the input is exhausted, ch stays negative.
659 ch = -1;
660 }
// Input ended inside the number: not a quantifier.
661 if (ch < 0) break;
662 }
663 //if (off >= this.regexlen) break;
// '{n}' means exactly n; max is overwritten below when a ',' follows.
664 max = min;
665 if (ch == ',') {
666 if (off >= this.regexlen
667 || ((ch = this.regex.charAt(off++)) < '0' || ch > '9')
668 && ch != '}')
669 break;
670 if (ch == '}') {
671 max = -1; // {min,}
672 } else {
673 max = ch-'0'; // {min,max}
674 while (off < this.regexlen
675 && (ch = this.regex.charAt(off++)) >= '0'
676 && ch <= '9') {
677 max = max*10 +ch-'0';
678 ch = -1;
679 }
680 if (ch < 0) break;
681 //if (min > max)
682 // throw new ParseException("parseFactor(): min > max: "+min+", "+max);
683 }
684 }
685 if (ch != '}') break;
686 // off -> next of '}'
687 if (this.checkQuestion(off)) {
// A '?' right after '}' makes the quantifier non-greedy; off+1 skips that '?'.
688 tok = Token.createNGClosure(tok);
689 this.offset = off+1;
690 } else {
691 tok = Token.createClosure(tok);
692 this.offset = off;
693 }
694 tok.setMin(min);
695 tok.setMax(max);
696 //System.err.println("CLOSURE: "+min+", "+max);
// Load the token that follows the quantifier.
697 this.next();
698 }
699 }
700 return tok;
701 }
702
703 /**
704 * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
705 * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
706 * | '(?>' regex ')'
707 * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
708 */
709 Token parseAtom() throws ParseException {
// Parses one atom selected by the current token type. Cases that 'return'
// delegate to a handler that advances the tokenizer itself; cases that
// 'break' build the token here and advance with next().
710 int ch = this.read();
711 Token tok = null;
712 switch (ch) {
713 case T_LPAREN: return this.processParen();
714 case T_LPAREN2: return this.processParen2(); // '(?:'
715 case T_CONDITION: return this.processCondition(); // '(?('
716 case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... )
717 case T_INDEPENDENT: return this.processIndependent();
718 case T_DOT:
719 this.next(); // Skips '.'
720 tok = Token.token_dot;
721 break;
722
723 /**
724 * char-class ::= '[' ( '^'? range ','?)+ ']'
725 * range ::= '\d' | '\w' | '\s' | category-block | range-char
726 * | range-char '-' range-char
727 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
728 * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
729 */
730 case T_LBRACKET: return this.parseCharacterClass(true);
731 case T_SET_OPERATIONS: return this.parseSetOperations();
732
733 case T_BACKSOLIDUS:
// chardata holds the character that followed the backslash.
734 switch (this.chardata) {
735 case 'd': case 'D':
736 case 'w': case 'W':
737 case 's': case 'S':
738 tok = this.getTokenForShorthand(this.chardata);
739 this.next();
740 return tok;
741
742 case 'e': case 'f': case 'n': case 'r':
743 case 't': case 'u': case 'v': case 'x':
744 {
// decodeEscaped() is defined elsewhere in this class; its result is used as
// a character value. Values below 0x10000 become a single char token,
// larger ones a string token built from the surrogate decomposition.
745 int ch2 = this.decodeEscaped();
746 if (ch2 < 0x10000) {
747 tok = Token.createChar(ch2);
748 } else {
749 tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
750 }
751 }
752 break;
753
754 case 'c': return this.processBacksolidus_c();
755 case 'C': return this.processBacksolidus_C();
756 case 'i': return this.processBacksolidus_i();
757 case 'I': return this.processBacksolidus_I();
758 case 'g': return this.processBacksolidus_g();
759 case 'X': return this.processBacksolidus_X();
760 case '1': case '2': case '3': case '4':
761 case '5': case '6': case '7': case '8': case '9':
762 return this.processBackreference();
763
764 case 'P':
765 case 'p':
// pstart is remembered only for error reporting when no range token is returned.
766 int pstart = this.offset;
767 tok = processBacksolidus_pP(this.chardata);
768 if (tok == null) throw this.ex("parser.atom.5", pstart);
769 break;
770
771 default:
// Any other escaped character stands for itself.
772 tok = Token.createChar(this.chardata);
773 }
// Reached only by the inner cases that 'break'; advances past the escape.
774 this.next();
775 break;
776
777 case T_CHAR:
778 tok = Token.createChar(this.chardata);
779 this.next();
780 break;
781
782 default:
// Any other token type (for example a quantifier with nothing to repeat) cannot start an atom.
783 throw this.ex("parser.atom.4", this.offset-1);
784 }
785 return tok;
786 }
787
788 protected RangeToken processBacksolidus_pP(int c) throws ParseException {
789 boolean positive = c == 'p';
790 this.next();
791 if (this.read() != T_CHAR) throw this.ex("parser.atom.2", this.offset-1);
792 RangeToken tok;
793 switch (this.chardata) {
794 case 'L': // Letter
795 tok = Token.getRange("L", positive); break;
796 case 'M': // Mark
797 tok = Token.getRange("M", positive); break;
798 case 'N': // Number
799 tok = Token.getRange("N", positive); break;
800 case 'Z': // Separator
801 tok = Token.getRange("Z", positive); break;
802 case 'C': // Other
803 tok = Token.getRange("C", positive); break;
804 case 'P': // Punctuation
805 tok = Token.getRange("P", positive); break;
806 case 'S': // Symbol
807 tok = Token.getRange("S", positive); break;
808 case '{':
809 // this.offset points the next of '{'.
810 //pstart = this.offset;
811 int namestart = this.offset;
812 int nameend = this.regex.indexOf('}', namestart);
813 if (nameend < 0) throw this.ex("parser.atom.3", this.offset);
814 this.offset = nameend+1;
815 tok = Token.getRange(this.regex.substring(namestart, nameend), positive);
816 /*
817 if (this.isSet(RegularExpression.IGNORE_CASE))
818 tok = RangeToken.createCaseInsensitiveToken(tok);
819 */
820 break;
821
822 default:
823 throw this.ex("parser.atom.2", this.offset-1);
824 }
825 return tok;
826 }
827
828 int processCIinCharacterClass(RangeToken tok, int c) {
829 return this.decodeEscaped();
830 }
831
832 /**
833 * char-class ::= '[' ( '^'? range ','?)+ ']'
834 * range ::= '\d' | '\w' | '\s' | category-block | range-char
835 * | range-char '-' range-char
836 * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
837 * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
838 */
839 protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
// Parses a character class; the current token is the opening '['.
// useNrange selects how a negated class ('[^...]') is represented: when true
// the ranges are collected into a token made by Token.createNRange(); when
// false they are collected into an ordinary range that is finally subtracted
// from the full range 0..Token.UTF16_MAX.
// The tokenizer is switched to S_INBRACKETS for the duration and back to
// S_NORMAL before the closing ']' is skipped.
840 this.setContext(S_INBRACKETS);
841 this.next(); // '['
842 boolean nrange = false;
843 RangeToken base = null;
844 RangeToken tok;
845 if (this.read() == T_CHAR && this.chardata == '^') {
846 nrange = true;
847 this.next(); // '^'
848 if (useNrange) {
849 tok = Token.createNRange();
850 } else {
851 base = Token.createRange();
852 base.addRange(0, Token.UTF16_MAX);
853 tok = Token.createRange();
854 }
855 } else {
856 tok = Token.createRange();
857 }
858 int type;
// While firstloop is true a ']' does not end the class, so a ']' placed
// first is taken as a literal member.
859 boolean firstloop = true;
860 while ((type = this.read()) != T_EOF) {
861 if (type == T_CHAR && this.chardata == ']' && !firstloop)
862 break;
863 firstloop = false;
864 int c = this.chardata;
// 'end' is set when the item was merged into tok as a whole set; such an
// item is complete and cannot be the start of an 'a-b' range.
865 boolean end = false;
866 if (type == T_BACKSOLIDUS) {
867 switch (c) {
868 case 'd': case 'D':
869 case 'w': case 'W':
870 case 's': case 'S':
871 tok.mergeRanges(this.getTokenForShorthand(c));
872 end = true;
873 break;
874
875 case 'i': case 'I':
876 case 'c': case 'C':
// A negative result is treated as "nothing left to add for this item".
877 c = this.processCIinCharacterClass(tok, c);
878 if (c < 0) end = true;
879 break;
880
881 case 'p':
882 case 'P':
883 int pstart = this.offset;
884 RangeToken tok2 = this.processBacksolidus_pP(c);
885 if (tok2 == null) throw this.ex("parser.atom.5", pstart);
886 tok.mergeRanges(tok2);
887 end = true;
888 break;
889
890 default:
// Any other escape: decodeEscaped() (defined elsewhere) supplies the character value.
891 c = this.decodeEscaped();
892 } // \ + c
893 } // backsolidus
894 // POSIX Character class such as [:alnum:]
895 else if (type == T_POSIX_CHARCLASS_START) {
// this.offset points just after '[:'. The name runs up to the next ':' and
// may be preceded by '^' for negation; it must be followed by ':]'.
896 int nameend = this.regex.indexOf(':', this.offset);
897 if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
898 boolean positive = true;
899 if (this.regex.charAt(this.offset) == '^') {
900 this.offset ++;
901 positive = false;
902 }
903 String name = this.regex.substring(this.offset, nameend);
904 RangeToken range = Token.getRange(name, positive);
905 if (range == null) throw this.ex("parser.cc.3", this.offset);
906 tok.mergeRanges(range);
907 end = true;
908 if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
909 throw this.ex("parser.cc.1", nameend);
910 this.offset = nameend+2;
911 }
// Advance past the item just handled.
912 this.next();
913 if (!end) { // if not shorthands...
914 if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
915 tok.addRange(c, c);
916 } else {
917 this.next(); // Skips '-'
918 if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
919 if (type == T_CHAR && this.chardata == ']') {
// 'c-' directly before ']': both c and '-' are literal members. The ']' is
// left as the current token so the loop condition ends the class.
920 tok.addRange(c, c);
921 tok.addRange('-', '-');
922 } else {
923 int rangeend = this.chardata;
924 if (type == T_BACKSOLIDUS)
925 rangeend = this.decodeEscaped();
926 this.next();
927 tok.addRange(c, rangeend);
928 }
929 }
930 }
// With the SPECIAL_COMMA option a ',' between items is skipped as a separator.
931 if (this.isSet(RegularExpression.SPECIAL_COMMA)
932 && this.read() == T_CHAR && this.chardata == ',')
933 this.next();
934 }
// The loop ended at end of input rather than at ']': the class is unterminated.
935 if (this.read() == T_EOF)
936 throw this.ex("parser.cc.2", this.offset);
937 if (!useNrange && nrange) {
// Negation by subtraction: everything in 0..UTF16_MAX except the collected ranges.
938 base.subtractRanges(tok);
939 tok = base;
940 }
941 tok.sortRanges();
942 tok.compactRanges();
943 //tok.dumpRanges();
944 /*
945 if (this.isSet(RegularExpression.IGNORE_CASE))
946 tok = RangeToken.createCaseInsensitiveToken(tok);
947 */
948 this.setContext(S_NORMAL);
949 this.next(); // Skips ']'
950
951 return tok;
952 }
953 private RangeToken parseCharacterClass_old(boolean useNrange) throws ParseException {
954 this.setContext(S_INBRACKETS);
955 this.next(); // '['
956 boolean nrange = false;
957 RangeToken base = null;
958 RangeToken tok;
959 if (this.read() == T_CHAR && this.chardata == '^') {
960 nrange = true;
961 this.next(); // '^'
962 if (useNrange) {
963 tok = Token.createNRange();
964 } else {
965 base = Token.createRange();
966 base.addRange(0, Token.UTF16_MAX);
967 tok = Token.createRange();
968 }
969 } else {
970 tok = Token.createRange();
971 }
972 int type;
973 while ((type = this.read()) != T_EOF
974 && !(type == T_CHAR && this.chardata == ']')) {
975 int c = this.chardata;
976 /*
977 if (type == T_CHAR && c == '^') {
978 this.next();
979 type = this.read();
980 c = this.chardata;
981 if (type == T_EOF) break;
982
983 nrange = !nrange;
984 if (nrange)
985 tok = Token.createRange();
986 else {
987 base.subtractRanges(tok);
988 tok = base;
989 }
990 }
991 */
992 boolean end = false;
993 if (type == T_BACKSOLIDUS) {
994 switch (c) {
995 case 'd': case 'D':
996 case 'w': case 'W':
997 case 's': case 'S':
998 tok.mergeRanges(this.getTokenForShorthand(c));
999 end = true;
1000 break;
1001
1002 case 'i': case 'I':
1003 case 'c': case 'C':
1004 c = this.processCIinCharacterClass(tok, c);
1005 if (c < 0) end = true;
1006 break;
1007
1008 case 'p':
1009 case 'P':
1010 boolean positive = c == 'p';
1011 int pstart = this.offset;
1012 this.next();
1013 if (this.read() != T_CHAR) throw ex("parser.atom.2", this.offset-1);
1014 RangeToken tok2 = null;
1015 switch (this.chardata) {
1016 case 'L': // Letter
1017 tok2 = Token.getRange("L", positive); break;
1018 case 'M': // Mark
1019 tok2 = Token.getRange("M", positive); break;
1020 case 'N': // Number
1021 tok2 = Token.getRange("N", positive); break;
1022 case 'Z': // Separator
1023 tok2 = Token.getRange("Z", positive); break;
1024 case 'C': // Other
1025 tok2 = Token.getRange("C", positive); break;
1026 case 'P': // Punctuation
1027 tok2 = Token.getRange("P", positive); break;
1028 case 'S': // Symbol
1029 tok2 = Token.getRange("S", positive); break;
1030 case '{':
1031 // this.offset points the next of '{'.
1032 pstart = this.offset;
1033 int namestart = this.offset;
1034 int nameend = this.regex.indexOf('}', namestart);
1035 if (nameend < 0) throw ex("parser.atom.3", this.offset);
1036 this.offset = nameend+1;
1037 tok2 = Token.getRange(this.regex.substring(namestart, nameend), positive);
1038 break;
1039
1040 default:
1041 throw ex("parser.atom.2", this.offset-1);
1042 }
1043 if (tok2 == null) throw ex("parser.atom.5", pstart);
1044 tok.mergeRanges(tok2);
1045 end = true;
1046 break;
1047
1048 default:
1049 c = this.decodeEscaped();
1050 } // \ + c
1051 }