1 /* Copyright 2004 The Apache Software Foundation
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package org.apache.xmlbeans.impl.regex;
17
18 import java.util.Vector;
19 import java.util.Hashtable;
20
21 /**
22 * This class represents a node in parse tree.
23 */
24 class Token implements java.io.Serializable {
25 static final boolean COUNTTOKENS = true;
26 static int tokens = 0;
27
28 static final int CHAR = 0; // Literal char
29 static final int DOT = 11; // .
30 static final int CONCAT = 1; // XY
31 static final int UNION = 2; // X|Y|Z
32 static final int CLOSURE = 3; // X*
33 static final int RANGE = 4; // [a-zA-Z] etc.
34 static final int NRANGE = 5; // [^a-zA-Z] etc.
35 static final int PAREN = 6; // (X) or (?:X)
36 static final int EMPTY = 7; //
37 static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
38 static final int NONGREEDYCLOSURE = 9; // *? +?
39 static final int STRING = 10; // strings
40 static final int BACKREFERENCE = 12; // back references
41 static final int LOOKAHEAD = 20; // (?=...)
42 static final int NEGATIVELOOKAHEAD = 21; // (?!...)
43 static final int LOOKBEHIND = 22; // (?<=...)
44 static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
45 static final int INDEPENDENT = 24; // (?>...)
46 static final int MODIFIERGROUP = 25; // (?ims-ims:...)
47 static final int CONDITION = 26; // (?(...)yes|no)
48
49 static final int UTF16_MAX = 0x10ffff;
50
51 int type;
52
53 static Token token_dot;
54 static Token token_0to9;
55 static Token token_wordchars;
56 static Token token_not_0to9;
57 static Token token_not_wordchars;
58 static Token token_spaces;
59 static Token token_not_spaces;
60 static Token token_empty;
61 static Token token_linebeginning;
62 static Token token_linebeginning2;
63 static Token token_lineend;
64 static Token token_stringbeginning;
65 static Token token_stringend;
66 static Token token_stringend2;
67 static Token token_wordedge;
68 static Token token_not_wordedge;
69 static Token token_wordbeginning;
70 static Token token_wordend;
71 static {
72 Token.token_empty = new Token(Token.EMPTY);
73
74 Token.token_linebeginning = Token.createAnchor('^');
75 Token.token_linebeginning2 = Token.createAnchor('@');
76 Token.token_lineend = Token.createAnchor('$');
77 Token.token_stringbeginning = Token.createAnchor('A');
78 Token.token_stringend = Token.createAnchor('z');
79 Token.token_stringend2 = Token.createAnchor('Z');
80 Token.token_wordedge = Token.createAnchor('b');
81 Token.token_not_wordedge = Token.createAnchor('B');
82 Token.token_wordbeginning = Token.createAnchor('<');
83 Token.token_wordend = Token.createAnchor('>');
84
85 Token.token_dot = new Token(Token.DOT);
86
87 Token.token_0to9 = Token.createRange();
88 Token.token_0to9.addRange('0', '9');
89 Token.token_wordchars = Token.createRange();
90 Token.token_wordchars.addRange('0', '9');
91 Token.token_wordchars.addRange('A', 'Z');
92 Token.token_wordchars.addRange('_', '_');
93 Token.token_wordchars.addRange('a', 'z');
94 Token.token_spaces = Token.createRange();
95 Token.token_spaces.addRange('\t', '\t');
96 Token.token_spaces.addRange('\n', '\n');
97 Token.token_spaces.addRange('\f', '\f');
98 Token.token_spaces.addRange('\r', '\r');
99 Token.token_spaces.addRange(' ', ' ');
100
101 Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
102 Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
103 Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
104 }
105
106 static Token.ParenToken createLook(int type, Token child) {
107 if (COUNTTOKENS) Token.tokens ++;
108 return new Token.ParenToken(type, child, 0);
109 }
110 static Token.ParenToken createParen(Token child, int pnumber) {
111 if (COUNTTOKENS) Token.tokens ++;
112 return new Token.ParenToken(Token.PAREN, child, pnumber);
113 }
114 static Token.ClosureToken createClosure(Token tok) {
115 if (COUNTTOKENS) Token.tokens ++;
116 return new Token.ClosureToken(Token.CLOSURE, tok);
117 }
118 static Token.ClosureToken createNGClosure(Token tok) {
119 if (COUNTTOKENS) Token.tokens ++;
120 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
121 }
122 static Token.ConcatToken createConcat(Token tok1, Token tok2) {
123 if (COUNTTOKENS) Token.tokens ++;
124 return new Token.ConcatToken(tok1, tok2);
125 }
126 static Token.UnionToken createConcat() {
127 if (COUNTTOKENS) Token.tokens ++;
128 return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
129 }
130 static Token.UnionToken createUnion() {
131 if (COUNTTOKENS) Token.tokens ++;
132 return new Token.UnionToken(Token.UNION);
133 }
134 static Token createEmpty() {
135 return Token.token_empty;
136 }
137 static RangeToken createRange() {
138 if (COUNTTOKENS) Token.tokens ++;
139 return new RangeToken(Token.RANGE);
140 }
141 static RangeToken createNRange() {
142 if (COUNTTOKENS) Token.tokens ++;
143 return new RangeToken(Token.NRANGE);
144 }
145 static Token.CharToken createChar(int ch) {
146 if (COUNTTOKENS) Token.tokens ++;
147 return new Token.CharToken(Token.CHAR, ch);
148 }
149 static private Token.CharToken createAnchor(int ch) {
150 if (COUNTTOKENS) Token.tokens ++;
151 return new Token.CharToken(Token.ANCHOR, ch);
152 }
153 static Token.StringToken createBackReference(int refno) {
154 if (COUNTTOKENS) Token.tokens ++;
155 return new Token.StringToken(Token.BACKREFERENCE, null, refno);
156 }
157 static Token.StringToken createString(String str) {
158 if (COUNTTOKENS) Token.tokens ++;
159 return new Token.StringToken(Token.STRING, str, 0);
160 }
161 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
162 if (COUNTTOKENS) Token.tokens ++;
163 return new Token.ModifierToken(child, add, mask);
164 }
165 static Token.ConditionToken createCondition(int refno, Token condition,
166 Token yespat, Token nopat) {
167 if (COUNTTOKENS) Token.tokens ++;
168 return new Token.ConditionToken(refno, condition, yespat, nopat);
169 }
170
/**
 * Creates a node of the given kind.
 *
 * @param type one of the node-kind constants; stored in {@link #type}
 */
protected Token(int type) {
    super();
    this.type = type;
}
174
175 /**
176 * A number of children.
177 */
178 int size() {
179 return 0;
180 }
181 Token getChild(int index) {
182 return null;
183 }
184 void addChild(Token tok) {
185 throw new RuntimeException("Not supported.");
186 }
187
188 // for RANGE or NRANGE
189 protected void addRange(int start, int end) {
190 throw new RuntimeException("Not supported.");
191 }
192 protected void sortRanges() {
193 throw new RuntimeException("Not supported.");
194 }
195 protected void compactRanges() {
196 throw new RuntimeException("Not supported.");
197 }
198 protected void mergeRanges(Token tok) {
199 throw new RuntimeException("Not supported.");
200 }
201 protected void subtractRanges(Token tok) {
202 throw new RuntimeException("Not supported.");
203 }
204 protected void intersectRanges(Token tok) {
205 throw new RuntimeException("Not supported.");
206 }
207 static Token complementRanges(Token tok) {
208 return RangeToken.complementRanges(tok);
209 }
210
211
212 void setMin(int min) { // for CLOSURE
213 }
214 void setMax(int max) { // for CLOSURE
215 }
216 int getMin() { // for CLOSURE
217 return -1;
218 }
219 int getMax() { // for CLOSURE
220 return -1;
221 }
222 int getReferenceNumber() { // for STRING
223 return 0;
224 }
225 String getString() { // for STRING
226 return null;
227 }
228
229 int getParenNumber() {
230 return 0;
231 }
232 int getChar() {
233 return -1;
234 }
235
236 public String toString() {
237 return this.toString(0);
238 }
239 public String toString(int options) {
240 return this.type == Token.DOT ? "." : "";
241 }
242
243 /**
244 * How many characters are needed?
245 */
246 final int getMinLength() {
247 switch (this.type) {
248 case CONCAT:
249 int sum = 0;
250 for (int i = 0; i < this.size(); i ++)
251 sum += this.getChild(i).getMinLength();
252 return sum;
253
254 case CONDITION:
255 case UNION:
256 if (this.size() == 0)
257 return 0;
258 int ret = this.getChild(0).getMinLength();
259 for (int i = 1; i < this.size(); i ++) {
260 int min = this.getChild(i).getMinLength();
261 if (min < ret) ret = min;
262 }
263 return ret;
264
265 case CLOSURE:
266 case NONGREEDYCLOSURE:
267 if (this.getMin() >= 0)
268 return this.getMin() * this.getChild(0).getMinLength();
269 return 0;
270
271 case EMPTY:
272 case ANCHOR:
273 return 0;
274
275 case DOT:
276 case CHAR:
277 case RANGE:
278 case NRANGE:
279 return 1;
280
281 case INDEPENDENT:
282 case PAREN:
283 case MODIFIERGROUP:
284 return this.getChild(0).getMinLength();
285
286 case BACKREFERENCE:
287 return 0; // *******
288
289 case STRING:
290 return this.getString().length();
291
292 case LOOKAHEAD:
293 case NEGATIVELOOKAHEAD:
294 case LOOKBEHIND:
295 case NEGATIVELOOKBEHIND:
296 return 0; // ***** Really?
297
298 default:
299 throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
300 }
301 }
302
303 final int getMaxLength() {
304 switch (this.type) {
305 case CONCAT:
306 int sum = 0;
307 for (int i = 0; i < this.size(); i ++) {
308 int d = this.getChild(i).getMaxLength();
309 if (d < 0) return -1;
310 sum += d;
311 }
312 return sum;
313
314 case CONDITION:
315 case UNION:
316 if (this.size() == 0)
317 return 0;
318 int ret = this.getChild(0).getMaxLength();
319 for (int i = 1; ret >= 0 && i < this.size(); i ++) {
320 int max = this.getChild(i).getMaxLength();
321 if (max < 0) { // infinity
322 ret = -1;
323 break;
324 }
325 if (max > ret) ret = max;
326 }
327 return ret;
328
329 case CLOSURE:
330 case NONGREEDYCLOSURE:
331 if (this.getMax() >= 0)
332 // When this.child.getMaxLength() < 0,
333 // this returns minus value
334 return this.getMax() * this.getChild(0).getMaxLength();
335 return -1;
336
337 case EMPTY:
338 case ANCHOR:
339 return 0;
340
341 case CHAR:
342 return 1;
343 case DOT:
344 case RANGE:
345 case NRANGE:
346 return 2;
347
348 case INDEPENDENT:
349 case PAREN:
350 case MODIFIERGROUP:
351 return this.getChild(0).getMaxLength();
352
353 case BACKREFERENCE:
354 return -1; // ******
355
356 case STRING:
357 return this.getString().length();
358
359 case LOOKAHEAD:
360 case NEGATIVELOOKAHEAD:
361 case LOOKBEHIND:
362 case NEGATIVELOOKBEHIND:
363 return 0; // ***** Really?
364
365 default:
366 throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
367 }
368 }
369
370 static final int FC_CONTINUE = 0;
371 static final int FC_TERMINAL = 1;
372 static final int FC_ANY = 2;
373 private static final boolean isSet(int options, int flag) {
374 return (options & flag) == flag;
375 }
376 final int analyzeFirstCharacter(RangeToken result, int options) {
377 switch (this.type) {
378 case CONCAT:
379 int ret = FC_CONTINUE;
380 for (int i = 0; i < this.size(); i ++)
381 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
382 break;
383 return ret;
384
385 case UNION:
386 if (this.size() == 0)
387 return FC_CONTINUE;
388 /*
389 * a|b|c -> FC_TERMINAL
390 * a|.|c -> FC_ANY
391 * a|b| -> FC_CONTINUE
392 */
393 int ret2 = FC_CONTINUE;
394 boolean hasEmpty = false;
395 for (int i = 0; i < this.size(); i ++) {
396 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
397 if (ret2 == FC_ANY)
398 break;
399 else if (ret2 == FC_CONTINUE)
400 hasEmpty = true;
401 }
402 return hasEmpty ? FC_CONTINUE : ret2;
403
404 case CONDITION:
405 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
406 if (this.size() == 1) return FC_CONTINUE;
407 if (ret3 == FC_ANY) return ret3;
408 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
409 if (ret4 == FC_ANY) return ret4;
410 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
411
412 case CLOSURE:
413 case NONGREEDYCLOSURE:
414 this.getChild(0).analyzeFirstCharacter(result, options);
415 return FC_CONTINUE;
416
417 case EMPTY:
418 case ANCHOR:
419 return FC_CONTINUE;
420
421 case CHAR:
422 int ch = this.getChar();
423 result.addRange(ch, ch);
424 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
425 ch = Character.toUpperCase((char)ch);
426 result.addRange(ch, ch);
427 ch = Character.toLowerCase((char)ch);
428 result.addRange(ch, ch);
429 }
430 return FC_TERMINAL;
431
432 case DOT: // ****
433 if (isSet(options, RegularExpression.SINGLE_LINE)) {
434 return FC_CONTINUE; // **** We can not optimize.
435 } else {
436 return FC_CONTINUE;
437 /*
438 result.addRange(0, RegularExpression.LINE_FEED-1);
439 result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
440 result.addRange(RegularExpression.CARRIAGE_RETURN+1,
441 RegularExpression.LINE_SEPARATOR-1);
442 result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
443 return 1;
444 */
445 }
446
447 case RANGE:
448 if (isSet(options, RegularExpression.IGNORE_CASE)) {
449 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
450 } else {
451 result.mergeRanges(this);
452 }
453 return FC_TERMINAL;
454
455 case NRANGE: // ****
456 if (isSet(options, RegularExpression.IGNORE_CASE)) {
457 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
458 } else {
459 result.mergeRanges(Token.complementRanges(this));
460 }
461 return FC_TERMINAL;
462
463 case INDEPENDENT:
464 case PAREN:
465 return this.getChild(0).analyzeFirstCharacter(result, options);
466
467 case MODIFIERGROUP:
468 options |= ((ModifierToken)this).getOptions();
469 options &= ~((ModifierToken)this).getOptionsMask();
470 return this.getChild(0).analyzeFirstCharacter(result, options);
471
472 case BACKREFERENCE:
473 result.addRange(0, UTF16_MAX); // **** We can not optimize.
474 return FC_ANY;
475
476 case STRING:
477 int cha = this.getString().charAt(0);
478 int ch2;
479 if (REUtil.isHighSurrogate(cha)
480 && this.getString().length() >= 2
481 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
482 cha = REUtil.composeFromSurrogates(cha, ch2);
483 result.addRange(cha, cha);
484 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
485 cha = Character.toUpperCase((char)cha);
486 result.addRange(cha, cha);
487 cha = Character.toLowerCase((char)cha);
488 result.addRange(cha, cha);
489 }
490 return FC_TERMINAL;
491
492 case LOOKAHEAD:
493 case NEGATIVELOOKAHEAD:
494 case LOOKBEHIND:
495 case NEGATIVELOOKBEHIND:
496 return FC_CONTINUE;
497
498 default:
499 throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
500 }
501 }
502
503 private final boolean isShorterThan(Token tok) {
504 if (tok == null) return false;
505 /*
506 int mylength;
507 if (this.type == STRING) mylength = this.getString().length();
508 else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
509 else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
510 int otherlength;
511 if (tok.type == STRING) otherlength = tok.getString().length();
512 else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
513 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
514 */
515 int mylength;
516 if (this.type == STRING) mylength = this.getString().length();
517 else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
518 int otherlength;
519 if (tok.type == STRING) otherlength = tok.getString().length();
520 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
521 return mylength < otherlength;
522 }
523
524 static class FixedStringContainer {
525 Token token = null;
526 int options = 0;
527 FixedStringContainer() {
528 }
529 }
530
531 final void findFixedString(FixedStringContainer container, int options) {
532 switch (this.type) {
533 case CONCAT:
534 Token prevToken = null;
535 int prevOptions = 0;
536 for (int i = 0; i < this.size(); i ++) {
537 this.getChild(i).findFixedString(container, options);
538 if (prevToken == null || prevToken.isShorterThan(container.token)) {
539 prevToken = container.token;
540 prevOptions = container.options;
541 }
542 }
543 container.token = prevToken;
544 container.options = prevOptions;
545 return;
546
547 case UNION:
548 case CLOSURE:
549 case NONGREEDYCLOSURE:
550 case EMPTY:
551 case ANCHOR:
552 case RANGE:
553 case DOT:
554 case NRANGE:
555 case BACKREFERENCE:
556 case LOOKAHEAD:
557 case NEGATIVELOOKAHEAD:
558 case LOOKBEHIND:
559 case NEGATIVELOOKBEHIND:
560 case CONDITION:
561 container.token = null;
562 return;
563
564 case CHAR: // Ignore CHAR tokens.
565 container.token = null; // **
566 return; // **
567
568 case STRING:
569 container.token = this;
570 container.options = options;
571 return;
572
573 case INDEPENDENT:
574 case PAREN:
575 this.getChild(0).findFixedString(container, options);
576 return;
577
578 case MODIFIERGROUP:
579 options |= ((ModifierToken)this).getOptions();
580 options &= ~((ModifierToken)this).getOptionsMask();
581 this.getChild(0).findFixedString(container, options);
582 return;
583
584 default:
585 throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
586 }
587 }
588
589 boolean match(int ch) {
590 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
591 }
592
593 // ------------------------------------------------------
594 private final static Hashtable categories = new Hashtable();
595 private final static Hashtable categories2 = new Hashtable();
596 private static final String[] categoryNames = {
597 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
598 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
599 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
600 "Pi", "Pf", // 29, 30
601 "L", "M", "N", "Z", "C", "P", "S", // 31-37
602 };
603
604 // Schema Rec. {Datatypes} - Punctuation
605 static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
606 static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
607 static final int CHAR_LETTER = 31;
608 static final int CHAR_MARK = 32;
609 static final int CHAR_NUMBER = 33;
610 static final int CHAR_SEPARATOR = 34;
611 static final int CHAR_OTHER = 35;
612 static final int CHAR_PUNCTUATION = 36;
613 static final int CHAR_SYMBOL = 37;
614
615 //blockNames in UNICODE 3.1 that supported by XML Schema REC
616 private static final String[] blockNames = {
617 /*0000..007F;*/ "Basic Latin",
618 /*0080..00FF;*/ "Latin-1 Supplement",
619 /*0100..017F;*/ "Latin Extended-A",
620 /*0180..024F;*/ "Latin Extended-B",
621 /*0250..02AF;*/ "IPA Extensions",
622 /*02B0..02FF;*/ "Spacing Modifier Letters",
623 /*0300..036F;*/ "Combining Diacritical Marks",
624 /*0370..03FF;*/ "Greek",
625 /*0400..04FF;*/ "Cyrillic",
626 /*0530..058F;*/ "Armenian",
627 /*0590..05FF;*/ "Hebrew",
628 /*0600..06FF;*/ "Arabic",
629 /*0700..074F;*/ "Syriac",
630 /*0780..07BF;*/ "Thaana",
631 /*0900..097F;*/ "Devanagari",
632 /*0980..09FF;*/ "Bengali",
633 /*0A00..0A7F;*/ "Gurmukhi",
634 /*0A80..0AFF;*/ "Gujarati",
635 /*0B00..0B7F;*/ "Oriya",
636 /*0B80..0BFF;*/ "Tamil",
637 /*0C00..0C7F;*/ "Telugu",
638 /*0C80..0CFF;*/ "Kannada",
639 /*0D00..0D7F;*/ "Malayalam",
640 /*0D80..0DFF;*/ "Sinhala",
641 /*0E00..0E7F;*/ "Thai",
642 /*0E80..0EFF;*/ "Lao",
643 /*0F00..0FFF;*/ "Tibetan",
644 /*1000..109F;*/ "Myanmar",
645 /*10A0..10FF;*/ "Georgian",
646 /*1100..11FF;*/ "Hangul Jamo",
647 /*1200..137F;*/ "Ethiopic",
648 /*13A0..13FF;*/ "Cherokee",
649 /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
650 /*1680..169F;*/ "Ogham",
651 /*16A0..16FF;*/ "Runic",
652 /*1780..17FF;*/ "Khmer",
653 /*1800..18AF;*/ "Mongolian",
654 /*1E00..1EFF;*/ "Latin Extended Additional",
655 /*1F00..1FFF;*/ "Greek Extended",
656 /*2000..206F;*/ "General Punctuation",
657 /*2070..209F;*/ "Superscripts and Subscripts",
658 /*20A0..20CF;*/ "Currency Symbols",
659 /*20D0..20FF;*/ "Combining Marks for Symbols",
660 /*2100..214F;*/ "Letterlike Symbols",
661 /*2150..218F;*/ "Number Forms",
662 /*2190..21FF;*/ "Arrows",
663 /*2200..22FF;*/ "Mathematical Operators",
664 /*2300..23FF;*/ "Miscellaneous Technical",
665 /*2400..243F;*/ "Control Pictures",
666 /*2440..245F;*/ "Optical Character Recognition",
667 /*2460..24FF;*/ "Enclosed Alphanumerics",
668 /*2500..257F;*/ "Box Drawing",
669 /*2580..259F;*/ "Block Elements",
670 /*25A0..25FF;*/ "Geometric Shapes",
671 /*2600..26FF;*/ "Miscellaneous Symbols",
672 /*2700..27BF;*/ "Dingbats",
673 /*2800..28FF;*/ "Braille Patterns",
674 /*2E80..2EFF;*/ "CJK Radicals Supplement",
675 /*2F00..2FDF;*/ "Kangxi Radicals",
676 /*2FF0..2FFF;*/ "Ideographic Description Characters",
677 /*3000..303F;*/ "CJK Symbols and Punctuation",
678 /*3040..309F;*/ "Hiragana",
679 /*30A0..30FF;*/ "Katakana",
680 /*3100..312F;*/ "Bopomofo",
681 /*3130..318F;*/ "Hangul Compatibility Jamo",
682 /*3190..319F;*/ "Kanbun",
683 /*31A0..31BF;*/ "Bopomofo Extended",
684 /*3200..32FF;*/ "Enclosed CJK Letters and Months",
685 /*3300..33FF;*/ "CJK Compatibility",
686 /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
687 /*4E00..9FFF;*/ "CJK Unified Ideographs",
688 /*A000..A48F;*/ "Yi Syllables",
689 /*A490..A4CF;*/ "Yi Radicals",
690 /*AC00..D7A3;*/ "Hangul Syllables",
691 /*E000..F8FF;*/ "Private Use",
692 /*F900..FAFF;*/ "CJK Compatibility Ideographs",
693 /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
694 /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
695 /*FE20..FE2F;*/ "Combining Half Marks",
696 /*FE30..FE4F;*/ "CJK Compatibility Forms",
697 /*FE50..FE6F;*/ "Small Form Variants",
698 /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
699 /*FEFF..FEFF;*/ "Specials",
700 /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
701 //missing Specials add manually
702 /*10300..1032F;*/ "Old Italic", // 84
703 /*10330..1034F;*/ "Gothic",
704 /*10400..1044F;*/ "Deseret",
705 /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
706 /*1D100..1D1FF;*/ "Musical Symbols",
707 /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
708 /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
709 /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
710 /*E0000..E007F;*/ "Tags",
711 //missing 2 private use add manually
712
713 };
714 //ADD THOSE MANUALLY
715 //F0000..FFFFD; "Private Use",
716 //100000..10FFFD; "Private Use"
717 //FFF0..FFFD; "Specials",
718 static final String blockRanges =
719 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
720 +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
721 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
722 +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
723 +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
724 +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
725 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
726 +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
727 +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
728 +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
729 +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
730 static final int[] nonBMPBlockRanges = {
731 0x10300, 0x1032F, // 84
732 0x10330, 0x1034F,
733 0x10400, 0x1044F,
734 0x1D000, 0x1D0FF,
735 0x1D100, 0x1D1FF,
736 0x1D400, 0x1D7FF,
737 0x20000, 0x2A6D6,
738 0x2F800, 0x2FA1F,
739 0xE0000, 0xE007F
740 };
741 private static final int NONBMP_BLOCK_START = 84;
742
743 static protected RangeToken getRange(String name, boolean positive) {
744 if (Token.categories.size() == 0) {
745 synchronized (Token.categories) {
746 Token[] ranges = new Token[Token.categoryNames.length];
747 for (int i = 0; i < ranges.length; i ++) {
748 ranges[i] = Token.createRange();
749 }
750 int type;
751 for (int i = 0; i < 0x10000; i ++) {
752 type = Character.getType((char)i);
753 if (type == Character.START_PUNCTUATION ||
754 type == Character.END_PUNCTUATION) {
755 //build table of Pi values
756 if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
757 i == 0x201F || i == 0x2039) {
758 type = CHAR_INIT_QUOTE;
759 }
760 //build table of Pf values
761 if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
762 type = CHAR_FINAL_QUOTE;
763 }
764 }
765 ranges[type].addRange(i, i);
766 switch (type) {
767 case Character.UPPERCASE_LETTER:
768 case Character.LOWERCASE_LETTER:
769 case Character.TITLECASE_LETTER:
770 case Character.MODIFIER_LETTER:
771 case Character.OTHER_LETTER:
772 type = CHAR_LETTER;
773 break;
774 case Character.NON_SPACING_MARK:
775 case Character.COMBINING_SPACING_MARK:
776 case Character.ENCLOSING_MARK:
777 type = CHAR_MARK;
778 break;
779 case Character.DECIMAL_DIGIT_NUMBER:
780 case Character.LETTER_NUMBER:
781 case Character.OTHER_NUMBER:
782 type = CHAR_NUMBER;
783 break;
784 case Character.SPACE_SEPARATOR:
785 case Character.LINE_SEPARATOR:
786 case Character.PARAGRAPH_SEPARATOR:
787 type = CHAR_SEPARATOR;
788 break;
789 case Character.CONTROL:
790 case Character.FORMAT:
791 case Character.SURROGATE:
792 case Character.PRIVATE_USE:
793 case Character.UNASSIGNED:
794 type = CHAR_OTHER;
795 break;
796 case Character.CONNECTOR_PUNCTUATION:
797 case Character.DASH_PUNCTUATION:
798 case Character.START_PUNCTUATION:
799 case Character.END_PUNCTUATION:
800 case CHAR_INIT_QUOTE:
801 case CHAR_FINAL_QUOTE:
802 case Character.OTHER_PUNCTUATION:
803 type = CHAR_PUNCTUATION;
804 break;
805 case Character.MATH_SYMBOL:
806 case Character.CURRENCY_SYMBOL:
807 case Character.MODIFIER_SYMBOL:
808 case Character.OTHER_SYMBOL:
809 type = CHAR_SYMBOL;
810 break;
811 default:
812 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
813 }
814 ranges[type].addRange(i, i);
815 } // for all characters
816 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
817
818 for (int i = 0; i < ranges.length; i ++) {
819 if (Token.categoryNames[i] != null) {
820 if (i == Character.UNASSIGNED) { // Unassigned
821 ranges[i].addRange(0x10000, Token.UTF16_MAX);
822 }
823 Token.categories.put(Token.categoryNames[i], ranges[i]);
824 Token.categories2.put(Token.categoryNames[i],
825 Token.complementRanges(ranges[i]));
826 }
827 }
828 //REVISIT: do we really need to support block names as in Unicode 3.1
829 // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
830 //
831 StringBuffer buffer = new StringBuffer(50);
832 for (int i = 0; i < Token.blockNames.length; i ++) {
833 Token r1 = Token.createRange();
834 int location;
835 if (i < NONBMP_BLOCK_START) {
836 location = i*2;
837 int rstart = Token.blockRanges.charAt(location);
838 int rend = Token.blockRanges.charAt(location+1);
839 //DEBUGING
840 //System.out.println(n+" " +Integer.toHexString(rstart)
841 // +"-"+ Integer.toHexString(rend));
842 r1.addRange(rstart, rend);
843 } else {
844 location = (i - NONBMP_BLOCK_START) * 2;
845 r1.addRange(Token.nonBMPBlockRanges[location],
846 Token.nonBMPBlockRanges[location + 1]);
847 }
848 String n = Token.blockNames[i];
849 if (n.equals("Specials"))
850 r1.addRange(0xfff0, 0xfffd);
851 if (n.equals("Private Use")) {
852 r1.addRange(0xF0000,0xFFFFD);
853 r1.addRange(0x100000,0x10FFFD);
854 }
855 Token.categories.put(n, r1);
856 Token.categories2.put(n, Token.complementRanges(r1));
857 buffer.setLength(0);
858 buffer.append("Is");
859 if (n.indexOf(' ') >= 0) {
860 for (int ci = 0; ci < n.length(); ci ++)
861 if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
862 }
863 else {
864 buffer.append(n);
865 }
866 Token.setAlias(buffer.toString(), n, true);
867 }
868
869 // TR#18 1.2
870 Token.setAlias("ASSIGNED", "Cn", false);
871 Token.setAlias("UNASSIGNED", "Cn", true);
872 Token all = Token.createRange();
873 all.addRange(0, Token.UTF16_MAX);
874 Token.categories.put("ALL", all);
875 Token.categories2.put("ALL", Token.complementRanges(all));
876 Token.registerNonXS("ASSIGNED");
877 Token.registerNonXS("UNASSIGNED");
878 Token.registerNonXS("ALL");
879
880 Token isalpha = Token.createRange();
881 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
882 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
883 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
884 Token.categories.put("IsAlpha", isalpha);
885 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
886 Token.registerNonXS("IsAlpha");
887
888 Token isalnum = Token.createRange();
889 isalnum.mergeRanges(isalpha); // Lu Ll Lo
890 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
891 Token.categories.put("IsAlnum", isalnum);
892 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
893 Token.registerNonXS("IsAlnum");
894
895 Token isspace = Token.createRange();
896 isspace.mergeRanges(Token.token_spaces);
897 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
898 Token.categories.put("IsSpace", isspace);
899 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
900 Token.registerNonXS("IsSpace");
901
902 Token isword = Token.createRange();
903 isword.mergeRanges(isalnum); // Lu Ll Lo Nd
904 isword.addRange('_', '_');
905 Token.categories.put("IsWord", isword);
906 Token.categories2.put("IsWord", Token.complementRanges(isword));
907 Token.registerNonXS("IsWord");
908
909 Token isascii = Token.createRange();
910 isascii.addRange(0, 127);
911 Token.categories.put("IsASCII", isascii);
912 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
913 Token.registerNonXS("IsASCII");
914
915 Token isnotgraph = Token.createRange();
916 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
917 isnotgraph.addRange(' ', ' ');
918 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
919 Token.categories2.put("IsGraph", isnotgraph);
920 Token.registerNonXS("IsGraph");
921
922 Token isxdigit = Token.createRange();
923 isxdigit.addRange('0', '9');
924 isxdigit.addRange('A', 'F');
925 isxdigit.addRange('a', 'f');
926 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
927 Token.categories2.put("IsXDigit", isxdigit);
928 Token.registerNonXS("IsXDigit");
929
930 Token.setAlias("IsDigit", "Nd", true);
931 Token.setAlias("IsUpper", "Lu", true);
932 Token.setAlias("IsLower", "Ll", true);
933 Token.setAlias("IsCntrl", "C", true);
934 Token.setAlias("IsPrint", "C", false);
935 Token.setAlias("IsPunct", "P", true);
936 Token.registerNonXS("IsDigit");
937 Token.registerNonXS("IsUpper");
938 Token.registerNonXS("IsLower");
939 Token.registerNonXS("IsCntrl");
940 Token.registerNonXS("IsPrint");
941 Token.registerNonXS("IsPunct");
942
943 Token.setAlias("alpha", "IsAlpha", true);
944 Token.setAlias("alnum", "IsAlnum", true);
945 Token.setAlias("ascii", "IsASCII", true);
946 Token.setAlias("cntrl", "IsCntrl", true);
947 Token.setAlias("digit", "IsDigit", true);
948 Token.setAlias("graph", "IsGraph", true);
949 Token.setAlias("lower", "IsLower", true);
950 Token.setAlias("print", "IsPrint", true);
951 Token.setAlias("punct", "IsPunct", true);
952 Token.setAlias("space", "IsSpace", true);
953 Token.setAlias("upper", "IsUpper", true);
954 Token.setAlias("word", "IsWord", true); // Perl extension
955 Token.setAlias("xdigit", "IsXDigit", true);
956 Token.registerNonXS("alpha");
957 Token.registerNonXS("alnum");
958 Token.registerNonXS("ascii");
959 Token.registerNonXS("cntrl");
960 Token.registerNonXS("digit");
961 Token.registerNonXS("graph");
962 Token.registerNonXS("lower");
963 Token.registerNonXS("print");
964 Token.registerNonXS("punct");
965 Token.registerNonXS("space");
966 Token.registerNonXS("upper");
967 Token.registerNonXS("word");
968 Token.registerNonXS("xdigit");
969 } // synchronized
970 } // if null
971 RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
972 : (RangeToken)Token.categories2.get(name);
973 //if (tok == null) System.out.println(name);
974 return tok;
975 }
976 static protected RangeToken getRange(String name, boolean positive, boolean xs) {
977 RangeToken range = Token.getRange(name, positive);
978 if (xs && range != null && Token.isRegisterNonXS(name))
979 range = null;
980 return range;
981 }
982
983 static Hashtable nonxs = null;
984 /**
985 * This method is called by only getRange().
986 * So this method need not MT-safe.
987 */
988 static protected void registerNonXS(String name) {
989 if (Token.nonxs == null)
990 Token.nonxs = new Hashtable();
991 Token.nonxs.put(name, name);
992 }
993 static protected boolean isRegisterNonXS(String name) {
994 if (Token.nonxs == null)
995 return false;
996 //DEBUG
997 //System.err.println("isRegisterNonXS: "+name);
998 return Token.nonxs.containsKey(name);
999 }
1000
1001 private static void setAlias(String newName, String name, boolean positive) {
1002 Token t1 = (Token)Token.categories.get(name);
1003 Token t2 = (Token)Token.categories2.get(name);
1004 if (positive) {
1005 Token.categories.put(newName, t1);
1006 Token.categories2.put(newName, t2);
1007 } else {
1008 Token.categories2.put(newName, t1);
1009 Token.categories.put(newName, t2);
1010 }
1011 }
1012
1013 // ------------------------------------------------------
1014
1015 static final String viramaString =
1016 "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1017 +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1018 +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1019 +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1020 +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1021 +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1022 +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1023 +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1024 +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
1025 +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
1026 +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
1027
1028 static private Token token_grapheme = null;
1029 static synchronized Token getGraphemePattern() {
1030 if (Token.token_grapheme != null)
1031 return Token.token_grapheme;
1032
1033 Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
1034 base_char.mergeRanges(Token.getRange("ASSIGNED", true));
1035 base_char.subtractRanges(Token.getRange("M", true));
1036 base_char.subtractRanges(Token.getRange("C", true));
1037
1038 Token virama = Token.createRange();
1039 for (int i = 0; i < Token.viramaString.length(); i ++) {
1040 int ch = viramaString.charAt(i);
1041 virama.addRange(i, i);
1042 }
1043
1044 Token combiner_wo_virama = Token.createRange();
1045 combiner_wo_virama.mergeRanges(Token.getRange("M", true));
1046 combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
1047 combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
1048
1049 Token left = Token.createUnion(); // base_char?
1050 left.addChild(base_char);
1051 left.addChild(Token.token_empty);
1052
1053 Token foo = Token.createUnion();
1054 foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
1055 foo.addChild(combiner_wo_virama);
1056
1057 foo = Token.createClosure(foo);
1058
1059 foo = Token.createConcat(left, foo);
1060
1061 Token.token_grapheme = foo;
1062 return Token.token_grapheme;
1063 }
1064
1065 /**
1066 * Combing Character Sequence in Perl 5.6.
1067 */
1068 static private Token token_ccs = null;
1069 static synchronized Token getCombiningCharacterSequence() {
1070 if (Token.token_ccs != null)
1071 return Token.token_ccs;
1072
1073 Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
1074 foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
1075 Token.token_ccs = foo;
1076 return Token.token_ccs;
1077 }
1078
1079 // ------------------------------------------------------
1080
1081 // ------------------------------------------------------
1082 /**
1083 * This class represents a node in parse tree.
1084 */
1085 static class StringToken extends Token implements java.io.Serializable {
1086 String string;
1087 int refNumber;
1088
1089 StringToken(int type, String str, int n) {
1090 super(type);
1091 this.string = str;
1092 this.refNumber = n;
1093 }
1094
1095 int getReferenceNumber() { // for STRING
1096 return this.refNumber;
1097 }
1098 String getString() { // for STRING
1099 return this.string;
1100 }
1101
1102 public String toString(int options) {
1103 if (this.type == BACKREFERENCE)
1104 return "\\"+this.refNumber;
1105 else
1106 return REUtil.quoteMeta(this.string);
1107 }
1108 }
1109
1110 /**
1111 * This class represents a node in parse tree.
1112 */
1113 static class ConcatToken extends Token implements java.io.Serializable {
1114 Token child;
1115 Token child2;
1116
1117 ConcatToken(Token t1, Token t2) {
1118 super(Token.CONCAT);
1119 this.child = t1;
1120 this.child2 = t2;
1121 }
1122
1123 int size() {
1124 return 2;
1125 }
1126 Token getChild(int index) {
1127 return index == 0 ? this.child : this.child2;
1128 }
1129
1130 public String toString(int options) {
1131 String ret;
1132 if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
1133 ret = this.child.toString(options)+"+";
1134 } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
1135 ret = this.child.toString(options)+"+?";
1136 } else
1137 ret = this.child.toString(options)+this.child2.toString(options);
1138 return ret;
1139 }
1140 }
1141
1142 /**
1143 * This class represents a node in parse tree.
1144 */
1145 static class CharToken extends Token implements java.io.Serializable {
1146 int chardata;
1147
1148 CharToken(int type, int ch) {
1149 super(type);
1150 this.chardata = ch;
1151 }
1152
1153 int getChar() {
1154 return this.chardata;
1155 }
1156
1157 public String toString(int options) {
1158 String ret;
1159 switch (this.type) {
1160 case CHAR:
1161 switch (this.chardata) {
1162 case '|': case '*': case '+': case '?':
1163 case '(': case ')': case '.': case '[':
1164 case '{': case '\\':
1165 ret = "\\"+(char)this.chardata;
1166 break;
1167 case '\f': ret = "\\f"; break;
1168 case '\n': ret = "\\n"; break;
1169 case '\r': ret = "\\r"; break;
1170 case '\t': ret = "\\t"; break;
1171 case 0x1b: ret = "\\e"; break;
1172 //case 0x0b: ret = "\\v"; break;
1173 default:
1174 if (this.chardata >= 0x10000) {
1175 String pre = "0"+Integer.toHexString(this.chardata);
1176 ret = "\\v"+pre.substring(pre.length()-6, pre.length());
1177 } else
1178 ret = ""+(char)this.chardata;
1179 }
1180 break;
1181
1182 case ANCHOR:
1183 if (this == Token.token_linebeginning || this == Token.token_lineend)
1184 ret = ""+(char)this.chardata;
1185 else
1186 ret = "\\"+(char)this.chardata;
1187 break;
1188
1189 default:
1190 ret = null;
1191 }
1192 return ret;
1193 }
1194
1195 boolean match(int ch) {
1196 if (this.type == CHAR) {
1197 return ch == this.chardata;
1198 } else
1199 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
1200 }
1201 }
1202
1203 /**
1204 * This class represents a node in parse tree.
1205 */
1206 static class ClosureToken extends Token implements java.io.Serializable {
1207 int min;
1208 int max;
1209 Token child;
1210
1211 ClosureToken(int type, Token tok) {
1212 super(type);
1213 this.child = tok;
1214 this.setMin(-1);
1215 this.setMax(-1);
1216 }
1217
1218 int size() {
1219 return 1;
1220 }
1221 Token getChild(int index) {
1222 return this.child;
1223 }
1224
1225 final void setMin(int min) {
1226 this.min = min;
1227 }
1228 final void setMax(int max) {
1229 this.max = max;
1230 }
1231 final int getMin() {
1232 return this.min;
1233 }
1234 final int getMax() {
1235 return this.max;
1236 }
1237
1238 public String toString(int options) {
1239 String ret;
1240 if (this.type == CLOSURE) {
1241 if (this.getMin() < 0 && this.getMax() < 0) {
1242 ret = this.child.toString(options)+"*";
1243 } else if (this.getMin() == this.getMax()) {
1244 ret = this.child.toString(options)+"{"+this.getMin()+"}";
1245 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1246 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
1247 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1248 ret = this.child.toString(options)+"{"+this.getMin()+",}";
1249 } else
1250 throw new RuntimeException("Token#toString(): CLOSURE "
1251 +this.getMin()+", "+this.getMax());
1252 } else {
1253 if (this.getMin() < 0 && this.getMax() < 0) {
1254 ret = this.child.toString(options)+"*?";
1255 } else if (this.getMin() == this.getMax()) {
1256 ret = this.child.toString(options)+"{"+this.getMin()+"}?";
1257 } else if (this.getMin() >= 0 && this.getMax() >= 0) {
1258 ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
1259 } else if (this.getMin() >= 0 && this.getMax() < 0) {
1260 ret = this.child.toString(options)+"{"+this.getMin()+",}?";
1261 } else
1262 throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
1263 +this.getMin()+", "+this.getMax());
1264 }
1265 return ret;
1266 }
1267 }
1268
1269 /**
1270 * This class represents a node in parse tree.
1271 */
1272 static class ParenToken extends Token implements java.io.Serializable {
1273 Token child;
1274 int parennumber;
1275
1276 ParenToken(int type, Token tok, int paren) {
1277 super(type);
1278 this.child = tok;
1279 this.parennumber = paren;
1280 }
1281
1282 int size() {
1283 return 1;
1284 }
1285 Token getChild(int index) {
1286 return this.child;
1287 }
1288
1289 int getParenNumber() {
1290 return this.parennumber;
1291 }
1292
1293 public String toString(int options) {
1294 String ret = null;
1295 switch (this.type) {
1296 case PAREN:
1297 if (this.parennumber == 0) {
1298 ret = "(?:"+this.child.toString(options)+")";
1299 } else {
1300 ret = "("+this.child.toString(options)+")";
1301 }
1302 break;
1303
1304 case LOOKAHEAD:
1305 ret = "(?="+this.child.toString(options)+")";
1306 break;
1307 case NEGATIVELOOKAHEAD:
1308 ret = "(?!"+this.child.toString(options)+")";
1309 break;
1310 case LOOKBEHIND:
1311 ret = "(?<="+this.child.toString(options)+")";
1312 break;
1313 case NEGATIVELOOKBEHIND:
1314 ret = "(?<!"+this.child.toString(options)+")";
1315 break;
1316 case INDEPENDENT:
1317 ret = "(?>"+this.child.toString(options)+")";
1318 break;
1319 }
1320 return ret;
1321 }
1322 }
1323
1324 /**
1325 * (?(condition)yes-pattern|no-pattern)
1326 */
1327 static class ConditionToken extends Token implements java.io.Serializable {
1328 int refNumber;
1329 Token condition;
1330 Token yes;
1331 Token no;
1332 ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
1333 super(Token.CONDITION);
1334 this.refNumber = refno;
1335 this.condition = cond;
1336 this.yes = yespat;
1337 this.no = nopat;
1338 }
1339 int size() {
1340 return this.no == null ? 1 : 2;
1341 }
1342 Token getChild(int index) {
1343 if (index == 0) return this.yes;
1344 if (index == 1) return this.no;
1345 throw new RuntimeException("Internal Error: "+index);
1346 }
1347
1348 public String toString(int options) {
1349 String ret;
1350 if (refNumber > 0) {
1351 ret = "(?("+refNumber+")";
1352 } else if (this.condition.type == Token.ANCHOR) {
1353 ret = "(?("+this.condition+")";
1354 } else {
1355 ret = "(?"+this.condition;
1356 }
1357
1358 if (this.no == null) {
1359 ret += this.yes+")";
1360 } else {
1361 ret += this.yes+"|"+this.no+")";
1362 }
1363 return ret;
1364 }
1365 }
1366
1367 /**
1368 * (ims-ims: .... )
1369 */
1370 static class ModifierToken extends Token implements java.io.Serializable {
1371 Token child;
1372 int add;
1373 int mask;
1374
1375 ModifierToken(Token tok, int add, int mask) {
1376 super(Token.MODIFIERGROUP);
1377 this.child = tok;
1378 this.add = add;
1379 this.mask = mask;
1380 }
1381
1382 int size() {
1383 return 1;
1384 }
1385 Token getChild(int index) {
1386 return this.child;
1387 }
1388
1389 int getOptions() {
1390 return this.add;
1391 }
1392 int getOptionsMask() {
1393 return this.mask;
1394 }
1395
1396 public String toString(int options) {
1397 return "(?"
1398 +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
1399 +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
1400 +":"
1401 +this.child.toString(options)
1402 +")";
1403 }
1404 }
1405
1406 /**
1407 * This class represents a node in parse tree.
1408 * for UNION or CONCAT.
1409 */
1410 static class UnionToken extends Token implements java.io.Serializable {
1411 Vector children;
1412
1413 UnionToken(int type) {
1414 super(type);
1415 }
1416
1417 void addChild(Token tok) {
1418 if (tok == null) return;
1419 if (this.children == null) this.children = new Vector();
1420 if (this.type == UNION) {
1421 this.children.addElement(tok);
1422 return;
1423 }
1424 // This is CONCAT, and new child is CONCAT.
1425 if (tok.type == CONCAT) {
1426 for (int i = 0; i < tok.size(); i ++)
1427 this.addChild(tok.getChild(i)); // Recursion
1428 return;
1429 }
1430 int size = this.children.size();
1431 if (size == 0) {
1432 this.children.addElement(tok);
1433 return;
1434 }
1435 Token previous = (Token)this.children.elementAt(size-1);
1436 if (!((previous.type == CHAR || previous.type == STRING)
1437 && (tok.type == CHAR || tok.type == STRING))) {
1438 this.children.addElement(tok);
1439 return;
1440 }
1441
1442 //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
1443
1444 StringBuffer buffer;
1445 int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
1446 if (previous.type == CHAR) { // Replace previous token by STRING
1447 buffer = new StringBuffer(2 + nextMaxLength);
1448 int ch = previous.getChar();
1449 if (ch >= 0x10000)
1450 buffer.append(REUtil.decomposeToSurrogates(ch));
1451 else
1452 buffer.append((char)ch);
1453 previous = Token.createString(null);
1454 this.children.setElementAt(previous, size-1);
1455 } else { // STRING
1456 buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
1457 buffer.append(previous.getString());
1458 }
1459
1460 if (tok.type == CHAR) {
1461 int ch = tok.getChar();
1462 if (ch >= 0x10000)
1463 buffer.append(REUtil.decomposeToSurrogates(ch));
1464 else
1465 buffer.append((char)ch);
1466 } else {
1467 buffer.append(tok.getString());
1468 }
1469
1470 ((StringToken)previous).string = new String(buffer);
1471 }
1472
1473 int size() {
1474 return this.children == null ? 0 : this.children.size();
1475 }
1476 Token getChild(int index) {
1477 return (Token)this.children.elementAt(index);
1478 }
1479
1480 public String toString(int options) {
1481 String ret;
1482 if (this.type == CONCAT) {
1483 if (this.children.size() == 2) {
1484 Token ch = this.getChild(0);
1485 Token ch2 = this.getChild(1);
1486 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
1487 ret = ch.toString(options)+"+";
1488 } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
1489 ret = ch.toString(options)+"+?";
1490 } else
1491 ret = ch.toString(options)+ch2.toString(options);
1492 } else {
1493 StringBuffer sb = new StringBuffer();
1494 for (int i = 0; i < this.children.size(); i ++) {
1495 sb.append(((Token)this.children.elementAt(i)).toString(options));
1496 }
1497 ret = new String(sb);
1498 }
1499 return ret;
1500 }
1501 if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
1502 ret = this.getChild(0).toString(options)+"?";
1503 } else if (this.children.size() == 2
1504 && this.getChild(0).type == EMPTY) {
1505 ret = this.getChild(1).toString(options)+"??";
1506 } else {
1507 StringBuffer sb = new StringBuffer();
1508 sb.append(((Token)this.children.elementAt(0)).toString(options));
1509 for (int i = 1; i < this.children.size(); i ++) {
1510 sb.append((char)'|');
1511 sb.append(((Token)this.children.elementAt(i)).toString(options));
1512 }
1513 ret = new String(sb);
1514 }
1515 return ret;
1516 }
1517 }
1518 }