Source code: org/apache/xerces/utils/regex/Token.java
1 /*
2 * The Apache Software License, Version 1.1
3 *
4 *
5 * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
6 * reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution,
21 * if any, must include the following acknowledgment:
22 * "This product includes software developed by the
23 * Apache Software Foundation (http://www.apache.org/)."
24 * Alternately, this acknowledgment may appear in the software itself,
25 * if and wherever such third-party acknowledgments normally appear.
26 *
27 * 4. The names "Xerces" and "Apache Software Foundation" must
28 * not be used to endorse or promote products derived from this
29 * software without prior written permission. For written
30 * permission, please contact apache@apache.org.
31 *
32 * 5. Products derived from this software may not be called "Apache",
33 * nor may "Apache" appear in their name, without prior written
34 * permission of the Apache Software Foundation.
35 *
36 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47 * SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This software consists of voluntary contributions made by many
51 * individuals on behalf of the Apache Software Foundation and was
52 * originally based on software copyright (c) 1999, International
53 * Business Machines, Inc., http://www.apache.org. For more
54 * information on the Apache Software Foundation, please see
55 * <http://www.apache.org/>.
56 */
57
58 package org.apache.xerces.utils.regex;
59
60 import java.util.Vector;
61 import java.util.Hashtable;
62
63 /**
64 * This class represents a node in parse tree.
65 */
66 class Token implements java.io.Serializable {
67 static final boolean COUNTTOKENS = true;
68 static int tokens = 0;
69
70 static final int CHAR = 0; // Literal char
71 static final int DOT = 11; // .
72 static final int CONCAT = 1; // XY
73 static final int UNION = 2; // X|Y|Z
74 static final int CLOSURE = 3; // X*
75 static final int RANGE = 4; // [a-zA-Z] etc.
76 static final int NRANGE = 5; // [^a-zA-Z] etc.
77 static final int PAREN = 6; // (X) or (?:X)
78 static final int EMPTY = 7; //
79 static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
80 static final int NONGREEDYCLOSURE = 9; // *? +?
81 static final int STRING = 10; // strings
82 static final int BACKREFERENCE = 12; // back references
83 static final int LOOKAHEAD = 20; // (?=...)
84 static final int NEGATIVELOOKAHEAD = 21; // (?!...)
85 static final int LOOKBEHIND = 22; // (?<=...)
86 static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
87 static final int INDEPENDENT = 24; // (?>...)
88 static final int MODIFIERGROUP = 25; // (?ims-ims:...)
89 static final int CONDITION = 26; // (?(...)yes|no)
90
91 static final int UTF16_MAX = 0x10ffff;
92
93 int type;
94
95 static protected Token token_dot;
96 static protected Token token_0to9;
97 static protected Token token_wordchars;
98 static protected Token token_not_0to9;
99 static protected Token token_not_wordchars;
100 static protected Token token_spaces;
101 static protected Token token_not_spaces;
102 static protected Token token_empty;
103 static protected Token token_linebeginning;
104 static protected Token token_linebeginning2;
105 static protected Token token_lineend;
106 static protected Token token_stringbeginning;
107 static protected Token token_stringend;
108 static protected Token token_stringend2;
109 static protected Token token_wordedge;
110 static protected Token token_not_wordedge;
111 static protected Token token_wordbeginning;
112 static protected Token token_wordend;
113 static {
114 Token.token_empty = new Token(Token.EMPTY);
115
116 Token.token_linebeginning = Token.createAnchor('^');
117 Token.token_linebeginning2 = Token.createAnchor('@');
118 Token.token_lineend = Token.createAnchor('$');
119 Token.token_stringbeginning = Token.createAnchor('A');
120 Token.token_stringend = Token.createAnchor('z');
121 Token.token_stringend2 = Token.createAnchor('Z');
122 Token.token_wordedge = Token.createAnchor('b');
123 Token.token_not_wordedge = Token.createAnchor('B');
124 Token.token_wordbeginning = Token.createAnchor('<');
125 Token.token_wordend = Token.createAnchor('>');
126
127 Token.token_dot = new Token(Token.DOT);
128
129 Token.token_0to9 = Token.createRange();
130 Token.token_0to9.addRange('0', '9');
131 Token.token_wordchars = Token.createRange();
132 Token.token_wordchars.addRange('0', '9');
133 Token.token_wordchars.addRange('A', 'Z');
134 Token.token_wordchars.addRange('_', '_');
135 Token.token_wordchars.addRange('a', 'z');
136 Token.token_spaces = Token.createRange();
137 Token.token_spaces.addRange('\t', '\t');
138 Token.token_spaces.addRange('\n', '\n');
139 Token.token_spaces.addRange('\f', '\f');
140 Token.token_spaces.addRange('\r', '\r');
141 Token.token_spaces.addRange(' ', ' ');
142
143 Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
144 Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
145 Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
146 }
147
148 static Token.ParenToken createLook(int type, Token child) {
149 if (COUNTTOKENS) Token.tokens ++;
150 return new Token.ParenToken(type, child, 0);
151 }
152 static Token.ParenToken createParen(Token child, int pnumber) {
153 if (COUNTTOKENS) Token.tokens ++;
154 return new Token.ParenToken(Token.PAREN, child, pnumber);
155 }
156 static Token.ClosureToken createClosure(Token tok) {
157 if (COUNTTOKENS) Token.tokens ++;
158 return new Token.ClosureToken(Token.CLOSURE, tok);
159 }
160 static Token.ClosureToken createNGClosure(Token tok) {
161 if (COUNTTOKENS) Token.tokens ++;
162 return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
163 }
164 static Token.ConcatToken createConcat(Token tok1, Token tok2) {
165 if (COUNTTOKENS) Token.tokens ++;
166 return new Token.ConcatToken(tok1, tok2);
167 }
168 static Token.UnionToken createConcat() {
169 if (COUNTTOKENS) Token.tokens ++;
170 return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
171 }
172 static Token.UnionToken createUnion() {
173 if (COUNTTOKENS) Token.tokens ++;
174 return new Token.UnionToken(Token.UNION);
175 }
176 static Token createEmpty() {
177 return Token.token_empty;
178 }
179 static RangeToken createRange() {
180 if (COUNTTOKENS) Token.tokens ++;
181 return new RangeToken(Token.RANGE);
182 }
183 static RangeToken createNRange() {
184 if (COUNTTOKENS) Token.tokens ++;
185 return new RangeToken(Token.NRANGE);
186 }
187 static Token.CharToken createChar(int ch) {
188 if (COUNTTOKENS) Token.tokens ++;
189 return new Token.CharToken(Token.CHAR, ch);
190 }
191 static private Token.CharToken createAnchor(int ch) {
192 if (COUNTTOKENS) Token.tokens ++;
193 return new Token.CharToken(Token.ANCHOR, ch);
194 }
195 static Token.StringToken createBackReference(int refno) {
196 if (COUNTTOKENS) Token.tokens ++;
197 return new Token.StringToken(Token.BACKREFERENCE, null, refno);
198 }
199 static Token.StringToken createString(String str) {
200 if (COUNTTOKENS) Token.tokens ++;
201 return new Token.StringToken(Token.STRING, str, 0);
202 }
203 static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
204 if (COUNTTOKENS) Token.tokens ++;
205 return new Token.ModifierToken(child, add, mask);
206 }
207 static Token.ConditionToken createCondition(int refno, Token condition,
208 Token yespat, Token nopat) {
209 if (COUNTTOKENS) Token.tokens ++;
210 return new Token.ConditionToken(refno, condition, yespat, nopat);
211 }
212
213 protected Token(int type) {
214 this.type = type;
215 }
216
217 /**
218 * A number of children.
219 */
220 int size() {
221 return 0;
222 }
223 Token getChild(int index) {
224 return null;
225 }
226 void addChild(Token tok) {
227 throw new RuntimeException("Not supported.");
228 }
229
230 // for RANGE or NRANGE
231 protected void addRange(int start, int end) {
232 throw new RuntimeException("Not supported.");
233 }
234 protected void sortRanges() {
235 throw new RuntimeException("Not supported.");
236 }
237 protected void compactRanges() {
238 throw new RuntimeException("Not supported.");
239 }
240 protected void mergeRanges(Token tok) {
241 throw new RuntimeException("Not supported.");
242 }
243 protected void subtractRanges(Token tok) {
244 throw new RuntimeException("Not supported.");
245 }
246 protected void intersectRanges(Token tok) {
247 throw new RuntimeException("Not supported.");
248 }
249 static Token complementRanges(Token tok) {
250 return RangeToken.complementRanges(tok);
251 }
252
253
254 void setMin(int min) { // for CLOSURE
255 }
256 void setMax(int max) { // for CLOSURE
257 }
258 int getMin() { // for CLOSURE
259 return -1;
260 }
261 int getMax() { // for CLOSURE
262 return -1;
263 }
264 int getReferenceNumber() { // for STRING
265 return 0;
266 }
267 String getString() { // for STRING
268 return null;
269 }
270
271 int getParenNumber() {
272 return 0;
273 }
274 int getChar() {
275 return -1;
276 }
277
278 public String toString() {
279 return this.toString(0);
280 }
281 public String toString(int options) {
282 return this.type == Token.DOT ? "." : "";
283 }
284
285 /**
286 * How many characters are needed?
287 */
288 final int getMinLength() {
289 switch (this.type) {
290 case CONCAT:
291 int sum = 0;
292 for (int i = 0; i < this.size(); i ++)
293 sum += this.getChild(i).getMinLength();
294 return sum;
295
296 case CONDITION:
297 case UNION:
298 if (this.size() == 0)
299 return 0;
300 int ret = this.getChild(0).getMinLength();
301 for (int i = 1; i < this.size(); i ++) {
302 int min = this.getChild(i).getMinLength();
303 if (min < ret) ret = min;
304 }
305 return ret;
306
307 case CLOSURE:
308 case NONGREEDYCLOSURE:
309 if (this.getMin() >= 0)
310 return this.getMin() * this.getChild(0).getMinLength();
311 return 0;
312
313 case EMPTY:
314 case ANCHOR:
315 return 0;
316
317 case DOT:
318 case CHAR:
319 case RANGE:
320 case NRANGE:
321 return 1;
322
323 case INDEPENDENT:
324 case PAREN:
325 case MODIFIERGROUP:
326 return this.getChild(0).getMinLength();
327
328 case BACKREFERENCE:
329 return 0; // *******
330
331 case STRING:
332 return this.getString().length();
333
334 case LOOKAHEAD:
335 case NEGATIVELOOKAHEAD:
336 case LOOKBEHIND:
337 case NEGATIVELOOKBEHIND:
338 return 0; // ***** Really?
339
340 default:
341 throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
342 }
343 }
344
345 final int getMaxLength() {
346 switch (this.type) {
347 case CONCAT:
348 int sum = 0;
349 for (int i = 0; i < this.size(); i ++) {
350 int d = this.getChild(i).getMaxLength();
351 if (d < 0) return -1;
352 sum += d;
353 }
354 return sum;
355
356 case CONDITION:
357 case UNION:
358 if (this.size() == 0)
359 return 0;
360 int ret = this.getChild(0).getMaxLength();
361 for (int i = 1; ret >= 0 && i < this.size(); i ++) {
362 int max = this.getChild(i).getMaxLength();
363 if (max < 0) { // infinity
364 ret = -1;
365 break;
366 }
367 if (max > ret) ret = max;
368 }
369 return ret;
370
371 case CLOSURE:
372 case NONGREEDYCLOSURE:
373 if (this.getMax() >= 0)
374 // When this.child.getMaxLength() < 0,
375 // this returns minus value
376 return this.getMax() * this.getChild(0).getMaxLength();
377 return -1;
378
379 case EMPTY:
380 case ANCHOR:
381 return 0;
382
383 case CHAR:
384 return 1;
385 case DOT:
386 case RANGE:
387 case NRANGE:
388 return 2;
389
390 case INDEPENDENT:
391 case PAREN:
392 case MODIFIERGROUP:
393 return this.getChild(0).getMaxLength();
394
395 case BACKREFERENCE:
396 return -1; // ******
397
398 case STRING:
399 return this.getString().length();
400
401 case LOOKAHEAD:
402 case NEGATIVELOOKAHEAD:
403 case LOOKBEHIND:
404 case NEGATIVELOOKBEHIND:
405 return 0; // ***** Really?
406
407 default:
408 throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
409 }
410 }
411
412 static final int FC_CONTINUE = 0;
413 static final int FC_TERMINAL = 1;
414 static final int FC_ANY = 2;
415 private static final boolean isSet(int options, int flag) {
416 return (options & flag) == flag;
417 }
418 final int analyzeFirstCharacter(RangeToken result, int options) {
419 switch (this.type) {
420 case CONCAT:
421 int ret = FC_CONTINUE;
422 for (int i = 0; i < this.size(); i ++)
423 if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
424 break;
425 return ret;
426
427 case UNION:
428 if (this.size() == 0)
429 return FC_CONTINUE;
430 /*
431 * a|b|c -> FC_TERMINAL
432 * a|.|c -> FC_ANY
433 * a|b| -> FC_CONTINUE
434 */
435 int ret2 = FC_CONTINUE;
436 boolean hasEmpty = false;
437 for (int i = 0; i < this.size(); i ++) {
438 ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
439 if (ret2 == FC_ANY)
440 break;
441 else if (ret2 == FC_CONTINUE)
442 hasEmpty = true;
443 }
444 return hasEmpty ? FC_CONTINUE : ret2;
445
446 case CONDITION:
447 int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
448 if (this.size() == 1) return FC_CONTINUE;
449 if (ret3 == FC_ANY) return ret3;
450 int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
451 if (ret4 == FC_ANY) return ret4;
452 return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
453
454 case CLOSURE:
455 case NONGREEDYCLOSURE:
456 this.getChild(0).analyzeFirstCharacter(result, options);
457 return FC_CONTINUE;
458
459 case EMPTY:
460 case ANCHOR:
461 return FC_CONTINUE;
462
463 case CHAR:
464 int ch = this.getChar();
465 result.addRange(ch, ch);
466 if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
467 ch = Character.toUpperCase((char)ch);
468 result.addRange(ch, ch);
469 ch = Character.toLowerCase((char)ch);
470 result.addRange(ch, ch);
471 }
472 return FC_TERMINAL;
473
474 case DOT: // ****
475 if (isSet(options, RegularExpression.SINGLE_LINE)) {
476 return FC_CONTINUE; // **** We can not optimize.
477 } else {
478 return FC_CONTINUE;
479 /*
480 result.addRange(0, RegularExpression.LINE_FEED-1);
481 result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
482 result.addRange(RegularExpression.CARRIAGE_RETURN+1,
483 RegularExpression.LINE_SEPARATOR-1);
484 result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
485 return 1;
486 */
487 }
488
489 case RANGE:
490 if (isSet(options, RegularExpression.IGNORE_CASE)) {
491 result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
492 } else {
493 result.mergeRanges(this);
494 }
495 return FC_TERMINAL;
496
497 case NRANGE: // ****
498 if (isSet(options, RegularExpression.IGNORE_CASE)) {
499 result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
500 } else {
501 result.mergeRanges(Token.complementRanges(this));
502 }
503 return FC_TERMINAL;
504
505 case INDEPENDENT:
506 case PAREN:
507 return this.getChild(0).analyzeFirstCharacter(result, options);
508
509 case MODIFIERGROUP:
510 options |= ((ModifierToken)this).getOptions();
511 options &= ~((ModifierToken)this).getOptionsMask();
512 return this.getChild(0).analyzeFirstCharacter(result, options);
513
514 case BACKREFERENCE:
515 result.addRange(0, UTF16_MAX); // **** We can not optimize.
516 return FC_ANY;
517
518 case STRING:
519 int cha = this.getString().charAt(0);
520 int ch2;
521 if (REUtil.isHighSurrogate(cha)
522 && this.getString().length() >= 2
523 && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
524 cha = REUtil.composeFromSurrogates(cha, ch2);
525 result.addRange(cha, cha);
526 if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
527 cha = Character.toUpperCase((char)cha);
528 result.addRange(cha, cha);
529 cha = Character.toLowerCase((char)cha);
530 result.addRange(cha, cha);
531 }
532 return FC_TERMINAL;
533
534 case LOOKAHEAD:
535 case NEGATIVELOOKAHEAD:
536 case LOOKBEHIND:
537 case NEGATIVELOOKBEHIND:
538 return FC_CONTINUE;
539
540 default:
541 throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
542 }
543 }
544
545 private final boolean isShorterThan(Token tok) {
546 if (tok == null) return false;
547 /*
548 int mylength;
549 if (this.type == STRING) mylength = this.getString().length();
550 else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
551 else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
552 int otherlength;
553 if (tok.type == STRING) otherlength = tok.getString().length();
554 else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
555 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
556 */
557 int mylength;
558 if (this.type == STRING) mylength = this.getString().length();
559 else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
560 int otherlength;
561 if (tok.type == STRING) otherlength = tok.getString().length();
562 else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
563 return mylength < otherlength;
564 }
565
566 static class FixedStringContainer {
567 Token token = null;
568 int options = 0;
569 FixedStringContainer() {
570 }
571 }
572
573 final void findFixedString(FixedStringContainer container, int options) {
574 switch (this.type) {
575 case CONCAT:
576 Token prevToken = null;
577 int prevOptions = 0;
578 for (int i = 0; i < this.size(); i ++) {
579 this.getChild(i).findFixedString(container, options);
580 if (prevToken == null || prevToken.isShorterThan(container.token)) {
581 prevToken = container.token;
582 prevOptions = container.options;
583 }
584 }
585 container.token = prevToken;
586 container.options = prevOptions;
587 return;
588
589 case UNION:
590 case CLOSURE:
591 case NONGREEDYCLOSURE:
592 case EMPTY:
593 case ANCHOR:
594 case RANGE:
595 case DOT:
596 case NRANGE:
597 case BACKREFERENCE:
598 case LOOKAHEAD:
599 case NEGATIVELOOKAHEAD:
600 case LOOKBEHIND:
601 case NEGATIVELOOKBEHIND:
602 case CONDITION:
603 container.token = null;
604 return;
605
606 case CHAR: // Ignore CHAR tokens.
607 container.token = null; // **
608 return; // **
609
610 case STRING:
611 container.token = this;
612 container.options = options;
613 return;
614
615 case INDEPENDENT:
616 case PAREN:
617 this.getChild(0).findFixedString(container, options);
618 return;
619
620 case MODIFIERGROUP:
621 options |= ((ModifierToken)this).getOptions();
622 options &= ~((ModifierToken)this).getOptionsMask();
623 this.getChild(0).findFixedString(container, options);
624 return;
625
626 default:
627 throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
628 }
629 }
630
631 boolean match(int ch) {
632 throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
633 }
634
635 // ------------------------------------------------------
636 static protected Hashtable categories = new Hashtable();
637 static protected Hashtable categories2 = null;
638 static final String[] categoryNames = {
639 "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
640 "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
641 "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
642 "L", "M", "N", "Z", "C", "P", "S", // 29-35
643 };
644 static final int CHAR_LETTER = 29;
645 static final int CHAR_MARK = 30;
646 static final int CHAR_NUMBER = 31;
647 static final int CHAR_SEPARATOR = 32;
648 static final int CHAR_OTHER = 33;
649 static final int CHAR_PUNCTUATION = 34;
650 static final int CHAR_SYMBOL = 35;
651 static final String[] blockNames = {
652 "Basic Latin", // 0
653 "Latin-1 Supplement",
654 "Latin Extended-A",
655 "Latin Extended-B",
656 "IPA Extensions",
657 "Spacing Modifier Letters",
658 "Combining Diacritical Marks",
659 "Greek",
660 "Cyrillic", // 8
661 "Armenian",
662 "Hebrew",
663 "Arabic",
664 "Devanagari",
665 "Bengali",
666 "Gurmukhi",
667 "Gujarati",
668 "Oriya", // 16
669 "Tamil",
670 "Telugu",
671 "Kannada",
672 "Malayalam",
673 "Thai",
674 "Lao",
675 "Tibetan",
676 "Georgian", // 24
677 "Hangul Jamo",
678 "Latin Extended Additional",
679 "Greek Extended",
680 "General Punctuation",
681 "Superscripts and Subscripts",
682 "Currency Symbols",
683 "Combining Marks for Symbols",
684 "Letterlike Symbols", // 32
685 "Number Forms",
686 "Arrows",
687 "Mathematical Operators",
688 "Miscellaneous Technical",
689 "Control Pictures",
690 "Optical Character Recognition",
691 "Enclosed Alphanumerics",
692 "Box Drawing", // 40
693 "Block Elements",
694 "Geometric Shapes",
695 "Miscellaneous Symbols",
696 "Dingbats",
697 "CJK Symbols and Punctuation",
698 "Hiragana",
699 "Katakana",
700 "Bopomofo", // 48
701 "Hangul Compatibility Jamo",
702 "Kanbun",
703 "Enclosed CJK Letters and Months",
704 "CJK Compatibility",
705 "CJK Unified Ideographs",
706 "Hangul Syllables",
707 "High Surrogates",
708 "High Private Use Surrogates", // 56
709 "Low Surrogates",
710 "Private Use",
711 "CJK Compatibility Ideographs",
712 "Alphabetic Presentation Forms",
713 "Arabic Presentation Forms-A",
714 "Combining Half Marks",
715 "CJK Compatibility Forms",
716 "Small Form Variants", // 64
717 "Arabic Presentation Forms-B",
718 "Specials",
719 "Halfwidth and Fullwidth Forms", // 67
720 };
721 static final String blockRanges =
722 "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF"
723 +"\u0300\u036F\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF"
724 +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF"
725 +"\u0C00\u0C7F\u0C80\u0CFF\u0D00\u0D7F\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FBF"
726 +"\u10A0\u10FF\u1100\u11FF\u1E00\u1EFF\u1F00\u1FFF\u2000\u206F\u2070\u209F"
727 +"\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
728 +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F"
729 +"\u25A0\u25FF\u2600\u26FF\u2700\u27BF\u3000\u303F\u3040\u309F\u30A0\u30FF"
730 +"\u3100\u312F\u3130\u318F\u3190\u319F\u3200\u32FF\u3300\u33FF\u4E00\u9FFF"
731 +"\uAC00\uD7A3\uD800\uDB7F\uDB80\uDBFF\uDC00\uDFFF\uE000\uF8FF\uF900\uFAFF"
732 +"\uFB00\uFB4F\uFB50\uFDFF\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE"
733 +"\uFEFF\uFEFF\uFF00\uFFEF";
734
735 static protected RangeToken getRange(String name, boolean positive) {
736 if (Token.categories.size() == 0) {
737 synchronized (Token.categories) {
738 Token[] ranges = new Token[Token.categoryNames.length];
739 for (int i = 0; i < ranges.length; i ++) {
740 ranges[i] = Token.createRange();
741 }
742 for (int i = 0; i < 0x10000; i ++) {
743 int type = Character.getType((char)i);
744 ranges[type].addRange(i, i);
745 switch (type) {
746 case Character.UPPERCASE_LETTER:
747 case Character.LOWERCASE_LETTER:
748 case Character.TITLECASE_LETTER:
749 case Character.MODIFIER_LETTER:
750 case Character.OTHER_LETTER:
751 type = CHAR_LETTER;
752 break;
753 case Character.NON_SPACING_MARK:
754 case Character.COMBINING_SPACING_MARK:
755 case Character.ENCLOSING_MARK:
756 type = CHAR_MARK;
757 break;
758 case Character.DECIMAL_DIGIT_NUMBER:
759 case Character.LETTER_NUMBER:
760 case Character.OTHER_NUMBER:
761 type = CHAR_NUMBER;
762 break;
763 case Character.SPACE_SEPARATOR:
764 case Character.LINE_SEPARATOR:
765 case Character.PARAGRAPH_SEPARATOR:
766 type = CHAR_SEPARATOR;
767 break;
768 case Character.CONTROL:
769 case Character.FORMAT:
770 case Character.SURROGATE:
771 case Character.PRIVATE_USE:
772 case Character.UNASSIGNED:
773 type = CHAR_OTHER;
774 break;
775 case Character.CONNECTOR_PUNCTUATION:
776 case Character.DASH_PUNCTUATION:
777 case Character.START_PUNCTUATION:
778 case Character.END_PUNCTUATION:
779 case Character.OTHER_PUNCTUATION:
780 type = CHAR_PUNCTUATION;
781 break;
782 case Character.MATH_SYMBOL:
783 case Character.CURRENCY_SYMBOL:
784 case Character.MODIFIER_SYMBOL:
785 case Character.OTHER_SYMBOL:
786 type = CHAR_SYMBOL;
787 break;
788 default:
789 throw new RuntimeException("org.apache.xerces.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
790 }
791 ranges[type].addRange(i, i);
792 } // for all characters
793 ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
794
795 Token.categories2 = new Hashtable();
796 for (int i = 0; i < ranges.length; i ++) {
797 if (Token.categoryNames[i] != null) {
798 if (i == Character.UNASSIGNED) { // Unassigned
799 ranges[i].addRange(0x10000, Token.UTF16_MAX);
800 }
801 Token.categories.put(Token.categoryNames[i], ranges[i]);
802 Token.categories2.put(Token.categoryNames[i],
803 Token.complementRanges(ranges[i]));
804 }
805 }
806 for (int i = 0; i < Token.blockNames.length; i ++) {
807 Token r1 = Token.createRange();
808 int rstart = Token.blockRanges.charAt(i*2);
809 int rend = Token.blockRanges.charAt(i*2+1);
810 String n = Token.blockNames[i];
811 r1.addRange(rstart, rend);
812 if (n.equals("Specials"))
813 r1.addRange(0xfff0, 0xfffd);
814 Token.categories.put(n, r1);
815 Token.categories2.put(n, Token.complementRanges(r1));
816 if (n.indexOf(' ') >= 0) {
817 StringBuffer buffer = new StringBuffer(n.length()+2);
818 buffer.append("Is");
819 for (int ci = 0; ci < n.length(); ci ++)
820 if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
821 Token.setAlias(new String(buffer), n, true);
822 }
823 }
824
825 // TR#18 1.2
826 Token.setAlias("ASSIGNED", "Cn", false);
827 Token.setAlias("UNASSIGNED", "Cn", true);
828 Token all = Token.createRange();
829 all.addRange(0, Token.UTF16_MAX);
830 Token.categories.put("ALL", all);
831 Token.categories2.put("ALL", Token.complementRanges(all));
832
833 Token isalpha = Token.createRange();
834 isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
835 isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
836 isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
837 Token.categories.put("IsAlpha", isalpha);
838 Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
839
840 Token isalnum = Token.createRange();
841 isalnum.mergeRanges(isalpha); // Lu Ll Lo
842 isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
843 Token.categories.put("IsAlnum", isalnum);
844 Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
845
846 Token isspace = Token.createRange();
847 isspace.mergeRanges(Token.token_spaces);
848 isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
849 Token.categories.put("IsSpace", isspace);
850 Token.categories2.put("IsSpace", Token.complementRanges(isspace));
851
852 Token isword = Token.createRange();
853 isword.mergeRanges(isalnum); // Lu Ll Lo Nd
854 isword.addRange('_', '_');
855 Token.categories.put("IsWord", isword);
856 Token.categories2.put("IsWord", Token.complementRanges(isword));
857
858 Token isascii = Token.createRange();
859 isascii.addRange(0, 127);
860 Token.categories.put("IsASCII", isascii);
861 Token.categories2.put("IsASCII", Token.complementRanges(isascii));
862
863 Token isnotgraph = Token.createRange();
864 isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
865 isnotgraph.addRange(' ', ' ');
866 Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
867 Token.categories2.put("IsGraph", isnotgraph);
868
869 Token isxdigit = Token.createRange();
870 isxdigit.addRange('0', '9');
871 isxdigit.addRange('A', 'F');
872 isxdigit.addRange('a', 'f');
873 Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
874 Token.categories2.put("IsXDigit", isxdigit);
875
876 Token.setAlias("IsDigit", "Nd", true);
877 Token.setAlias("IsUpper", "Lu", true);
878 Token.setAlias("IsLower", "Ll", true);
879 Token.setAlias("IsCntrl", "C", true);
880 Token.setAlias("IsPrint", "C", false);
881 Token.setAlias("IsPunct", "P", true);
882
883 Token.setAlias("alpha", "IsAlpha", true);
884 Token.setAlias("alnum", "IsAlnum", true);
885 Token.setAlias("ascii", "IsASCII", true);
886 Token.setAlias("cntrl", "IsCntrl", true);
887 Token.setAlias("digit", "IsDigit", true);
888 Token.setAlias("graph", "IsGraph", true);
889 Token.setAlias("lower", "IsLower", true);
890 Token.setAlias("print", "IsPrint", true);
891 Token.setAlias("punct", "IsPunct", true);
892 Token.setAlias("space", "IsSpace", true);
893 Token.setAlias("upper", "IsUpper", true);
894 Token.setAlias("word", "IsWord", true); // Perl extension
895 Token.setAlias("xdigit", "IsXDigit", true);
896
897 } // synchronized
898 } // if null
899 RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
900 : (RangeToken)Token.categories2.get(name);
901 return tok;
902 }
903
904 private static void setAlias(String newName, String name, boolean positive) {
905 Token t1 = (Token)Token.categories.get(name);
906 Token t2 = (Token)Token.categories2.get(name);
907 if (positive) {
908 Token.categories.put(newName, t1);
909 Token.categories2.put(newName, t2);
910 } else {
911 Token.categories2.put(newName, t1);
912 Token.categories.put(newName, t2);
913 }
914 }
915
916 // ------------------------------------------------------
917
918 static final String viramaString =
919 "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
920 +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
921 +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
922 +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
923 +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
924 +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
925 +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
926 +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
927 +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
928 +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
929 +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
930
931 static private Token token_grapheme = null;
932 static synchronized protected Token getGraphemePattern() {
933 if (Token.token_grapheme != null)
934 return Token.token_grapheme;
935
936 Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
937 base_char.mergeRanges(Token.getRange("ASSIGNED", true));
938 base_char.subtractRanges(Token.getRange("M", true));
939 base_char.subtractRanges(Token.getRange("C", true));
940
941 Token virama = Token.createRange();
942 for (int i = 0; i < Token.viramaString.length(); i ++) {
943 int ch = viramaString.charAt(i);
944 virama.addRange(i, i);
945 }
946
947 Token combiner_wo_virama = Token.createRange();
948 combiner_wo_virama.mergeRanges(Token.getRange("M", true));
949 combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
950 combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
951
952 Token left = Token.createUnion(); // base_char?
953 left.addChild(base_char);
954 left.addChild(Token.token_empty);
955
956 Token foo = Token.createUnion();
957 foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
958 foo.addChild(combiner_wo_virama);
959
960 foo = Token.createClosure(foo);
961
962 foo = Token.createConcat(left, foo);
963
964 Token.token_grapheme = foo;
965 return Token.token_grapheme;
966 }
967
968 /**
969 * Combing Character Sequence in Perl 5.6.
970 */
971 static private Token token_ccs = null;
972 static synchronized protected Token getCombiningCharacterSequence() {
973 if (Token.token_ccs != null)
974 return Token.token_ccs;
975
976 Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
977 foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
978 Token.token_ccs = foo;
979 return Token.token_ccs;
980 }
981
982 // ------------------------------------------------------
983
984 // ------------------------------------------------------
985 /**
986 * This class represents a node in parse tree.
987 */
988 static class StringToken extends Token implements java.io.Serializable {
989 String string;
990 int refNumber;
991
992 StringToken(int type, String str, int n) {
993 super(type);
994 this.string = str;
995 this.refNumber = n;
996 }
997
998 int getReferenceNumber() { // for STRING
999 return this.refNumber;
1000 }
1001 String getString() { // for STRING
1002 return this.string;
1003 }
1004
1005 public String toString(int options) {
1006 if (this.type == BACKREFERENCE)
1007 return "\\"+this.refNumber;
1008 else
1009 return REUtil.quoteMeta(this.string);
1010 }
1011 }
1012
1013 /**
1014 * This class represents a node in parse tree.
1015 */
1016 static class ConcatToken extends Token implements java.io.Serializable {
1017 Token child;
1018 Token child2;
1019
1020 ConcatToken(Token t1, Token t2) {
1021 super(Token.CONCAT);
1022 this.child = t1;
1023 this.child2 = t2;
1024 }
1025
1026 int size() {
1027 return 2;
1028