Source code: org/apache/xerces/utils/regex/RangeToken.java
1 /*
2 * The Apache Software License, Version 1.1
3 *
4 *
5 * Copyright (c) 1999,2000 The Apache Software Foundation. All rights
6 * reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in
17 * the documentation and/or other materials provided with the
18 * distribution.
19 *
20 * 3. The end-user documentation included with the redistribution,
21 * if any, must include the following acknowledgment:
22 * "This product includes software developed by the
23 * Apache Software Foundation (http://www.apache.org/)."
24 * Alternately, this acknowledgment may appear in the software itself,
25 * if and wherever such third-party acknowledgments normally appear.
26 *
27 * 4. The names "Xerces" and "Apache Software Foundation" must
28 * not be used to endorse or promote products derived from this
29 * software without prior written permission. For written
30 * permission, please contact apache@apache.org.
31 *
32 * 5. Products derived from this software may not be called "Apache",
33 * nor may "Apache" appear in their name, without prior written
34 * permission of the Apache Software Foundation.
35 *
36 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
37 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
39 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
40 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
41 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
42 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
43 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
44 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
45 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
46 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47 * SUCH DAMAGE.
48 * ====================================================================
49 *
50 * This software consists of voluntary contributions made by many
51 * individuals on behalf of the Apache Software Foundation and was
52 * originally based on software copyright (c) 1999, International
53 * Business Machines, Inc., http://www.apache.org. For more
54 * information on the Apache Software Foundation, please see
55 * <http://www.apache.org/>.
56 */
57
58 package org.apache.xerces.utils.regex;
59
60
61 /**
62 * This class represents a character class such as [a-z] or a period.
63 */
64 final class RangeToken extends Token implements java.io.Serializable {
65
66 int[] ranges;
67 boolean sorted;
68 boolean compacted;
69 RangeToken icaseCache = null;
70 int[] map = null;
71 int nonMapIndex;
72
73 RangeToken(int type) {
74 super(type);
75 this.setSorted(false);
76 }
77
78 // for RANGE or NRANGE
79 protected void addRange(int start, int end) {
80 this.icaseCache = null;
81 //System.err.println("Token#addRange(): "+start+" "+end);
82 int r1, r2;
83 if (start <= end) {
84 r1 = start;
85 r2 = end;
86 } else {
87 r1 = end;
88 r2 = start;
89 }
90
91 int pos = 0;
92 if (this.ranges == null) {
93 this.ranges = new int[2];
94 this.ranges[0] = r1;
95 this.ranges[1] = r2;
96 this.setSorted(true);
97 } else {
98 pos = this.ranges.length;
99 if (this.ranges[pos-1]+1 == r1) {
100 this.ranges[pos-1] = r2;
101 return;
102 }
103 int[] temp = new int[pos+2];
104 System.arraycopy(this.ranges, 0, temp, 0, pos);
105 this.ranges = temp;
106 if (this.ranges[pos-1] >= r1)
107 this.setSorted(false);
108 this.ranges[pos++] = r1;
109 this.ranges[pos] = r2;
110 if (!this.sorted)
111 this.sortRanges();
112 }
113 }
114
115 private final boolean isSorted() {
116 return this.sorted;
117 }
118 private final void setSorted(boolean sort) {
119 this.sorted = sort;
120 if (!sort) this.compacted = false;
121 }
122 private final boolean isCompacted() {
123 return this.compacted;
124 }
125 private final void setCompacted() {
126 this.compacted = true;
127 }
128
129 protected void sortRanges() {
130 if (this.isSorted())
131 return;
132 if (this.ranges == null)
133 return;
134 //System.err.println("Do sorting: "+this.ranges.length);
135
136 // Bubble sort
137 // Why? -- In many cases,
138 // this.ranges has few elements.
139 for (int i = this.ranges.length-4; i >= 0; i -= 2) {
140 for (int j = 0; j <= i; j += 2) {
141 if (this.ranges[j] > this.ranges[j+2]
142 || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
143 int tmp;
144 tmp = this.ranges[j+2];
145 this.ranges[j+2] = this.ranges[j];
146 this.ranges[j] = tmp;
147 tmp = this.ranges[j+3];
148 this.ranges[j+3] = this.ranges[j+1];
149 this.ranges[j+1] = tmp;
150 }
151 }
152 }
153 this.setSorted(true);
154 }
155
156 /**
157 * this.ranges is sorted.
158 */
159 protected void compactRanges() {
160 boolean DEBUG = false;
161 if (this.ranges == null || this.ranges.length <= 2)
162 return;
163 if (this.isCompacted())
164 return;
165 int base = 0; // Index of writing point
166 int target = 0; // Index of processing point
167
168 while (target < this.ranges.length) {
169 if (base != target) {
170 this.ranges[base] = this.ranges[target++];
171 this.ranges[base+1] = this.ranges[target++];
172 } else
173 target += 2;
174 int baseend = this.ranges[base+1];
175 while (target < this.ranges.length) {
176 if (baseend+1 < this.ranges[target])
177 break;
178 if (baseend+1 == this.ranges[target]) {
179 if (DEBUG)
180 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
181 +", "+this.ranges[base+1]
182 +"], ["+this.ranges[target]
183 +", "+this.ranges[target+1]
184 +"] -> ["+this.ranges[base]
185 +", "+this.ranges[target+1]
186 +"]");
187 this.ranges[base+1] = this.ranges[target+1];
188 baseend = this.ranges[base+1];
189 target += 2;
190 } else if (baseend >= this.ranges[target+1]) {
191 if (DEBUG)
192 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
193 +", "+this.ranges[base+1]
194 +"], ["+this.ranges[target]
195 +", "+this.ranges[target+1]
196 +"] -> ["+this.ranges[base]
197 +", "+this.ranges[base+1]
198 +"]");
199 target += 2;
200 } else if (baseend < this.ranges[target+1]) {
201 if (DEBUG)
202 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
203 +", "+this.ranges[base+1]
204 +"], ["+this.ranges[target]
205 +", "+this.ranges[target+1]
206 +"] -> ["+this.ranges[base]
207 +", "+this.ranges[target+1]
208 +"]");
209 this.ranges[base+1] = this.ranges[target+1];
210 baseend = this.ranges[base+1];
211 target += 2;
212 } else {
213 throw new RuntimeException("Token#compactRanges(): Internel Error: ["
214 +this.ranges[base]
215 +","+this.ranges[base+1]
216 +"] ["+this.ranges[target]
217 +","+this.ranges[target+1]+"]");
218 }
219 } // while
220 base += 2;
221 }
222
223 if (base != this.ranges.length) {
224 int[] result = new int[base];
225 System.arraycopy(this.ranges, 0, result, 0, base);
226 this.ranges = result;
227 }
228 this.setCompacted();
229 }
230
231 protected void mergeRanges(Token token) {
232 if (token.type != this.type)
233 throw new IllegalArgumentException("Token#mergeRanges(): Mismatched Type: "+token.type);
234 RangeToken tok = (RangeToken)token;
235 this.sortRanges();
236 tok.sortRanges();
237 if (tok.ranges == null)
238 return;
239 this.icaseCache = null;
240 this.setSorted(true);
241 if (this.ranges == null) {
242 this.ranges = new int[tok.ranges.length];
243 System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
244 return;
245 }
246 int[] result = new int[this.ranges.length+tok.ranges.length];
247 for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) {
248 if (i >= this.ranges.length) {
249 result[k++] = tok.ranges[j++];
250 result[k++] = tok.ranges[j++];
251 } else if (j >= tok.ranges.length) {
252 result[k++] = this.ranges[i++];
253 result[k++] = this.ranges[i++];
254 } else if (tok.ranges[j] < this.ranges[i]
255 || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
256 result[k++] = tok.ranges[j++];
257 result[k++] = tok.ranges[j++];
258 } else {
259 result[k++] = this.ranges[i++];
260 result[k++] = this.ranges[i++];
261 }
262 }
263 this.ranges = result;
264 }
265
266 protected void subtractRanges(Token token) {
267 if (token.type == NRANGE) {
268 this.intersectRanges(token);
269 return;
270 }
271 RangeToken tok = (RangeToken)token;
272 if (tok.ranges == null || this.ranges == null)
273 return;
274 this.icaseCache = null;
275 this.sortRanges();
276 this.compactRanges();
277 tok.sortRanges();
278 tok.compactRanges();
279
280 //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
281
282 int[] result = new int[this.ranges.length+tok.ranges.length];
283 int wp = 0, src = 0, sub = 0;
284 while (src < this.ranges.length && sub < tok.ranges.length) {
285 int srcbegin = this.ranges[src];
286 int srcend = this.ranges[src+1];
287 int subbegin = tok.ranges[sub];
288 int subend = tok.ranges[sub+1];
289 if (srcend < subbegin) { // Not overlapped
290 // src: o-----o
291 // sub: o-----o
292 // res: o-----o
293 // Reuse sub
294 result[wp++] = this.ranges[src++];
295 result[wp++] = this.ranges[src++];
296 } else if (srcend >= subbegin
297 && srcbegin <= subend) { // Overlapped
298 // src: o--------o
299 // sub: o----o
300 // sub: o----o
301 // sub: o----o
302 // sub: o------------o
303 if (subbegin <= srcbegin && srcend <= subend) {
304 // src: o--------o
305 // sub: o------------o
306 // res: empty
307 // Reuse sub
308 src += 2;
309 } else if (subbegin <= srcbegin) {
310 // src: o--------o
311 // sub: o----o
312 // res: o-----o
313 // Reuse src(=res)
314 this.ranges[src] = subend+1;
315 sub += 2;
316 } else if (srcend <= subend) {
317 // src: o--------o
318 // sub: o----o
319 // res: o-----o
320 // Reuse sub
321 result[wp++] = srcbegin;
322 result[wp++] = subbegin-1;
323 src += 2;
324 } else {
325 // src: o--------o
326 // sub: o----o
327 // res: o-o o-o
328 // Reuse src(=right res)
329 result[wp++] = srcbegin;
330 result[wp++] = subbegin-1;
331 this.ranges[src] = subend+1;
332 sub += 2;
333 }
334 } else if (subend < srcbegin) {
335 // Not overlapped
336 // src: o-----o
337 // sub: o----o
338 sub += 2;
339 } else {
340 throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
341 +","+this.ranges[src+1]
342 +"] - ["+tok.ranges[sub]
343 +","+tok.ranges[sub+1]
344 +"]");
345 }
346 }
347 while (src < this.ranges.length) {
348 result[wp++] = this.ranges[src++];
349 result[wp++] = this.ranges[src++];
350 }
351 this.ranges = new int[wp];
352 System.arraycopy(result, 0, this.ranges, 0, wp);
353 // this.ranges is sorted and compacted.
354 }
355
356 /**
357 * @param tok Ignore whether it is NRANGE or not.
358 */
359 protected void intersectRanges(Token token) {
360 RangeToken tok = (RangeToken)token;
361 if (tok.ranges == null || this.ranges == null)
362 return;
363 this.icaseCache = null;
364 this.sortRanges();
365 this.compactRanges();
366 tok.sortRanges();
367 tok.compactRanges();
368
369 int[] result = new int[this.ranges.length+tok.ranges.length];
370 int wp = 0, src1 = 0, src2 = 0;
371 while (src1 < this.ranges.length && src2 < tok.ranges.length) {
372 int src1begin = this.ranges[src1];
373 int src1end = this.ranges[src1+1];
374 int src2begin = tok.ranges[src2];
375 int src2end = tok.ranges[src2+1];
376 if (src1end < src2begin) { // Not overlapped
377 // src1: o-----o
378 // src2: o-----o
379 // res: empty
380 // Reuse src2
381 src1 += 2;
382 } else if (src1end >= src2begin
383 && src1begin <= src2end) { // Overlapped
384 // src1: o--------o
385 // src2: o----o
386 // src2: o----o
387 // src2: o----o
388 // src2: o------------o
389 if (src2begin <= src2begin && src1end <= src2end) {
390 // src1: o--------o
391 // src2: o------------o
392 // res: o--------o
393 // Reuse src2
394 result[wp++] = src1begin;
395 result[wp++] = src1end;
396 src1 += 2;
397 } else if (src2begin <= src1begin) {
398 // src1: o--------o
399 // src2: o----o
400 // res: o--o
401 // Reuse the rest of src1
402 result[wp++] = src1begin;
403 result[wp++] = src2end;
404 this.ranges[src1] = src2end+1;
405 src2 += 2;
406 } else if (src1end <= src2end) {
407 // src1: o--------o
408 // src2: o----o
409 // res: o--o
410 // Reuse src2
411 result[wp++] = src2begin;
412 result[wp++] = src1end;
413 src1 += 2;
414 } else {
415 // src1: o--------o
416 // src2: o----o
417 // res: o----o
418 // Reuse the rest of src1
419 result[wp++] = src2begin;
420 result[wp++] = src2end;
421 this.ranges[src1] = src2end+1;
422 }
423 } else if (src2end < src1begin) {
424 // Not overlapped
425 // src1: o-----o
426 // src2: o----o
427 src2 += 2;
428 } else {
429 throw new RuntimeException("Token#intersectRanges(): Internal Error: ["
430 +this.ranges[src1]
431 +","+this.ranges[src1+1]
432 +"] & ["+tok.ranges[src2]
433 +","+tok.ranges[src2+1]
434 +"]");
435 }
436 }
437 while (src1 < this.ranges.length) {
438 result[wp++] = this.ranges[src1++];
439 result[wp++] = this.ranges[src1++];
440 }
441 this.ranges = new int[wp];
442 System.arraycopy(result, 0, this.ranges, 0, wp);
443 // this.ranges is sorted and compacted.
444 }
445
446 /**
447 * for RANGE: Creates complement.
448 * for NRANGE: Creates the same meaning RANGE.
449 */
450 static Token complementRanges(Token token) {
451 if (token.type != RANGE && token.type != NRANGE)
452 throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type);
453 RangeToken tok = (RangeToken)token;
454 tok.sortRanges();
455 tok.compactRanges();
456 int len = tok.ranges.length+2;
457 if (tok.ranges[0] == 0)
458 len -= 2;
459 int last = tok.ranges[tok.ranges.length-1];
460 if (last == UTF16_MAX)
461 len -= 2;
462 RangeToken ret = Token.createRange();
463 ret.ranges = new int[len];
464 int wp = 0;
465 if (tok.ranges[0] > 0) {
466 ret.ranges[wp++] = 0;
467 ret.ranges[wp++] = tok.ranges[0]-1;
468 }
469 for (int i = 1; i < tok.ranges.length-2; i += 2) {
470 ret.ranges[wp++] = tok.ranges[i]+1;
471 ret.ranges[wp++] = tok.ranges[i+1]-1;
472 }
473 if (last != UTF16_MAX) {
474 ret.ranges[wp++] = last+1;
475 ret.ranges[wp] = UTF16_MAX;
476 }
477 ret.setCompacted();
478 return ret;
479 }
480
481 synchronized RangeToken getCaseInsensitiveToken() {
482 if (this.icaseCache != null)
483 return this.icaseCache;
484
485 RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
486 for (int i = 0; i < this.ranges.length; i += 2) {
487 for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) {
488 if (ch > 0xffff)
489 uppers.addRange(ch, ch);
490 else {
491 char uch = Character.toUpperCase((char)ch);
492 uppers.addRange(uch, uch);
493 }
494 }
495 }
496 RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
497 for (int i = 0; i < uppers.ranges.length; i += 2) {
498 for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) {
499 if (ch > 0xffff)
500 lowers.addRange(ch, ch);
501 else {
502 char uch = Character.toUpperCase((char)ch);
503 lowers.addRange(uch, uch);
504 }
505 }
506 }
507 lowers.mergeRanges(uppers);
508 lowers.mergeRanges(this);
509 lowers.compactRanges();
510
511 this.icaseCache = lowers;
512 return lowers;
513 }
514
515 void dumpRanges() {
516 System.err.print("RANGE: ");
517 if (this.ranges == null)
518 System.err.println(" NULL");
519 for (int i = 0; i < this.ranges.length; i += 2) {
520 System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
521 }
522 System.err.println("");
523 }
524
525 boolean match(int ch) {
526 if (this.map == null) this.createMap();
527 boolean ret;
528 if (this.type == RANGE) {
529 if (ch < MAPSIZE)
530 return (this.map[ch/32] & (1<<(ch&0x1f))) != 0;
531 ret = false;
532 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
533 if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
534 return true;
535 }
536 } else {
537 if (ch < MAPSIZE)
538 return (this.map[ch/32] & (1<<(ch&0x1f))) == 0;
539 ret = true;
540 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
541 if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
542 return false;
543 }
544 }
545 return ret;
546 }
547
548 private static final int MAPSIZE = 256;
549 private void createMap() {
550 int asize = MAPSIZE/32; // 32 is the number of bits in `int'.
551 this.map = new int[asize];
552 this.nonMapIndex = this.ranges.length;
553 for (int i = 0; i < asize; i ++) this.map[i] = 0;
554 for (int i = 0; i < this.ranges.length; i += 2) {
555 int s = this.ranges[i];
556 int e = this.ranges[i+1];
557 if (s < MAPSIZE) {
558 for (int j = s; j <= e && j < MAPSIZE; j ++)
559 this.map[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31
560 } else {
561 this.nonMapIndex = i;
562 break;
563 }
564 if (e >= MAPSIZE) {
565 this.nonMapIndex = i;
566 break;
567 }
568 }
569 //for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
570 }
571
572 public String toString(int options) {
573 String ret;
574 if (this.type == RANGE) {
575 if (this == Token.token_dot)
576 ret = ".";
577 else if (this == Token.token_0to9)
578 ret = "\\d";
579 else if (this == Token.token_wordchars)
580 ret = "\\w";
581 else if (this == Token.token_spaces)
582 ret = "\\s";
583 else {
584 StringBuffer sb = new StringBuffer();
585 sb.append("[");
586 for (int i = 0; i < this.ranges.length; i += 2) {
587 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
588 if (this.ranges[i] == this.ranges[i+1]) {
589 sb.append(escapeCharInCharClass(this.ranges[i]));
590 } else {
591 sb.append(escapeCharInCharClass(this.ranges[i]));
592 sb.append((char)'-');
593 sb.append(escapeCharInCharClass(this.ranges[i+1]));
594 }
595 }
596 sb.append("]");
597 ret = sb.toString();
598 }
599 } else {
600 if (this == Token.token_not_0to9)
601 ret = "\\D";
602 else if (this == Token.token_not_wordchars)
603 ret = "\\W";
604 else if (this == Token.token_not_spaces)
605 ret = "\\S";
606 else {
607 StringBuffer sb = new StringBuffer();
608 sb.append("[^");
609 for (int i = 0; i < this.ranges.length; i += 2) {
610 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
611 if (this.ranges[i] == this.ranges[i+1]) {
612 sb.append(escapeCharInCharClass(this.ranges[i]));
613 } else {
614 sb.append(escapeCharInCharClass(this.ranges[i]));
615 sb.append('-');
616 sb.append(escapeCharInCharClass(this.ranges[i+1]));
617 }
618 }
619 sb.append("]");
620 ret = sb.toString();
621 }
622 }
623 return ret;
624 }
625
626 private static String escapeCharInCharClass(int ch) {
627 String ret;
628 switch (ch) {
629 case '[': case ']': case '-': case '^':
630 case ',': case '\\':
631 ret = "\\"+(char)ch;
632 break;
633 case '\f': ret = "\\f"; break;
634 case '\n': ret = "\\n"; break;
635 case '\r': ret = "\\r"; break;
636 case '\t': ret = "\\t"; break;
637 case 0x1b: ret = "\\e"; break;
638 //case 0x0b: ret = "\\v"; break;
639 default:
640 if (ch < 0x20) {
641 String pre = "0"+Integer.toHexString(ch);
642 ret = "\\x"+pre.substring(pre.length()-2, pre.length());
643 } else if (ch >= 0x10000) {
644 String pre = "0"+Integer.toHexString(ch);
645 ret = "\\v"+pre.substring(pre.length()-6, pre.length());
646 } else
647 ret = ""+(char)ch;
648 }
649 return ret;
650 }
651
652 }