1 /* Copyright 2004 The Apache Software Foundation
2 *
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 package org.apache.xmlbeans.impl.regex;
17
18 /**
19 * This class represents a character class such as [a-z] or a period.
20 */
21 final class RangeToken extends Token implements java.io.Serializable {
22
23 int[] ranges;
24 boolean sorted;
25 boolean compacted;
26 RangeToken icaseCache = null;
27 int[] map = null;
28 int nonMapIndex;
29
30 RangeToken(int type) {
31 super(type);
32 this.setSorted(false);
33 }
34
35 // for RANGE or NRANGE
36 protected void addRange(int start, int end) {
37 this.icaseCache = null;
38 //System.err.println("Token#addRange(): "+start+" "+end);
39 int r1, r2;
40 if (start <= end) {
41 r1 = start;
42 r2 = end;
43 } else {
44 r1 = end;
45 r2 = start;
46 }
47
48 int pos = 0;
49 if (this.ranges == null) {
50 this.ranges = new int[2];
51 this.ranges[0] = r1;
52 this.ranges[1] = r2;
53 this.setSorted(true);
54 } else {
55 pos = this.ranges.length;
56 if (this.ranges[pos-1]+1 == r1) {
57 this.ranges[pos-1] = r2;
58 return;
59 }
60 int[] temp = new int[pos+2];
61 System.arraycopy(this.ranges, 0, temp, 0, pos);
62 this.ranges = temp;
63 if (this.ranges[pos-1] >= r1)
64 this.setSorted(false);
65 this.ranges[pos++] = r1;
66 this.ranges[pos] = r2;
67 if (!this.sorted)
68 this.sortRanges();
69 }
70 }
71
72 private final boolean isSorted() {
73 return this.sorted;
74 }
75 private final void setSorted(boolean sort) {
76 this.sorted = sort;
77 if (!sort) this.compacted = false;
78 }
79 private final boolean isCompacted() {
80 return this.compacted;
81 }
82 private final void setCompacted() {
83 this.compacted = true;
84 }
85
86 protected void sortRanges() {
87 if (this.isSorted())
88 return;
89 if (this.ranges == null)
90 return;
91 //System.err.println("Do sorting: "+this.ranges.length);
92
93 // Bubble sort
94 // Why? -- In many cases,
95 // this.ranges has few elements.
96 for (int i = this.ranges.length-4; i >= 0; i -= 2) {
97 for (int j = 0; j <= i; j += 2) {
98 if (this.ranges[j] > this.ranges[j+2]
99 || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
100 int tmp;
101 tmp = this.ranges[j+2];
102 this.ranges[j+2] = this.ranges[j];
103 this.ranges[j] = tmp;
104 tmp = this.ranges[j+3];
105 this.ranges[j+3] = this.ranges[j+1];
106 this.ranges[j+1] = tmp;
107 }
108 }
109 }
110 this.setSorted(true);
111 }
112
113 /**
114 * this.ranges is sorted.
115 */
116 protected void compactRanges() {
117 boolean DEBUG = false;
118 if (this.ranges == null || this.ranges.length <= 2)
119 return;
120 if (this.isCompacted())
121 return;
122 int base = 0; // Index of writing point
123 int target = 0; // Index of processing point
124
125 while (target < this.ranges.length) {
126 if (base != target) {
127 this.ranges[base] = this.ranges[target++];
128 this.ranges[base+1] = this.ranges[target++];
129 } else
130 target += 2;
131 int baseend = this.ranges[base+1];
132 while (target < this.ranges.length) {
133 if (baseend+1 < this.ranges[target])
134 break;
135 if (baseend+1 == this.ranges[target]) {
136 if (DEBUG)
137 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
138 +", "+this.ranges[base+1]
139 +"], ["+this.ranges[target]
140 +", "+this.ranges[target+1]
141 +"] -> ["+this.ranges[base]
142 +", "+this.ranges[target+1]
143 +"]");
144 this.ranges[base+1] = this.ranges[target+1];
145 baseend = this.ranges[base+1];
146 target += 2;
147 } else if (baseend >= this.ranges[target+1]) {
148 if (DEBUG)
149 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
150 +", "+this.ranges[base+1]
151 +"], ["+this.ranges[target]
152 +", "+this.ranges[target+1]
153 +"] -> ["+this.ranges[base]
154 +", "+this.ranges[base+1]
155 +"]");
156 target += 2;
157 } else if (baseend < this.ranges[target+1]) {
158 if (DEBUG)
159 System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
160 +", "+this.ranges[base+1]
161 +"], ["+this.ranges[target]
162 +", "+this.ranges[target+1]
163 +"] -> ["+this.ranges[base]
164 +", "+this.ranges[target+1]
165 +"]");
166 this.ranges[base+1] = this.ranges[target+1];
167 baseend = this.ranges[base+1];
168 target += 2;
169 } else {
170 throw new RuntimeException("Token#compactRanges(): Internel Error: ["
171 +this.ranges[base]
172 +","+this.ranges[base+1]
173 +"] ["+this.ranges[target]
174 +","+this.ranges[target+1]+"]");
175 }
176 } // while
177 base += 2;
178 }
179
180 if (base != this.ranges.length) {
181 int[] result = new int[base];
182 System.arraycopy(this.ranges, 0, result, 0, base);
183 this.ranges = result;
184 }
185 this.setCompacted();
186 }
187
188 protected void mergeRanges(Token token) {
189 RangeToken tok = (RangeToken)token;
190 this.sortRanges();
191 tok.sortRanges();
192 if (tok.ranges == null)
193 return;
194 this.icaseCache = null;
195 this.setSorted(true);
196 if (this.ranges == null) {
197 this.ranges = new int[tok.ranges.length];
198 System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
199 return;
200 }
201 int[] result = new int[this.ranges.length+tok.ranges.length];
202 for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) {
203 if (i >= this.ranges.length) {
204 result[k++] = tok.ranges[j++];
205 result[k++] = tok.ranges[j++];
206 } else if (j >= tok.ranges.length) {
207 result[k++] = this.ranges[i++];
208 result[k++] = this.ranges[i++];
209 } else if (tok.ranges[j] < this.ranges[i]
210 || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
211 result[k++] = tok.ranges[j++];
212 result[k++] = tok.ranges[j++];
213 } else {
214 result[k++] = this.ranges[i++];
215 result[k++] = this.ranges[i++];
216 }
217 }
218 this.ranges = result;
219 }
220
221 protected void subtractRanges(Token token) {
222 if (token.type == NRANGE) {
223 this.intersectRanges(token);
224 return;
225 }
226 RangeToken tok = (RangeToken)token;
227 if (tok.ranges == null || this.ranges == null)
228 return;
229 this.icaseCache = null;
230 this.sortRanges();
231 this.compactRanges();
232 tok.sortRanges();
233 tok.compactRanges();
234
235 //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
236
237 int[] result = new int[this.ranges.length+tok.ranges.length];
238 int wp = 0, src = 0, sub = 0;
239 while (src < this.ranges.length && sub < tok.ranges.length) {
240 int srcbegin = this.ranges[src];
241 int srcend = this.ranges[src+1];
242 int subbegin = tok.ranges[sub];
243 int subend = tok.ranges[sub+1];
244 if (srcend < subbegin) { // Not overlapped
245 // src: o-----o
246 // sub: o-----o
247 // res: o-----o
248 // Reuse sub
249 result[wp++] = this.ranges[src++];
250 result[wp++] = this.ranges[src++];
251 } else if (srcend >= subbegin
252 && srcbegin <= subend) { // Overlapped
253 // src: o--------o
254 // sub: o----o
255 // sub: o----o
256 // sub: o----o
257 // sub: o------------o
258 if (subbegin <= srcbegin && srcend <= subend) {
259 // src: o--------o
260 // sub: o------------o
261 // res: empty
262 // Reuse sub
263 src += 2;
264 } else if (subbegin <= srcbegin) {
265 // src: o--------o
266 // sub: o----o
267 // res: o-----o
268 // Reuse src(=res)
269 this.ranges[src] = subend+1;
270 sub += 2;
271 } else if (srcend <= subend) {
272 // src: o--------o
273 // sub: o----o
274 // res: o-----o
275 // Reuse sub
276 result[wp++] = srcbegin;
277 result[wp++] = subbegin-1;
278 src += 2;
279 } else {
280 // src: o--------o
281 // sub: o----o
282 // res: o-o o-o
283 // Reuse src(=right res)
284 result[wp++] = srcbegin;
285 result[wp++] = subbegin-1;
286 this.ranges[src] = subend+1;
287 sub += 2;
288 }
289 } else if (subend < srcbegin) {
290 // Not overlapped
291 // src: o-----o
292 // sub: o----o
293 sub += 2;
294 } else {
295 throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
296 +","+this.ranges[src+1]
297 +"] - ["+tok.ranges[sub]
298 +","+tok.ranges[sub+1]
299 +"]");
300 }
301 }
302 while (src < this.ranges.length) {
303 result[wp++] = this.ranges[src++];
304 result[wp++] = this.ranges[src++];
305 }
306 this.ranges = new int[wp];
307 System.arraycopy(result, 0, this.ranges, 0, wp);
308 // this.ranges is sorted and compacted.
309 }
310
311 /**
312 * @param tok Ignore whether it is NRANGE or not.
313 */
314 protected void intersectRanges(Token token) {
315 RangeToken tok = (RangeToken)token;
316 if (tok.ranges == null || this.ranges == null)
317 return;
318 this.icaseCache = null;
319 this.sortRanges();
320 this.compactRanges();
321 tok.sortRanges();
322 tok.compactRanges();
323
324 int[] result = new int[this.ranges.length+tok.ranges.length];
325 int wp = 0, src1 = 0, src2 = 0;
326 while (src1 < this.ranges.length && src2 < tok.ranges.length) {
327 int src1begin = this.ranges[src1];
328 int src1end = this.ranges[src1+1];
329 int src2begin = tok.ranges[src2];
330 int src2end = tok.ranges[src2+1];
331 if (src1end < src2begin) { // Not overlapped
332 // src1: o-----o
333 // src2: o-----o
334 // res: empty
335 // Reuse src2
336 src1 += 2;
337 } else if (src1end >= src2begin
338 && src1begin <= src2end) { // Overlapped
339 // src1: o--------o
340 // src2: o----o
341 // src2: o----o
342 // src2: o----o
343 // src2: o------------o
344 if (src2begin <= src2begin && src1end <= src2end) {
345 // src1: o--------o
346 // src2: o------------o
347 // res: o--------o
348 // Reuse src2
349 result[wp++] = src1begin;
350 result[wp++] = src1end;
351 src1 += 2;
352 } else if (src2begin <= src1begin) {
353 // src1: o--------o
354 // src2: o----o
355 // res: o--o
356 // Reuse the rest of src1
357 result[wp++] = src1begin;
358 result[wp++] = src2end;
359 this.ranges[src1] = src2end+1;
360 src2 += 2;
361 } else if (src1end <= src2end) {
362 // src1: o--------o
363 // src2: o----o
364 // res: o--o
365 // Reuse src2
366 result[wp++] = src2begin;
367 result[wp++] = src1end;
368 src1 += 2;
369 } else {
370 // src1: o--------o
371 // src2: o----o
372 // res: o----o
373 // Reuse the rest of src1
374 result[wp++] = src2begin;
375 result[wp++] = src2end;
376 this.ranges[src1] = src2end+1;
377 }
378 } else if (src2end < src1begin) {
379 // Not overlapped
380 // src1: o-----o
381 // src2: o----o
382 src2 += 2;
383 } else {
384 throw new RuntimeException("Token#intersectRanges(): Internal Error: ["
385 +this.ranges[src1]
386 +","+this.ranges[src1+1]
387 +"] & ["+tok.ranges[src2]
388 +","+tok.ranges[src2+1]
389 +"]");
390 }
391 }
392 while (src1 < this.ranges.length) {
393 result[wp++] = this.ranges[src1++];
394 result[wp++] = this.ranges[src1++];
395 }
396 this.ranges = new int[wp];
397 System.arraycopy(result, 0, this.ranges, 0, wp);
398 // this.ranges is sorted and compacted.
399 }
400
401 /**
402 * for RANGE: Creates complement.
403 * for NRANGE: Creates the same meaning RANGE.
404 */
405 static Token complementRanges(Token token) {
406 if (token.type != RANGE && token.type != NRANGE)
407 throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type);
408 RangeToken tok = (RangeToken)token;
409 tok.sortRanges();
410 tok.compactRanges();
411 int len = tok.ranges.length+2;
412 if (tok.ranges[0] == 0)
413 len -= 2;
414 int last = tok.ranges[tok.ranges.length-1];
415 if (last == UTF16_MAX)
416 len -= 2;
417 RangeToken ret = Token.createRange();
418 ret.ranges = new int[len];
419 int wp = 0;
420 if (tok.ranges[0] > 0) {
421 ret.ranges[wp++] = 0;
422 ret.ranges[wp++] = tok.ranges[0]-1;
423 }
424 for (int i = 1; i < tok.ranges.length-2; i += 2) {
425 ret.ranges[wp++] = tok.ranges[i]+1;
426 ret.ranges[wp++] = tok.ranges[i+1]-1;
427 }
428 if (last != UTF16_MAX) {
429 ret.ranges[wp++] = last+1;
430 ret.ranges[wp] = UTF16_MAX;
431 }
432 ret.setCompacted();
433 return ret;
434 }
435
436 synchronized RangeToken getCaseInsensitiveToken() {
437 if (this.icaseCache != null)
438 return this.icaseCache;
439
440 RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
441 for (int i = 0; i < this.ranges.length; i += 2) {
442 for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) {
443 if (ch > 0xffff)
444 uppers.addRange(ch, ch);
445 else {
446 char uch = Character.toUpperCase((char)ch);
447 uppers.addRange(uch, uch);
448 }
449 }
450 }
451 RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
452 for (int i = 0; i < uppers.ranges.length; i += 2) {
453 for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) {
454 if (ch > 0xffff)
455 lowers.addRange(ch, ch);
456 else {
457 char uch = Character.toUpperCase((char)ch);
458 lowers.addRange(uch, uch);
459 }
460 }
461 }
462 lowers.mergeRanges(uppers);
463 lowers.mergeRanges(this);
464 lowers.compactRanges();
465
466 this.icaseCache = lowers;
467 return lowers;
468 }
469
470 void dumpRanges() {
471 System.err.print("RANGE: ");
472 if (this.ranges == null)
473 System.err.println(" NULL");
474 for (int i = 0; i < this.ranges.length; i += 2) {
475 System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
476 }
477 System.err.println("");
478 }
479
480 boolean match(int ch) {
481 if (this.map == null) this.createMap();
482 boolean ret;
483 if (this.type == RANGE) {
484 if (ch < MAPSIZE)
485 return (this.map[ch/32] & (1<<(ch&0x1f))) != 0;
486 ret = false;
487 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
488 if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
489 return true;
490 }
491 } else {
492 if (ch < MAPSIZE)
493 return (this.map[ch/32] & (1<<(ch&0x1f))) == 0;
494 ret = true;
495 for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
496 if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
497 return false;
498 }
499 }
500 return ret;
501 }
502
503 private static final int MAPSIZE = 256;
504 private void createMap() {
505 int asize = MAPSIZE/32; // 32 is the number of bits in `int'.
506 // CHANGE(radup) we need a new map, since this is not synchronized
507 // and if we init the instance map with 0's it's going to be trouble
508 // -this.map = new int[asize];
509 // -this.nonMapIndex = this.ranges.length;
510 // -for (int i = 0; i < asize; i++) this.map[i] = 0;
511 int[] localmap = new int[asize]; // +
512 int localnonMapIndex = this.ranges.length; // +
513 for (int i = 0; i < asize; i ++) localmap[i] = 0; // + redundant
514 for (int i = 0; i < this.ranges.length; i += 2) {
515 int s = this.ranges[i];
516 int e = this.ranges[i+1];
517 if (s < MAPSIZE) {
518 for (int j = s; j <= e && j < MAPSIZE; j ++)
519 localmap[j/32] |= 1<<(j&0x1f); // s&0x1f : 0-31
520 } else {
521 localnonMapIndex = i;
522 break;
523 }
524 if (e >= MAPSIZE) {
525 localnonMapIndex = i;
526 break;
527 }
528 }
529 this.nonMapIndex = localnonMapIndex; // +
530 this.map = localmap; // +
531 //for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
532 }
533
534 public String toString(int options) {
535 String ret;
536 if (this.type == RANGE) {
537 if (this == Token.token_dot)
538 ret = ".";
539 else if (this == Token.token_0to9)
540 ret = "\\d";
541 else if (this == Token.token_wordchars)
542 ret = "\\w";
543 else if (this == Token.token_spaces)
544 ret = "\\s";
545 else {
546 StringBuffer sb = new StringBuffer();
547 sb.append("[");
548 for (int i = 0; i < this.ranges.length; i += 2) {
549 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
550 if (this.ranges[i] == this.ranges[i+1]) {
551 sb.append(escapeCharInCharClass(this.ranges[i]));
552 } else {
553 sb.append(escapeCharInCharClass(this.ranges[i]));
554 sb.append((char)'-');
555 sb.append(escapeCharInCharClass(this.ranges[i+1]));
556 }
557 }
558 sb.append("]");
559 ret = sb.toString();
560 }
561 } else {
562 if (this == Token.token_not_0to9)
563 ret = "\\D";
564 else if (this == Token.token_not_wordchars)
565 ret = "\\W";
566 else if (this == Token.token_not_spaces)
567 ret = "\\S";
568 else {
569 StringBuffer sb = new StringBuffer();
570 sb.append("[^");
571 for (int i = 0; i < this.ranges.length; i += 2) {
572 if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
573 if (this.ranges[i] == this.ranges[i+1]) {
574 sb.append(escapeCharInCharClass(this.ranges[i]));
575 } else {
576 sb.append(escapeCharInCharClass(this.ranges[i]));
577 sb.append('-');
578 sb.append(escapeCharInCharClass(this.ranges[i+1]));
579 }
580 }
581 sb.append("]");
582 ret = sb.toString();
583 }
584 }
585 return ret;
586 }
587
588 private static String escapeCharInCharClass(int ch) {
589 String ret;
590 switch (ch) {
591 case '[': case ']': case '-': case '^':
592 case ',': case '\\':
593 ret = "\\"+(char)ch;
594 break;
595 case '\f': ret = "\\f"; break;
596 case '\n': ret = "\\n"; break;
597 case '\r': ret = "\\r"; break;
598 case '\t': ret = "\\t"; break;
599 case 0x1b: ret = "\\e"; break;
600 //case 0x0b: ret = "\\v"; break;
601 default:
602 if (ch < 0x20) {
603 String pre = "0"+Integer.toHexString(ch);
604 ret = "\\x"+pre.substring(pre.length()-2, pre.length());
605 } else if (ch >= 0x10000) {
606 String pre = "0"+Integer.toHexString(ch);
607 ret = "\\v"+pre.substring(pre.length()-6, pre.length());
608 } else
609 ret = ""+(char)ch;
610 }
611 return ret;
612 }
613
614 }