1 /*
2 * Copyright 1999-2005 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26 /*
27 *
28 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
29 * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
30 *
31 * The original version of this source code and documentation
32 * is copyrighted and owned by Taligent, Inc., a wholly-owned
33 * subsidiary of IBM. These materials are provided under terms
34 * of a License Agreement between Taligent and Sun. This technology
35 * is protected by multiple US and International patents.
36 *
37 * This notice and attribution to Taligent may not be removed.
38 * Taligent is a registered trademark of Taligent, Inc.
39 */
40
41 package java.text;
42
43 import java.io.BufferedInputStream;
44 import java.io.IOException;
45 import java.security.AccessController;
46 import java.security.PrivilegedActionException;
47 import java.security.PrivilegedExceptionAction;
48 import java.util.Vector;
49 import java.util.Stack;
50 import java.util.Hashtable;
51 import java.util.Enumeration;
52 import java.util.MissingResourceException;
53 import java.text.CharacterIterator;
54 import java.text.StringCharacterIterator;
55 import sun.text.CompactByteArray;
56 import sun.text.SupplementaryCharacterData;
57
58 /**
59 * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
60 *
61 * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
62 * and <i>regular expressions.</i></p>
63 *
64 * <p>A substitution rule defines a name that can be used in place of an expression. It
65 * consists of a name, which is a string of characters contained in angle brackets, an equals
66 * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
67 * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
68 * square brackets. A substitution is visible after its definition, and is filled in using
69 * simple textual substitution. Substitution definitions can contain other substitutions, as
70 * long as those substitutions have been defined first. Substitutions are generally used to
71 * make the regular expressions (which can get quite complex) shorted and easier to read.
72 * They typically define either character categories or commonly-used subexpressions.</p>
73 *
74 * <p>There is one special substitution. If the description defines a substitution
75 * called "<ignore>", the expression must be a [] expression, and the
76 * expression defines a set of characters (the "<em>ignore characters</em>") that
77 * will be transparent to the BreakIterator. A sequence of characters will break the
78 * same way it would if any ignore characters it contains are taken out. Break
79 * positions never occur befoer ignore characters.</p>
80 *
81 * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
82 * defines a sequence of characters to be kept together. With one significant exception, the
83 * iterator uses a longest-possible-match algorithm when matching text to regular
84 * expressions. The iterator also treats descriptions containing multiple regular expressions
85 * as if they were ORed together (i.e., as if they were separated by |).</p>
86 *
87 * <p>The special characters recognized by the regular-expression parser are as follows:</p>
88 *
89 * <blockquote>
90 * <table border="1" width="100%">
91 * <tr>
92 * <td width="6%">*</td>
93 * <td width="94%">Specifies that the expression preceding the asterisk may occur any number
94 * of times (including not at all).</td>
95 * </tr>
96 * <tr>
97 * <td width="6%">{}</td>
98 * <td width="94%">Encloses a sequence of characters that is optional.</td>
99 * </tr>
100 * <tr>
101 * <td width="6%">()</td>
102 * <td width="94%">Encloses a sequence of characters. If followed by *, the sequence
103 * repeats. Otherwise, the parentheses are just a grouping device and a way to delimit
104 * the ends of expressions containing |.</td>
105 * </tr>
106 * <tr>
107 * <td width="6%">|</td>
108 * <td width="94%">Separates two alternative sequences of characters. Either one
109 * sequence or the other, but not both, matches this expression. The | character can
110 * only occur inside ().</td>
111 * </tr>
112 * <tr>
113 * <td width="6%">.</td>
114 * <td width="94%">Matches any character.</td>
115 * </tr>
116 * <tr>
117 * <td width="6%">*?</td>
118 * <td width="94%">Specifies a non-greedy asterisk. *? works the same way as *, except
119 * when there is overlap between the last group of characters in the expression preceding the
120 * * and the first group of characters following the *. When there is this kind of
121 * overlap, * will match the longest sequence of characters that match the expression before
122 * the *, and *? will match the shortest sequence of characters matching the expression
123 * before the *?. For example, if you have "xxyxyyyxyxyxxyxyxyy" in the text,
124 * "x[xy]*x" will match through to the last x (i.e., "<strong>xxyxyyyxyxyxxyxyx</strong>yy",
125 * but "x[xy]*?x" will only match the first two xes ("<strong>xx</strong>yxyyyxyxyxxyxyxyy").</td>
126 * </tr>
127 * <tr>
128 * <td width="6%">[]</td>
129 * <td width="94%">Specifies a group of alternative characters. A [] expression will
130 * match any single character that is specified in the [] expression. For more on the
131 * syntax of [] expressions, see below.</td>
132 * </tr>
133 * <tr>
134 * <td width="6%">/</td>
135 * <td width="94%">Specifies where the break position should go if text matches this
136 * expression. (e.g., "[a-z]*/[:Zs:]*[1-0]" will match if the iterator sees a run
137 * of letters, followed by a run of whitespace, followed by a digit, but the break position
138 * will actually go before the whitespace). Expressions that don't contain / put the
139 * break position at the end of the matching text.</td>
140 * </tr>
141 * <tr>
142 * <td width="6%">\</td>
143 * <td width="94%">Escape character. The \ itself is ignored, but causes the next
144 * character to be treated as literal character. This has no effect for many
145 * characters, but for the characters listed above, this deprives them of their special
146 * meaning. (There are no special escape sequences for Unicode characters, or tabs and
147 * newlines; these are all handled by a higher-level protocol. In a Java string,
148 * "\n" will be converted to a literal newline character by the time the
149 * regular-expression parser sees it. Of course, this means that \ sequences that are
150 * visible to the regexp parser must be written as \\ when inside a Java string.) All
151 * characters in the ASCII range except for letters, digits, and control characters are
152 * reserved characters to the parser and must be preceded by \ even if they currently don't
153 * mean anything.</td>
154 * </tr>
155 * <tr>
156 * <td width="6%">!</td>
157 * <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
158 * parser that this expression specifies the backwards-iteration behavior of the iterator,
159 * and not its normal iteration behavior. This is generally only used in situations
160 * where the automatically-generated backwards-iteration brhavior doesn't produce
161 * satisfactory results and must be supplemented with extra client-specified rules.</td>
162 * </tr>
163 * <tr>
164 * <td width="6%"><em>(all others)</em></td>
165 * <td width="94%">All other characters are treated as literal characters, which must match
166 * the corresponding character(s) in the text exactly.</td>
167 * </tr>
168 * </table>
169 * </blockquote>
170 *
171 * <p>Within a [] expression, a number of other special characters can be used to specify
172 * groups of characters:</p>
173 *
174 * <blockquote>
175 * <table border="1" width="100%">
176 * <tr>
177 * <td width="6%">-</td>
178 * <td width="94%">Specifies a range of matching characters. For example
179 * "[a-p]" matches all lowercase Latin letters from a to p (inclusive). The -
180 * sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
181 * language's alphabetical order: "[a-z]" doesn't include capital letters, nor does
182 * it include accented letters such as a-umlaut.</td>
183 * </tr>
184 * <tr>
185 * <td width="6%">::</td>
186 * <td width="94%">A pair of colons containing a one- or two-letter code matches all
187 * characters in the corresponding Unicode category. The two-letter codes are the same
188 * as the two-letter codes in the Unicode database (for example, "[:Sc::Sm:]"
189 * matches all currency symbols and all math symbols). Specifying a one-letter code is
190 * the same as specifying all two-letter codes that begin with that letter (for example,
191 * "[:L:]" matches all letters, and is equivalent to
192 * "[:Lu::Ll::Lo::Lm::Lt:]"). Anything other than a valid two-letter Unicode
193 * category code or a single letter that begins a Unicode category code is illegal within
194 * colons.</td>
195 * </tr>
196 * <tr>
197 * <td width="6%">[]</td>
198 * <td width="94%">[] expressions can nest. This has no effect, except when used in
199 * conjunction with the ^ token.</td>
200 * </tr>
201 * <tr>
202 * <td width="6%">^</td>
203 * <td width="94%">Excludes the character (or the characters in the [] expression) following
204 * it from the group of characters. For example, "[a-z^p]" matches all Latin
205 * lowercase letters except p. "[:L:^[\u4e00-\u9fff]]" matches all letters
206 * except the Han ideographs.</td>
207 * </tr>
208 * <tr>
209 * <td width="6%"><em>(all others)</em></td>
210 * <td width="94%">All other characters are treated as literal characters. (For
211 * example, "[aeiou]" specifies just the letters a, e, i, o, and u.)</td>
212 * </tr>
213 * </table>
214 * </blockquote>
215 *
216 * <p>For a more complete explanation, see <a
217 * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
218 * For examples, see the resource data (which is annotated).</p>
219 *
220 * @author Richard Gillam
221 */
222 class RuleBasedBreakIterator extends BreakIterator {
223
224 /**
225 * A token used as a character-category value to identify ignore characters
226 */
227 protected static final byte IGNORE = -1;
228
229 /**
230 * The state number of the starting state
231 */
232 private static final short START_STATE = 1;
233
234 /**
235 * The state-transition value indicating "stop"
236 */
237 private static final short STOP_STATE = 0;
238
239 /**
240 * Magic number for the BreakIterator data file format.
241 */
242 static final byte[] LABEL = {
243 (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a',
244 (byte)'\0'
245 };
246 static final int LABEL_LENGTH = LABEL.length;
247
248 /**
249 * Version number of the dictionary that was read in.
250 */
251 static final byte supportedVersion = 1;
252
253 /**
254 * Header size in byte count
255 */
256 private static final int HEADER_LENGTH = 36;
257
258 /**
259 * An array length of indices for BMP characters
260 */
261 private static final int BMP_INDICES_LENGTH = 512;
262
263 /**
264 * Tables that indexes from character values to character category numbers
265 */
266 private CompactByteArray charCategoryTable = null;
267 private SupplementaryCharacterData supplementaryCharCategoryTable = null;
268
269 /**
270 * The table of state transitions used for forward iteration
271 */
272 private short[] stateTable = null;
273
274 /**
275 * The table of state transitions used to sync up the iterator with the
276 * text in backwards and random-access iteration
277 */
278 private short[] backwardsStateTable = null;
279
280 /**
281 * A list of flags indicating which states in the state table are accepting
282 * ("end") states
283 */
284 private boolean[] endStates = null;
285
286 /**
287 * A list of flags indicating which states in the state table are
288 * lookahead states (states which turn lookahead on and off)
289 */
290 private boolean[] lookaheadStates = null;
291
292 /**
293 * A table for additional data. May be used by a subclass of
294 * RuleBasedBreakIterator.
295 */
296 private byte[] additionalData = null;
297
298 /**
299 * The number of character categories (and, thus, the number of columns in
300 * the state tables)
301 */
302 private int numCategories;
303
304 /**
305 * The character iterator through which this BreakIterator accesses the text
306 */
307 private CharacterIterator text = null;
308
309 /**
310 * A CRC32 value of all data in datafile
311 */
312 private long checksum;
313
314 //=======================================================================
315 // constructors
316 //=======================================================================
317
318 /**
319 * Constructs a RuleBasedBreakIterator according to the datafile
320 * provided.
321 */
322 public RuleBasedBreakIterator(String datafile)
323 throws IOException, MissingResourceException {
324 readTables(datafile);
325 }
326
327 /**
328 * Read datafile. The datafile's format is as follows:
329 * <pre>
330 * BreakIteratorData {
331 * u1 magic[7];
332 * u1 version;
333 * u4 totalDataSize;
334 * header_info header;
335 * body value;
336 * }
337 * </pre>
338 * <code>totalDataSize</code> is the summation of the size of
339 * <code>header_info</code> and <code>body</code> in byte count.
340 * <p>
341 * In <code>header</code>, each field except for checksum implies the
342 * length of each field. Since <code>BMPdataLength</code> is a fixed-length
343 * data(512 entries), its length isn't included in <code>header</code>.
344 * <code>checksum</code> is a CRC32 value of all in <code>body</code>.
345 * <pre>
346 * header_info {
347 * u4 stateTableLength;
348 * u4 backwardsStateTableLength;
349 * u4 endStatesLength;
350 * u4 lookaheadStatesLength;
351 * u4 BMPdataLength;
352 * u4 nonBMPdataLength;
353 * u4 additionalDataLength;
354 * u8 checksum;
355 * }
356 * </pre>
357 * <p>
358 *
359 * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to
360 * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to
361 * <code>supplementaryCharCategoryTable</code>.
362 * <pre>
363 * body {
364 * u2 stateTable[stateTableLength];
365 * u2 backwardsStateTable[backwardsStateTableLength];
366 * u1 endStates[endStatesLength];
367 * u1 lookaheadStates[lookaheadStatesLength];
368 * u2 BMPindices[512];
369 * u1 BMPdata[BMPdataLength];
370 * u4 nonBMPdata[numNonBMPdataLength];
371 * u1 additionalData[additionalDataLength];
372 * }
373 * </pre>
374 */
375 protected void readTables(String datafile)
376 throws IOException, MissingResourceException {
377
378 byte[] buffer = readFile(datafile);
379
380 /* Read header_info. */
381 int stateTableLength = BreakIterator.getInt(buffer, 0);
382 int backwardsStateTableLength = BreakIterator.getInt(buffer, 4);
383 int endStatesLength = BreakIterator.getInt(buffer, 8);
384 int lookaheadStatesLength = BreakIterator.getInt(buffer, 12);
385 int BMPdataLength = BreakIterator.getInt(buffer, 16);
386 int nonBMPdataLength = BreakIterator.getInt(buffer, 20);
387 int additionalDataLength = BreakIterator.getInt(buffer, 24);
388 checksum = BreakIterator.getLong(buffer, 28);
389
390 /* Read stateTable[numCategories * numRows] */
391 stateTable = new short[stateTableLength];
392 int offset = HEADER_LENGTH;
393 for (int i = 0; i < stateTableLength; i++, offset+=2) {
394 stateTable[i] = BreakIterator.getShort(buffer, offset);
395 }
396
397 /* Read backwardsStateTable[numCategories * numRows] */
398 backwardsStateTable = new short[backwardsStateTableLength];
399 for (int i = 0; i < backwardsStateTableLength; i++, offset+=2) {
400 backwardsStateTable[i] = BreakIterator.getShort(buffer, offset);
401 }
402
403 /* Read endStates[numRows] */
404 endStates = new boolean[endStatesLength];
405 for (int i = 0; i < endStatesLength; i++, offset++) {
406 endStates[i] = buffer[offset] == 1;
407 }
408
409 /* Read lookaheadStates[numRows] */
410 lookaheadStates = new boolean[lookaheadStatesLength];
411 for (int i = 0; i < lookaheadStatesLength; i++, offset++) {
412 lookaheadStates[i] = buffer[offset] == 1;
413 }
414
415 /* Read a category table and indices for BMP characters. */
416 short[] temp1 = new short[BMP_INDICES_LENGTH]; // BMPindices
417 for (int i = 0; i < BMP_INDICES_LENGTH; i++, offset+=2) {
418 temp1[i] = BreakIterator.getShort(buffer, offset);
419 }
420 byte[] temp2 = new byte[BMPdataLength]; // BMPdata
421 System.arraycopy(buffer, offset, temp2, 0, BMPdataLength);
422 offset += BMPdataLength;
423 charCategoryTable = new CompactByteArray(temp1, temp2);
424
425 /* Read a category table for non-BMP characters. */
426 int[] temp3 = new int[nonBMPdataLength];
427 for (int i = 0; i < nonBMPdataLength; i++, offset+=4) {
428 temp3[i] = BreakIterator.getInt(buffer, offset);
429 }
430 supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3);
431
432 /* Read additional data */
433 if (additionalDataLength > 0) {
434 additionalData = new byte[additionalDataLength];
435 System.arraycopy(buffer, offset, additionalData, 0, additionalDataLength);
436 }
437
438 /* Set numCategories */
439 numCategories = stateTable.length / endStates.length;
440 }
441
442 protected byte[] readFile(final String datafile)
443 throws IOException, MissingResourceException {
444
445 BufferedInputStream is;
446 try {
447 is = (BufferedInputStream)AccessController.doPrivileged(
448 new PrivilegedExceptionAction() {
449 public Object run() throws Exception {
450 return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + datafile));
451 }
452 }
453 );
454 }
455 catch (PrivilegedActionException e) {
456 throw new InternalError(e.toString());
457 }
458
459 int offset = 0;
460
461 /* First, read magic, version, and header_info. */
462 int len = LABEL_LENGTH + 5;
463 byte[] buf = new byte[len];
464 if (is.read(buf) != len) {
465 throw new MissingResourceException("Wrong header length",
466 datafile, "");
467 }
468
469 /* Validate the magic number. */
470 for (int i = 0; i < LABEL_LENGTH; i++, offset++) {
471 if (buf[offset] != LABEL[offset]) {
472 throw new MissingResourceException("Wrong magic number",
473 datafile, "");
474 }
475 }
476
477 /* Validate the version number. */
478 if (buf[offset] != supportedVersion) {
479 throw new MissingResourceException("Unsupported version(" + buf[offset] + ")",
480 datafile, "");
481 }
482
483 /* Read data: totalDataSize + 8(for checksum) */
484 len = BreakIterator.getInt(buf, ++offset);
485 buf = new byte[len];
486 if (is.read(buf) != len) {
487 throw new MissingResourceException("Wrong data length",
488 datafile, "");
489 }
490
491 is.close();
492
493 return buf;
494 }
495
496 byte[] getAdditionalData() {
497 return additionalData;
498 }
499
500 void setAdditionalData(byte[] b) {
501 additionalData = b;
502 }
503
504 //=======================================================================
505 // boilerplate
506 //=======================================================================
507 /**
508 * Clones this iterator.
509 * @return A newly-constructed RuleBasedBreakIterator with the same
510 * behavior as this one.
511 */
512 public Object clone() {
513 RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone();
514 if (text != null) {
515 result.text = (CharacterIterator) text.clone();
516 }
517 return result;
518 }
519
520 /**
521 * Returns true if both BreakIterators are of the same class, have the same
522 * rules, and iterate over the same text.
523 */
524 public boolean equals(Object that) {
525 try {
526 if (that == null) {
527 return false;
528 }
529
530 RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
531 if (checksum != other.checksum) {
532 return false;
533 }
534 if (text == null) {
535 return other.text == null;
536 } else {
537 return text.equals(other.text);
538 }
539 }
540 catch(ClassCastException e) {
541 return false;
542 }
543 }
544
545 /**
546 * Returns text
547 */
548 public String toString() {
549 StringBuffer sb = new StringBuffer();
550 sb.append('[');
551 sb.append("checksum=0x" + Long.toHexString(checksum));
552 sb.append(']');
553 return sb.toString();
554 }
555
556 /**
557 * Compute a hashcode for this BreakIterator
558 * @return A hash code
559 */
560 public int hashCode() {
561 return (int)checksum;
562 }
563
564 //=======================================================================
565 // BreakIterator overrides
566 //=======================================================================
567
568 /**
569 * Sets the current iteration position to the beginning of the text.
570 * (i.e., the CharacterIterator's starting offset).
571 * @return The offset of the beginning of the text.
572 */
573 public int first() {
574 CharacterIterator t = getText();
575
576 t.first();
577 return t.getIndex();
578 }
579
580 /**
581 * Sets the current iteration position to the end of the text.
582 * (i.e., the CharacterIterator's ending offset).
583 * @return The text's past-the-end offset.
584 */
585 public int last() {
586 CharacterIterator t = getText();
587
588 // I'm not sure why, but t.last() returns the offset of the last character,
589 // rather than the past-the-end offset
590 t.setIndex(t.getEndIndex());
591 return t.getIndex();
592 }
593
594 /**
595 * Advances the iterator either forward or backward the specified number of steps.
596 * Negative values move backward, and positive values move forward. This is
597 * equivalent to repeatedly calling next() or previous().
598 * @param n The number of steps to move. The sign indicates the direction
599 * (negative is backwards, and positive is forwards).
600 * @return The character offset of the boundary position n boundaries away from
601 * the current one.
602 */
603 public int next(int n) {
604 int result = current();
605 while (n > 0) {
606 result = handleNext();
607 --n;
608 }
609 while (n < 0) {
610 result = previous();
611 ++n;
612 }
613 return result;
614 }
615
616 /**
617 * Advances the iterator to the next boundary position.
618 * @return The position of the first boundary after this one.
619 */
620 public int next() {
621 return handleNext();
622 }
623
624 /**
625 * Advances the iterator backwards, to the last boundary preceding this one.
626 * @return The position of the last boundary position preceding this one.
627 */
628 public int previous() {
629 // if we're already sitting at the beginning of the text, return DONE
630 CharacterIterator text = getText();
631 if (current() == text.getBeginIndex()) {
632 return BreakIterator.DONE;
633 }
634
635 // set things up. handlePrevious() will back us up to some valid
636 // break position before the current position (we back our internal
637 // iterator up one step to prevent handlePrevious() from returning
638 // the current position), but not necessarily the last one before
639 // where we started
640 int start = current();
641 getPrevious();
642 int lastResult = handlePrevious();
643 int result = lastResult;
644
645 // iterate forward from the known break position until we pass our
646 // starting point. The last break position before the starting
647 // point is our return value
648 while (result != BreakIterator.DONE && result < start) {
649 lastResult = result;
650 result = handleNext();
651 }
652
653 // set the current iteration position to be the last break position
654 // before where we started, and then return that value
655 text.setIndex(lastResult);
656 return lastResult;
657 }
658
659 /**
660 * Returns previous character
661 */
662 private int getPrevious() {
663 char c2 = text.previous();
664 if (Character.isLowSurrogate(c2) &&
665 text.getIndex() > text.getBeginIndex()) {
666 char c1 = text.previous();
667 if (Character.isHighSurrogate(c1)) {
668 return Character.toCodePoint(c1, c2);
669 } else {
670 text.next();
671 }
672 }
673 return (int)c2;
674 }
675
676 /**
677 * Returns current character
678 */
679 int getCurrent() {
680 char c1 = text.current();
681 if (Character.isHighSurrogate(c1) &&
682 text.getIndex() < text.getEndIndex()) {
683 char c2 = text.next();
684 text.previous();
685 if (Character.isLowSurrogate(c2)) {
686 return Character.toCodePoint(c1, c2);
687 }
688 }
689 return (int)c1;
690 }
691
692 /**
693 * Returns the count of next character.
694 */
695 private int getCurrentCodePointCount() {
696 char c1 = text.current();
697 if (Character.isHighSurrogate(c1) &&
698 text.getIndex() < text.getEndIndex()) {
699 char c2 = text.next();
700 text.previous();
701 if (Character.isLowSurrogate(c2)) {
702 return 2;
703 }
704 }
705 return 1;
706 }
707
708 /**
709 * Returns next character
710 */
711 int getNext() {
712 int index = text.getIndex();
713 int endIndex = text.getEndIndex();
714 if (index == endIndex ||
715 (index = index + getCurrentCodePointCount()) >= endIndex) {
716 return CharacterIterator.DONE;
717 }
718 text.setIndex(index);
719 return getCurrent();
720 }
721
722 /**
723 * Returns the position of next character.
724 */
725 private int getNextIndex() {
726 int index = text.getIndex() + getCurrentCodePointCount();
727 int endIndex = text.getEndIndex();
728 if (index > endIndex) {
729 return endIndex;
730 } else {
731 return index;
732 }
733 }
734
735 /**
736 * Throw IllegalArgumentException unless begin <= offset < end.
737 */
738 protected static final void checkOffset(int offset, CharacterIterator text) {
739 if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
740 throw new IllegalArgumentException("offset out of bounds");
741 }
742 }
743
744 /**
745 * Sets the iterator to refer to the first boundary position following
746 * the specified position.
747 * @offset The position from which to begin searching for a break position.
748 * @return The position of the first break after the current position.
749 */
750 public int following(int offset) {
751
752 CharacterIterator text = getText();
753 checkOffset(offset, text);
754
755 // Set our internal iteration position (temporarily)
756 // to the position passed in. If this is the _beginning_ position,
757 // then we can just use next() to get our return value
758 text.setIndex(offset);
759 if (offset == text.getBeginIndex()) {
760 return handleNext();
761 }
762
763 // otherwise, we have to sync up first. Use handlePrevious() to back
764 // us up to a known break position before the specified position (if
765 // we can determine that the specified position is a break position,
766 // we don't back up at all). This may or may not be the last break
767 // position at or before our starting position. Advance forward
768 // from here until we've passed the starting position. The position
769 // we stop on will be the first break position after the specified one.
770 int result = handlePrevious();
771 while (result != BreakIterator.DONE && result <= offset) {
772 result = handleNext();
773 }
774 return result;
775 }
776
777 /**
778 * Sets the iterator to refer to the last boundary position before the
779 * specified position.
780 * @offset The position to begin searching for a break from.
781 * @return The position of the last boundary before the starting position.
782 */
783 public int preceding(int offset) {
784 // if we start by updating the current iteration position to the
785 // position specified by the caller, we can just use previous()
786 // to carry out this operation
787 CharacterIterator text = getText();
788 checkOffset(offset, text);
789 text.setIndex(offset);
790 return previous();
791 }
792
793 /**
794 * Returns true if the specfied position is a boundary position. As a side
795 * effect, leaves the iterator pointing to the first boundary position at
796 * or after "offset".
797 * @param offset the offset to check.
798 * @return True if "offset" is a boundary position.
799 */
800 public boolean isBoundary(int offset) {
801 CharacterIterator text = getText();
802 checkOffset(offset, text);
803 if (offset == text.getBeginIndex()) {
804 return true;
805 }
806
807 // to check whether this is a boundary, we can use following() on the
808 // position before the specified one and return true if the position we
809 // get back is the one the user specified
810 else {
811 return following(offset - 1) == offset;
812 }
813 }
814
815 /**
816 * Returns the current iteration position.
817 * @return The current iteration position.
818 */
819 public int current() {
820 return getText().getIndex();
821 }
822
823 /**
824 * Return a CharacterIterator over the text being analyzed. This version
825 * of this method returns the actual CharacterIterator we're using internally.
826 * Changing the state of this iterator can have undefined consequences. If
827 * you need to change it, clone it first.
828 * @return An iterator over the text being analyzed.
829 */
830 public CharacterIterator getText() {
831 // The iterator is initialized pointing to no text at all, so if this
832 // function is called while we're in that state, we have to fudge an
833 // iterator to return.
834 if (text == null) {
835 text = new StringCharacterIterator("");
836 }
837 return text;
838 }
839
840 /**
841 * Set the iterator to analyze a new piece of text. This function resets
842 * the current iteration position to the beginning of the text.
843 * @param newText An iterator over the text to analyze.
844 */
845 public void setText(CharacterIterator newText) {
846 // Test iterator to see if we need to wrap it in a SafeCharIterator.
847 // The correct behavior for CharacterIterators is to allow the
848 // position to be set to the endpoint of the iterator. Many
849 // CharacterIterators do not uphold this, so this is a workaround
850 // to permit them to use this class.
851 int end = newText.getEndIndex();
852 boolean goodIterator;
853 try {
854 newText.setIndex(end); // some buggy iterators throw an exception here
855 goodIterator = newText.getIndex() == end;
856 }
857 catch(IllegalArgumentException e) {
858 goodIterator = false;
859 }
860
861 if (goodIterator) {
862 text = newText;
863 }
864 else {
865 text = new SafeCharIterator(newText);
866 }
867 text.first();
868 }
869
870
871 //=======================================================================
872 // implementation
873 //=======================================================================
874
875 /**
876 * This method is the actual implementation of the next() method. All iteration
877 * vectors through here. This method initializes the state machine to state 1
878 * and advances through the text character by character until we reach the end
879 * of the text or the state machine transitions to state 0. We update our return
880 * value every time the state machine passes through a possible end state.
881 */
882 protected int handleNext() {
883 // if we're already at the end of the text, return DONE.
884 CharacterIterator text = getText();
885 if (text.getIndex() == text.getEndIndex()) {
886 return BreakIterator.DONE;
887 }
888
889 // no matter what, we always advance at least one character forward
890 int result = getNextIndex();
891 int lookaheadResult = 0;
892
893 // begin in state 1
894 int state = START_STATE;
895 int category;
896 int c = getCurrent();
897
898 // loop until we reach the end of the text or transition to state 0
899 while (c != CharacterIterator.DONE && state != STOP_STATE) {
900
901 // look up the current character's character category (which tells us
902 // which column in the state table to look at)
903 category = lookupCategory(c);
904
905 // if the character isn't an ignore character, look up a state
906 // transition in the state table
907 if (category != IGNORE) {
908 state = lookupState(state, category);
909 }
910
911 // if the state we've just transitioned to is a lookahead state,
912 // (but not also an end state), save its position. If it's
913 // both a lookahead state and an end state, update the break position
914 // to the last saved lookup-state position
915 if (lookaheadStates[state]) {
916 if (endStates[state]) {
917 result = lookaheadResult;
918 }
919 else {
920 lookaheadResult = getNextIndex();
921 }
922 }
923
924 // otherwise, if the state we've just transitioned to is an accepting
925 // state, update the break position to be the current iteration position
926 else {
927 if (endStates[state]) {
928 result = getNextIndex();
929 }
930 }
931
932 c = getNext();
933 }
934
935 // if we've run off the end of the text, and the very last character took us into
936 // a lookahead state, advance the break position to the lookahead position
937 // (the theory here is that if there are no characters at all after the lookahead
938 // position, that always matches the lookahead criteria)
939 if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
940 result = lookaheadResult;
941 }
942
943 text.setIndex(result);
944 return result;
945 }
946
947 /**
948 * This method backs the iterator back up to a "safe position" in the text.
949 * This is a position that we know, without any context, must be a break position.
950 * The various calling methods then iterate forward from this safe position to
951 * the appropriate position to return. (For more information, see the description
952 * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
953 */
954 protected int handlePrevious() {
955 CharacterIterator text = getText();
956 int state = START_STATE;
957 int category = 0;
958 int lastCategory = 0;
959 int c = getCurrent();
960
961 // loop until we reach the beginning of the text or transition to state 0
962 while (c != CharacterIterator.DONE && state != STOP_STATE) {
963
964 // save the last character's category and look up the current
965 // character's category
966 lastCategory = category;
967 category = lookupCategory(c);
968
969 // if the current character isn't an ignore character, look up a
970 // state transition in the backwards state table
971 if (category != IGNORE) {
972 state = lookupBackwardState(state, category);
973 }
974
975 // then advance one character backwards
976 c = getPrevious();
977 }
978
979 // if we didn't march off the beginning of the text, we're either one or two
980 // positions away from the real break position. (One because of the call to
981 // previous() at the end of the loop above, and another because the character
982 // that takes us into the stop state will always be the character BEFORE
983 // the break position.)
984 if (c != CharacterIterator.DONE) {
985 if (lastCategory != IGNORE) {
986 getNext();
987 getNext();
988 }
989 else {
990 getNext();
991 }
992 }
993 return text.getIndex();
994 }
995
996 /**
997 * Looks up a character's category (i.e., its category for breaking purposes,
998 * not its Unicode category)
999 */
1000 protected int lookupCategory(int c) {
1001 if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
1002 return charCategoryTable.elementAt((char)c);
1003 } else {
1004 return supplementaryCharCategoryTable.getValue(c);
1005 }
1006 }
1007
1008 /**
1009 * Given a current state and a character category, looks up the
1010 * next state to transition to in the state table.
1011 */
1012 protected int lookupState(int state, int category) {
1013 return stateTable[state * numCategories + category];
1014 }
1015
1016 /**
1017 * Given a current state and a character category, looks up the
1018 * next state to transition to in the backwards state table.
1019 */
1020 protected int lookupBackwardState(int state, int category) {
1021 return backwardsStateTable[state * numCategories + category];
1022 }
1023
1024 /*
1025 * This class exists to work around a bug in incorrect implementations
1026 * of CharacterIterator, which incorrectly handle setIndex(endIndex).
1027 * This iterator relies only on base.setIndex(n) where n is less than
1028 * endIndex.
1029 *
1030 * One caveat: if the base iterator's begin and end indices change
1031 * the change will not be reflected by this wrapper. Does that matter?
1032 */
1033 private static final class SafeCharIterator implements CharacterIterator,
1034 Cloneable {
1035
1036 private CharacterIterator base;
1037 private int rangeStart;
1038 private int rangeLimit;
1039 private int currentIndex;
1040
1041 SafeCharIterator(CharacterIterator base) {
1042 this.base = base;
1043 this.rangeStart = base.getBeginIndex();
1044 this.rangeLimit = base.getEndIndex();
1045 this.currentIndex = base.getIndex();
1046 }
1047
1048 public char first() {
1049 return setIndex(rangeStart);
1050 }
1051
1052 public char last() {
1053 return setIndex(rangeLimit - 1);
1054 }
1055
1056 public char current() {
1057 if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
1058 return DONE;
1059 }
1060 else {
1061 return base.setIndex(currentIndex);
1062 }
1063 }
1064
1065 public char next() {
1066
1067 currentIndex++;
1068 if (currentIndex >= rangeLimit) {
1069 currentIndex = rangeLimit;
1070 return DONE;
1071 }
1072 else {
1073 return base.setIndex(currentIndex);
1074 }
1075 }
1076
1077 public char previous() {
1078
1079 currentIndex--;
1080 if (currentIndex < rangeStart) {
1081 currentIndex = rangeStart;
1082 return DONE;
1083 }
1084 else {
1085 return base.setIndex(currentIndex);
1086 }
1087 }
1088
1089 public char setIndex(int i) {
1090
1091 if (i < rangeStart || i > rangeLimit) {
1092 throw new IllegalArgumentException("Invalid position");
1093 }
1094 currentIndex = i;
1095 return current();
1096 }
1097
1098 public int getBeginIndex() {
1099 return rangeStart;
1100 }
1101
1102 public int getEndIndex() {
1103 return rangeLimit;
1104 }
1105
1106 public int getIndex() {
1107 return currentIndex;
1108 }
1109
1110 public Object clone() {
1111
1112 SafeCharIterator copy = null;
1113 try {
1114 copy = (SafeCharIterator) super.clone();
1115 }
1116 catch(CloneNotSupportedException e) {
1117 throw new Error("Clone not supported: " + e);
1118 }
1119
1120 CharacterIterator copyOfBase = (CharacterIterator) base.clone();
1121 copy.base = copyOfBase;
1122 return copy;
1123 }
1124 }
1125 }