Save This Page
Home » openjdk-7 » java » text » [javadoc | source]
    1   /*
    2    * Copyright 1999-2005 Sun Microsystems, Inc.  All Rights Reserved.
    3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    4    *
    5    * This code is free software; you can redistribute it and/or modify it
    6    * under the terms of the GNU General Public License version 2 only, as
    7    * published by the Free Software Foundation.  Sun designates this
    8    * particular file as subject to the "Classpath" exception as provided
    9    * by Sun in the LICENSE file that accompanied this code.
   10    *
   11    * This code is distributed in the hope that it will be useful, but WITHOUT
   12    * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   13    * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   14    * version 2 for more details (a copy is included in the LICENSE file that
   15    * accompanied this code).
   16    *
   17    * You should have received a copy of the GNU General Public License version
   18    * 2 along with this work; if not, write to the Free Software Foundation,
   19    * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   20    *
   21    * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
   22    * CA 95054 USA or visit www.sun.com if you need additional information or
   23    * have any questions.
   24    */
   25   
   26   /*
   27    *
   28    * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
   29    * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
   30    *
   31    * The original version of this source code and documentation
   32    * is copyrighted and owned by Taligent, Inc., a wholly-owned
   33    * subsidiary of IBM. These materials are provided under terms
   34    * of a License Agreement between Taligent and Sun. This technology
   35    * is protected by multiple US and International patents.
   36    *
   37    * This notice and attribution to Taligent may not be removed.
   38    * Taligent is a registered trademark of Taligent, Inc.
   39    */
   40   
   41   package java.text;
   42   
   43   import java.io.BufferedInputStream;
   44   import java.io.IOException;
   45   import java.security.AccessController;
   46   import java.security.PrivilegedActionException;
   47   import java.security.PrivilegedExceptionAction;
   48   import java.util.Vector;
   49   import java.util.Stack;
   50   import java.util.Hashtable;
   51   import java.util.Enumeration;
   52   import java.util.MissingResourceException;
   53   import java.text.CharacterIterator;
   54   import java.text.StringCharacterIterator;
   55   import sun.text.CompactByteArray;
   56   import sun.text.SupplementaryCharacterData;
   57   
   58   /**
   59    * <p>A subclass of BreakIterator whose behavior is specified using a list of rules.</p>
   60    *
   61    * <p>There are two kinds of rules, which are separated by semicolons: <i>substitutions</i>
   62    * and <i>regular expressions.</i></p>
   63    *
   64    * <p>A substitution rule defines a name that can be used in place of an expression. It
   65    * consists of a name, which is a string of characters contained in angle brackets, an equals
   66    * sign, and an expression. (There can be no whitespace on either side of the equals sign.)
   67    * To keep its syntactic meaning intact, the expression must be enclosed in parentheses or
   68    * square brackets. A substitution is visible after its definition, and is filled in using
   69    * simple textual substitution. Substitution definitions can contain other substitutions, as
   70    * long as those substitutions have been defined first. Substitutions are generally used to
   71    * make the regular expressions (which can get quite complex) shorted and easier to read.
   72    * They typically define either character categories or commonly-used subexpressions.</p>
   73    *
   74    * <p>There is one special substitution.&nbsp; If the description defines a substitution
   75    * called &quot;&lt;ignore&gt;&quot;, the expression must be a [] expression, and the
   76    * expression defines a set of characters (the &quot;<em>ignore characters</em>&quot;) that
   77    * will be transparent to the BreakIterator.&nbsp; A sequence of characters will break the
   78    * same way it would if any ignore characters it contains are taken out.&nbsp; Break
   79    * positions never occur befoer ignore characters.</p>
   80    *
   81    * <p>A regular expression uses a subset of the normal Unix regular-expression syntax, and
   82    * defines a sequence of characters to be kept together. With one significant exception, the
   83    * iterator uses a longest-possible-match algorithm when matching text to regular
   84    * expressions. The iterator also treats descriptions containing multiple regular expressions
   85    * as if they were ORed together (i.e., as if they were separated by |).</p>
   86    *
   87    * <p>The special characters recognized by the regular-expression parser are as follows:</p>
   88    *
   89    * <blockquote>
   90    *   <table border="1" width="100%">
   91    *     <tr>
   92    *       <td width="6%">*</td>
   93    *       <td width="94%">Specifies that the expression preceding the asterisk may occur any number
   94    *       of times (including not at all).</td>
   95    *     </tr>
   96    *     <tr>
   97    *       <td width="6%">{}</td>
   98    *       <td width="94%">Encloses a sequence of characters that is optional.</td>
   99    *     </tr>
  100    *     <tr>
  101    *       <td width="6%">()</td>
  102    *       <td width="94%">Encloses a sequence of characters.&nbsp; If followed by *, the sequence
  103    *       repeats.&nbsp; Otherwise, the parentheses are just a grouping device and a way to delimit
  104    *       the ends of expressions containing |.</td>
  105    *     </tr>
  106    *     <tr>
  107    *       <td width="6%">|</td>
  108    *       <td width="94%">Separates two alternative sequences of characters.&nbsp; Either one
  109    *       sequence or the other, but not both, matches this expression.&nbsp; The | character can
  110    *       only occur inside ().</td>
  111    *     </tr>
  112    *     <tr>
  113    *       <td width="6%">.</td>
  114    *       <td width="94%">Matches any character.</td>
  115    *     </tr>
  116    *     <tr>
  117    *       <td width="6%">*?</td>
  118    *       <td width="94%">Specifies a non-greedy asterisk.&nbsp; *? works the same way as *, except
  119    *       when there is overlap between the last group of characters in the expression preceding the
  120    *       * and the first group of characters following the *.&nbsp; When there is this kind of
  121    *       overlap, * will match the longest sequence of characters that match the expression before
  122    *       the *, and *? will match the shortest sequence of characters matching the expression
  123    *       before the *?.&nbsp; For example, if you have &quot;xxyxyyyxyxyxxyxyxyy&quot; in the text,
  124    *       &quot;x[xy]*x&quot; will match through to the last x (i.e., &quot;<strong>xxyxyyyxyxyxxyxyx</strong>yy&quot;,
  125    *       but &quot;x[xy]*?x&quot; will only match the first two xes (&quot;<strong>xx</strong>yxyyyxyxyxxyxyxyy&quot;).</td>
  126    *     </tr>
  127    *     <tr>
  128    *       <td width="6%">[]</td>
  129    *       <td width="94%">Specifies a group of alternative characters.&nbsp; A [] expression will
  130    *       match any single character that is specified in the [] expression.&nbsp; For more on the
  131    *       syntax of [] expressions, see below.</td>
  132    *     </tr>
  133    *     <tr>
  134    *       <td width="6%">/</td>
  135    *       <td width="94%">Specifies where the break position should go if text matches this
  136    *       expression.&nbsp; (e.g., &quot;[a-z]&#42;/[:Zs:]*[1-0]&quot; will match if the iterator sees a run
  137    *       of letters, followed by a run of whitespace, followed by a digit, but the break position
  138    *       will actually go before the whitespace).&nbsp; Expressions that don't contain / put the
  139    *       break position at the end of the matching text.</td>
  140    *     </tr>
  141    *     <tr>
  142    *       <td width="6%">\</td>
  143    *       <td width="94%">Escape character.&nbsp; The \ itself is ignored, but causes the next
  144    *       character to be treated as literal character.&nbsp; This has no effect for many
  145    *       characters, but for the characters listed above, this deprives them of their special
  146    *       meaning.&nbsp; (There are no special escape sequences for Unicode characters, or tabs and
  147    *       newlines; these are all handled by a higher-level protocol.&nbsp; In a Java string,
  148    *       &quot;\n&quot; will be converted to a literal newline character by the time the
  149    *       regular-expression parser sees it.&nbsp; Of course, this means that \ sequences that are
  150    *       visible to the regexp parser must be written as \\ when inside a Java string.)&nbsp; All
  151    *       characters in the ASCII range except for letters, digits, and control characters are
  152    *       reserved characters to the parser and must be preceded by \ even if they currently don't
  153    *       mean anything.</td>
  154    *     </tr>
  155    *     <tr>
  156    *       <td width="6%">!</td>
  157    *       <td width="94%">If ! appears at the beginning of a regular expression, it tells the regexp
  158    *       parser that this expression specifies the backwards-iteration behavior of the iterator,
  159    *       and not its normal iteration behavior.&nbsp; This is generally only used in situations
  160    *       where the automatically-generated backwards-iteration brhavior doesn't produce
  161    *       satisfactory results and must be supplemented with extra client-specified rules.</td>
  162    *     </tr>
  163    *     <tr>
  164    *       <td width="6%"><em>(all others)</em></td>
  165    *       <td width="94%">All other characters are treated as literal characters, which must match
  166    *       the corresponding character(s) in the text exactly.</td>
  167    *     </tr>
  168    *   </table>
  169    * </blockquote>
  170    *
  171    * <p>Within a [] expression, a number of other special characters can be used to specify
  172    * groups of characters:</p>
  173    *
  174    * <blockquote>
  175    *   <table border="1" width="100%">
  176    *     <tr>
  177    *       <td width="6%">-</td>
  178    *       <td width="94%">Specifies a range of matching characters.&nbsp; For example
  179    *       &quot;[a-p]&quot; matches all lowercase Latin letters from a to p (inclusive).&nbsp; The -
  180    *       sign specifies ranges of continuous Unicode numeric values, not ranges of characters in a
  181    *       language's alphabetical order: &quot;[a-z]&quot; doesn't include capital letters, nor does
  182    *       it include accented letters such as a-umlaut.</td>
  183    *     </tr>
  184    *     <tr>
  185    *       <td width="6%">::</td>
  186    *       <td width="94%">A pair of colons containing a one- or two-letter code matches all
  187    *       characters in the corresponding Unicode category.&nbsp; The two-letter codes are the same
  188    *       as the two-letter codes in the Unicode database (for example, &quot;[:Sc::Sm:]&quot;
  189    *       matches all currency symbols and all math symbols).&nbsp; Specifying a one-letter code is
  190    *       the same as specifying all two-letter codes that begin with that letter (for example,
  191    *       &quot;[:L:]&quot; matches all letters, and is equivalent to
  192    *       &quot;[:Lu::Ll::Lo::Lm::Lt:]&quot;).&nbsp; Anything other than a valid two-letter Unicode
  193    *       category code or a single letter that begins a Unicode category code is illegal within
  194    *       colons.</td>
  195    *     </tr>
  196    *     <tr>
  197    *       <td width="6%">[]</td>
  198    *       <td width="94%">[] expressions can nest.&nbsp; This has no effect, except when used in
  199    *       conjunction with the ^ token.</td>
  200    *     </tr>
  201    *     <tr>
  202    *       <td width="6%">^</td>
  203    *       <td width="94%">Excludes the character (or the characters in the [] expression) following
  204    *       it from the group of characters.&nbsp; For example, &quot;[a-z^p]&quot; matches all Latin
  205    *       lowercase letters except p.&nbsp; &quot;[:L:^[&#92;u4e00-&#92;u9fff]]&quot; matches all letters
  206    *       except the Han ideographs.</td>
  207    *     </tr>
  208    *     <tr>
  209    *       <td width="6%"><em>(all others)</em></td>
  210    *       <td width="94%">All other characters are treated as literal characters.&nbsp; (For
  211    *       example, &quot;[aeiou]&quot; specifies just the letters a, e, i, o, and u.)</td>
  212    *     </tr>
  213    *   </table>
  214    * </blockquote>
  215    *
  216    * <p>For a more complete explanation, see <a
  217    * href="http://www.ibm.com/java/education/boundaries/boundaries.html">http://www.ibm.com/java/education/boundaries/boundaries.html</a>.
  218    * &nbsp; For examples, see the resource data (which is annotated).</p>
  219    *
  220    * @author Richard Gillam
  221    */
  222   class RuleBasedBreakIterator extends BreakIterator {
  223   
  224       /**
  225        * A token used as a character-category value to identify ignore characters
  226        */
  227       protected static final byte IGNORE = -1;
  228   
  229       /**
  230        * The state number of the starting state
  231        */
  232       private static final short START_STATE = 1;
  233   
  234       /**
  235        * The state-transition value indicating "stop"
  236        */
  237       private static final short STOP_STATE = 0;
  238   
  239       /**
  240        * Magic number for the BreakIterator data file format.
  241        */
  242       static final byte[] LABEL = {
  243           (byte)'B', (byte)'I', (byte)'d', (byte)'a', (byte)'t', (byte)'a',
  244           (byte)'\0'
  245       };
  246       static final int    LABEL_LENGTH = LABEL.length;
  247   
  248       /**
  249        * Version number of the dictionary that was read in.
  250        */
  251       static final byte supportedVersion = 1;
  252   
  253       /**
  254        * Header size in byte count
  255        */
  256       private static final int HEADER_LENGTH = 36;
  257   
  258       /**
  259        * An array length of indices for BMP characters
  260        */
  261       private static final int BMP_INDICES_LENGTH = 512;
  262   
  263       /**
  264        * Tables that indexes from character values to character category numbers
  265        */
  266       private CompactByteArray charCategoryTable = null;
  267       private SupplementaryCharacterData supplementaryCharCategoryTable = null;
  268   
  269       /**
  270        * The table of state transitions used for forward iteration
  271        */
  272       private short[] stateTable = null;
  273   
  274       /**
  275        * The table of state transitions used to sync up the iterator with the
  276        * text in backwards and random-access iteration
  277        */
  278       private short[] backwardsStateTable = null;
  279   
  280       /**
  281        * A list of flags indicating which states in the state table are accepting
  282        * ("end") states
  283        */
  284       private boolean[] endStates = null;
  285   
  286       /**
  287        * A list of flags indicating which states in the state table are
  288        * lookahead states (states which turn lookahead on and off)
  289        */
  290       private boolean[] lookaheadStates = null;
  291   
  292       /**
  293        * A table for additional data. May be used by a subclass of
  294        * RuleBasedBreakIterator.
  295        */
  296       private byte[] additionalData = null;
  297   
  298       /**
  299        * The number of character categories (and, thus, the number of columns in
  300        * the state tables)
  301        */
  302       private int numCategories;
  303   
  304       /**
  305        * The character iterator through which this BreakIterator accesses the text
  306        */
  307       private CharacterIterator text = null;
  308   
  309       /**
  310        * A CRC32 value of all data in datafile
  311        */
  312       private long checksum;
  313   
  314       //=======================================================================
  315       // constructors
  316       //=======================================================================
  317   
  318       /**
  319        * Constructs a RuleBasedBreakIterator according to the datafile
  320        * provided.
  321        */
  322       public RuleBasedBreakIterator(String datafile)
  323           throws IOException, MissingResourceException {
  324           readTables(datafile);
  325       }
  326   
  327       /**
  328        * Read datafile. The datafile's format is as follows:
  329        * <pre>
  330        *   BreakIteratorData {
  331        *       u1           magic[7];
  332        *       u1           version;
  333        *       u4           totalDataSize;
  334        *       header_info  header;
  335        *       body         value;
  336        *   }
  337        * </pre>
  338        * <code>totalDataSize</code> is the summation of the size of
  339        * <code>header_info</code> and <code>body</code> in byte count.
  340        * <p>
  341        * In <code>header</code>, each field except for checksum implies the
  342        * length of each field. Since <code>BMPdataLength</code> is a fixed-length
  343        *  data(512 entries), its length isn't included in <code>header</code>.
  344        * <code>checksum</code> is a CRC32 value of all in <code>body</code>.
  345        * <pre>
  346        *   header_info {
  347        *       u4           stateTableLength;
  348        *       u4           backwardsStateTableLength;
  349        *       u4           endStatesLength;
  350        *       u4           lookaheadStatesLength;
  351        *       u4           BMPdataLength;
  352        *       u4           nonBMPdataLength;
  353        *       u4           additionalDataLength;
  354        *       u8           checksum;
  355        *   }
  356        * </pre>
  357        * <p>
  358        *
  359        * Finally, <code>BMPindices</code> and <code>BMPdata</code> are set to
  360        * <code>charCategoryTable</code>. <code>nonBMPdata</code> is set to
  361        * <code>supplementaryCharCategoryTable</code>.
  362        * <pre>
  363        *   body {
  364        *       u2           stateTable[stateTableLength];
  365        *       u2           backwardsStateTable[backwardsStateTableLength];
  366        *       u1           endStates[endStatesLength];
  367        *       u1           lookaheadStates[lookaheadStatesLength];
  368        *       u2           BMPindices[512];
  369        *       u1           BMPdata[BMPdataLength];
  370        *       u4           nonBMPdata[numNonBMPdataLength];
  371        *       u1           additionalData[additionalDataLength];
  372        *   }
  373        * </pre>
  374        */
  375       protected void readTables(String datafile)
  376           throws IOException, MissingResourceException {
  377   
  378           byte[] buffer = readFile(datafile);
  379   
  380           /* Read header_info. */
  381           int stateTableLength = BreakIterator.getInt(buffer, 0);
  382           int backwardsStateTableLength = BreakIterator.getInt(buffer, 4);
  383           int endStatesLength = BreakIterator.getInt(buffer, 8);
  384           int lookaheadStatesLength = BreakIterator.getInt(buffer, 12);
  385           int BMPdataLength = BreakIterator.getInt(buffer, 16);
  386           int nonBMPdataLength = BreakIterator.getInt(buffer, 20);
  387           int additionalDataLength = BreakIterator.getInt(buffer, 24);
  388           checksum = BreakIterator.getLong(buffer, 28);
  389   
  390           /* Read stateTable[numCategories * numRows] */
  391           stateTable = new short[stateTableLength];
  392           int offset = HEADER_LENGTH;
  393           for (int i = 0; i < stateTableLength; i++, offset+=2) {
  394              stateTable[i] = BreakIterator.getShort(buffer, offset);
  395           }
  396   
  397           /* Read backwardsStateTable[numCategories * numRows] */
  398           backwardsStateTable = new short[backwardsStateTableLength];
  399           for (int i = 0; i < backwardsStateTableLength; i++, offset+=2) {
  400              backwardsStateTable[i] = BreakIterator.getShort(buffer, offset);
  401           }
  402   
  403           /* Read endStates[numRows] */
  404           endStates = new boolean[endStatesLength];
  405           for (int i = 0; i < endStatesLength; i++, offset++) {
  406              endStates[i] = buffer[offset] == 1;
  407           }
  408   
  409           /* Read lookaheadStates[numRows] */
  410           lookaheadStates = new boolean[lookaheadStatesLength];
  411           for (int i = 0; i < lookaheadStatesLength; i++, offset++) {
  412              lookaheadStates[i] = buffer[offset] == 1;
  413           }
  414   
  415           /* Read a category table and indices for BMP characters. */
  416           short[] temp1 = new short[BMP_INDICES_LENGTH];  // BMPindices
  417           for (int i = 0; i < BMP_INDICES_LENGTH; i++, offset+=2) {
  418               temp1[i] = BreakIterator.getShort(buffer, offset);
  419           }
  420           byte[] temp2 = new byte[BMPdataLength];  // BMPdata
  421           System.arraycopy(buffer, offset, temp2, 0, BMPdataLength);
  422           offset += BMPdataLength;
  423           charCategoryTable = new CompactByteArray(temp1, temp2);
  424   
  425           /* Read a category table for non-BMP characters. */
  426           int[] temp3 = new int[nonBMPdataLength];
  427           for (int i = 0; i < nonBMPdataLength; i++, offset+=4) {
  428               temp3[i] = BreakIterator.getInt(buffer, offset);
  429           }
  430           supplementaryCharCategoryTable = new SupplementaryCharacterData(temp3);
  431   
  432           /* Read additional data */
  433           if (additionalDataLength > 0) {
  434               additionalData = new byte[additionalDataLength];
  435               System.arraycopy(buffer, offset, additionalData, 0, additionalDataLength);
  436           }
  437   
  438           /* Set numCategories */
  439           numCategories = stateTable.length / endStates.length;
  440       }
  441   
  442       protected byte[] readFile(final String datafile)
  443           throws IOException, MissingResourceException {
  444   
  445           BufferedInputStream is;
  446           try {
  447               is = (BufferedInputStream)AccessController.doPrivileged(
  448                   new PrivilegedExceptionAction() {
  449                       public Object run() throws Exception {
  450                           return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + datafile));
  451                       }
  452                   }
  453               );
  454           }
  455           catch (PrivilegedActionException e) {
  456               throw new InternalError(e.toString());
  457           }
  458   
  459           int offset = 0;
  460   
  461           /* First, read magic, version, and header_info. */
  462           int len = LABEL_LENGTH + 5;
  463           byte[] buf = new byte[len];
  464           if (is.read(buf) != len) {
  465               throw new MissingResourceException("Wrong header length",
  466                                                  datafile, "");
  467           }
  468   
  469           /* Validate the magic number. */
  470           for (int i = 0; i < LABEL_LENGTH; i++, offset++) {
  471               if (buf[offset] != LABEL[offset]) {
  472                   throw new MissingResourceException("Wrong magic number",
  473                                                      datafile, "");
  474               }
  475           }
  476   
  477           /* Validate the version number. */
  478           if (buf[offset] != supportedVersion) {
  479               throw new MissingResourceException("Unsupported version(" + buf[offset] + ")",
  480                                                  datafile, "");
  481           }
  482   
  483           /* Read data: totalDataSize + 8(for checksum) */
  484           len = BreakIterator.getInt(buf, ++offset);
  485           buf = new byte[len];
  486           if (is.read(buf) != len) {
  487               throw new MissingResourceException("Wrong data length",
  488                                                  datafile, "");
  489           }
  490   
  491           is.close();
  492   
  493           return buf;
  494       }
  495   
  496       byte[] getAdditionalData() {
  497           return additionalData;
  498       }
  499   
  500       void setAdditionalData(byte[] b) {
  501           additionalData = b;
  502       }
  503   
  504       //=======================================================================
  505       // boilerplate
  506       //=======================================================================
  507       /**
  508        * Clones this iterator.
  509        * @return A newly-constructed RuleBasedBreakIterator with the same
  510        * behavior as this one.
  511        */
  512       public Object clone() {
  513           RuleBasedBreakIterator result = (RuleBasedBreakIterator) super.clone();
  514           if (text != null) {
  515               result.text = (CharacterIterator) text.clone();
  516           }
  517           return result;
  518       }
  519   
  520       /**
  521        * Returns true if both BreakIterators are of the same class, have the same
  522        * rules, and iterate over the same text.
  523        */
  524       public boolean equals(Object that) {
  525           try {
  526               if (that == null) {
  527                   return false;
  528               }
  529   
  530               RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
  531               if (checksum != other.checksum) {
  532                   return false;
  533               }
  534               if (text == null) {
  535                   return other.text == null;
  536               } else {
  537                   return text.equals(other.text);
  538               }
  539           }
  540           catch(ClassCastException e) {
  541               return false;
  542           }
  543       }
  544   
  545       /**
  546        * Returns text
  547        */
  548       public String toString() {
  549           StringBuffer sb = new StringBuffer();
  550           sb.append('[');
  551           sb.append("checksum=0x" + Long.toHexString(checksum));
  552           sb.append(']');
  553           return sb.toString();
  554       }
  555   
  556       /**
  557        * Compute a hashcode for this BreakIterator
  558        * @return A hash code
  559        */
  560       public int hashCode() {
  561           return (int)checksum;
  562       }
  563   
  564       //=======================================================================
  565       // BreakIterator overrides
  566       //=======================================================================
  567   
  568       /**
  569        * Sets the current iteration position to the beginning of the text.
  570        * (i.e., the CharacterIterator's starting offset).
  571        * @return The offset of the beginning of the text.
  572        */
  573       public int first() {
  574           CharacterIterator t = getText();
  575   
  576           t.first();
  577           return t.getIndex();
  578       }
  579   
  580       /**
  581        * Sets the current iteration position to the end of the text.
  582        * (i.e., the CharacterIterator's ending offset).
  583        * @return The text's past-the-end offset.
  584        */
  585       public int last() {
  586           CharacterIterator t = getText();
  587   
  588           // I'm not sure why, but t.last() returns the offset of the last character,
  589           // rather than the past-the-end offset
  590           t.setIndex(t.getEndIndex());
  591           return t.getIndex();
  592       }
  593   
  594       /**
  595        * Advances the iterator either forward or backward the specified number of steps.
  596        * Negative values move backward, and positive values move forward.  This is
  597        * equivalent to repeatedly calling next() or previous().
  598        * @param n The number of steps to move.  The sign indicates the direction
  599        * (negative is backwards, and positive is forwards).
  600        * @return The character offset of the boundary position n boundaries away from
  601        * the current one.
  602        */
  603       public int next(int n) {
  604           int result = current();
  605           while (n > 0) {
  606               result = handleNext();
  607               --n;
  608           }
  609           while (n < 0) {
  610               result = previous();
  611               ++n;
  612           }
  613           return result;
  614       }
  615   
  616       /**
  617        * Advances the iterator to the next boundary position.
  618        * @return The position of the first boundary after this one.
  619        */
  620       public int next() {
  621           return handleNext();
  622       }
  623   
  624       /**
  625        * Advances the iterator backwards, to the last boundary preceding this one.
  626        * @return The position of the last boundary position preceding this one.
  627        */
  628       public int previous() {
  629           // if we're already sitting at the beginning of the text, return DONE
  630           CharacterIterator text = getText();
  631           if (current() == text.getBeginIndex()) {
  632               return BreakIterator.DONE;
  633           }
  634   
  635           // set things up.  handlePrevious() will back us up to some valid
  636           // break position before the current position (we back our internal
  637           // iterator up one step to prevent handlePrevious() from returning
  638           // the current position), but not necessarily the last one before
  639           // where we started
  640           int start = current();
  641           getPrevious();
  642           int lastResult = handlePrevious();
  643           int result = lastResult;
  644   
  645           // iterate forward from the known break position until we pass our
  646           // starting point.  The last break position before the starting
  647           // point is our return value
  648           while (result != BreakIterator.DONE && result < start) {
  649               lastResult = result;
  650               result = handleNext();
  651           }
  652   
  653           // set the current iteration position to be the last break position
  654           // before where we started, and then return that value
  655           text.setIndex(lastResult);
  656           return lastResult;
  657       }
  658   
  659       /**
  660        * Returns previous character
  661        */
  662       private int getPrevious() {
  663           char c2 = text.previous();
  664           if (Character.isLowSurrogate(c2) &&
  665               text.getIndex() > text.getBeginIndex()) {
  666               char c1 = text.previous();
  667               if (Character.isHighSurrogate(c1)) {
  668                   return Character.toCodePoint(c1, c2);
  669               } else {
  670                   text.next();
  671               }
  672           }
  673           return (int)c2;
  674       }
  675   
  676       /**
  677        * Returns current character
  678        */
  679       int getCurrent() {
  680           char c1 = text.current();
  681           if (Character.isHighSurrogate(c1) &&
  682               text.getIndex() < text.getEndIndex()) {
  683               char c2 = text.next();
  684               text.previous();
  685               if (Character.isLowSurrogate(c2)) {
  686                   return Character.toCodePoint(c1, c2);
  687               }
  688           }
  689           return (int)c1;
  690       }
  691   
  692       /**
  693        * Returns the count of next character.
  694        */
  695       private int getCurrentCodePointCount() {
  696           char c1 = text.current();
  697           if (Character.isHighSurrogate(c1) &&
  698               text.getIndex() < text.getEndIndex()) {
  699               char c2 = text.next();
  700               text.previous();
  701               if (Character.isLowSurrogate(c2)) {
  702                   return 2;
  703               }
  704           }
  705           return 1;
  706       }
  707   
  708       /**
  709        * Returns next character
  710        */
  711       int getNext() {
  712           int index = text.getIndex();
  713           int endIndex = text.getEndIndex();
  714           if (index == endIndex ||
  715               (index = index + getCurrentCodePointCount()) >= endIndex) {
  716               return CharacterIterator.DONE;
  717           }
  718           text.setIndex(index);
  719           return getCurrent();
  720       }
  721   
  722       /**
  723        * Returns the position of next character.
  724        */
  725       private int getNextIndex() {
  726           int index = text.getIndex() + getCurrentCodePointCount();
  727           int endIndex = text.getEndIndex();
  728           if (index > endIndex) {
  729               return endIndex;
  730           } else {
  731               return index;
  732           }
  733       }
  734   
  735       /**
  736        * Throw IllegalArgumentException unless begin <= offset < end.
  737        */
  738       protected static final void checkOffset(int offset, CharacterIterator text) {
  739           if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
  740               throw new IllegalArgumentException("offset out of bounds");
  741           }
  742       }
  743   
  744       /**
  745        * Sets the iterator to refer to the first boundary position following
  746        * the specified position.
  747        * @offset The position from which to begin searching for a break position.
  748        * @return The position of the first break after the current position.
  749        */
  750       public int following(int offset) {
  751   
  752           CharacterIterator text = getText();
  753           checkOffset(offset, text);
  754   
  755           // Set our internal iteration position (temporarily)
  756           // to the position passed in.  If this is the _beginning_ position,
  757           // then we can just use next() to get our return value
  758           text.setIndex(offset);
  759           if (offset == text.getBeginIndex()) {
  760               return handleNext();
  761           }
  762   
  763           // otherwise, we have to sync up first.  Use handlePrevious() to back
  764           // us up to a known break position before the specified position (if
  765           // we can determine that the specified position is a break position,
  766           // we don't back up at all).  This may or may not be the last break
  767           // position at or before our starting position.  Advance forward
  768           // from here until we've passed the starting position.  The position
  769           // we stop on will be the first break position after the specified one.
  770           int result = handlePrevious();
  771           while (result != BreakIterator.DONE && result <= offset) {
  772               result = handleNext();
  773           }
  774           return result;
  775       }
  776   
  777       /**
  778        * Sets the iterator to refer to the last boundary position before the
  779        * specified position.
  780        * @offset The position to begin searching for a break from.
  781        * @return The position of the last boundary before the starting position.
  782        */
  783       public int preceding(int offset) {
  784           // if we start by updating the current iteration position to the
  785           // position specified by the caller, we can just use previous()
  786           // to carry out this operation
  787           CharacterIterator text = getText();
  788           checkOffset(offset, text);
  789           text.setIndex(offset);
  790           return previous();
  791       }
  792   
  793       /**
  794        * Returns true if the specfied position is a boundary position.  As a side
  795        * effect, leaves the iterator pointing to the first boundary position at
  796        * or after "offset".
  797        * @param offset the offset to check.
  798        * @return True if "offset" is a boundary position.
  799        */
  800       public boolean isBoundary(int offset) {
  801           CharacterIterator text = getText();
  802           checkOffset(offset, text);
  803           if (offset == text.getBeginIndex()) {
  804               return true;
  805           }
  806   
  807           // to check whether this is a boundary, we can use following() on the
  808           // position before the specified one and return true if the position we
  809           // get back is the one the user specified
  810           else {
  811               return following(offset - 1) == offset;
  812           }
  813       }
  814   
  815       /**
  816        * Returns the current iteration position.
  817        * @return The current iteration position.
  818        */
  819       public int current() {
  820           return getText().getIndex();
  821       }
  822   
  823       /**
  824        * Return a CharacterIterator over the text being analyzed.  This version
  825        * of this method returns the actual CharacterIterator we're using internally.
  826        * Changing the state of this iterator can have undefined consequences.  If
  827        * you need to change it, clone it first.
  828        * @return An iterator over the text being analyzed.
  829        */
  830       public CharacterIterator getText() {
  831           // The iterator is initialized pointing to no text at all, so if this
  832           // function is called while we're in that state, we have to fudge an
  833           // iterator to return.
  834           if (text == null) {
  835               text = new StringCharacterIterator("");
  836           }
  837           return text;
  838       }
  839   
  840       /**
  841        * Set the iterator to analyze a new piece of text.  This function resets
  842        * the current iteration position to the beginning of the text.
  843        * @param newText An iterator over the text to analyze.
  844        */
  845       public void setText(CharacterIterator newText) {
  846           // Test iterator to see if we need to wrap it in a SafeCharIterator.
  847           // The correct behavior for CharacterIterators is to allow the
  848           // position to be set to the endpoint of the iterator.  Many
  849           // CharacterIterators do not uphold this, so this is a workaround
  850           // to permit them to use this class.
  851           int end = newText.getEndIndex();
  852           boolean goodIterator;
  853           try {
  854               newText.setIndex(end);  // some buggy iterators throw an exception here
  855               goodIterator = newText.getIndex() == end;
  856           }
  857           catch(IllegalArgumentException e) {
  858               goodIterator = false;
  859           }
  860   
  861           if (goodIterator) {
  862               text = newText;
  863           }
  864           else {
  865               text = new SafeCharIterator(newText);
  866           }
  867           text.first();
  868       }
  869   
  870   
  871       //=======================================================================
  872       // implementation
  873       //=======================================================================
  874   
  875       /**
  876        * This method is the actual implementation of the next() method.  All iteration
  877        * vectors through here.  This method initializes the state machine to state 1
  878        * and advances through the text character by character until we reach the end
  879        * of the text or the state machine transitions to state 0.  We update our return
  880        * value every time the state machine passes through a possible end state.
  881        */
  882       protected int handleNext() {
  883           // if we're already at the end of the text, return DONE.
  884           CharacterIterator text = getText();
  885           if (text.getIndex() == text.getEndIndex()) {
  886               return BreakIterator.DONE;
  887           }
  888   
  889           // no matter what, we always advance at least one character forward
  890           int result = getNextIndex();
  891           int lookaheadResult = 0;
  892   
  893           // begin in state 1
  894           int state = START_STATE;
  895           int category;
  896           int c = getCurrent();
  897   
  898           // loop until we reach the end of the text or transition to state 0
  899           while (c != CharacterIterator.DONE && state != STOP_STATE) {
  900   
  901               // look up the current character's character category (which tells us
  902               // which column in the state table to look at)
  903               category = lookupCategory(c);
  904   
  905               // if the character isn't an ignore character, look up a state
  906               // transition in the state table
  907               if (category != IGNORE) {
  908                   state = lookupState(state, category);
  909               }
  910   
  911               // if the state we've just transitioned to is a lookahead state,
  912               // (but not also an end state), save its position.  If it's
  913               // both a lookahead state and an end state, update the break position
  914               // to the last saved lookup-state position
  915               if (lookaheadStates[state]) {
  916                   if (endStates[state]) {
  917                       result = lookaheadResult;
  918                   }
  919                   else {
  920                       lookaheadResult = getNextIndex();
  921                   }
  922               }
  923   
  924               // otherwise, if the state we've just transitioned to is an accepting
  925               // state, update the break position to be the current iteration position
  926               else {
  927                   if (endStates[state]) {
  928                       result = getNextIndex();
  929                   }
  930               }
  931   
  932               c = getNext();
  933           }
  934   
  935           // if we've run off the end of the text, and the very last character took us into
  936           // a lookahead state, advance the break position to the lookahead position
  937           // (the theory here is that if there are no characters at all after the lookahead
  938           // position, that always matches the lookahead criteria)
  939           if (c == CharacterIterator.DONE && lookaheadResult == text.getEndIndex()) {
  940               result = lookaheadResult;
  941           }
  942   
  943           text.setIndex(result);
  944           return result;
  945       }
  946   
  947       /**
  948        * This method backs the iterator back up to a "safe position" in the text.
  949        * This is a position that we know, without any context, must be a break position.
  950        * The various calling methods then iterate forward from this safe position to
  951        * the appropriate position to return.  (For more information, see the description
  952        * of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
  953        */
  954       protected int handlePrevious() {
  955           CharacterIterator text = getText();
  956           int state = START_STATE;
  957           int category = 0;
  958           int lastCategory = 0;
  959           int c = getCurrent();
  960   
  961           // loop until we reach the beginning of the text or transition to state 0
  962           while (c != CharacterIterator.DONE && state != STOP_STATE) {
  963   
  964               // save the last character's category and look up the current
  965               // character's category
  966               lastCategory = category;
  967               category = lookupCategory(c);
  968   
  969               // if the current character isn't an ignore character, look up a
  970               // state transition in the backwards state table
  971               if (category != IGNORE) {
  972                   state = lookupBackwardState(state, category);
  973               }
  974   
  975               // then advance one character backwards
  976               c = getPrevious();
  977           }
  978   
  979           // if we didn't march off the beginning of the text, we're either one or two
  980           // positions away from the real break position.  (One because of the call to
  981           // previous() at the end of the loop above, and another because the character
  982           // that takes us into the stop state will always be the character BEFORE
  983           // the break position.)
  984           if (c != CharacterIterator.DONE) {
  985               if (lastCategory != IGNORE) {
  986                   getNext();
  987                   getNext();
  988               }
  989               else {
  990                   getNext();
  991               }
  992           }
  993           return text.getIndex();
  994       }
  995   
  996       /**
  997        * Looks up a character's category (i.e., its category for breaking purposes,
  998        * not its Unicode category)
  999        */
 1000       protected int lookupCategory(int c) {
 1001           if (c < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
 1002               return charCategoryTable.elementAt((char)c);
 1003           } else {
 1004               return supplementaryCharCategoryTable.getValue(c);
 1005           }
 1006       }
 1007   
 1008       /**
 1009        * Given a current state and a character category, looks up the
 1010        * next state to transition to in the state table.
 1011        */
 1012       protected int lookupState(int state, int category) {
 1013           return stateTable[state * numCategories + category];
 1014       }
 1015   
 1016       /**
 1017        * Given a current state and a character category, looks up the
 1018        * next state to transition to in the backwards state table.
 1019        */
 1020       protected int lookupBackwardState(int state, int category) {
 1021           return backwardsStateTable[state * numCategories + category];
 1022       }
 1023   
 1024       /*
 1025        * This class exists to work around a bug in incorrect implementations
 1026        * of CharacterIterator, which incorrectly handle setIndex(endIndex).
 1027        * This iterator relies only on base.setIndex(n) where n is less than
 1028        * endIndex.
 1029        *
 1030        * One caveat:  if the base iterator's begin and end indices change
 1031        * the change will not be reflected by this wrapper.  Does that matter?
 1032        */
 1033       private static final class SafeCharIterator implements CharacterIterator,
 1034                                                              Cloneable {
 1035   
 1036           private CharacterIterator base;
 1037           private int rangeStart;
 1038           private int rangeLimit;
 1039           private int currentIndex;
 1040   
 1041           SafeCharIterator(CharacterIterator base) {
 1042               this.base = base;
 1043               this.rangeStart = base.getBeginIndex();
 1044               this.rangeLimit = base.getEndIndex();
 1045               this.currentIndex = base.getIndex();
 1046           }
 1047   
 1048           public char first() {
 1049               return setIndex(rangeStart);
 1050           }
 1051   
 1052           public char last() {
 1053               return setIndex(rangeLimit - 1);
 1054           }
 1055   
 1056           public char current() {
 1057               if (currentIndex < rangeStart || currentIndex >= rangeLimit) {
 1058                   return DONE;
 1059               }
 1060               else {
 1061                   return base.setIndex(currentIndex);
 1062               }
 1063           }
 1064   
 1065           public char next() {
 1066   
 1067               currentIndex++;
 1068               if (currentIndex >= rangeLimit) {
 1069                   currentIndex = rangeLimit;
 1070                   return DONE;
 1071               }
 1072               else {
 1073                   return base.setIndex(currentIndex);
 1074               }
 1075           }
 1076   
 1077           public char previous() {
 1078   
 1079               currentIndex--;
 1080               if (currentIndex < rangeStart) {
 1081                   currentIndex = rangeStart;
 1082                   return DONE;
 1083               }
 1084               else {
 1085                   return base.setIndex(currentIndex);
 1086               }
 1087           }
 1088   
 1089           public char setIndex(int i) {
 1090   
 1091               if (i < rangeStart || i > rangeLimit) {
 1092                   throw new IllegalArgumentException("Invalid position");
 1093               }
 1094               currentIndex = i;
 1095               return current();
 1096           }
 1097   
 1098           public int getBeginIndex() {
 1099               return rangeStart;
 1100           }
 1101   
 1102           public int getEndIndex() {
 1103               return rangeLimit;
 1104           }
 1105   
 1106           public int getIndex() {
 1107               return currentIndex;
 1108           }
 1109   
 1110           public Object clone() {
 1111   
 1112               SafeCharIterator copy = null;
 1113               try {
 1114                   copy = (SafeCharIterator) super.clone();
 1115               }
 1116               catch(CloneNotSupportedException e) {
 1117                   throw new Error("Clone not supported: " + e);
 1118               }
 1119   
 1120               CharacterIterator copyOfBase = (CharacterIterator) base.clone();
 1121               copy.base = copyOfBase;
 1122               return copy;
 1123           }
 1124       }
 1125   }

Save This Page
Home » openjdk-7 » java » text » [javadoc | source]