Save This Page
Home » openjdk-7 » java » text » [javadoc | source]
    1   /*
    2    * Copyright 1999-2003 Sun Microsystems, Inc.  All Rights Reserved.
    3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    4    *
    5    * This code is free software; you can redistribute it and/or modify it
    6    * under the terms of the GNU General Public License version 2 only, as
    7    * published by the Free Software Foundation.  Sun designates this
    8    * particular file as subject to the "Classpath" exception as provided
    9    * by Sun in the LICENSE file that accompanied this code.
   10    *
   11    * This code is distributed in the hope that it will be useful, but WITHOUT
   12    * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   13    * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   14    * version 2 for more details (a copy is included in the LICENSE file that
   15    * accompanied this code).
   16    *
   17    * You should have received a copy of the GNU General Public License version
   18    * 2 along with this work; if not, write to the Free Software Foundation,
   19    * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   20    *
   21    * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
   22    * CA 95054 USA or visit www.sun.com if you need additional information or
   23    * have any questions.
   24    */
   25   
   26   /*
   27    *
   28    * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
   29    * (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
   30    *
   31    * The original version of this source code and documentation
   32    * is copyrighted and owned by Taligent, Inc., a wholly-owned
   33    * subsidiary of IBM. These materials are provided under terms
   34    * of a License Agreement between Taligent and Sun. This technology
   35    * is protected by multiple US and International patents.
   36    *
   37    * This notice and attribution to Taligent may not be removed.
   38    * Taligent is a registered trademark of Taligent, Inc.
   39    */
   40   package java.text;
   41   
   42   import java.io;
   43   import java.security.AccessController;
   44   import java.security.PrivilegedActionException;
   45   import java.security.PrivilegedExceptionAction;
   46   import java.util.MissingResourceException;
   47   import sun.text.CompactByteArray;
   48   import sun.text.SupplementaryCharacterData;
   49   
   50   /**
   51    * This is the class that represents the list of known words used by
   52    * DictionaryBasedBreakIterator.  The conceptual data structure used
   53    * here is a trie: there is a node hanging off the root node for every
   54    * letter that can start a word.  Each of these nodes has a node hanging
   55    * off of it for every letter that can be the second letter of a word
   56    * if this node is the first letter, and so on.  The trie is represented
   57    * as a two-dimensional array that can be treated as a table of state
   58    * transitions.  Indexes are used to compress this array, taking
   59    * advantage of the fact that this array will always be very sparse.
   60    */
   61   class BreakDictionary {
   62   
   63       //=========================================================================
   64       // data members
   65       //=========================================================================
   66   
   67       /**
   68         * The version of the dictionary that was read in.
   69         */
   70       private static int supportedVersion = 1;
   71   
   72       /**
   73        * Maps from characters to column numbers.  The main use of this is to
   74        * avoid making room in the array for empty columns.
   75        */
   76       private CompactByteArray columnMap = null;
   77       private SupplementaryCharacterData supplementaryCharColumnMap = null;
   78   
   79       /**
   80        * The number of actual columns in the table
   81        */
   82       private int numCols;
   83   
   84       /**
   85        * Columns are organized into groups of 32.  This says how many
   86        * column groups.  (We could calculate this, but we store the
   87        * value to avoid having to repeatedly calculate it.)
   88        */
   89       private int numColGroups;
   90   
   91       /**
   92        * The actual compressed state table.  Each conceptual row represents
   93        * a state, and the cells in it contain the row numbers of the states
   94        * to transition to for each possible letter.  0 is used to indicate
   95        * an illegal combination of letters (i.e., the error state).  The
   96        * table is compressed by eliminating all the unpopulated (i.e., zero)
   97        * cells.  Multiple conceptual rows can then be doubled up in a single
   98        * physical row by sliding them up and possibly shifting them to one
   99        * side or the other so the populated cells don't collide.  Indexes
  100        * are used to identify unpopulated cells and to locate populated cells.
  101        */
  102       private short[] table = null;
  103   
  104       /**
  105        * This index maps logical row numbers to physical row numbers
  106        */
  107       private short[] rowIndex = null;
  108   
  109       /**
  110        * A bitmap is used to tell which cells in the comceptual table are
  111        * populated.  This array contains all the unique bit combinations
  112        * in that bitmap.  If the table is more than 32 columns wide,
  113        * successive entries in this array are used for a single row.
  114        */
  115       private int[] rowIndexFlags = null;
  116   
  117       /**
  118        * This index maps from a logical row number into the bitmap table above.
  119        * (This keeps us from storing duplicate bitmap combinations.)  Since there
  120        * are a lot of rows with only one populated cell, instead of wasting space
  121        * in the bitmap table, we just store a negative number in this index for
  122        * rows with one populated cell.  The absolute value of that number is
  123        * the column number of the populated cell.
  124        */
  125       private short[] rowIndexFlagsIndex = null;
  126   
  127       /**
  128        * For each logical row, this index contains a constant that is added to
  129        * the logical column number to get the physical column number
  130        */
  131       private byte[] rowIndexShifts = null;
  132   
  133       //=========================================================================
  134       // deserialization
  135       //=========================================================================
  136   
  137       public BreakDictionary(String dictionaryName)
  138           throws IOException, MissingResourceException {
  139   
  140           readDictionaryFile(dictionaryName);
  141       }
  142   
  143       private void readDictionaryFile(final String dictionaryName)
  144           throws IOException, MissingResourceException {
  145   
  146           BufferedInputStream in;
  147           try {
  148               in = (BufferedInputStream)AccessController.doPrivileged(
  149                   new PrivilegedExceptionAction() {
  150                       public Object run() throws Exception {
  151                           return new BufferedInputStream(getClass().getResourceAsStream("/sun/text/resources/" + dictionaryName));
  152                       }
  153                   }
  154               );
  155           }
  156           catch (PrivilegedActionException e) {
  157               throw new InternalError(e.toString());
  158           }
  159   
  160           byte[] buf = new byte[8];
  161           if (in.read(buf) != 8) {
  162               throw new MissingResourceException("Wrong data length",
  163                                                  dictionaryName, "");
  164           }
  165   
  166           // check vesion
  167           int version = BreakIterator.getInt(buf, 0);
  168           if (version != supportedVersion) {
  169               throw new MissingResourceException("Dictionary version(" + version + ") is unsupported",
  170                                                              dictionaryName, "");
  171           }
  172   
  173           // get data size
  174           int len = BreakIterator.getInt(buf, 4);
  175           buf = new byte[len];
  176           if (in.read(buf) != len) {
  177               throw new MissingResourceException("Wrong data length",
  178                                                  dictionaryName, "");
  179           }
  180   
  181           // close the stream
  182           in.close();
  183   
  184           int l;
  185           int offset = 0;
  186   
  187           // read in the column map for BMP characteres (this is serialized in
  188           // its internal form: an index array followed by a data array)
  189           l = BreakIterator.getInt(buf, offset);
  190           offset += 4;
  191           short[] temp = new short[l];
  192           for (int i = 0; i < l; i++, offset+=2) {
  193               temp[i] = BreakIterator.getShort(buf, offset);
  194           }
  195           l = BreakIterator.getInt(buf, offset);
  196           offset += 4;
  197           byte[] temp2 = new byte[l];
  198           for (int i = 0; i < l; i++, offset++) {
  199               temp2[i] = buf[offset];
  200           }
  201           columnMap = new CompactByteArray(temp, temp2);
  202   
  203           // read in numCols and numColGroups
  204           numCols = BreakIterator.getInt(buf, offset);
  205           offset += 4;
  206           numColGroups = BreakIterator.getInt(buf, offset);
  207           offset += 4;
  208   
  209           // read in the row-number index
  210           l = BreakIterator.getInt(buf, offset);
  211           offset += 4;
  212           rowIndex = new short[l];
  213           for (int i = 0; i < l; i++, offset+=2) {
  214               rowIndex[i] = BreakIterator.getShort(buf, offset);
  215           }
  216   
  217           // load in the populated-cells bitmap: index first, then bitmap list
  218           l = BreakIterator.getInt(buf, offset);
  219           offset += 4;
  220           rowIndexFlagsIndex = new short[l];
  221           for (int i = 0; i < l; i++, offset+=2) {
  222               rowIndexFlagsIndex[i] = BreakIterator.getShort(buf, offset);
  223           }
  224           l = BreakIterator.getInt(buf, offset);
  225           offset += 4;
  226           rowIndexFlags = new int[l];
  227           for (int i = 0; i < l; i++, offset+=4) {
  228               rowIndexFlags[i] = BreakIterator.getInt(buf, offset);
  229           }
  230   
  231           // load in the row-shift index
  232           l = BreakIterator.getInt(buf, offset);
  233           offset += 4;
  234           rowIndexShifts = new byte[l];
  235           for (int i = 0; i < l; i++, offset++) {
  236               rowIndexShifts[i] = buf[offset];
  237           }
  238   
  239           // load in the actual state table
  240           l = BreakIterator.getInt(buf, offset);
  241           offset += 4;
  242           table = new short[l];
  243           for (int i = 0; i < l; i++, offset+=2) {
  244               table[i] = BreakIterator.getShort(buf, offset);
  245           }
  246   
  247           // finally, prepare the column map for supplementary characters
  248           l = BreakIterator.getInt(buf, offset);
  249           offset += 4;
  250           int[] temp3 = new int[l];
  251           for (int i = 0; i < l; i++, offset+=4) {
  252               temp3[i] = BreakIterator.getInt(buf, offset);
  253           }
  254           supplementaryCharColumnMap = new SupplementaryCharacterData(temp3);
  255       }
  256   
  257       //=========================================================================
  258       // access to the words
  259       //=========================================================================
  260   
  261       /**
  262        * Uses the column map to map the character to a column number, then
  263        * passes the row and column number to getNextState()
  264        * @param row The current state
  265        * @param ch The character whose column we're interested in
  266        * @return The new state to transition to
  267        */
  268       public final short getNextStateFromCharacter(int row, int ch) {
  269           int col;
  270           if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
  271               col = columnMap.elementAt((char)ch);
  272           } else {
  273               col = supplementaryCharColumnMap.getValue(ch);
  274           }
  275           return getNextState(row, col);
  276       }
  277   
  278       /**
  279        * Returns the value in the cell with the specified (logical) row and
  280        * column numbers.  In DictionaryBasedBreakIterator, the row number is
  281        * a state number, the column number is an input, and the return value
  282        * is the row number of the new state to transition to.  (0 is the
  283        * "error" state, and -1 is the "end of word" state in a dictionary)
  284        * @param row The row number of the current state
  285        * @param col The column number of the input character (0 means "not a
  286        * dictionary character")
  287        * @return The row number of the new state to transition to
  288        */
  289       public final short getNextState(int row, int col) {
  290           if (cellIsPopulated(row, col)) {
  291               // we map from logical to physical row number by looking up the
  292               // mapping in rowIndex; we map from logical column number to
  293               // physical column number by looking up a shift value for this
  294               // logical row and offsetting the logical column number by
  295               // the shift amount.  Then we can use internalAt() to actually
  296               // get the value out of the table.
  297               return internalAt(rowIndex[row], col + rowIndexShifts[row]);
  298           }
  299           else {
  300               return 0;
  301           }
  302       }
  303   
  304       /**
  305        * Given (logical) row and column numbers, returns true if the
  306        * cell in that position is populated
  307        */
  308       private final boolean cellIsPopulated(int row, int col) {
  309           // look up the entry in the bitmap index for the specified row.
  310           // If it's a negative number, it's the column number of the only
  311           // populated cell in the row
  312           if (rowIndexFlagsIndex[row] < 0) {
  313               return col == -rowIndexFlagsIndex[row];
  314           }
  315   
  316           // if it's a positive number, it's the offset of an entry in the bitmap
  317           // list.  If the table is more than 32 columns wide, the bitmap is stored
  318           // successive entries in the bitmap list, so we have to divide the column
  319           // number by 32 and offset the number we got out of the index by the result.
  320           // Once we have the appropriate piece of the bitmap, test the appropriate
  321           // bit and return the result.
  322           else {
  323               int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
  324               return (flags & (1 << (col & 0x1f))) != 0;
  325           }
  326       }
  327   
  328       /**
  329        * Implementation of getNextState() when we know the specified cell is
  330        * populated.
  331        * @param row The PHYSICAL row number of the cell
  332        * @param col The PHYSICAL column number of the cell
  333        * @return The value stored in the cell
  334        */
  335       private final short internalAt(int row, int col) {
  336           // the table is a one-dimensional array, so this just does the math necessary
  337           // to treat it as a two-dimensional array (we don't just use a two-dimensional
  338           // array because two-dimensional arrays are inefficient in Java)
  339           return table[row * numCols + col];
  340       }
  341   }

Save This Page
Home » openjdk-7 » java » text » [javadoc | source]