Source code: com/eireneh/bible/book/raw/WordItemsMem.java
1
2 package com.eireneh.bible.book.raw;
3
4 import java.io.*;
5 import java.util.*;
6
7 import com.eireneh.bible.book.*;
8 import com.eireneh.util.*;
9
10 /**
11 * The WordItemsMem stores words in a dictionary for a Bible.
12 * The single method that will be of use 99% of the time is the
13 * <code>getWord(int)</code> method. This method will be called once for
14 * each word every time we display a verse (Assuming that we have not
15 * implemented any caches).
16 * <p>The <code>getIndex(String)</code> method is the reverse of this, and
17 * is used in creating the index in the first place.
18 * <p>The class has an underlying File; however, this is transparent to the
19 * user, since calls to getIndex(String) have any resulting changes
20 * automatically written to disk, and the implementation of this class must
21 * be free to choose whatever caching scheme it needs.
22 * <p>The index file size will be roughly n*(a+v) where:<ul>
23 * <li>n is the number of words. (=~16,000)
24 * <li>a is the average word length. (=~8)
25 * <li>v is the overhead per word. (=~1)
26 * </ul>
27 * This would give an index file size of 150k. I need to check with the
28 * OLB and with Theophilos; however, I think this compares favorably. It
29 * would make the smallest download that contained Bible text (but no
30 * punctuation or case marks, etc.) under 200k before compression, maybe
31 * under 150k after. A full basic extensible OLB in under 200k would be
32 * an achievement and well under a 2-minute download.
33 *
34 * <h3>Index File Structure</h3>
35 * I expect that the general layout will be something like:<pre>
36 * 0 -. \
37 * 1 -+. !
38 * 2 -++. ! index area
39 * . !!! !
40 * . !!! /
41 * a <'!! \
42 * a !! !
43 * r !! !
44 * o !! ! text area
45 * n !! !
46 * a <-'! !
47 * b ! !
48 * . ! /
49 * </pre>
50 * For this layout we can use the index of word (n+1) to calculate the
51 * length of word (n) (so long as the words are in index order in the text
52 * area). This would make v=1 (for the index). We could even use upper case
53 * letters to mark new words - this would mean we could have an out-of-order
54 * text area, <i>or</i> no index area (i.e. v=0). However, having v=0
55 * would force us to do in-memory caching.
56 * <p>The OLB v8 seems to do some form of (offset,length) indexing to
57 * compress file sizes further (or is it simply to obfuscate the file
58 * format?). I'd rather use .zip technology for compression.
59 *
60 * <p>Consider whether and to what extent this class should be static and
61 * public. I think that it should be package scope - use of this class
62 * does not make sense outside of the RawBible package. There should
63 * only ever be one WordIndex for a given file, but if we can instantiate
64 * this class for several sets of files, it does not make sense to make
65 * it static.
66 *
67 * <p>How can we extend this class in the future?<ul>
68 * <li>Various different caching methods, so that we can work in
69 * low-memory conditions. This is totally internal to this class, and
70 * does not affect the interface at all.
71 * <li>Inheritance. There are various classes that do a similar job of
72 * reading from files from similar locations.
73 * </ul>
74 *
75 * <table border='1' cellPadding='3' cellSpacing='0' width="100%">
76 * <tr><td bgColor='white' class='TableRowColor'><font size='-7'>
77 * Distribution Licence:<br />
78 * Project B is free software; you can redistribute it
79 * and/or modify it under the terms of the GNU General Public License,
80 * version 2 as published by the Free Software Foundation.<br />
81 * This program is distributed in the hope that it will be useful,
82 * but WITHOUT ANY WARRANTY; without even the implied warranty of
83 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
84 * General Public License for more details.<br />
85 * The License is available on the internet
86 * <a href='http://www.gnu.org/copyleft/gpl.html'>here</a>, by writing to
87 * <i>Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
88 * MA 02111-1307, USA</i>, Or locally at the Licence link below.<br />
89 * The copyright to this program is held by its authors.
90 * </font></td></tr></table>
91 * @see <a href='http://www.eireneh.com/servlets/Web'>Project B Home</a>
92 * @see docs.Licence
93 * @author Joe Walker
94 * @version D0.I0.T0
95 */
96 public class WordItemsMem extends ItemsMem
97 {
98 /**
99 * Create a WordMemResourceIndex from a File that contains the dictionary.
100 * @param raw Reference to the RawBible that is using us
101 * @param filename The leaf name to read/write
102 * @param create Should we start all over again
103 */
104 public WordItemsMem(RawBible raw, boolean create) throws Exception
105 {
106 super(raw, "word.idx", create);
107 }
108
109 /**
110 * This is a specialization of IndexedResource.getIndex(String) that
111 * ensures that the word is lower case before we insert it.
112 * @see com.eireneh.bible.book.raw.IndexedResource#getIndex(String)
113 * @param data The word to find/create an id for
114 * @return The (new) id for the item
115 */
116 public int getIndex(String data)
117 {
118 return super.getIndex(data.toLowerCase());
119 }
120
121 /**
122 * How many items are there in this index?
123 * @return The number of items that we must remember
124 */
125 public int getMaxItems()
126 {
127 return 20000;
128 }
129
130 /**
131 * Find a list of words that start with the given word
132 * @param word The word to search for
133 * @return An array of matches
134 */
135 public String[] getStartsWith(String word) throws BookException
136 {
137 if (array == null)
138 throw new NullPointerException();
139
140 Vector vec = new Vector();
141 word = word.toLowerCase();
142
143 // This bit is s_l_o_w. We do a one end to the other search through all
144 // the words for starts-with matches, putting the results into a vector.
145 for (int i=0; i<array.length; i++)
146 {
147 if (array[i] == null)
148 {
149 log.warning("null word at index "+i);
150 }
151 else
152 {
153 if (array[i].startsWith(word))
154 vec.addElement(array[i]);
155 }
156 }
157
158 // Convert the vector into an array. We can't put the suff into an array
159 // to start with because we don't know how big to make it. It would be
160 // go to know how to get at the guts of a Vector
161 String[] retcode = new String[vec.size()];
162 for (int i=0; i<retcode.length; i++)
163 {
164 retcode[i] = (String) vec.elementAt(i);
165 }
166
167 return retcode;
168 }
169
170 /**
171 * Load the Resource from a stream
172 * @param in The stream to read from
173 */
174 public void load(InputStream in) throws IOException, ClassNotFoundException
175 {
176 DataInputStream din = new DataInputStream(in);
177
178 byte[] asig = new byte[6];
179 din.readFully(asig);
180 String ssig = new String(asig);
181 if (!ssig.equals("RAW:WR"))
182 throw new IOException("This file is not a Word file");
183
184 count = din.readInt();
185 hash = new Hashtable(count);
186 array = new String[count];
187
188 for (int i=0; i<count; i++)
189 {
190 byte wordlen = din.readByte();
191 byte[] aword = new byte[wordlen];
192 din.readFully(aword);
193 String word = new String(aword);
194
195 hash.put(word, new Integer(i));
196 array[i] = word;
197 }
198
199 din.close();
200 }
201
202 /**
203 * Ensure that all changes to the index of words are written to a
204 * stream
205 * @param out The stream to write to
206 */
207 public void save(OutputStream out) throws IOException
208 {
209 DataOutputStream dout = new DataOutputStream(out);
210
211 dout.writeBytes("RAW:WR");
212 dout.writeInt(hash.size());
213
214 for (int i=0; i<hash.size(); i++)
215 {
216 dout.writeByte(array[i].length());
217 dout.writeBytes(array[i]);
218 }
219
220 dout.close();
221 }
222
223 /** The log stream */
224 protected static Logger log = Logger.getLogger("bible.book");
225 }