Source code: com/eireneh/bible/control/dictionary/Grammar.java
1
2 package com.eireneh.bible.control.dictionary;
3
4 import com.eireneh.util.StringUtil;
5
6 /**
7 * Grammar.
8 *
9 * <table border='1' cellPadding='3' cellSpacing='0' width="100%">
10 * <tr><td bgColor='white'class='TableRowColor'><font size='-7'>
11 * Distribution Licence:<br />
12 * Project B is free software; you can redistribute it
13 * and/or modify it under the terms of the GNU General Public License,
14 * version 2 as published by the Free Software Foundation.<br />
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.<br />
19 * The License is available on the internet
20 * <a href='http://www.gnu.org/copyleft/gpl.html'>here</a>, by writing to
21 * <i>Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
22 * MA 02111-1307, USA</i>, Or locally at the Licence link below.<br />
23 * The copyright to this program is held by its authors.
24 * </font></td></tr></table>
25 * @see <a href='http://www.eireneh.com/servlets/Web'>Project B Home</a>
26 * @see docs.Licence
27 * @author Joe Walker
28 */
29 public class Grammar
30 {
31 /**
32 * Strip of any parts of speech to leave a root word.
33 * This class may not be the best place for this code, however I'm not
34 * sure if we have a better place for it at the mo. Maybe it should be
35 * in passage.PassageUtil?
36 * @param word The word to strip
37 * @return The root word
38 */
39 public static String getRoot(String word)
40 {
41 for (int i=0; i<endings.length; i++)
42 {
43 if (word.endsWith(endings[i]))
44 {
45 // Make the assumption that we never have 2 ending on a word
46 return word.substring(0, word.length() - endings[i].length());
47 }
48 }
49
50 return word;
51 }
52
53 /**
54 * Is this word one of those small words that can slaughter a DB
55 * query. An empty string IS a small word.
56 * @param word The word to test
57 */
58 public static boolean isSmallWord(String word)
59 {
60 word = word.trim();
61 if (word.equals(""))
62 return true;
63
64 for (int i=0; i<word_freq.length; i++)
65 {
66 if (word.equalsIgnoreCase(word_freq[i]))
67 {
68 return true;
69 }
70 }
71
72 return false;
73 }
74
75 /**
76 * Is this word one of those small words that can slaughter a DB query
77 * @param word The word to test
78 */
79 public static String[] stripSmallWords(String[] words)
80 {
81 // How many long words are there?
82 int long_words = 0;
83 for (int i=0; i<words.length; i++)
84 {
85 if (!isSmallWord(words[i]))
86 long_words++;
87 }
88
89 // Create the array with just the long words
90 int count = 0;
91 String[] retcode = new String[long_words];
92 for (int i=0; i<words.length; i++)
93 {
94 if (!isSmallWord(words[i]))
95 retcode[count++] = words[i];
96 }
97
98 return retcode;
99 }
100
101 /**
102 * Like PassageUtil.tokenize that leaves out the small words
103 * @param word The word to split up
104 * @param delims The word separators
105 * @return The long words in the string
106 */
107 public static String[] tokenizeWithoutSmallWords(String original, String delims)
108 {
109 String[] words = StringUtil.tokenize(original, delims);
110 int small_words = 0;
111
112 for (int i=0; i<words.length; i++)
113 {
114 if (Grammar.isSmallWord(words[i]))
115 {
116 small_words++;
117 words[i] = null;
118 }
119 }
120
121 String retcode[] = new String[words.length-small_words];
122 int count = 0;
123 for (int i=0; i<words.length; i++)
124 {
125 if (words[i] != null) retcode[count++] = words[i];
126 }
127
128 return retcode;
129 }
130
131 /**
132 * The Endings a word can have.
133 * These are matched in order so there is no point in having "s"
134 * before "es" because the second will not be tried.
135 */
136 private static final String[] endings =
137 {
138 "es",
139 "'s",
140 "s",
141 "ing",
142 "ed",
143 "er",
144 "ly",
145 };
146
147 /**
148 * The one hundred most used words, and the instance count
149 */
150 private static final String[] word_freq =
151 {
152 // word instance count (in AV & NIV)
153 "the", // 119135
154 "and", // 81244
155 "of", // 59417
156 "to", // 43624
157 "in", // 24233
158 "he", // 20088
159 "that", // 18672
160 "i", // 17605
161 "a", // 17439
162 "for", // 16780
163 "you", // 16324
164 "his", // 15438
165 // "lord", // 15319
166 "is", // 14304
167 "will", // 13981
168 "they", // 13942
169 "not", // 12507
170 "with", // 12125
171 "him", // 12058
172 "it", // 11834
173 "be", // 11638
174 "them", // 11608
175 "shall", // 10833
176 "all", // 10333
177 "my", // 9547
178 "from", // 9323
179 "was", // 8530
180 "your", // 8400
181 // "god", // 8381
182 "have", // 8322
183 "me", // 8102
184 "but", // 7991
185 "their", // 7638
186 "as", // 7521
187 "who", // 7425
188 "said", // 7198
189 "are", // 6981
190 "on", // 6914
191 "this", // 6558
192 "when", // 5667
193 "thou", // 5470
194 "thy", // 5469
195 "by", // 5434
196 "were", // 5192
197 "had", // 5109
198 "then", // 5105
199 "out", // 4778
200 // "man", // 4702
201 // "son", // 4701
202 "so", // 4689
203 // "king", // 4568
204 // "israel", // 4407
205 "there", // 4393
206 // "people", // 4355
207 "which", // 4253
208 "do", // 4032
209 "one", // 3998
210 "ye", // 3970
211 "up", // 3798
212 "thee", // 3780
213 "at", // 3767
214 "we", // 3725
215 "her", // 3583
216 "what", // 3545
217 "men", // 3482
218 "come", // 3404
219 "if", // 3380
220 "into", // 3284
221 "came", // 3283
222 // "land", // 3182
223 // "day", // 3168
224 "upon", // 3164
225 "before", // 3133
226 "or", // 3097
227 // "house", // 2997
228 "us", // 2886
229 "because", // 2879
230 "go", // 2869
231 // "against", // 2851
232 "an", // 2828
233 // "no", // 2711
234 "went", // 2597
235 "also", // 2586
236 "now", // 2571
237 "let", // 2548
238 // "made", // 2478
239 "hath", // 2450
240 "may", // 2418
241 "has", // 2406
242 "our", // 2361
243 "these", // 2356
244 // "down", // 2314
245 // "hand", // 2314
246 // "jesus", // 2255
247 // "children", // 2231
248 // "like", // 2180
249 // "over", // 2091
250 "o", // 2090
251 // "david", // 2089
252 // "father", // 2065
253 "am",
254 };
255 }