Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/eireneh/bible/control/dictionary/Grammar.java


1   
2   package com.eireneh.bible.control.dictionary;
3   
4   import com.eireneh.util.StringUtil;
5   
6   /**
7   * Grammar. 
8   * 
9   * <table border='1' cellPadding='3' cellSpacing='0' width="100%">
10  * <tr><td bgColor='white'class='TableRowColor'><font size='-7'>
11  * Distribution Licence:<br />
12  * Project B is free software; you can redistribute it
13  * and/or modify it under the terms of the GNU General Public License,
14  * version 2 as published by the Free Software Foundation.<br />
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * General Public License for more details.<br />
19  * The License is available on the internet
20  * <a href='http://www.gnu.org/copyleft/gpl.html'>here</a>, by writing to
21  * <i>Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
22  * MA 02111-1307, USA</i>, Or locally at the Licence link below.<br />
23  * The copyright to this program is held by its authors.
24  * </font></td></tr></table>
25  * @see <a href='http://www.eireneh.com/servlets/Web'>Project B Home</a>
26  * @see docs.Licence
27  * @author Joe Walker
28  */
29  public class Grammar
30  {
31      /**
32      * Strip of any parts of speech to leave a root word.
33      * This class may not be the best place for this code, however I'm not
34      * sure if we have a better place for it at the mo. Maybe it should be
35      * in passage.PassageUtil?
36      * @param word The word to strip
37      * @return The root word
38      */
39      public static String getRoot(String word)
40      {
41          for (int i=0; i<endings.length; i++)
42          {
43              if (word.endsWith(endings[i]))
44              {
45                  // Make the assumption that we never have 2 ending on a word
46                  return word.substring(0, word.length() - endings[i].length());
47              }
48          }
49  
50          return word;
51      }
52  
53      /**
54      * Is this word one of those small words that can slaughter a DB
55      * query. An empty string IS a small word.
56      * @param word The word to test
57      */
58      public static boolean isSmallWord(String word)
59      {
60          word = word.trim();
61          if (word.equals(""))
62              return true;
63  
64          for (int i=0; i<word_freq.length; i++)
65          {
66              if (word.equalsIgnoreCase(word_freq[i]))
67              {
68                  return true;
69              }
70          }
71  
72          return false;
73      }
74  
75      /**
76      * Is this word one of those small words that can slaughter a DB query
77      * @param word The word to test
78      */
79      public static String[] stripSmallWords(String[] words)
80      {
81          // How many long words are there?
82          int long_words = 0;
83          for (int i=0; i<words.length; i++)
84          {
85              if (!isSmallWord(words[i]))
86                  long_words++;
87          }
88  
89          // Create the array with just the long words
90          int count = 0;
91          String[] retcode = new String[long_words];
92          for (int i=0; i<words.length; i++)
93          {
94              if (!isSmallWord(words[i]))
95                  retcode[count++] = words[i];
96          }
97  
98          return retcode;
99      }
100 
101     /**
102     * Like PassageUtil.tokenize that leaves out the small words
103     * @param word The word to split up
104     * @param delims The word separators
105     * @return The long words in the string
106     */
107     public static String[] tokenizeWithoutSmallWords(String original, String delims)
108     {
109         String[] words = StringUtil.tokenize(original, delims);
110         int small_words = 0;
111 
112         for (int i=0; i<words.length; i++)
113         {
114             if (Grammar.isSmallWord(words[i]))
115             {
116                 small_words++;
117                 words[i] = null;
118             }
119         }
120 
121         String retcode[] = new String[words.length-small_words];
122         int count = 0;
123         for (int i=0; i<words.length; i++)
124         {
125             if (words[i] != null) retcode[count++] = words[i];
126         }
127 
128         return retcode;
129     }
130 
131     /**
132     * The Endings a word can have.
133     * These are matched in order so there is no point in having "s"
134     * before "es" because the second will not be tried.
135     */
136     private static final String[] endings =
137     {
138         "es",
139         "'s",
140         "s",
141         "ing",
142         "ed",
143         "er",
144         "ly",
145     };
146 
147     /**
148     * The one hundred most used words, and the instance count
149     */
150     private static final String[] word_freq = 
151     {
152         // word     instance count (in AV & NIV)
153         "the",      // 119135
154         "and",      // 81244
155         "of",       // 59417
156         "to",       // 43624
157         "in",       // 24233
158         "he",       // 20088
159         "that",     // 18672
160         "i",        // 17605
161         "a",        // 17439
162         "for",      // 16780
163         "you",      // 16324
164         "his",      // 15438
165 //      "lord",     // 15319
166         "is",       // 14304
167         "will",     // 13981
168         "they",     // 13942
169         "not",      // 12507
170         "with",     // 12125
171         "him",      // 12058
172         "it",       // 11834
173         "be",       // 11638
174         "them",     // 11608
175         "shall",    // 10833
176         "all",      // 10333
177         "my",       // 9547
178         "from",     // 9323
179         "was",      // 8530
180         "your",     // 8400
181 //      "god",      // 8381
182         "have",     // 8322
183         "me",       // 8102
184         "but",      // 7991
185         "their",    // 7638
186         "as",       // 7521
187         "who",      // 7425
188         "said",     // 7198
189         "are",      // 6981
190         "on",       // 6914
191         "this",     // 6558
192         "when",     // 5667
193         "thou",     // 5470
194         "thy",      // 5469
195         "by",       // 5434
196         "were",     // 5192
197         "had",      // 5109
198         "then",     // 5105
199         "out",      // 4778
200 //      "man",      // 4702
201 //      "son",      // 4701
202         "so",       // 4689
203 //      "king",     // 4568
204 //      "israel",   // 4407
205         "there",    // 4393
206 //      "people",   // 4355
207         "which",    // 4253
208         "do",       // 4032
209         "one",      // 3998
210         "ye",       // 3970
211         "up",       // 3798
212         "thee",     // 3780
213         "at",       // 3767
214         "we",       // 3725
215         "her",      // 3583
216         "what",     // 3545
217         "men",      // 3482
218         "come",     // 3404
219         "if",       // 3380
220         "into",     // 3284
221         "came",     // 3283
222 //      "land",     // 3182
223 //      "day",      // 3168
224         "upon",     // 3164
225         "before",   // 3133
226         "or",       // 3097
227 //      "house",    // 2997
228         "us",       // 2886
229         "because",  // 2879
230         "go",       // 2869
231 //      "against",  // 2851
232         "an",       // 2828
233 //      "no",       // 2711
234         "went",     // 2597
235         "also",     // 2586
236         "now",      // 2571
237         "let",      // 2548
238 //      "made",     // 2478
239         "hath",     // 2450
240         "may",      // 2418
241         "has",      // 2406
242         "our",      // 2361
243         "these",    // 2356
244 //      "down",     // 2314
245 //      "hand",     // 2314
246 //      "jesus",    // 2255
247 //      "children", // 2231
248 //      "like",     // 2180
249 //      "over",     // 2091
250         "o",        // 2090
251 //      "david",    // 2089
252 //      "father",   // 2065
253         "am",
254     };
255 }