Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/eireneh/bible/book/raw/RawBible.java


1   
2   package com.eireneh.bible.book.raw;
3   
4   import java.net.*;
5   import java.io.*;
6   import java.util.*;
7   
8   import com.eireneh.util.*;
9   import com.eireneh.bible.book.*;
10  import com.eireneh.bible.passage.*;
11  import com.eireneh.bible.util.*;
12  
13  /**
14   * RawBible is a custom Bible. It is designed to be:<ul>
15   * <li>Compact: So that the download time is as small as possible
16   * <li>Divisible: So that a download can be partial, and some text
17   *     can be read whilst missing content like styles, notes, or
18   *     even word case.
19   * </ul>
20   * <p>As a result of this it can be very slow, or very memory hungry.
21   * I guess that the technology developed here could be useful as a
22   * delivery format, but the timings I am getting from my benchmarks
23   * say "start again".</p>
24   *
25   * <p>There is a question mark over how this format will handle rich
26   * text. The dictionary lookup scheme can be very space efficient
27   * but I'm not sure how to embed strongs numbers with the same
28   * efficiency.</p>
29   *
30   * <p>The algorithm I have implemented here is not perfect. To get a list
31   * of the verses it gets 'wrong' see generate.log.
32   * There are 2 reasons for problems. The RawBible does not take note of
33   * double spaces. And we incorrectly capitalize hyphenated words at the
34   * beginning of sentences.</p>
35   *
36   * <p>This is in part converted from the VB code that I wrote ages ago
37   * that does a similar job.</p>
38   * <pre>
39   * Public Sub WritePassage(sText As String, lPassageID As Long, bLang As Byte, lBibleID As Long)
40   *
41   *   Static bItalic As Boolean
42   *
43   *   Dim mWordInsts As Collection
44   *
45   *   Dim iNext As Long
46   *   Dim iTemp As Long
47   *   Dim iLast As Long
48   *   Dim bDash As Boolean
49   *   Dim sWord As String
50   *   Dim bThisItalic As Boolean
51   *   Dim iStart As Long
52   *   Dim iEnd As Long
53   *   Dim sNote As String
54   *   Dim mNotes As Collection
55   *   Dim vNoteStr As Variant
56   *   Dim iNumNotes As Long
57   *   Dim lWordInstID As Long
58   *
59   *   Set mWordInsts = New Collection
60   *   iNext = 1
61   *   iTemp = 1
62   *   iLast = 1
63   *   bDash = False
64   *   iNumNotes = 1
65   *
66   *   ' For each real word in the verse
67   *   Do
68   *
69   *     ' If this word contains a "{" then it is part of a comment
70   *     ' and not a word. We need to strip out sets of comments
71   *     Set mNotes = New Collection
72   *     Do
73   *       ' Decide how long this word is
74   *       iNext = InStr(iLast, sText, " ")
75   *       iTemp = InStr(iLast, sText, "--")
76   *       If iTemp = iLast Then iTemp = 0
77   *       If iTemp <> 0 And iTemp < iNext Then
78   *         iNext = iTemp
79   *         bDash = True
80   *       Else
81   *         bDash = False
82   *       End If
83   *
84   *       ' If this is the end add in the rest otherwise just add in this word
85   *       If iNext = 0 Then
86   *         sWord = Mid$(sText, iLast, Len(sText) - iLast + 1)
87   *       Else
88   *         sWord = Mid$(sText, iLast, iNext - iLast)
89   *       End If
90   *
91   *
92   *       ' Strip out the notes
93   *       ' If this word is not a comment
94   *       iStart = InStr(iLast, sText, "{")
95   *       If iStart = 0 Then Exit Do
96   *       If iStart > iLast Then Exit Do
97   *
98   *       ' Check we have a start and an end
99   *       iEnd = InStr(iLast, sText, "}")
100  *
101  *       ' Add the note in
102  *       sNote = Mid$(sText, iStart + 1, iEnd - iStart - 1)
103  *       mNotes.Add sNote
104  *
105  *       ' Adjust where we are looking for words
106  *       iLast = iEnd + 2
107  *       If iLast > Len(sText) Then
108  *         iNext = 0
109  *         sWord = ""
110  *         Exit Do
111  *       End If
112  *     Loop
113  *
114  *     ' Are there any notes to add?
115  *     If mNotes.Count <> 0 Then
116  *       ' If there is no previous word to add to then create one
117  *       If mWordInsts.Count = 0 Then
118  *         lWordInstID = WriteWordInst(lPassageID, 1, lBibleID)
119  *         SetWordInstItalic lWordInstID, bItalic
120  *         mWordInsts.Add lWordInstID
121  *       End If
122  *
123  *       ' So add the notes to the previous word
124  *       For Each vNoteStr In mNotes
125  *         sNote = vNoteStr
126  *         WriteNote mWordInsts.Item(mWordInsts.Count), iNumNotes, sNote
127  *         iNumNotes = iNumNotes + 1
128  *       Next
129  *     End If
130  *     Set mNotes = Nothing
131  *
132  *
133  *     ' Italics
134  *     ' Do we have a start italic char
135  *     If InStr(sWord, "[") Then
136  *       bItalic = True
137  *       sWord = RemoveChar(sWord, "[")
138  *     End If
139  *
140  *     ' Remember the state for this letter
141  *     bThisItalic = bItalic
142  *
143  *     ' do we have an end italic char
144  *     If InStr(sWord, "]") Then
145  *       bItalic = False
146  *       sWord = RemoveChar(sWord, "]")
147  *     End If
148  *
149  *
150  *     ' Actually add the word in
151  *     If sWord <> "" Then
152  *       AddWord mWordInsts, sWord, lPassageID, bLang, lBibleID, bThisItalic
153  *     End If
154  *
155  *
156  *     ' Add one an extra one to the last used only for a Space split
157  *     If bDash Then
158  *       iLast = iNext
159  *     Else
160  *       iLast = iNext + 1
161  *     End If
162  *
163  *   Loop Until iNext = 0
164  *   Set mWordInsts = Nothing
165  *
166  * End Sub
167  * </pre>
168  *
169  * <table border='1' cellPadding='3' cellSpacing='0' width="100%">
170  * <tr><td bgColor='white'class='TableRowColor'><font size='-7'>
171  * Distribution Licence:<br />
172  * Project B is free software; you can redistribute it
173  * and/or modify it under the terms of the GNU General Public License,
174  * version 2 as published by the Free Software Foundation.<br />
175  * This program is distributed in the hope that it will be useful,
176  * but WITHOUT ANY WARRANTY; without even the implied warranty of
177  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
178  * General Public License for more details.<br />
179  * The License is available on the internet
180  * <a href='http://www.gnu.org/copyleft/gpl.html'>here</a>, by writing to
181  * <i>Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
182  * MA 02111-1307, USA</i>, Or locally at the Licence link below.<br />
183  * The copyright to this program is held by its authors.
184  * </font></td></tr></table>
185  * @see <a href='http://www.eireneh.com/servlets/Web'>Project B Home</a>
186  * @see docs.Licence
187  * @author Joe Walker
188  * @version D0.I0.T0
189  */
190 public class RawBible extends VersewiseBible
191 {
192     /**
193      * Create a new set of resources based on a URL.
194      */
195     public RawBible(String name, URL url, int mode) throws BookException
196     {
197         this.name = name;
198         this.dir = url;
199 
200         setMode(mode);
201 
202         if (mode != MODE_WRITE)
203         {
204             try
205             {
206                 // The version information
207                 URL prop_url = NetUtil.lengthenURL(url, "bible.properties");
208                 InputStream prop_in = prop_url.openStream();
209                 Properties prop = new Properties();
210                 PropertiesUtil.load(prop, prop_in);
211                 String version_name = prop.getProperty("Version");
212                 version = VersionFactory.getVersion(version_name);
213             }
214             catch (Exception ex)
215             {
216                 throw new BookException("raw_init", ex);
217             }
218         }
219 
220         log.fine("Started RawBible url="+url+ " name="+name+" mode="+mode);
221     }
222 
223     /**
224      * Does this Bible cache everything in memory or leave it on disk and
225      * then read it at query time.
226      * @return True if we are cacheing data in memory
227      */
228     public int getMode()
229     {
230         if (create)
231         {
232             return MODE_WRITE;
233         }
234         else
235         {
236             if (memory) return MODE_READ_MEMORY;
237             else        return MODE_READ_DISK;
238         }
239     }
240 
241     /**
242      * Does this Bible cache everything in memory or leave it on disk and
243      * then read it at query time. I wonder if this is an over complex and
244      * redundant function? Maybe it is something to simplify at some point.
245      * @param memory True if we are cacheing data in memory
246      */
247     public void setMode(int mode) throws BookException
248     {
249         // Do we need to patch-up if we have started reading.
250         if (started)
251         {
252             if (mode == MODE_WRITE)
253                 throw new IllegalStateException("Can't change to write mode once reading has started.");
254 
255             try
256             {
257                 if (this.memory && mode != MODE_READ_MEMORY)
258                 {
259                     word_insts = new WordInstsDisk(this, create);
260                 }
261                 else if (!this.memory && mode == MODE_READ_MEMORY)
262                 {
263                     word_insts = new WordInstsMem(this, create);
264                 }
265             }
266             catch (BookException ex)
267             {
268                 throw ex;
269             }
270             catch (Exception ex)
271             {
272                 throw new BookException("raw_bible_mode", ex);
273             }
274         }
275 
276         // Change the state
277         switch (mode)
278         {
279         case MODE_READ_DISK:
280             create = false;
281             memory = false;
282             break;
283 
284         case MODE_READ_MEMORY:
285             create = false;
286             memory = true;
287             break;
288 
289         case MODE_WRITE:
290             create = true;
291             memory = false;
292             break;
293 
294         default:
295             throw new IllegalArgumentException("Mode must be one of MODE_[WRITE|READ_[DISK|MEMORY]]");
296         }
297     }
298 
299     /**
300      * Lazy initialization
301      */
302     public void init() throws BookException
303     {
304         if (started) return;
305         started = true;
306 
307         // Without these we can't go on
308         try
309         {
310             word_items = new WordItemsMem(this, create);
311 
312             if (memory) word_insts = new WordInstsMem(this, create);
313             else        word_insts = new WordInstsDisk(this, create);
314         }
315         catch (Exception ex)
316         {
317             if (ex instanceof BookException) throw (BookException) ex;
318             throw new BookException("Error initializing resource. System error: "+ex);
319         }
320 
321         // We can still produce text without these though so they
322         // should not except if the load fails.
323         StringBuffer messages = new StringBuffer();
324 
325         if (memory) punc_insts = new PuncInstsMem(this, create, messages);
326         else        punc_insts = new PuncInstsDisk(this, create, messages);
327 
328         punc_items = new PuncItemsMem(this, create, messages);
329         case_insts = new CaseInstsMem(this, create, messages);
330         para_insts = new ParaInstsMem(this, create, messages);
331 
332         // So if any of them have failed to load we have a record of it.
333         // We can carry on work fine, but shouldn't we be telling someone?
334 
335         /** @todo work out if we should have this configurable */
336         //createSearchCache();
337     }
338 
339     /**
340      * Shut the Bible down.
341      */
342     public void destroy() throws BookException
343     {
344     }
345 
346     /**
347      * What driver is controlling this Bible?
348      * @return A BibleDriver relevant to this Bible
349      */
350     public BibleDriver getDriver()
351     {
352         return RawBibleDriver.driver;
353     }
354 
355     /**
356      * Meta-Information: What name can I use to get this Bible in a call
357      * to Bibles.getBible(name);
358      * @return The name of this Bible
359      */
360     public String getName()
361     {
362         return name;
363     }
364 
365     /**
366      * Meta-Information: What version of the Bible is this?
367      * @return A Version for this Bible
368      */
369     public Version getVersion()
370     {
371         return version;
372     }
373 
374     /**
375      * Setup the Version information
376      * @param version The version that this Bible is becoming
377      */
378     public void setVersion(Version version)
379     {
380         this.version = version;
381     }
382 
383     /**
384      * Create an String for the specified Verses
385      * @param range The verses to search for
386      * @return The Bible text
387      */
388     public String getText(VerseRange range) throws BookException
389     {
390         if (!started) init();
391 
392         StringBuffer retcode = new StringBuffer();
393 
394         Verse[] verses = range.toVerseArray();
395         for (int i=0; i<verses.length; i++)
396         {
397             int[] word_idxs = word_insts.getIndexes(verses[i]);
398             int[] case_idxs = case_insts.getIndexes(verses[i]);
399             int[] punc_idxs = punc_insts.getIndexes(verses[i]);
400 
401             for (int j=0; j<word_idxs.length; j++)
402             {
403                 String punc = null;
404                 String word = null;
405 
406                 try
407                 {
408                     int punc_idx = punc_idxs[j];
409                     int word_idx = word_idxs[j];
410                     int case_idx = case_idxs[j];
411 
412                     punc = punc_items.getItem(punc_idx);
413                     word = PassageUtil.setCase(word_items.getItem(word_idx), case_idx);
414                 }
415                 catch (Exception ex)
416                 {
417                     Reporter.informUser(this, ex);
418                 }
419 
420                 retcode.append(punc);
421                 retcode.append(word);
422             }
423 
424             try
425             {
426                 if (punc_idxs.length != 0)
427                     retcode.append(punc_items.getItem(punc_idxs[punc_idxs.length-1]));
428             }
429             catch (Exception ex)
430             {
431                 Reporter.informUser(this, ex);
432             }
433         }
434 
435         return retcode.toString().trim();
436     }
437 
438     /**
439      * Create an XML document for the specified Verses
440      * @param doc The XML document
441      * @param ref The verses to search for
442      */
443     public void getDocument(BibleEle doc, Passage ref) throws BookException
444     {
445         if (!started) init();
446 
447         Enumeration en = ref.rangeElements();
448         while (en.hasMoreElements())
449         {
450             VerseRange range = (VerseRange) en.nextElement();
451             append(doc, range);
452         }
453     }
454 
455     /**
456      * For a given word find a list of references to it
457      * @param word The text to search for
458      * @return The references to the word
459      */
460     public Passage findPassage(String word) throws BookException
461     {
462         if (!started) init();
463         if (word == null)
464             return PassageFactory.createPassage();
465 
466         int word_idx = word_items.getIndex(word);
467 
468         // Are we caching searches?
469         if (cache != null && cache[word_idx] != null)
470         {
471             return cache[word_idx];
472         }
473 
474         // Do the real seacrh
475         Passage ref = PassageFactory.createPassage();
476         try
477         {
478             int total = Books.versesInBible();
479 
480             for (int ord=1; ord<=total; ord++)
481             {
482                 int[] word_items = word_insts.getIndexes(ord);
483                 for (int i=0; i<word_items.length; i++)
484                 {
485                     if (word_items[i] == word_idx)
486                         ref.add(new Verse(ord));
487                 }
488             }
489         }
490         catch (NoSuchVerseException ex)
491         {
492             throw new BookException("raw_bible_find", ex);
493         }
494 
495         return ref;
496     }
497 
498     /**
499      * Retrieval: Get a list of the words used by this Version. This is
500      * not vital for normal display, however it is very useful for various
501      * things, not least of which is new Version generation. However if
502      * you are only looking to <i>display</i> from this Bible then you
503      * could skip this one.
504      * @return The references to the word
505      */
506     public Enumeration listWords() throws BookException
507     {
508         if (!started) init();
509         return word_items.getEnumeration();
510     }
511 
512     /**
513      * Write the XML to disk. Now this code limits us to only having para
514      * marks at the start of verses, and in the NIV there are marks in the
515      * middle of verses. However all the data sources that I have (ex-OLB)
516      * have the same limitation so I'm not to bothered just yet.
517      * @param verse The verse to write
518      * @param text The data to write
519      */
520     public void setDocument(BibleEle doc) throws BookException
521     {
522         if (!started) init();
523 
524         // For all of the sections
525         for (Enumeration sen=doc.getSectionEles(); sen.hasMoreElements(); )
526         {
527             SectionEle section = (SectionEle) sen.nextElement();
528 
529             // For all of the Verses in the section
530             for (Enumeration ven=section.getRefEles(); ven.hasMoreElements(); )
531             {
532                 RefEle vel = (RefEle) ven.nextElement();
533 
534                 Verse verse = vel.getVerse();
535                 String text = vel.getPlainText();
536 
537                 // Is this verse part of a new paragraph?
538                 boolean para = BookUtil.isNewPara(doc);
539                 para_insts.setPara(para, verse);
540 
541                 // Chop the sentance into words.
542                 String[] text_array = BookUtil.tokenize(text);
543 
544                 // The word index
545                 String[] word_array = BookUtil.stripPunctuation(text_array);
546                 int[] word_indexes = word_items.getIndex(word_array);
547                 word_insts.setIndexes(word_indexes, verse);
548 
549                 // The punctuation index
550                 String[] punc_array = BookUtil.stripWords(text_array);
551                 int[] punc_indexes = punc_items.getIndex(punc_array);
552                 punc_insts.setIndexes(punc_indexes, verse);
553 
554                 // The case index
555                 int[] case_indexes = BookUtil.getCases(word_array);
556                 case_insts.setIndexes(case_indexes, verse);
557             }
558         }
559     }
560 
561     /**
562      * Save a list of found words. This has been dome already
563      * @param word The word to write
564      * @param ref The data to write
565      */
566     public void foundPassage(String word, Passage ref) throws BookException
567     {
568     }
569 
570     /**
571      * Flush the data written to disk
572      */
573     public void flush() throws BookException
574     {
575         if (!started) init();
576 
577         try
578         {
579             word_items.save();
580             word_insts.save();
581 
582             punc_items.save();
583             punc_insts.save();
584 
585             case_insts.save();
586             para_insts.save();
587 
588             // generateSearchCache();
589 
590             Properties prop = new Properties();
591             prop.put("Version", getVersion().getFullName());
592             URL prop_url = NetUtil.lengthenURL(dir, "bible.properties");
593             OutputStream prop_out = NetUtil.getOutputStream(prop_url);
594             PropertiesUtil.save(prop, prop_out, "RawBible Config");
595         }
596         catch (IOException ex)
597         {
598             throw new BookException("raw_bible_flush", ex);
599         }
600     }
601 
602     /**
603      * The directory that holds the RawBible files
604      * @return The index file directory
605      */
606     public URL getBaseURL()
607     {
608         return dir;
609     }
610 
611     /**
612      * Accessor for the list of Words. For testing only
613      */
614     protected WordItemsMem getWords()
615     {
616         return (WordItemsMem) word_items;
617     }
618 
619     /**
620      * Accessor for the Verse/Words arrays. For testing only
621      */
622     protected WordInstsMem getWordData()
623     {
624         return (WordInstsMem) word_insts;
625     }
626 
627     /**
628      * Part of the Bible interface - Get the text for this reference.
629      * Fetch the Bible text for a single reference from a PassageID and a Bible
630      */
631     protected void append(BibleEle doc, VerseRange range) throws BookException
632     {
633         try
634         {
635             Verse start = range.getStart();
636             Verse end = range.getEnd();
637             int start_id = Books.verseOrdinal(start.getRefArray());
638             int end_id = Books.verseOrdinal(end.getRefArray());
639 
640             SectionEle section = doc.createSectionEle(range.getName(), "AV");
641 
642             Verse[] array = range.toVerseArray();
643             for (int i=0; i<array.length; i++)
644             {
645                 Verse verse = array[i];
646                 String text = getText(new VerseRange(verse));
647                 boolean para = para_insts.getPara(verse);
648 
649                 RefEle ref = section.createRefEle(verse, para);
650                 ref.setPlainText(text);
651             }
652         }
653         catch (NoSuchVerseException ex)
654         {
655             throw new BookException("raw_bible_append", ex);
656         }
657     }
658 
659     /**
660      * Create a cache to speed up searches.
661      */
662     protected void createSearchCache() throws BookException
663     {
664         try
665         {
666             // Create a passage for each word
667             cache = new Passage[word_items.size()];
668             for (int i=0; i<word_items.size(); i++)
669             {
670                 cache[i] = PassageFactory.createPassage();
671             }
672 
673             // For each verse in the Bible
674             for (int ord=1; ord<=Books.versesInBible(); ord++)
675             {
676                 // and each word in the verse
677                 int[] word_items = word_insts.getIndexes(ord);
678                 for (int i=0; i<word_items.length; i++)
679                 {
680                     // add the word to that words passage
681                     cache[word_items[i]].add(new Verse(ord));
682                 }
683             }
684         }
685         catch (NoSuchVerseException ex)
686         {
687             throw new BookException("raw_bible_find", ex);
688         }
689     }
690 
691     /**
692      * Create a cache to speed up searches.
693      */
694     protected void deleteSearchCache() throws BookException
695     {
696         cache = null;
697     }
698 
699     /**
700      * Find a list of words that start with the given word
701      * @param word The word to search for
702      * @return An array of matches
703      */
704     public String[] getStartsWith(String word) throws BookException
705     {
706         return ((WordItemsMem) word_items).getStartsWith(word);
707     }
708 
709     /** Constant for read-only, data in memory mode */
710     public static final int MODE_READ_MEMORY = 0;
711 
712     /** Constant for read-only, data on disk mode */
713     public static final int MODE_READ_DISK = 1;
714 
715     /** Constant for create mode */
716     public static final int MODE_WRITE = 2;
717 
718     /** The directory that the data files are stored in */
719     private URL dir;
720 
721     /** Are we in create mode? */
722     private boolean create;
723 
724     /** Has init() been called? */
725     private boolean started = false;
726 
727     /** The Source of Words */
728     private Items word_items;
729 
730     /** The Source of Word Instances */
731     private Insts word_insts;
732 
733     /** The source of Punctuation */
734     private Items punc_items;
735 
736     /** The source of Punctuation Instances */
737     private Insts punc_insts;
738 
739     /** The source of Case Instances */
740     private Insts case_insts;
741 
742     /** The source of Para Instances */
743     private ParaInstsMem para_insts;
744 
745     /** The name of this version */
746     private String name;
747 
748     /** The cache of word searches */
749     private Passage[] cache;
750 
751     /** Are we cacheing or in on disk mode */
752     private boolean memory = true;
753 
754     /** The Version of the Bible that this produces */
755     private Version version;
756 
757     /** The log stream */
758     protected static Logger log = Logger.getLogger("bible.book");
759 }