Source code: com/eireneh/bible/book/raw/RawBible.java
1
2 package com.eireneh.bible.book.raw;
3
4 import java.net.*;
5 import java.io.*;
6 import java.util.*;
7
8 import com.eireneh.util.*;
9 import com.eireneh.bible.book.*;
10 import com.eireneh.bible.passage.*;
11 import com.eireneh.bible.util.*;
12
13 /**
14 * RawBible is a custom Bible. It is designed to be:<ul>
15 * <li>Compact: So that the download time is as small as possible
16 * <li>Divisible: So that a download can be partial, and some text
17 * can be read whilst missing content like styles, notes, or
18 * even word case.
19 * </ul>
20 * <p>As a result of this it can be very slow, or very memory hungry.
21 * I guess that the technology developed here could be useful as a
22 * delivery format, but the timings I am getting from my benchmarks
23 * say "start again".</p>
24 *
25 * <p>There is a question mark over how this format will handle rich
26 * text. The dictionary lookup scheme can be very space efficient
27 * but I'm not sure how to embed strongs numbers with the same
28 * efficiency.</p>
29 *
30 * <p>The algorithm I have implemented here is not perfect. To get a list
31 * of the verses it gets 'wrong' see generate.log.
32 * There are 2 reasons for problems. The RawBible does not take note of
33 * double spaces. And we incorrectly capitalize hyphenated words at the
34 * beginning of sentences.</p>
35 *
36 * <p>This is in part converted from the VB code that I wrote ages ago
37 * that does a similar job.</p>
38 * <pre>
39 * Public Sub WritePassage(sText As String, lPassageID As Long, bLang As Byte, lBibleID As Long)
40 *
41 * Static bItalic As Boolean
42 *
43 * Dim mWordInsts As Collection
44 *
45 * Dim iNext As Long
46 * Dim iTemp As Long
47 * Dim iLast As Long
48 * Dim bDash As Boolean
49 * Dim sWord As String
50 * Dim bThisItalic As Boolean
51 * Dim iStart As Long
52 * Dim iEnd As Long
53 * Dim sNote As String
54 * Dim mNotes As Collection
55 * Dim vNoteStr As Variant
56 * Dim iNumNotes As Long
57 * Dim lWordInstID As Long
58 *
59 * Set mWordInsts = New Collection
60 * iNext = 1
61 * iTemp = 1
62 * iLast = 1
63 * bDash = False
64 * iNumNotes = 1
65 *
66 * ' For each real word in the verse
67 * Do
68 *
69 * ' If this word contains a "{" then it is part of a comment
70 * ' and not a word. We need to strip out sets of comments
71 * Set mNotes = New Collection
72 * Do
73 * ' Decide how long this word is
74 * iNext = InStr(iLast, sText, " ")
75 * iTemp = InStr(iLast, sText, "--")
76 * If iTemp = iLast Then iTemp = 0
77 * If iTemp <> 0 And iTemp < iNext Then
78 * iNext = iTemp
79 * bDash = True
80 * Else
81 * bDash = False
82 * End If
83 *
84 * ' If this is the end add in the rest otherwise just add in this word
85 * If iNext = 0 Then
86 * sWord = Mid$(sText, iLast, Len(sText) - iLast + 1)
87 * Else
88 * sWord = Mid$(sText, iLast, iNext - iLast)
89 * End If
90 *
91 *
92 * ' Strip out the notes
93 * ' If this word is not a comment
94 * iStart = InStr(iLast, sText, "{")
95 * If iStart = 0 Then Exit Do
96 * If iStart > iLast Then Exit Do
97 *
98 * ' Check we have a start and an end
99 * iEnd = InStr(iLast, sText, "}")
100 *
101 * ' Add the note in
102 * sNote = Mid$(sText, iStart + 1, iEnd - iStart - 1)
103 * mNotes.Add sNote
104 *
105 * ' Adjust where we are looking for words
106 * iLast = iEnd + 2
107 * If iLast > Len(sText) Then
108 * iNext = 0
109 * sWord = ""
110 * Exit Do
111 * End If
112 * Loop
113 *
114 * ' Are there any notes to add?
115 * If mNotes.Count <> 0 Then
116 * ' If there is no previous word to add to then create one
117 * If mWordInsts.Count = 0 Then
118 * lWordInstID = WriteWordInst(lPassageID, 1, lBibleID)
119 * SetWordInstItalic lWordInstID, bItalic
120 * mWordInsts.Add lWordInstID
121 * End If
122 *
123 * ' So add the notes to the previous word
124 * For Each vNoteStr In mNotes
125 * sNote = vNoteStr
126 * WriteNote mWordInsts.Item(mWordInsts.Count), iNumNotes, sNote
127 * iNumNotes = iNumNotes + 1
128 * Next
129 * End If
130 * Set mNotes = Nothing
131 *
132 *
133 * ' Italics
134 * ' Do we have a start italic char
135 * If InStr(sWord, "[") Then
136 * bItalic = True
137 * sWord = RemoveChar(sWord, "[")
138 * End If
139 *
140 * ' Remember the state for this letter
141 * bThisItalic = bItalic
142 *
143 * ' do we have an end italic char
144 * If InStr(sWord, "]") Then
145 * bItalic = False
146 * sWord = RemoveChar(sWord, "]")
147 * End If
148 *
149 *
150 * ' Actually add the word in
151 * If sWord <> "" Then
152 * AddWord mWordInsts, sWord, lPassageID, bLang, lBibleID, bThisItalic
153 * End If
154 *
155 *
156 * ' Add one an extra one to the last used only for a Space split
157 * If bDash Then
158 * iLast = iNext
159 * Else
160 * iLast = iNext + 1
161 * End If
162 *
163 * Loop Until iNext = 0
164 * Set mWordInsts = Nothing
165 *
166 * End Sub
167 * </pre>
168 *
169 * <table border='1' cellPadding='3' cellSpacing='0' width="100%">
170 * <tr><td bgColor='white'class='TableRowColor'><font size='-7'>
171 * Distribution Licence:<br />
172 * Project B is free software; you can redistribute it
173 * and/or modify it under the terms of the GNU General Public License,
174 * version 2 as published by the Free Software Foundation.<br />
175 * This program is distributed in the hope that it will be useful,
176 * but WITHOUT ANY WARRANTY; without even the implied warranty of
177 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
178 * General Public License for more details.<br />
179 * The License is available on the internet
180 * <a href='http://www.gnu.org/copyleft/gpl.html'>here</a>, by writing to
181 * <i>Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
182 * MA 02111-1307, USA</i>, Or locally at the Licence link below.<br />
183 * The copyright to this program is held by its authors.
184 * </font></td></tr></table>
185 * @see <a href='http://www.eireneh.com/servlets/Web'>Project B Home</a>
186 * @see docs.Licence
187 * @author Joe Walker
188 * @version D0.I0.T0
189 */
190 public class RawBible extends VersewiseBible
191 {
192 /**
193 * Create a new set of resources based on a URL.
194 */
195 public RawBible(String name, URL url, int mode) throws BookException
196 {
197 this.name = name;
198 this.dir = url;
199
200 setMode(mode);
201
202 if (mode != MODE_WRITE)
203 {
204 try
205 {
206 // The version information
207 URL prop_url = NetUtil.lengthenURL(url, "bible.properties");
208 InputStream prop_in = prop_url.openStream();
209 Properties prop = new Properties();
210 PropertiesUtil.load(prop, prop_in);
211 String version_name = prop.getProperty("Version");
212 version = VersionFactory.getVersion(version_name);
213 }
214 catch (Exception ex)
215 {
216 throw new BookException("raw_init", ex);
217 }
218 }
219
220 log.fine("Started RawBible url="+url+ " name="+name+" mode="+mode);
221 }
222
223 /**
224 * Does this Bible cache everything in memory or leave it on disk and
225 * then read it at query time.
226 * @return True if we are cacheing data in memory
227 */
228 public int getMode()
229 {
230 if (create)
231 {
232 return MODE_WRITE;
233 }
234 else
235 {
236 if (memory) return MODE_READ_MEMORY;
237 else return MODE_READ_DISK;
238 }
239 }
240
241 /**
242 * Does this Bible cache everything in memory or leave it on disk and
243 * then read it at query time. I wonder if this is an over complex and
244 * redundant function? Maybe it is something to simplify at some point.
245 * @param memory True if we are cacheing data in memory
246 */
247 public void setMode(int mode) throws BookException
248 {
249 // Do we need to patch-up if we have started reading.
250 if (started)
251 {
252 if (mode == MODE_WRITE)
253 throw new IllegalStateException("Can't change to write mode once reading has started.");
254
255 try
256 {
257 if (this.memory && mode != MODE_READ_MEMORY)
258 {
259 word_insts = new WordInstsDisk(this, create);
260 }
261 else if (!this.memory && mode == MODE_READ_MEMORY)
262 {
263 word_insts = new WordInstsMem(this, create);
264 }
265 }
266 catch (BookException ex)
267 {
268 throw ex;
269 }
270 catch (Exception ex)
271 {
272 throw new BookException("raw_bible_mode", ex);
273 }
274 }
275
276 // Change the state
277 switch (mode)
278 {
279 case MODE_READ_DISK:
280 create = false;
281 memory = false;
282 break;
283
284 case MODE_READ_MEMORY:
285 create = false;
286 memory = true;
287 break;
288
289 case MODE_WRITE:
290 create = true;
291 memory = false;
292 break;
293
294 default:
295 throw new IllegalArgumentException("Mode must be one of MODE_[WRITE|READ_[DISK|MEMORY]]");
296 }
297 }
298
299 /**
300 * Lazy initialization
301 */
302 public void init() throws BookException
303 {
304 if (started) return;
305 started = true;
306
307 // Without these we can't go on
308 try
309 {
310 word_items = new WordItemsMem(this, create);
311
312 if (memory) word_insts = new WordInstsMem(this, create);
313 else word_insts = new WordInstsDisk(this, create);
314 }
315 catch (Exception ex)
316 {
317 if (ex instanceof BookException) throw (BookException) ex;
318 throw new BookException("Error initializing resource. System error: "+ex);
319 }
320
321 // We can still produce text without these though so they
322 // should not except if the load fails.
323 StringBuffer messages = new StringBuffer();
324
325 if (memory) punc_insts = new PuncInstsMem(this, create, messages);
326 else punc_insts = new PuncInstsDisk(this, create, messages);
327
328 punc_items = new PuncItemsMem(this, create, messages);
329 case_insts = new CaseInstsMem(this, create, messages);
330 para_insts = new ParaInstsMem(this, create, messages);
331
332 // So if any of them have failed to load we have a record of it.
333 // We can carry on work fine, but shouldn't we be telling someone?
334
335 /** @todo work out if we should have this configurable */
336 //createSearchCache();
337 }
338
339 /**
340 * Shut the Bible down.
341 */
342 public void destroy() throws BookException
343 {
344 }
345
346 /**
347 * What driver is controlling this Bible?
348 * @return A BibleDriver relevant to this Bible
349 */
350 public BibleDriver getDriver()
351 {
352 return RawBibleDriver.driver;
353 }
354
355 /**
356 * Meta-Information: What name can I use to get this Bible in a call
357 * to Bibles.getBible(name);
358 * @return The name of this Bible
359 */
360 public String getName()
361 {
362 return name;
363 }
364
365 /**
366 * Meta-Information: What version of the Bible is this?
367 * @return A Version for this Bible
368 */
369 public Version getVersion()
370 {
371 return version;
372 }
373
374 /**
375 * Setup the Version information
376 * @param version The version that this Bible is becoming
377 */
378 public void setVersion(Version version)
379 {
380 this.version = version;
381 }
382
383 /**
384 * Create an String for the specified Verses
385 * @param range The verses to search for
386 * @return The Bible text
387 */
388 public String getText(VerseRange range) throws BookException
389 {
390 if (!started) init();
391
392 StringBuffer retcode = new StringBuffer();
393
394 Verse[] verses = range.toVerseArray();
395 for (int i=0; i<verses.length; i++)
396 {
397 int[] word_idxs = word_insts.getIndexes(verses[i]);
398 int[] case_idxs = case_insts.getIndexes(verses[i]);
399 int[] punc_idxs = punc_insts.getIndexes(verses[i]);
400
401 for (int j=0; j<word_idxs.length; j++)
402 {
403 String punc = null;
404 String word = null;
405
406 try
407 {
408 int punc_idx = punc_idxs[j];
409 int word_idx = word_idxs[j];
410 int case_idx = case_idxs[j];
411
412 punc = punc_items.getItem(punc_idx);
413 word = PassageUtil.setCase(word_items.getItem(word_idx), case_idx);
414 }
415 catch (Exception ex)
416 {
417 Reporter.informUser(this, ex);
418 }
419
420 retcode.append(punc);
421 retcode.append(word);
422 }
423
424 try
425 {
426 if (punc_idxs.length != 0)
427 retcode.append(punc_items.getItem(punc_idxs[punc_idxs.length-1]));
428 }
429 catch (Exception ex)
430 {
431 Reporter.informUser(this, ex);
432 }
433 }
434
435 return retcode.toString().trim();
436 }
437
438 /**
439 * Create an XML document for the specified Verses
440 * @param doc The XML document
441 * @param ref The verses to search for
442 */
443 public void getDocument(BibleEle doc, Passage ref) throws BookException
444 {
445 if (!started) init();
446
447 Enumeration en = ref.rangeElements();
448 while (en.hasMoreElements())
449 {
450 VerseRange range = (VerseRange) en.nextElement();
451 append(doc, range);
452 }
453 }
454
455 /**
456 * For a given word find a list of references to it
457 * @param word The text to search for
458 * @return The references to the word
459 */
460 public Passage findPassage(String word) throws BookException
461 {
462 if (!started) init();
463 if (word == null)
464 return PassageFactory.createPassage();
465
466 int word_idx = word_items.getIndex(word);
467
468 // Are we caching searches?
469 if (cache != null && cache[word_idx] != null)
470 {
471 return cache[word_idx];
472 }
473
474 // Do the real seacrh
475 Passage ref = PassageFactory.createPassage();
476 try
477 {
478 int total = Books.versesInBible();
479
480 for (int ord=1; ord<=total; ord++)
481 {
482 int[] word_items = word_insts.getIndexes(ord);
483 for (int i=0; i<word_items.length; i++)
484 {
485 if (word_items[i] == word_idx)
486 ref.add(new Verse(ord));
487 }
488 }
489 }
490 catch (NoSuchVerseException ex)
491 {
492 throw new BookException("raw_bible_find", ex);
493 }
494
495 return ref;
496 }
497
498 /**
499 * Retrieval: Get a list of the words used by this Version. This is
500 * not vital for normal display, however it is very useful for various
501 * things, not least of which is new Version generation. However if
502 * you are only looking to <i>display</i> from this Bible then you
503 * could skip this one.
504 * @return The references to the word
505 */
506 public Enumeration listWords() throws BookException
507 {
508 if (!started) init();
509 return word_items.getEnumeration();
510 }
511
512 /**
513 * Write the XML to disk. Now this code limits us to only having para
514 * marks at the start of verses, and in the NIV there are marks in the
515 * middle of verses. However all the data sources that I have (ex-OLB)
516 * have the same limitation so I'm not to bothered just yet.
517 * @param verse The verse to write
518 * @param text The data to write
519 */
520 public void setDocument(BibleEle doc) throws BookException
521 {
522 if (!started) init();
523
524 // For all of the sections
525 for (Enumeration sen=doc.getSectionEles(); sen.hasMoreElements(); )
526 {
527 SectionEle section = (SectionEle) sen.nextElement();
528
529 // For all of the Verses in the section
530 for (Enumeration ven=section.getRefEles(); ven.hasMoreElements(); )
531 {
532 RefEle vel = (RefEle) ven.nextElement();
533
534 Verse verse = vel.getVerse();
535 String text = vel.getPlainText();
536
537 // Is this verse part of a new paragraph?
538 boolean para = BookUtil.isNewPara(doc);
539 para_insts.setPara(para, verse);
540
541 // Chop the sentance into words.
542 String[] text_array = BookUtil.tokenize(text);
543
544 // The word index
545 String[] word_array = BookUtil.stripPunctuation(text_array);
546 int[] word_indexes = word_items.getIndex(word_array);
547 word_insts.setIndexes(word_indexes, verse);
548
549 // The punctuation index
550 String[] punc_array = BookUtil.stripWords(text_array);
551 int[] punc_indexes = punc_items.getIndex(punc_array);
552 punc_insts.setIndexes(punc_indexes, verse);
553
554 // The case index
555 int[] case_indexes = BookUtil.getCases(word_array);
556 case_insts.setIndexes(case_indexes, verse);
557 }
558 }
559 }
560
561 /**
562 * Save a list of found words. This has been dome already
563 * @param word The word to write
564 * @param ref The data to write
565 */
566 public void foundPassage(String word, Passage ref) throws BookException
567 {
568 }
569
570 /**
571 * Flush the data written to disk
572 */
573 public void flush() throws BookException
574 {
575 if (!started) init();
576
577 try
578 {
579 word_items.save();
580 word_insts.save();
581
582 punc_items.save();
583 punc_insts.save();
584
585 case_insts.save();
586 para_insts.save();
587
588 // generateSearchCache();
589
590 Properties prop = new Properties();
591 prop.put("Version", getVersion().getFullName());
592 URL prop_url = NetUtil.lengthenURL(dir, "bible.properties");
593 OutputStream prop_out = NetUtil.getOutputStream(prop_url);
594 PropertiesUtil.save(prop, prop_out, "RawBible Config");
595 }
596 catch (IOException ex)
597 {
598 throw new BookException("raw_bible_flush", ex);
599 }
600 }
601
602 /**
603 * The directory that holds the RawBible files
604 * @return The index file directory
605 */
606 public URL getBaseURL()
607 {
608 return dir;
609 }
610
611 /**
612 * Accessor for the list of Words. For testing only
613 */
614 protected WordItemsMem getWords()
615 {
616 return (WordItemsMem) word_items;
617 }
618
619 /**
620 * Accessor for the Verse/Words arrays. For testing only
621 */
622 protected WordInstsMem getWordData()
623 {
624 return (WordInstsMem) word_insts;
625 }
626
627 /**
628 * Part of the Bible interface - Get the text for this reference.
629 * Fetch the Bible text for a single reference from a PassageID and a Bible
630 */
631 protected void append(BibleEle doc, VerseRange range) throws BookException
632 {
633 try
634 {
635 Verse start = range.getStart();
636 Verse end = range.getEnd();
637 int start_id = Books.verseOrdinal(start.getRefArray());
638 int end_id = Books.verseOrdinal(end.getRefArray());
639
640 SectionEle section = doc.createSectionEle(range.getName(), "AV");
641
642 Verse[] array = range.toVerseArray();
643 for (int i=0; i<array.length; i++)
644 {
645 Verse verse = array[i];
646 String text = getText(new VerseRange(verse));
647 boolean para = para_insts.getPara(verse);
648
649 RefEle ref = section.createRefEle(verse, para);
650 ref.setPlainText(text);
651 }
652 }
653 catch (NoSuchVerseException ex)
654 {
655 throw new BookException("raw_bible_append", ex);
656 }
657 }
658
659 /**
660 * Create a cache to speed up searches.
661 */
662 protected void createSearchCache() throws BookException
663 {
664 try
665 {
666 // Create a passage for each word
667 cache = new Passage[word_items.size()];
668 for (int i=0; i<word_items.size(); i++)
669 {
670 cache[i] = PassageFactory.createPassage();
671 }
672
673 // For each verse in the Bible
674 for (int ord=1; ord<=Books.versesInBible(); ord++)
675 {
676 // and each word in the verse
677 int[] word_items = word_insts.getIndexes(ord);
678 for (int i=0; i<word_items.length; i++)
679 {
680 // add the word to that words passage
681 cache[word_items[i]].add(new Verse(ord));
682 }
683 }
684 }
685 catch (NoSuchVerseException ex)
686 {
687 throw new BookException("raw_bible_find", ex);
688 }
689 }
690
691 /**
692 * Create a cache to speed up searches.
693 */
694 protected void deleteSearchCache() throws BookException
695 {
696 cache = null;
697 }
698
699 /**
700 * Find a list of words that start with the given word
701 * @param word The word to search for
702 * @return An array of matches
703 */
704 public String[] getStartsWith(String word) throws BookException
705 {
706 return ((WordItemsMem) word_items).getStartsWith(word);
707 }
708
709 /** Constant for read-only, data in memory mode */
710 public static final int MODE_READ_MEMORY = 0;
711
712 /** Constant for read-only, data on disk mode */
713 public static final int MODE_READ_DISK = 1;
714
715 /** Constant for create mode */
716 public static final int MODE_WRITE = 2;
717
718 /** The directory that the data files are stored in */
719 private URL dir;
720
721 /** Are we in create mode? */
722 private boolean create;
723
724 /** Has init() been called? */
725 private boolean started = false;
726
727 /** The Source of Words */
728 private Items word_items;
729
730 /** The Source of Word Instances */
731 private Insts word_insts;
732
733 /** The source of Punctuation */
734 private Items punc_items;
735
736 /** The source of Punctuation Instances */
737 private Insts punc_insts;
738
739 /** The source of Case Instances */
740 private Insts case_insts;
741
742 /** The source of Para Instances */
743 private ParaInstsMem para_insts;
744
745 /** The name of this version */
746 private String name;
747
748 /** The cache of word searches */
749 private Passage[] cache;
750
751 /** Are we cacheing or in on disk mode */
752 private boolean memory = true;
753
754 /** The Version of the Bible that this produces */
755 private Version version;
756
757 /** The log stream */
758 protected static Logger log = Logger.getLogger("bible.book");
759 }