1 package org.apache.lucene.index;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import org.apache.lucene.analysis.TokenStream;
21 import org.apache.lucene.document;
22 import org.apache.lucene.store.Directory;
23 import org.apache.lucene.store.IndexInput;
24 import org.apache.lucene.store.AlreadyClosedException;
25 import org.apache.lucene.store.BufferedIndexInput;
26 import org.apache.lucene.util.CloseableThreadLocal;
27
28 import java.io.ByteArrayOutputStream;
29 import java.io.IOException;
30 import java.io.Reader;
31 import java.util.zip.DataFormatException;
32 import java.util.zip.Inflater;
33
/**
 * Class responsible for access to stored document fields.
 * <p/>
 * It uses the &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
 *
 * @version $Id: FieldsReader.java 713970 2008-11-14 10:29:11Z mikemccand $
 */
41 final class FieldsReader {
42 private final FieldInfos fieldInfos;
43
44 // The main fieldStream, used only for cloning.
45 private final IndexInput cloneableFieldsStream;
46
47 // This is a clone of cloneableFieldsStream used for reading documents.
48 // It should not be cloned outside of a synchronized context.
49 private final IndexInput fieldsStream;
50
51 private final IndexInput indexStream;
52 private int numTotalDocs;
53 private int size;
54 private boolean closed;
55 private final int format;
56 private final int formatSize;
57
58 // The docID offset where our docs begin in the index
59 // file. This will be 0 if we have our own private file.
60 private int docStoreOffset;
61
62 private CloseableThreadLocal fieldsStreamTL = new CloseableThreadLocal();
63
/**
 * Opens the stored fields of <code>segment</code> using the default read
 * buffer size ({@link BufferedIndexInput#BUFFER_SIZE}).
 */
FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
  this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE);
}

/**
 * Opens the stored fields of <code>segment</code> with an explicit read
 * buffer size. A doc store offset of -1 is passed on, which makes the
 * reader cover the whole index file.
 */
FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {
  this(d, segment, fn, readBufferSize, -1, 0);
}
71
72 FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize, int docStoreOffset, int size) throws IOException {
73 boolean success = false;
74
75 try {
76 fieldInfos = fn;
77
78 cloneableFieldsStream = d.openInput(segment + "." + IndexFileNames.FIELDS_EXTENSION, readBufferSize);
79 indexStream = d.openInput(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION, readBufferSize);
80
81 // First version of fdx did not include a format
82 // header, but, the first int will always be 0 in that
83 // case
84 int firstInt = indexStream.readInt();
85 if (firstInt == 0)
86 format = 0;
87 else
88 format = firstInt;
89
90 if (format > FieldsWriter.FORMAT_CURRENT)
91 throw new CorruptIndexException("Incompatible format version: " + format + " expected "
92 + FieldsWriter.FORMAT_CURRENT + " or lower");
93
94 if (format > FieldsWriter.FORMAT)
95 formatSize = 4;
96 else
97 formatSize = 0;
98
99 if (format < FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
100 cloneableFieldsStream.setModifiedUTF8StringsMode();
101
102 fieldsStream = (IndexInput) cloneableFieldsStream.clone();
103
104 final long indexSize = indexStream.length()-formatSize;
105
106 if (docStoreOffset != -1) {
107 // We read only a slice out of this shared fields file
108 this.docStoreOffset = docStoreOffset;
109 this.size = size;
110
111 // Verify the file is long enough to hold all of our
112 // docs
113 assert ((int) (indexSize / 8)) >= size + this.docStoreOffset: "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset;
114 } else {
115 this.docStoreOffset = 0;
116 this.size = (int) (indexSize >> 3);
117 }
118
119 numTotalDocs = (int) (indexSize >> 3);
120 success = true;
121 } finally {
122 // With lock-less commits, it's entirely possible (and
123 // fine) to hit a FileNotFound exception above. In
124 // this case, we want to explicitly close any subset
125 // of things that were opened so that we don't have to
126 // wait for a GC to do so.
127 if (!success) {
128 close();
129 }
130 }
131 }
132
133 /**
134 * @throws AlreadyClosedException if this FieldsReader is closed
135 */
136 protected final void ensureOpen() throws AlreadyClosedException {
137 if (closed) {
138 throw new AlreadyClosedException("this FieldsReader is closed");
139 }
140 }
141
142 /**
143 * Closes the underlying {@link org.apache.lucene.store.IndexInput} streams, including any ones associated with a
144 * lazy implementation of a Field. This means that the Fields values will not be accessible.
145 *
146 * @throws IOException
147 */
148 final void close() throws IOException {
149 if (!closed) {
150 if (fieldsStream != null) {
151 fieldsStream.close();
152 }
153 if (cloneableFieldsStream != null) {
154 cloneableFieldsStream.close();
155 }
156 if (indexStream != null) {
157 indexStream.close();
158 }
159 fieldsStreamTL.close();
160 closed = true;
161 }
162 }
163
164 final int size() {
165 return size;
166 }
167
168 private final void seekIndex(int docID) throws IOException {
169 indexStream.seek(formatSize + (docID + docStoreOffset) * 8L);
170 }
171
172 boolean canReadRawDocs() {
173 return format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
174 }
175
176 final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
177 seekIndex(n);
178 long position = indexStream.readLong();
179 fieldsStream.seek(position);
180
181 Document doc = new Document();
182 int numFields = fieldsStream.readVInt();
183 for (int i = 0; i < numFields; i++) {
184 int fieldNumber = fieldsStream.readVInt();
185 FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
186 FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
187
188 byte bits = fieldsStream.readByte();
189 assert bits <= FieldsWriter.FIELD_IS_COMPRESSED + FieldsWriter.FIELD_IS_TOKENIZED + FieldsWriter.FIELD_IS_BINARY;
190
191 boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
192 boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
193 boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
194 //TODO: Find an alternative approach here if this list continues to grow beyond the
195 //list of 5 or 6 currently here. See Lucene 762 for discussion
196 if (acceptField.equals(FieldSelectorResult.LOAD)) {
197 addField(doc, fi, binary, compressed, tokenize);
198 }
199 else if (acceptField.equals(FieldSelectorResult.LOAD_FOR_MERGE)) {
200 addFieldForMerge(doc, fi, binary, compressed, tokenize);
201 }
202 else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){
203 addField(doc, fi, binary, compressed, tokenize);
204 break;//Get out of this loop
205 }
206 else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
207 addFieldLazy(doc, fi, binary, compressed, tokenize);
208 }
209 else if (acceptField.equals(FieldSelectorResult.SIZE)){
210 skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));
211 }
212 else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){
213 addFieldSize(doc, fi, binary, compressed);
214 break;
215 }
216 else {
217 skipField(binary, compressed);
218 }
219 }
220
221 return doc;
222 }
223
224 /** Returns the length in bytes of each raw document in a
225 * contiguous range of length numDocs starting with
226 * startDocID. Returns the IndexInput (the fieldStream),
227 * already seeked to the starting point for startDocID.*/
228 final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException {
229 seekIndex(startDocID);
230 long startOffset = indexStream.readLong();
231 long lastOffset = startOffset;
232 int count = 0;
233 while (count < numDocs) {
234 final long offset;
235 final int docID = docStoreOffset + startDocID + count + 1;
236 assert docID <= numTotalDocs;
237 if (docID < numTotalDocs)
238 offset = indexStream.readLong();
239 else
240 offset = fieldsStream.length();
241 lengths[count++] = (int) (offset-lastOffset);
242 lastOffset = offset;
243 }
244
245 fieldsStream.seek(startOffset);
246
247 return fieldsStream;
248 }
249
250 /**
251 * Skip the field. We still have to read some of the information about the field, but can skip past the actual content.
252 * This will have the most payoff on large fields.
253 */
254 private void skipField(boolean binary, boolean compressed) throws IOException {
255 skipField(binary, compressed, fieldsStream.readVInt());
256 }
257
258 private void skipField(boolean binary, boolean compressed, int toRead) throws IOException {
259 if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES || binary || compressed) {
260 fieldsStream.seek(fieldsStream.getFilePointer() + toRead);
261 } else {
262 // We need to skip chars. This will slow us down, but still better
263 fieldsStream.skipChars(toRead);
264 }
265 }
266
267 private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
268 if (binary) {
269 int toRead = fieldsStream.readVInt();
270 long pointer = fieldsStream.getFilePointer();
271 if (compressed) {
272 //was: doc.add(new Fieldable(fi.name, uncompress(b), Fieldable.Store.COMPRESS));
273 doc.add(new LazyField(fi.name, Field.Store.COMPRESS, toRead, pointer, binary));
274 } else {
275 //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
276 doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer, binary));
277 }
278 //Need to move the pointer ahead by toRead positions
279 fieldsStream.seek(pointer + toRead);
280 } else {
281 Field.Store store = Field.Store.YES;
282 Field.Index index = getIndexType(fi, tokenize);
283 Field.TermVector termVector = getTermVectorType(fi);
284
285 Fieldable f;
286 if (compressed) {
287 store = Field.Store.COMPRESS;
288 int toRead = fieldsStream.readVInt();
289 long pointer = fieldsStream.getFilePointer();
290 f = new LazyField(fi.name, store, toRead, pointer, binary);
291 //skip over the part that we aren't loading
292 fieldsStream.seek(pointer + toRead);
293 f.setOmitNorms(fi.omitNorms);
294 } else {
295 int length = fieldsStream.readVInt();
296 long pointer = fieldsStream.getFilePointer();
297 //Skip ahead of where we are by the length of what is stored
298 if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES)
299 fieldsStream.seek(pointer+length);
300 else
301 fieldsStream.skipChars(length);
302 f = new LazyField(fi.name, store, index, termVector, length, pointer, binary);
303 f.setOmitNorms(fi.omitNorms);
304 }
305 doc.add(f);
306 }
307
308 }
309
310 // in merge mode we don't uncompress the data of a compressed field
311 private void addFieldForMerge(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
312 Object data;
313
314 if (binary || compressed) {
315 int toRead = fieldsStream.readVInt();
316 final byte[] b = new byte[toRead];
317 fieldsStream.readBytes(b, 0, b.length);
318 data = b;
319 } else {
320 data = fieldsStream.readString();
321 }
322
323 doc.add(new FieldForMerge(data, fi, binary, compressed, tokenize));
324 }
325
326 private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException {
327
328 //we have a binary stored field, and it may be compressed
329 if (binary) {
330 int toRead = fieldsStream.readVInt();
331 final byte[] b = new byte[toRead];
332 fieldsStream.readBytes(b, 0, b.length);
333 if (compressed)
334 doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS));
335 else
336 doc.add(new Field(fi.name, b, Field.Store.YES));
337
338 } else {
339 Field.Store store = Field.Store.YES;
340 Field.Index index = getIndexType(fi, tokenize);
341 Field.TermVector termVector = getTermVectorType(fi);
342
343 Fieldable f;
344 if (compressed) {
345 store = Field.Store.COMPRESS;
346 int toRead = fieldsStream.readVInt();
347
348 final byte[] b = new byte[toRead];
349 fieldsStream.readBytes(b, 0, b.length);
350 f = new Field(fi.name, // field name
351 new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
352 store,
353 index,
354 termVector);
355 f.setOmitNorms(fi.omitNorms);
356 } else {
357 f = new Field(fi.name, // name
358 fieldsStream.readString(), // read value
359 store,
360 index,
361 termVector);
362 f.setOmitNorms(fi.omitNorms);
363 }
364 doc.add(f);
365 }
366 }
367
368 // Add the size of field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; char = 2 bytes)
369 // Read just the size -- caller must skip the field content to continue reading fields
370 // Return the size in bytes or chars, depending on field type
371 private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException {
372 int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size;
373 byte[] sizebytes = new byte[4];
374 sizebytes[0] = (byte) (bytesize>>>24);
375 sizebytes[1] = (byte) (bytesize>>>16);
376 sizebytes[2] = (byte) (bytesize>>> 8);
377 sizebytes[3] = (byte) bytesize ;
378 doc.add(new Field(fi.name, sizebytes, Field.Store.YES));
379 return size;
380 }
381
382 private Field.TermVector getTermVectorType(FieldInfo fi) {
383 Field.TermVector termVector = null;
384 if (fi.storeTermVector) {
385 if (fi.storeOffsetWithTermVector) {
386 if (fi.storePositionWithTermVector) {
387 termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
388 } else {
389 termVector = Field.TermVector.WITH_OFFSETS;
390 }
391 } else if (fi.storePositionWithTermVector) {
392 termVector = Field.TermVector.WITH_POSITIONS;
393 } else {
394 termVector = Field.TermVector.YES;
395 }
396 } else {
397 termVector = Field.TermVector.NO;
398 }
399 return termVector;
400 }
401
402 private Field.Index getIndexType(FieldInfo fi, boolean tokenize) {
403 Field.Index index;
404 if (fi.isIndexed && tokenize)
405 index = Field.Index.ANALYZED;
406 else if (fi.isIndexed && !tokenize)
407 index = Field.Index.NOT_ANALYZED;
408 else
409 index = Field.Index.NO;
410 return index;
411 }
412
413 /**
414 * A Lazy implementation of Fieldable that differs loading of fields until asked for, instead of when the Document is
415 * loaded.
416 */
417 private class LazyField extends AbstractField implements Fieldable {
418 private int toRead;
419 private long pointer;
420
421 public LazyField(String name, Field.Store store, int toRead, long pointer, boolean isBinary) {
422 super(name, store, Field.Index.NO, Field.TermVector.NO);
423 this.toRead = toRead;
424 this.pointer = pointer;
425 this.isBinary = isBinary;
426 if (isBinary)
427 binaryLength = toRead;
428 lazy = true;
429 }
430
431 public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer, boolean isBinary) {
432 super(name, store, index, termVector);
433 this.toRead = toRead;
434 this.pointer = pointer;
435 this.isBinary = isBinary;
436 if (isBinary)
437 binaryLength = toRead;
438 lazy = true;
439 }
440
441 private IndexInput getFieldStream() {
442 IndexInput localFieldsStream = (IndexInput) fieldsStreamTL.get();
443 if (localFieldsStream == null) {
444 localFieldsStream = (IndexInput) cloneableFieldsStream.clone();
445 fieldsStreamTL.set(localFieldsStream);
446 }
447 return localFieldsStream;
448 }
449
450 /** The value of the field in Binary, or null. If null, the Reader value,
451 * String value, or TokenStream value is used. Exactly one of stringValue(),
452 * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
453 public byte[] binaryValue() {
454 return getBinaryValue(null);
455 }
456
457 /** The value of the field as a Reader, or null. If null, the String value,
458 * binary value, or TokenStream value is used. Exactly one of stringValue(),
459 * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
460 public Reader readerValue() {
461 ensureOpen();
462 return null;
463 }
464
465 /** The value of the field as a TokenStream, or null. If null, the Reader value,
466 * String value, or binary value is used. Exactly one of stringValue(),
467 * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
468 public TokenStream tokenStreamValue() {
469 ensureOpen();
470 return null;
471 }
472
473 /** The value of the field as a String, or null. If null, the Reader value,
474 * binary value, or TokenStream value is used. Exactly one of stringValue(),
475 * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
476 public String stringValue() {
477 ensureOpen();
478 if (isBinary)
479 return null;
480 else {
481 if (fieldsData == null) {
482 IndexInput localFieldsStream = getFieldStream();
483 try {
484 localFieldsStream.seek(pointer);
485 if (isCompressed) {
486 final byte[] b = new byte[toRead];
487 localFieldsStream.readBytes(b, 0, b.length);
488 fieldsData = new String(uncompress(b), "UTF-8");
489 } else {
490 if (format >= FieldsWriter.FORMAT_VERSION_UTF8_LENGTH_IN_BYTES) {
491 byte[] bytes = new byte[toRead];
492 localFieldsStream.readBytes(bytes, 0, toRead);
493 fieldsData = new String(bytes, "UTF-8");
494 } else {
495 //read in chars b/c we already know the length we need to read
496 char[] chars = new char[toRead];
497 localFieldsStream.readChars(chars, 0, toRead);
498 fieldsData = new String(chars);
499 }
500 }
501 } catch (IOException e) {
502 throw new FieldReaderException(e);
503 }
504 }
505 return (String) fieldsData;
506 }
507 }
508
509 public long getPointer() {
510 ensureOpen();
511 return pointer;
512 }
513
514 public void setPointer(long pointer) {
515 ensureOpen();
516 this.pointer = pointer;
517 }
518
519 public int getToRead() {
520 ensureOpen();
521 return toRead;
522 }
523
524 public void setToRead(int toRead) {
525 ensureOpen();
526 this.toRead = toRead;
527 }
528
529 public byte[] getBinaryValue(byte[] result) {
530 ensureOpen();
531
532 if (isBinary) {
533 if (fieldsData == null) {
534 // Allocate new buffer if result is null or too small
535 final byte[] b;
536 if (result == null || result.length < toRead)
537 b = new byte[toRead];
538 else
539 b = result;
540
541 IndexInput localFieldsStream = getFieldStream();
542
543 // Throw this IOException since IndexReader.document does so anyway, so probably not that big of a change for people
544 // since they are already handling this exception when getting the document
545 try {
546 localFieldsStream.seek(pointer);
547 localFieldsStream.readBytes(b, 0, toRead);
548 if (isCompressed == true) {
549 fieldsData = uncompress(b);
550 } else {
551 fieldsData = b;
552 }
553 } catch (IOException e) {
554 throw new FieldReaderException(e);
555 }
556
557 binaryOffset = 0;
558 binaryLength = toRead;
559 }
560
561 return (byte[]) fieldsData;
562 } else
563 return null;
564 }
565 }
566
567 private final byte[] uncompress(final byte[] input)
568 throws CorruptIndexException, IOException {
569
570 // Create an expandable byte array to hold the decompressed data
571 ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);
572
573 Inflater decompressor = new Inflater();
574
575 try {
576 decompressor.setInput(input);
577
578 // Decompress the data
579 byte[] buf = new byte[1024];
580 while (!decompressor.finished()) {
581 try {
582 int count = decompressor.inflate(buf);
583 bos.write(buf, 0, count);
584 }
585 catch (DataFormatException e) {
586 // this will happen if the field is not compressed
587 CorruptIndexException newException = new CorruptIndexException("field data are in wrong format: " + e.toString());
588 newException.initCause(e);
589 throw newException;
590 }
591 }
592 } finally {
593 decompressor.end();
594 }
595
596 // Get the decompressed data
597 return bos.toByteArray();
598 }
599
600 // Instances of this class hold field properties and data
601 // for merge
602 final static class FieldForMerge extends AbstractField {
603 public String stringValue() {
604 return (String) this.fieldsData;
605 }
606
607 public Reader readerValue() {
608 // not needed for merge
609 return null;
610 }
611
612 public byte[] binaryValue() {
613 return (byte[]) this.fieldsData;
614 }
615
616 public TokenStream tokenStreamValue() {
617 // not needed for merge
618 return null;
619 }
620
621 public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
622 this.isStored = true;
623 this.fieldsData = value;
624 this.isCompressed = compressed;
625 this.isBinary = binary;
626 if (binary)
627 binaryLength = ((byte[]) value).length;
628
629 this.isTokenized = tokenize;
630
631 this.name = fi.name.intern();
632 this.isIndexed = fi.isIndexed;
633 this.omitNorms = fi.omitNorms;
634 this.storeOffsetWithTermVector = fi.storeOffsetWithTermVector;
635 this.storePositionWithTermVector = fi.storePositionWithTermVector;
636 this.storeTermVector = fi.storeTermVector;
637 }
638
639 }
640 }