1 package org.apache.lucene.index;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Collection;
23
24 import java.util.List;
25
26 import org.apache.lucene.document.Document;
27 import org.apache.lucene.index.IndexReader.FieldOption;
28 import org.apache.lucene.index.MergePolicy.MergeAbortedException;
29 import org.apache.lucene.store.Directory;
30 import org.apache.lucene.store.IndexInput;
31 import org.apache.lucene.store.IndexOutput;
32
33 /**
 * The SegmentMerger class combines two or more Segments, represented by an IndexReader ({@link #add}),
35 * into a single Segment. After adding the appropriate readers, call the merge method to combine the
36 * segments.
37 *<P>
38 * If the compoundFile flag is set, then the segments will be merged into a compound file.
39 *
40 *
41 * @see #merge
42 * @see #add
43 */
44 final class SegmentMerger {
45
46 /** norms header placeholder */
47 static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1};
48
49 private Directory directory;
50 private String segment;
51 private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
52
53 private List<IndexReader> readers = new ArrayList<IndexReader>();
54 private FieldInfos fieldInfos;
55
56 private int mergedDocs;
57
58 private final CheckAbort checkAbort;
59
60 // Whether we should merge doc stores (stored fields and
61 // vectors files). When all segments we are merging
62 // already share the same doc store files, we don't need
63 // to merge the doc stores.
64 private boolean mergeDocStores;
65
66 /** Maximum number of contiguous documents to bulk-copy
67 when merging stored fields */
68 private final static int MAX_RAW_MERGE_DOCS = 4192;
69
70 /** This ctor used only by test code.
71 *
72 * @param dir The Directory to merge the other segments into
73 * @param name The name of the new segment
74 */
75 SegmentMerger(Directory dir, String name) {
76 directory = dir;
77 segment = name;
78 checkAbort = new CheckAbort(null, null) {
79 @Override
80 public void work(double units) throws MergeAbortedException {
81 // do nothing
82 }
83 };
84 }
85
86 SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) {
87 directory = writer.getDirectory();
88 segment = name;
89 if (merge != null) {
90 checkAbort = new CheckAbort(merge, directory);
91 } else {
92 checkAbort = new CheckAbort(null, null) {
93 @Override
94 public void work(double units) throws MergeAbortedException {
95 // do nothing
96 }
97 };
98 }
99 termIndexInterval = writer.getTermIndexInterval();
100 }
101
102 boolean hasProx() {
103 return fieldInfos.hasProx();
104 }
105
106 /**
107 * Add an IndexReader to the collection of readers that are to be merged
108 * @param reader
109 */
110 final void add(IndexReader reader) {
111 readers.add(reader);
112 }
113
114 /**
115 *
116 * @param i The index of the reader to return
117 * @return The ith reader to be merged
118 */
119 final IndexReader segmentReader(int i) {
120 return readers.get(i);
121 }
122
123 /**
124 * Merges the readers specified by the {@link #add} method into the directory passed to the constructor
125 * @return The number of documents that were merged
126 * @throws CorruptIndexException if the index is corrupt
127 * @throws IOException if there is a low-level IO error
128 */
129 final int merge() throws CorruptIndexException, IOException {
130 return merge(true);
131 }
132
133 /**
134 * Merges the readers specified by the {@link #add} method
135 * into the directory passed to the constructor.
136 * @param mergeDocStores if false, we will not merge the
137 * stored fields nor vectors files
138 * @return The number of documents that were merged
139 * @throws CorruptIndexException if the index is corrupt
140 * @throws IOException if there is a low-level IO error
141 */
142 final int merge(boolean mergeDocStores) throws CorruptIndexException, IOException {
143
144 this.mergeDocStores = mergeDocStores;
145
146 // NOTE: it's important to add calls to
147 // checkAbort.work(...) if you make any changes to this
148 // method that will spend alot of time. The frequency
149 // of this check impacts how long
150 // IndexWriter.close(false) takes to actually stop the
151 // threads.
152
153 mergedDocs = mergeFields();
154 mergeTerms();
155 mergeNorms();
156
157 if (mergeDocStores && fieldInfos.hasVectors())
158 mergeVectors();
159
160 return mergedDocs;
161 }
162
163 /**
164 * close all IndexReaders that have been added.
165 * Should not be called before merge().
166 * @throws IOException
167 */
168 final void closeReaders() throws IOException {
169 for (final IndexReader reader : readers) {
170 reader.close();
171 }
172 }
173
174 final List<String> createCompoundFile(String fileName)
175 throws IOException {
176 CompoundFileWriter cfsWriter =
177 new CompoundFileWriter(directory, fileName, checkAbort);
178
179 List<String> files =
180 new ArrayList<String>(IndexFileNames.COMPOUND_EXTENSIONS.length + 1);
181
182 // Basic files
183 for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
184 String ext = IndexFileNames.COMPOUND_EXTENSIONS[i];
185
186 if (ext.equals(IndexFileNames.PROX_EXTENSION) && !hasProx())
187 continue;
188
189 if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) &&
190 !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION)))
191 files.add(segment + "." + ext);
192 }
193
194 // Fieldable norm files
195 for (int i = 0; i < fieldInfos.size(); i++) {
196 FieldInfo fi = fieldInfos.fieldInfo(i);
197 if (fi.isIndexed && !fi.omitNorms) {
198 files.add(segment + "." + IndexFileNames.NORMS_EXTENSION);
199 break;
200 }
201 }
202
203 // Vector files
204 if (fieldInfos.hasVectors() && mergeDocStores) {
205 for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.length; i++) {
206 files.add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
207 }
208 }
209
210 // Now merge all added files
211 for (String file : files) {
212 cfsWriter.addFile(file);
213 }
214
215 // Perform the merge
216 cfsWriter.close();
217
218 return files;
219 }
220
221 private void addIndexed(IndexReader reader, FieldInfos fInfos,
222 Collection<String> names, boolean storeTermVectors,
223 boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
224 boolean storePayloads, boolean omitTFAndPositions)
225 throws IOException {
226 for (String field : names) {
227 fInfos.add(field, true, storeTermVectors,
228 storePositionWithTermVector, storeOffsetWithTermVector, !reader
229 .hasNorms(field), storePayloads, omitTFAndPositions);
230 }
231 }
232
233 private SegmentReader[] matchingSegmentReaders;
234 private int[] rawDocLengths;
235 private int[] rawDocLengths2;
236
237 private void setMatchingSegmentReaders() {
238 // If the i'th reader is a SegmentReader and has
239 // identical fieldName -> number mapping, then this
240 // array will be non-null at position i:
241 int numReaders = readers.size();
242 matchingSegmentReaders = new SegmentReader[numReaders];
243
244 // If this reader is a SegmentReader, and all of its
245 // field name -> number mappings match the "merged"
246 // FieldInfos, then we can do a bulk copy of the
247 // stored fields:
248 for (int i = 0; i < numReaders; i++) {
249 IndexReader reader = readers.get(i);
250 if (reader instanceof SegmentReader) {
251 SegmentReader segmentReader = (SegmentReader) reader;
252 boolean same = true;
253 FieldInfos segmentFieldInfos = segmentReader.fieldInfos();
254 int numFieldInfos = segmentFieldInfos.size();
255 for (int j = 0; same && j < numFieldInfos; j++) {
256 same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
257 }
258 if (same) {
259 matchingSegmentReaders[i] = segmentReader;
260 }
261 }
262 }
263
264 // Used for bulk-reading raw bytes for stored fields
265 rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
266 rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
267 }
268
269 /**
270 *
271 * @return The number of documents in all of the readers
272 * @throws CorruptIndexException if the index is corrupt
273 * @throws IOException if there is a low-level IO error
274 */
275 private final int mergeFields() throws CorruptIndexException, IOException {
276
277 if (!mergeDocStores) {
278 // When we are not merging by doc stores, their field
279 // name -> number mapping are the same. So, we start
280 // with the fieldInfos of the last segment in this
281 // case, to keep that numbering.
282 final SegmentReader sr = (SegmentReader) readers.get(readers.size()-1);
283 fieldInfos = (FieldInfos) sr.core.fieldInfos.clone();
284 } else {
285 fieldInfos = new FieldInfos(); // merge field names
286 }
287
288 for (IndexReader reader : readers) {
289 if (reader instanceof SegmentReader) {
290 SegmentReader segmentReader = (SegmentReader) reader;
291 FieldInfos readerFieldInfos = segmentReader.fieldInfos();
292 int numReaderFieldInfos = readerFieldInfos.size();
293 for (int j = 0; j < numReaderFieldInfos; j++) {
294 FieldInfo fi = readerFieldInfos.fieldInfo(j);
295 fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector,
296 fi.storePositionWithTermVector, fi.storeOffsetWithTermVector,
297 !reader.hasNorms(fi.name), fi.storePayloads,
298 fi.omitTermFreqAndPositions);
299 }
300 } else {
301 addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
302 addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
303 addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
304 addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false);
305 addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true);
306 addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false);
307 addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false);
308 fieldInfos.add(reader.getFieldNames(FieldOption.UNINDEXED), false);
309 }
310 }
311 fieldInfos.write(directory, segment + ".fnm");
312
313 int docCount = 0;
314
315 setMatchingSegmentReaders();
316
317 if (mergeDocStores) {
318 // merge field values
319 final FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
320
321 try {
322 int idx = 0;
323 for (IndexReader reader : readers) {
324 final SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
325 FieldsReader matchingFieldsReader = null;
326 if (matchingSegmentReader != null) {
327 final FieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
328 if (fieldsReader != null && fieldsReader.canReadRawDocs()) {
329 matchingFieldsReader = fieldsReader;
330 }
331 }
332 if (reader.hasDeletions()) {
333 docCount += copyFieldsWithDeletions(fieldsWriter,
334 reader, matchingFieldsReader);
335 } else {
336 docCount += copyFieldsNoDeletions(fieldsWriter,
337 reader, matchingFieldsReader);
338 }
339 }
340 } finally {
341 fieldsWriter.close();
342 }
343
344 final String fileName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
345 final long fdxFileLength = directory.fileLength(fileName);
346
347 if (4+((long) docCount)*8 != fdxFileLength)
348 // This is most likely a bug in Sun JRE 1.6.0_04/_05;
349 // we detect that the bug has struck, here, and
350 // throw an exception to prevent the corruption from
351 // entering the index. See LUCENE-1282 for
352 // details.
353 throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption");
354
355 } else
356 // If we are skipping the doc stores, that means there
357 // are no deletions in any of these segments, so we
358 // just sum numDocs() of each segment to get total docCount
359 for (final IndexReader reader : readers) {
360 docCount += reader.numDocs();
361 }
362
363 return docCount;
364 }
365
366 private int copyFieldsWithDeletions(final FieldsWriter fieldsWriter, final IndexReader reader,
367 final FieldsReader matchingFieldsReader)
368 throws IOException, MergeAbortedException, CorruptIndexException {
369 int docCount = 0;
370 final int maxDoc = reader.maxDoc();
371 if (matchingFieldsReader != null) {
372 // We can bulk-copy because the fieldInfos are "congruent"
373 for (int j = 0; j < maxDoc;) {
374 if (reader.isDeleted(j)) {
375 // skip deleted docs
376 ++j;
377 continue;
378 }
379 // We can optimize this case (doing a bulk byte copy) since the field
380 // numbers are identical
381 int start = j, numDocs = 0;
382 do {
383 j++;
384 numDocs++;
385 if (j >= maxDoc) break;
386 if (reader.isDeleted(j)) {
387 j++;
388 break;
389 }
390 } while(numDocs < MAX_RAW_MERGE_DOCS);
391
392 IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs);
393 fieldsWriter.addRawDocuments(stream, rawDocLengths, numDocs);
394 docCount += numDocs;
395 checkAbort.work(300 * numDocs);
396 }
397 } else {
398 for (int j = 0; j < maxDoc; j++) {
399 if (reader.isDeleted(j)) {
400 // skip deleted docs
401 continue;
402 }
403 // NOTE: it's very important to first assign to doc then pass it to
404 // termVectorsWriter.addAllDocVectors; see LUCENE-1282
405 Document doc = reader.document(j);
406 fieldsWriter.addDocument(doc);
407 docCount++;
408 checkAbort.work(300);
409 }
410 }
411 return docCount;
412 }
413
414 private int copyFieldsNoDeletions(final FieldsWriter fieldsWriter, final IndexReader reader,
415 final FieldsReader matchingFieldsReader)
416 throws IOException, MergeAbortedException, CorruptIndexException {
417 final int maxDoc = reader.maxDoc();
418 int docCount = 0;
419 if (matchingFieldsReader != null) {
420 // We can bulk-copy because the fieldInfos are "congruent"
421 while (docCount < maxDoc) {
422 int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
423 IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, docCount, len);
424 fieldsWriter.addRawDocuments(stream, rawDocLengths, len);
425 docCount += len;
426 checkAbort.work(300 * len);
427 }
428 } else {
429 for (; docCount < maxDoc; docCount++) {
430 // NOTE: it's very important to first assign to doc then pass it to
431 // termVectorsWriter.addAllDocVectors; see LUCENE-1282
432 Document doc = reader.document(docCount);
433 fieldsWriter.addDocument(doc);
434 checkAbort.work(300);
435 }
436 }
437 return docCount;
438 }
439
440 /**
441 * Merge the TermVectors from each of the segments into the new one.
442 * @throws IOException
443 */
444 private final void mergeVectors() throws IOException {
445 TermVectorsWriter termVectorsWriter =
446 new TermVectorsWriter(directory, segment, fieldInfos);
447
448 try {
449 int idx = 0;
450 for (final IndexReader reader : readers) {
451 final SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
452 TermVectorsReader matchingVectorsReader = null;
453 if (matchingSegmentReader != null) {
454 TermVectorsReader vectorsReader = matchingSegmentReader.getTermVectorsReaderOrig();
455
456 // If the TV* files are an older format then they cannot read raw docs:
457 if (vectorsReader != null && vectorsReader.canReadRawDocs()) {
458 matchingVectorsReader = vectorsReader;
459 }
460 }
461 if (reader.hasDeletions()) {
462 copyVectorsWithDeletions(termVectorsWriter, matchingVectorsReader, reader);
463 } else {
464 copyVectorsNoDeletions(termVectorsWriter, matchingVectorsReader, reader);
465
466 }
467 }
468 } finally {
469 termVectorsWriter.close();
470 }
471
472 final String fileName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
473 final long tvxSize = directory.fileLength(fileName);
474
475 if (4+((long) mergedDocs)*16 != tvxSize)
476 // This is most likely a bug in Sun JRE 1.6.0_04/_05;
477 // we detect that the bug has struck, here, and
478 // throw an exception to prevent the corruption from
479 // entering the index. See LUCENE-1282 for
480 // details.
481 throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption");
482 }
483
484 private void copyVectorsWithDeletions(final TermVectorsWriter termVectorsWriter,
485 final TermVectorsReader matchingVectorsReader,
486 final IndexReader reader)
487 throws IOException, MergeAbortedException {
488 final int maxDoc = reader.maxDoc();
489 if (matchingVectorsReader != null) {
490 // We can bulk-copy because the fieldInfos are "congruent"
491 for (int docNum = 0; docNum < maxDoc;) {
492 if (reader.isDeleted(docNum)) {
493 // skip deleted docs
494 ++docNum;
495 continue;
496 }
497 // We can optimize this case (doing a bulk byte copy) since the field
498 // numbers are identical
499 int start = docNum, numDocs = 0;
500 do {
501 docNum++;
502 numDocs++;
503 if (docNum >= maxDoc) break;
504 if (reader.isDeleted(docNum)) {
505 docNum++;
506 break;
507 }
508 } while(numDocs < MAX_RAW_MERGE_DOCS);
509
510 matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
511 termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
512 checkAbort.work(300 * numDocs);
513 }
514 } else {
515 for (int docNum = 0; docNum < maxDoc; docNum++) {
516 if (reader.isDeleted(docNum)) {
517 // skip deleted docs
518 continue;
519 }
520
521 // NOTE: it's very important to first assign to vectors then pass it to
522 // termVectorsWriter.addAllDocVectors; see LUCENE-1282
523 TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
524 termVectorsWriter.addAllDocVectors(vectors);
525 checkAbort.work(300);
526 }
527 }
528 }
529
530 private void copyVectorsNoDeletions(final TermVectorsWriter termVectorsWriter,
531 final TermVectorsReader matchingVectorsReader,
532 final IndexReader reader)
533 throws IOException, MergeAbortedException {
534 final int maxDoc = reader.maxDoc();
535 if (matchingVectorsReader != null) {
536 // We can bulk-copy because the fieldInfos are "congruent"
537 int docCount = 0;
538 while (docCount < maxDoc) {
539 int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
540 matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, docCount, len);
541 termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
542 docCount += len;
543 checkAbort.work(300 * len);
544 }
545 } else {
546 for (int docNum = 0; docNum < maxDoc; docNum++) {
547 // NOTE: it's very important to first assign to vectors then pass it to
548 // termVectorsWriter.addAllDocVectors; see LUCENE-1282
549 TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
550 termVectorsWriter.addAllDocVectors(vectors);
551 checkAbort.work(300);
552 }
553 }
554 }
555
556 private SegmentMergeQueue queue = null;
557
558 private final void mergeTerms() throws CorruptIndexException, IOException {
559
560 SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval);
561
562 final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
563
564 try {
565 queue = new SegmentMergeQueue(readers.size());
566
567 mergeTermInfos(consumer);
568
569 } finally {
570 consumer.finish();
571 if (queue != null) queue.close();
572 }
573 }
574
575 boolean omitTermFreqAndPositions;
576
577 private final void mergeTermInfos(final FormatPostingsFieldsConsumer consumer) throws CorruptIndexException, IOException {
578 int base = 0;
579 final int readerCount = readers.size();
580 for (int i = 0; i < readerCount; i++) {
581 IndexReader reader = readers.get(i);
582 TermEnum termEnum = reader.terms();
583 SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
584 int[] docMap = smi.getDocMap();
585 if (docMap != null) {
586 if (docMaps == null) {
587 docMaps = new int[readerCount][];
588 delCounts = new int[readerCount];
589 }
590 docMaps[i] = docMap;
591 delCounts[i] = smi.reader.maxDoc() - smi.reader.numDocs();
592 }
593
594 base += reader.numDocs();
595
596 assert reader.numDocs() == reader.maxDoc() - smi.delCount;
597
598 if (smi.next())
599 queue.add(smi); // initialize queue
600 else
601 smi.close();
602 }
603
604 SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
605
606 String currentField = null;
607 FormatPostingsTermsConsumer termsConsumer = null;
608
609 while (queue.size() > 0) {
610 int matchSize = 0; // pop matching terms
611 match[matchSize++] = queue.pop();
612 Term term = match[0].term;
613 SegmentMergeInfo top = queue.top();
614
615 while (top != null && term.compareTo(top.term) == 0) {
616 match[matchSize++] = queue.pop();
617 top = queue.top();
618 }
619
620 if (currentField != term.field) {
621 currentField = term.field;
622 if (termsConsumer != null)
623 termsConsumer.finish();
624 final FieldInfo fieldInfo = fieldInfos.fieldInfo(currentField);
625 termsConsumer = consumer.addField(fieldInfo);
626 omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions;
627 }
628
629 int df = appendPostings(termsConsumer, match, matchSize); // add new TermInfo
630
631 checkAbort.work(df/3.0);
632
633 while (matchSize > 0) {
634 SegmentMergeInfo smi = match[--matchSize];
635 if (smi.next())
636 queue.add(smi); // restore queue
637 else
638 smi.close(); // done with a segment
639 }
640 }
641 }
642
643 private byte[] payloadBuffer;
644 private int[][] docMaps;
645 int[][] getDocMaps() {
646 return docMaps;
647 }
648 private int[] delCounts;
649 int[] getDelCounts() {
650 return delCounts;
651 }
652
653 /** Process postings from multiple segments all positioned on the
654 * same term. Writes out merged entries into freqOutput and
655 * the proxOutput streams.
656 *
657 * @param smis array of segments
658 * @param n number of cells in the array actually occupied
659 * @return number of documents across all segments where this term was found
660 * @throws CorruptIndexException if the index is corrupt
661 * @throws IOException if there is a low-level IO error
662 */
663 private final int appendPostings(final FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
664 throws CorruptIndexException, IOException {
665
666 final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(smis[0].term.text);
667 int df = 0;
668 for (int i = 0; i < n; i++) {
669 SegmentMergeInfo smi = smis[i];
670 TermPositions postings = smi.getPositions();
671 assert postings != null;
672 int base = smi.base;
673 int[] docMap = smi.getDocMap();
674 postings.seek(smi.termEnum);
675
676 while (postings.next()) {
677 df++;
678 int doc = postings.doc();
679 if (docMap != null)
680 doc = docMap[doc]; // map around deletions
681 doc += base; // convert to merged space
682
683 final int freq = postings.freq();
684 final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(doc, freq);
685
686 if (!omitTermFreqAndPositions) {
687 for (int j = 0; j < freq; j++) {
688 final int position = postings.nextPosition();
689 final int payloadLength = postings.getPayloadLength();
690 if (payloadLength > 0) {
691 if (payloadBuffer == null || payloadBuffer.length < payloadLength)
692 payloadBuffer = new byte[payloadLength];
693 postings.getPayload(payloadBuffer, 0);
694 }
695 posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
696 }
697 posConsumer.finish();
698 }
699 }
700 }
701 docConsumer.finish();
702
703 return df;
704 }
705
706 private void mergeNorms() throws IOException {
707 byte[] normBuffer = null;
708 IndexOutput output = null;
709 try {
710 int numFieldInfos = fieldInfos.size();
711 for (int i = 0; i < numFieldInfos; i++) {
712 FieldInfo fi = fieldInfos.fieldInfo(i);
713 if (fi.isIndexed && !fi.omitNorms) {
714 if (output == null) {
715 output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
716 output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
717 }
718 for ( IndexReader reader : readers) {
719 int maxDoc = reader.maxDoc();
720 if (normBuffer == null || normBuffer.length < maxDoc) {
721 // the buffer is too small for the current segment
722 normBuffer = new byte[maxDoc];
723 }
724 reader.norms(fi.name, normBuffer, 0);
725 if (!reader.hasDeletions()) {
726 //optimized case for segments without deleted docs
727 output.writeBytes(normBuffer, maxDoc);
728 } else {
729 // this segment has deleted docs, so we have to
730 // check for every doc if it is deleted or not
731 for (int k = 0; k < maxDoc; k++) {
732 if (!reader.isDeleted(k)) {
733 output.writeByte(normBuffer[k]);
734 }
735 }
736 }
737 checkAbort.work(maxDoc);
738 }
739 }
740 }
741 } finally {
742 if (output != null) {
743 output.close();
744 }
745 }
746 }
747
748 static class CheckAbort {
749 private double workCount;
750 private MergePolicy.OneMerge merge;
751 private Directory dir;
752 public CheckAbort(MergePolicy.OneMerge merge, Directory dir) {
753 this.merge = merge;
754 this.dir = dir;
755 }
756
757 /**
758 * Records the fact that roughly units amount of work
759 * have been done since this method was last called.
760 * When adding time-consuming code into SegmentMerger,
761 * you should test different values for units to ensure
762 * that the time in between calls to merge.checkAborted
763 * is up to ~ 1 second.
764 */
765 public void work(double units) throws MergePolicy.MergeAbortedException {
766 workCount += units;
767 if (workCount >= 10000.0) {
768 merge.checkAborted(dir);
769 workCount = 0;
770 }
771 }
772 }
773
774 }