Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » index » [javadoc | source]
    1   package org.apache.lucene.index;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import java.io.IOException;
   21   import java.util.ArrayList;
   22   import java.util.Collection;
   23   
   24   import java.util.List;
   25   
   26   import org.apache.lucene.document.Document;
   27   import org.apache.lucene.index.IndexReader.FieldOption;
   28   import org.apache.lucene.index.MergePolicy.MergeAbortedException;
   29   import org.apache.lucene.store.Directory;
   30   import org.apache.lucene.store.IndexInput;
   31   import org.apache.lucene.store.IndexOutput;
   32   
   33   /**
   34    * The SegmentMerger class combines two or more Segments, each represented by an IndexReader (see {@link #add}),
   35    * into a single Segment.  After adding the appropriate readers, call the merge method to combine the 
   36    * segments.
   37    *<P> 
   38    * If the compoundFile flag is set, then the segments will be merged into a compound file.
   39    *   
   40    * 
   41    * @see #merge
   42    * @see #add
   43    */
   44   final class SegmentMerger {
   45     
   46     /** norms header placeholder */
          // Written at the start of the norms file by mergeNorms().
   47     static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1}; 
   48     
          // Directory that receives every file this merger writes.
   49     private Directory directory;
          // Name of the merged segment; file names are built as segment + "." + extension.
   50     private String segment;
          // Handed to SegmentWriteState in mergeTerms(); the IndexWriter-based
          // constructor replaces the default with the writer's setting.
   51     private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
   52   
          // Readers registered through add(); all merge steps iterate this list in order.
   53     private List<IndexReader> readers = new ArrayList<IndexReader>();
          // Field infos of the merged segment; assigned by mergeFields().
   54     private FieldInfos fieldInfos;
   55     
          // Result of mergeFields(); stored by merge(boolean) and read by
          // mergeTerms() and mergeVectors().
   56     private int mergedDocs;
   57   
          // Receives work(...) callbacks from the long-running merge loops.
   58     private final CheckAbort checkAbort;
   59   
   60     // Whether we should merge doc stores (stored fields and
   61     // vectors files).  When all segments we are merging
   62     // already share the same doc store files, we don't need
   63     // to merge the doc stores.
   64     private boolean mergeDocStores;
   65   
   66     /** Maximum number of contiguous documents to bulk-copy
   67         when merging stored fields */
          // Also bounds the bulk copy of term vectors and sizes the
          // rawDocLengths / rawDocLengths2 scratch arrays.
          // NOTE(review): 4192 looks like it may have been meant as 4096 — confirm
          // before changing, the value only affects chunk size.
   68     private final static int MAX_RAW_MERGE_DOCS = 4192;
   69   
   /**
    * Constructor used only by test code. No merge is associated with the
    * instance, so progress reports sent to the abort checker are discarded.
    *
    * @param dir The Directory to merge the other segments into
    * @param name The name of the new segment
    */
   SegmentMerger(Directory dir, String name) {
     this.segment = name;
     this.directory = dir;
     // Override work() so the null merge/directory are never dereferenced.
     this.checkAbort = new CheckAbort(null, null) {
       @Override
       public void work(double units) throws MergeAbortedException {
         // do nothing
       }
     };
   }
   85   
   /**
    * Creates a merger that writes into the writer's directory and uses the
    * writer's term index interval.
    *
    * @param writer supplies the target directory and the term index interval
    * @param name The name of the new segment
    * @param merge the merge being executed; when null, abort checks are disabled
    */
   SegmentMerger(IndexWriter writer, String name, MergePolicy.OneMerge merge) {
     this.directory = writer.getDirectory();
     this.segment = name;
     if (merge == null) {
       // Nothing to poll for aborts: swallow all progress reports.
       this.checkAbort = new CheckAbort(null, null) {
         @Override
         public void work(double units) throws MergeAbortedException {
           // do nothing
         }
       };
     } else {
       this.checkAbort = new CheckAbort(merge, directory);
     }
     this.termIndexInterval = writer.getTermIndexInterval();
   }
  101     
  102     boolean hasProx() {
  103       return fieldInfos.hasProx();
  104     }
  105   
  106     /**
  107      * Add an IndexReader to the collection of readers that are to be merged
  108      * @param reader
  109      */
  110     final void add(IndexReader reader) {
  111       readers.add(reader);
  112     }
  113   
  114     /**
  115      * 
  116      * @param i The index of the reader to return
  117      * @return The ith reader to be merged
  118      */
  119     final IndexReader segmentReader(int i) {
  120       return readers.get(i);
  121     }
  122   
  123     /**
  124      * Merges the readers specified by the {@link #add} method into the directory passed to the constructor
  125      * @return The number of documents that were merged
  126      * @throws CorruptIndexException if the index is corrupt
  127      * @throws IOException if there is a low-level IO error
  128      */
  129     final int merge() throws CorruptIndexException, IOException {
  130       return merge(true);
  131     }
  132   
  133     /**
  134      * Merges the readers specified by the {@link #add} method
  135      * into the directory passed to the constructor.
  136      * @param mergeDocStores if false, we will not merge the
  137      * stored fields nor vectors files
  138      * @return The number of documents that were merged
  139      * @throws CorruptIndexException if the index is corrupt
  140      * @throws IOException if there is a low-level IO error
  141      */
  142     final int merge(boolean mergeDocStores) throws CorruptIndexException, IOException {
  143   
  144       this.mergeDocStores = mergeDocStores;
  145       
  146       // NOTE: it's important to add calls to
  147       // checkAbort.work(...) if you make any changes to this
  148       // method that will spend alot of time.  The frequency
  149       // of this check impacts how long
  150       // IndexWriter.close(false) takes to actually stop the
  151       // threads.
  152   
  153       mergedDocs = mergeFields();
  154       mergeTerms();
  155       mergeNorms();
  156   
  157       if (mergeDocStores && fieldInfos.hasVectors())
  158         mergeVectors();
  159   
  160       return mergedDocs;
  161     }
  162   
  163     /**
  164      * close all IndexReaders that have been added.
  165      * Should not be called before merge().
  166      * @throws IOException
  167      */
  168     final void closeReaders() throws IOException {
  169       for (final IndexReader reader : readers) {
  170         reader.close();
  171       }
  172     }
  173   
  174     final List<String> createCompoundFile(String fileName)
  175             throws IOException {
  176       CompoundFileWriter cfsWriter =
  177         new CompoundFileWriter(directory, fileName, checkAbort);
  178   
  179       List<String> files =
  180         new ArrayList<String>(IndexFileNames.COMPOUND_EXTENSIONS.length + 1);    
  181       
  182       // Basic files
  183       for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
  184         String ext = IndexFileNames.COMPOUND_EXTENSIONS[i];
  185   
  186         if (ext.equals(IndexFileNames.PROX_EXTENSION) && !hasProx())
  187           continue;
  188   
  189         if (mergeDocStores || (!ext.equals(IndexFileNames.FIELDS_EXTENSION) &&
  190                               !ext.equals(IndexFileNames.FIELDS_INDEX_EXTENSION)))
  191           files.add(segment + "." + ext);
  192       }
  193   
  194       // Fieldable norm files
  195       for (int i = 0; i < fieldInfos.size(); i++) {
  196         FieldInfo fi = fieldInfos.fieldInfo(i);
  197         if (fi.isIndexed && !fi.omitNorms) {
  198           files.add(segment + "." + IndexFileNames.NORMS_EXTENSION);
  199           break;
  200         }
  201       }
  202   
  203       // Vector files
  204       if (fieldInfos.hasVectors() && mergeDocStores) {
  205         for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.length; i++) {
  206           files.add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
  207         }
  208       }
  209   
  210       // Now merge all added files
  211       for (String file : files) {
  212         cfsWriter.addFile(file);
  213       }
  214       
  215       // Perform the merge
  216       cfsWriter.close();
  217      
  218       return files;
  219     }
  220   
  221     private void addIndexed(IndexReader reader, FieldInfos fInfos,
  222         Collection<String> names, boolean storeTermVectors,
  223         boolean storePositionWithTermVector, boolean storeOffsetWithTermVector,
  224         boolean storePayloads, boolean omitTFAndPositions)
  225         throws IOException {
  226       for (String field : names) {
  227         fInfos.add(field, true, storeTermVectors,
  228             storePositionWithTermVector, storeOffsetWithTermVector, !reader
  229                 .hasNorms(field), storePayloads, omitTFAndPositions);
  230       }
  231     }
  232   
  233     private SegmentReader[] matchingSegmentReaders;
  234     private int[] rawDocLengths;
  235     private int[] rawDocLengths2;
  236   
  237     private void setMatchingSegmentReaders() {
  238       // If the i'th reader is a SegmentReader and has
  239       // identical fieldName -> number mapping, then this
  240       // array will be non-null at position i:
  241       int numReaders = readers.size();
  242       matchingSegmentReaders = new SegmentReader[numReaders];
  243   
  244       // If this reader is a SegmentReader, and all of its
  245       // field name -> number mappings match the "merged"
  246       // FieldInfos, then we can do a bulk copy of the
  247       // stored fields:
  248       for (int i = 0; i < numReaders; i++) {
  249         IndexReader reader = readers.get(i);
  250         if (reader instanceof SegmentReader) {
  251           SegmentReader segmentReader = (SegmentReader) reader;
  252           boolean same = true;
  253           FieldInfos segmentFieldInfos = segmentReader.fieldInfos();
  254           int numFieldInfos = segmentFieldInfos.size();
  255           for (int j = 0; same && j < numFieldInfos; j++) {
  256             same = fieldInfos.fieldName(j).equals(segmentFieldInfos.fieldName(j));
  257           }
  258           if (same) {
  259             matchingSegmentReaders[i] = segmentReader;
  260           }
  261         }
  262       }
  263   
  264       // Used for bulk-reading raw bytes for stored fields
  265       rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
  266       rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
  267     }
  268   
  269     /**
  270      * 
           * Builds the merged FieldInfos from all readers, writes it to the
           * segment's ".fnm" file and, when doc stores are being merged, copies
           * the stored fields of every reader into the new segment.
           *
           * When doc stores are merged the returned count is the number of
           * documents actually copied; otherwise it is the sum of numDocs()
           * over all readers.
           *
  271      * @return The number of documents in all of the readers
  272      * @throws CorruptIndexException if the index is corrupt
  273      * @throws IOException if there is a low-level IO error
  274      */
  275     private final int mergeFields() throws CorruptIndexException, IOException {
  276   
  277       if (!mergeDocStores) {
  278         // When we are not merging by doc stores, their field
  279         // name -> number mapping are the same.  So, we start
  280         // with the fieldInfos of the last segment in this
  281         // case, to keep that numbering.
              // The cast below requires the last reader to be a SegmentReader in
              // this mode; anything else fails with a ClassCastException.
  282         final SegmentReader sr = (SegmentReader) readers.get(readers.size()-1);
  283         fieldInfos = (FieldInfos) sr.core.fieldInfos.clone();
  284       } else {
  285         fieldInfos = new FieldInfos();		  // merge field names
  286       }
  287   
            // Register the fields of every reader in the merged FieldInfos.
  288       for (IndexReader reader : readers) {
  289         if (reader instanceof SegmentReader) {
                // A SegmentReader exposes its FieldInfo objects directly, so each
                // field is added with its own flags; omitNorms is derived from
                // reader.hasNorms(...).
  290           SegmentReader segmentReader = (SegmentReader) reader;
  291           FieldInfos readerFieldInfos = segmentReader.fieldInfos();
  292           int numReaderFieldInfos = readerFieldInfos.size();
  293           for (int j = 0; j < numReaderFieldInfos; j++) {
  294             FieldInfo fi = readerFieldInfos.fieldInfo(j);
  295             fieldInfos.add(fi.name, fi.isIndexed, fi.storeTermVector,
  296                 fi.storePositionWithTermVector, fi.storeOffsetWithTermVector,
  297                 !reader.hasNorms(fi.name), fi.storePayloads,
  298                 fi.omitTermFreqAndPositions);
  299           }
  300         } else {
                // Other readers only report field names per FieldOption, so each
                // option group is added with the flags that option implies.
                // NOTE(review): this presumably relies on FieldInfos.add merging
                // flags for a field that appears in several groups — confirm.
  301           addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
  302           addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
  303           addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
  304           addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false);
  305           addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true);
  306           addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false);
  307           addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, false);
  308           fieldInfos.add(reader.getFieldNames(FieldOption.UNINDEXED), false);
  309         }
  310       }
  311       fieldInfos.write(directory, segment + ".fnm");
  312   
  313       int docCount = 0;
  314   
            // Must run after fieldInfos is complete: it compares each reader's
            // field names against the merged ones.
  315       setMatchingSegmentReaders();
  316   
  317       if (mergeDocStores) {
  318         // merge field values
  319         final FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
  320   
  321         try {
                // idx walks matchingSegmentReaders in step with the readers list.
  322           int idx = 0;
  323           for (IndexReader reader : readers) {
  324             final SegmentReader matchingSegmentReader = matchingSegmentReaders[idx++];
                  // Stays null unless the reader matches and its FieldsReader
                  // reports that it can read raw docs; null selects the
                  // document-by-document copy in the helpers below.
  325             FieldsReader matchingFieldsReader = null;
  326             if (matchingSegmentReader != null) {
  327               final FieldsReader fieldsReader = matchingSegmentReader.getFieldsReader();
  328               if (fieldsReader != null && fieldsReader.canReadRawDocs()) {            
  329                 matchingFieldsReader = fieldsReader;
  330               }
  331             }
  332             if (reader.hasDeletions()) {
  333               docCount += copyFieldsWithDeletions(fieldsWriter,
  334                                                   reader, matchingFieldsReader);
  335             } else {
  336               docCount += copyFieldsNoDeletions(fieldsWriter,
  337                                                 reader, matchingFieldsReader);
  338             }
  339           }
  340         } finally {
  341           fieldsWriter.close();
  342         }
  343   
  344         final String fileName = segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION;
  345         final long fdxFileLength = directory.fileLength(fileName);
  346   
              // Sanity check: the fields index file must be exactly 4 bytes plus
              // 8 bytes per merged document.
  347         if (4+((long) docCount)*8 != fdxFileLength)
  348           // This is most likely a bug in Sun JRE 1.6.0_04/_05;
  349           // we detect that the bug has struck, here, and
  350           // throw an exception to prevent the corruption from
  351           // entering the index.  See LUCENE-1282 for
  352           // details.
  353           throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption");
  354   
            // NOTE: the else below has no braces; its body is the single for
            // statement that follows the comment.
  355       } else
  356         // If we are skipping the doc stores, that means there
  357         // are no deletions in any of these segments, so we
  358         // just sum numDocs() of each segment to get total docCount
  359         for (final IndexReader reader : readers) {
  360           docCount += reader.numDocs();
  361         }
  362   
  363       return docCount;
  364     }
  365   
  366     private int copyFieldsWithDeletions(final FieldsWriter fieldsWriter, final IndexReader reader,
  367                                         final FieldsReader matchingFieldsReader)
  368       throws IOException, MergeAbortedException, CorruptIndexException {
  369       int docCount = 0;
  370       final int maxDoc = reader.maxDoc();
  371       if (matchingFieldsReader != null) {
  372         // We can bulk-copy because the fieldInfos are "congruent"
  373         for (int j = 0; j < maxDoc;) {
  374           if (reader.isDeleted(j)) {
  375             // skip deleted docs
  376             ++j;
  377             continue;
  378           }
  379           // We can optimize this case (doing a bulk byte copy) since the field 
  380           // numbers are identical
  381           int start = j, numDocs = 0;
  382           do {
  383             j++;
  384             numDocs++;
  385             if (j >= maxDoc) break;
  386             if (reader.isDeleted(j)) {
  387               j++;
  388               break;
  389             }
  390           } while(numDocs < MAX_RAW_MERGE_DOCS);
  391           
  392           IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, start, numDocs);
  393           fieldsWriter.addRawDocuments(stream, rawDocLengths, numDocs);
  394           docCount += numDocs;
  395           checkAbort.work(300 * numDocs);
  396         }
  397       } else {
  398         for (int j = 0; j < maxDoc; j++) {
  399           if (reader.isDeleted(j)) {
  400             // skip deleted docs
  401             continue;
  402           }
  403           // NOTE: it's very important to first assign to doc then pass it to
  404           // termVectorsWriter.addAllDocVectors; see LUCENE-1282
  405           Document doc = reader.document(j);
  406           fieldsWriter.addDocument(doc);
  407           docCount++;
  408           checkAbort.work(300);
  409         }
  410       }
  411       return docCount;
  412     }
  413   
  414     private int copyFieldsNoDeletions(final FieldsWriter fieldsWriter, final IndexReader reader,
  415                                       final FieldsReader matchingFieldsReader)
  416       throws IOException, MergeAbortedException, CorruptIndexException {
  417       final int maxDoc = reader.maxDoc();
  418       int docCount = 0;
  419       if (matchingFieldsReader != null) {
  420         // We can bulk-copy because the fieldInfos are "congruent"
  421         while (docCount < maxDoc) {
  422           int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
  423           IndexInput stream = matchingFieldsReader.rawDocs(rawDocLengths, docCount, len);
  424           fieldsWriter.addRawDocuments(stream, rawDocLengths, len);
  425           docCount += len;
  426           checkAbort.work(300 * len);
  427         }
  428       } else {
  429         for (; docCount < maxDoc; docCount++) {
  430           // NOTE: it's very important to first assign to doc then pass it to
  431           // termVectorsWriter.addAllDocVectors; see LUCENE-1282
  432           Document doc = reader.document(docCount);
  433           fieldsWriter.addDocument(doc);
  434           checkAbort.work(300);
  435         }
  436       }
  437       return docCount;
  438     }
  439   
  /**
   * Merges the term vectors of every reader into the new segment and then
   * verifies the size of the resulting vectors index file.
   *
   * @throws IOException if there is a low-level IO error
   * @throws RuntimeException if the vectors index file does not have the
   *         size expected for {@code mergedDocs} documents
   */
  private final void mergeVectors() throws IOException {
    final TermVectorsWriter vectorsWriter =
      new TermVectorsWriter(directory, segment, fieldInfos);

    try {
      final int readerCount = readers.size();
      for (int r = 0; r < readerCount; r++) {
        final IndexReader reader = readers.get(r);
        final SegmentReader matching = matchingSegmentReaders[r];

        // Only a matching reader whose TV* files are in a format that can
        // read raw docs qualifies for the bulk copy.
        TermVectorsReader bulkReader = null;
        if (matching != null) {
          final TermVectorsReader candidate = matching.getTermVectorsReaderOrig();
          if (candidate != null && candidate.canReadRawDocs()) {
            bulkReader = candidate;
          }
        }

        if (!reader.hasDeletions()) {
          copyVectorsNoDeletions(vectorsWriter, bulkReader, reader);
        } else {
          copyVectorsWithDeletions(vectorsWriter, bulkReader, reader);
        }
      }
    } finally {
      vectorsWriter.close();
    }

    final String fileName = segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION;
    final long tvxSize = directory.fileLength(fileName);
    final long expectedSize = 4 + ((long) mergedDocs) * 16;

    if (expectedSize != tvxSize) {
      // This is most likely a bug in Sun JRE 1.6.0_04/_05;
      // we detect that the bug has struck, here, and
      // throw an exception to prevent the corruption from
      // entering the index.  See LUCENE-1282 for
      // details.
      throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + " file=" + fileName + " file exists?=" + directory.fileExists(fileName) + "; now aborting this merge to prevent index corruption");
    }
  }
  483   
  484     private void copyVectorsWithDeletions(final TermVectorsWriter termVectorsWriter,
  485                                           final TermVectorsReader matchingVectorsReader,
  486                                           final IndexReader reader)
  487       throws IOException, MergeAbortedException {
  488       final int maxDoc = reader.maxDoc();
  489       if (matchingVectorsReader != null) {
  490         // We can bulk-copy because the fieldInfos are "congruent"
  491         for (int docNum = 0; docNum < maxDoc;) {
  492           if (reader.isDeleted(docNum)) {
  493             // skip deleted docs
  494             ++docNum;
  495             continue;
  496           }
  497           // We can optimize this case (doing a bulk byte copy) since the field 
  498           // numbers are identical
  499           int start = docNum, numDocs = 0;
  500           do {
  501             docNum++;
  502             numDocs++;
  503             if (docNum >= maxDoc) break;
  504             if (reader.isDeleted(docNum)) {
  505               docNum++;
  506               break;
  507             }
  508           } while(numDocs < MAX_RAW_MERGE_DOCS);
  509           
  510           matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
  511           termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
  512           checkAbort.work(300 * numDocs);
  513         }
  514       } else {
  515         for (int docNum = 0; docNum < maxDoc; docNum++) {
  516           if (reader.isDeleted(docNum)) {
  517             // skip deleted docs
  518             continue;
  519           }
  520           
  521           // NOTE: it's very important to first assign to vectors then pass it to
  522           // termVectorsWriter.addAllDocVectors; see LUCENE-1282
  523           TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
  524           termVectorsWriter.addAllDocVectors(vectors);
  525           checkAbort.work(300);
  526         }
  527       }
  528     }
  529     
  530     private void copyVectorsNoDeletions(final TermVectorsWriter termVectorsWriter,
  531                                         final TermVectorsReader matchingVectorsReader,
  532                                         final IndexReader reader)
  533         throws IOException, MergeAbortedException {
  534       final int maxDoc = reader.maxDoc();
  535       if (matchingVectorsReader != null) {
  536         // We can bulk-copy because the fieldInfos are "congruent"
  537         int docCount = 0;
  538         while (docCount < maxDoc) {
  539           int len = Math.min(MAX_RAW_MERGE_DOCS, maxDoc - docCount);
  540           matchingVectorsReader.rawDocs(rawDocLengths, rawDocLengths2, docCount, len);
  541           termVectorsWriter.addRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, len);
  542           docCount += len;
  543           checkAbort.work(300 * len);
  544         }
  545       } else {
  546         for (int docNum = 0; docNum < maxDoc; docNum++) {
  547           // NOTE: it's very important to first assign to vectors then pass it to
  548           // termVectorsWriter.addAllDocVectors; see LUCENE-1282
  549           TermFreqVector[] vectors = reader.getTermFreqVectors(docNum);
  550           termVectorsWriter.addAllDocVectors(vectors);
  551           checkAbort.work(300);
  552         }
  553       }
  554     }
  555   
  556     private SegmentMergeQueue queue = null;
  557   
  558     private final void mergeTerms() throws CorruptIndexException, IOException {
  559   
  560       SegmentWriteState state = new SegmentWriteState(null, directory, segment, null, mergedDocs, 0, termIndexInterval);
  561   
  562       final FormatPostingsFieldsConsumer consumer = new FormatPostingsFieldsWriter(state, fieldInfos);
  563   
  564       try {
  565         queue = new SegmentMergeQueue(readers.size());
  566   
  567         mergeTermInfos(consumer);
  568   
  569       } finally {
  570         consumer.finish();
  571         if (queue != null) queue.close();
  572       }
  573     }
  574   
          // Flag of the field currently being merged; set by mergeTermInfos()
          // whenever the field changes and read by appendPostings().
  575     boolean omitTermFreqAndPositions;
  576   
          /**
           * Merges the terms of all readers. One SegmentMergeInfo per reader is
           * placed in the queue; the loop then repeatedly pops the top entry
           * together with every other entry positioned on an equal term, writes
           * their combined postings through appendPostings(), and advances each
           * popped entry, re-queueing those that still have terms.
           *
           * As a side effect, docMaps and delCounts are filled in for readers
           * whose SegmentMergeInfo returns a non-null doc map.
           *
           * @param consumer receives one addField() call per distinct field
           * @throws CorruptIndexException if the index is corrupt
           * @throws IOException if there is a low-level IO error
           */
  577     private final void mergeTermInfos(final FormatPostingsFieldsConsumer consumer) throws CorruptIndexException, IOException {
            // base is the doc number offset of the current reader in the merged
            // segment; it grows by numDocs() per reader.
  578       int base = 0;
  579       final int readerCount = readers.size();
  580       for (int i = 0; i < readerCount; i++) {
  581         IndexReader reader = readers.get(i);
  582         TermEnum termEnum = reader.terms();
  583         SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
  584         int[] docMap  = smi.getDocMap();
  585         if (docMap != null) {
                // Allocated lazily: both arrays stay null when no reader
                // returns a doc map.
  586           if (docMaps == null) {
  587             docMaps = new int[readerCount][];
  588             delCounts = new int[readerCount];
  589           }
  590           docMaps[i] = docMap;
  591           delCounts[i] = smi.reader.maxDoc() - smi.reader.numDocs();
  592         }
  593         
  594         base += reader.numDocs();
  595   
  596         assert reader.numDocs() == reader.maxDoc() - smi.delCount;
  597   
              // Readers without any term are closed right away instead of queued.
  598         if (smi.next())
  599           queue.add(smi);				  // initialize queue
  600         else
  601           smi.close();
  602       }
  603   
            // At most one entry per reader can be positioned on the same term.
  604       SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];
  605   
  606       String currentField = null;
  607       FormatPostingsTermsConsumer termsConsumer = null;
  608   
  609       while (queue.size() > 0) {
  610         int matchSize = 0;			  // pop matching terms
  611         match[matchSize++] = queue.pop();
  612         Term term = match[0].term;
  613         SegmentMergeInfo top = queue.top();
  614   
  615         while (top != null && term.compareTo(top.term) == 0) {
  616           match[matchSize++] =  queue.pop();
  617           top =  queue.top();
  618         }
  619   
              // Reference comparison on purpose-looking code.
              // NOTE(review): presumably relies on field names being interned —
              // confirm before replacing with equals().
  620         if (currentField != term.field) {
  621           currentField = term.field;
                // Finish the previous field's consumer before starting the next.
                // The consumer of the last field is not finished in this method.
  622           if (termsConsumer != null)
  623             termsConsumer.finish();
  624           final FieldInfo fieldInfo = fieldInfos.fieldInfo(currentField);
  625           termsConsumer = consumer.addField(fieldInfo);
  626           omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions;
  627         }
  628   
  629         int df = appendPostings(termsConsumer, match, matchSize);		  // add new TermInfo
  630   
              // Report progress proportional to the number of postings written.
  631         checkAbort.work(df/3.0);
  632   
  633         while (matchSize > 0) {
  634           SegmentMergeInfo smi = match[--matchSize];
  635           if (smi.next())
  636             queue.add(smi);			  // restore queue
  637           else
  638             smi.close();				  // done with a segment
  639         }
  640       }
  641     }
  642   
  643     private byte[] payloadBuffer;
  644     private int[][] docMaps;
  645     int[][] getDocMaps() {
  646       return docMaps;
  647     }
  648     private int[] delCounts;
  649     int[] getDelCounts() {
  650       return delCounts;
  651     }
  652   
  653     /** Process postings from multiple segments all positioned on the
  654      *  same term. Writes out merged entries into freqOutput and
  655      *  the proxOutput streams.
           *
           *  In this method all output goes through the consumer chain obtained
           *  from termsConsumer.addTerm(...). Document numbers are first mapped
           *  through the segment's doc map (when it has one) and then shifted by
           *  the segment's base.
  656      *
           * @param termsConsumer consumer of the field the term belongs to
  657      * @param smis array of segments
  658      * @param n number of cells in the array actually occupied
  659      * @return number of documents across all segments where this term was found
  660      * @throws CorruptIndexException if the index is corrupt
  661      * @throws IOException if there is a low-level IO error
  662      */
  663     private final int appendPostings(final FormatPostingsTermsConsumer termsConsumer, SegmentMergeInfo[] smis, int n)
  664           throws CorruptIndexException, IOException {
  665   
            // All entries are positioned on the same term, so the text of the
            // first one is used.
  666       final FormatPostingsDocsConsumer docConsumer = termsConsumer.addTerm(smis[0].term.text);
  667       int df = 0;
  668       for (int i = 0; i < n; i++) {
  669         SegmentMergeInfo smi = smis[i];
  670         TermPositions postings = smi.getPositions();
  671         assert postings != null;
  672         int base = smi.base;
  673         int[] docMap = smi.getDocMap();
  674         postings.seek(smi.termEnum);
  675   
  676         while (postings.next()) {
  677           df++;
  678           int doc = postings.doc();
  679           if (docMap != null)
  680             doc = docMap[doc];                      // map around deletions
  681           doc += base;                              // convert to merged space
  682   
  683           final int freq = postings.freq();
  684           final FormatPostingsPositionsConsumer posConsumer = docConsumer.addDoc(doc, freq);
  685   
                // Positions are copied (and posConsumer finished) only for
                // fields that keep term frequencies and positions.
  686           if (!omitTermFreqAndPositions) {
  687             for (int j = 0; j < freq; j++) {
  688               final int position = postings.nextPosition();
  689               final int payloadLength = postings.getPayloadLength();
  690               if (payloadLength > 0) {
                      // The shared buffer only ever grows; it is reused across
                      // positions, terms and segments.
  691                 if (payloadBuffer == null || payloadBuffer.length < payloadLength)
  692                   payloadBuffer = new byte[payloadLength];
  693                 postings.getPayload(payloadBuffer, 0);
  694               }
                    // With payloadLength == 0 the buffer is passed unchanged
                    // (possibly null) together with a length of 0.
  695               posConsumer.addPosition(position, payloadBuffer, 0, payloadLength);
  696             }
  697             posConsumer.finish();
  698           }
  699         }
  700       }
  701       docConsumer.finish();
  702   
  703       return df;
  704     }
  705   
  706     private void mergeNorms() throws IOException {
  707       byte[] normBuffer = null;
  708       IndexOutput output = null;
  709       try {
  710         int numFieldInfos = fieldInfos.size();
  711         for (int i = 0; i < numFieldInfos; i++) {
  712           FieldInfo fi = fieldInfos.fieldInfo(i);
  713           if (fi.isIndexed && !fi.omitNorms) {
  714             if (output == null) { 
  715               output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
  716               output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
  717             }
  718             for ( IndexReader reader : readers) {
  719               int maxDoc = reader.maxDoc();
  720               if (normBuffer == null || normBuffer.length < maxDoc) {
  721                 // the buffer is too small for the current segment
  722                 normBuffer = new byte[maxDoc];
  723               }
  724               reader.norms(fi.name, normBuffer, 0);
  725               if (!reader.hasDeletions()) {
  726                 //optimized case for segments without deleted docs
  727                 output.writeBytes(normBuffer, maxDoc);
  728               } else {
  729                 // this segment has deleted docs, so we have to
  730                 // check for every doc if it is deleted or not
  731                 for (int k = 0; k < maxDoc; k++) {
  732                   if (!reader.isDeleted(k)) {
  733                     output.writeByte(normBuffer[k]);
  734                   }
  735                 }
  736               }
  737               checkAbort.work(maxDoc);
  738             }
  739           }
  740         }
  741       } finally {
  742         if (output != null) { 
  743           output.close();
  744         }
  745       }
  746     }
  747   
  748     static class CheckAbort {
  749       private double workCount;
  750       private MergePolicy.OneMerge merge;
  751       private Directory dir;
  752       public CheckAbort(MergePolicy.OneMerge merge, Directory dir) {
  753         this.merge = merge;
  754         this.dir = dir;
  755       }
  756   
  757       /**
  758        * Records the fact that roughly units amount of work
  759        * have been done since this method was last called.
  760        * When adding time-consuming code into SegmentMerger,
  761        * you should test different values for units to ensure
  762        * that the time in between calls to merge.checkAborted
  763        * is up to ~ 1 second.
  764        */
  765       public void work(double units) throws MergePolicy.MergeAbortedException {
  766         workCount += units;
  767         if (workCount >= 10000.0) {
  768           merge.checkAborted(dir);
  769           workCount = 0;
  770         }
  771       }
  772     }
  773     
  774   }

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » index » [javadoc | source]