Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » index » [javadoc | source]
    1   package org.apache.lucene.index;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import java.io.IOException;
   21   
   22   import org.apache.lucene.store.Directory;
   23   import org.apache.lucene.store.BufferedIndexInput;
   24   import org.apache.lucene.util.cache.Cache;
   25   import org.apache.lucene.util.cache.SimpleLRUCache;
   26   import org.apache.lucene.util.CloseableThreadLocal;
   27   
   28   /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
   29    * Directory.  Pairs are accessed either by Term or by ordinal position the
   30    * set.  */
   31   
   32   final class TermInfosReader {
   33     private Directory directory;
   34     private String segment;
   35     private FieldInfos fieldInfos;
   36   
   37     private CloseableThreadLocal threadResources = new CloseableThreadLocal();
   38     private SegmentTermEnum origEnum;
   39     private long size;
   40   
   41     private Term[] indexTerms = null;
   42     private TermInfo[] indexInfos;
   43     private long[] indexPointers;
   44     
   45     private SegmentTermEnum indexEnum;
   46     
   47     private int indexDivisor = 1;
   48     private int totalIndexInterval;
   49   
   50     private final static int DEFAULT_CACHE_SIZE = 1024;
   51     
   52     /**
   53      * Per-thread resources managed by ThreadLocal
   54      */
   55     private static final class ThreadResources {
   56       SegmentTermEnum termEnum;
   57       
   58       // Used for caching the least recently looked-up Terms
   59       Cache termInfoCache;
   60     }
   61     
   62     TermInfosReader(Directory dir, String seg, FieldInfos fis)
   63          throws CorruptIndexException, IOException {
   64       this(dir, seg, fis, BufferedIndexInput.BUFFER_SIZE);
   65     }
   66   
   67     TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize)
   68          throws CorruptIndexException, IOException {
   69       boolean success = false;
   70   
   71       try {
   72         directory = dir;
   73         segment = seg;
   74         fieldInfos = fis;
   75   
   76         origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION,
   77             readBufferSize), fieldInfos, false);
   78         size = origEnum.size;
   79         totalIndexInterval = origEnum.indexInterval;
   80   
   81         indexEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION,
   82             readBufferSize), fieldInfos, true);
   83   
   84         success = true;
   85       } finally {
   86         // With lock-less commits, it's entirely possible (and
   87         // fine) to hit a FileNotFound exception above. In
   88         // this case, we want to explicitly close any subset
   89         // of things that were opened so that we don't have to
   90         // wait for a GC to do so.
   91         if (!success) {
   92           close();
   93         }
   94       }
   95     }
   96   
   97     public int getSkipInterval() {
   98       return origEnum.skipInterval;
   99     }
  100     
  101     public int getMaxSkipLevels() {
  102       return origEnum.maxSkipLevels;
  103     }
  104   
  105     /**
  106      * <p>Sets the indexDivisor, which subsamples the number
  107      * of indexed terms loaded into memory.  This has a
  108      * similar effect as {@link
  109      * IndexWriter#setTermIndexInterval} except that setting
  110      * must be done at indexing time while this setting can be
  111      * set per reader.  When set to N, then one in every
  112      * N*termIndexInterval terms in the index is loaded into
  113      * memory.  By setting this to a value > 1 you can reduce
  114      * memory usage, at the expense of higher latency when
  115      * loading a TermInfo.  The default value is 1.</p>
  116      *
  117      * <b>NOTE:</b> you must call this before the term
  118      * index is loaded.  If the index is already loaded,
  119      * an IllegalStateException is thrown.
  120      *
  121      + @throws IllegalStateException if the term index has
  122      * already been loaded into memory.
  123      */
  124     public void setIndexDivisor(int indexDivisor) throws IllegalStateException {
  125       if (indexDivisor < 1)
  126         throw new IllegalArgumentException("indexDivisor must be > 0: got " + indexDivisor);
  127   
  128       if (indexTerms != null)
  129         throw new IllegalStateException("index terms are already loaded");
  130   
  131       this.indexDivisor = indexDivisor;
  132       totalIndexInterval = origEnum.indexInterval * indexDivisor;
  133     }
  134   
  135     /** Returns the indexDivisor.
  136      * @see #setIndexDivisor
  137      */
  138     public int getIndexDivisor() {
  139       return indexDivisor;
  140     }
  141     
  142     final void close() throws IOException {
  143       if (origEnum != null)
  144         origEnum.close();
  145       if (indexEnum != null)
  146         indexEnum.close();
  147       threadResources.close();
  148     }
  149   
  150     /** Returns the number of term/value pairs in the set. */
  151     final long size() {
  152       return size;
  153     }
  154   
  155     private ThreadResources getThreadResources() {
  156       ThreadResources resources = (ThreadResources)threadResources.get();
  157       if (resources == null) {
  158         resources = new ThreadResources();
  159         resources.termEnum = terms();
  160         // Cache does not have to be thread-safe, it is only used by one thread at the same time
  161         resources.termInfoCache = new SimpleLRUCache(DEFAULT_CACHE_SIZE);
  162         threadResources.set(resources);
  163       }
  164       return resources;
  165     }
  166   
  167     private synchronized void ensureIndexIsRead() throws IOException {
  168       if (indexTerms != null)                                    // index already read
  169         return;                                                  // do nothing
  170       try {
  171         int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;  // otherwise read index
  172   
  173         indexTerms = new Term[indexSize];
  174         indexInfos = new TermInfo[indexSize];
  175         indexPointers = new long[indexSize];
  176           
  177         for (int i = 0; indexEnum.next(); i++) {
  178           indexTerms[i] = indexEnum.term();
  179           indexInfos[i] = indexEnum.termInfo();
  180           indexPointers[i] = indexEnum.indexPointer;
  181           
  182           for (int j = 1; j < indexDivisor; j++)
  183               if (!indexEnum.next())
  184                   break;
  185         }
  186       } finally {
  187           indexEnum.close();
  188           indexEnum = null;
  189       }
  190     }
  191   
  192     /** Returns the offset of the greatest index entry which is less than or equal to term.*/
  193     private final int getIndexOffset(Term term) {
  194       int lo = 0;					  // binary search indexTerms[]
  195       int hi = indexTerms.length - 1;
  196   
  197       while (hi >= lo) {
  198         int mid = (lo + hi) >>> 1;
  199         int delta = term.compareTo(indexTerms[mid]);
  200         if (delta < 0)
  201   	hi = mid - 1;
  202         else if (delta > 0)
  203   	lo = mid + 1;
  204         else
  205   	return mid;
  206       }
  207       return hi;
  208     }
  209   
  210     private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
  211       enumerator.seek(indexPointers[indexOffset],
  212                      (indexOffset * totalIndexInterval) - 1,
  213                      indexTerms[indexOffset], indexInfos[indexOffset]);
  214     }
  215   
  216     /** Returns the TermInfo for a Term in the set, or null. */
  217     TermInfo get(Term term) throws IOException {
  218       return get(term, true);
  219     }
  220     
  221     /** Returns the TermInfo for a Term in the set, or null. */
  222     private TermInfo get(Term term, boolean useCache) throws IOException {
  223       if (size == 0) return null;
  224   
  225       ensureIndexIsRead();
  226       
  227       TermInfo ti;
  228       ThreadResources resources = getThreadResources();
  229       Cache cache = null;
  230       
  231       if (useCache) {
  232         cache = resources.termInfoCache;
  233         // check the cache first if the term was recently looked up
  234         ti = (TermInfo) cache.get(term);
  235         if (ti != null) {
  236           return ti;
  237         }
  238       }
  239       
  240       // optimize sequential access: first try scanning cached enum w/o seeking
  241       SegmentTermEnum enumerator = resources.termEnum;
  242       if (enumerator.term() != null                 // term is at or past current
  243   	&& ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
  244   	    || term.compareTo(enumerator.term()) >= 0)) {
  245         int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
  246         if (indexTerms.length == enumOffset	  // but before end of block
  247       || term.compareTo(indexTerms[enumOffset]) < 0) {
  248          // no need to seek
  249   
  250           int numScans = enumerator.scanTo(term);
  251           if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
  252             ti = enumerator.termInfo();
  253             if (cache != null && numScans > 1) {
  254               // we only  want to put this TermInfo into the cache if
  255               // scanEnum skipped more than one dictionary entry.
  256               // This prevents RangeQueries or WildcardQueries to 
  257               // wipe out the cache when they iterate over a large numbers
  258               // of terms in order
  259               cache.put(term, ti);
  260             }
  261           } else {
  262             ti = null;
  263           }
  264   
  265           return ti;
  266         }  
  267       }
  268   
  269       // random-access: must seek
  270       seekEnum(enumerator, getIndexOffset(term));
  271       enumerator.scanTo(term);
  272       if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
  273         ti = enumerator.termInfo();
  274         if (cache != null) {
  275           cache.put(term, ti);
  276         }
  277       } else {
  278         ti = null;
  279       }
  280       return ti;
  281     }
  282   
  283     /** Returns the nth term in the set. */
  284     final Term get(int position) throws IOException {
  285       if (size == 0) return null;
  286   
  287       SegmentTermEnum enumerator = getThreadResources().termEnum;
  288       if (enumerator != null && enumerator.term() != null &&
  289           position >= enumerator.position &&
  290   	position < (enumerator.position + totalIndexInterval))
  291         return scanEnum(enumerator, position);      // can avoid seek
  292   
  293       seekEnum(enumerator, position/totalIndexInterval); // must seek
  294       return scanEnum(enumerator, position);
  295     }
  296   
  297     private final Term scanEnum(SegmentTermEnum enumerator, int position) throws IOException {
  298       while(enumerator.position < position)
  299         if (!enumerator.next())
  300   	return null;
  301   
  302       return enumerator.term();
  303     }
  304   
  305     /** Returns the position of a Term in the set or -1. */
  306     final long getPosition(Term term) throws IOException {
  307       if (size == 0) return -1;
  308   
  309       ensureIndexIsRead();
  310       int indexOffset = getIndexOffset(term);
  311       
  312       SegmentTermEnum enumerator = getThreadResources().termEnum;
  313       seekEnum(enumerator, indexOffset);
  314   
  315       while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}
  316   
  317       if (term.compareTo(enumerator.term()) == 0)
  318         return enumerator.position;
  319       else
  320         return -1;
  321     }
  322   
  323     /** Returns an enumeration of all the Terms and TermInfos in the set. */
  324     public SegmentTermEnum terms() {
  325       return (SegmentTermEnum)origEnum.clone();
  326     }
  327   
  328     /** Returns an enumeration of terms starting at or after the named term. */
  329     public SegmentTermEnum terms(Term term) throws IOException {
  330       // don't use the cache in this call because we want to reposition the
  331       // enumeration
  332       get(term, false);
  333       return (SegmentTermEnum)getThreadResources().termEnum.clone();
  334     }
  335   }

Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » index » [javadoc | source]