Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » index » [javadoc | source]
    1   package org.apache.lucene.index;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import org.apache.lucene.document.Document;
   21   import org.apache.lucene.document.FieldSelector;
   22   import org.apache.lucene.search.Similarity;
   23   import org.apache.lucene.store.*;
   24   
   25   import java.io.File;
   26   import java.io.FileOutputStream;
   27   import java.io.IOException;
   28   import java.util.Arrays;
   29   import java.util.Collection;
   29   import java.util.Collection;
   30   
   31   /** IndexReader is an abstract class, providing an interface for accessing an
   32    index.  Search of an index is done entirely through this abstract interface,
   33    so that any subclass which implements it is searchable.
   34   
   35    <p> Concrete subclasses of IndexReader are usually constructed with a call to
   36    one of the static <code>open()</code> methods, e.g. {@link #open(String)}.
   37   
   38    <p> For efficiency, in this API documents are often referred to via
   39    <i>document numbers</i>, non-negative integers which each name a unique
   40    document in the index.  These document numbers are ephemeral--they may change
   41    as documents are added to and deleted from an index.  Clients should thus not
   42    rely on a given document having the same number between sessions.
   43   
   44    <p> An IndexReader can be opened on a directory for which an IndexWriter is
   45    opened already, but it cannot be used to delete documents from the index then.
   46   
   47    <p>
   48    <b>NOTE</b>: for backwards API compatibility, several methods are not listed 
   49    as abstract, but have no useful implementations in this base class and 
   50    instead always throw UnsupportedOperationException.  Subclasses are 
   51    strongly encouraged to override these methods, but in many cases may not 
   52    need to.
   53    </p>
   54   
   55    <p>
   56   
   57    <b>NOTE</b>: as of 2.4, it's possible to open a read-only
   58    IndexReader using one of the static open methods that
   59    accepts the boolean readOnly parameter.  Such a reader has
   60    better concurrency as it's not necessary to synchronize on
   61    the isDeleted method.  Currently the default for readOnly
   62    is false, meaning if not specified you will get a
   63    read/write IndexReader.  But in 3.0 this default will
   64    change to true, meaning you must explicitly specify false
   65    if you want to make changes with the resulting IndexReader.
   66    </p>
   67   
   68    @version $Id: IndexReader.java 695510 2008-09-15 15:33:15Z otis $
   69   */
   70   public abstract class IndexReader {
   71   
   72     // NOTE: in 3.0 this will change to true
   73     final static boolean READ_ONLY_DEFAULT = false;
   74   
   75     /**
   76      * Constants describing field properties, for example used for
   77      * {@link IndexReader#getFieldNames(FieldOption)}.
   78      */
   79     public static final class FieldOption {
   80       private String option;
   81       private FieldOption() { }
   82       private FieldOption(String option) {
   83         this.option = option;
   84       }
   85       public String toString() {
   86         return this.option;
   87       }
   88       /** All fields */
   89       public static final FieldOption ALL = new FieldOption ("ALL");
   90       /** All indexed fields */
   91       public static final FieldOption INDEXED = new FieldOption ("INDEXED");
   92       /** All fields that store payloads */
   93       public static final FieldOption STORES_PAYLOADS = new FieldOption ("STORES_PAYLOADS");
   94       /** All fields that omit tf */
   95       public static final FieldOption OMIT_TF = new FieldOption ("OMIT_TF");
   96       /** All fields which are not indexed */
   97       public static final FieldOption UNINDEXED = new FieldOption ("UNINDEXED");
   98       /** All fields which are indexed with termvectors enabled */
   99       public static final FieldOption INDEXED_WITH_TERMVECTOR = new FieldOption ("INDEXED_WITH_TERMVECTOR");
  100       /** All fields which are indexed but don't have termvectors enabled */
  101       public static final FieldOption INDEXED_NO_TERMVECTOR = new FieldOption ("INDEXED_NO_TERMVECTOR");
  102       /** All fields with termvectors enabled. Please note that only standard termvector fields are returned */
  103       public static final FieldOption TERMVECTOR = new FieldOption ("TERMVECTOR");
  104       /** All fields with termvectors with position values enabled */
  105       public static final FieldOption TERMVECTOR_WITH_POSITION = new FieldOption ("TERMVECTOR_WITH_POSITION");
  106       /** All fields with termvectors with offset values enabled */
  107       public static final FieldOption TERMVECTOR_WITH_OFFSET = new FieldOption ("TERMVECTOR_WITH_OFFSET");
  108       /** All fields with termvectors with offset values and position values enabled */
  109       public static final FieldOption TERMVECTOR_WITH_POSITION_OFFSET = new FieldOption ("TERMVECTOR_WITH_POSITION_OFFSET");
  110     }
  111   
  112     private boolean closed; // NOTE(review): not referenced in the visible part of this file -- confirm where it is used
  113     protected boolean hasChanges; // presumably true while this reader holds uncommitted changes -- TODO confirm in subclasses
  114     
  115     private volatile int refCount; // outstanding references: set to 1 by the no-arg constructor, adjusted by incRef()/decRef(); ensureOpen() rejects values <= 0
  116     
  117     // for testing
  118     synchronized int getRefCount() {
  119       return refCount;
  120     }
  121     
  122     /**
  123      * Expert: increments the refCount of this IndexReader
  124      * instance.  RefCounts are used to determine when a
  125      * reader can be closed safely, i.e. as soon as there are
  126      * no more references.  Be sure to always call a
  127      * corresponding {@link #decRef}, in a finally clause;
  128      * otherwise the reader may never be closed.  Note that
  129      * {@link #close} simply calls decRef(), which means that
  130      * the IndexReader will not really be closed until {@link
  131      * #decRef} has been called for all outstanding
  132      * references.
  133      *
  134      * @see #decRef
  135      */
  136     public synchronized void incRef() {
  137       assert refCount > 0;
  138       ensureOpen();
  139       refCount++;
  140     }
  141   
  142     /**
  143      * Expert: decreases the refCount of this IndexReader
  144      * instance.  If the refCount drops to 0, then pending
  145      * changes (if any) are committed to the index and this
  146      * reader is closed.
  147      * 
  148      * @throws IOException in case an IOException occurs in commit() or doClose()
  149      *
  150      * @see #incRef
  151      */
  152     public synchronized void decRef() throws IOException {
  153       assert refCount > 0;
  154       ensureOpen();
  155       if (refCount == 1) {
  156         commit();
  157         doClose();
  158       }
  159       refCount--;
  160     }
  161     
  162     /** Directory handed to the legacy IndexReader(Directory) constructor; the no-arg constructor does not assign it.
  163      * @deprecated will be deleted when IndexReader(Directory) is deleted
  164      * @see #directory()
  165      */
  166     private Directory directory;
  167   
  /**
   * Legacy constructor, kept for backwards compatibility only.
   *
   * <p>
   * Do not use this constructor in new code.  It supports legacy
   * subclasses that did not "own" a specific directory but still had
   * to supply something for the directory() method to return.  New
   * subclasses should delegate to the no arg constructor and implement
   * the directory() method themselves as appropriate.
   *
   * @param directory Directory to be returned by the directory() method
   * @see #directory()
   * @deprecated - use IndexReader()
   */
  protected IndexReader(Directory directory) {
    this();
    this.directory = directory;
  }
  186     
  /** Creates a reader that starts out holding a single reference. */
  protected IndexReader() {
    this.refCount = 1;
  }
  190     
  191     /**
  192      * @throws AlreadyClosedException if this IndexReader is closed
  193      */
  194     protected final void ensureOpen() throws AlreadyClosedException {
  195       if (refCount <= 0) {
  196         throw new AlreadyClosedException("this IndexReader is closed");
  197       }
  198     }
  199   
  200     /** Returns a read/write IndexReader reading the index in an FSDirectory in the named
  201      path.  <b>NOTE</b>: starting in 3.0 this will return a readOnly IndexReader.
  202      * @throws CorruptIndexException if the index is corrupt
  203      * @throws IOException if there is a low-level IO error
  204      * @param path the path to the index directory */
  205     public static IndexReader open(String path) throws CorruptIndexException, IOException {
  206       return open(FSDirectory.getDirectory(path), true, null, null, READ_ONLY_DEFAULT);
  207     }
  208   
  209     /** Returns a read/write IndexReader reading the index in an FSDirectory in the named
  210      * path.  <b>NOTE</b>: starting in 3.0 this will return a readOnly IndexReader.
  211      * @param path the path to the index directory
  212      * @throws CorruptIndexException if the index is corrupt
  213      * @throws IOException if there is a low-level IO error
  214      */
  215     public static IndexReader open(File path) throws CorruptIndexException, IOException {
  216       return open(FSDirectory.getDirectory(path), true, null, null, READ_ONLY_DEFAULT);
  217     }
  218   
  219     /** Returns a read/write IndexReader reading the index in
  220      * the given Directory. <b>NOTE</b>: starting in 3.0 this
  221      * will return a readOnly IndexReader.
  222      * @param directory the index directory
  223      * @throws CorruptIndexException if the index is corrupt
  224      * @throws IOException if there is a low-level IO error
  225      */
  226     public static IndexReader open(final Directory directory) throws CorruptIndexException, IOException {
  227       return open(directory, false, null, null, READ_ONLY_DEFAULT);
  228     }
  229   
  230     /** Returns a read/write or read only IndexReader reading the index in the given Directory.
  231      * @param directory the index directory
  232      * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
  233      * @throws CorruptIndexException if the index is corrupt
  234      * @throws IOException if there is a low-level IO error
  235      */
  236     public static IndexReader open(final Directory directory, boolean readOnly) throws CorruptIndexException, IOException {
  237       return open(directory, false, null, null, readOnly);
  238     }
  239   
  240     /** Expert: returns a read/write IndexReader reading the index in the given
  241      * {@link IndexCommit}.  <b>NOTE</b>: starting in 3.0 this
  242      * will return a readOnly IndexReader.
  243      * @param commit the commit point to open
  244      * @throws CorruptIndexException if the index is corrupt
  245      * @throws IOException if there is a low-level IO error
  246      */
  247     public static IndexReader open(final IndexCommit commit) throws CorruptIndexException, IOException {
  248       return open(commit.getDirectory(), false, null, commit, READ_ONLY_DEFAULT);
  249     }
  250   
  251     /** Expert: returns a read/write IndexReader reading the index in the given
  252      * Directory, with a custom {@link IndexDeletionPolicy}.
  253      * <b>NOTE</b>: starting in 3.0 this will return a
  254      * readOnly IndexReader.
  255      * @param directory the index directory
  256      * @param deletionPolicy a custom deletion policy (only used
  257      *  if you use this reader to perform deletes or to set
  258      *  norms); see {@link IndexWriter} for details.
  259      * @throws CorruptIndexException if the index is corrupt
  260      * @throws IOException if there is a low-level IO error
  261      */
  262     public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, IOException {
  263       return open(directory, false, deletionPolicy, null, READ_ONLY_DEFAULT);
  264     }
  265   
  266     /** Expert: returns a read/write or read only IndexReader reading the index in the given
  267      * Directory, with a custom {@link IndexDeletionPolicy}.
  268      * <b>NOTE</b>: starting in 3.0 this will return a
  269      * readOnly IndexReader.
  270      * @param directory the index directory
  271      * @param deletionPolicy a custom deletion policy (only used
  272      *  if you use this reader to perform deletes or to set
  273      *  norms); see {@link IndexWriter} for details.
  274      * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
  275      * @throws CorruptIndexException if the index is corrupt
  276      * @throws IOException if there is a low-level IO error
  277      */
  278     public static IndexReader open(final Directory directory, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException {
  279       return open(directory, false, deletionPolicy, null, readOnly);
  280     }
  281   
  282     /** Expert: returns a read/write IndexReader reading the index in the given
  283      * Directory, using a specific commit and with a custom
  284      * {@link IndexDeletionPolicy}.  <b>NOTE</b>: starting in
  285      * 3.0 this will return a readOnly IndexReader.
  286      * @param commit the specific {@link IndexCommit} to open;
  287      * see {@link IndexReader#listCommits} to list all commits
  288      * in a directory
  289      * @param deletionPolicy a custom deletion policy (only used
  290      *  if you use this reader to perform deletes or to set
  291      *  norms); see {@link IndexWriter} for details.
  292      * @throws CorruptIndexException if the index is corrupt
  293      * @throws IOException if there is a low-level IO error
  294      */
  295     public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy) throws CorruptIndexException, IOException {
  296       return open(commit.getDirectory(), false, deletionPolicy, commit, READ_ONLY_DEFAULT);
  297     }
  298   
  299     /** Expert: returns a read/write or read only IndexReader reading the index in the given
  300      * Directory, using a specific commit and with a custom {@link IndexDeletionPolicy}.
  301      * @param commit the specific {@link IndexCommit} to open;
  302      * see {@link IndexReader#listCommits} to list all commits
  303      * in a directory
  304      * @param deletionPolicy a custom deletion policy (only used
  305      *  if you use this reader to perform deletes or to set
  306      *  norms); see {@link IndexWriter} for details.
  307      * @param readOnly true if no changes (deletions, norms) will be made with this IndexReader
  308      * @throws CorruptIndexException if the index is corrupt
  309      * @throws IOException if there is a low-level IO error
  310      */
  311     public static IndexReader open(final IndexCommit commit, IndexDeletionPolicy deletionPolicy, boolean readOnly) throws CorruptIndexException, IOException {
  312       return open(commit.getDirectory(), false, deletionPolicy, commit, readOnly);
  313     }
  314   
  315     private static IndexReader open(final Directory directory, final boolean closeDirectory, final IndexDeletionPolicy deletionPolicy, final IndexCommit commit, final boolean readOnly) throws CorruptIndexException, IOException {
  316       return DirectoryIndexReader.open(directory, closeDirectory, deletionPolicy, commit, readOnly);
  317     }
  318   
  319     /**
  320      * Refreshes an IndexReader if the index has changed since this instance 
  321      * was (re)opened. 
  322      * <p>
  323      * Opening an IndexReader is an expensive operation. This method can be used
  324      * to refresh an existing IndexReader to reduce these costs. This method 
  325      * tries to only load segments that have changed or were created after the 
  326      * IndexReader was (re)opened.
  327      * <p>
  328      * If the index has not changed since this instance was (re)opened, then this
  329      * call is a NOOP and returns this instance. Otherwise, a new instance is 
  330      * returned. The old instance is <b>not</b> closed and remains usable.<br>
  331      * <b>Note:</b> The re-opened reader instance and the old instance might share
  332      * the same resources. For this reason no index modification operations 
  333      * (e. g. {@link #deleteDocument(int)}, {@link #setNorm(int, String, byte)}) 
  334      * should be performed using one of the readers until the old reader instance
  335      * is closed. <b>Otherwise, the behavior of the readers is undefined.</b> 
  336      * <p>   
  337      * You can determine whether a reader was actually reopened by comparing the
  338      * old instance with the instance returned by this method: 
  339      * <pre>
  340      * IndexReader reader = ... 
  341      * ...
  342      * IndexReader new = r.reopen();
  343      * if (new != reader) {
  344      *   ...     // reader was reopened
  345      *   reader.close(); 
  346      * }
  347      * reader = new;
  348      * ...
  349      * </pre>
  350      * 
  351      * @throws CorruptIndexException if the index is corrupt
  352      * @throws IOException if there is a low-level IO error
  353      */  
  354     public synchronized IndexReader reopen() throws CorruptIndexException, IOException {
  355       throw new UnsupportedOperationException("This reader does not support reopen().");
  356     }
  357   
  358     /** 
  359      * Returns the directory associated with this index.  The Default 
  360      * implementation returns the directory specified by subclasses when 
  361      * delegating to the IndexReader(Directory) constructor, or throws an 
  362      * UnsupportedOperationException if one was not specified.
  363      * @throws UnsupportedOperationException if no directory
  364      */
  365     public Directory directory() {
  366       ensureOpen();
  367       if (null != directory) {
  368         return directory;
  369       } else {
  370         throw new UnsupportedOperationException("This reader does not support this method.");  
  371       }
  372     }
  373   
  374     /**
  375      * Returns the time the index in the named directory was last modified.
  376      * Do not use this to check whether the reader is still up-to-date, use
  377      * {@link #isCurrent()} instead. 
  378      * @throws CorruptIndexException if the index is corrupt
  379      * @throws IOException if there is a low-level IO error
  380      */
  381     public static long lastModified(String directory) throws CorruptIndexException, IOException {
  382       return lastModified(new File(directory));
  383     }
  384   
  385     /**
  386      * Returns the time the index in the named directory was last modified. 
  387      * Do not use this to check whether the reader is still up-to-date, use
  388      * {@link #isCurrent()} instead. 
  389      * @throws CorruptIndexException if the index is corrupt
  390      * @throws IOException if there is a low-level IO error
  391      */
  392     public static long lastModified(File fileDirectory) throws CorruptIndexException, IOException {
  393       return ((Long) new SegmentInfos.FindSegmentsFile(fileDirectory) {
  394           public Object doBody(String segmentFileName) {
  395             return new Long(FSDirectory.fileModified(fileDirectory, segmentFileName));
  396           }
  397         }.run()).longValue();
  398     }
  399   
  400     /**
  401      * Returns the time the index in the named directory was last modified. 
  402      * Do not use this to check whether the reader is still up-to-date, use
  403      * {@link #isCurrent()} instead. 
  404      * @throws CorruptIndexException if the index is corrupt
  405      * @throws IOException if there is a low-level IO error
  406      */
  407     public static long lastModified(final Directory directory2) throws CorruptIndexException, IOException {
  408       return ((Long) new SegmentInfos.FindSegmentsFile(directory2) {
  409           public Object doBody(String segmentFileName) throws IOException {
  410             return new Long(directory2.fileModified(segmentFileName));
  411           }
  412         }.run()).longValue();
  413     }
  414   
  415     /**
  416      * Reads version number from segments files. The version number is
  417      * initialized with a timestamp and then increased by one for each change of
  418      * the index.
  419      * 
  420      * @param directory where the index resides.
  421      * @return version number.
  422      * @throws CorruptIndexException if the index is corrupt
  423      * @throws IOException if there is a low-level IO error
  424      */
  425     public static long getCurrentVersion(String directory) throws CorruptIndexException, IOException {
  426       return getCurrentVersion(new File(directory));
  427     }
  428   
  429     /**
  430      * Reads version number from segments files. The version number is
  431      * initialized with a timestamp and then increased by one for each change of
  432      * the index.
  433      * 
  434      * @param directory where the index resides.
  435      * @return version number.
  436      * @throws CorruptIndexException if the index is corrupt
  437      * @throws IOException if there is a low-level IO error
  438      */
  439     public static long getCurrentVersion(File directory) throws CorruptIndexException, IOException {
  440       Directory dir = FSDirectory.getDirectory(directory);
  441       long version = getCurrentVersion(dir);
  442       dir.close();
  443       return version;
  444     }
  445   
  446     /**
  447      * Reads version number from segments files. The version number is
  448      * initialized with a timestamp and then increased by one for each change of
  449      * the index.
  450      * 
  451      * @param directory where the index resides.
  452      * @return version number.
  453      * @throws CorruptIndexException if the index is corrupt
  454      * @throws IOException if there is a low-level IO error
  455      */
  456     public static long getCurrentVersion(Directory directory) throws CorruptIndexException, IOException {
  457       return SegmentInfos.readCurrentVersion(directory);
  458     }
  459   
  460     /**
  461      * Version number when this IndexReader was opened. Not implemented in the IndexReader base class.
  462      * @throws UnsupportedOperationException unless overridden in subclass
  463      */
  464     public long getVersion() {
  465       throw new UnsupportedOperationException("This reader does not support this method.");
  466     }
  467   
  468     /**<p>For IndexReader implementations that use
  469      * TermInfosReader to read terms, this sets the
  470      * indexDivisor to subsample the number of indexed terms
  471      * loaded into memory.  This has the same effect as {@link
  472      * IndexWriter#setTermIndexInterval} except that setting
  473      * must be done at indexing time while this setting can be
  474      * set per reader.  When set to N, then one in every
  475      * N*termIndexInterval terms in the index is loaded into
  476      * memory.  By setting this to a value > 1 you can reduce
  477      * memory usage, at the expense of higher latency when
  478      * loading a TermInfo.  The default value is 1.</p>
  479      *
  480      * <b>NOTE:</b> you must call this before the term
  481      * index is loaded.  If the index is already loaded, 
  482      * an IllegalStateException is thrown.
  483      * @throws IllegalStateException if the term index has already been loaded into memory
  484      */
  485     public void setTermInfosIndexDivisor(int indexDivisor) throws IllegalStateException {
  486       throw new UnsupportedOperationException("This reader does not support this method.");
  487     }
  488   
  489     /** <p>For IndexReader implementations that use
  490      *  TermInfosReader to read terms, this returns the
  491      *  current indexDivisor.
  492      *  @see #setTermInfosIndexDivisor */
  493     public int getTermInfosIndexDivisor() {
  494       throw new UnsupportedOperationException("This reader does not support this method.");
  495     }
  496   
  497     /**
  498      * Check whether this IndexReader is still using the
  499      * current (i.e., most recently committed) version of the
  500      * index.  If a writer has committed any changes to the
  501      * index since this reader was opened, this will return
  502      * <code>false</code>, in which case you must open a new
  503      * IndexReader in order to see the changes.  See the
  504      * description of the <a href="IndexWriter.html#autoCommit"><code>autoCommit</code></a>
  505      * flag which controls when the {@link IndexWriter}
  506      * actually commits changes to the index.
  507      * 
  508      * <p>
  509      * Not implemented in the IndexReader base class.
  510      * </p>
  511      * @throws CorruptIndexException if the index is corrupt
  512      * @throws IOException if there is a low-level IO error
  513      * @throws UnsupportedOperationException unless overridden in subclass
  514      */
  515     public boolean isCurrent() throws CorruptIndexException, IOException {
  516       throw new UnsupportedOperationException("This reader does not support this method.");
  517     }
  518   
  519     /**
  520      * Checks is the index is optimized (if it has a single segment and 
  521      * no deletions).  Not implemented in the IndexReader base class.
  522      * @return <code>true</code> if the index is optimized; <code>false</code> otherwise
  523      * @throws UnsupportedOperationException unless overridden in subclass
  524      */
  525     public boolean isOptimized() {
  526       throw new UnsupportedOperationException("This reader does not support this method.");
  527     }
  528     
  529     /**
  530      *  Return an array of term frequency vectors for the specified document.
  531      *  The array contains a vector for each vectorized field in the document.
  532      *  Each vector contains terms and frequencies for all terms in a given vectorized field.
  533      *  If no such fields existed, the method returns null. The term vectors that are
  534      * returned my either be of type TermFreqVector or of type TermPositionsVector if
  535      * positions or offsets have been stored.
  536      * 
  537      * @param docNumber document for which term frequency vectors are returned
  538      * @return array of term frequency vectors. May be null if no term vectors have been
  539      *  stored for the specified document.
  540      * @throws IOException if index cannot be accessed
  541      * @see org.apache.lucene.document.Field.TermVector
  542      */
  543     abstract public TermFreqVector[] getTermFreqVectors(int docNumber)
  544             throws IOException;
  545   
  546   
  547     /**
  548      *  Return a term frequency vector for the specified document and field. The
  549      *  returned vector contains terms and frequencies for the terms in
  550      *  the specified field of this document, if the field had the storeTermVector
  551      *  flag set. If termvectors had been stored with positions or offsets, a 
  552      *  TermPositionsVector is returned.
  553      * 
  554      * @param docNumber document for which the term frequency vector is returned
  555      * @param field field for which the term frequency vector is returned.
  556      * @return term frequency vector May be null if field does not exist in the specified
  557      * document or term vector was not stored.
  558      * @throws IOException if index cannot be accessed
  559      * @see org.apache.lucene.document.Field.TermVector
  560      */
  561     abstract public TermFreqVector getTermFreqVector(int docNumber, String field)
  562             throws IOException;
  563   
  564     /**
  565      * Load the Term Vector into a user-defined data structure instead of relying on the parallel arrays of
  566      * the {@link TermFreqVector}.
  567      * @param docNumber The number of the document to load the vector for
  568      * @param field The name of the field to load
  569      * @param mapper The {@link TermVectorMapper} to process the vector.  Must not be null
  570      * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
  571      * 
  572      */
  573     abstract public void getTermFreqVector(int docNumber, String field, TermVectorMapper mapper) throws IOException;
  574   
  575     /**
  576      * Map all the term vectors for all fields in a Document
  577      * @param docNumber The number of the document to load the vector for
  578      * @param mapper The {@link TermVectorMapper} to process the vector.  Must not be null
  579      * @throws IOException if term vectors cannot be accessed or if they do not exist on the field and doc. specified.
  580      */
  581     abstract public void getTermFreqVector(int docNumber, TermVectorMapper mapper) throws IOException;
  582   
  583     /**
  584      * Returns <code>true</code> if an index exists at the specified directory.
  585      * If the directory does not exist or if there is no index in it.
  586      * <code>false</code> is returned.
  587      * @param  directory the directory to check for an index
  588      * @return <code>true</code> if an index exists; <code>false</code> otherwise
  589      */
  590     public static boolean indexExists(String directory) {
  591       return indexExists(new File(directory));
  592     }
  593   
  594     /**
  595      * Returns <code>true</code> if an index exists at the specified directory.
  596      * If the directory does not exist or if there is no index in it.
  597      * @param  directory the directory to check for an index
  598      * @return <code>true</code> if an index exists; <code>false</code> otherwise
  599      */
  600   
  601     public static boolean indexExists(File directory) {
  602       return SegmentInfos.getCurrentSegmentGeneration(directory.list()) != -1;
  603     }
  604   
  605     /**
  606      * Returns <code>true</code> if an index exists at the specified directory.
  607      * If the directory does not exist or if there is no index in it.
  608      * @param  directory the directory to check for an index
  609      * @return <code>true</code> if an index exists; <code>false</code> otherwise
  610      * @throws IOException if there is a problem with accessing the index
  611      */
  612     public static boolean indexExists(Directory directory) throws IOException {
  613       return SegmentInfos.getCurrentSegmentGeneration(directory) != -1;
  614     }
  615   
  616     /** Returns the number of documents in this index. */
  617     public abstract int numDocs();
  618   
  619     /** Returns one greater than the largest possible document number.
  620      * This may be used to, e.g., determine how big to allocate an array which
  621      * will have an element for every document number in an index.
  622      */
  623     public abstract int maxDoc();
  624   
  625     /** Returns the number of deleted documents. */
  626     public int numDeletedDocs() {
  627       return maxDoc() - numDocs();
  628     }
  629   
  630     /** Returns the stored fields of the <code>n</code><sup>th</sup>
  631      <code>Document</code> in this index.
  632      * @throws CorruptIndexException if the index is corrupt
  633      * @throws IOException if there is a low-level IO error
  634      */
  635     public Document document(int n) throws CorruptIndexException, IOException {
  636       ensureOpen();
  637       return document(n, null);
  638     }
  639   
  640     /**
  641      * Get the {@link org.apache.lucene.document.Document} at the <code>n</code><sup>th</sup> position. The {@link org.apache.lucene.document.FieldSelector}
  642      * may be used to determine what {@link org.apache.lucene.document.Field}s to load and how they should be loaded.
  643      * 
  644      * <b>NOTE:</b> If this Reader (more specifically, the underlying <code>FieldsReader</code>) is closed before the lazy {@link org.apache.lucene.document.Field} is
  645      * loaded an exception may be thrown.  If you want the value of a lazy {@link org.apache.lucene.document.Field} to be available after closing you must
  646      * explicitly load it or fetch the Document again with a new loader.
  647      * 
  648      *  
  649      * @param n Get the document at the <code>n</code><sup>th</sup> position
  650      * @param fieldSelector The {@link org.apache.lucene.document.FieldSelector} to use to determine what Fields should be loaded on the Document.  May be null, in which case all Fields will be loaded.
  651      * @return The stored fields of the {@link org.apache.lucene.document.Document} at the nth position
  652      * @throws CorruptIndexException if the index is corrupt
  653      * @throws IOException if there is a low-level IO error
  654      * 
  655      * @see org.apache.lucene.document.Fieldable
  656      * @see org.apache.lucene.document.FieldSelector
  657      * @see org.apache.lucene.document.SetBasedFieldSelector
  658      * @see org.apache.lucene.document.LoadFirstFieldSelector
  659      */
  660     //When we convert to JDK 1.5 make this Set<String>
  661     public abstract Document document(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException;
  662     
  663     
  664   
  665     /** Returns true if document <i>n</i> has been deleted */
  666     public abstract boolean isDeleted(int n);
  667   
  668     /** Returns true if any documents have been deleted */
  669     public abstract boolean hasDeletions();
  670   
  671     /** Returns true if there are norms stored for this field. */
  672     public boolean hasNorms(String field) throws IOException {
  673       // backward compatible implementation.
  674       // SegmentReader has an efficient implementation.
  675       ensureOpen();
  676       return norms(field) != null;
  677     }
  678   
  679     /** Returns the byte-encoded normalization factor for the named field of
  680      * every document.  This is used by the search code to score documents.
  681      *
  682      * @see org.apache.lucene.document.Field#setBoost(float)
  683      */
  684     public abstract byte[] norms(String field) throws IOException;
  685   
  686     /** Reads the byte-encoded normalization factor for the named field of every
  687      *  document.  This is used by the search code to score documents.
  688      *
  689      * @see org.apache.lucene.document.Field#setBoost(float)
  690      */
  691     public abstract void norms(String field, byte[] bytes, int offset)
  692       throws IOException;
  693   
  694     /** Expert: Resets the normalization factor for the named field of the named
  695      * document.  The norm represents the product of the field's {@link
  696      * org.apache.lucene.document.Fieldable#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
  697      * int) length normalization}.  Thus, to preserve the length normalization
  698      * values when resetting this, one should base the new value upon the old.
  699      *
  700      * @see #norms(String)
  701      * @see Similarity#decodeNorm(byte)
  702      * @throws StaleReaderException if the index has changed
  703      *  since this reader was opened
  704      * @throws CorruptIndexException if the index is corrupt
  705      * @throws LockObtainFailedException if another writer
  706      *  has this index open (<code>write.lock</code> could not
  707      *  be obtained)
  708      * @throws IOException if there is a low-level IO error
  709      */
  710     public synchronized  void setNorm(int doc, String field, byte value)
  711             throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
  712       ensureOpen();
  713       acquireWriteLock();
  714       hasChanges = true;
  715       doSetNorm(doc, field, value);
  716     }
  717   
  718     /** Implements setNorm in subclass.*/
  719     protected abstract void doSetNorm(int doc, String field, byte value)
  720             throws CorruptIndexException, IOException;
  721   
  722     /** Expert: Resets the normalization factor for the named field of the named
  723      * document.
  724      *
  725      * @see #norms(String)
  726      * @see Similarity#decodeNorm(byte)
  727      * 
  728      * @throws StaleReaderException if the index has changed
  729      *  since this reader was opened
  730      * @throws CorruptIndexException if the index is corrupt
  731      * @throws LockObtainFailedException if another writer
  732      *  has this index open (<code>write.lock</code> could not
  733      *  be obtained)
  734      * @throws IOException if there is a low-level IO error
  735      */
  736     public void setNorm(int doc, String field, float value)
  737             throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
  738       ensureOpen();
  739       setNorm(doc, field, Similarity.encodeNorm(value));
  740     }
  741   
  742     /** Returns an enumeration of all the terms in the index. The
  743      * enumeration is ordered by Term.compareTo(). Each term is greater
  744      * than all that precede it in the enumeration. Note that after
  745      * calling terms(), {@link TermEnum#next()} must be called
  746      * on the resulting enumeration before calling other methods such as
  747      * {@link TermEnum#term()}.
  748      * @throws IOException if there is a low-level IO error
  749      */
  750     public abstract TermEnum terms() throws IOException;
  751   
  752     /** Returns an enumeration of all terms starting at a given term. If
  753      * the given term does not exist, the enumeration is positioned at the
  754      * first term greater than the supplied term. The enumeration is
  755      * ordered by Term.compareTo(). Each term is greater than all that
  756      * precede it in the enumeration.
  757      * @throws IOException if there is a low-level IO error
  758      */
  759     public abstract TermEnum terms(Term t) throws IOException;
  760   
  761     /** Returns the number of documents containing the term <code>t</code>.
  762      * @throws IOException if there is a low-level IO error
  763      */
  764     public abstract int docFreq(Term t) throws IOException;
  765   
  766     /** Returns an enumeration of all the documents which contain
  767      * <code>term</code>. For each document, the document number, the frequency of
  768      * the term in that document is also provided, for use in search scoring.
  769      * Thus, this method implements the mapping:
  770      * <p><ul>
  771      * Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq&gt;<sup>*</sup>
  772      * </ul>
  773      * <p>The enumeration is ordered by document number.  Each document number
  774      * is greater than all that precede it in the enumeration.
  775      * @throws IOException if there is a low-level IO error
  776      */
  777     public TermDocs termDocs(Term term) throws IOException {
  778       ensureOpen();
  779       TermDocs termDocs = termDocs();
  780       termDocs.seek(term);
  781       return termDocs;
  782     }
  783   
  784     /** Returns an unpositioned {@link TermDocs} enumerator.
  785      * @throws IOException if there is a low-level IO error
  786      */
  787     public abstract TermDocs termDocs() throws IOException;
  788   
  789     /** Returns an enumeration of all the documents which contain
  790      * <code>term</code>.  For each document, in addition to the document number
  791      * and frequency of the term in that document, a list of all of the ordinal
  792      * positions of the term in the document is available.  Thus, this method
  793      * implements the mapping:
  794      *
  795      * <p><ul>
  796      * Term &nbsp;&nbsp; =&gt; &nbsp;&nbsp; &lt;docNum, freq,
  797      * &lt;pos<sub>1</sub>, pos<sub>2</sub>, ...
  798      * pos<sub>freq-1</sub>&gt;
  799      * &gt;<sup>*</sup>
  800      * </ul>
  801      * <p> This positional information facilitates phrase and proximity searching.
  802      * <p>The enumeration is ordered by document number.  Each document number is
  803      * greater than all that precede it in the enumeration.
  804      * @throws IOException if there is a low-level IO error
  805      */
  806     public TermPositions termPositions(Term term) throws IOException {
  807       ensureOpen();
  808       TermPositions termPositions = termPositions();
  809       termPositions.seek(term);
  810       return termPositions;
  811     }
  812   
  813     /** Returns an unpositioned {@link TermPositions} enumerator.
  814      * @throws IOException if there is a low-level IO error
  815      */
  816     public abstract TermPositions termPositions() throws IOException;
  817   
  818   
  819   
  820     /** Deletes the document numbered <code>docNum</code>.  Once a document is
  821      * deleted it will not appear in TermDocs or TermPostitions enumerations.
  822      * Attempts to read its field with the {@link #document}
  823      * method will result in an error.  The presence of this document may still be
  824      * reflected in the {@link #docFreq} statistic, though
  825      * this will be corrected eventually as the index is further modified.
  826      *
  827      * @throws StaleReaderException if the index has changed
  828      * since this reader was opened
  829      * @throws CorruptIndexException if the index is corrupt
  830      * @throws LockObtainFailedException if another writer
  831      *  has this index open (<code>write.lock</code> could not
  832      *  be obtained)
  833      * @throws IOException if there is a low-level IO error
  834      */
  835     public synchronized void deleteDocument(int docNum) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
  836       ensureOpen();
  837       acquireWriteLock();
  838       hasChanges = true;
  839       doDelete(docNum);
  840     }
  841   
  842   
  843     /** Implements deletion of the document numbered <code>docNum</code>.
  844      * Applications should call {@link #deleteDocument(int)} or {@link #deleteDocuments(Term)}.
  845      */
  846     protected abstract void doDelete(int docNum) throws CorruptIndexException, IOException;
  847   
  848   
  849     /** Deletes all documents that have a given <code>term</code> indexed.
  850      * This is useful if one uses a document field to hold a unique ID string for
  851      * the document.  Then to delete such a document, one merely constructs a
  852      * term with the appropriate field and the unique ID string as its text and
  853      * passes it to this method.
  854      * See {@link #deleteDocument(int)} for information about when this deletion will 
  855      * become effective.
  856      *
  857      * @return the number of documents deleted
  858      * @throws StaleReaderException if the index has changed
  859      *  since this reader was opened
  860      * @throws CorruptIndexException if the index is corrupt
  861      * @throws LockObtainFailedException if another writer
  862      *  has this index open (<code>write.lock</code> could not
  863      *  be obtained)
  864      * @throws IOException if there is a low-level IO error
  865      */
  866     public int deleteDocuments(Term term) throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
  867       ensureOpen();
  868       TermDocs docs = termDocs(term);
  869       if (docs == null) return 0;
  870       int n = 0;
  871       try {
  872         while (docs.next()) {
  873           deleteDocument(docs.doc());
  874           n++;
  875         }
  876       } finally {
  877         docs.close();
  878       }
  879       return n;
  880     }
  881   
  882     /** Undeletes all documents currently marked as deleted in this index.
  883      *
  884      * @throws StaleReaderException if the index has changed
  885      *  since this reader was opened
  886      * @throws LockObtainFailedException if another writer
  887      *  has this index open (<code>write.lock</code> could not
  888      *  be obtained)
  889      * @throws CorruptIndexException if the index is corrupt
  890      * @throws IOException if there is a low-level IO error
  891      */
  892     public synchronized void undeleteAll() throws StaleReaderException, CorruptIndexException, LockObtainFailedException, IOException {
  893       ensureOpen();
  894       acquireWriteLock();
  895       hasChanges = true;
  896       doUndeleteAll();
  897     }
  898   
  899     /** Implements actual undeleteAll() in subclass. */
  900     protected abstract void doUndeleteAll() throws CorruptIndexException, IOException;
  901   
  902     /** Does nothing by default. Subclasses that require a write lock for
  903      *  index modifications must implement this method. */
  904     protected synchronized void acquireWriteLock() throws IOException {
  905       /* NOOP */
  906     }
  907     
  908     /**
  909      * 
  910      * @throws IOException
  911      */
  912     public final synchronized void flush() throws IOException {
  913       ensureOpen();
  914       commit();
  915     }
  916   
  917     /**
  918      * Commit changes resulting from delete, undeleteAll, or
  919      * setNorm operations
  920      *
  921      * If an exception is hit, then either no changes or all
  922      * changes will have been committed to the index
  923      * (transactional semantics).
  924      * @throws IOException if there is a low-level IO error
  925      */
  926     protected final synchronized void commit() throws IOException {
  927       if(hasChanges){
  928         doCommit();
  929       }
  930       hasChanges = false;
  931     }
  932   
  933     /** Implements commit. */
  934     protected abstract void doCommit() throws IOException;
  935   
  936     /**
  937      * Closes files associated with this index.
  938      * Also saves any new deletions to disk.
  939      * No other methods should be called after this has been called.
  940      * @throws IOException if there is a low-level IO error
  941      */
  942     public final synchronized void close() throws IOException {
  943       if (!closed) {
  944         decRef();
  945         closed = true;
  946       }
  947     }
  948     
  949     /** Implements close. */
  950     protected abstract void doClose() throws IOException;
  951   
  952   
  953     /**
  954      * Get a list of unique field names that exist in this index and have the specified
  955      * field option information.
  956      * @param fldOption specifies which field option should be available for the returned fields
  957      * @return Collection of Strings indicating the names of the fields.
  958      * @see IndexReader.FieldOption
  959      */
  960     public abstract Collection getFieldNames(FieldOption fldOption);
  961   
  962     /**
  963      * Returns <code>true</code> iff the index in the named directory is
  964      * currently locked.
  965      * @param directory the directory to check for a lock
  966      * @throws IOException if there is a low-level IO error
  967      * @deprecated Please use {@link IndexWriter#isLocked(Directory)} instead
  968      */
  969     public static boolean isLocked(Directory directory) throws IOException {
  970       return
  971         directory.makeLock(IndexWriter.WRITE_LOCK_NAME).isLocked();
  972     }
  973   
  974     /**
  975      * Returns <code>true</code> iff the index in the named directory is
  976      * currently locked.
  977      * @param directory the directory to check for a lock
  978      * @throws IOException if there is a low-level IO error
  979      * @deprecated Please use {@link IndexWriter#isLocked(String)} instead
  980      */
  981     public static boolean isLocked(String directory) throws IOException {
  982       Directory dir = FSDirectory.getDirectory(directory);
  983       boolean result = isLocked(dir);
  984       dir.close();
  985       return result;
  986     }
  987   
  988     /**
  989      * Forcibly unlocks the index in the named directory.
  990      * <P>
  991      * Caution: this should only be used by failure recovery code,
  992      * when it is known that no other process nor thread is in fact
  993      * currently accessing this index.
  994      * @deprecated Please use {@link IndexWriter#unlock(Directory)} instead
  995      */
  996     public static void unlock(Directory directory) throws IOException {
  997       directory.makeLock(IndexWriter.WRITE_LOCK_NAME).release();
  998     }
  999   
 1000     /**
 1001      * Expert: return the IndexCommit that this reader has
 1002      * opened.  This method is only implemented by those
 1003      * readers that correspond to a Directory with its own
 1004      * segments_N file.
 1005      *
 1006      * <p><b>WARNING</b>: this API is new and experimental and
 1007      * may suddenly change.</p>
 1008      */
 1009     public IndexCommit getIndexCommit() throws IOException {
 1010       throw new UnsupportedOperationException("This reader does not support this method.");
 1011     }
 1012     
 1013     /**
 1014      * Prints the filename and size of each file within a given compound file.
 1015      * Add the -extract flag to extract files to the current working directory.
 1016      * In order to make the extracted version of the index work, you have to copy
 1017      * the segments file from the compound index into the directory where the extracted files are stored.
 1018      * @param args Usage: org.apache.lucene.index.IndexReader [-extract] &lt;cfsfile&gt;
 1019      */
 1020     public static void main(String [] args) {
 1021       String filename = null;
 1022       boolean extract = false;
 1023   
 1024       for (int i = 0; i < args.length; ++i) {
 1025         if (args[i].equals("-extract")) {
 1026           extract = true;
 1027         } else if (filename == null) {
 1028           filename = args[i];
 1029         }
 1030       }
 1031   
 1032       if (filename == null) {
 1033         System.out.println("Usage: org.apache.lucene.index.IndexReader [-extract] <cfsfile>");
 1034         return;
 1035       }
 1036   
 1037       Directory dir = null;
 1038       CompoundFileReader cfr = null;
 1039   
 1040       try {
 1041         File file = new File(filename);
 1042         String dirname = file.getAbsoluteFile().getParent();
 1043         filename = file.getName();
 1044         dir = FSDirectory.getDirectory(dirname);
 1045         cfr = new CompoundFileReader(dir, filename);
 1046   
 1047         String [] files = cfr.list();
 1048         Arrays.sort(files);   // sort the array of filename so that the output is more readable
 1049   
 1050         for (int i = 0; i < files.length; ++i) {
 1051           long len = cfr.fileLength(files[i]);
 1052   
 1053           if (extract) {
 1054             System.out.println("extract " + files[i] + " with " + len + " bytes to local directory...");
 1055             IndexInput ii = cfr.openInput(files[i]);
 1056   
 1057             FileOutputStream f = new FileOutputStream(files[i]);
 1058   
 1059             // read and write with a small buffer, which is more effectiv than reading byte by byte
 1060             byte[] buffer = new byte[1024];
 1061             int chunk = buffer.length;
 1062             while(len > 0) {
 1063               final int bufLen = (int) Math.min(chunk, len);
 1064               ii.readBytes(buffer, 0, bufLen);
 1065               f.write(buffer, 0, bufLen);
 1066               len -= bufLen;
 1067             }
 1068   
 1069             f.close();
 1070             ii.close();
 1071           }
 1072           else
 1073             System.out.println(files[i] + ": " + len + " bytes");
 1074         }
 1075       } catch (IOException ioe) {
 1076         ioe.printStackTrace();
 1077       }
 1078       finally {
 1079         try {
 1080           if (dir != null)
 1081             dir.close();
 1082           if (cfr != null)
 1083             cfr.close();
 1084         }
 1085         catch (IOException ioe) {
 1086           ioe.printStackTrace();
 1087         }
 1088       }
 1089     }
 1090   
 1091     /** Returns all commit points that exist in the Directory.
 1092      *  Normally, because the default is {@link
 1093      *  KeepOnlyLastCommitDeletionPolicy}, there would be only
 1094      *  one commit point.  But if you're using a custom {@link
 1095      *  IndexDeletionPolicy} then there could be many commits.
 1096      *  Once you have a given commit, you can open a reader on
 1097      *  it by calling {@link IndexReader#open(IndexCommit)}
 1098      *  There must be at least one commit in
 1099      *  the Directory, else this method throws {@link
 1100      *  java.io.IOException}.  Note that if a commit is in
 1101      *  progress while this method is running, that commit
 1102      *  may or may not be returned array.  */
 1103     public static Collection listCommits(Directory dir) throws IOException {
 1104       return DirectoryIndexReader.listCommits(dir);
 1105     }
 1106   }

Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » index » [javadoc | source]