Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » document » [javadoc | source]
    1   package org.apache.lucene.document;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import org.apache.lucene.analysis.TokenStream;
   21   import org.apache.lucene.index.IndexWriter;   // for javadoc
   22   import org.apache.lucene.util.Parameter;
   23   
   24   import java.io.Reader;
   25   import java.io.Serializable;
   26   
   27   /**
   28     A field is a section of a Document.  Each field has two parts, a name and a
   29     value.  Values may be free text, provided as a String or as a Reader, or they
   30     may be atomic keywords, which are not further processed.  Such keywords may
   31     be used to represent dates, urls, etc.  Fields are optionally stored in the
   32     index, so that they may be returned with hits on the document.
   33     */
   34   
   35   public final class Field extends AbstractField implements Fieldable, Serializable {
   36     
   37     /** Specifies whether and how a field should be stored. */
   38     public static final class Store extends Parameter implements Serializable {
   39   
   40       private Store(String name) {
   41         super(name);
   42       }
   43   
   44       /** Store the original field value in the index in a compressed form. This is
   45        * useful for long documents and for binary valued fields.
   46        */
   47       public static final Store COMPRESS = new Store("COMPRESS");
   48   
   49       /** Store the original field value in the index. This is useful for short texts
   50        * like a document's title which should be displayed with the results. The
   51        * value is stored in its original form, i.e. no analyzer is used before it is
   52        * stored.
   53        */
   54       public static final Store YES = new Store("YES");
   55   
   56       /** Do not store the field value in the index. */
   57       public static final Store NO = new Store("NO");
   58     }
   59   
   60     /** Specifies whether and how a field should be indexed. */
   61     public static final class Index extends Parameter implements Serializable {
   62   
   63       private Index(String name) {
   64         super(name);
   65       }
   66   
   67       /** Do not index the field value. This field can thus not be searched,
   68        * but one can still access its contents provided it is
   69        * {@link Field.Store stored}. */
   70       public static final Index NO = new Index("NO");
   71   
   72       /** Index the tokens produced by running the field's
   73        * value through an Analyzer.  This is useful for
   74        * common text. */
   75       public static final Index ANALYZED = new Index("ANALYZED");
   76   
   77       /** @deprecated this has been renamed to {@link #ANALYZED} */
   78       public static final Index TOKENIZED = ANALYZED;
   79   
   80       /** Index the field's value without using an Analyzer, so it can be searched.
   81        * As no analyzer is used the value will be stored as a single term. This is
   82        * useful for unique Ids like product numbers.
   83        */
   84       public static final Index NOT_ANALYZED = new Index("NOT_ANALYZED");
   85   
   86       /** @deprecated This has been renamed to {@link #NOT_ANALYZED} */
   87       public static final Index UN_TOKENIZED = NOT_ANALYZED;
   88   
   89       /** Expert: Index the field's value without an Analyzer,
   90        * and also disable the storing of norms.  Note that you
   91        * can also separately enable/disable norms by calling
   92        * {@link #setOmitNorms}.  No norms means that
   93        * index-time field and document boosting and field
   94        * length normalization are disabled.  The benefit is
   95        * less memory usage as norms take up one byte of RAM
   96        * per indexed field for every document in the index,
   97        * during searching.  Note that once you index a given
   98        * field <i>with</i> norms enabled, disabling norms will
   99        * have no effect.  In other words, for this to have the
  100        * above described effect on a field, all instances of
  101        * that field must be indexed with NOT_ANALYZED_NO_NORMS
  102        * from the beginning. */
  103       public static final Index NOT_ANALYZED_NO_NORMS = new Index("NOT_ANALYZED_NO_NORMS");
  104   
  105       /** @deprecated This has been renamed to
  106        *  {@link #NOT_ANALYZED_NO_NORMS} */
  107       public static final Index NO_NORMS = NOT_ANALYZED_NO_NORMS;
  108   
  109       /** Expert: Index the tokens produced by running the
  110        *  field's value through an Analyzer, and also
  111        *  separately disable the storing of norms.  See
  112        *  {@link #NOT_ANALYZED_NO_NORMS} for what norms are
  113        *  and why you may want to disable them. */
  114       public static final Index ANALYZED_NO_NORMS = new Index("ANALYZED_NO_NORMS");
  115     }
  116   
  117     /** Specifies whether and how a field should have term vectors. */
  118     public static final class TermVector  extends Parameter implements Serializable {
  119       
  120       private TermVector(String name) {
  121         super(name);
  122       }
  123       
  124       /** Do not store term vectors. 
  125        */
  126       public static final TermVector NO = new TermVector("NO");
  127       
  128       /** Store the term vectors of each document. A term vector is a list
  129        * of the document's terms and their number of occurences in that document. */
  130       public static final TermVector YES = new TermVector("YES");
  131       
  132       /**
  133        * Store the term vector + token position information
  134        * 
  135        * @see #YES
  136        */ 
  137       public static final TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS");
  138       
  139       /**
  140        * Store the term vector + Token offset information
  141        * 
  142        * @see #YES
  143        */ 
  144       public static final TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS");
  145       
  146       /**
  147        * Store the term vector + Token position and offset information
  148        * 
  149        * @see #YES
  150        * @see #WITH_POSITIONS
  151        * @see #WITH_OFFSETS
  152        */ 
  153       public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS");
  154     }
  155     
  156     
  157     /** The value of the field as a String, or null.  If null, the Reader value,
  158      * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
  159      * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
  160     public String stringValue()   { return fieldsData instanceof String ? (String)fieldsData : null; }
  161     
  162     /** The value of the field as a Reader, or null.  If null, the String value,
  163      * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
  164      * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
  165     public Reader readerValue()   { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
  166     
  167     /** The value of the field in Binary, or null.  If null, the Reader value,
  168      * String value, or TokenStream value is used. Exactly one of stringValue(), 
  169      * readerValue(), getBinaryValue(), and tokenStreamValue() must be set.
  170      * @deprecated This method must allocate a new byte[] if
  171      * the {@link AbstractField#getBinaryOffset()} is non-zero
  172      * or {@link AbstractField#getBinaryLength()} is not the
  173      * full length of the byte[]. Please use {@link
  174      * AbstractField#getBinaryValue()} instead, which simply
  175      * returns the byte[].
  176      */ 
  177     public byte[] binaryValue() {
  178       if (!isBinary)
  179         return null;
  180       final byte[] data = (byte[]) fieldsData;
  181       if (binaryOffset == 0 && data.length == binaryLength)
  182         return data; //Optimization
  183       
  184       final byte[] ret = new byte[binaryLength];
  185       System.arraycopy(data, binaryOffset, ret, 0, binaryLength);
  186       return ret;    
  187     }
  188     
  189     /** The value of the field as a TokesStream, or null.  If null, the Reader value,
  190      * String value, or binary value is used. Exactly one of stringValue(), 
  191      * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
  192     public TokenStream tokenStreamValue()   { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
  193     
  194   
  195     /** <p>Expert: change the value of this field.  This can
  196      *  be used during indexing to re-use a single Field
  197      *  instance to improve indexing speed by avoiding GC cost
  198      *  of new'ing and reclaiming Field instances.  Typically
  199      *  a single {@link Document} instance is re-used as
  200      *  well.  This helps most on small documents.</p>
  201      * 
  202      *  <p>Note that you should only use this method after the
  203      *  Field has been consumed (ie, the {@link Document}
  204      *  containing this Field has been added to the index).
  205      *  Also, each Field instance should only be used once
  206      *  within a single {@link Document} instance.  See <a
  207      *  href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
  208      *  for details.</p> */
  209     public void setValue(String value) {
  210       fieldsData = value;
  211     }
  212   
  213     /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
  214     public void setValue(Reader value) {
  215       fieldsData = value;
  216     }
  217   
  218     /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
  219     public void setValue(byte[] value) {
  220       fieldsData = value;
  221       binaryLength = value.length;
  222       binaryOffset = 0;
  223     }
  224   
  225     /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
  226     public void setValue(byte[] value, int offset, int length) {
  227       fieldsData = value;
  228       binaryLength = length;
  229       binaryOffset = offset;
  230     }
  231     
  232     
  233     /** Expert: change the value of this field.  See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
  234     public void setValue(TokenStream value) {
  235       fieldsData = value;
  236     }
  237   
  /**
   * Builds a String-valued field from its name, its value and the way it is
   * to be stored and indexed. No term vectors are stored for the field.
   *
   * @param name the field's name
   * @param value the String to process
   * @param store whether, and how, <code>value</code> is stored in the index
   * @param index whether the field is indexed and, if it is, whether the
   *  value is tokenized first
   * @throws NullPointerException if name or value is <code>null</code>
   * @throws IllegalArgumentException if the field is neither stored nor indexed
   */
  public Field(String name, String value, Store store, Index index) {
    this(name, value, store, index, TermVector.NO);
  }
  253     
  254     /**
  255      * Create a field by specifying its name, value and how it will
  256      * be saved in the index.
  257      * 
  258      * @param name The name of the field
  259      * @param value The string to process
  260      * @param store Whether <code>value</code> should be stored in the index
  261      * @param index Whether the field should be indexed, and if so, if it should
  262      *  be tokenized before indexing 
  263      * @param termVector Whether term vector should be stored
  264      * @throws NullPointerException if name or value is <code>null</code>
  265      * @throws IllegalArgumentException in any of the following situations:
  266      * <ul> 
  267      *  <li>the field is neither stored nor indexed</li> 
  268      *  <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
  269      * </ul> 
  270      */ 
  271     public Field(String name, String value, Store store, Index index, TermVector termVector) {
  272       if (name == null)
  273         throw new NullPointerException("name cannot be null");
  274       if (value == null)
  275         throw new NullPointerException("value cannot be null");
  276       if (name.length() == 0 && value.length() == 0)
  277         throw new IllegalArgumentException("name and value cannot both be empty");
  278       if (index == Index.NO && store == Store.NO)
  279         throw new IllegalArgumentException("it doesn't make sense to have a field that "
  280            + "is neither indexed nor stored");
  281       if (index == Index.NO && termVector != TermVector.NO)
  282         throw new IllegalArgumentException("cannot store term vector information "
  283            + "for a field that is not indexed");
  284             
  285       this.name = name.intern();        // field names are interned
  286       this.fieldsData = value;
  287   
  288       if (store == Store.YES){
  289         this.isStored = true;
  290         this.isCompressed = false;
  291       }
  292       else if (store == Store.COMPRESS) {
  293         this.isStored = true;
  294         this.isCompressed = true;
  295       }
  296       else if (store == Store.NO){
  297         this.isStored = false;
  298         this.isCompressed = false;
  299       }
  300       else
  301         throw new IllegalArgumentException("unknown store parameter " + store);
  302      
  303       if (index == Index.NO) {
  304         this.isIndexed = false;
  305         this.isTokenized = false;
  306       } else if (index == Index.ANALYZED) {
  307         this.isIndexed = true;
  308         this.isTokenized = true;
  309       } else if (index == Index.NOT_ANALYZED) {
  310         this.isIndexed = true;
  311         this.isTokenized = false;
  312       } else if (index == Index.NOT_ANALYZED_NO_NORMS) {
  313         this.isIndexed = true;
  314         this.isTokenized = false;
  315         this.omitNorms = true;
  316       } else if (index == Index.ANALYZED_NO_NORMS) {
  317         this.isIndexed = true;
  318         this.isTokenized = true;
  319         this.omitNorms = true;
  320       } else {
  321         throw new IllegalArgumentException("unknown index parameter " + index);
  322       }
  323       
  324       this.isBinary = false;
  325   
  326       setStoreTermVector(termVector);
  327     }
  328   
  /**
   * Builds a tokenized, indexed, non-stored field whose content comes from a
   * Reader. No term vectors are stored. The Reader is consumed only when the
   * Document is added to the index, so it must not be closed before
   * {@link IndexWriter#addDocument(Document)} has been called.
   *
   * @param name the field's name
   * @param reader the reader supplying the content
   * @throws NullPointerException if name or reader is <code>null</code>
   */
  public Field(String name, Reader reader) {
    this(name, reader, TermVector.NO);
  }
  342   
  /**
   * Builds a tokenized, indexed, non-stored field whose content comes from a
   * Reader, optionally storing term vectors. The Reader is consumed only
   * when the Document is added to the index, so it must not be closed before
   * {@link IndexWriter#addDocument(Document)} has been called.
   *
   * @param name the field's name; it is interned
   * @param reader the reader supplying the content
   * @param termVector whether, and how, term vectors are stored
   * @throws NullPointerException if name or reader is <code>null</code>
   */
  public Field(String name, Reader reader, TermVector termVector) {
    if (name == null) {
      throw new NullPointerException("name cannot be null");
    }
    if (reader == null) {
      throw new NullPointerException("reader cannot be null");
    }

    this.name = name.intern();        // field names are interned
    this.fieldsData = reader;

    // Reader fields are always indexed and tokenized ...
    this.isIndexed = true;
    this.isTokenized = true;

    // ... and never stored, compressed or binary.
    this.isStored = false;
    this.isCompressed = false;
    this.isBinary = false;

    setStoreTermVector(termVector);
  }
  373   
  /**
   * Builds a tokenized, indexed, non-stored field whose content comes from a
   * TokenStream, which is useful for pre-analyzed fields. No term vectors
   * are stored. The TokenStream is consumed only when the Document is added
   * to the index, so it must not be closed before
   * {@link IndexWriter#addDocument(Document)} has been called.
   *
   * @param name the field's name
   * @param tokenStream the TokenStream supplying the content
   * @throws NullPointerException if name or tokenStream is <code>null</code>
   */
  public Field(String name, TokenStream tokenStream) {
    this(name, tokenStream, TermVector.NO);
  }
  388     
  /**
   * Builds a tokenized, indexed, non-stored field whose content comes from a
   * TokenStream, optionally storing term vectors. This is useful for
   * pre-analyzed fields. The TokenStream is consumed only when the Document
   * is added to the index, so it must not be closed before
   * {@link IndexWriter#addDocument(Document)} has been called.
   *
   * @param name the field's name; it is interned
   * @param tokenStream the TokenStream supplying the content
   * @param termVector whether, and how, term vectors are stored
   * @throws NullPointerException if name or tokenStream is <code>null</code>
   */
  public Field(String name, TokenStream tokenStream, TermVector termVector) {
    if (name == null) {
      throw new NullPointerException("name cannot be null");
    }
    if (tokenStream == null) {
      throw new NullPointerException("tokenStream cannot be null");
    }

    this.name = name.intern();        // field names are interned
    this.fieldsData = tokenStream;

    // TokenStream fields are always indexed and tokenized ...
    this.isIndexed = true;
    this.isTokenized = true;

    // ... and never stored, compressed or binary.
    this.isStored = false;
    this.isCompressed = false;
    this.isBinary = false;

    setStoreTermVector(termVector);
  }
  420   
  421     
  /**
   * Create a stored field with binary value, using the whole of the given
   * array. Optionally the value may be compressed.
   * 
   * @param name The name of the field
   * @param value The binary value
   * @param store How <code>value</code> should be stored (compressed or not)
   * @throws IllegalArgumentException if name or value is <code>null</code>,
   *  or if store is <code>Store.NO</code>
   */
  public Field(String name, byte[] value, Store store) {
    // value.length must not be read unconditionally: for a null array that would raise
    // a NullPointerException before the delegate can report the null with its own
    // IllegalArgumentException.
    this(name, value, 0, value == null ? 0 : value.length, store);
  }
  433   
  /**
   * Create a stored field with binary value taken from a range of the given
   * array. Optionally the value may be compressed. The field is neither
   * indexed nor tokenized and stores no term vectors.
   * 
   * @param name The name of the field; it is interned
   * @param value The binary value
   * @param offset Starting offset in value where this Field's bytes are
   * @param length Number of bytes to use for this Field, starting at offset
   * @param store How <code>value</code> should be stored (compressed or not)
   * @throws IllegalArgumentException in any of the following situations:
   * <ul>
   *  <li>name or value is <code>null</code></li>
   *  <li>offset or length is negative, or the range they describe does not
   *      fit inside <code>value</code></li>
   *  <li>store is <code>Store.NO</code> or not one of the known constants</li>
   * </ul>
   */
  public Field(String name, byte[] value, int offset, int length, Store store) {

    if (name == null)
      throw new IllegalArgumentException("name cannot be null");
    if (value == null)
      throw new IllegalArgumentException("value cannot be null");
    // Fail fast on a bad range; otherwise it would only surface later, when
    // binaryValue() allocates and copies the bytes. Written as a subtraction so that
    // offset + length cannot overflow.
    if (offset < 0 || length < 0 || offset > value.length - length)
      throw new IllegalArgumentException("offset and length must describe a range inside value:"
         + " offset=" + offset + ", length=" + length + ", value.length=" + value.length);
    
    this.name = name.intern();
    fieldsData = value;
    
    if (store == Store.YES) {
      isStored = true;
      isCompressed = false;
    }
    else if (store == Store.COMPRESS) {
      isStored = true;
      isCompressed = true;
    }
    else if (store == Store.NO)
      throw new IllegalArgumentException("binary values can't be unstored");
    else
      throw new IllegalArgumentException("unknown store parameter " + store);
    
    // Binary fields are stored only: never indexed or tokenized.
    isIndexed   = false;
    isTokenized = false;
    
    isBinary    = true;
    binaryLength = length;
    binaryOffset = offset;
    
    setStoreTermVector(TermVector.NO);
  }
  476   }

Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » document » [javadoc | source]