1 package org.apache.lucene.document;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import org.apache.lucene.analysis.TokenStream;
21 import org.apache.lucene.index.IndexWriter; // for javadoc
22 import org.apache.lucene.util.Parameter;
23
24 import java.io.Reader;
25 import java.io.Serializable;
26
27 /**
28 A field is a section of a Document. Each field has two parts, a name and a
29 value. Values may be free text, provided as a String or as a Reader, or they
30 may be atomic keywords, which are not further processed. Such keywords may
31 be used to represent dates, urls, etc. Fields are optionally stored in the
32 index, so that they may be returned with hits on the document.
33 */
34
35 public final class Field extends AbstractField implements Fieldable, Serializable {
36
37 /** Specifies whether and how a field should be stored. */
38 public static final class Store extends Parameter implements Serializable {
39
40 private Store(String name) {
41 super(name);
42 }
43
44 /** Store the original field value in the index in a compressed form. This is
45 * useful for long documents and for binary valued fields.
46 */
47 public static final Store COMPRESS = new Store("COMPRESS");
48
49 /** Store the original field value in the index. This is useful for short texts
50 * like a document's title which should be displayed with the results. The
51 * value is stored in its original form, i.e. no analyzer is used before it is
52 * stored.
53 */
54 public static final Store YES = new Store("YES");
55
56 /** Do not store the field value in the index. */
57 public static final Store NO = new Store("NO");
58 }
59
60 /** Specifies whether and how a field should be indexed. */
61 public static final class Index extends Parameter implements Serializable {
62
63 private Index(String name) {
64 super(name);
65 }
66
67 /** Do not index the field value. This field can thus not be searched,
68 * but one can still access its contents provided it is
69 * {@link Field.Store stored}. */
70 public static final Index NO = new Index("NO");
71
72 /** Index the tokens produced by running the field's
73 * value through an Analyzer. This is useful for
74 * common text. */
75 public static final Index ANALYZED = new Index("ANALYZED");
76
77 /** @deprecated this has been renamed to {@link #ANALYZED} */
78 public static final Index TOKENIZED = ANALYZED;
79
80 /** Index the field's value without using an Analyzer, so it can be searched.
81 * As no analyzer is used the value will be stored as a single term. This is
82 * useful for unique Ids like product numbers.
83 */
84 public static final Index NOT_ANALYZED = new Index("NOT_ANALYZED");
85
86 /** @deprecated This has been renamed to {@link #NOT_ANALYZED} */
87 public static final Index UN_TOKENIZED = NOT_ANALYZED;
88
89 /** Expert: Index the field's value without an Analyzer,
90 * and also disable the storing of norms. Note that you
91 * can also separately enable/disable norms by calling
92 * {@link #setOmitNorms}. No norms means that
93 * index-time field and document boosting and field
94 * length normalization are disabled. The benefit is
95 * less memory usage as norms take up one byte of RAM
96 * per indexed field for every document in the index,
97 * during searching. Note that once you index a given
98 * field <i>with</i> norms enabled, disabling norms will
99 * have no effect. In other words, for this to have the
100 * above described effect on a field, all instances of
101 * that field must be indexed with NOT_ANALYZED_NO_NORMS
102 * from the beginning. */
103 public static final Index NOT_ANALYZED_NO_NORMS = new Index("NOT_ANALYZED_NO_NORMS");
104
105 /** @deprecated This has been renamed to
106 * {@link #NOT_ANALYZED_NO_NORMS} */
107 public static final Index NO_NORMS = NOT_ANALYZED_NO_NORMS;
108
109 /** Expert: Index the tokens produced by running the
110 * field's value through an Analyzer, and also
111 * separately disable the storing of norms. See
112 * {@link #NOT_ANALYZED_NO_NORMS} for what norms are
113 * and why you may want to disable them. */
114 public static final Index ANALYZED_NO_NORMS = new Index("ANALYZED_NO_NORMS");
115 }
116
117 /** Specifies whether and how a field should have term vectors. */
118 public static final class TermVector extends Parameter implements Serializable {
119
120 private TermVector(String name) {
121 super(name);
122 }
123
124 /** Do not store term vectors.
125 */
126 public static final TermVector NO = new TermVector("NO");
127
128 /** Store the term vectors of each document. A term vector is a list
129 * of the document's terms and their number of occurences in that document. */
130 public static final TermVector YES = new TermVector("YES");
131
132 /**
133 * Store the term vector + token position information
134 *
135 * @see #YES
136 */
137 public static final TermVector WITH_POSITIONS = new TermVector("WITH_POSITIONS");
138
139 /**
140 * Store the term vector + Token offset information
141 *
142 * @see #YES
143 */
144 public static final TermVector WITH_OFFSETS = new TermVector("WITH_OFFSETS");
145
146 /**
147 * Store the term vector + Token position and offset information
148 *
149 * @see #YES
150 * @see #WITH_POSITIONS
151 * @see #WITH_OFFSETS
152 */
153 public static final TermVector WITH_POSITIONS_OFFSETS = new TermVector("WITH_POSITIONS_OFFSETS");
154 }
155
156
157 /** The value of the field as a String, or null. If null, the Reader value,
158 * binary value, or TokenStream value is used. Exactly one of stringValue(),
159 * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
160 public String stringValue() { return fieldsData instanceof String ? (String)fieldsData : null; }
161
162 /** The value of the field as a Reader, or null. If null, the String value,
163 * binary value, or TokenStream value is used. Exactly one of stringValue(),
164 * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
165 public Reader readerValue() { return fieldsData instanceof Reader ? (Reader)fieldsData : null; }
166
167 /** The value of the field in Binary, or null. If null, the Reader value,
168 * String value, or TokenStream value is used. Exactly one of stringValue(),
169 * readerValue(), getBinaryValue(), and tokenStreamValue() must be set.
170 * @deprecated This method must allocate a new byte[] if
171 * the {@link AbstractField#getBinaryOffset()} is non-zero
172 * or {@link AbstractField#getBinaryLength()} is not the
173 * full length of the byte[]. Please use {@link
174 * AbstractField#getBinaryValue()} instead, which simply
175 * returns the byte[].
176 */
177 public byte[] binaryValue() {
178 if (!isBinary)
179 return null;
180 final byte[] data = (byte[]) fieldsData;
181 if (binaryOffset == 0 && data.length == binaryLength)
182 return data; //Optimization
183
184 final byte[] ret = new byte[binaryLength];
185 System.arraycopy(data, binaryOffset, ret, 0, binaryLength);
186 return ret;
187 }
188
189 /** The value of the field as a TokesStream, or null. If null, the Reader value,
190 * String value, or binary value is used. Exactly one of stringValue(),
191 * readerValue(), getBinaryValue(), and tokenStreamValue() must be set. */
192 public TokenStream tokenStreamValue() { return fieldsData instanceof TokenStream ? (TokenStream)fieldsData : null; }
193
194
195 /** <p>Expert: change the value of this field. This can
196 * be used during indexing to re-use a single Field
197 * instance to improve indexing speed by avoiding GC cost
198 * of new'ing and reclaiming Field instances. Typically
199 * a single {@link Document} instance is re-used as
200 * well. This helps most on small documents.</p>
201 *
202 * <p>Note that you should only use this method after the
203 * Field has been consumed (ie, the {@link Document}
204 * containing this Field has been added to the index).
205 * Also, each Field instance should only be used once
206 * within a single {@link Document} instance. See <a
207 * href="http://wiki.apache.org/lucene-java/ImproveIndexingSpeed">ImproveIndexingSpeed</a>
208 * for details.</p> */
209 public void setValue(String value) {
210 fieldsData = value;
211 }
212
213 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
214 public void setValue(Reader value) {
215 fieldsData = value;
216 }
217
218 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
219 public void setValue(byte[] value) {
220 fieldsData = value;
221 binaryLength = value.length;
222 binaryOffset = 0;
223 }
224
225 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
226 public void setValue(byte[] value, int offset, int length) {
227 fieldsData = value;
228 binaryLength = length;
229 binaryOffset = offset;
230 }
231
232
233 /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */
234 public void setValue(TokenStream value) {
235 fieldsData = value;
236 }
237
/**
 * Creates a field from its name, its String value and the way it is to be
 * saved in the index. No term vectors are stored for the field
 * ({@link TermVector#NO} is passed on to the five-argument constructor).
 *
 * @param name The name of the field
 * @param value The string to process
 * @param store Whether <code>value</code> should be stored in the index
 * @param index Whether the field should be indexed, and if so, if it should
 *  be tokenized before indexing
 * @throws NullPointerException if name or value is <code>null</code>
 * @throws IllegalArgumentException if the field is neither stored nor indexed
 */
public Field(String name, String value, Store store, Index index) {
  this(name, value, store, index, TermVector.NO);
}
253
254 /**
255 * Create a field by specifying its name, value and how it will
256 * be saved in the index.
257 *
258 * @param name The name of the field
259 * @param value The string to process
260 * @param store Whether <code>value</code> should be stored in the index
261 * @param index Whether the field should be indexed, and if so, if it should
262 * be tokenized before indexing
263 * @param termVector Whether term vector should be stored
264 * @throws NullPointerException if name or value is <code>null</code>
265 * @throws IllegalArgumentException in any of the following situations:
266 * <ul>
267 * <li>the field is neither stored nor indexed</li>
268 * <li>the field is not indexed but termVector is <code>TermVector.YES</code></li>
269 * </ul>
270 */
271 public Field(String name, String value, Store store, Index index, TermVector termVector) {
272 if (name == null)
273 throw new NullPointerException("name cannot be null");
274 if (value == null)
275 throw new NullPointerException("value cannot be null");
276 if (name.length() == 0 && value.length() == 0)
277 throw new IllegalArgumentException("name and value cannot both be empty");
278 if (index == Index.NO && store == Store.NO)
279 throw new IllegalArgumentException("it doesn't make sense to have a field that "
280 + "is neither indexed nor stored");
281 if (index == Index.NO && termVector != TermVector.NO)
282 throw new IllegalArgumentException("cannot store term vector information "
283 + "for a field that is not indexed");
284
285 this.name = name.intern(); // field names are interned
286 this.fieldsData = value;
287
288 if (store == Store.YES){
289 this.isStored = true;
290 this.isCompressed = false;
291 }
292 else if (store == Store.COMPRESS) {
293 this.isStored = true;
294 this.isCompressed = true;
295 }
296 else if (store == Store.NO){
297 this.isStored = false;
298 this.isCompressed = false;
299 }
300 else
301 throw new IllegalArgumentException("unknown store parameter " + store);
302
303 if (index == Index.NO) {
304 this.isIndexed = false;
305 this.isTokenized = false;
306 } else if (index == Index.ANALYZED) {
307 this.isIndexed = true;
308 this.isTokenized = true;
309 } else if (index == Index.NOT_ANALYZED) {
310 this.isIndexed = true;
311 this.isTokenized = false;
312 } else if (index == Index.NOT_ANALYZED_NO_NORMS) {
313 this.isIndexed = true;
314 this.isTokenized = false;
315 this.omitNorms = true;
316 } else if (index == Index.ANALYZED_NO_NORMS) {
317 this.isIndexed = true;
318 this.isTokenized = true;
319 this.omitNorms = true;
320 } else {
321 throw new IllegalArgumentException("unknown index parameter " + index);
322 }
323
324 this.isBinary = false;
325
326 setStoreTermVector(termVector);
327 }
328
/**
 * Creates a tokenized and indexed field that is not stored and for which
 * no term vectors are stored ({@link TermVector#NO} is passed on to the
 * three-argument constructor). The Reader is read only when the Document
 * is added to the index, i.e. you may not close the Reader until
 * {@link IndexWriter#addDocument(Document)} has been called.
 *
 * @param name The name of the field
 * @param reader The reader with the content
 * @throws NullPointerException if name or reader is <code>null</code>
 */
public Field(String name, Reader reader) {
  this(name, reader, TermVector.NO);
}
342
/**
 * Creates a tokenized and indexed field that is not stored, optionally
 * with term vectors. The Reader is read only when the Document is added to
 * the index, i.e. you may not close the Reader until
 * {@link IndexWriter#addDocument(Document)} has been called.
 *
 * @param name The name of the field
 * @param reader The reader with the content
 * @param termVector Whether term vector should be stored
 * @throws NullPointerException if name or reader is <code>null</code>
 */
public Field(String name, Reader reader, TermVector termVector) {
  if (name == null) {
    throw new NullPointerException("name cannot be null");
  }
  if (reader == null) {
    throw new NullPointerException("reader cannot be null");
  }

  this.name = name.intern(); // field names are interned
  this.fieldsData = reader;

  // Reader-backed fields are always indexed and tokenized ...
  this.isIndexed = true;
  this.isTokenized = true;

  // ... and never stored, compressed or binary.
  this.isStored = false;
  this.isCompressed = false;
  this.isBinary = false;

  setStoreTermVector(termVector);
}
373
/**
 * Creates a tokenized and indexed field that is not stored and for which
 * no term vectors are stored ({@link TermVector#NO} is passed on to the
 * three-argument constructor). This is useful for pre-analyzed fields.
 * The TokenStream is read only when the Document is added to the index,
 * i.e. you may not close the TokenStream until
 * {@link IndexWriter#addDocument(Document)} has been called.
 *
 * @param name The name of the field
 * @param tokenStream The TokenStream with the content
 * @throws NullPointerException if name or tokenStream is <code>null</code>
 */
public Field(String name, TokenStream tokenStream) {
  this(name, tokenStream, TermVector.NO);
}
388
/**
 * Creates a tokenized and indexed field that is not stored, optionally
 * with term vectors. This is useful for pre-analyzed fields.
 * The TokenStream is read only when the Document is added to the index,
 * i.e. you may not close the TokenStream until
 * {@link IndexWriter#addDocument(Document)} has been called.
 *
 * @param name The name of the field
 * @param tokenStream The TokenStream with the content
 * @param termVector Whether term vector should be stored
 * @throws NullPointerException if name or tokenStream is <code>null</code>
 */
public Field(String name, TokenStream tokenStream, TermVector termVector) {
  if (name == null) {
    throw new NullPointerException("name cannot be null");
  }
  if (tokenStream == null) {
    throw new NullPointerException("tokenStream cannot be null");
  }

  this.name = name.intern(); // field names are interned
  this.fieldsData = tokenStream;

  // TokenStream-backed fields are always indexed and tokenized ...
  this.isIndexed = true;
  this.isTokenized = true;

  // ... and never stored, compressed or binary.
  this.isStored = false;
  this.isCompressed = false;
  this.isBinary = false;

  setStoreTermVector(termVector);
}
420
421
/**
 * Creates a stored field with a binary value, using the whole array.
 * Optionally the value may be compressed.
 *
 * @param name The name of the field
 * @param value The binary value
 * @param store How <code>value</code> should be stored (compressed or not)
 * @throws IllegalArgumentException if name or value is <code>null</code>,
 *  or if store is <code>Store.NO</code>
 */
public Field(String name, byte[] value, Store store) {
  // The explicit constructor call has to be the first statement, so the null
  // guard lives in the argument list: a null value is forwarded with length 0
  // and rejected by the delegated constructor's "value cannot be null" check,
  // instead of failing here with a message-less NullPointerException.
  this(name, value, 0, value == null ? 0 : value.length, store);
}
433
/**
 * Creates a stored field whose binary value is a range of the given array.
 * Optionally the value may be compressed. The field is neither indexed nor
 * tokenized, and no term vectors are stored for it.
 *
 * @param name The name of the field
 * @param value The binary value
 * @param offset Starting offset in value where this Field's bytes are
 * @param length Number of bytes to use for this Field, starting at offset
 * @param store How <code>value</code> should be stored (compressed or not)
 * @throws IllegalArgumentException if name or value is <code>null</code>,
 *  if <code>offset</code> and <code>length</code> do not describe a range
 *  inside <code>value</code>, or if store is <code>Store.NO</code>
 */
public Field(String name, byte[] value, int offset, int length, Store store) {

  if (name == null) {
    throw new IllegalArgumentException("name cannot be null");
  }
  if (value == null) {
    throw new IllegalArgumentException("value cannot be null");
  }
  // Fail fast on a range that does not fit the array; otherwise the error
  // only surfaces later, when the bytes are copied out of the field.
  // length is known to be non-negative before the subtraction, so
  // value.length - length cannot overflow.
  if (offset < 0 || length < 0 || offset > value.length - length) {
    throw new IllegalArgumentException("offset " + offset + " and length " + length
        + " are out of bounds for a value of " + value.length + " bytes");
  }

  this.name = name.intern();
  fieldsData = value;

  if (store == Store.YES) {
    isStored = true;
    isCompressed = false;
  } else if (store == Store.COMPRESS) {
    isStored = true;
    isCompressed = true;
  } else if (store == Store.NO) {
    throw new IllegalArgumentException("binary values can't be unstored");
  } else {
    throw new IllegalArgumentException("unknown store parameter " + store);
  }

  isIndexed = false;
  isTokenized = false;

  isBinary = true;
  binaryLength = length;
  binaryOffset = offset;

  setStoreTermVector(TermVector.NO);
}
476 }