1 package org.apache.lucene.analysis;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import java.io.Reader;
21 import java.io.IOException;
22
23 import org.apache.lucene.util.CloseableThreadLocal;
24 import org.apache.lucene.store.AlreadyClosedException;
25
26 /** An Analyzer builds TokenStreams, which analyze text. It thus represents a
27 * policy for extracting index terms from text.
28 * <p>
29 * Typical implementations first build a Tokenizer, which breaks the stream of
30 * characters from the Reader into raw Tokens. One or more TokenFilters may
31 * then be applied to the output of the Tokenizer.
32 */
33 public abstract class Analyzer {
34 /** Creates a TokenStream which tokenizes all the text in the provided
35 * Reader. Must be able to handle null field name for backward compatibility.
36 */
37 public abstract TokenStream tokenStream(String fieldName, Reader reader);
38
39 /** Creates a TokenStream that is allowed to be re-used
40 * from the previous time that the same thread called
41 * this method. Callers that do not need to use more
42 * than one TokenStream at the same time from this
43 * analyzer should use this method for better
44 * performance.
45 */
46 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
47 return tokenStream(fieldName, reader);
48 }
49
50 private CloseableThreadLocal tokenStreams = new CloseableThreadLocal();
51
52 /** Used by Analyzers that implement reusableTokenStream
53 * to retrieve previously saved TokenStreams for re-use
54 * by the same thread. */
55 protected Object getPreviousTokenStream() {
56 try {
57 return tokenStreams.get();
58 } catch (NullPointerException npe) {
59 if (tokenStreams == null) {
60 throw new AlreadyClosedException("this Analyzer is closed");
61 } else {
62 throw npe;
63 }
64 }
65 }
66
67 /** Used by Analyzers that implement reusableTokenStream
68 * to save a TokenStream for later re-use by the same
69 * thread. */
70 protected void setPreviousTokenStream(Object obj) {
71 try {
72 tokenStreams.set(obj);
73 } catch (NullPointerException npe) {
74 if (tokenStreams == null) {
75 throw new AlreadyClosedException("this Analyzer is closed");
76 } else {
77 throw npe;
78 }
79 }
80 }
81
82
83 /**
84 * Invoked before indexing a Fieldable instance if
85 * terms have already been added to that field. This allows custom
86 * analyzers to place an automatic position increment gap between
87 * Fieldable instances using the same field name. The default value
88 * position increment gap is 0. With a 0 position increment gap and
89 * the typical default token position increment of 1, all terms in a field,
90 * including across Fieldable instances, are in successive positions, allowing
91 * exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
92 *
93 * @param fieldName Fieldable name being indexed.
94 * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
95 */
96 public int getPositionIncrementGap(String fieldName)
97 {
98 return 0;
99 }
100
101 /** Frees persistent resources used by this Analyzer */
102 public void close() {
103 tokenStreams.close();
104 tokenStreams = null;
105 }
106 }