1 package org.apache.lucene.analysis.standard;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
import org.apache.lucene.analysis.*;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;
26
27 /**
28 * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
29 * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words.
30 *
31 * @version $Id: StandardAnalyzer.java 692634 2008-09-06 10:58:33Z mikemccand $
32 */
33 public class StandardAnalyzer extends Analyzer {
34 private Set stopSet;
35
36 /**
37 * Specifies whether deprecated acronyms should be replaced with HOST type.
38 * This is false by default to support backward compatibility.
39 *
40 * @deprecated this should be removed in the next release (3.0).
41 *
42 * See https://issues.apache.org/jira/browse/LUCENE-1068
43 */
44 private boolean replaceInvalidAcronym = defaultReplaceInvalidAcronym;
45
46 private static boolean defaultReplaceInvalidAcronym;
47
48 // Default to true (fixed the bug), unless the system prop is set
49 static {
50 final String v = System.getProperty("org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym");
51 if (v == null || v.equals("true"))
52 defaultReplaceInvalidAcronym = true;
53 else
54 defaultReplaceInvalidAcronym = false;
55 }
56
57 /**
58 *
59 * @return true if new instances of StandardTokenizer will
60 * replace mischaracterized acronyms
61 *
62 * See https://issues.apache.org/jira/browse/LUCENE-1068
63 * @deprecated This will be removed (hardwired to true) in 3.0
64 */
65 public static boolean getDefaultReplaceInvalidAcronym() {
66 return defaultReplaceInvalidAcronym;
67 }
68
69 /**
70 *
71 * @param replaceInvalidAcronym Set to true to have new
72 * instances of StandardTokenizer replace mischaracterized
73 * acronyms by default. Set to false to preseve the
74 * previous (before 2.4) buggy behavior. Alternatively,
75 * set the system property
76 * org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
77 * to false.
78 *
79 * See https://issues.apache.org/jira/browse/LUCENE-1068
80 * @deprecated This will be removed (hardwired to true) in 3.0
81 */
82 public static void setDefaultReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
83 defaultReplaceInvalidAcronym = replaceInvalidAcronym;
84 }
85
86
87 /** An array containing some common English words that are usually not
88 useful for searching. */
89 public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
90
91 /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */
92 public StandardAnalyzer() {
93 this(STOP_WORDS);
94 }
95
96 /** Builds an analyzer with the given stop words. */
97 public StandardAnalyzer(Set stopWords) {
98 stopSet = stopWords;
99 }
100
101 /** Builds an analyzer with the given stop words. */
102 public StandardAnalyzer(String[] stopWords) {
103 stopSet = StopFilter.makeStopSet(stopWords);
104 }
105
106 /** Builds an analyzer with the stop words from the given file.
107 * @see WordlistLoader#getWordSet(File)
108 */
109 public StandardAnalyzer(File stopwords) throws IOException {
110 stopSet = WordlistLoader.getWordSet(stopwords);
111 }
112
113 /** Builds an analyzer with the stop words from the given reader.
114 * @see WordlistLoader#getWordSet(Reader)
115 */
116 public StandardAnalyzer(Reader stopwords) throws IOException {
117 stopSet = WordlistLoader.getWordSet(stopwords);
118 }
119
120 /**
121 *
122 * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
123 *
124 * See https://issues.apache.org/jira/browse/LUCENE-1068
125 *
126 * @deprecated Remove in 3.X and make true the only valid value
127 */
128 public StandardAnalyzer(boolean replaceInvalidAcronym) {
129 this(STOP_WORDS);
130 this.replaceInvalidAcronym = replaceInvalidAcronym;
131 }
132
133 /**
134 * @param stopwords The stopwords to use
135 * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
136 *
137 * See https://issues.apache.org/jira/browse/LUCENE-1068
138 *
139 * @deprecated Remove in 3.X and make true the only valid value
140 */
141 public StandardAnalyzer(Reader stopwords, boolean replaceInvalidAcronym) throws IOException{
142 this(stopwords);
143 this.replaceInvalidAcronym = replaceInvalidAcronym;
144 }
145
146 /**
147 * @param stopwords The stopwords to use
148 * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
149 *
150 * See https://issues.apache.org/jira/browse/LUCENE-1068
151 *
152 * @deprecated Remove in 3.X and make true the only valid value
153 */
154 public StandardAnalyzer(File stopwords, boolean replaceInvalidAcronym) throws IOException{
155 this(stopwords);
156 this.replaceInvalidAcronym = replaceInvalidAcronym;
157 }
158
159 /**
160 *
161 * @param stopwords The stopwords to use
162 * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
163 *
164 * See https://issues.apache.org/jira/browse/LUCENE-1068
165 *
166 * @deprecated Remove in 3.X and make true the only valid value
167 */
168 public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{
169 this(stopwords);
170 this.replaceInvalidAcronym = replaceInvalidAcronym;
171 }
172
173 /**
174 * @param stopwords The stopwords to use
175 * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
176 *
177 * See https://issues.apache.org/jira/browse/LUCENE-1068
178 *
179 * @deprecated Remove in 3.X and make true the only valid value
180 */
181 public StandardAnalyzer(Set stopwords, boolean replaceInvalidAcronym) throws IOException{
182 this(stopwords);
183 this.replaceInvalidAcronym = replaceInvalidAcronym;
184 }
185
186 /** Constructs a {@link StandardTokenizer} filtered by a {@link
187 StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
188 public TokenStream tokenStream(String fieldName, Reader reader) {
189 StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
190 tokenStream.setMaxTokenLength(maxTokenLength);
191 TokenStream result = new StandardFilter(tokenStream);
192 result = new LowerCaseFilter(result);
193 result = new StopFilter(result, stopSet);
194 return result;
195 }
196
197 private static final class SavedStreams {
198 StandardTokenizer tokenStream;
199 TokenStream filteredTokenStream;
200 }
201
202 /** Default maximum allowed token length */
203 public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
204
205 private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
206
207 /**
208 * Set maximum allowed token length. If a token is seen
209 * that exceeds this length then it is discarded. This
210 * setting only takes effect the next time tokenStream or
211 * reusableTokenStream is called.
212 */
213 public void setMaxTokenLength(int length) {
214 maxTokenLength = length;
215 }
216
217 /**
218 * @see #setMaxTokenLength
219 */
220 public int getMaxTokenLength() {
221 return maxTokenLength;
222 }
223
224 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
225 SavedStreams streams = (SavedStreams) getPreviousTokenStream();
226 if (streams == null) {
227 streams = new SavedStreams();
228 setPreviousTokenStream(streams);
229 streams.tokenStream = new StandardTokenizer(reader);
230 streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
231 streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
232 streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
233 } else {
234 streams.tokenStream.reset(reader);
235 }
236 streams.tokenStream.setMaxTokenLength(maxTokenLength);
237
238 streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
239
240 return streams.filteredTokenStream;
241 }
242
243 /**
244 *
245 * @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
246 *
247 * See https://issues.apache.org/jira/browse/LUCENE-1068
248 * @deprecated This will be removed (hardwired to true) in 3.0
249 */
250 public boolean isReplaceInvalidAcronym() {
251 return replaceInvalidAcronym;
252 }
253
254 /**
255 *
256 * @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
257 *
258 * See https://issues.apache.org/jira/browse/LUCENE-1068
259 * @deprecated This will be removed (hardwired to true) in 3.0
260 */
261 public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
262 this.replaceInvalidAcronym = replaceInvalidAcronym;
263 }
264 }