1 package org.apache.lucene.index;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import java.io.IOException;
21
22 import org.apache.lucene.store.Directory;
23 import org.apache.lucene.store.BufferedIndexInput;
24 import org.apache.lucene.util.cache.Cache;
25 import org.apache.lucene.util.cache.SimpleLRUCache;
26 import org.apache.lucene.util.CloseableThreadLocal;
27
28 /** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
29 * Directory. Pairs are accessed either by Term or by ordinal position the
30 * set. */
31
32 final class TermInfosReader {
33 private Directory directory;
34 private String segment;
35 private FieldInfos fieldInfos;
36
37 private CloseableThreadLocal threadResources = new CloseableThreadLocal();
38 private SegmentTermEnum origEnum;
39 private long size;
40
41 private Term[] indexTerms = null;
42 private TermInfo[] indexInfos;
43 private long[] indexPointers;
44
45 private SegmentTermEnum indexEnum;
46
47 private int indexDivisor = 1;
48 private int totalIndexInterval;
49
50 private final static int DEFAULT_CACHE_SIZE = 1024;
51
52 /**
53 * Per-thread resources managed by ThreadLocal
54 */
55 private static final class ThreadResources {
56 SegmentTermEnum termEnum;
57
58 // Used for caching the least recently looked-up Terms
59 Cache termInfoCache;
60 }
61
62 TermInfosReader(Directory dir, String seg, FieldInfos fis)
63 throws CorruptIndexException, IOException {
64 this(dir, seg, fis, BufferedIndexInput.BUFFER_SIZE);
65 }
66
67 TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize)
68 throws CorruptIndexException, IOException {
69 boolean success = false;
70
71 try {
72 directory = dir;
73 segment = seg;
74 fieldInfos = fis;
75
76 origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION,
77 readBufferSize), fieldInfos, false);
78 size = origEnum.size;
79 totalIndexInterval = origEnum.indexInterval;
80
81 indexEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION,
82 readBufferSize), fieldInfos, true);
83
84 success = true;
85 } finally {
86 // With lock-less commits, it's entirely possible (and
87 // fine) to hit a FileNotFound exception above. In
88 // this case, we want to explicitly close any subset
89 // of things that were opened so that we don't have to
90 // wait for a GC to do so.
91 if (!success) {
92 close();
93 }
94 }
95 }
96
97 public int getSkipInterval() {
98 return origEnum.skipInterval;
99 }
100
101 public int getMaxSkipLevels() {
102 return origEnum.maxSkipLevels;
103 }
104
105 /**
106 * <p>Sets the indexDivisor, which subsamples the number
107 * of indexed terms loaded into memory. This has a
108 * similar effect as {@link
109 * IndexWriter#setTermIndexInterval} except that setting
110 * must be done at indexing time while this setting can be
111 * set per reader. When set to N, then one in every
112 * N*termIndexInterval terms in the index is loaded into
113 * memory. By setting this to a value > 1 you can reduce
114 * memory usage, at the expense of higher latency when
115 * loading a TermInfo. The default value is 1.</p>
116 *
117 * <b>NOTE:</b> you must call this before the term
118 * index is loaded. If the index is already loaded,
119 * an IllegalStateException is thrown.
120 *
121 + @throws IllegalStateException if the term index has
122 * already been loaded into memory.
123 */
124 public void setIndexDivisor(int indexDivisor) throws IllegalStateException {
125 if (indexDivisor < 1)
126 throw new IllegalArgumentException("indexDivisor must be > 0: got " + indexDivisor);
127
128 if (indexTerms != null)
129 throw new IllegalStateException("index terms are already loaded");
130
131 this.indexDivisor = indexDivisor;
132 totalIndexInterval = origEnum.indexInterval * indexDivisor;
133 }
134
135 /** Returns the indexDivisor.
136 * @see #setIndexDivisor
137 */
138 public int getIndexDivisor() {
139 return indexDivisor;
140 }
141
142 final void close() throws IOException {
143 if (origEnum != null)
144 origEnum.close();
145 if (indexEnum != null)
146 indexEnum.close();
147 threadResources.close();
148 }
149
150 /** Returns the number of term/value pairs in the set. */
151 final long size() {
152 return size;
153 }
154
155 private ThreadResources getThreadResources() {
156 ThreadResources resources = (ThreadResources)threadResources.get();
157 if (resources == null) {
158 resources = new ThreadResources();
159 resources.termEnum = terms();
160 // Cache does not have to be thread-safe, it is only used by one thread at the same time
161 resources.termInfoCache = new SimpleLRUCache(DEFAULT_CACHE_SIZE);
162 threadResources.set(resources);
163 }
164 return resources;
165 }
166
167 private synchronized void ensureIndexIsRead() throws IOException {
168 if (indexTerms != null) // index already read
169 return; // do nothing
170 try {
171 int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index
172
173 indexTerms = new Term[indexSize];
174 indexInfos = new TermInfo[indexSize];
175 indexPointers = new long[indexSize];
176
177 for (int i = 0; indexEnum.next(); i++) {
178 indexTerms[i] = indexEnum.term();
179 indexInfos[i] = indexEnum.termInfo();
180 indexPointers[i] = indexEnum.indexPointer;
181
182 for (int j = 1; j < indexDivisor; j++)
183 if (!indexEnum.next())
184 break;
185 }
186 } finally {
187 indexEnum.close();
188 indexEnum = null;
189 }
190 }
191
192 /** Returns the offset of the greatest index entry which is less than or equal to term.*/
193 private final int getIndexOffset(Term term) {
194 int lo = 0; // binary search indexTerms[]
195 int hi = indexTerms.length - 1;
196
197 while (hi >= lo) {
198 int mid = (lo + hi) >>> 1;
199 int delta = term.compareTo(indexTerms[mid]);
200 if (delta < 0)
201 hi = mid - 1;
202 else if (delta > 0)
203 lo = mid + 1;
204 else
205 return mid;
206 }
207 return hi;
208 }
209
210 private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
211 enumerator.seek(indexPointers[indexOffset],
212 (indexOffset * totalIndexInterval) - 1,
213 indexTerms[indexOffset], indexInfos[indexOffset]);
214 }
215
216 /** Returns the TermInfo for a Term in the set, or null. */
217 TermInfo get(Term term) throws IOException {
218 return get(term, true);
219 }
220
221 /** Returns the TermInfo for a Term in the set, or null. */
222 private TermInfo get(Term term, boolean useCache) throws IOException {
223 if (size == 0) return null;
224
225 ensureIndexIsRead();
226
227 TermInfo ti;
228 ThreadResources resources = getThreadResources();
229 Cache cache = null;
230
231 if (useCache) {
232 cache = resources.termInfoCache;
233 // check the cache first if the term was recently looked up
234 ti = (TermInfo) cache.get(term);
235 if (ti != null) {
236 return ti;
237 }
238 }
239
240 // optimize sequential access: first try scanning cached enum w/o seeking
241 SegmentTermEnum enumerator = resources.termEnum;
242 if (enumerator.term() != null // term is at or past current
243 && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
244 || term.compareTo(enumerator.term()) >= 0)) {
245 int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
246 if (indexTerms.length == enumOffset // but before end of block
247 || term.compareTo(indexTerms[enumOffset]) < 0) {
248 // no need to seek
249
250 int numScans = enumerator.scanTo(term);
251 if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
252 ti = enumerator.termInfo();
253 if (cache != null && numScans > 1) {
254 // we only want to put this TermInfo into the cache if
255 // scanEnum skipped more than one dictionary entry.
256 // This prevents RangeQueries or WildcardQueries to
257 // wipe out the cache when they iterate over a large numbers
258 // of terms in order
259 cache.put(term, ti);
260 }
261 } else {
262 ti = null;
263 }
264
265 return ti;
266 }
267 }
268
269 // random-access: must seek
270 seekEnum(enumerator, getIndexOffset(term));
271 enumerator.scanTo(term);
272 if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
273 ti = enumerator.termInfo();
274 if (cache != null) {
275 cache.put(term, ti);
276 }
277 } else {
278 ti = null;
279 }
280 return ti;
281 }
282
283 /** Returns the nth term in the set. */
284 final Term get(int position) throws IOException {
285 if (size == 0) return null;
286
287 SegmentTermEnum enumerator = getThreadResources().termEnum;
288 if (enumerator != null && enumerator.term() != null &&
289 position >= enumerator.position &&
290 position < (enumerator.position + totalIndexInterval))
291 return scanEnum(enumerator, position); // can avoid seek
292
293 seekEnum(enumerator, position/totalIndexInterval); // must seek
294 return scanEnum(enumerator, position);
295 }
296
297 private final Term scanEnum(SegmentTermEnum enumerator, int position) throws IOException {
298 while(enumerator.position < position)
299 if (!enumerator.next())
300 return null;
301
302 return enumerator.term();
303 }
304
305 /** Returns the position of a Term in the set or -1. */
306 final long getPosition(Term term) throws IOException {
307 if (size == 0) return -1;
308
309 ensureIndexIsRead();
310 int indexOffset = getIndexOffset(term);
311
312 SegmentTermEnum enumerator = getThreadResources().termEnum;
313 seekEnum(enumerator, indexOffset);
314
315 while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}
316
317 if (term.compareTo(enumerator.term()) == 0)
318 return enumerator.position;
319 else
320 return -1;
321 }
322
323 /** Returns an enumeration of all the Terms and TermInfos in the set. */
324 public SegmentTermEnum terms() {
325 return (SegmentTermEnum)origEnum.clone();
326 }
327
328 /** Returns an enumeration of terms starting at or after the named term. */
329 public SegmentTermEnum terms(Term term) throws IOException {
330 // don't use the cache in this call because we want to reposition the
331 // enumeration
332 get(term, false);
333 return (SegmentTermEnum)getThreadResources().termEnum.clone();
334 }
335 }