1 package org.apache.lucene.demo;
2
3 /**
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 */
19
20 import org.apache.lucene.analysis.standard.StandardAnalyzer;
21 import org.apache.lucene.document.Document;
22 import org.apache.lucene.index.IndexReader;
23 import org.apache.lucene.index.IndexWriter;
24 import org.apache.lucene.index.Term;
25 import org.apache.lucene.index.TermEnum;
26 import java.io.File;
27 import java.util.Date;
28 import java.util.Arrays;
29
30 /** Indexer for HTML files. */
31 public class IndexHTML {
32 private IndexHTML() {}
33
34 private static boolean deleting = false; // true during deletion pass
35 private static IndexReader reader; // existing index
36 private static IndexWriter writer; // new index being built
37 private static TermEnum uidIter; // document id iterator
38
39 /** Indexer for HTML files.*/
40 public static void main(String[] argv) {
41 try {
42 String index = "index";
43 boolean create = false;
44 File root = null;
45
46 String usage = "IndexHTML [-create] [-index <index>] <root_directory>";
47
48 if (argv.length == 0) {
49 System.err.println("Usage: " + usage);
50 return;
51 }
52
53 for (int i = 0; i < argv.length; i++) {
54 if (argv[i].equals("-index")) { // parse -index option
55 index = argv[++i];
56 } else if (argv[i].equals("-create")) { // parse -create option
57 create = true;
58 } else if (i != argv.length-1) {
59 System.err.println("Usage: " + usage);
60 return;
61 } else
62 root = new File(argv[i]);
63 }
64
65 Date start = new Date();
66
67 if (!create) { // delete stale docs
68 deleting = true;
69 indexDocs(root, index, create);
70 }
71 writer = new IndexWriter(index, new StandardAnalyzer(), create,
72 new IndexWriter.MaxFieldLength(1000000));
73 indexDocs(root, index, create); // add new docs
74
75 System.out.println("Optimizing index...");
76 writer.optimize();
77 writer.close();
78
79 Date end = new Date();
80
81 System.out.print(end.getTime() - start.getTime());
82 System.out.println(" total milliseconds");
83
84 } catch (Exception e) {
85 System.out.println(" caught a " + e.getClass() +
86 "\n with message: " + e.getMessage());
87 }
88 }
89
90 /* Walk directory hierarchy in uid order, while keeping uid iterator from
91 /* existing index in sync. Mismatches indicate one of: (a) old documents to
92 /* be deleted; (b) unchanged documents, to be left alone; or (c) new
93 /* documents, to be indexed.
94 */
95
96 private static void indexDocs(File file, String index, boolean create)
97 throws Exception {
98 if (!create) { // incrementally update
99
100 reader = IndexReader.open(index); // open existing index
101 uidIter = reader.terms(new Term("uid", "")); // init uid iterator
102
103 indexDocs(file);
104
105 if (deleting) { // delete rest of stale docs
106 while (uidIter.term() != null && uidIter.term().field() == "uid") {
107 System.out.println("deleting " +
108 HTMLDocument.uid2url(uidIter.term().text()));
109 reader.deleteDocuments(uidIter.term());
110 uidIter.next();
111 }
112 deleting = false;
113 }
114
115 uidIter.close(); // close uid iterator
116 reader.close(); // close existing index
117
118 } else // don't have exisiting
119 indexDocs(file);
120 }
121
122 private static void indexDocs(File file) throws Exception {
123 if (file.isDirectory()) { // if a directory
124 String[] files = file.list(); // list its files
125 Arrays.sort(files); // sort the files
126 for (int i = 0; i < files.length; i++) // recursively index them
127 indexDocs(new File(file, files[i]));
128
129 } else if (file.getPath().endsWith(".html") || // index .html files
130 file.getPath().endsWith(".htm") || // index .htm files
131 file.getPath().endsWith(".txt")) { // index .txt files
132
133 if (uidIter != null) {
134 String uid = HTMLDocument.uid(file); // construct uid for doc
135
136 while (uidIter.term() != null && uidIter.term().field() == "uid" &&
137 uidIter.term().text().compareTo(uid) < 0) {
138 if (deleting) { // delete stale docs
139 System.out.println("deleting " +
140 HTMLDocument.uid2url(uidIter.term().text()));
141 reader.deleteDocuments(uidIter.term());
142 }
143 uidIter.next();
144 }
145 if (uidIter.term() != null && uidIter.term().field() == "uid" &&
146 uidIter.term().text().compareTo(uid) == 0) {
147 uidIter.next(); // keep matching docs
148 } else if (!deleting) { // add new docs
149 Document doc = HTMLDocument.Document(file);
150 System.out.println("adding " + doc.get("path"));
151 writer.addDocument(doc);
152 }
153 } else { // creating a new index
154 Document doc = HTMLDocument.Document(file);
155 System.out.println("adding " + doc.get("path"));
156 writer.addDocument(doc); // add docs unconditionally
157 }
158 }
159 }
160 }