Source code: com/telefonicasoluciones/search/server/HLIndexThread.java
1 package com.telefonicasoluciones.search.server;
2
3 import com.telefonicasoluciones.search.server.parser.html.*;
4 import com.telefonicasoluciones.search.server.parser.xml.*;
5 import com.telefonicasoluciones.search.server.parser.pdf.*;
6 import com.telefonicasoluciones.search.server.util.*;
7 import java.io.*;
8 import java.net.*;
9 import java.util.*;
10 import java.util.zip.ZipEntry;
11 import java.util.zip.ZipInputStream;
12 import org.apache.lucene.document.*;
13 import org.apache.lucene.index.*;
14 import org.apache.lucene.store.Directory;
15 import org.apache.lucene.store.FSDirectory;
16
17 public class HLIndexThread extends Thread {
// Document type codes assigned by the index methods and dispatched on in their switch statements.
18 private static final int TEXT = 0;
19 private static final int HTML = 1;
20 private static final int XML = 2;
21 private static final int ZIP = 3;
22 private static final int PDF = 4;
// Handler that supplies the session, the configuration getters, the log writers and the socket responses.
23 private HLHandler hls;
// Lucene directory opened by the constructor from hls.getIndexDirectory().
24 private Directory directory;
// URL strings collected by fixLinks(); walked by the recursive index method.
25 private ArrayList links;
// True when the constructor found no "segments" file in the index directory;
// passed as the create flag of the IndexWriter in the recursive index method.
26 boolean newindex = true;
// When true, run() asks the index method to follow the links collected in "links".
27 private boolean recursive = false;
// When true, documents already stored under the same "path" term are deleted before re-adding.
28 private boolean deleteOldDocuments = false;
29
/**
 * Creates an indexing thread inside the handler's thread group and opens the
 * Lucene directory named by {@code newHls.getIndexDirectory()}.
 *
 * <p>{@code newindex} is set to true when that directory contains no
 * {@code segments} file.
 *
 * @param newHls handler providing configuration, logging and the session to index
 * @throws Exception if the index directory cannot be opened; the original
 *         failure is attached as the cause
 */
public HLIndexThread(HLHandler newHls) throws Exception {
    super(newHls.TG, "server-index-"+(newHls.TG.activeCount()+1));
    hls = newHls;
    links = new ArrayList();
    try {
        directory = FSDirectory.getDirectory(hls.getIndexDirectory(), false);
        // File(parent, child) inserts a separator only when one is needed, so the
        // same lookup works whether or not the configured path ends with "/".
        // (The previous lastIndexOf("/") < length() test was always true.)
        newindex = !new File(hls.getIndexDirectory(), "segments").exists();
    } catch(Exception e) {
        // Keep the historical message text but preserve the cause for diagnosis.
        throw new Exception(e.toString(), e);
    }
}
44 private void index(HTTPClient session, boolean deleteOldDocuments) throws HLIndexException {
45 try {
46 int type = -1;
47 boolean noerror = true;
48
49 Document doc = new Document();
50 Reader content = null;
51 String url = session.getUrlString();
52 doc.add(Field.Keyword("path", url));
53 if(session.getContentType().indexOf("text/html")!=-1) {
54 type = HTML;
55 if(url.endsWith(".xml"))
56 type = XML;
57 } else if(session.getContentType().indexOf("text/plain")!=-1) {
58 type = TEXT;
59 } else if(session.getContentType().indexOf("application/zip")!=-1) {
60 type = ZIP;
61 } else if(session.getContentType().indexOf("application/pdf")!=-1) {
62 type = PDF;
63 } else if(url.endsWith(".xml")) {
64 type = XML;
65 } else {
66 String[] ext = (String[]) hls.getFileTypes().keySet().toArray();
67 String[] values =(String[]) hls.getFileTypes().values().toArray();
68 for(int i=ext.length; --i>=0;) {
69 if(url.endsWith(ext[i])) {
70 if(values[i].toLowerCase().trim().equals("html")) {
71 type = HTML;
72 } else if (values[i].toLowerCase().trim().equals("xml")) {
73 type = XML;
74 } else if (values[i].toLowerCase().trim().equals("text")) {
75 type = TEXT;
76 }
77 }
78 }
79 }
80 switch(type) {
81 case HTML:
82 ArrayList tempLinks = new ArrayList();
83 try {
84 HTMLParser htmlparser = new HTMLParser(session.getContent());
85 tempLinks = htmlparser.getLinks();
86 content = htmlparser.getContentReader();
87 if(htmlparser.getTitle() != null)
88 doc.add(Field.Text("title", htmlparser.getTitle()));
89 else
90 doc.add(Field.Text("title", ""));
91 HashMap meta = htmlparser.getMeta();
92 Object[] keys = meta.keySet().toArray();
93 for(int i=meta.size(); --i>=0; ) {
94 doc.add(Field.Keyword((String)keys[i],(String)meta.get(keys[i])));
95 }
96 } catch(Exception e) {
97 hls.writeErrorLog("HTML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
98 noerror = false;
99 }
100 fixLinks(tempLinks, session);
101 doc.add(Field.Text("type", "html"));
102 break;
103 case XML:
104 try {
105 XMLParser xmlparser = new XMLParser(session.getContent());
106 content = xmlparser.getContentReader();
107 } catch(Exception e) {
108 hls.writeErrorLog("XML parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
109 noerror = false;
110 }
111 doc.add(Field.Text("type", "xml"));
112 break;
113 case TEXT:
114 content = new InputStreamReader(new ByteArrayInputStream(session.getContent().getBytes()));
115 doc.add(Field.Text("type", "text"));
116 break;
117 case ZIP:
118 ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(session.getContent().getBytes()));
119 Vector zipDocuments = new Vector();
120 ZipEntry entry;
121 while((entry = zis.getNextEntry()) != null) {
122 FileOutputStream fos = new FileOutputStream(String.valueOf(hls.getTemporaryDirectory()) + String.valueOf(entry.getName()));
123 byte b[] = new byte[512];
124 for(int len = 0; (len = zis.read(b)) != -1;)
125 fos.write(b, 0, len);
126 index(new File(hls.getTemporaryDirectory() + entry.getName()), session);
127 fos.close();
128 }
129 break;
130 case PDF:
131 try {
132 PDFHandler p = new PDFHandler();
133 p.parse(new ByteArrayInputStream(session.getContent().getBytes()));
134 content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
135 } catch(Exception e) {
136 hls.writeErrorLog("PDF parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
137 noerror = false;
138 }
139 doc.add(Field.Text("type", "pdf"));
140 break;
141 }
142 if(content != null)
143 doc.add(Field.Text("contents", content));
144 if(type == -1)
145 hls.writeErrorLog("Document type "+session.getContentType()+" uknown ["+session.getFile()+"]");
146 if(noerror) {
147 if(deleteOldDocuments) {
148 IndexReader reader = IndexReader.open(directory);
149 Term term = new Term("path",session.getUrlString());
150 hls.writeIndexLog("Deleting "+reader.delete(term)+" documents");
151 reader.close();
152 }
153 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), false);
154 writer.mergeFactor = 20;
155 writer.addDocument(doc);
156 writer.optimize();
157 writer.close();
158 hls.writeSocketResponse("<index status=\"active\" target=\"success\">"+url+"</index>");
159 hls.writeIndexLog("Document ["+session.getFile()+"] added");
160 } else {
161 hls.writeSocketResponse("<index status=\"active\" target=\"error\">"+url+"</index>");
162 }
163 doc = null;
164 } catch(IOException e) {
165 hls.writeErrorLog("307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
166 } catch(NullPointerException npe) {
167 hls.writeErrorLog("301 name=\"index\" method=\"index\" [NullPointer]");
168 } catch(Exception e) {
169 hls.writeErrorLog("302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
170 }
171 }
172 public void deleteOldDocuments(boolean newDeleteOldDocuments) {
173 deleteOldDocuments = newDeleteOldDocuments;
174 }
175 private String fix(String link) throws Exception {
176 String auxString = link.substring(0, link.lastIndexOf("/", link.indexOf("..")) - 1);
177 auxString = auxString.substring(0, auxString.lastIndexOf("/"));
178 link = String.valueOf(auxString) + String.valueOf(link.substring(link.indexOf("..") + 2, link.length()));
179 return link;
180 }
181 private void index(HTTPClient session, boolean recursive, boolean deleteOldDocuments) {
182 try {
183 boolean noerror = true;
184 int type = -1;
185 Document doc = new Document();
186 Reader content = null;
187 hls.writeSocketResponse("<index status=\"active\"/>");
188 String url = session.getUrlString();
189 doc.add(Field.Keyword("path", url));
190 if(session.getContentType().indexOf("text/html")!=-1) {
191 type = HTML;
192 if(url.endsWith(".xml"))
193 type = XML;
194 } else if(session.getContentType().indexOf("text/plain")!=-1&&url.endsWith(".txt")) {
195 type = TEXT;
196 } else if(session.getContentType().indexOf("application/zip")!=-1) {
197 type = ZIP;
198 } else if(session.getContentType().indexOf("application/pdf")!=-1) {
199 type = PDF;
200 } else if(url.endsWith(".xml")) {
201 type = XML;
202 } else {
203 String[] ext = (String[]) hls.getFileTypes().keySet().toArray();
204 String[] values =(String[]) hls.getFileTypes().values().toArray();
205 for(int i=ext.length; --i>=0;) {
206 if(url.endsWith(ext[i])) {
207 if(values[i].toLowerCase().trim().equals("html")) {
208 type = HTML;
209 } else if (values[i].toLowerCase().trim().equals("xml")) {
210 type = XML;
211 } else if (values[i].toLowerCase().trim().equals("text")) {
212 type = TEXT;
213 }
214 }
215 }
216 }
217 switch(type) {
218 case HTML:
219 ArrayList tempLinks = new ArrayList();
220 try {
221 HTMLParser htmlparser = new HTMLParser(session.getContent());
222 tempLinks = htmlparser.getLinks();
223 content = htmlparser.getContentReader();
224 if(htmlparser.getTitle() != null)
225 doc.add(Field.Text("title", htmlparser.getTitle()));
226 else
227 doc.add(Field.Text("title", ""));
228 } catch(Exception e) {
229 hls.writeErrorLog("HTML parser error: "+e.getMessage()+" ["+session.getFile()+"]");
230 noerror = false;
231 }
232 fixLinks(tempLinks, session);
233 doc.add(Field.Text("type", "html"));
234 break;
235 case XML:
236 try {
237 XMLParser xmlparser = new XMLParser(session.getContent());
238 ArrayList al = xmlparser.getTokens();
239 System.out.println("Tokens:");
240 for(int u=0; u<al.size(); u++) {
241 System.out.println(al.get(u));
242 }
243 content = xmlparser.getContentReader();
244 } catch(Exception e) {
245 hls.writeErrorLog("XML parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
246 noerror = false;
247 }
248 doc.add(Field.Text("type", "xml"));
249 break;
250 case TEXT:
251 content = new InputStreamReader(new ByteArrayInputStream(session.getContent().getBytes()));
252 doc.add(Field.Text("type", "text"));
253 break;
254 case ZIP:
255 ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(session.getContent().getBytes()));
256 Vector zipDocuments = new Vector();
257 ZipEntry entry;
258 while((entry = zis.getNextEntry()) != null) {
259 FileOutputStream fos = new FileOutputStream(String.valueOf(hls.getTemporaryDirectory()) + String.valueOf(entry.getName()));
260 byte b[] = new byte[512];
261 for(int len = 0; (len = zis.read(b)) != -1;)
262 fos.write(b, 0, len);
263 index(new File(hls.getTemporaryDirectory() + entry.getName()), session);
264 fos.close();
265 }
266 break;
267 case PDF:
268 try {
269 PDFHandler p = new PDFHandler();
270 p.parse(new ByteArrayInputStream(session.getContent().getBytes()));
271 content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
272 } catch(Exception e) {
273 hls.writeErrorLog("PDF parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
274 noerror = false;
275 }
276 doc.add(Field.Text("type", "pdf"));
277 break;
278 }
279 if(content != null)
280 doc.add(Field.Text("contents", content));
281 if(type == -1)
282 hls.writeErrorLog("Document type "+session.getContentType()+" uknown ["+session.getFile()+"]");
283 if(newindex) {
284 hls.writeIndexLog("Building a new indexes");
285 }
286 if(noerror) {
287 if(deleteOldDocuments && !newindex) {
288 IndexReader reader = IndexReader.open(directory);
289 Term term = new Term("path",session.getUrlString());
290 hls.writeIndexLog("Deleting "+reader.delete(term)+" documents");
291 reader.close();
292 }
293 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), newindex);
294 writer.mergeFactor = 20;
295 writer.addDocument(doc);
296 writer.optimize();
297 writer.close();
298 hls.writeSocketResponse("<index status=\"active\" target=\"success\">"+url+"</index>");
299 hls.writeIndexLog("Document ["+session.getFile()+"] added");
300 if(recursive) {
301 for(int i=0; i<links.size(); i++) {
302 try {
303 session.load(HTTPClient.getDocumentString((String) links.get(i)));
304 index(session, deleteOldDocuments);
305 } catch(HTTPClientException e) {
306 hls.writeErrorLog("304 name=\"index\" method=\"index\" url=\""+HTTPClient.getDocumentString((String) links.get(i))+"\" ["+e.getMessage()+"]");
307 } catch(Exception e) {
308 hls.writeErrorLog("302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
309 }
310 }
311 }
312 } else {
313 hls.writeSocketResponse("<index status=\"active\" target=\"error\">"+url+"</index>");
314 }
315 hls.writeSocketResponse("<index status=\"inactive\"/>");
316 doc = null;
317 finalize();
318 } catch(IOException e) {
319 hls.writeErrorLog("307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
320 } catch(NullPointerException npe) {
321 hls.writeErrorLog("301 name=\"index\" method=\"index\" [NullPointer]");
322 } catch(Exception e) {
323 hls.writeErrorLog("302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
324 }
325 }
326 private void index(File fileName, HTTPClient session) throws IOException {
327 try {
328 int type = -1;
329 boolean noerror = true;
330 Document doc = new Document();
331 Reader content = null;
332 String url = session.getUrlString();
333 doc.add(Field.Keyword("path", url));
334 if(url.endsWith(".html")||url.endsWith(".htm")) {
335 type = HTML;
336 } else if(url.endsWith(".txt")) {
337 type = TEXT;
338 } else if(url.endsWith(".pdf")) {
339 type = PDF;
340 } else if(url.endsWith(".xml")) {
341 type = XML;
342 }
343 switch(type) {
344 case HTML:
345 try {
346 StringBuffer sb = new StringBuffer();
347 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
348 String line;
349 while((line = br.readLine()) != null) { sb.append(line); }
350 HTMLParser htmlparser = new HTMLParser(sb.toString());
351 content = htmlparser.getContentReader();
352 if(htmlparser.getTitle() != null)
353 doc.add(Field.Text("title", htmlparser.getTitle()));
354 else
355 doc.add(Field.Text("title", ""));
356 } catch(Exception e) {
357 hls.writeIndexLog("HTML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
358 noerror = false;
359 }
360 doc.add(Field.Keyword("type", "HTML"));
361 break;
362 case XML:
363 try {
364 XMLParser xmlparser = new XMLParser(session.getContent());
365 content = xmlparser.getContentReader();
366 } catch(Exception e) {
367 hls.writeErrorLog("XML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
368 noerror = false;
369 }
370 doc.add(Field.Keyword("type", "XML"));
371 break;
372 case PDF:
373 try {
374 PDFHandler p = new PDFHandler();
375 p.parse(new FileInputStream(fileName));
376 content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
377 } catch(Exception e) {
378 hls.writeErrorLog("PDF parse error: "+e.getMessage()+" ["+session.getFile()+"]");
379 noerror = false;
380 }
381 doc.add(Field.Keyword("type", "PDF"));
382 break;
383 case TEXT:
384 content = new InputStreamReader(new FileInputStream(fileName));
385 doc.add(Field.Keyword("type", "TEXT"));
386 break;
387 }
388 if(content != null)
389 doc.add(Field.Text("contents", content));
390 if(noerror) {
391 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), false);
392 writer.mergeFactor = 20;
393 writer.addDocument(doc);
394 writer.optimize();
395 writer.close();
396 doc = null;
397 }
398 if(!fileName.delete())
399 hls.writeIndexLog("Can't remove unzipped file: "+fileName.getName());
400 } catch(IOException e) {
401 hls.writeErrorLog("307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
402 }
403 }
404 public void run() {
405 try {
406 index(hls.getSession(), recursive, deleteOldDocuments);
407 } catch(Exception e) {
408 hls.writeErrorLog("321 name=\"index\" method=\"run\" ["+e.getMessage()+"]");
409 }
410 }
411 public void setRecursive(boolean newRecursive) {
412 recursive = newRecursive;
413 }
414 private void fixLinks(ArrayList tempLinks, HTTPClient session) {
415 int docs = 0;
416 for(int i=tempLinks.size(); --i>=0 && tempLinks.get(i)!=null; ) {
417 String link = (String) tempLinks.get(i);
418 String directory = "";
419 try {
420 directory = session.getFile().substring(0, session.getFile().lastIndexOf("/"));
421 } catch(StringIndexOutOfBoundsException e) {}
422 if(link.indexOf("mailto:") != -1 || link.indexOf("#") != -1 || link.indexOf("javascript:") != -1 || link.indexOf("http://") != -1)
423 continue;
424 if(!link.startsWith("/")) {
425 if(link.startsWith("https://")&&link.indexOf("https://")>1) {
426 continue;
427 }
428 if(!link.startsWith("http://")) {
429 if(link.indexOf("http://")>1) {
430 link = link.substring(link.indexOf("http://")+7,link.length());
431 }
432 link = "http://"+session.getHost()+directory+"/"+link;
433 }
434 do {
435 if(link.indexOf("..") == -1)
436 break;
437 try {
438 link = fix(link);
439 continue;
440 } catch(Exception e) {
441 break;
442 }
443 } while(true);
444 } else {
445 link = "http://"+session.getHost()+link;
446 }
447 link = link.trim();
448 if(!links.contains(link) && HTTPClient.getServerString(link).equals(session.getHost())) {
449 docs++;
450 links.add(link);
451 }
452 }
453 if(docs > 0) { hls.writeIndexLog("Found "+docs+" new documents"); }
454 }
455 public void finalize() {
456 interrupt();
457 }
458 }