Source code: com/telefonicasoluciones/search/server/HLIndex.java
1 package com.telefonicasoluciones.search.server;
2
3 import com.telefonicasoluciones.search.server.parser.html.*;
4 import com.telefonicasoluciones.search.server.parser.xml.*;
5 import com.telefonicasoluciones.search.server.parser.pdf.*;
6 import com.telefonicasoluciones.search.server.util.*;
7 import java.io.*;
8 import java.net.*;
9 import java.util.*;
10 import java.util.zip.ZipEntry;
11 import java.util.zip.ZipInputStream;
12 import org.apache.lucene.document.*;
13 import org.apache.lucene.index.*;
14 import org.apache.lucene.store.Directory;
15 import org.apache.lucene.store.FSDirectory;
16
17 public class HLIndex {
// Document type codes. The index methods pick one of these from the HTTP
// content type or the URL/file extension and switch on it to choose a parser.
18 private static final int TEXT = 0;
19 private static final int HTML = 1;
20 private static final int XML = 2;
21 private static final int ZIP = 3;
22 private static final int PDF = 4;
// Handler used throughout this class for configuration (index directory,
// temporary directory, file-type map) and for writing log entries.
23 private HLHandler hls;
// Lucene directory opened in the constructor; every IndexReader/IndexWriter
// in this class is opened on it.
24 private Directory directory;
// HTTP client holding the document currently being indexed; created in index(URL).
25 private HTTPClient session;
// Absolute links collected by fixLinks(); index(URL) walks this list when
// recursive indexing is enabled.
26 private ArrayList links;
// Socket that writeResponse() writes status lines to.
27 private Socket connector;
// When true, index(URL) also indexes every link collected in 'links'.
28 private boolean recursive = false;
// When true, documents already stored under the same "path" term are deleted
// from the index before the new document is added.
29 private boolean deleteOldDocuments = false;
30
/**
 * Creates an indexer bound to a client socket and a handler.
 *
 * The Lucene directory is obtained from the handler's index directory with
 * the "create" flag set to false, and the list of collected links starts
 * out empty.
 *
 * @param s      socket that status responses are written to
 * @param newHls handler supplying the index directory
 * @throws HLIndexException if obtaining the directory fails for any reason;
 *         only the message of the underlying exception is carried over
 */
public HLIndex(Socket s, HLHandler newHls) throws HLIndexException {
    this.hls = newHls;
    this.connector = s;
    this.links = new ArrayList();
    try {
        this.directory = FSDirectory.getDirectory(newHls.getIndexDirectory(), false);
    } catch (Exception failure) {
        throw new HLIndexException(failure.getMessage());
    }
}
41 private void index() throws HLIndexException {
42 try {
43 int type = -1;
44 boolean noerror = true;
45
46 Document doc = new Document();
47 Reader content = null;
48 String url = session.getUrlString();
49 doc.add(Field.Keyword("path", url));
50 if(session.getContentType().indexOf("text/html")!=-1) {
51 type = HTML;
52 if(url.endsWith(".xml"))
53 type = XML;
54 } else if(session.getContentType().indexOf("text/plain")!=-1) {
55 type = TEXT;
56 } else if(session.getContentType().indexOf("application/zip")!=-1) {
57 type = ZIP;
58 } else if(session.getContentType().indexOf("application/pdf")!=-1) {
59 type = PDF;
60 } else if(url.endsWith(".xml")) {
61 type = XML;
62 } else {
63 String[] ext = (String[]) hls.getFileTypes().keySet().toArray();
64 String[] values =(String[]) hls.getFileTypes().values().toArray();
65 for(int i=ext.length; --i>=0;) {
66 if(url.endsWith(ext[i])) {
67 if(values[i].toLowerCase().trim().equals("html")) {
68 type = HTML;
69 } else if (values[i].toLowerCase().trim().equals("xml")) {
70 type = XML;
71 } else if (values[i].toLowerCase().trim().equals("text")) {
72 type = TEXT;
73 }
74 }
75 }
76 }
77 switch(type) {
78 case HTML:
79 ArrayList tempLinks = new ArrayList();
80 try {
81 HTMLParser htmlparser = new HTMLParser(session.getContent());
82 tempLinks = htmlparser.getLinks();
83 content = htmlparser.getContentReader();
84 if(htmlparser.getTitle() != null)
85 doc.add(Field.Text("title", htmlparser.getTitle()));
86 else
87 doc.add(Field.Text("title", ""));
88 HashMap meta = htmlparser.getMeta();
89 Object[] keys = meta.keySet().toArray();
90 for(int i=meta.size(); --i>=0; ) {
91 doc.add(Field.Keyword((String)keys[i],(String)meta.get(keys[i])));
92 }
93 } catch(Exception e) {
94 hls.writeLog(HLHandler.ERROR_LOG,"HTML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
95 noerror = false;
96 }
97 fixLinks(tempLinks, session);
98 doc.add(Field.Text("type", "html"));
99 break;
100 case XML:
101 try {
102 XMLParser xmlparser = new XMLParser(session.getContent());
103 content = xmlparser.getContentReader();
104 } catch(Exception e) {
105 hls.writeLog(HLHandler.ERROR_LOG,"XML parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
106 noerror = false;
107 }
108 doc.add(Field.Text("type", "xml"));
109 break;
110 case TEXT:
111 content = new InputStreamReader(new ByteArrayInputStream(session.getContent().getBytes()));
112 doc.add(Field.Text("type", "text"));
113 break;
114 case ZIP:
115 ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(session.getContent().getBytes()));
116 Vector zipDocuments = new Vector();
117 ZipEntry entry;
118 while((entry = zis.getNextEntry()) != null) {
119 FileOutputStream fos = new FileOutputStream(String.valueOf(hls.getTemporaryDirectory()) + String.valueOf(entry.getName()));
120 byte b[] = new byte[512];
121 for(int len = 0; (len = zis.read(b)) != -1;)
122 fos.write(b, 0, len);
123 index(new File(hls.getTemporaryDirectory() + entry.getName()), session);
124 fos.close();
125 }
126 break;
127 case PDF:
128 try {
129 PDFHandler p = new PDFHandler();
130 p.parse(new ByteArrayInputStream(session.getContent().getBytes()));
131 content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
132 } catch(Exception e) {
133 hls.writeLog(HLHandler.ERROR_LOG,"PDF parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
134 noerror = false;
135 }
136 doc.add(Field.Text("type", "pdf"));
137 break;
138 }
139 if(content != null)
140 doc.add(Field.Text("contents", content));
141 if(type == -1)
142 hls.writeLog(HLHandler.ERROR_LOG,"Document type "+session.getContentType()+" uknown ["+session.getFile()+"]");
143 if(noerror) {
144 if(this.deleteOldDocuments) {
145 IndexReader reader = IndexReader.open(directory);
146 Term term = new Term("path",session.getUrlString());
147 hls.writeLog(HLHandler.INDEX_LOG,"Deleting "+reader.delete(term)+" documents");
148 reader.close();
149 }
150 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), false);
151 writer.mergeFactor = 20;
152 writer.addDocument(doc);
153 writer.optimize();
154 writer.close();
155 writeResponse("<index status=\"active\" target=\"success\">"+url+"</index>");
156 hls.writeLog(HLHandler.INDEX_LOG,"Document ["+session.getFile()+"] added");
157 } else {
158 writeResponse("<index status=\"active\" target=\"error\">"+url+"</index>");
159 }
160 doc = null;
161 } catch(IOException e) {
162 hls.writeLog(HLHandler.ERROR_LOG,"307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
163 } catch(NullPointerException npe) {
164 hls.writeLog(HLHandler.ERROR_LOG,"301 name=\"index\" method=\"index\" [NullPointer]");
165 } catch(Exception e) {
166 hls.writeLog(HLHandler.ERROR_LOG,"302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
167 }
168 }
169 public void deleteOldDocuments(boolean newDeleteOldDocuments) {
170 deleteOldDocuments = newDeleteOldDocuments;
171 }
172 private String fix(String link) throws Exception {
173 String auxString = link.substring(0, link.lastIndexOf("/", link.indexOf("..")) - 1);
174 auxString = auxString.substring(0, auxString.lastIndexOf("/"));
175 link = String.valueOf(auxString) + String.valueOf(link.substring(link.indexOf("..") + 2, link.length()));
176 return link;
177 }
178 public void index(URL URLFile) {
179 try {
180 boolean newindex = true;
181 if(hls.getIndexDirectory().getAbsolutePath().lastIndexOf("/") < hls.getIndexDirectory().length())
182 newindex = !(new File(String.valueOf(String.valueOf(hls.getIndexDirectory())).concat("/segments"))).exists();
183 else
184 newindex = !(new File(String.valueOf(String.valueOf(hls.getIndexDirectory())).concat("segments"))).exists();boolean noerror = true;
185 session = new HTTPClient(URLFile.getHost());
186 session.load(HTTPClient.getDocumentString(URLFile.toString()));
187 int type = -1;
188 Document doc = new Document();
189 Reader content = null;
190 writeResponse("<index status=\"active\"/>");
191 String url = session.getUrlString();
192 doc.add(Field.Keyword("path", url));
193 if(session.getContentType().indexOf("text/html")!=-1) {
194 type = HTML;
195 if(url.endsWith(".xml"))
196 type = XML;
197 } else if(session.getContentType().indexOf("text/plain")!=-1&&url.endsWith(".txt")) {
198 type = TEXT;
199 } else if(session.getContentType().indexOf("application/zip")!=-1) {
200 type = ZIP;
201 } else if(session.getContentType().indexOf("application/pdf")!=-1) {
202 type = PDF;
203 } else if(url.endsWith(".xml")) {
204 type = XML;
205 } else {
206 String[] ext = (String[]) hls.getFileTypes().keySet().toArray();
207 String[] values =(String[]) hls.getFileTypes().values().toArray();
208 for(int i=ext.length; --i>=0;) {
209 if(url.endsWith(ext[i])) {
210 if(values[i].toLowerCase().trim().equals("html")) {
211 type = HTML;
212 } else if (values[i].toLowerCase().trim().equals("xml")) {
213 type = XML;
214 } else if (values[i].toLowerCase().trim().equals("text")) {
215 type = TEXT;
216 }
217 }
218 }
219 }
220 switch(type) {
221 case HTML:
222 ArrayList tempLinks = new ArrayList();
223 try {
224 HTMLParser htmlparser = new HTMLParser(session.getContent());
225 tempLinks = htmlparser.getLinks();
226 content = htmlparser.getContentReader();
227 if(htmlparser.getTitle() != null)
228 doc.add(Field.Text("title", htmlparser.getTitle()));
229 else
230 doc.add(Field.Text("title", ""));
231 } catch(Exception e) {
232 hls.writeLog(HLHandler.ERROR_LOG,"HTML parser error: "+e.getMessage()+" ["+session.getFile()+"]");
233 noerror = false;
234 }
235 fixLinks(tempLinks, session);
236 doc.add(Field.Text("type", "html"));
237 break;
238 case XML:
239 try {
240 XMLParser xmlparser = new XMLParser(session.getContent());
241 ArrayList al = xmlparser.getTokens();
242 content = xmlparser.getContentReader();
243 } catch(Exception e) {
244 hls.writeLog(HLHandler.ERROR_LOG,"XML parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
245 noerror = false;
246 }
247 doc.add(Field.Text("type", "xml"));
248 break;
249 case TEXT:
250 content = new InputStreamReader(new ByteArrayInputStream(session.getContent().getBytes()));
251 doc.add(Field.Text("type", "text"));
252 break;
253 case ZIP:
254 ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(session.getContent().getBytes()));
255 Vector zipDocuments = new Vector();
256 ZipEntry entry;
257 while((entry = zis.getNextEntry()) != null) {
258 FileOutputStream fos = new FileOutputStream(String.valueOf(hls.getTemporaryDirectory()) + String.valueOf(entry.getName()));
259 byte b[] = new byte[512];
260 for(int len = 0; (len = zis.read(b)) != -1;)
261 fos.write(b, 0, len);
262 index(new File(hls.getTemporaryDirectory() + entry.getName()), session);
263 fos.close();
264 }
265 break;
266 case PDF:
267 try {
268 PDFHandler p = new PDFHandler();
269 p.parse(new ByteArrayInputStream(session.getContent().getBytes()));
270 content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
271 } catch(Exception e) {
272 hls.writeLog(HLHandler.ERROR_LOG,"PDF parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
273 noerror = false;
274 }
275 doc.add(Field.Text("type", "pdf"));
276 break;
277 }
278 if(content != null)
279 doc.add(Field.Text("contents", content));
280 if(type == -1)
281 hls.writeLog(HLHandler.ERROR_LOG,"Document type "+session.getContentType()+" uknown ["+session.getFile()+"]");
282 if(newindex) {
283 hls.writeLog(HLHandler.INDEX_LOG,"Building a new indexes");
284 }
285 if(noerror) {
286 if(this.deleteOldDocuments && !newindex) {
287 IndexReader reader = IndexReader.open(directory);
288 Term term = new Term("path",session.getUrlString());
289 hls.writeLog(HLHandler.INDEX_LOG,"Deleting "+reader.delete(term)+" documents");
290 reader.close();
291 }
292 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), newindex);
293 writer.mergeFactor = 20;
294 writer.addDocument(doc);
295 writer.optimize();
296 writer.close();
297 writeResponse("<index status=\"active\" target=\"success\">"+url+"</index>");
298 hls.writeLog(HLHandler.INDEX_LOG,"Document ["+session.getFile()+"] added");
299 if(this.recursive) {
300 for(int i=0; i<links.size(); i++) {
301 try {
302 session.load(HTTPClient.getDocumentString((String) links.get(i)));
303 index();
304 } catch(HTTPClientException e) {
305 hls.writeLog(HLHandler.ERROR_LOG,"304 name=\"index\" method=\"index\" url=\""+HTTPClient.getDocumentString((String) links.get(i))+"\" ["+e.getMessage()+"]");
306 } catch(Exception e) {
307 hls.writeLog(HLHandler.ERROR_LOG,"302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
308 }
309 }
310 }
311 } else {
312 writeResponse("<index status=\"active\" target=\"error\">"+url+"</index>");
313 }
314 writeResponse("<index status=\"inactive\"/>");
315 doc = null;
316 } catch(IOException e) {
317 hls.writeLog(HLHandler.ERROR_LOG,"307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
318 } catch(NullPointerException npe) {
319 hls.writeLog(HLHandler.ERROR_LOG,"301 name=\"index\" method=\"index\" [NullPointer]");
320 } catch(Exception e) {
321 hls.writeLog(HLHandler.ERROR_LOG,"302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
322 }
323 }
324 private void index(File fileName, HTTPClient session) throws IOException {
325 try {
326 int type = -1;
327 boolean noerror = true;
328 Document doc = new Document();
329 Reader content = null;
330 String url = session.getUrlString();
331 doc.add(Field.Keyword("path", url));
332 if(url.endsWith(".html")||url.endsWith(".htm")) {
333 type = HTML;
334 } else if(url.endsWith(".txt")) {
335 type = TEXT;
336 } else if(url.endsWith(".pdf")) {
337 type = PDF;
338 } else if(url.endsWith(".xml")) {
339 type = XML;
340 }
341 switch(type) {
342 case HTML:
343 try {
344 StringBuffer sb = new StringBuffer();
345 BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
346 String line;
347 while((line = br.readLine()) != null) { sb.append(line); }
348 HTMLParser htmlparser = new HTMLParser(sb.toString());
349 content = htmlparser.getContentReader();
350 if(htmlparser.getTitle() != null)
351 doc.add(Field.Text("title", htmlparser.getTitle()));
352 else
353 doc.add(Field.Text("title", ""));
354 } catch(Exception e) {
355 hls.writeLog(HLHandler.ERROR_LOG,"HTML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
356 noerror = false;
357 }
358 doc.add(Field.Keyword("type", "HTML"));
359 break;
360 case XML:
361 try {
362 XMLParser xmlparser = new XMLParser(session.getContent());
363 content = xmlparser.getContentReader();
364 } catch(Exception e) {
365 hls.writeLog(HLHandler.ERROR_LOG,"XML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
366 noerror = false;
367 }
368 doc.add(Field.Keyword("type", "XML"));
369 break;
370 case PDF:
371 try {
372 PDFHandler p = new PDFHandler();
373 p.parse(new FileInputStream(fileName));
374 content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
375 } catch(Exception e) {
376 hls.writeLog(HLHandler.ERROR_LOG,"PDF parse error: "+e.getMessage()+" ["+session.getFile()+"]");
377 noerror = false;
378 }
379 doc.add(Field.Keyword("type", "PDF"));
380 break;
381 case TEXT:
382 content = new InputStreamReader(new FileInputStream(fileName));
383 doc.add(Field.Keyword("type", "TEXT"));
384 break;
385 }
386 if(content != null)
387 doc.add(Field.Text("contents", content));
388 if(noerror) {
389 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), false);
390 writer.mergeFactor = 20;
391 writer.addDocument(doc);
392 writer.optimize();
393 writer.close();
394 doc = null;
395 }
396 if(!fileName.delete())
397 hls.writeLog(HLHandler.ERROR_LOG,"Can't remove unzipped file: "+fileName.getName());
398 } catch(IOException e) {
399 hls.writeLog(HLHandler.ERROR_LOG,"307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
400 }
401 }
402 public void setRecursive(boolean newRecursive) {
403 recursive = newRecursive;
404 }
405 private void fixLinks(ArrayList tempLinks, HTTPClient session) {
406 int docs = 0;
407 for(int i=tempLinks.size(); --i>=0 && tempLinks.get(i)!=null; ) {
408 String link = (String) tempLinks.get(i);
409 String directory = "";
410 try {
411 directory = session.getFile().substring(0, session.getFile().lastIndexOf("/"));
412 } catch(StringIndexOutOfBoundsException e) {}
413 if(link.indexOf("mailto:") != -1 || link.indexOf("#") != -1 || link.indexOf("javascript:") != -1 || link.indexOf("http://") != -1)
414 continue;
415 if(!link.startsWith("/")) {
416 if(link.startsWith("https://")&&link.indexOf("https://")>1) {
417 continue;
418 }
419 if(!link.startsWith("http://")) {
420 if(link.indexOf("http://")>1) {
421 link = link.substring(link.indexOf("http://")+7,link.length());
422 }
423 link = "http://"+session.getHost()+directory+"/"+link;
424 }
425 do {
426 if(link.indexOf("..") == -1)
427 break;
428 try {
429 link = fix(link);
430 continue;
431 } catch(Exception e) {
432 break;
433 }
434 } while(true);
435 } else {
436 link = "http://"+session.getHost()+link;
437 }
438 link = link.trim();
439 if(!links.contains(link) && HTTPClient.getServerString(link).equals(session.getHost())) {
440 docs++;
441 links.add(link);
442 }
443 }
444 if(docs > 0) { hls.writeLog(HLHandler.INDEX_LOG,"Found "+docs+" new documents"); }
445 }
446 public void writeResponse(String text) {
447 try {
448 connector.getOutputStream().write((text+"\r\n").getBytes());
449 connector.getOutputStream().flush();
450 } catch (IOException ioe) {
451 } catch (NullPointerException npe) {}
452 }
453 }