Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/telefonicasoluciones/search/server/HLIndexThread.java


1   package com.telefonicasoluciones.search.server;
2   
3   import com.telefonicasoluciones.search.server.parser.html.*;
4   import com.telefonicasoluciones.search.server.parser.xml.*;
5   import com.telefonicasoluciones.search.server.parser.pdf.*;
6   import com.telefonicasoluciones.search.server.util.*;
7   import java.io.*;
8   import java.net.*;
9   import java.util.*;
10  import java.util.zip.ZipEntry;
11  import java.util.zip.ZipInputStream;
12  import org.apache.lucene.document.*;
13  import org.apache.lucene.index.*;
14  import org.apache.lucene.store.Directory;
15  import org.apache.lucene.store.FSDirectory;
16  
17  public class HLIndexThread extends Thread {
18      private static final int TEXT = 0;
19      private static final int HTML = 1;
20      private static final int XML = 2;
21      private static final int ZIP = 3;
22      private static final int PDF = 4;
23      private HLHandler hls;
24      private Directory directory;
25      private ArrayList links;
26      boolean newindex = true;
27      private boolean recursive = false;
28      private boolean deleteOldDocuments = false;
29      
30      public HLIndexThread(HLHandler newHls) throws Exception {
31    super(newHls.TG, "server-index-"+(newHls.TG.activeCount()+1));
32    hls = newHls;
33    links = new ArrayList();
34          try {
35              directory = FSDirectory.getDirectory(hls.getIndexDirectory(), false);
36              if(hls.getIndexDirectory().lastIndexOf("/") < hls.getIndexDirectory().length())
37                  newindex = !(new File(String.valueOf(String.valueOf(hls.getIndexDirectory())).concat("/segments"))).exists();
38              else
39                  newindex = !(new File(String.valueOf(String.valueOf(hls.getIndexDirectory())).concat("segments"))).exists();
40    } catch(Exception e) {
41              throw new Exception(e.toString());
42    }
43      }
44      private void index(HTTPClient session, boolean deleteOldDocuments) throws HLIndexException {
45    try {
46              int type = -1;
47              boolean noerror = true;
48              
49              Document doc = new Document();
50              Reader content = null;
51              String url = session.getUrlString();
52              doc.add(Field.Keyword("path", url));
53              if(session.getContentType().indexOf("text/html")!=-1) {
54                  type = HTML;
55                  if(url.endsWith(".xml"))
56                      type = XML;
57              } else if(session.getContentType().indexOf("text/plain")!=-1) {
58                  type = TEXT;
59              } else if(session.getContentType().indexOf("application/zip")!=-1) {
60                  type = ZIP;
61              } else if(session.getContentType().indexOf("application/pdf")!=-1) {
62                  type = PDF;
63              } else if(url.endsWith(".xml")) {
64                  type = XML;
65              } else {
66                  String[] ext = (String[]) hls.getFileTypes().keySet().toArray();
67                  String[] values =(String[]) hls.getFileTypes().values().toArray();
68                  for(int i=ext.length; --i>=0;) {
69                      if(url.endsWith(ext[i])) {
70                          if(values[i].toLowerCase().trim().equals("html")) {
71                              type = HTML;
72                          } else if (values[i].toLowerCase().trim().equals("xml")) {
73                              type = XML;
74                          } else if (values[i].toLowerCase().trim().equals("text")) {
75                              type = TEXT;
76                          }
77                      }
78                  }
79              }
80              switch(type) {
81                  case HTML:
82                      ArrayList tempLinks = new ArrayList();
83                      try {
84                          HTMLParser htmlparser = new HTMLParser(session.getContent());
85                          tempLinks = htmlparser.getLinks();
86                          content = htmlparser.getContentReader();
87                          if(htmlparser.getTitle() != null)
88                              doc.add(Field.Text("title", htmlparser.getTitle()));
89                          else
90                              doc.add(Field.Text("title", ""));
91                          HashMap meta = htmlparser.getMeta();
92                          Object[] keys = meta.keySet().toArray();
93                          for(int i=meta.size(); --i>=0; ) {
94                              doc.add(Field.Keyword((String)keys[i],(String)meta.get(keys[i])));
95                          }
96                      } catch(Exception e) {
97                          hls.writeErrorLog("HTML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
98                          noerror = false;
99                      }
100                     fixLinks(tempLinks, session);
101                     doc.add(Field.Text("type", "html"));
102                     break;
103                 case XML:
104                     try {
105                         XMLParser xmlparser = new XMLParser(session.getContent());
106                         content = xmlparser.getContentReader();
107                     } catch(Exception e) {
108                         hls.writeErrorLog("XML parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
109                         noerror = false;
110                     }
111                     doc.add(Field.Text("type", "xml"));
112                     break;
113                 case TEXT:
114                     content = new InputStreamReader(new ByteArrayInputStream(session.getContent().getBytes()));
115                     doc.add(Field.Text("type", "text"));
116                     break;
117                 case ZIP:
118                     ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(session.getContent().getBytes()));
119                     Vector zipDocuments = new Vector();
120                     ZipEntry entry;
121                     while((entry = zis.getNextEntry()) != null) {
122                         FileOutputStream fos = new FileOutputStream(String.valueOf(hls.getTemporaryDirectory()) + String.valueOf(entry.getName()));
123                         byte b[] = new byte[512];
124                         for(int len = 0; (len = zis.read(b)) != -1;)
125                             fos.write(b, 0, len);
126                         index(new File(hls.getTemporaryDirectory() + entry.getName()), session);
127                         fos.close();
128                     }
129                     break;
130                 case PDF:
131                     try {
132                         PDFHandler p = new PDFHandler();
133                         p.parse(new ByteArrayInputStream(session.getContent().getBytes()));
134                         content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
135                     } catch(Exception e) {
136                         hls.writeErrorLog("PDF parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
137                         noerror = false;
138                     }
139                     doc.add(Field.Text("type", "pdf"));
140                     break;
141             }
142             if(content != null)
143                 doc.add(Field.Text("contents", content));
144             if(type == -1)
145                 hls.writeErrorLog("Document type "+session.getContentType()+" uknown ["+session.getFile()+"]");
146             if(noerror) {
147                 if(deleteOldDocuments) {
148                     IndexReader reader = IndexReader.open(directory);
149                     Term term = new Term("path",session.getUrlString());
150                     hls.writeIndexLog("Deleting "+reader.delete(term)+" documents");
151                     reader.close();
152                 }
153                 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), false);
154                 writer.mergeFactor = 20;
155                 writer.addDocument(doc);
156                 writer.optimize();
157                 writer.close();
158                 hls.writeSocketResponse("<index status=\"active\" target=\"success\">"+url+"</index>");
159                 hls.writeIndexLog("Document ["+session.getFile()+"] added");
160              } else {
161                 hls.writeSocketResponse("<index status=\"active\" target=\"error\">"+url+"</index>");
162             }
163             doc = null;
164   } catch(IOException e) {
165             hls.writeErrorLog("307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
166   } catch(NullPointerException npe) {
167             hls.writeErrorLog("301 name=\"index\" method=\"index\" [NullPointer]");
168   } catch(Exception e) {
169             hls.writeErrorLog("302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
170   }
171     }
172     public void deleteOldDocuments(boolean newDeleteOldDocuments) {
173   deleteOldDocuments = newDeleteOldDocuments;
174     }
175     private String fix(String link) throws Exception {
176   String auxString = link.substring(0, link.lastIndexOf("/", link.indexOf("..")) - 1);
177   auxString = auxString.substring(0, auxString.lastIndexOf("/"));
178   link = String.valueOf(auxString) + String.valueOf(link.substring(link.indexOf("..") + 2, link.length()));
179   return link;
180     }
181     private void index(HTTPClient session, boolean recursive, boolean deleteOldDocuments) {
182   try {
183             boolean noerror = true;
184             int type = -1;
185             Document doc = new Document();
186             Reader content = null;
187             hls.writeSocketResponse("<index status=\"active\"/>");
188             String url = session.getUrlString();
189             doc.add(Field.Keyword("path", url));
190             if(session.getContentType().indexOf("text/html")!=-1) {
191                 type = HTML;
192                 if(url.endsWith(".xml"))
193                     type = XML;
194             } else if(session.getContentType().indexOf("text/plain")!=-1&&url.endsWith(".txt")) {
195                 type = TEXT;
196             } else if(session.getContentType().indexOf("application/zip")!=-1) {
197                 type = ZIP;
198             } else if(session.getContentType().indexOf("application/pdf")!=-1) {
199                 type = PDF;
200             } else if(url.endsWith(".xml")) {
201                 type = XML;
202             } else {
203                 String[] ext = (String[]) hls.getFileTypes().keySet().toArray();
204                 String[] values =(String[]) hls.getFileTypes().values().toArray();
205                 for(int i=ext.length; --i>=0;) {
206                     if(url.endsWith(ext[i])) {
207                         if(values[i].toLowerCase().trim().equals("html")) {
208                             type = HTML;
209                         } else if (values[i].toLowerCase().trim().equals("xml")) {
210                             type = XML;
211                         } else if (values[i].toLowerCase().trim().equals("text")) {
212                             type = TEXT;
213                         }
214                     }
215                 }
216             }
217             switch(type) {
218                 case HTML:
219                     ArrayList tempLinks = new ArrayList();
220                     try {
221                         HTMLParser htmlparser = new HTMLParser(session.getContent());
222                         tempLinks = htmlparser.getLinks();
223                         content = htmlparser.getContentReader();
224                         if(htmlparser.getTitle() != null)
225                             doc.add(Field.Text("title", htmlparser.getTitle()));
226                         else
227                             doc.add(Field.Text("title", ""));
228                     } catch(Exception e) {
229                         hls.writeErrorLog("HTML parser error: "+e.getMessage()+" ["+session.getFile()+"]");
230                         noerror = false;
231                     }
232                     fixLinks(tempLinks, session);
233                     doc.add(Field.Text("type", "html"));
234                     break;
235                 case XML:
236                     try {
237                         XMLParser xmlparser = new XMLParser(session.getContent());
238                         ArrayList al = xmlparser.getTokens();
239                         System.out.println("Tokens:");
240                         for(int u=0; u<al.size(); u++) {
241                             System.out.println(al.get(u));
242                         }
243                         content = xmlparser.getContentReader();
244                     } catch(Exception e) {
245                         hls.writeErrorLog("XML parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
246                         noerror = false;
247                     }
248                     doc.add(Field.Text("type", "xml"));
249                     break;
250                 case TEXT:
251                     content = new InputStreamReader(new ByteArrayInputStream(session.getContent().getBytes()));
252                     doc.add(Field.Text("type", "text"));
253                     break;
254                 case ZIP:
255                     ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(session.getContent().getBytes()));
256                     Vector zipDocuments = new Vector();
257                     ZipEntry entry;
258                     while((entry = zis.getNextEntry()) != null) {
259                         FileOutputStream fos = new FileOutputStream(String.valueOf(hls.getTemporaryDirectory()) + String.valueOf(entry.getName()));
260                         byte b[] = new byte[512];
261                         for(int len = 0; (len = zis.read(b)) != -1;)
262                             fos.write(b, 0, len);
263                         index(new File(hls.getTemporaryDirectory() + entry.getName()), session);
264                         fos.close();
265                     }
266                     break;
267                 case PDF:
268                     try {
269                         PDFHandler p = new PDFHandler();
270                         p.parse(new ByteArrayInputStream(session.getContent().getBytes()));
271                         content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
272                     } catch(Exception e) {
273                         hls.writeErrorLog("PDF parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
274                         noerror = false;
275                     }
276                     doc.add(Field.Text("type", "pdf"));
277                     break;
278             }
279             if(content != null)
280                 doc.add(Field.Text("contents", content));
281             if(type == -1)
282                 hls.writeErrorLog("Document type "+session.getContentType()+" uknown ["+session.getFile()+"]");
283             if(newindex) {
284                hls.writeIndexLog("Building a new indexes");
285             }
286             if(noerror) {
287                 if(deleteOldDocuments && !newindex) {
288                     IndexReader reader = IndexReader.open(directory);
289                     Term term = new Term("path",session.getUrlString());
290                     hls.writeIndexLog("Deleting "+reader.delete(term)+" documents");
291                     reader.close();
292                 }
293                 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), newindex);
294                 writer.mergeFactor = 20;
295                 writer.addDocument(doc);
296                 writer.optimize();
297                 writer.close();
298                 hls.writeSocketResponse("<index status=\"active\" target=\"success\">"+url+"</index>");
299                 hls.writeIndexLog("Document ["+session.getFile()+"] added");
300                 if(recursive) {
301                     for(int i=0; i<links.size(); i++) {
302                         try {
303                             session.load(HTTPClient.getDocumentString((String) links.get(i)));
304                             index(session, deleteOldDocuments);
305                         } catch(HTTPClientException e) {
306                             hls.writeErrorLog("304 name=\"index\" method=\"index\" url=\""+HTTPClient.getDocumentString((String) links.get(i))+"\" ["+e.getMessage()+"]");
307                         } catch(Exception e) {
308                             hls.writeErrorLog("302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
309                         }
310                     }
311                 }
312             } else {
313                 hls.writeSocketResponse("<index status=\"active\" target=\"error\">"+url+"</index>");
314             }
315             hls.writeSocketResponse("<index status=\"inactive\"/>");
316             doc = null;
317             finalize();
318   } catch(IOException e) {
319             hls.writeErrorLog("307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
320   } catch(NullPointerException npe) {
321             hls.writeErrorLog("301 name=\"index\" method=\"index\" [NullPointer]");
322   } catch(Exception e) {
323             hls.writeErrorLog("302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
324   }
325     }
326     private void index(File fileName, HTTPClient session) throws IOException {
327   try {
328             int type = -1;
329             boolean noerror = true;
330             Document doc = new Document();
331             Reader content = null;
332             String url = session.getUrlString();
333             doc.add(Field.Keyword("path", url));
334             if(url.endsWith(".html")||url.endsWith(".htm")) {
335                 type = HTML;
336             } else if(url.endsWith(".txt")) {
337                 type = TEXT;
338             } else if(url.endsWith(".pdf")) {
339                 type = PDF;
340             } else if(url.endsWith(".xml")) {
341                 type = XML;
342             } 
343             switch(type) {
344                 case HTML:
345                     try {
346                         StringBuffer sb = new StringBuffer();
347                         BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
348                         String line;
349                         while((line = br.readLine()) != null) { sb.append(line); }
350                         HTMLParser htmlparser = new HTMLParser(sb.toString());
351                         content = htmlparser.getContentReader();
352                         if(htmlparser.getTitle() != null)
353                             doc.add(Field.Text("title", htmlparser.getTitle()));
354                         else
355                             doc.add(Field.Text("title", ""));
356                     } catch(Exception e) {
357                         hls.writeIndexLog("HTML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
358                         noerror = false;
359                     }
360                     doc.add(Field.Keyword("type", "HTML"));
361                     break;
362                 case XML:
363                     try  {
364                         XMLParser xmlparser = new XMLParser(session.getContent());
365                         content = xmlparser.getContentReader();
366                     } catch(Exception e) {
367                         hls.writeErrorLog("XML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
368                         noerror = false;
369                     }
370                     doc.add(Field.Keyword("type", "XML"));
371                     break;
372                 case PDF:
373                     try {
374                         PDFHandler p = new PDFHandler();
375                         p.parse(new FileInputStream(fileName));
376                         content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
377                     } catch(Exception e) {
378                         hls.writeErrorLog("PDF parse error: "+e.getMessage()+" ["+session.getFile()+"]");
379                         noerror = false;
380                     }
381                     doc.add(Field.Keyword("type", "PDF"));
382                     break;
383                 case TEXT:
384                     content = new InputStreamReader(new FileInputStream(fileName));
385                     doc.add(Field.Keyword("type", "TEXT"));
386                     break;
387             }
388             if(content != null)
389                 doc.add(Field.Text("contents", content));
390             if(noerror) {
391                 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), false);
392                 writer.mergeFactor = 20;
393                 writer.addDocument(doc);
394                 writer.optimize();
395                 writer.close();
396                 doc = null;
397             }
398             if(!fileName.delete())
399                 hls.writeIndexLog("Can't remove unzipped file: "+fileName.getName());
400   } catch(IOException e) {
401             hls.writeErrorLog("307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
402   }
403     }
404     public void run() {
405   try  {
406             index(hls.getSession(), recursive, deleteOldDocuments);
407   } catch(Exception e) {
408             hls.writeErrorLog("321 name=\"index\" method=\"run\" ["+e.getMessage()+"]");
409   }
410     }
411     public void setRecursive(boolean newRecursive) {
412   recursive = newRecursive;
413     }
414     private void fixLinks(ArrayList tempLinks, HTTPClient session) {
415   int docs = 0;
416         for(int i=tempLinks.size(); --i>=0 && tempLinks.get(i)!=null; ) {
417             String link = (String) tempLinks.get(i);
418             String directory = "";
419             try {
420                 directory = session.getFile().substring(0, session.getFile().lastIndexOf("/"));
421             } catch(StringIndexOutOfBoundsException e) {}
422             if(link.indexOf("mailto:") != -1 || link.indexOf("#") != -1 || link.indexOf("javascript:") != -1 || link.indexOf("http://") != -1)
423                 continue;
424             if(!link.startsWith("/")) {
425                 if(link.startsWith("https://")&&link.indexOf("https://")>1) {
426                     continue;
427                 }
428                 if(!link.startsWith("http://")) {
429                     if(link.indexOf("http://")>1) {
430                         link = link.substring(link.indexOf("http://")+7,link.length());
431                     }
432                 link = "http://"+session.getHost()+directory+"/"+link;
433                 }
434                 do {
435                     if(link.indexOf("..") == -1)
436                         break;
437                     try  {
438                         link = fix(link);
439                         continue;
440                     } catch(Exception e) {
441                         break;
442                     }
443                 } while(true);
444             } else {
445                 link = "http://"+session.getHost()+link;
446             }
447             link = link.trim();
448             if(!links.contains(link) && HTTPClient.getServerString(link).equals(session.getHost())) {
449                 docs++;
450                 links.add(link);
451             }
452         }
453         if(docs > 0) { hls.writeIndexLog("Found "+docs+" new documents"); }
454     }
455     public void finalize()  {
456   interrupt();
457     }
458 }