Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/telefonicasoluciones/search/server/HLIndex.java


1   package com.telefonicasoluciones.search.server;
2   
3   import com.telefonicasoluciones.search.server.parser.html.*;
4   import com.telefonicasoluciones.search.server.parser.xml.*;
5   import com.telefonicasoluciones.search.server.parser.pdf.*;
6   import com.telefonicasoluciones.search.server.util.*;
7   import java.io.*;
8   import java.net.*;
9   import java.util.*;
10  import java.util.zip.ZipEntry;
11  import java.util.zip.ZipInputStream;
12  import org.apache.lucene.document.*;
13  import org.apache.lucene.index.*;
14  import org.apache.lucene.store.Directory;
15  import org.apache.lucene.store.FSDirectory;
16  
17  public class HLIndex {
18      private static final int TEXT = 0;
19      private static final int HTML = 1;
20      private static final int XML = 2;
21      private static final int ZIP = 3;
22      private static final int PDF = 4;
23      private HLHandler hls;
24      private Directory directory;
25      private HTTPClient session;
26      private ArrayList links;
27      private Socket connector;
28      private boolean recursive = false;
29      private boolean deleteOldDocuments = false;
30      
31      public HLIndex(Socket s, HLHandler newHls) throws HLIndexException {
32    hls = newHls;
33    links = new ArrayList();
34          connector = s;
35          try {
36              directory = FSDirectory.getDirectory(hls.getIndexDirectory(), false);            
37    } catch(Exception e) {
38              throw new HLIndexException(e.getMessage());
39    }
40      }
41      private void index() throws HLIndexException {
42    try {
43              int type = -1;
44              boolean noerror = true;
45              
46              Document doc = new Document();
47              Reader content = null;
48              String url = session.getUrlString();
49              doc.add(Field.Keyword("path", url));
50              if(session.getContentType().indexOf("text/html")!=-1) {
51                  type = HTML;
52                  if(url.endsWith(".xml"))
53                      type = XML;
54              } else if(session.getContentType().indexOf("text/plain")!=-1) {
55                  type = TEXT;
56              } else if(session.getContentType().indexOf("application/zip")!=-1) {
57                  type = ZIP;
58              } else if(session.getContentType().indexOf("application/pdf")!=-1) {
59                  type = PDF;
60              } else if(url.endsWith(".xml")) {
61                  type = XML;
62              } else {
63                  String[] ext = (String[]) hls.getFileTypes().keySet().toArray();
64                  String[] values =(String[]) hls.getFileTypes().values().toArray();
65                  for(int i=ext.length; --i>=0;) {
66                      if(url.endsWith(ext[i])) {
67                          if(values[i].toLowerCase().trim().equals("html")) {
68                              type = HTML;
69                          } else if (values[i].toLowerCase().trim().equals("xml")) {
70                              type = XML;
71                          } else if (values[i].toLowerCase().trim().equals("text")) {
72                              type = TEXT;
73                          }
74                      }
75                  }
76              }
77              switch(type) {
78                  case HTML:
79                      ArrayList tempLinks = new ArrayList();
80                      try {
81                          HTMLParser htmlparser = new HTMLParser(session.getContent());
82                          tempLinks = htmlparser.getLinks();
83                          content = htmlparser.getContentReader();
84                          if(htmlparser.getTitle() != null)
85                              doc.add(Field.Text("title", htmlparser.getTitle()));
86                          else
87                              doc.add(Field.Text("title", ""));
88                          HashMap meta = htmlparser.getMeta();
89                          Object[] keys = meta.keySet().toArray();
90                          for(int i=meta.size(); --i>=0; ) {
91                              doc.add(Field.Keyword((String)keys[i],(String)meta.get(keys[i])));
92                          }
93                      } catch(Exception e) {
94                          hls.writeLog(HLHandler.ERROR_LOG,"HTML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
95                          noerror = false;
96                      }
97                      fixLinks(tempLinks, session);
98                      doc.add(Field.Text("type", "html"));
99                      break;
100                 case XML:
101                     try {
102                         XMLParser xmlparser = new XMLParser(session.getContent());
103                         content = xmlparser.getContentReader();
104                     } catch(Exception e) {
105                         hls.writeLog(HLHandler.ERROR_LOG,"XML parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
106                         noerror = false;
107                     }
108                     doc.add(Field.Text("type", "xml"));
109                     break;
110                 case TEXT:
111                     content = new InputStreamReader(new ByteArrayInputStream(session.getContent().getBytes()));
112                     doc.add(Field.Text("type", "text"));
113                     break;
114                 case ZIP:
115                     ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(session.getContent().getBytes()));
116                     Vector zipDocuments = new Vector();
117                     ZipEntry entry;
118                     while((entry = zis.getNextEntry()) != null) {
119                         FileOutputStream fos = new FileOutputStream(String.valueOf(hls.getTemporaryDirectory()) + String.valueOf(entry.getName()));
120                         byte b[] = new byte[512];
121                         for(int len = 0; (len = zis.read(b)) != -1;)
122                             fos.write(b, 0, len);
123                         index(new File(hls.getTemporaryDirectory() + entry.getName()), session);
124                         fos.close();
125                     }
126                     break;
127                 case PDF:
128                     try {
129                         PDFHandler p = new PDFHandler();
130                         p.parse(new ByteArrayInputStream(session.getContent().getBytes()));
131                         content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
132                     } catch(Exception e) {
133                         hls.writeLog(HLHandler.ERROR_LOG,"PDF parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
134                         noerror = false;
135                     }
136                     doc.add(Field.Text("type", "pdf"));
137                     break;
138             }
139             if(content != null)
140                 doc.add(Field.Text("contents", content));
141             if(type == -1)
142                 hls.writeLog(HLHandler.ERROR_LOG,"Document type "+session.getContentType()+" uknown ["+session.getFile()+"]");
143             if(noerror) {
144                 if(this.deleteOldDocuments) {
145                     IndexReader reader = IndexReader.open(directory);
146                     Term term = new Term("path",session.getUrlString());
147                     hls.writeLog(HLHandler.INDEX_LOG,"Deleting "+reader.delete(term)+" documents");
148                     reader.close();
149                 }
150                 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), false);
151                 writer.mergeFactor = 20;
152                 writer.addDocument(doc);
153                 writer.optimize();
154                 writer.close();
155                 writeResponse("<index status=\"active\" target=\"success\">"+url+"</index>");
156                 hls.writeLog(HLHandler.INDEX_LOG,"Document ["+session.getFile()+"] added");
157              } else {
158                 writeResponse("<index status=\"active\" target=\"error\">"+url+"</index>");
159             }
160             doc = null;
161   } catch(IOException e) {
162             hls.writeLog(HLHandler.ERROR_LOG,"307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
163   } catch(NullPointerException npe) {
164             hls.writeLog(HLHandler.ERROR_LOG,"301 name=\"index\" method=\"index\" [NullPointer]");
165   } catch(Exception e) {
166             hls.writeLog(HLHandler.ERROR_LOG,"302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
167   }
168     }
169     public void deleteOldDocuments(boolean newDeleteOldDocuments) {
170   deleteOldDocuments = newDeleteOldDocuments;
171     }
172     private String fix(String link) throws Exception {
173   String auxString = link.substring(0, link.lastIndexOf("/", link.indexOf("..")) - 1);
174   auxString = auxString.substring(0, auxString.lastIndexOf("/"));
175   link = String.valueOf(auxString) + String.valueOf(link.substring(link.indexOf("..") + 2, link.length()));
176   return link;
177     }
178     public void index(URL URLFile) {
179   try {
180             boolean newindex = true;
181             if(hls.getIndexDirectory().getAbsolutePath().lastIndexOf("/") < hls.getIndexDirectory().length())
182                 newindex = !(new File(String.valueOf(String.valueOf(hls.getIndexDirectory())).concat("/segments"))).exists();
183             else
184                 newindex = !(new File(String.valueOf(String.valueOf(hls.getIndexDirectory())).concat("segments"))).exists();boolean noerror = true;
185             session = new HTTPClient(URLFile.getHost());
186             session.load(HTTPClient.getDocumentString(URLFile.toString()));
187             int type = -1;
188             Document doc = new Document();
189             Reader content = null;
190             writeResponse("<index status=\"active\"/>");
191             String url = session.getUrlString();
192             doc.add(Field.Keyword("path", url));
193             if(session.getContentType().indexOf("text/html")!=-1) {
194                 type = HTML;
195                 if(url.endsWith(".xml"))
196                     type = XML;
197             } else if(session.getContentType().indexOf("text/plain")!=-1&&url.endsWith(".txt")) {
198                 type = TEXT;
199             } else if(session.getContentType().indexOf("application/zip")!=-1) {
200                 type = ZIP;
201             } else if(session.getContentType().indexOf("application/pdf")!=-1) {
202                 type = PDF;
203             } else if(url.endsWith(".xml")) {
204                 type = XML;
205             } else {
206                 String[] ext = (String[]) hls.getFileTypes().keySet().toArray();
207                 String[] values =(String[]) hls.getFileTypes().values().toArray();
208                 for(int i=ext.length; --i>=0;) {
209                     if(url.endsWith(ext[i])) {
210                         if(values[i].toLowerCase().trim().equals("html")) {
211                             type = HTML;
212                         } else if (values[i].toLowerCase().trim().equals("xml")) {
213                             type = XML;
214                         } else if (values[i].toLowerCase().trim().equals("text")) {
215                             type = TEXT;
216                         }
217                     }
218                 }
219             }
220             switch(type) {
221                 case HTML:
222                     ArrayList tempLinks = new ArrayList();
223                     try {
224                         HTMLParser htmlparser = new HTMLParser(session.getContent());
225                         tempLinks = htmlparser.getLinks();
226                         content = htmlparser.getContentReader();
227                         if(htmlparser.getTitle() != null)
228                             doc.add(Field.Text("title", htmlparser.getTitle()));
229                         else
230                             doc.add(Field.Text("title", ""));
231                     } catch(Exception e) {
232                         hls.writeLog(HLHandler.ERROR_LOG,"HTML parser error: "+e.getMessage()+" ["+session.getFile()+"]");
233                         noerror = false;
234                     }
235                     fixLinks(tempLinks, session);
236                     doc.add(Field.Text("type", "html"));
237                     break;
238                 case XML:
239                     try {
240                         XMLParser xmlparser = new XMLParser(session.getContent());
241                         ArrayList al = xmlparser.getTokens();
242                         content = xmlparser.getContentReader();
243                     } catch(Exception e) {
244                         hls.writeLog(HLHandler.ERROR_LOG,"XML parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
245                         noerror = false;
246                     }
247                     doc.add(Field.Text("type", "xml"));
248                     break;
249                 case TEXT:
250                     content = new InputStreamReader(new ByteArrayInputStream(session.getContent().getBytes()));
251                     doc.add(Field.Text("type", "text"));
252                     break;
253                 case ZIP:
254                     ZipInputStream zis = new ZipInputStream(new ByteArrayInputStream(session.getContent().getBytes()));
255                     Vector zipDocuments = new Vector();
256                     ZipEntry entry;
257                     while((entry = zis.getNextEntry()) != null) {
258                         FileOutputStream fos = new FileOutputStream(String.valueOf(hls.getTemporaryDirectory()) + String.valueOf(entry.getName()));
259                         byte b[] = new byte[512];
260                         for(int len = 0; (len = zis.read(b)) != -1;)
261                             fos.write(b, 0, len);
262                         index(new File(hls.getTemporaryDirectory() + entry.getName()), session);
263                         fos.close();
264                     }
265                     break;
266                 case PDF:
267                     try {
268                         PDFHandler p = new PDFHandler();
269                         p.parse(new ByteArrayInputStream(session.getContent().getBytes()));
270                         content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
271                     } catch(Exception e) {
272                         hls.writeLog(HLHandler.ERROR_LOG,"PDF parse error: "+e.getMessage()+" ["+session.getUrlString()+"]");
273                         noerror = false;
274                     }
275                     doc.add(Field.Text("type", "pdf"));
276                     break;
277             }
278             if(content != null)
279                 doc.add(Field.Text("contents", content));
280             if(type == -1)
281                 hls.writeLog(HLHandler.ERROR_LOG,"Document type "+session.getContentType()+" uknown ["+session.getFile()+"]");
282             if(newindex) {
283                hls.writeLog(HLHandler.INDEX_LOG,"Building a new indexes");
284             }
285             if(noerror) {
286                 if(this.deleteOldDocuments && !newindex) {
287                     IndexReader reader = IndexReader.open(directory);
288                     Term term = new Term("path",session.getUrlString());
289                     hls.writeLog(HLHandler.INDEX_LOG,"Deleting "+reader.delete(term)+" documents");
290                     reader.close();
291                 }
292                 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), newindex);
293                 writer.mergeFactor = 20;
294                 writer.addDocument(doc);
295                 writer.optimize();
296                 writer.close();
297                 writeResponse("<index status=\"active\" target=\"success\">"+url+"</index>");
298                 hls.writeLog(HLHandler.INDEX_LOG,"Document ["+session.getFile()+"] added");
299                 if(this.recursive) {
300                     for(int i=0; i<links.size(); i++) {
301                         try {
302                             session.load(HTTPClient.getDocumentString((String) links.get(i)));
303                             index();
304                         } catch(HTTPClientException e) {
305                             hls.writeLog(HLHandler.ERROR_LOG,"304 name=\"index\" method=\"index\" url=\""+HTTPClient.getDocumentString((String) links.get(i))+"\" ["+e.getMessage()+"]");
306                         } catch(Exception e) {
307                             hls.writeLog(HLHandler.ERROR_LOG,"302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
308                         }
309                     }
310                 }
311             } else {
312                 writeResponse("<index status=\"active\" target=\"error\">"+url+"</index>");
313             }
314             writeResponse("<index status=\"inactive\"/>");
315             doc = null;
316   } catch(IOException e) {
317             hls.writeLog(HLHandler.ERROR_LOG,"307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
318   } catch(NullPointerException npe) {
319             hls.writeLog(HLHandler.ERROR_LOG,"301 name=\"index\" method=\"index\" [NullPointer]");
320   } catch(Exception e) {
321             hls.writeLog(HLHandler.ERROR_LOG,"302 name=\"index\" method=\"index\" type=\""+e.getClass()+"\" ["+e.getMessage()+"]");
322   }
323     }
324     private void index(File fileName, HTTPClient session) throws IOException {
325   try {
326             int type = -1;
327             boolean noerror = true;
328             Document doc = new Document();
329             Reader content = null;
330             String url = session.getUrlString();
331             doc.add(Field.Keyword("path", url));
332             if(url.endsWith(".html")||url.endsWith(".htm")) {
333                 type = HTML;
334             } else if(url.endsWith(".txt")) {
335                 type = TEXT;
336             } else if(url.endsWith(".pdf")) {
337                 type = PDF;
338             } else if(url.endsWith(".xml")) {
339                 type = XML;
340             } 
341             switch(type) {
342                 case HTML:
343                     try {
344                         StringBuffer sb = new StringBuffer();
345                         BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(fileName)));
346                         String line;
347                         while((line = br.readLine()) != null) { sb.append(line); }
348                         HTMLParser htmlparser = new HTMLParser(sb.toString());
349                         content = htmlparser.getContentReader();
350                         if(htmlparser.getTitle() != null)
351                             doc.add(Field.Text("title", htmlparser.getTitle()));
352                         else
353                             doc.add(Field.Text("title", ""));
354                     } catch(Exception e) {
355                         hls.writeLog(HLHandler.ERROR_LOG,"HTML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
356                         noerror = false;
357                     }
358                     doc.add(Field.Keyword("type", "HTML"));
359                     break;
360                 case XML:
361                     try  {
362                         XMLParser xmlparser = new XMLParser(session.getContent());
363                         content = xmlparser.getContentReader();
364                     } catch(Exception e) {
365                         hls.writeLog(HLHandler.ERROR_LOG,"XML parse error: "+e.getMessage()+" ["+session.getFile()+"]");
366                         noerror = false;
367                     }
368                     doc.add(Field.Keyword("type", "XML"));
369                     break;
370                 case PDF:
371                     try {
372                         PDFHandler p = new PDFHandler();
373                         p.parse(new FileInputStream(fileName));
374                         content = new InputStreamReader(new ByteArrayInputStream(p.getContents().getBytes()));
375                     } catch(Exception e) {
376                         hls.writeLog(HLHandler.ERROR_LOG,"PDF parse error: "+e.getMessage()+" ["+session.getFile()+"]");
377                         noerror = false;
378                     }
379                     doc.add(Field.Keyword("type", "PDF"));
380                     break;
381                 case TEXT:
382                     content = new InputStreamReader(new FileInputStream(fileName));
383                     doc.add(Field.Keyword("type", "TEXT"));
384                     break;
385             }
386             if(content != null)
387                 doc.add(Field.Text("contents", content));
388             if(noerror) {
389                 IndexWriter writer = new IndexWriter(directory, new HLAnalyzer(), false);
390                 writer.mergeFactor = 20;
391                 writer.addDocument(doc);
392                 writer.optimize();
393                 writer.close();
394                 doc = null;
395             }
396             if(!fileName.delete())
397                 hls.writeLog(HLHandler.ERROR_LOG,"Can't remove unzipped file: "+fileName.getName());
398   } catch(IOException e) {
399             hls.writeLog(HLHandler.ERROR_LOG,"307 name=\"index\" method=\"index\" ["+e.getMessage()+"]");
400   }
401     }
402     public void setRecursive(boolean newRecursive) {
403   recursive = newRecursive;
404     }
405     private void fixLinks(ArrayList tempLinks, HTTPClient session) {
406   int docs = 0;
407         for(int i=tempLinks.size(); --i>=0 && tempLinks.get(i)!=null; ) {
408             String link = (String) tempLinks.get(i);
409             String directory = "";
410             try {
411                 directory = session.getFile().substring(0, session.getFile().lastIndexOf("/"));
412             } catch(StringIndexOutOfBoundsException e) {}
413             if(link.indexOf("mailto:") != -1 || link.indexOf("#") != -1 || link.indexOf("javascript:") != -1 || link.indexOf("http://") != -1)
414                 continue;
415             if(!link.startsWith("/")) {
416                 if(link.startsWith("https://")&&link.indexOf("https://")>1) {
417                     continue;
418                 }
419                 if(!link.startsWith("http://")) {
420                     if(link.indexOf("http://")>1) {
421                         link = link.substring(link.indexOf("http://")+7,link.length());
422                     }
423                 link = "http://"+session.getHost()+directory+"/"+link;
424                 }
425                 do {
426                     if(link.indexOf("..") == -1)
427                         break;
428                     try  {
429                         link = fix(link);
430                         continue;
431                     } catch(Exception e) {
432                         break;
433                     }
434                 } while(true);
435             } else {
436                 link = "http://"+session.getHost()+link;
437             }
438             link = link.trim();
439             if(!links.contains(link) && HTTPClient.getServerString(link).equals(session.getHost())) {
440                 docs++;
441                 links.add(link);
442             }
443         }
444         if(docs > 0) { hls.writeLog(HLHandler.INDEX_LOG,"Found "+docs+" new documents"); }
445     }
446     public void writeResponse(String text)  {
447   try { 
448             connector.getOutputStream().write((text+"\r\n").getBytes());
449             connector.getOutputStream().flush();
450   } catch (IOException ioe) {
451   } catch (NullPointerException npe) {}
452     }
453 }