Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/telefonicasoluciones/search/server/parser/pdf/PDFHandler.java


1   package com.telefonicasoluciones.search.server.parser.pdf;
2   
3   /*-- 
4    Copyright (C) @year@ i2a and David Duddleston. All rights reserved.
5    */
6   
7   /**
8    * <p><code>PDFHandler</code>
9    *  Content handler for PDF documents.
10   * </p>
11   * 
12   * @author <a href="mailto:david@i2a.com">David Duddleston</a>
13   * @version 1.0
14   */
15  import java.util.zip.InflaterInputStream;
16  import java.text.ParseException;
17  import java.text.SimpleDateFormat;
18  import java.io.*;
19  /**
20   * Insert the type's description here.
21   * Creation date: (2/21/2001 7:19:20 PM)
22   * @author:
23   */
24  
25  import java.util.List;
26  
27  
/**
 * Heuristic, line-oriented content handler for PDF documents.
 *
 * <p>{@link #parse(InputStream)} scans the raw bytes of a PDF file line by
 * line. At the start of each line it looks for one of a fixed set of tokens:
 * the document-information keys <code>/Author</code>,
 * <code>/CreationDate</code>, <code>/Keywords</code>, <code>/Subject</code>
 * and <code>/Title</code>, the dictionary opener <code>&lt;&lt;</code>, and
 * the keywords <code>stream</code> / <code>endstream</code>. Metadata values
 * are taken from the next parenthesised literal string. When a
 * <code>stream</code> keyword is met, the handler rewinds to the most recent
 * <code>&lt;&lt;</code> that started a line, reads that dictionary, and, if
 * the dictionary is short and declares an indirect <code>/Length</code>,
 * extracts the text found between parentheses in the (optionally
 * Flate-compressed) stream data.</p>
 *
 * <p>Known limitations: tokens are only recognised at the start of a line,
 * a literal string ends at the first <code>)</code> (escapes and nested
 * parentheses are not interpreted), and LZW-compressed streams are skipped
 * because no decoder is available.</p>
 *
 * <p>All parse state lives in instance fields, so an instance must not be
 * used by more than one thread at a time. An instance may be reused for
 * several documents; every call to <code>parse</code> clears the previous
 * results.</p>
 */
public class PDFHandler {

  /** Read-ahead limit, in bytes, requested whenever a dictionary start is marked. */
  private static final int MARK_LIMIT = 10000;

  /** Value of {@link #markPosition} while no usable mark exists. */
  private static final long NO_MARK = -1;

  /**
   * Exclusive upper bound on the size of a stream dictionary body that is
   * still considered a text-content stream. The historical limit was 38, but
   * it was measured after the byte following <code>&lt;&lt;</code> had been
   * lost; that byte is now part of the body, hence 39.
   */
  private static final int MAX_PARAMS_LENGTH = 39;

  /**
   * Charset used to turn raw PDF bytes into characters. ISO-8859-1 maps every
   * byte to the character with the same code, which matches the byte-to-char
   * casts used for token matching and does not depend on the platform default.
   */
  private static final String BYTE_CHARSET = "ISO-8859-1";

  // Compression
  private static final int NONE = 0;
  private static final int FLATE = 1;
  private static final int LZW = 2;

  // Tokens
  private static final char[] AUTHOR = "/Author".toCharArray();
  private static final char[] CREATIONDATE = "/CreationDate".toCharArray();
  private static final char[] ENDSTREAM = "endstream".toCharArray();
  private static final char[] KEYWORDS = "/Keywords".toCharArray();
  private static final char[] STREAM = "stream".toCharArray();
  private static final char[] SUBJECT = "/Subject".toCharArray();
  private static final char[] TITLE = "/Title".toCharArray();
  private static final char[] PARAMSTART = {'<', '<'};

  /**
   * Every token looked for at the start of a line. No token is a prefix of
   * another one, which lets {@link #findToken()} report a token as soon as
   * its last character has matched.
   */
  private static final char[][] TOKENS = {
    AUTHOR, CREATIONDATE, ENDSTREAM, KEYWORDS, STREAM, SUBJECT,
    TITLE, PARAMSTART
  };

  /** Knuth-Morris-Pratt failure table used to locate the <code>endstream</code> keyword. */
  private static final int[] ENDSTREAM_FAILURE = failureTable(ENDSTREAM);

  /** Buffered view of the document currently being parsed. */
  private InputStream in;

  /** Number of bytes consumed from {@link #in}, relative to the start of the document. */
  private long currentPosition;

  /** Value of {@link #currentPosition} when the stream was last marked, or {@link #NO_MARK}. */
  private long markPosition = NO_MARK;

  /** Result of the most recent {@link #read()} call; -1 before the first read and at end of input. */
  private int lastByte = -1;

  // Pattern of a PDF date without its "D:" prefix, e.g. 19960710150856
  SimpleDateFormat dateFormatter;

  // Content data
  private String author;
  private long published;
  private String keywords;
  private String description;
  private String title;
  private StringBuffer contents;

  // Flags
  /** True between the first visit of a <code>stream</code> keyword and its second visit. */
  private boolean streamHit = false;
  /** True when the dictionary read for the pending stream qualified it for text extraction. */
  private boolean parseNextStream = false;
  /** Filter detected in the dictionary of the pending stream. */
  private int compression = NONE;

  /**
   * Creates a handler with empty results and a <code>yyyyMMddHHmmss</code>
   * date parser that uses the default time zone.
   */
  public PDFHandler() {
    contents = new StringBuffer();
    published = -1;
    dateFormatter = new SimpleDateFormat("yyyyMMddHHmmss");
  }

  /**
   * Tries to match one of {@link #TOKENS} at the current position.
   *
   * <p>Leading end-of-line bytes are skipped first, so blank lines and the
   * line feed of a CR LF pair never hide the token on the following line. The
   * method returns as soon as the last character of a token has matched and
   * therefore never consumes the byte that follows the token. When no token
   * matches, the byte that ruled out the last candidate has been consumed;
   * {@link #nextLine()} copes with that byte being an end-of-line.</p>
   *
   * @return the matching token (one of the constant arrays, comparable by
   *         identity), or <code>null</code> if no token starts here or the
   *         input is exhausted
   * @throws IOException if reading the input fails
   */
  private char[] findToken() throws IOException {
    int b = read();
    while (isNewLine(b)) {
      b = read();
    }

    boolean[] candidate = new boolean[TOKENS.length];
    for (int i = 0; i < candidate.length; i++) {
      candidate[i] = true;
    }
    int remaining = TOKENS.length;

    for (int position = 0; b != -1; position++) {
      char ch = (char) b;
      for (int i = 0; i < TOKENS.length; i++) {
        if (!candidate[i]) {
          continue;
        }
        // A live candidate has matched 'position' characters and is longer
        // than that, otherwise it would already have been returned.
        char[] token = TOKENS[i];
        if (token[position] != ch) {
          candidate[i] = false;
          remaining--;
        } else if (position == token.length - 1) {
          return token;
        }
      }
      if (remaining == 0) {
        return null;
      }
      b = read();
    }
    return null;
  }

  /**
   * Returns the value of the <code>/Author</code> entry.
   *
   * @return the author, or <code>null</code> if none was found
   */
  public String getAuthor() {
    return author;
  }

  /**
   * Returns the document categories. PDF documents provide none.
   *
   * @return always <code>null</code>
   */
  public String getCategories() {
    return null;
  }

  /**
   * Returns the text extracted from the content streams.
   *
   * @return the extracted text, possibly empty, never <code>null</code>
   */
  public String getContents() {
    return contents.toString();
  }

  /**
   * Returns the value of the <code>/Subject</code> entry.
   *
   * @return the description, or <code>null</code> if none was found
   */
  public String getDescription() {
    return description;
  }

  /**
   * Returns the META HREF. PDF documents provide none.
   *
   * @return always <code>null</code>
   */
  public String getHREF() {
    return null;
  }

  /**
   * Returns the value of the <code>/Keywords</code> entry.
   *
   * @return the keywords, or <code>null</code> if none were found
   */
  public String getKeywords() {
    return keywords;
  }

  /**
   * Returns the links of the document. Links are not extracted from PDF.
   *
   * @return always <code>null</code>
   */
  public List getLinks() {
    return null;
  }

  /**
   * Returns the creation date taken from the <code>/CreationDate</code> entry.
   *
   * @return the date in milliseconds since the epoch, or -1 if the entry is
   *         missing or could not be parsed
   */
  public long getPublished() {
    return published;
  }

  /**
   * Tells whether links are to be followed.
   *
   * @return always <code>false</code>
   */
  public boolean getRobotFollow() {
    return false;
  }

  /**
   * Tells whether the document is to be indexed.
   *
   * @return always <code>true</code>
   */
  public boolean getRobotIndex() {
    return true;
  }

  /**
   * Returns the value of the <code>/Title</code> entry.
   *
   * @return the title, or <code>null</code> if none was found
   */
  public String getTitle() {
    return title;
  }

  /**
   * Tells whether a value returned by {@link #read()} is a carriage return
   * or a line feed.
   *
   * @param b a byte value, or -1 for end of input
   * @return <code>true</code> for CR and LF, <code>false</code> otherwise
   */
  private static boolean isNewLine(int b) {
    return b == '\n' || b == '\r';
  }

  /**
   * Advances to the end of the current line.
   *
   * <p>If the most recently consumed byte already was an end-of-line byte
   * (which happens when {@link #findToken()} gave up on it) nothing more is
   * read, so the following line is not skipped.</p>
   *
   * @return <code>true</code> if an end-of-line was reached,
   *         <code>false</code> if the input ended first
   * @throws IOException if reading the input fails
   */
  private boolean nextLine() throws IOException {
    if (isNewLine(lastByte)) {
      return true;
    }
    while (true) {
      int b = read();
      if (b == -1) {
        return false;
      }
      if (isNewLine(b)) {
        return true;
      }
    }
  }

  /**
   * Parses a PDF document and makes the results available through the
   * getters. Results of an earlier call are discarded first.
   *
   * <p>Parsing is best effort: any exception is reported on the standard
   * error stream and whatever was extracted up to that point is kept. The
   * given stream is not closed.</p>
   *
   * @param in the raw bytes of the PDF document
   */
  public void parse(InputStream in) {
    try {
      // mark/reset support is needed to re-read stream dictionaries.
      this.in = new BufferedInputStream(in);
      reset();
      parseContent();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  /**
   * Main loop: inspects the beginning of every line and dispatches on the
   * token found there. The loop stops at end of input, or before the next
   * line once the current thread has been interrupted; the interrupt flag is
   * left set for the caller.
   *
   * @throws IOException if reading the input fails
   */
  private void parseContent() throws IOException {
    Thread worker = Thread.currentThread();
    do {
      if (worker.isInterrupted()) {
        return;
      }
      // Tokens are shared constant arrays, so identity comparison is intended.
      char[] token = findToken();
      if (token == AUTHOR) {
        author = parseData();
      } else if (token == CREATIONDATE) {
        published = parseDate();
      } else if (token == KEYWORDS) {
        keywords = parseData();
      } else if (token == SUBJECT) {
        description = parseData();
      } else if (token == TITLE) {
        title = parseData();
      } else if (token == PARAMSTART) {
        // Remember where the dictionary body starts in case it turns out
        // to describe a stream.
        in.mark(MARK_LIMIT);
        markPosition = currentPosition;
      } else if (token == STREAM) {
        handleStreamKeyword();
      }
    } while (nextLine());
  }

  /**
   * Handles a <code>stream</code> keyword found at the start of a line.
   *
   * <p>Each stream is visited twice. On the first visit the input is rewound
   * to the marked dictionary so that {@link #parseDataParams()} can inspect
   * it; scanning then resumes and reaches the same keyword again. On the
   * second visit the stream data is consumed up to <code>endstream</code>,
   * and its text is appended to the contents if the dictionary qualified.
   * If no usable mark exists on the first visit the stream data is skipped.
   * A mark is used at most once and is dropped once a stream has been
   * consumed, so the input is never rewound to the same place twice.</p>
   *
   * @throws IOException if reading the input fails
   */
  private void handleStreamKeyword() throws IOException {
    if (!streamHit) {
      if (rewindToMark()) {
        streamHit = true;
        parseDataParams();
      } else {
        readStreamData(false);
      }
      return;
    }

    streamHit = false;
    markPosition = NO_MARK;
    if (parseNextStream) {
      parseNextStream = false;
      contents.append(parseDataStream());
    } else {
      readStreamData(false);
    }
  }

  /**
   * Rewinds the input to the last marked dictionary start, provided a mark
   * exists and no more than {@link #MARK_LIMIT} bytes were read since it was
   * set. The mark is consumed in either case.
   *
   * @return <code>true</code> if the input was rewound
   * @throws IOException if the underlying stream fails to reset
   */
  private boolean rewindToMark() throws IOException {
    long mark = markPosition;
    markPosition = NO_MARK;
    if (mark == NO_MARK || currentPosition - mark > MARK_LIMIT) {
      return false;
    }
    in.reset();
    currentPosition = mark;
    return true;
  }

  /**
   * Reads the next literal string: skips everything up to the next
   * <code>(</code> and returns the bytes up to, but excluding, the following
   * <code>)</code>. Escape sequences are not interpreted.
   *
   * @return the string contents, empty if the input ends before a
   *         <code>(</code> is found
   * @throws IOException if reading the input fails
   */
  private String parseData() throws IOException {
    int b;
    do {
      b = read();
    } while (b != -1 && b != '(');

    ByteArrayOutputStream value = new ByteArrayOutputStream();
    while ((b = read()) != -1 && b != ')') {
      value.write(b);
    }
    return value.toString(BYTE_CHARSET);
  }

  /**
   * Reads a dictionary body up to its closing <code>&gt;&gt;</code> (or end
   * of input) and decides whether the stream that follows should have its
   * text extracted.
   *
   * <p>A stream qualifies when the body is shorter than
   * {@link #MAX_PARAMS_LENGTH} bytes and contains both <code>"0 R"</code>
   * and <code>"/Length "</code>. For a qualifying stream the filter
   * (<code>/FlateDecode</code> or <code>/LZWDecode</code>) is recorded in
   * {@link #compression} and {@link #parseNextStream} is set.</p>
   *
   * @return the dictionary body that was read
   * @throws IOException if reading the input fails
   */
  private String parseDataParams() throws IOException {
    ByteArrayOutputStream body = new ByteArrayOutputStream();
    while (true) {
      int b = read();
      if (b == -1) {
        break; // unterminated dictionary
      }
      if (b == '>') {
        int next = read();
        if (next == -1 || next == '>') {
          break;
        }
        body.write(b);
        body.write(next);
      } else {
        body.write(b);
      }
    }

    String params = body.toString(BYTE_CHARSET);
    if (params.length() < MAX_PARAMS_LENGTH
        && params.indexOf("0 R") != -1
        && params.indexOf("/Length ") != -1) {
      if (params.indexOf("/FlateDecode") != -1) {
        compression = FLATE;
      }
      if (params.indexOf("/LZWDecode") != -1) {
        compression = LZW;
      }
      parseNextStream = true;
    }
    return params;
  }

  /**
   * Consumes the data of the stream whose <code>stream</code> keyword was
   * just matched and returns the text found between parentheses in it.
   * Flate-compressed data is inflated first. LZW-compressed data yields an
   * empty string because it cannot be decoded here. The recorded filter is
   * reset to {@link #NONE} afterwards.
   *
   * @return the extracted text, possibly empty
   * @throws IOException if reading the input fails
   */
  private String parseDataStream() throws IOException {
    byte[] data = readStreamData(true);
    int filter = compression;
    compression = NONE;

    if (filter == LZW) {
      return "";
    }
    if (filter == FLATE) {
      data = inflate(data);
    }
    return extractText(data);
  }

  /**
   * Consumes stream data up to and including the <code>endstream</code>
   * keyword, or up to end of input if the keyword is missing.
   *
   * <p>The end-of-line that follows the <code>stream</code> keyword (CR LF,
   * LF or CR) is skipped and is not part of the data. Any end-of-line before
   * <code>endstream</code> is left in the data; it is harmless both for
   * inflating and for text extraction.</p>
   *
   * @param keep whether the data should be collected and returned
   * @return the stream data without the <code>endstream</code> keyword, or
   *         an empty array if <code>keep</code> is <code>false</code>
   * @throws IOException if reading the input fails
   */
  private byte[] readStreamData(boolean keep) throws IOException {
    ByteArrayOutputStream data = keep ? new ByteArrayOutputStream() : null;

    int b = read();
    if (b == '\r') {
      b = read();
    }
    if (b == '\n') {
      b = read();
    }

    int matched = 0;
    boolean terminated = false;
    while (b != -1) {
      if (data != null) {
        data.write(b);
      }
      // Knuth-Morris-Pratt step: fall back to the longest prefix of the
      // keyword that is still a suffix of what has been read.
      while (matched > 0 && b != ENDSTREAM[matched]) {
        matched = ENDSTREAM_FAILURE[matched - 1];
      }
      if (b == ENDSTREAM[matched]) {
        matched++;
      }
      if (matched == ENDSTREAM.length) {
        terminated = true;
        break;
      }
      b = read();
    }

    if (data == null) {
      return new byte[0];
    }
    byte[] all = data.toByteArray();
    int length = terminated ? all.length - ENDSTREAM.length : all.length;
    byte[] result = new byte[length];
    System.arraycopy(all, 0, result, 0, length);
    return result;
  }

  /**
   * Inflates zlib-compressed data. If the data is truncated or corrupt, the
   * part that could be decoded is returned instead of failing the whole
   * document.
   *
   * @param compressed the compressed bytes
   * @return the inflated bytes, possibly empty
   */
  private static byte[] inflate(byte[] compressed) {
    ByteArrayOutputStream inflated = new ByteArrayOutputStream();
    InflaterInputStream source =
        new InflaterInputStream(new ByteArrayInputStream(compressed));
    try {
      byte[] chunk = new byte[1024];
      int count;
      while ((count = source.read(chunk)) != -1) {
        inflated.write(chunk, 0, count);
      }
    } catch (IOException e) {
      // Deliberately best effort: keep what was decoded before the error.
    } finally {
      try {
        source.close(); // releases the inflater
      } catch (IOException ignored) {
        // nothing useful can be done about a failing close
      }
    }
    return inflated.toByteArray();
  }

  /**
   * Concatenates the bytes found between each <code>(</code> and the next
   * <code>)</code>. Nothing is inserted between fragments and escape
   * sequences are not interpreted.
   *
   * @param data decoded stream data
   * @return the concatenated fragments
   * @throws IOException if the byte charset is not available
   */
  private static String extractText(byte[] data) throws IOException {
    ByteArrayOutputStream text = new ByteArrayOutputStream();
    boolean inside = false;
    for (int i = 0; i < data.length; i++) {
      byte b = data[i];
      if (inside) {
        if (b == ')') {
          inside = false;
        } else {
          text.write(b);
        }
      } else if (b == '(') {
        inside = true;
      }
    }
    return text.toString(BYTE_CHARSET);
  }

  /**
   * Reads the next literal string and interprets it as a PDF date
   * (<code>D:yyyyMMddHHmmss</code>, the <code>D:</code> prefix being
   * optional). Anything after the seconds, such as a time-zone offset, is
   * ignored.
   *
   * @return the date in milliseconds since the epoch, or -1 if the value is
   *         not a date of that form
   * @throws IOException if reading the input fails
   */
  private long parseDate() throws IOException {
    String date = parseData().trim();
    if (date.startsWith("D:")) {
      date = date.substring(2);
    }
    try {
      return dateFormatter.parse(date).getTime();
    } catch (ParseException e) {
      // A malformed date is not an error for the caller: -1 means unknown.
      return -1;
    }
  }

  /**
   * Builds the Knuth-Morris-Pratt failure table of a pattern: entry
   * <code>i</code> is the length of the longest proper prefix of
   * <code>pattern[0..i]</code> that is also a suffix of it.
   *
   * @param pattern the pattern to analyse
   * @return the failure table, of the same length as the pattern
   */
  private static int[] failureTable(char[] pattern) {
    int[] failure = new int[pattern.length];
    int k = 0;
    for (int i = 1; i < pattern.length; i++) {
      while (k > 0 && pattern[i] != pattern[k]) {
        k = failure[k - 1];
      }
      if (pattern[i] == pattern[k]) {
        k++;
      }
      failure[i] = k;
    }
    return failure;
  }

  /**
   * Reads one byte from the document, keeping {@link #currentPosition} and
   * {@link #lastByte} up to date.
   *
   * @return the byte as a value from 0 to 255, or -1 at end of input
   * @throws IOException if reading the input fails
   */
  private int read() throws IOException {
    int b = in.read();
    if (b != -1) {
      currentPosition++;
    }
    lastByte = b;
    return b;
  }

  /**
   * Clears the results and the parser state before a new document is parsed.
   */
  private void reset() {
    // Content
    title = null;
    description = null;
    keywords = null;
    author = null;
    contents.setLength(0);
    published = -1;

    // Flags
    streamHit = false;
    parseNextStream = false;
    compression = NONE;

    // Input tracking
    currentPosition = 0;
    markPosition = NO_MARK;
    lastByte = -1;
  }
}