Source code: com/telefonicasoluciones/search/server/parser/pdf/PDFHandler.java
1 package com.telefonicasoluciones.search.server.parser.pdf;
2
3 /*--
4 Copyright (C) @year@ i2a and David Duddleston. All rights reserved.
5 */
6
7 /**
8 * <p><code>PDFHandler</code>
9 * Content handler for PDF documents.
10 * </p>
11 *
12 * @author <a href="mailto:david@i2a.com">David Duddleston</a>
13 * @version 1.0
14 */
15 import java.util.zip.InflaterInputStream;
16 import java.text.ParseException;
17 import java.text.SimpleDateFormat;
18 import java.io.*;
19 /**
20 * Heuristic parser that pulls document metadata and text content out of a PDF byte stream.
21 * Creation date: (2/21/2001 7:19:20 PM)
22 * @author:
23 */
24
25 import java.util.List;
26
27
/**
 * Heuristic content handler for PDF documents.
 *
 * <p>The handler does not build a PDF object model. It scans the input for a
 * small set of keywords that must appear at the start of a line
 * ({@code /Author}, {@code /CreationDate}, {@code /Keywords},
 * {@code /Subject}, {@code /Title}, {@code <<}, {@code stream}) and extracts:
 * <ul>
 *   <li>metadata values, read from the literal string that follows the keyword;</li>
 *   <li>text, read from the literal strings found inside streams whose
 *       dictionary passes the check in {@link #parseDataParams()}.</li>
 * </ul>
 *
 * <p>Call {@link #parse(InputStream)} and then read the results through the
 * getters. An instance can be reused; every call to {@code parse} clears the
 * previous results. Instances hold mutable parsing state and a
 * {@link SimpleDateFormat}, so an instance must not be shared between threads.
 */
public class PDFHandler {

    // Stream filters recognised in a stream dictionary.
    private static final int NONE = 0;
    private static final int FLATE = 1;
    private static final int LZW = 2;

    // Keywords recognised at the start of a line.
    private static final char[] AUTHOR = "/Author".toCharArray();
    private static final char[] CREATIONDATE = "/CreationDate".toCharArray();
    private static final char[] ENDSTREAM = "endstream".toCharArray();
    private static final char[] KEYWORDS = "/Keywords".toCharArray();
    private static final char[] STREAM = "stream".toCharArray();
    private static final char[] SUBJECT = "/Subject".toCharArray();
    private static final char[] TITLE = "/Title".toCharArray();
    private static final char[] PARAMSTART = {'<', '<'};

    /**
     * All keywords {@link #findToken()} looks for. No entry may be a prefix of
     * another entry: a keyword is reported as soon as its last character matches.
     */
    private static final char[][] tokens = {
        AUTHOR, CREATIONDATE, ENDSTREAM, KEYWORDS, STREAM, SUBJECT,
        TITLE, PARAMSTART
    };

    /** Knuth-Morris-Pratt failure table used to find {@code endstream} in stream data. */
    private static final int[] ENDSTREAM_FAILURE = buildFailureTable(ENDSTREAM);

    /** Read limit passed to {@code mark()} when a dictionary start is remembered. */
    private static final int MARK_LIMIT = 10000;

    /**
     * A stream is only scanned for text when the text between {@code <<} and
     * {@code >>} is shorter than this many characters.
     * NOTE(review): presumably meant to single out simple page content streams
     * from streams with richer dictionaries (images, fonts) - confirm.
     */
    private static final int PARAMS_MAX_LENGTH = 39;

    /** Marker for "no byte has been read ahead" in {@link #readLiteralString}. */
    private static final int UNREAD = -2;

    /** Maps every byte to the character with the same code, on every platform. */
    private static final String BYTE_CHARSET = "ISO-8859-1";

    private InputStream in;

    /** Last value returned by {@link #read()}; -1 once the end of input was seen. */
    private int lastByte = -1;

    // Parses the digits of a PDF date such as 19960710150856.
    SimpleDateFormat dateFormatter;

    // Content data
    private String author;
    private long published;
    private String keywords;
    private String description;
    private String title;
    private StringBuffer contents;

    // Flags
    /** True while the input is being re-read between a dictionary and its stream keyword. */
    private boolean streamHit = false;
    /** True when the dictionary just inspected asks for its stream to be scanned for text. */
    private boolean parseNextStream = false;
    /** True while {@code in} holds a mark placed right after a line-leading {@code <<}. */
    private boolean dictionaryMarked = false;
    /** Filter announced by the dictionary of the stream that is parsed next. */
    private int compression = NONE;

    /**
     * Creates a handler with empty results.
     */
    public PDFHandler() {
        contents = new StringBuffer();
        published = -1;

        // 19960710150856
        dateFormatter = new SimpleDateFormat("yyyyMMddHHmmss");
    }

    /**
     * Returns the value found after {@code /Author}, or null if none was found.
     */
    public String getAuthor() {
        return author;
    }

    /**
     * Categories are not extracted from PDF documents; always returns null.
     */
    public String getCategories() {
        return null;
    }

    /**
     * Returns the text collected from the parsed streams; never null.
     */
    public String getContents() {
        return contents.toString();
    }

    /**
     * Returns the value found after {@code /Subject}, or null if none was found.
     */
    public String getDescription() {
        return description;
    }

    /**
     * No HREF is extracted from PDF documents; always returns null.
     */
    public String getHREF() {
        return null;
    }

    /**
     * Returns the value found after {@code /Keywords}, or null if none was found.
     */
    public String getKeywords() {
        return keywords;
    }

    /**
     * Links are not extracted from PDF documents; always returns null.
     */
    public List getLinks() {
        return null;
    }

    /**
     * Returns the creation date in milliseconds since the epoch, or -1 when the
     * document has no parseable {@code /CreationDate}.
     */
    public long getPublished() {
        return published;
    }

    /**
     * Always returns false: there are no links to follow.
     */
    public boolean getRobotFollow() {
        return false;
    }

    /**
     * Always returns true: the document is to be indexed.
     */
    public boolean getRobotIndex() {
        return true;
    }

    /**
     * Returns the value found after {@code /Title}, or null if none was found.
     */
    public String getTitle() {
        return title;
    }

    /**
     * Parses a PDF document and stores the results for the getters.
     *
     * <p>Parsing is best effort. Any exception is reported with
     * {@code printStackTrace()} and not rethrown; whatever was extracted before
     * the failure stays available. The stream is wrapped in a
     * {@link BufferedInputStream} and is not closed.
     *
     * @param in the raw bytes of the PDF document
     */
    public void parse(InputStream in) {
        try {
            this.in = new BufferedInputStream(in);
            reset();
            parseContent();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Main loop: looks for a keyword at the start of every line until the input
     * ends or the current thread is interrupted. The interrupt flag is only
     * tested, never cleared, so the caller still sees it.
     */
    private void parseContent() throws IOException {
        Thread current = Thread.currentThread();
        while (!current.isInterrupted()) {
            char[] token = findToken();
            if (token == null) {
                if (isNewLineChar(lastByte)) {
                    // The scan stopped on a line terminator (blank line, the LF
                    // of a CRLF pair, ...). The next byte already starts a new
                    // line, so nothing must be skipped.
                    continue;
                }
            } else {
                handleToken(token);
            }
            if (!nextLine()) {
                break;
            }
        }
    }

    /**
     * Dispatches on the keyword returned by {@link #findToken()}.
     * {@code endstream} needs no action.
     */
    private void handleToken(char[] token) throws IOException {
        if (token == AUTHOR) {
            author = parseData();
        } else if (token == CREATIONDATE) {
            published = parseDate();
        } else if (token == KEYWORDS) {
            keywords = parseData();
        } else if (token == SUBJECT) {
            description = parseData();
        } else if (token == TITLE) {
            title = parseData();
        } else if (token == PARAMSTART) {
            // Remember where the dictionary body starts so that it can be
            // inspected once the matching "stream" keyword shows up.
            in.mark(MARK_LIMIT);
            dictionaryMarked = true;
        } else if (token == STREAM) {
            handleStream();
        }
    }

    /**
     * Handles a {@code stream} keyword. Every stream is met twice: the first
     * time the input is rewound to the remembered dictionary, which is then
     * inspected; scanning resumes from there, so the same keyword is met again
     * and the stream body is read if the dictionary asked for it.
     */
    private void handleStream() throws IOException {
        if (streamHit) {
            if (parseNextStream) {
                contents.append(parseDataStream());
                parseNextStream = false;
            }
            streamHit = false;
            return;
        }
        if (!dictionaryMarked) {
            // No dictionary start was seen for this stream; leave it alone.
            return;
        }
        // A mark is used once. Rewinding to it again for a later stream would
        // re-parse the same bytes over and over.
        dictionaryMarked = false;
        try {
            in.reset();
        } catch (IOException e) {
            // The mark is no longer valid; skip this stream.
            return;
        }
        streamHit = true;
        parseDataParams();
    }

    /**
     * Tries to match one of {@link #tokens} against the next bytes of the input.
     *
     * <p>Bytes are consumed while at least one keyword still matches. The method
     * returns right after the last character of a keyword, without reading
     * further. When nothing matches, the first mismatching byte has been
     * consumed and is available in {@link #lastByte}.
     *
     * @return the matching entry of {@link #tokens} (same array instance), or
     *         null if no keyword starts here or the input ended
     */
    private char[] findToken() throws IOException {
        boolean[] rejected = new boolean[tokens.length];
        int candidates = tokens.length;

        for (int index = 0; candidates > 0; index++) {
            int b = read();
            if (b == -1) {
                return null;
            }
            for (int i = 0; i < tokens.length; i++) {
                if (rejected[i]) {
                    continue;
                }
                char[] token = tokens[i];
                if (token[index] != b) {
                    rejected[i] = true;
                    candidates--;
                } else if (index == token.length - 1) {
                    return token;
                }
            }
        }
        return null;
    }

    /**
     * Tells whether the given byte value is a carriage return or a line feed.
     */
    private boolean isNewLineChar(int ch) {
        return ch == '\n' || ch == '\r';
    }

    /**
     * Consumes input up to and including the next line terminator.
     *
     * @return false if the input ended first
     */
    private boolean nextLine() throws IOException {
        while (true) {
            int b = read();
            if (b == -1) {
                return false;
            }
            if (isNewLineChar(b)) {
                return true;
            }
        }
    }

    /**
     * Reads the literal string that follows a metadata keyword: everything up
     * to the next {@code (} is skipped, then the string is read.
     *
     * @return the decoded string; empty if the input ends before a {@code (}
     */
    private String parseData() throws IOException {
        int b = read();
        while (b != -1 && b != '(') {
            b = read();
        }
        ByteArrayOutputStream value = new ByteArrayOutputStream();
        if (b == '(') {
            // Reads from "in" directly; lastByte is only consulted right after
            // findToken(), which always refreshes it.
            readLiteralString(in, value);
        }
        return decodeText(value.toByteArray());
    }

    /**
     * Reads the creation date that follows {@code /CreationDate}.
     *
     * <p>An optional {@code D:} prefix is removed and the leading
     * {@code yyyyMMddHHmmss} digits are parsed in the default time zone; any
     * text after them is ignored.
     *
     * @return milliseconds since the epoch, or -1 if the value cannot be parsed
     */
    private long parseDate() throws IOException {
        String date = parseData();
        if (date.startsWith("D:")) {
            date = date.substring(2);
        }
        try {
            return dateFormatter.parse(date).getTime();
        } catch (ParseException e) {
            // -1 is the "unknown" value also used before any parsing.
            return -1;
        }
    }

    /**
     * Reads the dictionary body from the current position up to the closing
     * {@code >>} and decides whether the stream that follows is scanned for text.
     *
     * <p>The stream is selected when the body is shorter than
     * {@link #PARAMS_MAX_LENGTH} characters and contains both {@code "0 R"} and
     * {@code "/Length "}. In that case the filter is taken from the body:
     * {@code /LZWDecode} wins over {@code /FlateDecode}; neither means no filter.
     * Nothing is selected if the input ends before {@code >>}.
     */
    private void parseDataParams() throws IOException {
        ByteArrayOutputStream body = new ByteArrayOutputStream();
        boolean closed = false;

        int b = read();
        while (b != -1) {
            if (b == '>') {
                int next = read();
                if (next == '>') {
                    closed = true;
                    break;
                }
                body.write(b);
                if (next == -1) {
                    break;
                }
                body.write(next);
            } else {
                body.write(b);
            }
            b = read();
        }
        if (!closed) {
            return;
        }

        String params = new String(body.toByteArray(), BYTE_CHARSET);
        if (params.length() < PARAMS_MAX_LENGTH
                && params.indexOf("0 R") != -1
                && params.indexOf("/Length ") != -1) {
            int filter = NONE;
            if (params.indexOf("/FlateDecode") != -1) {
                filter = FLATE;
            }
            if (params.indexOf("/LZWDecode") != -1) {
                filter = LZW;
            }
            compression = filter;
            parseNextStream = true;
        }
    }

    /**
     * Reads the stream body that follows a {@code stream} keyword and returns
     * the text of all literal strings in it, concatenated without separators.
     *
     * <p>Flate data is inflated first. LZW data cannot be decoded here, so an
     * LZW stream contributes no text. {@link #compression} is reset to
     * {@link #NONE} afterwards.
     */
    private String parseDataStream() throws IOException {
        byte[] raw = readStreamBody();
        int filter = compression;
        compression = NONE;

        if (filter == LZW) {
            // Scanning compressed bytes for "(...)" would only produce noise.
            return "";
        }
        byte[] data = (filter == FLATE) ? inflate(raw) : raw;
        return extractText(data);
    }

    /**
     * Reads raw stream data up to, and not including, the {@code endstream}
     * keyword. The end-of-line marker directly after {@code stream} (CR LF, LF
     * or CR) is not part of the data. If the input ends before
     * {@code endstream}, everything read so far is returned.
     */
    private byte[] readStreamBody() throws IOException {
        ByteArrayOutputStream body = new ByteArrayOutputStream();

        int b = read();
        if (b == '\r') {
            b = read();
            if (b == '\n') {
                b = read();
            }
        } else if (b == '\n') {
            b = read();
        }

        int matched = 0; // number of leading "endstream" characters currently matched
        while (b != -1) {
            body.write(b);
            while (matched > 0 && b != ENDSTREAM[matched]) {
                matched = ENDSTREAM_FAILURE[matched - 1];
            }
            if (b == ENDSTREAM[matched]) {
                matched++;
            }
            if (matched == ENDSTREAM.length) {
                byte[] all = body.toByteArray();
                byte[] data = new byte[all.length - ENDSTREAM.length];
                System.arraycopy(all, 0, data, 0, data.length);
                return data;
            }
            b = read();
        }
        return body.toByteArray();
    }

    /**
     * Inflates zlib-compressed data. If the data is corrupt or truncated, the
     * bytes that could be decoded before the error are returned.
     */
    private static byte[] inflate(byte[] compressed) {
        ByteArrayOutputStream plain = new ByteArrayOutputStream();
        InflaterInputStream source =
            new InflaterInputStream(new ByteArrayInputStream(compressed));
        try {
            byte[] chunk = new byte[512];
            int count;
            while ((count = source.read(chunk)) != -1) {
                plain.write(chunk, 0, count);
            }
        } catch (IOException e) {
            // Keep the part that was decoded successfully.
        } finally {
            try {
                source.close();
            } catch (IOException ignored) {
                // Closing an in-memory stream; nothing useful to do.
            }
        }
        return plain.toByteArray();
    }

    /**
     * Collects the contents of every literal string {@code (...)} found in the
     * given stream data.
     */
    private static String extractText(byte[] data) throws IOException {
        ByteArrayInputStream source = new ByteArrayInputStream(data);
        ByteArrayOutputStream text = new ByteArrayOutputStream();
        int b;
        while ((b = source.read()) != -1) {
            if (b == '(') {
                readLiteralString(source, text);
            }
        }
        return new String(text.toByteArray(), BYTE_CHARSET);
    }

    /**
     * Reads a PDF literal string whose opening {@code (} has already been
     * consumed, appending its bytes to {@code out}.
     *
     * <p>Balanced parentheses are part of the string. Backslash escapes are
     * resolved: {@code \n \r \t \b \f}, one to three octal digits, an escaped
     * end of line (dropped), and any other escaped character (kept as is).
     * Reading stops after the matching {@code )} or at the end of input; no
     * byte after the closing parenthesis is consumed.
     */
    private static void readLiteralString(InputStream src, ByteArrayOutputStream out)
            throws IOException {
        int depth = 1;
        int b = src.read();
        while (b != -1) {
            int carry = UNREAD; // byte read ahead that still has to be processed
            if (b == '\\') {
                int escaped = src.read();
                if (escaped == -1) {
                    return;
                }
                if (escaped >= '0' && escaped <= '7') {
                    int value = escaped - '0';
                    for (int digits = 1; digits < 3; digits++) {
                        carry = src.read();
                        if (carry < '0' || carry > '7') {
                            break;
                        }
                        value = value * 8 + (carry - '0');
                        carry = UNREAD;
                    }
                    out.write(value & 0xFF);
                } else if (escaped == '\r') {
                    // Escaped line break; a following LF belongs to it.
                    carry = src.read();
                    if (carry == '\n') {
                        carry = UNREAD;
                    }
                } else if (escaped != '\n') {
                    out.write(unescape(escaped));
                }
            } else if (b == '(') {
                depth++;
                out.write(b);
            } else if (b == ')') {
                depth--;
                if (depth == 0) {
                    return;
                }
                out.write(b);
            } else {
                out.write(b);
            }
            b = (carry == UNREAD) ? src.read() : carry;
        }
    }

    /**
     * Maps the character after a backslash to the byte it stands for.
     */
    private static int unescape(int escaped) {
        switch (escaped) {
            case 'n':
                return '\n';
            case 'r':
                return '\r';
            case 't':
                return '\t';
            case 'b':
                return '\b';
            case 'f':
                return '\f';
            default:
                return escaped;
        }
    }

    /**
     * Turns the bytes of a metadata string into text: UTF-16BE when the bytes
     * start with the byte order mark FE FF, otherwise one character per byte.
     */
    private static String decodeText(byte[] raw) throws IOException {
        if (raw.length >= 2 && (raw[0] & 0xFF) == 0xFE && (raw[1] & 0xFF) == 0xFF) {
            return new String(raw, 2, raw.length - 2, "UTF-16BE");
        }
        return new String(raw, BYTE_CHARSET);
    }

    /**
     * Builds the Knuth-Morris-Pratt failure table for the given pattern.
     */
    private static int[] buildFailureTable(char[] pattern) {
        int[] failure = new int[pattern.length];
        int k = 0;
        for (int i = 1; i < pattern.length; i++) {
            while (k > 0 && pattern[i] != pattern[k]) {
                k = failure[k - 1];
            }
            if (pattern[i] == pattern[k]) {
                k++;
            }
            failure[i] = k;
        }
        return failure;
    }

    /**
     * Reads one byte from the document and remembers it in {@link #lastByte}.
     *
     * @return the byte as 0-255, or -1 at the end of input
     */
    private int read() throws IOException {
        lastByte = in.read();
        return lastByte;
    }

    /**
     * Clears all results and parsing state before a new document is parsed.
     */
    private void reset() {
        // Content
        title = null;
        description = null;
        keywords = null;
        author = null;
        contents.setLength(0);
        published = -1;

        // Flags
        streamHit = false;
        parseNextStream = false;
        dictionaryMarked = false;
        compression = NONE;
        lastByte = -1;
    }
}