1 /*
2 * $Id: PRTokeniser.java 3117 2008-01-31 05:53:22Z xlv $
3 *
4 * Copyright 2001, 2002 by Paulo Soares.
5 *
6 * The contents of this file are subject to the Mozilla Public License Version 1.1
7 * (the "License"); you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the License.
13 *
14 * The Original Code is 'iText, a free JAVA-PDF library'.
15 *
16 * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
17 * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
18 * All Rights Reserved.
19 * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
20 * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
21 *
22 * Contributor(s): all the names of the contributors are added in the source code
23 * where applicable.
24 *
25 * Alternatively, the contents of this file may be used under the terms of the
26 * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
27 * provisions of LGPL are applicable instead of those above. If you wish to
28 * allow use of your version of this file only under the terms of the LGPL
29 * License and not to allow others to use your version of this file under
30 * the MPL, indicate your decision by deleting the provisions above and
31 * replace them with the notice and other provisions required by the LGPL.
32 * If you do not delete the provisions above, a recipient may use your version
33 * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
34 *
35 * This library is free software; you can redistribute it and/or modify it
36 * under the terms of the MPL as stated above or under the terms of the GNU
37 * Library General Public License as published by the Free Software Foundation;
38 * either version 2 of the License, or any later version.
39 *
40 * This library is distributed in the hope that it will be useful, but WITHOUT
41 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
42 * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
43 * details.
44 *
45 * If you didn't download this code from the following link, you should check if
46 * you aren't using an obsolete version:
47 * http://www.lowagie.com/iText/
48 */
49
50 package com.lowagie.text.pdf;
51
52 import java.io.IOException;
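/**
 * Splits PDF syntax read from a {@link RandomAccessFileOrArray} into tokens
 * (numbers, strings, names, comments, array/dictionary markers, indirect
 * references and other keywords).
 *
 * @author Paulo Soares (psoares@consiste.pt)
 */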
57 public class PRTokeniser {
58
// Token type codes stored in the 'type' field and returned by getTokenType().

/** Token that starts with a digit, '+', '-' or '.'. */
public static final int TK_NUMBER = 1;
/** Literal string "(...)" or hexadecimal string "<...>". */
public static final int TK_STRING = 2;
/** Name introduced by '/'. */
public static final int TK_NAME = 3;
/** Comment introduced by '%'. */
public static final int TK_COMMENT = 4;
/** The '[' character. */
public static final int TK_START_ARRAY = 5;
/** The ']' character. */
public static final int TK_END_ARRAY = 6;
/** The "<<" sequence. */
public static final int TK_START_DIC = 7;
/** The ">>" sequence. */
public static final int TK_END_DIC = 8;
/** Two numbers followed by "R"; produced only by nextValidToken(). */
public static final int TK_REF = 9;
/** Any token that matches none of the other categories. */
public static final int TK_OTHER = 10;
/**
 * Lookup table telling whether a value returned by a byte read ends a token.
 * The table is indexed by <CODE>value + 1</CODE>, so index 0 corresponds to
 * the end-of-file value -1 and the 257 entries cover -1 through 255.
 * Entries are <CODE>true</CODE> for -1, for the whitespace codes
 * 0, 9, 10, 12, 13 and 32, and for the characters
 * <CODE>% ( ) / &lt; &gt; [ ]</CODE>.
 */
public static final boolean delims[] = {
    true, true, false, false, false, false, false, false, false, false,
    true, true, false, true, true, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, true, false, false, false, false, true, false,
    false, true, true, false, false, false, false, false, true, false,
    false, false, false, false, false, false, false, false, false, false,
    false, true, false, true, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, true, false, true, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false, false, false, false,
    false, false, false, false, false, false, false};
96
/** Value assigned to stringValue at the start of every nextToken() call. */
static final String EMPTY = "";


/** Source of the bytes being tokenised. */
protected RandomAccessFileOrArray file;
/** Type of the most recently read token (one of the TK_* constants). */
protected int type;
/** Text of the most recently read token; EMPTY for tokens that carry no text. */
protected String stringValue;
/** First number of the last TK_REF token, set by nextValidToken(). */
protected int reference;
/** Second number of the last TK_REF token, set by nextValidToken(). */
protected int generation;
/** Set when a string token is read: true for "<...>", false for "(...)". */
protected boolean hexString;
106
/**
 * Creates a tokeniser that reads from a <CODE>RandomAccessFileOrArray</CODE>
 * built from the given name.
 *
 * @param filename the name passed to the <CODE>RandomAccessFileOrArray</CODE> constructor
 * @throws IOException if the source cannot be created
 */
public PRTokeniser(String filename) throws IOException {
    this.file = new RandomAccessFileOrArray(filename);
}
110
/**
 * Creates a tokeniser that reads from a <CODE>RandomAccessFileOrArray</CODE>
 * built over the given bytes.
 *
 * @param pdfIn the bytes to tokenise
 */
public PRTokeniser(byte pdfIn[]) {
    this.file = new RandomAccessFileOrArray(pdfIn);
}
114
/**
 * Creates a tokeniser that reads directly from the supplied source.
 * The same instance is kept; it is not copied.
 *
 * @param file the source to read from
 */
public PRTokeniser(RandomAccessFileOrArray file) {
    super();
    this.file = file;
}
118
119 public void seek(int pos) throws IOException {
120 file.seek(pos);
121 }
122
123 public int getFilePointer() throws IOException {
124 return file.getFilePointer();
125 }
126
127 public void close() throws IOException {
128 file.close();
129 }
130
131 public int length() throws IOException {
132 return file.length();
133 }
134
135 public int read() throws IOException {
136 return file.read();
137 }
138
139 public RandomAccessFileOrArray getSafeFile() {
140 return new RandomAccessFileOrArray(file);
141 }
142
143 public RandomAccessFileOrArray getFile() {
144 return file;
145 }
146
147 public String readString(int size) throws IOException {
148 StringBuffer buf = new StringBuffer();
149 int ch;
150 while ((size--) > 0) {
151 ch = file.read();
152 if (ch == -1)
153 break;
154 buf.append((char)ch);
155 }
156 return buf.toString();
157 }
158
159 public static final boolean isWhitespace(int ch) {
160 return (ch == 0 || ch == 9 || ch == 10 || ch == 12 || ch == 13 || ch == 32);
161 }
162
163 public static final boolean isDelimiter(int ch) {
164 return (ch == '(' || ch == ')' || ch == '<' || ch == '>' || ch == '[' || ch == ']' || ch == '/' || ch == '%');
165 }
166
167 public static final boolean isDelimiterWhitespace(int ch) {
168 return delims[ch + 1];
169 }
170
171 public int getTokenType() {
172 return type;
173 }
174
175 public String getStringValue() {
176 return stringValue;
177 }
178
179 public int getReference() {
180 return reference;
181 }
182
183 public int getGeneration() {
184 return generation;
185 }
186
187 public void backOnePosition(int ch) {
188 if (ch != -1)
189 file.pushBack((byte)ch);
190 }
191
192 public void throwError(String error) throws IOException {
193 throw new IOException(error + " at file pointer " + file.getFilePointer());
194 }
195
196 public char checkPdfHeader() throws IOException {
197 file.setStartOffset(0);
198 String str = readString(1024);
199 int idx = str.indexOf("%PDF-");
200 if (idx < 0)
201 throw new IOException("PDF header signature not found.");
202 file.setStartOffset(idx);
203 return str.charAt(idx + 7);
204 }
205
206 public void checkFdfHeader() throws IOException {
207 file.setStartOffset(0);
208 String str = readString(1024);
209 int idx = str.indexOf("%FDF-1.2");
210 if (idx < 0)
211 throw new IOException("FDF header signature not found.");
212 file.setStartOffset(idx);
213 }
214
215 public int getStartxref() throws IOException {
216 int size = Math.min(1024, file.length());
217 int pos = file.length() - size;
218 file.seek(pos);
219 String str = readString(1024);
220 int idx = str.lastIndexOf("startxref");
221 if (idx < 0)
222 throw new IOException("PDF startxref not found.");
223 return pos + idx;
224 }
225
226 public static int getHex(int v) {
227 if (v >= '0' && v <= '9')
228 return v - '0';
229 if (v >= 'A' && v <= 'F')
230 return v - 'A' + 10;
231 if (v >= 'a' && v <= 'f')
232 return v - 'a' + 10;
233 return -1;
234 }
235
236 public void nextValidToken() throws IOException {
237 int level = 0;
238 String n1 = null;
239 String n2 = null;
240 int ptr = 0;
241 while (nextToken()) {
242 if (type == TK_COMMENT)
243 continue;
244 switch (level) {
245 case 0:
246 {
247 if (type != TK_NUMBER)
248 return;
249 ptr = file.getFilePointer();
250 n1 = stringValue;
251 ++level;
252 break;
253 }
254 case 1:
255 {
256 if (type != TK_NUMBER) {
257 file.seek(ptr);
258 type = TK_NUMBER;
259 stringValue = n1;
260 return;
261 }
262 n2 = stringValue;
263 ++level;
264 break;
265 }
266 default:
267 {
268 if (type != TK_OTHER || !stringValue.equals("R")) {
269 file.seek(ptr);
270 type = TK_NUMBER;
271 stringValue = n1;
272 return;
273 }
274 type = TK_REF;
275 reference = Integer.parseInt(n1);
276 generation = Integer.parseInt(n2);
277 return;
278 }
279 }
280 }
281 throwError("Unexpected end of file");
282 }
283
/**
 * Reads the next raw token from the source, skipping leading whitespace, and
 * stores its type in <CODE>type</CODE> and its text in
 * <CODE>stringValue</CODE>. Tokens without text (array and dictionary
 * markers, comments) leave <CODE>stringValue</CODE> set to <CODE>EMPTY</CODE>.
 * For string tokens <CODE>hexString</CODE> records which syntax was used.
 * Comments are returned as tokens of type <CODE>TK_COMMENT</CODE>; they are
 * not skipped here.
 *
 * @return <CODE>false</CODE> if the end of input was reached before any
 * token started, <CODE>true</CODE> otherwise
 * @throws IOException on a lone '&gt;', on a malformed or unterminated
 * string, or if the source reports an error
 */
public boolean nextToken() throws IOException {
    StringBuffer outBuf = null;
    stringValue = EMPTY;
    int ch = 0;
    // Skip whitespace in front of the token.
    do {
        ch = file.read();
    } while (ch != -1 && isWhitespace(ch));
    if (ch == -1)
        return false;
    switch (ch) {
        case '[':
            type = TK_START_ARRAY;
            break;
        case ']':
            type = TK_END_ARRAY;
            break;
        case '/':
        {
            // Name: collect characters up to the next delimiter, whitespace
            // or end of input (delims[0] covers the -1 value).
            outBuf = new StringBuffer();
            type = TK_NAME;
            while (true) {
                ch = file.read();
                if (delims[ch + 1])
                    break;
                if (ch == '#') {
                    // '#' introduces a two-digit hexadecimal character code.
                    // NOTE(review): the digits are not validated; getHex
                    // returns -1 for anything that is not a hex digit.
                    ch = (getHex(file.read()) << 4) + getHex(file.read());
                }
                outBuf.append((char)ch);
            }
            // The terminating character belongs to the next token.
            backOnePosition(ch);
            break;
        }
        case '>':
            // Only ">>" is valid here; a single '>' is an error.
            ch = file.read();
            if (ch != '>')
                throwError("'>' not expected");
            type = TK_END_DIC;
            break;
        case '<':
        {
            int v1 = file.read();
            if (v1 == '<') {
                type = TK_START_DIC;
                break;
            }
            // Hexadecimal string: pairs of hex digits up to '>', with
            // whitespace allowed between digits.
            outBuf = new StringBuffer();
            type = TK_STRING;
            hexString = true;
            int v2 = 0;
            while (true) {
                while (isWhitespace(v1))
                    v1 = file.read();
                if (v1 == '>')
                    break;
                v1 = getHex(v1);
                if (v1 < 0)
                    break;
                v2 = file.read();
                while (isWhitespace(v2))
                    v2 = file.read();
                if (v2 == '>') {
                    // Odd number of digits: the last digit is the high
                    // nibble and the low nibble is zero.
                    ch = v1 << 4;
                    outBuf.append((char)ch);
                    break;
                }
                v2 = getHex(v2);
                if (v2 < 0)
                    break;
                ch = (v1 << 4) + v2;
                outBuf.append((char)ch);
                v1 = file.read();
            }
            // A negative value means getHex rejected a character; this
            // includes the -1 returned at end of input.
            if (v1 < 0 || v2 < 0)
                throwError("Error reading string");
            break;
        }
        case '%':
            // Comment: consume up to and including the first CR or LF, or
            // to end of input. The comment text is not kept.
            type = TK_COMMENT;
            do {
                ch = file.read();
            } while (ch != -1 && ch != '\r' && ch != '\n');
            break;
        case '(':
        {
            // Literal string: ends at the ')' that balances the opening
            // '('; nested unescaped parentheses are kept as content.
            outBuf = new StringBuffer();
            type = TK_STRING;
            hexString = false;
            int nesting = 0;
            while (true) {
                ch = file.read();
                if (ch == -1)
                    break;
                if (ch == '(') {
                    ++nesting;
                }
                else if (ch == ')') {
                    --nesting;
                }
                else if (ch == '\\') {
                    // Escape sequence. Escaped parentheses are handled here
                    // and therefore do not change the nesting level.
                    boolean lineBreak = false;
                    ch = file.read();
                    switch (ch) {
                        case 'n':
                            ch = '\n';
                            break;
                        case 'r':
                            ch = '\r';
                            break;
                        case 't':
                            ch = '\t';
                            break;
                        case 'b':
                            ch = '\b';
                            break;
                        case 'f':
                            ch = '\f';
                            break;
                        case '(':
                        case ')':
                        case '\\':
                            break;
                        case '\r':
                            // Backslash followed by a line end (CR, CR LF
                            // or LF) is a line continuation: nothing is
                            // appended.
                            lineBreak = true;
                            ch = file.read();
                            if (ch != '\n')
                                backOnePosition(ch);
                            break;
                        case '\n':
                            lineBreak = true;
                            break;
                        default:
                        {
                            // Not an octal digit: the backslash is dropped
                            // and the character is kept as it is.
                            if (ch < '0' || ch > '7') {
                                break;
                            }
                            // Octal character code of one to three digits.
                            int octal = ch - '0';
                            ch = file.read();
                            if (ch < '0' || ch > '7') {
                                backOnePosition(ch);
                                ch = octal;
                                break;
                            }
                            octal = (octal << 3) + ch - '0';
                            ch = file.read();
                            if (ch < '0' || ch > '7') {
                                backOnePosition(ch);
                                ch = octal;
                                break;
                            }
                            octal = (octal << 3) + ch - '0';
                            // Three digits can exceed 255; keep the low byte.
                            ch = octal & 0xff;
                            break;
                        }
                    }
                    if (lineBreak)
                        continue;
                    // End of input right after the backslash.
                    if (ch < 0)
                        break;
                }
                else if (ch == '\r') {
                    // Unescaped CR or CR LF inside the string is stored as
                    // a single LF.
                    ch = file.read();
                    if (ch < 0)
                        break;
                    if (ch != '\n') {
                        backOnePosition(ch);
                        ch = '\n';
                    }
                }
                // The closing parenthesis itself is not appended.
                if (nesting == -1)
                    break;
                outBuf.append((char)ch);
            }
            // Input ended before the closing parenthesis.
            if (ch == -1)
                throwError("Error reading string");
            break;
        }
        default:
        {
            outBuf = new StringBuffer();
            if (ch == '-' || ch == '+' || ch == '.' || (ch >= '0' && ch <= '9')) {
                // Number: a sign is accepted only as the first character;
                // after that only digits and '.' are consumed.
                type = TK_NUMBER;
                do {
                    outBuf.append((char)ch);
                    ch = file.read();
                } while (ch != -1 && ((ch >= '0' && ch <= '9') || ch == '.'));
            }
            else {
                // Any other token: read up to the next delimiter,
                // whitespace or end of input.
                type = TK_OTHER;
                do {
                    outBuf.append((char)ch);
                    ch = file.read();
                } while (!delims[ch + 1]);
            }
            // The terminating character belongs to the next token.
            backOnePosition(ch);
            break;
        }
    }
    if (outBuf != null)
        stringValue = outBuf.toString();
    return true;
}
485
486 public int intValue() {
487 return Integer.parseInt(stringValue);
488 }
489
490 public boolean readLineSegment(byte input[]) throws IOException {
491 int c = -1;
492 boolean eol = false;
493 int ptr = 0;
494 int len = input.length;
495 // ssteward, pdftk-1.10, 040922:
496 // skip initial whitespace; added this because PdfReader.rebuildXref()
497 // assumes that line provided by readLineSegment does not have init. whitespace;
498 if ( ptr < len ) {
499 while ( isWhitespace( (c = read()) ) );
500 }
501 while ( !eol && ptr < len ) {
502 switch (c) {
503 case -1:
504 case '\n':
505 eol = true;
506 break;
507 case '\r':
508 eol = true;
509 int cur = getFilePointer();
510 if ((read()) != '\n') {
511 seek(cur);
512 }
513 break;
514 default:
515 input[ptr++] = (byte)c;
516 break;
517 }
518
519 // break loop? do it before we read() again
520 if( eol || len <= ptr ) {
521 break;
522 }
523 else {
524 c = read();
525 }
526 }
527 if (ptr >= len) {
528 eol = false;
529 while (!eol) {
530 switch (c = read()) {
531 case -1:
532 case '\n':
533 eol = true;
534 break;
535 case '\r':
536 eol = true;
537 int cur = getFilePointer();
538 if ((read()) != '\n') {
539 seek(cur);
540 }
541 break;
542 }
543 }
544 }
545
546 if ((c == -1) && (ptr == 0)) {
547 return false;
548 }
549 if (ptr + 2 <= len) {
550 input[ptr++] = (byte)' ';
551 input[ptr] = (byte)'X';
552 }
553 return true;
554 }
555
556 public static int[] checkObjectStart(byte line[]) {
557 try {
558 PRTokeniser tk = new PRTokeniser(line);
559 int num = 0;
560 int gen = 0;
561 if (!tk.nextToken() || tk.getTokenType() != TK_NUMBER)
562 return null;
563 num = tk.intValue();
564 if (!tk.nextToken() || tk.getTokenType() != TK_NUMBER)
565 return null;
566 gen = tk.intValue();
567 if (!tk.nextToken())
568 return null;
569 if (!tk.getStringValue().equals("obj"))
570 return null;
571 return new int[]{num, gen};
572 }
573 catch (Exception ioe) {
574 // empty on purpose
575 }
576 return null;
577 }
578
579 public boolean isHexString() {
580 return this.hexString;
581 }
582
583 }