1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.xerces.impl.io;
19
20 import java.io.InputStream;
21 import java.io.IOException;
22 import java.io.Reader;
23
24 import java.util.Locale;
25 import org.apache.xerces.util.MessageFormatter;
26 import org.apache.xerces.impl.msg.XMLMessageFormatter;
27
28 /**
29 * <p>A UTF-8 reader.</p>
30 *
31 * @xerces.internal
32 *
33 * @author Andy Clark, IBM
34 *
35 * @version $Id: UTF8Reader.java 554069 2007-07-06 21:56:14Z mrglavas $
36 */
37 public class UTF8Reader
38 extends Reader {
39
40 //
41 // Constants
42 //
43
44 /** Default byte buffer size (2048). */
45 public static final int DEFAULT_BUFFER_SIZE = 2048;
46
47 // debugging
48
49 /** Debug read. */
50 private static final boolean DEBUG_READ = false;
51
52 //
53 // Data
54 //
55
56 /** Input stream. */
57 protected final InputStream fInputStream;
58
59 /** Byte buffer. */
60 protected final byte[] fBuffer;
61
62 /** Offset into buffer. */
63 protected int fOffset;
64
65 /** Surrogate character. */
66 private int fSurrogate = -1;
67
68 // message formatter; used to produce localized
69 // exception messages
70 private final MessageFormatter fFormatter;
71
72 //Locale to use for messages
73 private final Locale fLocale;
74
75 //
76 // Constructors
77 //
78
/**
 * Creates a UTF-8 reader over the given stream with a buffer of
 * {@link #DEFAULT_BUFFER_SIZE} bytes, a new {@code XMLMessageFormatter}
 * and the JVM's default locale. Primarily for testing.
 *
 * @param inputStream The stream to decode.
 */
public UTF8Reader(InputStream inputStream) {
    this(inputStream,
         DEFAULT_BUFFER_SIZE,
         new XMLMessageFormatter(),
         Locale.getDefault());
} // <init>(InputStream)
88
/**
 * Creates a UTF-8 reader over the given stream with a buffer of
 * {@link #DEFAULT_BUFFER_SIZE} bytes, using the supplied formatter and
 * locale for error messages.
 *
 * @param inputStream      The stream to decode.
 * @param messageFormatter The formatter used to build error messages.
 * @param locale           The locale passed to the formatter.
 */
public UTF8Reader(InputStream inputStream,
                  MessageFormatter messageFormatter,
                  Locale locale) {
    this(inputStream,
         DEFAULT_BUFFER_SIZE,
         messageFormatter,
         locale);
} // <init>(InputStream, MessageFormatter, Locale)
101
/**
 * Creates a UTF-8 reader over the given stream, allocating a fresh byte
 * buffer of the requested length.
 *
 * @param inputStream      The stream to decode.
 * @param size             The length of the byte buffer to allocate.
 * @param messageFormatter The formatter used to build error messages.
 * @param locale           The locale passed to the formatter.
 */
public UTF8Reader(InputStream inputStream,
                  int size,
                  MessageFormatter messageFormatter,
                  Locale locale) {
    this(inputStream,
         new byte[size],
         messageFormatter,
         locale);
} // <init>(InputStream, int, MessageFormatter, Locale)
115
/**
 * Creates a UTF-8 reader over the given stream that works in the
 * caller-supplied byte buffer. All four arguments are stored as given;
 * the buffer is not copied.
 *
 * @param inputStream      The stream to decode.
 * @param buffer           The byte buffer to read into.
 * @param messageFormatter The formatter used to build error messages.
 * @param locale           The locale passed to the formatter.
 */
public UTF8Reader(InputStream inputStream,
                  byte [] buffer,
                  MessageFormatter messageFormatter,
                  Locale locale) {
    // message support
    fLocale = locale;
    fFormatter = messageFormatter;

    // byte source and working storage
    fBuffer = buffer;
    fInputStream = inputStream;
} // <init>(InputStream, byte[], MessageFormatter, Locale)
132
133 //
134 // Reader methods
135 //
136
137 /**
138 * Read a single character. This method will block until a character is
139 * available, an I/O error occurs, or the end of the stream is reached.
140 *
141 * <p> Subclasses that intend to support efficient single-character input
142 * should override this method.
143 *
144 * @return The character read, as an integer in the range 0 to 16383
145 * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
146 * been reached
147 *
148 * @exception IOException If an I/O error occurs
149 */
150 public int read() throws IOException {
151
152 // decode character
153 int c = fSurrogate;
154 if (fSurrogate == -1) {
155 // NOTE: We use the index into the buffer if there are remaining
156 // bytes from the last block read. -Ac
157 int index = 0;
158
159 // get first byte
160 int b0 = index == fOffset
161 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
162 if (b0 == -1) {
163 return -1;
164 }
165
166 // UTF-8: [0xxx xxxx]
167 // Unicode: [0000 0000] [0xxx xxxx]
168 if (b0 < 0x80) {
169 c = (char)b0;
170 }
171
172 // UTF-8: [110y yyyy] [10xx xxxx]
173 // Unicode: [0000 0yyy] [yyxx xxxx]
174 else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
175 int b1 = index == fOffset
176 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
177 if (b1 == -1) {
178 expectedByte(2, 2);
179 }
180 if ((b1 & 0xC0) != 0x80) {
181 invalidByte(2, 2, b1);
182 }
183 c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
184 }
185
186 // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
187 // Unicode: [zzzz yyyy] [yyxx xxxx]
188 else if ((b0 & 0xF0) == 0xE0) {
189 int b1 = index == fOffset
190 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
191 if (b1 == -1) {
192 expectedByte(2, 3);
193 }
194 if ((b1 & 0xC0) != 0x80
195 || (b0 == 0xED && b1 >= 0xA0)
196 || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
197 invalidByte(2, 3, b1);
198 }
199 int b2 = index == fOffset
200 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
201 if (b2 == -1) {
202 expectedByte(3, 3);
203 }
204 if ((b2 & 0xC0) != 0x80) {
205 invalidByte(3, 3, b2);
206 }
207 c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
208 (b2 & 0x003F);
209 }
210
211 // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
212 // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
213 // [1101 11yy] [yyxx xxxx] (low surrogate)
214 // * uuuuu = wwww + 1
215 else if ((b0 & 0xF8) == 0xF0) {
216 int b1 = index == fOffset
217 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
218 if (b1 == -1) {
219 expectedByte(2, 4);
220 }
221 if ((b1 & 0xC0) != 0x80
222 || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
223 invalidByte(2, 3, b1);
224 }
225 int b2 = index == fOffset
226 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
227 if (b2 == -1) {
228 expectedByte(3, 4);
229 }
230 if ((b2 & 0xC0) != 0x80) {
231 invalidByte(3, 3, b2);
232 }
233 int b3 = index == fOffset
234 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
235 if (b3 == -1) {
236 expectedByte(4, 4);
237 }
238 if ((b3 & 0xC0) != 0x80) {
239 invalidByte(4, 4, b3);
240 }
241 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
242 if (uuuuu > 0x10) {
243 invalidSurrogate(uuuuu);
244 }
245 int wwww = uuuuu - 1;
246 int hs = 0xD800 |
247 ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
248 ((b2 >> 4) & 0x0003);
249 int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
250 c = hs;
251 fSurrogate = ls;
252 }
253
254 // error
255 else {
256 invalidByte(1, 1, b0);
257 }
258 }
259
260 // use surrogate
261 else {
262 fSurrogate = -1;
263 }
264
265 // return character
266 if (DEBUG_READ) {
267 System.out.println("read(): 0x"+Integer.toHexString(c));
268 }
269 return c;
270
271 } // read():int
272
273 /**
274 * Read characters into a portion of an array. This method will block
275 * until some input is available, an I/O error occurs, or the end of the
276 * stream is reached.
277 *
278 * @param ch Destination buffer
279 * @param offset Offset at which to start storing characters
280 * @param length Maximum number of characters to read
281 *
282 * @return The number of characters read, or -1 if the end of the
283 * stream has been reached
284 *
285 * @exception IOException If an I/O error occurs
286 */
287 public int read(char ch[], int offset, int length) throws IOException {
288
289 // read bytes
290 int out = offset;
291 int count = 0;
292 if (fOffset == 0) {
293 // adjust length to read
294 if (length > fBuffer.length) {
295 length = fBuffer.length;
296 }
297
298 // handle surrogate
299 if (fSurrogate != -1) {
300 ch[out++] = (char)fSurrogate;
301 fSurrogate = -1;
302 length--;
303 }
304
305 // perform read operation
306 count = fInputStream.read(fBuffer, 0, length);
307 if (count == -1) {
308 return -1;
309 }
310 count += out - offset;
311 }
312
313 // skip read; last character was in error
314 // NOTE: Having an offset value other than zero means that there was
315 // an error in the last character read. In this case, we have
316 // skipped the read so we don't consume any bytes past the
317 // error. By signalling the error on the next block read we
318 // allow the method to return the most valid characters that
319 // it can on the previous block read. -Ac
320 else {
321 count = fOffset;
322 fOffset = 0;
323 }
324
325 // convert bytes to characters
326 final int total = count;
327 int in;
328 byte byte1;
329 final byte byte0 = 0;
330 for (in = 0; in < total; in++) {
331 byte1 = fBuffer[in];
332 if (byte1 >= byte0) {
333 ch[out++] = (char)byte1;
334 }
335 else {
336 break;
337 }
338 }
339 for ( ; in < total; in++) {
340 byte1 = fBuffer[in];
341
342 // UTF-8: [0xxx xxxx]
343 // Unicode: [0000 0000] [0xxx xxxx]
344 if (byte1 >= byte0) {
345 ch[out++] = (char)byte1;
346 continue;
347 }
348
349 // UTF-8: [110y yyyy] [10xx xxxx]
350 // Unicode: [0000 0yyy] [yyxx xxxx]
351 int b0 = byte1 & 0x0FF;
352 if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
353 int b1 = -1;
354 if (++in < total) {
355 b1 = fBuffer[in] & 0x00FF;
356 }
357 else {
358 b1 = fInputStream.read();
359 if (b1 == -1) {
360 if (out > offset) {
361 fBuffer[0] = (byte)b0;
362 fOffset = 1;
363 return out - offset;
364 }
365 expectedByte(2, 2);
366 }
367 count++;
368 }
369 if ((b1 & 0xC0) != 0x80) {
370 if (out > offset) {
371 fBuffer[0] = (byte)b0;
372 fBuffer[1] = (byte)b1;
373 fOffset = 2;
374 return out - offset;
375 }
376 invalidByte(2, 2, b1);
377 }
378 int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
379 ch[out++] = (char)c;
380 count -= 1;
381 continue;
382 }
383
384 // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
385 // Unicode: [zzzz yyyy] [yyxx xxxx]
386 if ((b0 & 0xF0) == 0xE0) {
387 int b1 = -1;
388 if (++in < total) {
389 b1 = fBuffer[in] & 0x00FF;
390 }
391 else {
392 b1 = fInputStream.read();
393 if (b1 == -1) {
394 if (out > offset) {
395 fBuffer[0] = (byte)b0;
396 fOffset = 1;
397 return out - offset;
398 }
399 expectedByte(2, 3);
400 }
401 count++;
402 }
403 if ((b1 & 0xC0) != 0x80
404 || (b0 == 0xED && b1 >= 0xA0)
405 || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
406 if (out > offset) {
407 fBuffer[0] = (byte)b0;
408 fBuffer[1] = (byte)b1;
409 fOffset = 2;
410 return out - offset;
411 }
412 invalidByte(2, 3, b1);
413 }
414 int b2 = -1;
415 if (++in < total) {
416 b2 = fBuffer[in] & 0x00FF;
417 }
418 else {
419 b2 = fInputStream.read();
420 if (b2 == -1) {
421 if (out > offset) {
422 fBuffer[0] = (byte)b0;
423 fBuffer[1] = (byte)b1;
424 fOffset = 2;
425 return out - offset;
426 }
427 expectedByte(3, 3);
428 }
429 count++;
430 }
431 if ((b2 & 0xC0) != 0x80) {
432 if (out > offset) {
433 fBuffer[0] = (byte)b0;
434 fBuffer[1] = (byte)b1;
435 fBuffer[2] = (byte)b2;
436 fOffset = 3;
437 return out - offset;
438 }
439 invalidByte(3, 3, b2);
440 }
441 int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
442 (b2 & 0x003F);
443 ch[out++] = (char)c;
444 count -= 2;
445 continue;
446 }
447
448 // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
449 // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
450 // [1101 11yy] [yyxx xxxx] (low surrogate)
451 // * uuuuu = wwww + 1
452 if ((b0 & 0xF8) == 0xF0) {
453 int b1 = -1;
454 if (++in < total) {
455 b1 = fBuffer[in] & 0x00FF;
456 }
457 else {
458 b1 = fInputStream.read();
459 if (b1 == -1) {
460 if (out > offset) {
461 fBuffer[0] = (byte)b0;
462 fOffset = 1;
463 return out - offset;
464 }
465 expectedByte(2, 4);
466 }
467 count++;
468 }
469 if ((b1 & 0xC0) != 0x80
470 || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
471 if (out > offset) {
472 fBuffer[0] = (byte)b0;
473 fBuffer[1] = (byte)b1;
474 fOffset = 2;
475 return out - offset;
476 }
477 invalidByte(2, 4, b1);
478 }
479 int b2 = -1;
480 if (++in < total) {
481 b2 = fBuffer[in] & 0x00FF;
482 }
483 else {
484 b2 = fInputStream.read();
485 if (b2 == -1) {
486 if (out > offset) {
487 fBuffer[0] = (byte)b0;
488 fBuffer[1] = (byte)b1;
489 fOffset = 2;
490 return out - offset;
491 }
492 expectedByte(3, 4);
493 }
494 count++;
495 }
496 if ((b2 & 0xC0) != 0x80) {
497 if (out > offset) {
498 fBuffer[0] = (byte)b0;
499 fBuffer[1] = (byte)b1;
500 fBuffer[2] = (byte)b2;
501 fOffset = 3;
502 return out - offset;
503 }
504 invalidByte(3, 4, b2);
505 }
506 int b3 = -1;
507 if (++in < total) {
508 b3 = fBuffer[in] & 0x00FF;
509 }
510 else {
511 b3 = fInputStream.read();
512 if (b3 == -1) {
513 if (out > offset) {
514 fBuffer[0] = (byte)b0;
515 fBuffer[1] = (byte)b1;
516 fBuffer[2] = (byte)b2;
517 fOffset = 3;
518 return out - offset;
519 }
520 expectedByte(4, 4);
521 }
522 count++;
523 }
524 if ((b3 & 0xC0) != 0x80) {
525 if (out > offset) {
526 fBuffer[0] = (byte)b0;
527 fBuffer[1] = (byte)b1;
528 fBuffer[2] = (byte)b2;
529 fBuffer[3] = (byte)b3;
530 fOffset = 4;
531 return out - offset;
532 }
533 invalidByte(4, 4, b2);
534 }
535
536 // decode bytes into surrogate characters
537 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
538 if (uuuuu > 0x10) {
539 invalidSurrogate(uuuuu);
540 }
541 int wwww = uuuuu - 1;
542 int zzzz = b1 & 0x000F;
543 int yyyyyy = b2 & 0x003F;
544 int xxxxxx = b3 & 0x003F;
545 int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
546 int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
547
548 // set characters
549 ch[out++] = (char)hs;
550 if ((count -= 2) <= length) {
551 ch[out++] = (char)ls;
552 }
553 // reached the end of the char buffer; save low surrogate for the next read
554 else {
555 fSurrogate = ls;
556 --count;
557 }
558 continue;
559 }
560
561 // error
562 if (out > offset) {
563 fBuffer[0] = (byte)b0;
564 fOffset = 1;
565 return out - offset;
566 }
567 invalidByte(1, 1, b0);
568 }
569
570 // return number of characters converted
571 if (DEBUG_READ) {
572 System.out.println("read(char[],"+offset+','+length+"): count="+count);
573 }
574 return count;
575
576 } // read(char[],int,int)
577
578 /**
579 * Skip characters. This method will block until some characters are
580 * available, an I/O error occurs, or the end of the stream is reached.
581 *
582 * @param n The number of characters to skip
583 *
584 * @return The number of characters actually skipped
585 *
586 * @exception IOException If an I/O error occurs
587 */
588 public long skip(long n) throws IOException {
589
590 long remaining = n;
591 final char[] ch = new char[fBuffer.length];
592 do {
593 int length = ch.length < remaining ? ch.length : (int)remaining;
594 int count = read(ch, 0, length);
595 if (count > 0) {
596 remaining -= count;
597 }
598 else {
599 break;
600 }
601 } while (remaining > 0);
602
603 long skipped = n - remaining;
604 return skipped;
605
606 } // skip(long):long
607
608 /**
609 * Tell whether this stream is ready to be read.
610 *
611 * @return True if the next read() is guaranteed not to block for input,
612 * false otherwise. Note that returning false does not guarantee that the
613 * next read will block.
614 *
615 * @exception IOException If an I/O error occurs
616 */
617 public boolean ready() throws IOException {
618 return false;
619 } // ready()
620
621 /**
622 * Tell whether this stream supports the mark() operation.
623 */
624 public boolean markSupported() {
625 return false;
626 } // markSupported()
627
628 /**
629 * Mark the present position in the stream. Subsequent calls to reset()
630 * will attempt to reposition the stream to this point. Not all
631 * character-input streams support the mark() operation.
632 *
633 * @param readAheadLimit Limit on the number of characters that may be
634 * read while still preserving the mark. After
635 * reading this many characters, attempting to
636 * reset the stream may fail.
637 *
638 * @exception IOException If the stream does not support mark(),
639 * or if some other I/O error occurs
640 */
641 public void mark(int readAheadLimit) throws IOException {
642 throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
643 } // mark(int)
644
645 /**
646 * Reset the stream. If the stream has been marked, then attempt to
647 * reposition it at the mark. If the stream has not been marked, then
648 * attempt to reset it in some way appropriate to the particular stream,
649 * for example by repositioning it to its starting point. Not all
650 * character-input streams support the reset() operation, and some support
651 * reset() without supporting mark().
652 *
653 * @exception IOException If the stream has not been marked,
654 * or if the mark has been invalidated,
655 * or if the stream does not support reset(),
656 * or if some other I/O error occurs
657 */
658 public void reset() throws IOException {
659 fOffset = 0;
660 fSurrogate = -1;
661 } // reset()
662
663 /**
664 * Close the stream. Once a stream has been closed, further read(),
665 * ready(), mark(), or reset() invocations will throw an IOException.
666 * Closing a previously-closed stream, however, has no effect.
667 *
668 * @exception IOException If an I/O error occurs
669 */
670 public void close() throws IOException {
671 fInputStream.close();
672 } // close()
673
674 //
675 // Private methods
676 //
677
678 /** Throws an exception for expected byte. */
679 private void expectedByte(int position, int count)
680 throws MalformedByteSequenceException {
681
682 throw new MalformedByteSequenceException(fFormatter,
683 fLocale,
684 XMLMessageFormatter.XML_DOMAIN,
685 "ExpectedByte",
686 new Object[] {Integer.toString(position), Integer.toString(count)});
687
688 } // expectedByte(int,int)
689
690 /** Throws an exception for invalid byte. */
691 private void invalidByte(int position, int count, int c)
692 throws MalformedByteSequenceException {
693
694 throw new MalformedByteSequenceException(fFormatter,
695 fLocale,
696 XMLMessageFormatter.XML_DOMAIN,
697 "InvalidByte",
698 new Object [] {Integer.toString(position), Integer.toString(count)});
699
700 } // invalidByte(int,int,int)
701
702 /** Throws an exception for invalid surrogate bits. */
703 private void invalidSurrogate(int uuuuu) throws MalformedByteSequenceException {
704
705 throw new MalformedByteSequenceException(fFormatter,
706 fLocale,
707 XMLMessageFormatter.XML_DOMAIN,
708 "InvalidHighSurrogate",
709 new Object[] {Integer.toHexString(uuuuu)});
710
711 } // invalidSurrogate(int)
712
713 } // class UTF8Reader