1 /*
2 * reserved comment block
3 * DO NOT REMOVE OR ALTER!
4 */
5 /*
6 * Copyright 2000-2004 The Apache Software Foundation.
7 *
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 *
12 * http://www.apache.org/licenses/LICENSE-2.0
13 *
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 */
20
21 package com.sun.org.apache.xerces.internal.impl.io;
22
23 import java.io.InputStream;
24 import java.io.IOException;
25 import java.io.Reader;
26
27 import java.util.Locale;
28 import com.sun.org.apache.xerces.internal.util.MessageFormatter;
29 import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
30
31 import com.sun.xml.internal.stream.util.BufferAllocator;
32 import com.sun.xml.internal.stream.util.ThreadLocalBufferAllocator;
33
34 /**
35 * <p>A UTF-8 reader.</p>
36 *
37 * @xerces.internal
38 *
39 * @author Andy Clark, IBM
40 *
41 */
42 public class UTF8Reader
43 extends Reader {
44
45 //
46 // Constants
47 //
48
49 /** Default byte buffer size (2048). */
50 public static final int DEFAULT_BUFFER_SIZE = 2048;
51
52 // debugging
53
54 /** Debug read. */
55 private static final boolean DEBUG_READ = false;
56
57 //
58 // Data
59 //
60
61 /** Input stream. */
62 protected InputStream fInputStream;
63
64 /** Byte buffer. */
65 protected byte[] fBuffer;
66
67 /** Offset into buffer. */
68 protected int fOffset;
69
70 /** Surrogate character. */
71 private int fSurrogate = -1;
72
73 // message formatter; used to produce localized
74 // exception messages
75 private MessageFormatter fFormatter = null;
76
77 //Locale to use for messages
78 private Locale fLocale = null;
79
80 //
81 // Constructors
82 //
83
84 /**
85 * Constructs a UTF-8 reader from the specified input stream
86 * using the default buffer size. Primarily for testing.
87 *
88 * @param inputStream The input stream.
89 */
90 public UTF8Reader(InputStream inputStream) {
91 this(inputStream, DEFAULT_BUFFER_SIZE, new XMLMessageFormatter(), Locale.getDefault());
92 } // <init>(InputStream, MessageFormatter)
93
94 /**
95 * Constructs a UTF-8 reader from the specified input stream
96 * using the default buffer size and the given MessageFormatter.
97 *
98 * @param inputStream The input stream.
99 * @param messageFormatter given MessageFormatter
100 * @param locale Locale to use for messages
101 */
102 public UTF8Reader(InputStream inputStream, MessageFormatter messageFormatter,
103 Locale locale) {
104 this(inputStream, DEFAULT_BUFFER_SIZE, messageFormatter, locale);
105 } // <init>(InputStream, MessageFormatter, Locale)
106
107 /**
108 * Constructs a UTF-8 reader from the specified input stream,
109 * buffer size and MessageFormatter.
110 *
111 * @param inputStream The input stream.
112 * @param size The initial buffer size.
113 * @param messageFormatter the formatter for localizing/formatting errors.
114 * @param locale the Locale to use for messages
115 */
116 public UTF8Reader(InputStream inputStream, int size,
117 MessageFormatter messageFormatter, Locale locale) {
118 fInputStream = inputStream;
119 BufferAllocator ba = ThreadLocalBufferAllocator.getBufferAllocator();
120 fBuffer = ba.getByteBuffer(size);
121 if (fBuffer == null) {
122 fBuffer = new byte[size];
123 }
124 fFormatter = messageFormatter;
125 fLocale = locale;
126 } // <init>(InputStream, int, MessageFormatter, Locale)
127
128 //
129 // Reader methods
130 //
131
132 /**
133 * Read a single character. This method will block until a character is
134 * available, an I/O error occurs, or the end of the stream is reached.
135 *
136 * <p> Subclasses that intend to support efficient single-character input
137 * should override this method.
138 *
139 * @return The character read, as an integer in the range 0 to 16383
140 * (<tt>0x00-0xffff</tt>), or -1 if the end of the stream has
141 * been reached
142 *
143 * @exception IOException If an I/O error occurs
144 */
145 public int read() throws IOException {
146
147 // decode character
148 int c = fSurrogate;
149 if (fSurrogate == -1) {
150 // NOTE: We use the index into the buffer if there are remaining
151 // bytes from the last block read. -Ac
152 int index = 0;
153
154 // get first byte
155 int b0 = index == fOffset
156 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
157 if (b0 == -1) {
158 return -1;
159 }
160
161 // UTF-8: [0xxx xxxx]
162 // Unicode: [0000 0000] [0xxx xxxx]
163 if (b0 < 0x80) {
164 c = (char)b0;
165 }
166
167 // UTF-8: [110y yyyy] [10xx xxxx]
168 // Unicode: [0000 0yyy] [yyxx xxxx]
169 else if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
170 int b1 = index == fOffset
171 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
172 if (b1 == -1) {
173 expectedByte(2, 2);
174 }
175 if ((b1 & 0xC0) != 0x80) {
176 invalidByte(2, 2, b1);
177 }
178 c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
179 }
180
181 // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
182 // Unicode: [zzzz yyyy] [yyxx xxxx]
183 else if ((b0 & 0xF0) == 0xE0) {
184 int b1 = index == fOffset
185 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
186 if (b1 == -1) {
187 expectedByte(2, 3);
188 }
189 if ((b1 & 0xC0) != 0x80
190 || (b0 == 0xED && b1 >= 0xA0)
191 || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
192 invalidByte(2, 3, b1);
193 }
194 int b2 = index == fOffset
195 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
196 if (b2 == -1) {
197 expectedByte(3, 3);
198 }
199 if ((b2 & 0xC0) != 0x80) {
200 invalidByte(3, 3, b2);
201 }
202 c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
203 (b2 & 0x003F);
204 }
205
206 // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
207 // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
208 // [1101 11yy] [yyxx xxxx] (low surrogate)
209 // * uuuuu = wwww + 1
210 else if ((b0 & 0xF8) == 0xF0) {
211 int b1 = index == fOffset
212 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
213 if (b1 == -1) {
214 expectedByte(2, 4);
215 }
216 if ((b1 & 0xC0) != 0x80
217 || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
218 invalidByte(2, 3, b1);
219 }
220 int b2 = index == fOffset
221 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
222 if (b2 == -1) {
223 expectedByte(3, 4);
224 }
225 if ((b2 & 0xC0) != 0x80) {
226 invalidByte(3, 3, b2);
227 }
228 int b3 = index == fOffset
229 ? fInputStream.read() : fBuffer[index++] & 0x00FF;
230 if (b3 == -1) {
231 expectedByte(4, 4);
232 }
233 if ((b3 & 0xC0) != 0x80) {
234 invalidByte(4, 4, b3);
235 }
236 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
237 if (uuuuu > 0x10) {
238 invalidSurrogate(uuuuu);
239 }
240 int wwww = uuuuu - 1;
241 int hs = 0xD800 |
242 ((wwww << 6) & 0x03C0) | ((b1 << 2) & 0x003C) |
243 ((b2 >> 4) & 0x0003);
244 int ls = 0xDC00 | ((b2 << 6) & 0x03C0) | (b3 & 0x003F);
245 c = hs;
246 fSurrogate = ls;
247 }
248
249 // error
250 else {
251 invalidByte(1, 1, b0);
252 }
253 }
254
255 // use surrogate
256 else {
257 fSurrogate = -1;
258 }
259
260 // return character
261 if (DEBUG_READ) {
262 System.out.println("read(): 0x"+Integer.toHexString(c));
263 }
264 return c;
265
266 } // read():int
267
268 /**
269 * Read characters into a portion of an array. This method will block
270 * until some input is available, an I/O error occurs, or the end of the
271 * stream is reached.
272 *
273 * @param ch Destination buffer
274 * @param offset Offset at which to start storing characters
275 * @param length Maximum number of characters to read
276 *
277 * @return The number of characters read, or -1 if the end of the
278 * stream has been reached
279 *
280 * @exception IOException If an I/O error occurs
281 */
282 public int read(char ch[], int offset, int length) throws IOException {
283
284 // handle surrogate
285 int out = offset;
286 if (fSurrogate != -1) {
287 ch[offset + 1] = (char)fSurrogate;
288 fSurrogate = -1;
289 length--;
290 out++;
291 }
292
293 // read bytes
294 int count = 0;
295 if (fOffset == 0) {
296 // adjust length to read
297 if (length > fBuffer.length) {
298 length = fBuffer.length;
299 }
300
301 // perform read operation
302 count = fInputStream.read(fBuffer, 0, length);
303 if (count == -1) {
304 return -1;
305 }
306 count += out - offset;
307 }
308
309 // skip read; last character was in error
310 // NOTE: Having an offset value other than zero means that there was
311 // an error in the last character read. In this case, we have
312 // skipped the read so we don't consume any bytes past the
313 // error. By signalling the error on the next block read we
314 // allow the method to return the most valid characters that
315 // it can on the previous block read. -Ac
316 else {
317 count = fOffset;
318 fOffset = 0;
319 }
320
321 // convert bytes to characters
322 final int total = count;
323 int in;
324 byte byte1;
325 final byte byte0 = 0;
326 for (in = 0; in < total; in++) {
327 byte1 = fBuffer[in];
328 if (byte1 >= byte0) {
329 ch[out++] = (char)byte1;
330 }
331 else {
332 break;
333 }
334 }
335 for ( ; in < total; in++) {
336 byte1 = fBuffer[in];
337
338 // UTF-8: [0xxx xxxx]
339 // Unicode: [0000 0000] [0xxx xxxx]
340 if (byte1 >= byte0) {
341 ch[out++] = (char)byte1;
342 continue;
343 }
344
345 // UTF-8: [110y yyyy] [10xx xxxx]
346 // Unicode: [0000 0yyy] [yyxx xxxx]
347 int b0 = byte1 & 0x0FF;
348 if ((b0 & 0xE0) == 0xC0 && (b0 & 0x1E) != 0) {
349 int b1 = -1;
350 if (++in < total) {
351 b1 = fBuffer[in] & 0x00FF;
352 }
353 else {
354 b1 = fInputStream.read();
355 if (b1 == -1) {
356 if (out > offset) {
357 fBuffer[0] = (byte)b0;
358 fOffset = 1;
359 return out - offset;
360 }
361 expectedByte(2, 2);
362 }
363 count++;
364 }
365 if ((b1 & 0xC0) != 0x80) {
366 if (out > offset) {
367 fBuffer[0] = (byte)b0;
368 fBuffer[1] = (byte)b1;
369 fOffset = 2;
370 return out - offset;
371 }
372 invalidByte(2, 2, b1);
373 }
374 int c = ((b0 << 6) & 0x07C0) | (b1 & 0x003F);
375 ch[out++] = (char)c;
376 count -= 1;
377 continue;
378 }
379
380 // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx]
381 // Unicode: [zzzz yyyy] [yyxx xxxx]
382 if ((b0 & 0xF0) == 0xE0) {
383 int b1 = -1;
384 if (++in < total) {
385 b1 = fBuffer[in] & 0x00FF;
386 }
387 else {
388 b1 = fInputStream.read();
389 if (b1 == -1) {
390 if (out > offset) {
391 fBuffer[0] = (byte)b0;
392 fOffset = 1;
393 return out - offset;
394 }
395 expectedByte(2, 3);
396 }
397 count++;
398 }
399 if ((b1 & 0xC0) != 0x80
400 || (b0 == 0xED && b1 >= 0xA0)
401 || ((b0 & 0x0F) == 0 && (b1 & 0x20) == 0)) {
402 if (out > offset) {
403 fBuffer[0] = (byte)b0;
404 fBuffer[1] = (byte)b1;
405 fOffset = 2;
406 return out - offset;
407 }
408 invalidByte(2, 3, b1);
409 }
410 int b2 = -1;
411 if (++in < total) {
412 b2 = fBuffer[in] & 0x00FF;
413 }
414 else {
415 b2 = fInputStream.read();
416 if (b2 == -1) {
417 if (out > offset) {
418 fBuffer[0] = (byte)b0;
419 fBuffer[1] = (byte)b1;
420 fOffset = 2;
421 return out - offset;
422 }
423 expectedByte(3, 3);
424 }
425 count++;
426 }
427 if ((b2 & 0xC0) != 0x80) {
428 if (out > offset) {
429 fBuffer[0] = (byte)b0;
430 fBuffer[1] = (byte)b1;
431 fBuffer[2] = (byte)b2;
432 fOffset = 3;
433 return out - offset;
434 }
435 invalidByte(3, 3, b2);
436 }
437 int c = ((b0 << 12) & 0xF000) | ((b1 << 6) & 0x0FC0) |
438 (b2 & 0x003F);
439 ch[out++] = (char)c;
440 count -= 2;
441 continue;
442 }
443
444 // UTF-8: [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
445 // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
446 // [1101 11yy] [yyxx xxxx] (low surrogate)
447 // * uuuuu = wwww + 1
448 if ((b0 & 0xF8) == 0xF0) {
449 int b1 = -1;
450 if (++in < total) {
451 b1 = fBuffer[in] & 0x00FF;
452 }
453 else {
454 b1 = fInputStream.read();
455 if (b1 == -1) {
456 if (out > offset) {
457 fBuffer[0] = (byte)b0;
458 fOffset = 1;
459 return out - offset;
460 }
461 expectedByte(2, 4);
462 }
463 count++;
464 }
465 if ((b1 & 0xC0) != 0x80
466 || ((b1 & 0x30) == 0 && (b0 & 0x07) == 0)) {
467 if (out > offset) {
468 fBuffer[0] = (byte)b0;
469 fBuffer[1] = (byte)b1;
470 fOffset = 2;
471 return out - offset;
472 }
473 invalidByte(2, 4, b1);
474 }
475 int b2 = -1;
476 if (++in < total) {
477 b2 = fBuffer[in] & 0x00FF;
478 }
479 else {
480 b2 = fInputStream.read();
481 if (b2 == -1) {
482 if (out > offset) {
483 fBuffer[0] = (byte)b0;
484 fBuffer[1] = (byte)b1;
485 fOffset = 2;
486 return out - offset;
487 }
488 expectedByte(3, 4);
489 }
490 count++;
491 }
492 if ((b2 & 0xC0) != 0x80) {
493 if (out > offset) {
494 fBuffer[0] = (byte)b0;
495 fBuffer[1] = (byte)b1;
496 fBuffer[2] = (byte)b2;
497 fOffset = 3;
498 return out - offset;
499 }
500 invalidByte(3, 4, b2);
501 }
502 int b3 = -1;
503 if (++in < total) {
504 b3 = fBuffer[in] & 0x00FF;
505 }
506 else {
507 b3 = fInputStream.read();
508 if (b3 == -1) {
509 if (out > offset) {
510 fBuffer[0] = (byte)b0;
511 fBuffer[1] = (byte)b1;
512 fBuffer[2] = (byte)b2;
513 fOffset = 3;
514 return out - offset;
515 }
516 expectedByte(4, 4);
517 }
518 count++;
519 }
520 if ((b3 & 0xC0) != 0x80) {
521 if (out > offset) {
522 fBuffer[0] = (byte)b0;
523 fBuffer[1] = (byte)b1;
524 fBuffer[2] = (byte)b2;
525 fBuffer[3] = (byte)b3;
526 fOffset = 4;
527 return out - offset;
528 }
529 invalidByte(4, 4, b2);
530 }
531
532 // decode bytes into surrogate characters
533 int uuuuu = ((b0 << 2) & 0x001C) | ((b1 >> 4) & 0x0003);
534 if (uuuuu > 0x10) {
535 invalidSurrogate(uuuuu);
536 }
537 int wwww = uuuuu - 1;
538 int zzzz = b1 & 0x000F;
539 int yyyyyy = b2 & 0x003F;
540 int xxxxxx = b3 & 0x003F;
541 int hs = 0xD800 | ((wwww << 6) & 0x03C0) | (zzzz << 2) | (yyyyyy >> 4);
542 int ls = 0xDC00 | ((yyyyyy << 6) & 0x03C0) | xxxxxx;
543
544 // set characters
545 ch[out++] = (char)hs;
546 ch[out++] = (char)ls;
547 count -= 2;
548 continue;
549 }
550
551 // error
552 if (out > offset) {
553 fBuffer[0] = (byte)b0;
554 fOffset = 1;
555 return out - offset;
556 }
557 invalidByte(1, 1, b0);
558 }
559
560 // return number of characters converted
561 if (DEBUG_READ) {
562 System.out.println("read(char[],"+offset+','+length+"): count="+count);
563 }
564 return count;
565
566 } // read(char[],int,int)
567
568 /**
569 * Skip characters. This method will block until some characters are
570 * available, an I/O error occurs, or the end of the stream is reached.
571 *
572 * @param n The number of characters to skip
573 *
574 * @return The number of characters actually skipped
575 *
576 * @exception IOException If an I/O error occurs
577 */
578 public long skip(long n) throws IOException {
579
580 long remaining = n;
581 final char[] ch = new char[fBuffer.length];
582 do {
583 int length = ch.length < remaining ? ch.length : (int)remaining;
584 int count = read(ch, 0, length);
585 if (count > 0) {
586 remaining -= count;
587 }
588 else {
589 break;
590 }
591 } while (remaining > 0);
592
593 long skipped = n - remaining;
594 return skipped;
595
596 } // skip(long):long
597
598 /**
599 * Tell whether this stream is ready to be read.
600 *
601 * @return True if the next read() is guaranteed not to block for input,
602 * false otherwise. Note that returning false does not guarantee that the
603 * next read will block.
604 *
605 * @exception IOException If an I/O error occurs
606 */
607 public boolean ready() throws IOException {
608 return false;
609 } // ready()
610
611 /**
612 * Tell whether this stream supports the mark() operation.
613 */
614 public boolean markSupported() {
615 return false;
616 } // markSupported()
617
618 /**
619 * Mark the present position in the stream. Subsequent calls to reset()
620 * will attempt to reposition the stream to this point. Not all
621 * character-input streams support the mark() operation.
622 *
623 * @param readAheadLimit Limit on the number of characters that may be
624 * read while still preserving the mark. After
625 * reading this many characters, attempting to
626 * reset the stream may fail.
627 *
628 * @exception IOException If the stream does not support mark(),
629 * or if some other I/O error occurs
630 */
631 public void mark(int readAheadLimit) throws IOException {
632 throw new IOException(fFormatter.formatMessage(fLocale, "OperationNotSupported", new Object[]{"mark()", "UTF-8"}));
633 } // mark(int)
634
635 /**
636 * Reset the stream. If the stream has been marked, then attempt to
637 * reposition it at the mark. If the stream has not been marked, then
638 * attempt to reset it in some way appropriate to the particular stream,
639 * for example by repositioning it to its starting point. Not all
640 * character-input streams support the reset() operation, and some support
641 * reset() without supporting mark().
642 *
643 * @exception IOException If the stream has not been marked,
644 * or if the mark has been invalidated,
645 * or if the stream does not support reset(),
646 * or if some other I/O error occurs
647 */
648 public void reset() throws IOException {
649 fOffset = 0;
650 fSurrogate = -1;
651 } // reset()
652
653 /**
654 * Close the stream. Once a stream has been closed, further read(),
655 * ready(), mark(), or reset() invocations will throw an IOException.
656 * Closing a previously-closed stream, however, has no effect.
657 *
658 * @exception IOException If an I/O error occurs
659 */
660 public void close() throws IOException {
661 BufferAllocator ba = ThreadLocalBufferAllocator.getBufferAllocator();
662 ba.returnByteBuffer(fBuffer);
663 fBuffer = null;
664 fInputStream.close();
665 } // close()
666
667 //
668 // Private methods
669 //
670
671 /** Throws an exception for expected byte. */
672 private void expectedByte(int position, int count)
673 throws MalformedByteSequenceException {
674
675 throw new MalformedByteSequenceException(fFormatter,
676 fLocale,
677 XMLMessageFormatter.XML_DOMAIN,
678 "ExpectedByte",
679 new Object[] {Integer.toString(position), Integer.toString(count)});
680
681 } // expectedByte(int,int)
682
683 /** Throws an exception for invalid byte. */
684 private void invalidByte(int position, int count, int c)
685 throws MalformedByteSequenceException {
686
687 throw new MalformedByteSequenceException(fFormatter,
688 fLocale,
689 XMLMessageFormatter.XML_DOMAIN,
690 "InvalidByte",
691 new Object [] {Integer.toString(position), Integer.toString(count)});
692
693 } // invalidByte(int,int,int)
694
695 /** Throws an exception for invalid surrogate bits. */
696 private void invalidSurrogate(int uuuuu) throws MalformedByteSequenceException {
697
698 throw new MalformedByteSequenceException(fFormatter,
699 fLocale,
700 XMLMessageFormatter.XML_DOMAIN,
701 "InvalidHighSurrogate",
702 new Object[] {Integer.toHexString(uuuuu)});
703
704 } // invalidSurrogate(int)
705
706 } // class UTF8Reader