1 /*
2 * Copyright 2000-2006 Sun Microsystems, Inc. All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation. Sun designates this
8 * particular file as subject to the "Classpath" exception as provided
9 * by Sun in the LICENSE file that accompanied this code.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
22 * CA 95054 USA or visit www.sun.com if you need additional information or
23 * have any questions.
24 */
25
26 package java.nio.charset;
27
28 import java.nio.ByteBuffer;
29 import java.nio.CharBuffer;
30 import java.nio.charset.spi.CharsetProvider;
31 import java.security.AccessController;
32 import java.security.AccessControlException;
33 import java.security.PrivilegedAction;
34 import java.util.Collections;
35 import java.util.HashSet;
36 import java.util.Iterator;
37 import java.util.Locale;
38 import java.util.Map;
39 import java.util.NoSuchElementException;
40 import java.util.Set;
41 import java.util.ServiceLoader;
42 import java.util.ServiceConfigurationError;
43 import java.util.SortedMap;
44 import java.util.TreeMap;
45 import sun.misc.ASCIICaseInsensitiveComparator;
46 import sun.nio.cs.StandardCharsets;
47 import sun.nio.cs.ThreadLocalCoders;
48 import sun.security.action.GetPropertyAction;
49
50
51 /**
52 * A named mapping between sequences of sixteen-bit Unicode <a
53 * href="../../lang/Character.html#unicode">code units</a> and sequences of
54 * bytes. This class defines methods for creating decoders and encoders and
55 * for retrieving the various names associated with a charset. Instances of
56 * this class are immutable.
57 *
58 * <p> This class also defines static methods for testing whether a particular
59 * charset is supported, for locating charset instances by name, and for
60 * constructing a map that contains every charset for which support is
61 * available in the current Java virtual machine. Support for new charsets can
62 * be added via the service-provider interface defined in the {@link
63 * java.nio.charset.spi.CharsetProvider} class.
64 *
65 * <p> All of the methods defined in this class are safe for use by multiple
66 * concurrent threads.
67 *
68 *
69 * <a name="names"></a><a name="charenc"></a>
70 * <h4>Charset names</h4>
71 *
72 * <p> Charsets are named by strings composed of the following characters:
73 *
74 * <ul>
75 *
76 * <li> The uppercase letters <tt>'A'</tt> through <tt>'Z'</tt>
77 * (<tt>'\u0041'</tt> through <tt>'\u005a'</tt>),
78 *
79 * <li> The lowercase letters <tt>'a'</tt> through <tt>'z'</tt>
80 * (<tt>'\u0061'</tt> through <tt>'\u007a'</tt>),
81 *
82 * <li> The digits <tt>'0'</tt> through <tt>'9'</tt>
83 * (<tt>'\u0030'</tt> through <tt>'\u0039'</tt>),
84 *
85 * <li> The dash character <tt>'-'</tt>
86 * (<tt>'\u002d'</tt>, <small>HYPHEN-MINUS</small>),
87 *
88 * <li> The period character <tt>'.'</tt>
89 * (<tt>'\u002e'</tt>, <small>FULL STOP</small>),
90 *
91 * <li> The colon character <tt>':'</tt>
92 * (<tt>'\u003a'</tt>, <small>COLON</small>), and
93 *
94 * <li> The underscore character <tt>'_'</tt>
95 * (<tt>'\u005f'</tt>, <small>LOW LINE</small>).
96 *
97 * </ul>
98 *
99 * A charset name must begin with either a letter or a digit. The empty string
100 * is not a legal charset name. Charset names are not case-sensitive; that is,
101 * case is always ignored when comparing charset names. Charset names
102 * generally follow the conventions documented in <a
103 * href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC 2278: IANA Charset
104 * Registration Procedures</i></a>.
105 *
106 * <p> Every charset has a <i>canonical name</i> and may also have one or more
107 * <i>aliases</i>. The canonical name is returned by the {@link #name() name} method
108 * of this class. Canonical names are, by convention, usually in upper case.
109 * The aliases of a charset are returned by the {@link #aliases() aliases}
110 * method.
111 *
112 * <a name="hn"></a>
113 *
114 * <p> Some charsets have an <i>historical name</i> that is defined for
115 * compatibility with previous versions of the Java platform. A charset's
116 * historical name is either its canonical name or one of its aliases. The
117 * historical name is returned by the <tt>getEncoding()</tt> methods of the
118 * {@link java.io.InputStreamReader#getEncoding InputStreamReader} and {@link
119 * java.io.OutputStreamWriter#getEncoding OutputStreamWriter} classes.
120 *
121 * <a name="iana"></a>
122 *
123 * <p> If a charset listed in the <a
124 * href="http://www.iana.org/assignments/character-sets"><i>IANA Charset
125 * Registry</i></a> is supported by an implementation of the Java platform then
126 * its canonical name must be the name listed in the registry. Many charsets
127 * are given more than one name in the registry, in which case the registry
128 * identifies one of the names as <i>MIME-preferred</i>. If a charset has more
129 * than one registry name then its canonical name must be the MIME-preferred
130 * name and the other names in the registry must be valid aliases. If a
131 * supported charset is not listed in the IANA registry then its canonical name
132 * must begin with one of the strings <tt>"X-"</tt> or <tt>"x-"</tt>.
133 *
134 * <p> The IANA charset registry does change over time, and so the canonical
135 * name and the aliases of a particular charset may also change over time. To
136 * ensure compatibility it is recommended that no alias ever be removed from a
137 * charset, and that if the canonical name of a charset is changed then its
138 * previous canonical name be made into an alias.
139 *
140 *
141 * <h4>Standard charsets</h4>
142 *
143 * <p> Every implementation of the Java platform is required to support the
144 * following standard charsets. Consult the release documentation for your
145 * implementation to see if any other charsets are supported. The behavior
146 * of such optional charsets may differ between implementations.
147 *
148 * <blockquote><table width="80%" summary="Description of standard charsets">
149 * <tr><th><p align="left">Charset</p></th><th><p align="left">Description</p></th></tr>
150 * <tr><td valign=top><tt>US-ASCII</tt></td>
151 * <td>Seven-bit ASCII, a.k.a. <tt>ISO646-US</tt>,
152 * a.k.a. the Basic Latin block of the Unicode character set</td></tr>
153 * <tr><td valign=top><tt>ISO-8859-1 </tt></td>
154 * <td>ISO Latin Alphabet No. 1, a.k.a. <tt>ISO-LATIN-1</tt></td></tr>
155 * <tr><td valign=top><tt>UTF-8</tt></td>
156 * <td>Eight-bit UCS Transformation Format</td></tr>
157 * <tr><td valign=top><tt>UTF-16BE</tt></td>
158 * <td>Sixteen-bit UCS Transformation Format,
159 * big-endian byte order</td></tr>
160 * <tr><td valign=top><tt>UTF-16LE</tt></td>
161 * <td>Sixteen-bit UCS Transformation Format,
162 * little-endian byte order</td></tr>
163 * <tr><td valign=top><tt>UTF-16</tt></td>
164 * <td>Sixteen-bit UCS Transformation Format,
165 * byte order identified by an optional byte-order mark</td></tr>
166 * </table></blockquote>
167 *
168 * <p> The <tt>UTF-8</tt> charset is specified by <a
169 * href="http://www.ietf.org/rfc/rfc2279.txt"><i>RFC 2279</i></a>; the
170 * transformation format upon which it is based is specified in
171 * Amendment 2 of ISO 10646-1 and is also described in the <a
172 * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode
173 * Standard</i></a>.
174 *
175 * <p> The <tt>UTF-16</tt> charsets are specified by <a
176 * href="http://www.ietf.org/rfc/rfc2781.txt"><i>RFC 2781</i></a>; the
177 * transformation formats upon which they are based are specified in
178 * Amendment 1 of ISO 10646-1 and are also described in the <a
179 * href="http://www.unicode.org/unicode/standard/standard.html"><i>Unicode
180 * Standard</i></a>.
181 *
182 * <p> The <tt>UTF-16</tt> charsets use sixteen-bit quantities and are
183 * therefore sensitive to byte order. In these encodings the byte order of a
184 * stream may be indicated by an initial <i>byte-order mark</i> represented by
185 * the Unicode character <tt>'\uFEFF'</tt>. Byte-order marks are handled
186 * as follows:
187 *
188 * <ul>
189 *
190 * <li><p> When decoding, the <tt>UTF-16BE</tt> and <tt>UTF-16LE</tt>
191 * charsets ignore byte-order marks; when encoding, they do not write
192 * byte-order marks. </p></li>
193 *
194 * <li><p> When decoding, the <tt>UTF-16</tt> charset interprets a byte-order
195 * mark to indicate the byte order of the stream but defaults to big-endian
196 * if there is no byte-order mark; when encoding, it uses big-endian byte
197 * order and writes a big-endian byte-order mark. </p></li>
198 *
199 * </ul>
200 *
201 * In any case, when a byte-order mark is read at the beginning of a decoding
202 * operation it is omitted from the resulting sequence of characters. Byte
203 * order marks occurring after the first element of an input sequence are not
204 * omitted since the same code is used to represent <small>ZERO-WIDTH
205 * NON-BREAKING SPACE</small>.
206 *
207 * <p> Every instance of the Java virtual machine has a default charset, which
208 * may or may not be one of the standard charsets. The default charset is
209 * determined during virtual-machine startup and typically depends upon the
210 * locale and charset being used by the underlying operating system. </p>
211 *
212 *
213 * <h4>Terminology</h4>
214 *
215 * <p> The name of this class is taken from the terms used in
216 * <a href="http://www.ietf.org/rfc/rfc2278.txt"><i>RFC 2278</i></a>.
217 * In that document a <i>charset</i> is defined as the combination of
218 * one or more coded character sets and a character-encoding scheme.
219 * (This definition is confusing; some other software systems define
220 * <i>charset</i> as a synonym for <i>coded character set</i>.)
221 *
222 * <p> A <i>coded character set</i> is a mapping between a set of abstract
223 * characters and a set of integers. US-ASCII, ISO 8859-1,
224 * JIS X 0201, and Unicode are examples of coded character sets.
225 *
226 * <p> Some standards have defined a <i>character set</i> to be simply a
227 * set of abstract characters without an associated assigned numbering.
228 * An alphabet is an example of such a character set. However, the subtle
229 * distinction between <i>character set</i> and <i>coded character set</i>
230 * is rarely used in practice; the former has become a short form for the
231 * latter, including in the Java API specification.
232 *
233 * <p> A <i>character-encoding scheme</i> is a mapping between one or more
234 * coded character sets and a set of octet (eight-bit byte) sequences.
235 * UTF-8, UTF-16, ISO 2022, and EUC are examples of
236 * character-encoding schemes. Encoding schemes are often associated with
237 * a particular coded character set; UTF-8, for example, is used only to
238 * encode Unicode. Some schemes, however, are associated with multiple
239 * coded character sets; EUC, for example, can be used to encode
240 * characters in a variety of Asian coded character sets.
241 *
242 * <p> When a coded character set is used exclusively with a single
243 * character-encoding scheme then the corresponding charset is usually
244 * named for the coded character set; otherwise a charset is usually named
245 * for the encoding scheme and, possibly, the locale of the coded
246 * character sets that it supports. Hence <tt>US-ASCII</tt> is both the
247 * name of a coded character set and of the charset that encodes it, while
248 * <tt>EUC-JP</tt> is the name of the charset that encodes the
249 * JIS X 0201, JIS X 0208, and JIS X 0212
250 * coded character sets for the Japanese language.
251 *
252 * <p> The native character encoding of the Java programming language is
253 * UTF-16. A charset in the Java platform therefore defines a mapping
254 * between sequences of sixteen-bit UTF-16 code units (that is, sequences
255 * of chars) and sequences of bytes. </p>
256 *
257 *
258 * @author Mark Reinhold
259 * @author JSR-51 Expert Group
260 * @since 1.4
261 *
262 * @see CharsetDecoder
263 * @see CharsetEncoder
264 * @see java.nio.charset.spi.CharsetProvider
265 * @see java.lang.Character
266 */
267
268 public abstract class Charset
269 implements Comparable<Charset>
270 {
271
272 /* -- Static methods -- */
273
274 private static String bugLevel = null;
275
276 static boolean atBugLevel(String bl) { // package-private
277 if (bugLevel == null) {
278 if (!sun.misc.VM.isBooted())
279 return false;
280 bugLevel = AccessController.doPrivileged(
281 new GetPropertyAction("sun.nio.cs.bugLevel"));
282 if (bugLevel == null)
283 bugLevel = "";
284 }
285 return (bugLevel != null) && bugLevel.equals(bl);
286 }
287
288 /**
289 * Checks that the given string is a legal charset name. </p>
290 *
291 * @param s
292 * A purported charset name
293 *
294 * @throws IllegalCharsetNameException
295 * If the given name is not a legal charset name
296 */
297 private static void checkName(String s) {
298 int n = s.length();
299 if (!atBugLevel("1.4")) {
300 if (n == 0)
301 throw new IllegalCharsetNameException(s);
302 }
303 for (int i = 0; i < n; i++) {
304 char c = s.charAt(i);
305 if (c >= 'A' && c <= 'Z') continue;
306 if (c >= 'a' && c <= 'z') continue;
307 if (c >= '0' && c <= '9') continue;
308 if (c == '-' && i != 0) continue;
309 if (c == ':' && i != 0) continue;
310 if (c == '_' && i != 0) continue;
311 if (c == '.' && i != 0) continue;
312 throw new IllegalCharsetNameException(s);
313 }
314 }
315
316 /* The standard set of charsets */
317 private static CharsetProvider standardProvider = new StandardCharsets();
318
319 // Cache of the most-recently-returned charsets,
320 // along with the names that were used to find them
321 //
322 private static volatile Object[] cache1 = null; // "Level 1" cache
323 private static volatile Object[] cache2 = null; // "Level 2" cache
324
325 private static void cache(String charsetName, Charset cs) {
326 cache2 = cache1;
327 cache1 = new Object[] { charsetName, cs };
328 }
329
330 // Creates an iterator that walks over the available providers, ignoring
331 // those whose lookup or instantiation causes a security exception to be
332 // thrown. Should be invoked with full privileges.
333 //
334 private static Iterator providers() {
335 return new Iterator() {
336
337 ClassLoader cl = ClassLoader.getSystemClassLoader();
338 ServiceLoader<CharsetProvider> sl =
339 ServiceLoader.load(CharsetProvider.class, cl);
340 Iterator<CharsetProvider> i = sl.iterator();
341
342 Object next = null;
343
344 private boolean getNext() {
345 while (next == null) {
346 try {
347 if (!i.hasNext())
348 return false;
349 next = i.next();
350 } catch (ServiceConfigurationError sce) {
351 if (sce.getCause() instanceof SecurityException) {
352 // Ignore security exceptions
353 continue;
354 }
355 throw sce;
356 }
357 }
358 return true;
359 }
360
361 public boolean hasNext() {
362 return getNext();
363 }
364
365 public Object next() {
366 if (!getNext())
367 throw new NoSuchElementException();
368 Object n = next;
369 next = null;
370 return n;
371 }
372
373 public void remove() {
374 throw new UnsupportedOperationException();
375 }
376
377 };
378 }
379
380 // Thread-local gate to prevent recursive provider lookups
381 private static ThreadLocal gate = new ThreadLocal();
382
383 private static Charset lookupViaProviders(final String charsetName) {
384
385 // The runtime startup sequence looks up standard charsets as a
386 // consequence of the VM's invocation of System.initializeSystemClass
387 // in order to, e.g., set system properties and encode filenames. At
388 // that point the application class loader has not been initialized,
389 // however, so we can't look for providers because doing so will cause
390 // that loader to be prematurely initialized with incomplete
391 // information.
392 //
393 if (!sun.misc.VM.isBooted())
394 return null;
395
396 if (gate.get() != null)
397 // Avoid recursive provider lookups
398 return null;
399 try {
400 gate.set(gate);
401
402 return AccessController.doPrivileged(
403 new PrivilegedAction<Charset>() {
404 public Charset run() {
405 for (Iterator i = providers(); i.hasNext();) {
406 CharsetProvider cp = (CharsetProvider)i.next();
407 Charset cs = cp.charsetForName(charsetName);
408 if (cs != null)
409 return cs;
410 }
411 return null;
412 }
413 });
414
415 } finally {
416 gate.set(null);
417 }
418 }
419
420 /* The extended set of charsets */
421 private static Object extendedProviderLock = new Object();
422 private static boolean extendedProviderProbed = false;
423 private static CharsetProvider extendedProvider = null;
424
425 private static void probeExtendedProvider() {
426 AccessController.doPrivileged(new PrivilegedAction<Object>() {
427 public Object run() {
428 try {
429 Class epc
430 = Class.forName("sun.nio.cs.ext.ExtendedCharsets");
431 extendedProvider = (CharsetProvider)epc.newInstance();
432 } catch (ClassNotFoundException x) {
433 // Extended charsets not available
434 // (charsets.jar not present)
435 } catch (InstantiationException x) {
436 throw new Error(x);
437 } catch (IllegalAccessException x) {
438 throw new Error(x);
439 }
440 return null;
441 }
442 });
443 }
444
445 private static Charset lookupExtendedCharset(String charsetName) {
446 CharsetProvider ecp = null;
447 synchronized (extendedProviderLock) {
448 if (!extendedProviderProbed) {
449 probeExtendedProvider();
450 extendedProviderProbed = true;
451 }
452 ecp = extendedProvider;
453 }
454 return (ecp != null) ? ecp.charsetForName(charsetName) : null;
455 }
456
457 private static Charset lookup(String charsetName) {
458 if (charsetName == null)
459 throw new IllegalArgumentException("Null charset name");
460
461 Object[] a;
462 if ((a = cache1) != null && charsetName.equals(a[0]))
463 return (Charset)a[1];
464 // We expect most programs to use one Charset repeatedly.
465 // We convey a hint to this effect to the VM by putting the
466 // level 1 cache miss code in a separate method.
467 return lookup2(charsetName);
468 }
469
470 private static Charset lookup2(String charsetName) {
471 Object[] a;
472 if ((a = cache2) != null && charsetName.equals(a[0])) {
473 cache2 = cache1;
474 cache1 = a;
475 return (Charset)a[1];
476 }
477
478 Charset cs;
479 if ((cs = standardProvider.charsetForName(charsetName)) != null ||
480 (cs = lookupExtendedCharset(charsetName)) != null ||
481 (cs = lookupViaProviders(charsetName)) != null)
482 {
483 cache(charsetName, cs);
484 return cs;
485 }
486
487 /* Only need to check the name if we didn't find a charset for it */
488 checkName(charsetName);
489 return null;
490 }
491
492 /**
493 * Tells whether the named charset is supported. </p>
494 *
495 * @param charsetName
496 * The name of the requested charset; may be either
497 * a canonical name or an alias
498 *
499 * @return <tt>true</tt> if, and only if, support for the named charset
500 * is available in the current Java virtual machine
501 *
502 * @throws IllegalCharsetNameException
503 * If the given charset name is illegal
504 *
505 * @throws IllegalArgumentException
506 * If the given <tt>charsetName</tt> is null
507 */
508 public static boolean isSupported(String charsetName) {
509 return (lookup(charsetName) != null);
510 }
511
512 /**
513 * Returns a charset object for the named charset. </p>
514 *
515 * @param charsetName
516 * The name of the requested charset; may be either
517 * a canonical name or an alias
518 *
519 * @return A charset object for the named charset
520 *
521 * @throws IllegalCharsetNameException
522 * If the given charset name is illegal
523 *
524 * @throws IllegalArgumentException
525 * If the given <tt>charsetName</tt> is null
526 *
527 * @throws UnsupportedCharsetException
528 * If no support for the named charset is available
529 * in this instance of the Java virtual machine
530 */
531 public static Charset forName(String charsetName) {
532 Charset cs = lookup(charsetName);
533 if (cs != null)
534 return cs;
535 throw new UnsupportedCharsetException(charsetName);
536 }
537
538 // Fold charsets from the given iterator into the given map, ignoring
539 // charsets whose names already have entries in the map.
540 //
541 private static void put(Iterator i, Map m) {
542 while (i.hasNext()) {
543 Charset cs = (Charset)i.next();
544 if (!m.containsKey(cs.name()))
545 m.put(cs.name(), cs);
546 }
547 }
548
549 /**
550 * Constructs a sorted map from canonical charset names to charset objects.
551 *
552 * <p> The map returned by this method will have one entry for each charset
553 * for which support is available in the current Java virtual machine. If
554 * two or more supported charsets have the same canonical name then the
555 * resulting map will contain just one of them; which one it will contain
556 * is not specified. </p>
557 *
558 * <p> The invocation of this method, and the subsequent use of the
559 * resulting map, may cause time-consuming disk or network I/O operations
560 * to occur. This method is provided for applications that need to
561 * enumerate all of the available charsets, for example to allow user
562 * charset selection. This method is not used by the {@link #forName
563 * forName} method, which instead employs an efficient incremental lookup
564 * algorithm.
565 *
566 * <p> This method may return different results at different times if new
567 * charset providers are dynamically made available to the current Java
568 * virtual machine. In the absence of such changes, the charsets returned
569 * by this method are exactly those that can be retrieved via the {@link
570 * #forName forName} method. </p>
571 *
572 * @return An immutable, case-insensitive map from canonical charset names
573 * to charset objects
574 */
575 public static SortedMap<String,Charset> availableCharsets() {
576 return AccessController.doPrivileged(
577 new PrivilegedAction<SortedMap<String,Charset>>() {
578 public SortedMap<String,Charset> run() {
579 TreeMap<String,Charset> m =
580 new TreeMap<String,Charset>(
581 ASCIICaseInsensitiveComparator.CASE_INSENSITIVE_ORDER);
582 put(standardProvider.charsets(), m);
583 for (Iterator i = providers(); i.hasNext();) {
584 CharsetProvider cp = (CharsetProvider)i.next();
585 put(cp.charsets(), m);
586 }
587 return Collections.unmodifiableSortedMap(m);
588 }
589 });
590 }
591
592 private static volatile Charset defaultCharset;
593
594 /**
595 * Returns the default charset of this Java virtual machine.
596 *
597 * <p> The default charset is determined during virtual-machine startup and
598 * typically depends upon the locale and charset of the underlying
599 * operating system.
600 *
601 * @return A charset object for the default charset
602 *
603 * @since 1.5
604 */
605 public static Charset defaultCharset() {
606 if (defaultCharset == null) {
607 synchronized (Charset.class) {
608 String csn = AccessController.doPrivileged(
609 new GetPropertyAction("file.encoding"));
610 Charset cs = lookup(csn);
611 if (cs != null)
612 defaultCharset = cs;
613 else
614 defaultCharset = forName("UTF-8");
615 }
616 }
617 return defaultCharset;
618 }
619
620
621 /* -- Instance fields and methods -- */
622
private final String name;          // tickles a bug in oldjavac
private final String[] aliases;     // tickles a bug in oldjavac
private Set<String> aliasSet = null;    // built lazily by aliases()

/**
 * Initializes a new charset with the given canonical name and alias
 * set.
 *
 * <p> The alias array is copied before it is validated and stored, so
 * later changes made by the caller to the array that was passed in do not
 * affect this charset.
 *
 * @param  canonicalName
 *         The canonical name of this charset
 *
 * @param  aliases
 *         An array of this charset's aliases, or null if it has no aliases
 *
 * @throws IllegalCharsetNameException
 *         If the canonical name or any of the aliases are illegal
 */
protected Charset(String canonicalName, String[] aliases) {
    checkName(canonicalName);
    // Validate and keep a private copy rather than the caller's array
    String[] as = (aliases == null) ? new String[0] : aliases.clone();
    for (int i = 0; i < as.length; i++)
        checkName(as[i]);
    this.name = canonicalName;
    this.aliases = as;
}
648
649 /**
650 * Returns this charset's canonical name. </p>
651 *
652 * @return The canonical name of this charset
653 */
654 public final String name() {
655 return name;
656 }
657
658 /**
659 * Returns a set containing this charset's aliases. </p>
660 *
661 * @return An immutable set of this charset's aliases
662 */
663 public final Set<String> aliases() {
664 if (aliasSet != null)
665 return aliasSet;
666 int n = aliases.length;
667 HashSet hs = new HashSet(n);
668 for (int i = 0; i < n; i++)
669 hs.add(aliases[i]);
670 aliasSet = Collections.unmodifiableSet(hs);
671 return aliasSet;
672 }
673
674 /**
675 * Returns this charset's human-readable name for the default locale.
676 *
677 * <p> The default implementation of this method simply returns this
678 * charset's canonical name. Concrete subclasses of this class may
679 * override this method in order to provide a localized display name. </p>
680 *
681 * @return The display name of this charset in the default locale
682 */
683 public String displayName() {
684 return name;
685 }
686
687 /**
688 * Tells whether or not this charset is registered in the <a
689 * href="http://www.iana.org/assignments/character-sets">IANA Charset
690 * Registry</a>. </p>
691 *
692 * @return <tt>true</tt> if, and only if, this charset is known by its
693 * implementor to be registered with the IANA
694 */
695 public final boolean isRegistered() {
696 return !name.startsWith("X-") && !name.startsWith("x-");
697 }
698
699 /**
700 * Returns this charset's human-readable name for the given locale.
701 *
702 * <p> The default implementation of this method simply returns this
703 * charset's canonical name. Concrete subclasses of this class may
704 * override this method in order to provide a localized display name. </p>
705 *
706 * @param locale
707 * The locale for which the display name is to be retrieved
708 *
709 * @return The display name of this charset in the given locale
710 */
711 public String displayName(Locale locale) {
712 return name;
713 }
714
715 /**
716 * Tells whether or not this charset contains the given charset.
717 *
718 * <p> A charset <i>C</i> is said to <i>contain</i> a charset <i>D</i> if,
719 * and only if, every character representable in <i>D</i> is also
720 * representable in <i>C</i>. If this relationship holds then it is
721 * guaranteed that every string that can be encoded in <i>D</i> can also be
722 * encoded in <i>C</i> without performing any replacements.
723 *
724 * <p> That <i>C</i> contains <i>D</i> does not imply that each character
725 * representable in <i>C</i> by a particular byte sequence is represented
726 * in <i>D</i> by the same byte sequence, although sometimes this is the
727 * case.
728 *
729 * <p> Every charset contains itself.
730 *
731 * <p> This method computes an approximation of the containment relation:
732 * If it returns <tt>true</tt> then the given charset is known to be
733 * contained by this charset; if it returns <tt>false</tt>, however, then
734 * it is not necessarily the case that the given charset is not contained
735 * in this charset.
736 *
737 * @return <tt>true</tt> if the given charset is contained in this charset
738 */
739 public abstract boolean contains(Charset cs);
740
741 /**
742 * Constructs a new decoder for this charset. </p>
743 *
744 * @return A new decoder for this charset
745 */
746 public abstract CharsetDecoder newDecoder();
747
748 /**
749 * Constructs a new encoder for this charset. </p>
750 *
751 * @return A new encoder for this charset
752 *
753 * @throws UnsupportedOperationException
754 * If this charset does not support encoding
755 */
756 public abstract CharsetEncoder newEncoder();
757
758 /**
759 * Tells whether or not this charset supports encoding.
760 *
761 * <p> Nearly all charsets support encoding. The primary exceptions are
762 * special-purpose <i>auto-detect</i> charsets whose decoders can determine
763 * which of several possible encoding schemes is in use by examining the
764 * input byte sequence. Such charsets do not support encoding because
765 * there is no way to determine which encoding should be used on output.
766 * Implementations of such charsets should override this method to return
767 * <tt>false</tt>. </p>
768 *
769 * @return <tt>true</tt> if, and only if, this charset supports encoding
770 */
771 public boolean canEncode() {
772 return true;
773 }
774
775 /**
776 * Convenience method that decodes bytes in this charset into Unicode
777 * characters.
778 *
779 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the
780 * same result as the expression
781 *
782 * <pre>
783 * cs.newDecoder()
784 * .onMalformedInput(CodingErrorAction.REPLACE)
785 * .onUnmappableCharacter(CodingErrorAction.REPLACE)
786 * .decode(bb); </pre>
787 *
788 * except that it is potentially more efficient because it can cache
789 * decoders between successive invocations.
790 *
791 * <p> This method always replaces malformed-input and unmappable-character
792 * sequences with this charset's default replacement byte array. In order
793 * to detect such sequences, use the {@link
794 * CharsetDecoder#decode(java.nio.ByteBuffer)} method directly. </p>
795 *
796 * @param bb The byte buffer to be decoded
797 *
798 * @return A char buffer containing the decoded characters
799 */
800 public final CharBuffer decode(ByteBuffer bb) {
801 try {
802 return ThreadLocalCoders.decoderFor(this)
803 .onMalformedInput(CodingErrorAction.REPLACE)
804 .onUnmappableCharacter(CodingErrorAction.REPLACE)
805 .decode(bb);
806 } catch (CharacterCodingException x) {
807 throw new Error(x); // Can't happen
808 }
809 }
810
811 /**
812 * Convenience method that encodes Unicode characters into bytes in this
813 * charset.
814 *
815 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the
816 * same result as the expression
817 *
818 * <pre>
819 * cs.newEncoder()
820 * .onMalformedInput(CodingErrorAction.REPLACE)
821 * .onUnmappableCharacter(CodingErrorAction.REPLACE)
822 * .encode(bb); </pre>
823 *
824 * except that it is potentially more efficient because it can cache
825 * encoders between successive invocations.
826 *
827 * <p> This method always replaces malformed-input and unmappable-character
828 * sequences with this charset's default replacement string. In order to
829 * detect such sequences, use the {@link
830 * CharsetEncoder#encode(java.nio.CharBuffer)} method directly. </p>
831 *
832 * @param cb The char buffer to be encoded
833 *
834 * @return A byte buffer containing the encoded characters
835 */
836 public final ByteBuffer encode(CharBuffer cb) {
837 try {
838 return ThreadLocalCoders.encoderFor(this)
839 .onMalformedInput(CodingErrorAction.REPLACE)
840 .onUnmappableCharacter(CodingErrorAction.REPLACE)
841 .encode(cb);
842 } catch (CharacterCodingException x) {
843 throw new Error(x); // Can't happen
844 }
845 }
846
847 /**
848 * Convenience method that encodes a string into bytes in this charset.
849 *
850 * <p> An invocation of this method upon a charset <tt>cs</tt> returns the
851 * same result as the expression
852 *
853 * <pre>
854 * cs.encode(CharBuffer.wrap(s)); </pre>
855 *
856 * @param str The string to be encoded
857 *
858 * @return A byte buffer containing the encoded characters
859 */
860 public final ByteBuffer encode(String str) {
861 return encode(CharBuffer.wrap(str));
862 }
863
864 /**
865 * Compares this charset to another.
866 *
867 * <p> Charsets are ordered by their canonical names, without regard to
868 * case. </p>
869 *
870 * @param that
871 * The charset to which this charset is to be compared
872 *
873 * @return A negative integer, zero, or a positive integer as this charset
874 * is less than, equal to, or greater than the specified charset
875 */
876 public final int compareTo(Charset that) {
877 return (name().compareToIgnoreCase(that.name()));
878 }
879
880 /**
881 * Computes a hashcode for this charset. </p>
882 *
883 * @return An integer hashcode
884 */
885 public final int hashCode() {
886 return name().hashCode();
887 }
888
889 /**
890 * Tells whether or not this object is equal to another.
891 *
892 * <p> Two charsets are equal if, and only if, they have the same canonical
893 * names. A charset is never equal to any other type of object. </p>
894 *
895 * @return <tt>true</tt> if, and only if, this charset is equal to the
896 * given object
897 */
898 public final boolean equals(Object ob) {
899 if (!(ob instanceof Charset))
900 return false;
901 if (this == ob)
902 return true;
903 return name.equals(((Charset)ob).name());
904 }
905
906 /**
907 * Returns a string describing this charset. </p>
908 *
909 * @return A string describing this charset
910 */
911 public final String toString() {
912 return name();
913 }
914
915 }