Save This Page
Home » openjdk-7 » java » net » [javadoc | source]
    1   /*
    2    * Copyright 2000-2006 Sun Microsystems, Inc.  All Rights Reserved.
    3    * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
    4    *
    5    * This code is free software; you can redistribute it and/or modify it
    6    * under the terms of the GNU General Public License version 2 only, as
    7    * published by the Free Software Foundation.  Sun designates this
    8    * particular file as subject to the "Classpath" exception as provided
    9    * by Sun in the LICENSE file that accompanied this code.
   10    *
   11    * This code is distributed in the hope that it will be useful, but WITHOUT
   12    * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   13    * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   14    * version 2 for more details (a copy is included in the LICENSE file that
   15    * accompanied this code).
   16    *
   17    * You should have received a copy of the GNU General Public License version
   18    * 2 along with this work; if not, write to the Free Software Foundation,
   19    * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
   20    *
   21    * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
   22    * CA 95054 USA or visit www.sun.com if you need additional information or
   23    * have any questions.
   24    */
   25   
   26   package java.net;
   27   
   28   import java.io.IOException;
   29   import java.io.InvalidObjectException;
   30   import java.io.ObjectInputStream;
   31   import java.io.ObjectOutputStream;
   32   import java.io.Serializable;
   33   import java.nio.ByteBuffer;
   34   import java.nio.CharBuffer;
   35   import java.nio.charset.CharsetDecoder;
   36   import java.nio.charset.CharsetEncoder;
   37   import java.nio.charset.CoderResult;
   38   import java.nio.charset.CodingErrorAction;
   39   import java.nio.charset.CharacterCodingException;
   40   import java.text.Normalizer;
   41   import sun.nio.cs.ThreadLocalCoders;
   42   
   43   import java.lang.Character;             // for javadoc
   44   import java.lang.NullPointerException;  // for javadoc
   45   
   46   
   47   /**
   48    * Represents a Uniform Resource Identifier (URI) reference.
   49    *
   50    * <p> Aside from some minor deviations noted below, an instance of this
   51    * class represents a URI reference as defined by
   52    * <a href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
   53    * Resource Identifiers (URI): Generic Syntax</i></a>, amended by <a
   54    * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
   55    * Literal IPv6 Addresses in URLs</i></a>. The Literal IPv6 address format
   56    * also supports scope_ids. The syntax and usage of scope_ids is described
   57    * <a href="Inet6Address.html#scoped">here</a>.
   58    * This class provides constructors for creating URI instances from
   59    * their components or by parsing their string forms, methods for accessing the
   60    * various components of an instance, and methods for normalizing, resolving,
   61    * and relativizing URI instances.  Instances of this class are immutable.
   62    *
   63    *
   64    * <h4> URI syntax and components </h4>
   65    *
   66    * At the highest level a URI reference (hereinafter simply "URI") in string
   67    * form has the syntax
   68    *
   69    * <blockquote>
   70    * [<i>scheme</i><tt><b>:</b></tt>]<i>scheme-specific-part</i>[<tt><b>#</b></tt><i>fragment</i>]
   71    * </blockquote>
   72    *
   73    * where square brackets [...] delineate optional components and the characters
   74    * <tt><b>:</b></tt> and <tt><b>#</b></tt> stand for themselves.
   75    *
   76    * <p> An <i>absolute</i> URI specifies a scheme; a URI that is not absolute is
   77    * said to be <i>relative</i>.  URIs are also classified according to whether
   78    * they are <i>opaque</i> or <i>hierarchical</i>.
   79    *
   80    * <p> An <i>opaque</i> URI is an absolute URI whose scheme-specific part does
   81    * not begin with a slash character (<tt>'/'</tt>).  Opaque URIs are not
   82    * subject to further parsing.  Some examples of opaque URIs are:
   83    *
   84    * <blockquote><table cellpadding=0 cellspacing=0 summary="layout">
   85    * <tr><td><tt>mailto:java-net@java.sun.com</tt></td></tr>
   86    * <tr><td><tt>news:comp.lang.java</tt></td></tr>
   87    * <tr><td><tt>urn:isbn:096139210x</tt></td></tr>
   88    * </table></blockquote>
   89    *
   90    * <p> A <i>hierarchical</i> URI is either an absolute URI whose
   91    * scheme-specific part begins with a slash character, or a relative URI, that
   92    * is, a URI that does not specify a scheme.  Some examples of hierarchical
   93    * URIs are:
   94    *
   95    * <blockquote>
   96    * <tt>http://java.sun.com/j2se/1.3/</tt><br>
   97    * <tt>docs/guide/collections/designfaq.html#28</tt><br>
   98    * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java</tt><br>
   99    * <tt>file:///~/calendar</tt>
  100    * </blockquote>
  101    *
  102    * <p> A hierarchical URI is subject to further parsing according to the syntax
  103    *
  104    * <blockquote>
  105    * [<i>scheme</i><tt><b>:</b></tt>][<tt><b>//</b></tt><i>authority</i>][<i>path</i>][<tt><b>?</b></tt><i>query</i>][<tt><b>#</b></tt><i>fragment</i>]
  106    * </blockquote>
  107    *
  108    * where the characters <tt><b>:</b></tt>, <tt><b>/</b></tt>,
  109    * <tt><b>?</b></tt>, and <tt><b>#</b></tt> stand for themselves.  The
  110    * scheme-specific part of a hierarchical URI consists of the characters
  111    * between the scheme and fragment components.
  112    *
  113    * <p> The authority component of a hierarchical URI is, if specified, either
  114    * <i>server-based</i> or <i>registry-based</i>.  A server-based authority
  115    * parses according to the familiar syntax
  116    *
  117    * <blockquote>
  118    * [<i>user-info</i><tt><b>@</b></tt>]<i>host</i>[<tt><b>:</b></tt><i>port</i>]
  119    * </blockquote>
  120    *
  121    * where the characters <tt><b>@</b></tt> and <tt><b>:</b></tt> stand for
  122    * themselves.  Nearly all URI schemes currently in use are server-based.  An
  123    * authority component that does not parse in this way is considered to be
  124    * registry-based.
  125    *
  126    * <p> The path component of a hierarchical URI is itself said to be absolute
  127    * if it begins with a slash character (<tt>'/'</tt>); otherwise it is
  128    * relative.  The path of a hierarchical URI that is either absolute or
  129    * specifies an authority is always absolute.
  130    *
  131    * <p> All told, then, a URI instance has the following nine components:
  132    *
  133    * <blockquote><table summary="Describes the components of a URI:scheme,scheme-specific-part,authority,user-info,host,port,path,query,fragment">
  134    * <tr><th><i>Component</i></th><th><i>Type</i></th></tr>
  135    * <tr><td>scheme</td><td><tt>String</tt></td></tr>
  136    * <tr><td>scheme-specific-part&nbsp;&nbsp;&nbsp;&nbsp;</td><td><tt>String</tt></td></tr>
  137    * <tr><td>authority</td><td><tt>String</tt></td></tr>
  138    * <tr><td>user-info</td><td><tt>String</tt></td></tr>
  139    * <tr><td>host</td><td><tt>String</tt></td></tr>
  140    * <tr><td>port</td><td><tt>int</tt></td></tr>
  141    * <tr><td>path</td><td><tt>String</tt></td></tr>
  142    * <tr><td>query</td><td><tt>String</tt></td></tr>
  143    * <tr><td>fragment</td><td><tt>String</tt></td></tr>
  144    * </table></blockquote>
  145    *
  146    * In a given instance any particular component is either <i>undefined</i> or
  147    * <i>defined</i> with a distinct value.  Undefined string components are
  148    * represented by <tt>null</tt>, while undefined integer components are
  149    * represented by <tt>-1</tt>.  A string component may be defined to have the
  150    * empty string as its value; this is not equivalent to that component being
  151    * undefined.
  152    *
  153    * <p> Whether a particular component is or is not defined in an instance
  154    * depends upon the type of the URI being represented.  An absolute URI has a
  155    * scheme component.  An opaque URI has a scheme, a scheme-specific part, and
  156    * possibly a fragment, but has no other components.  A hierarchical URI always
  157    * has a path (though it may be empty) and a scheme-specific-part (which at
  158    * least contains the path), and may have any of the other components.  If the
  159    * authority component is present and is server-based then the host component
  160    * will be defined and the user-information and port components may be defined.
  161    *
  162    *
  163    * <h4> Operations on URI instances </h4>
  164    *
  165    * The key operations supported by this class are those of
  166    * <i>normalization</i>, <i>resolution</i>, and <i>relativization</i>.
  167    *
  168    * <p> <i>Normalization</i> is the process of removing unnecessary <tt>"."</tt>
  169    * and <tt>".."</tt> segments from the path component of a hierarchical URI.
  170    * Each <tt>"."</tt> segment is simply removed.  A <tt>".."</tt> segment is
  171    * removed only if it is preceded by a non-<tt>".."</tt> segment.
  172    * Normalization has no effect upon opaque URIs.
  173    *
  174    * <p> <i>Resolution</i> is the process of resolving one URI against another,
  175    * <i>base</i> URI.  The resulting URI is constructed from components of both
  176    * URIs in the manner specified by RFC&nbsp;2396, taking components from the
  177    * base URI for those not specified in the original.  For hierarchical URIs,
  178    * the path of the original is resolved against the path of the base and then
  179    * normalized.  The result, for example, of resolving
  180    *
  181    * <blockquote>
  182    * <tt>docs/guide/collections/designfaq.html#28&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt>(1)
  183    * </blockquote>
  184    *
  185    * against the base URI <tt>http://java.sun.com/j2se/1.3/</tt> is the result
  186    * URI
  187    *
  188    * <blockquote>
  189    * <tt>http://java.sun.com/j2se/1.3/docs/guide/collections/designfaq.html#28</tt>
  190    * </blockquote>
  191    *
  192    * Resolving the relative URI
  193    *
  194    * <blockquote>
  195    * <tt>../../../demo/jfc/SwingSet2/src/SwingSet2.java&nbsp;&nbsp;&nbsp;&nbsp;</tt>(2)
  196    * </blockquote>
  197    *
  198    * against this result yields, in turn,
  199    *
  200    * <blockquote>
  201    * <tt>http://java.sun.com/j2se/1.3/demo/jfc/SwingSet2/src/SwingSet2.java</tt>
  202    * </blockquote>
  203    *
  204    * Resolution of both absolute and relative URIs, and of both absolute and
  205    * relative paths in the case of hierarchical URIs, is supported.  Resolving
  206    * the URI <tt>file:///~/calendar</tt> against any other URI simply yields the
  207    * original URI, since it is absolute.  Resolving the relative URI (2) above
  208    * against the relative base URI (1) yields the normalized, but still relative,
  209    * URI
  210    *
  211    * <blockquote>
  212    * <tt>demo/jfc/SwingSet2/src/SwingSet2.java</tt>
  213    * </blockquote>
  214    *
  215    * <p> <i>Relativization</i>, finally, is the inverse of resolution: For any
  216    * two normalized URIs <i>u</i> and&nbsp;<i>v</i>,
  217    *
  218    * <blockquote>
  219    *   <i>u</i><tt>.relativize(</tt><i>u</i><tt>.resolve(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;and<br>
  220    *   <i>u</i><tt>.resolve(</tt><i>u</i><tt>.relativize(</tt><i>v</i><tt>)).equals(</tt><i>v</i><tt>)</tt>&nbsp;&nbsp;.<br>
  221    * </blockquote>
  222    *
  223    * This operation is often useful when constructing a document containing URIs
  224    * that must be made relative to the base URI of the document wherever
  225    * possible.  For example, relativizing the URI
  226    *
  227    * <blockquote>
  228    * <tt>http://java.sun.com/j2se/1.3/docs/guide/index.html</tt>
  229    * </blockquote>
  230    *
  231    * against the base URI
  232    *
  233    * <blockquote>
  234    * <tt>http://java.sun.com/j2se/1.3</tt>
  235    * </blockquote>
  236    *
  237    * yields the relative URI <tt>docs/guide/index.html</tt>.
  238    *
  239    *
  240    * <h4> Character categories </h4>
  241    *
  242    * RFC&nbsp;2396 specifies precisely which characters are permitted in the
  243    * various components of a URI reference.  The following categories, most of
  244    * which are taken from that specification, are used below to describe these
  245    * constraints:
  246    *
  247    * <blockquote><table cellspacing=2 summary="Describes categories alpha,digit,alphanum,unreserved,punct,reserved,escaped,and other">
  248    *   <tr><th valign=top><i>alpha</i></th>
  249    *       <td>The US-ASCII alphabetic characters,
  250    *        <tt>'A'</tt>&nbsp;through&nbsp;<tt>'Z'</tt>
  251    *        and <tt>'a'</tt>&nbsp;through&nbsp;<tt>'z'</tt></td></tr>
  252    *   <tr><th valign=top><i>digit</i></th>
  253    *       <td>The US-ASCII decimal digit characters,
  254    *       <tt>'0'</tt>&nbsp;through&nbsp;<tt>'9'</tt></td></tr>
  255    *   <tr><th valign=top><i>alphanum</i></th>
  256    *       <td>All <i>alpha</i> and <i>digit</i> characters</td></tr>
  257    *   <tr><th valign=top><i>unreserved</i>&nbsp;&nbsp;&nbsp;&nbsp;</th>
  258    *       <td>All <i>alphanum</i> characters together with those in the string
  259    *        <tt>"_-!.~'()*"</tt></td></tr>
  260    *   <tr><th valign=top><i>punct</i></th>
  261    *       <td>The characters in the string <tt>",;:$&+="</tt></td></tr>
  262    *   <tr><th valign=top><i>reserved</i></th>
  263    *       <td>All <i>punct</i> characters together with those in the string
  264    *        <tt>"?/[]@"</tt></td></tr>
  265    *   <tr><th valign=top><i>escaped</i></th>
  266    *       <td>Escaped octets, that is, triplets consisting of the percent
  267    *           character (<tt>'%'</tt>) followed by two hexadecimal digits
  268    *           (<tt>'0'</tt>-<tt>'9'</tt>, <tt>'A'</tt>-<tt>'F'</tt>, and
  269    *           <tt>'a'</tt>-<tt>'f'</tt>)</td></tr>
  270    *   <tr><th valign=top><i>other</i></th>
  271    *       <td>The Unicode characters that are not in the US-ASCII character set,
  272    *           are not control characters (according to the {@link
  273    *           java.lang.Character#isISOControl(char) Character.isISOControl}
  274    *           method), and are not space characters (according to the {@link
  275    *           java.lang.Character#isSpaceChar(char) Character.isSpaceChar}
  276    *           method)&nbsp;&nbsp;<i>(<b>Deviation from RFC 2396</b>, which is
  277    *           limited to US-ASCII)</i></td></tr>
  278    * </table></blockquote>
  279    *
  280    * <p><a name="legal-chars"></a> The set of all legal URI characters consists of
  281    * the <i>unreserved</i>, <i>reserved</i>, <i>escaped</i>, and <i>other</i>
  282    * characters.
  283    *
  284    *
  285    * <h4> Escaped octets, quotation, encoding, and decoding </h4>
  286    *
  287    * RFC 2396 allows escaped octets to appear in the user-info, path, query, and
  288    * fragment components.  Escaping serves two purposes in URIs:
  289    *
  290    * <ul>
  291    *
  292    *   <li><p> To <i>encode</i> non-US-ASCII characters when a URI is required to
  293    *   conform strictly to RFC&nbsp;2396 by not containing any <i>other</i>
  294    *   characters.  </p></li>
  295    *
  296    *   <li><p> To <i>quote</i> characters that are otherwise illegal in a
  297    *   component.  The user-info, path, query, and fragment components differ
  298    *   slightly in terms of which characters are considered legal and illegal.
  299    *   </p></li>
  300    *
  301    * </ul>
  302    *
  303    * These purposes are served in this class by three related operations:
  304    *
  305    * <ul>
  306    *
  307    *   <li><p><a name="encode"></a> A character is <i>encoded</i> by replacing it
  308    *   with the sequence of escaped octets that represent that character in the
  309    *   UTF-8 character set.  The Euro currency symbol (<tt>'&#92;u20AC'</tt>),
  310    *   for example, is encoded as <tt>"%E2%82%AC"</tt>.  <i>(<b>Deviation from
  311    *   RFC&nbsp;2396</b>, which does not specify any particular character
  312    *   set.)</i> </p></li>
  313    *
  314    *   <li><p><a name="quote"></a> An illegal character is <i>quoted</i> simply by
  315    *   encoding it.  The space character, for example, is quoted by replacing it
  316    *   with <tt>"%20"</tt>.  UTF-8 contains US-ASCII, hence for US-ASCII
  317    *   characters this transformation has exactly the effect required by
  318    *   RFC&nbsp;2396. </p></li>
  319    *
  320    *   <li><p><a name="decode"></a>
  321    *   A sequence of escaped octets is <i>decoded</i> by
  322    *   replacing it with the sequence of characters that it represents in the
  323    *   UTF-8 character set.  UTF-8 contains US-ASCII, hence decoding has the
  324    *   effect of de-quoting any quoted US-ASCII characters as well as that of
  325    *   decoding any encoded non-US-ASCII characters.  If a <a
  326    *   href="../nio/charset/CharsetDecoder.html#ce">decoding error</a> occurs
  327    *   when decoding the escaped octets then the erroneous octets are replaced by
  328    *   <tt>'&#92;uFFFD'</tt>, the Unicode replacement character.  </p></li>
  329    *
  330    * </ul>
  331    *
  332    * These operations are exposed in the constructors and methods of this class
  333    * as follows:
  334    *
  335    * <ul>
  336    *
  337    *   <li><p> The {@link #URI(java.lang.String) <code>single-argument
  338    *   constructor</code>} requires any illegal characters in its argument to be
  339    *   quoted and preserves any escaped octets and <i>other</i> characters that
  340    *   are present.  </p></li>
  341    *
  342    *   <li><p> The {@link
  343    *   #URI(java.lang.String,java.lang.String,java.lang.String,int,java.lang.String,java.lang.String,java.lang.String)
  344    *   <code>multi-argument constructors</code>} quote illegal characters as
  345    *   required by the components in which they appear.  The percent character
  346    *   (<tt>'%'</tt>) is always quoted by these constructors.  Any <i>other</i>
  347    *   characters are preserved.  </p></li>
  348    *
  349    *   <li><p> The {@link #getRawUserInfo() getRawUserInfo}, {@link #getRawPath()
  350    *   getRawPath}, {@link #getRawQuery() getRawQuery}, {@link #getRawFragment()
  351    *   getRawFragment}, {@link #getRawAuthority() getRawAuthority}, and {@link
  352    *   #getRawSchemeSpecificPart() getRawSchemeSpecificPart} methods return the
  353    *   values of their corresponding components in raw form, without interpreting
  354    *   any escaped octets.  The strings returned by these methods may contain
  355    *   both escaped octets and <i>other</i> characters, and will not contain any
  356    *   illegal characters.  </p></li>
  357    *
  358    *   <li><p> The {@link #getUserInfo() getUserInfo}, {@link #getPath()
  359    *   getPath}, {@link #getQuery() getQuery}, {@link #getFragment()
  360    *   getFragment}, {@link #getAuthority() getAuthority}, and {@link
  361    *   #getSchemeSpecificPart() getSchemeSpecificPart} methods decode any escaped
  362    *   octets in their corresponding components.  The strings returned by these
  363    *   methods may contain both <i>other</i> characters and illegal characters,
  364    *   and will not contain any escaped octets.  </p></li>
  365    *
  366    *   <li><p> The {@link #toString() toString} method returns a URI string with
  367    *   all necessary quotation but which may contain <i>other</i> characters.
  368    *   </p></li>
  369    *
  370    *   <li><p> The {@link #toASCIIString() toASCIIString} method returns a fully
  371    *   quoted and encoded URI string that does not contain any <i>other</i>
  372    *   characters.  </p></li>
  373    *
  374    * </ul>
  375    *
  376    *
  377    * <h4> Identities </h4>
  378    *
  379    * For any URI <i>u</i>, it is always the case that
  380    *
  381    * <blockquote>
  382    * <tt>new URI(</tt><i>u</i><tt>.toString()).equals(</tt><i>u</i><tt>)</tt>&nbsp;.
  383    * </blockquote>
  384    *
  385    * For any URI <i>u</i> that does not contain redundant syntax such as two
  386    * slashes before an empty authority (as in <tt>file:///tmp/</tt>&nbsp;) or a
  387    * colon following a host name but no port (as in
  388    * <tt>http://java.sun.com:</tt>&nbsp;), and that does not encode characters
  389    * except those that must be quoted, the following identities also hold:
  390    *
  391    * <blockquote>
  392    * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
  393    * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getSchemeSpecificPart(),<br>
  394    * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
  395    * .equals(</tt><i>u</i><tt>)</tt>
  396    * </blockquote>
  397    *
  398    * in all cases,
  399    *
  400    * <blockquote>
  401    * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
  402    * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getUserInfo(),&nbsp;</tt><i>u</i><tt>.getAuthority(),<br>
  403    * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>
  404    * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
  405    * .equals(</tt><i>u</i><tt>)</tt>
  406    * </blockquote>
  407    *
  408    * if <i>u</i> is hierarchical, and
  409    *
  410    * <blockquote>
  411    * <tt>new URI(</tt><i>u</i><tt>.getScheme(),<br>
  412    * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getUserInfo(),&nbsp;</tt><i>u</i><tt>.getHost(),&nbsp;</tt><i>u</i><tt>.getPort(),<br>
  413    * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getPath(),&nbsp;</tt><i>u</i><tt>.getQuery(),<br>
  414    * &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</tt><i>u</i><tt>.getFragment())<br>
  415    * .equals(</tt><i>u</i><tt>)</tt>
  416    * </blockquote>
  417    *
  418    * if <i>u</i> is hierarchical and has either no authority or a server-based
  419    * authority.
  420    *
  421    *
  422    * <h4> URIs, URLs, and URNs </h4>
  423    *
  424    * A URI is a uniform resource <i>identifier</i> while a URL is a uniform
  425    * resource <i>locator</i>.  Hence every URL is a URI, abstractly speaking, but
  426    * not every URI is a URL.  This is because there is another subcategory of
  427    * URIs, uniform resource <i>names</i> (URNs), which name resources but do not
  428    * specify how to locate them.  The <tt>mailto</tt>, <tt>news</tt>, and
  429    * <tt>isbn</tt> URIs shown above are examples of URNs.
  430    *
  431    * <p> The conceptual distinction between URIs and URLs is reflected in the
  432    * differences between this class and the {@link URL} class.
  433    *
  434    * <p> An instance of this class represents a URI reference in the syntactic
  435    * sense defined by RFC&nbsp;2396.  A URI may be either absolute or relative.
  436    * A URI string is parsed according to the generic syntax without regard to the
  437    * scheme, if any, that it specifies.  No lookup of the host, if any, is
  438    * performed, and no scheme-dependent stream handler is constructed.  Equality,
  439    * hashing, and comparison are defined strictly in terms of the character
  440    * content of the instance.  In other words, a URI instance is little more than
  441    * a structured string that supports the syntactic, scheme-independent
  442    * operations of comparison, normalization, resolution, and relativization.
  443    *
  444    * <p> An instance of the {@link URL} class, by contrast, represents the
  445    * syntactic components of a URL together with some of the information required
  446    * to access the resource that it describes.  A URL must be absolute, that is,
  447    * it must always specify a scheme.  A URL string is parsed according to its
  448    * scheme.  A stream handler is always established for a URL, and in fact it is
  449    * impossible to create a URL instance for a scheme for which no handler is
  450    * available.  Equality and hashing depend upon both the scheme and the
  451    * Internet address of the host, if any; comparison is not defined.  In other
  452    * words, a URL is a structured string that supports the syntactic operation of
  453    * resolution as well as the network I/O operations of looking up the host and
  454    * opening a connection to the specified resource.
  455    *
  456    *
  457    * @author Mark Reinhold
  458    * @since 1.4
  459    *
  460    * @see <a href="http://ietf.org/rfc/rfc2279.txt"><i>RFC&nbsp;2279: UTF-8, a
  461    * transformation format of ISO 10646</i></a>, <br><a
  462    * href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6 Addressing
  463    * Architecture</i></a>, <br><a
  464    * href="http://www.ietf.org/rfc/rfc2396.txt"><i>RFC&nbsp;2396: Uniform
  465    * Resource Identifiers (URI): Generic Syntax</i></a>, <br><a
  466    * href="http://www.ietf.org/rfc/rfc2732.txt"><i>RFC&nbsp;2732: Format for
  467    * Literal IPv6 Addresses in URLs</i></a>, <br><a
  468    * href="URISyntaxException.html">URISyntaxException</a>
  469    */
  470   
  471   public final class URI
  472       implements Comparable<URI>, Serializable
  473   {
  474   
  475       // Note: Comments containing the word "ASSERT" indicate places where a
  476       // throw of an InternalError should be replaced by an appropriate assertion
  477       // statement once asserts are enabled in the build.
  478   
  479       static final long serialVersionUID = -6052424284110960213L;   // Explicit serial version for this Serializable class
  480   
  481   
  482       // -- Properties and components of this instance --
  483   
  484       // Components of all URIs: [<scheme>:]<scheme-specific-part>[#<fragment>]
  485       private transient String scheme;            // null ==> relative URI
  486       private transient String fragment;          // null ==> undefined (no fragment)
  487   
  488       // Hierarchical URI components: [//<authority>]<path>[?<query>]
  489       private transient String authority;         // Registry or server
  490   
  491       // Server-based authority: [<userInfo>@]<host>[:<port>]
  492       private transient String userInfo;          // null ==> undefined
  493       private transient String host;              // null ==> registry-based
  494       private transient int port = -1;            // -1 ==> undefined
  495   
  496       // Remaining components of hierarchical URIs
  497       private transient String path;              // null ==> opaque
  498       private transient String query;             // null ==> undefined
  499   
  500       // The remaining fields may be computed on demand
  501   
  502       private volatile transient String schemeSpecificPart;
  503       private volatile transient int hash;        // Zero ==> undefined
  504   
  505       private volatile transient String decodedUserInfo = null;   // NOTE(review): the decoded* fields presumably cache
  506       private volatile transient String decodedAuthority = null;  // the decoded form of the matching component, with
  507       private volatile transient String decodedPath = null;       // null meaning "not computed yet" -- confirm in the
  508       private volatile transient String decodedQuery = null;      // getters, which are outside this view
  509       private volatile transient String decodedFragment = null;
  510       private volatile transient String decodedSchemeSpecificPart = null;
  511   
  512       /**
  513        * The string form of this URI.
  514        *
  515        * @serial
  516        */
  517       private volatile String string;             // The only serializable field
  518   
  519   
  518   
  519   
  520   
  521       // -- Constructors and factories --
  522   
  523       private URI() { }                           // Used internally; sets nothing, so string components stay null and port stays -1
  524   
  525       /**
  526        * Constructs a URI by parsing the given string.
  527        *
  528        * <p> This constructor parses the given string exactly as specified by the
  529        * grammar in <a
  530        * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
  531        * Appendix&nbsp;A, <b><i>except for the following deviations:</i></b> </p>
  532        *
  533        * <ul type=disc>
  534        *
  535        *   <li><p> An empty authority component is permitted as long as it is
  536        *   followed by a non-empty path, a query component, or a fragment
  537        *   component.  This allows the parsing of URIs such as
  538        *   <tt>"file:///foo/bar"</tt>, which seems to be the intent of
  539        *   RFC&nbsp;2396 although the grammar does not permit it.  If the
  540        *   authority component is empty then the user-information, host, and port
  541        *   components are undefined. </p></li>
  542        *
  543        *   <li><p> Empty relative paths are permitted; this seems to be the
  544        *   intent of RFC&nbsp;2396 although the grammar does not permit it.  The
  545        *   primary consequence of this deviation is that a standalone fragment
  546        *   such as <tt>"#foo"</tt> parses as a relative URI with an empty path
  547        *   and the given fragment, and can be usefully <a
  548        *   href="#resolve-frag">resolved</a> against a base URI. </p></li>
  549        *
  550        *   <li><p> IPv4 addresses in host components are parsed rigorously, as
  551        *   specified by <a
  552        *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>: Each
  553        *   element of a dotted-quad address must contain no more than three
  554        *   decimal digits.  Each element is further constrained to have a value
  555        *   no greater than 255. </p></li>
  556        *
  557        *   <li> <p> Hostnames in host components that comprise only a single
  558        *   domain label are permitted to start with an <i>alphanum</i>
  559        *   character. This seems to be the intent of <a
  560        *   href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
  561        *   section&nbsp;3.2.2 although the grammar does not permit it. The
  562        *   consequence of this deviation is that the authority component of a
  563        *   hierarchical URI such as <tt>s://123</tt>, will parse as a server-based
  564        *   authority. </p></li>
  565        *
  566        *   <li><p> IPv6 addresses are permitted for the host component.  An IPv6
  567        *   address must be enclosed in square brackets (<tt>'['</tt> and
  568        *   <tt>']'</tt>) as specified by <a
  569        *   href="http://www.ietf.org/rfc/rfc2732.txt">RFC&nbsp;2732</a>.  The
  570        *   IPv6 address itself must parse according to <a
  571        *   href="http://www.ietf.org/rfc/rfc2373.txt">RFC&nbsp;2373</a>.  IPv6
  572        *   addresses are further constrained to describe no more than sixteen
  573        *   bytes of address information, a constraint implicit in RFC&nbsp;2373
  574        *   but not expressible in the grammar. </p></li>
  575        *
  576        *   <li><p> Characters in the <i>other</i> category are permitted wherever
  577        *   RFC&nbsp;2396 permits <i>escaped</i> octets, that is, in the
  578        *   user-information, path, query, and fragment components, as well as in
  579        *   the authority component if the authority is registry-based.  This
  580        *   allows URIs to contain Unicode characters beyond those in the US-ASCII
  581        *   character set. </p></li>
  582        *
  583        * </ul>
  584        *
  585        * @param  str   The string to be parsed into a URI
  586        *
  587        * @throws  NullPointerException
  588        *          If <tt>str</tt> is <tt>null</tt>
  589        *
  590        * @throws  URISyntaxException
  591        *          If the given string violates RFC&nbsp;2396, as augmented
  592        *          by the above deviations
  593        */
  594       public URI(String str) throws URISyntaxException {
                // The Parser object itself is discarded; it is run only for its effect
                // (presumably filling in this URI's component fields -- Parser is
                // declared outside this chunk).  The false flag presumably means "do
                // not insist on a server-based authority", since parseServerAuthority()
                // re-parses with true; confirm against Parser.parse.
  595           new Parser(str).parse(false);
  596       }
  597   
  598       /**
  599        * Constructs a hierarchical URI from the given components.
  600        *
  601        * <p> If a scheme is given then the path, if also given, must either be
  602        * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a
  603        * component of the new URI may be left undefined by passing <tt>null</tt>
  604        * for the corresponding parameter or, in the case of the <tt>port</tt>
  605        * parameter, by passing <tt>-1</tt>.
  606        *
  607        * <p> This constructor first builds a URI string from the given components
  608        * according to the rules specified in <a
  609        * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
  610        * section&nbsp;5.2, step&nbsp;7: </p>
  611        *
  612        * <ol>
  613        *
  614        *   <li><p> Initially, the result string is empty. </p></li>
  615        *
  616        *   <li><p> If a scheme is given then it is appended to the result,
  617        *   followed by a colon character (<tt>':'</tt>).  </p></li>
  618        *
  619        *   <li><p> If user information, a host, or a port are given then the
  620        *   string <tt>"//"</tt> is appended.  </p></li>
  621        *
  622        *   <li><p> If user information is given then it is appended, followed by
  623        *   a commercial-at character (<tt>'@'</tt>).  Any character not in the
  624        *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
  625        *   categories is <a href="#quote">quoted</a>.  </p></li>
  626        *
  627        *   <li><p> If a host is given then it is appended.  If the host is a
  628        *   literal IPv6 address but is not enclosed in square brackets
  629        *   (<tt>'['</tt> and <tt>']'</tt>) then the square brackets are added.
  630        *   </p></li>
  631        *
  632        *   <li><p> If a port number is given then a colon character
  633        *   (<tt>':'</tt>) is appended, followed by the port number in decimal.
  634        *   </p></li>
  635        *
  636        *   <li><p> If a path is given then it is appended.  Any character not in
  637        *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
  638        *   categories, and not equal to the slash character (<tt>'/'</tt>) or the
  639        *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>
  640        *
  641        *   <li><p> If a query is given then a question-mark character
  642        *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that
  643        *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
  644        *   </p></li>
  645        *
  646        *   <li><p> Finally, if a fragment is given then a hash character
  647        *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character
  648        *   that is not a legal URI character is quoted.  </p></li>
  649        *
  650        * </ol>
  651        *
  652        * <p> The resulting URI string is then parsed as if by invoking the {@link
  653        * #URI(String)} constructor and then invoking the {@link
  654        * #parseServerAuthority()} method upon the result; this may cause a {@link
  655        * URISyntaxException} to be thrown.  </p>
  656        *
  657        * @param   scheme    Scheme name
  658        * @param   userInfo  User name and authorization information
  659        * @param   host      Host name
  660        * @param   port      Port number
  661        * @param   path      Path
  662        * @param   query     Query
  663        * @param   fragment  Fragment
  664        *
  665        * @throws URISyntaxException
  666        *         If both a scheme and a path are given but the path is relative,
  667        *         if the URI string constructed from the given components violates
  668        *         RFC&nbsp;2396, or if the authority component of the string is
  669        *         present but cannot be parsed as a server-based authority
  670        */
  671       public URI(String scheme,
  672                  String userInfo, String host, int port,
  673                  String path, String query, String fragment)
  674           throws URISyntaxException
  675       {
  676           String s = toString(scheme, null,
  677                               null, userInfo, host, port,
  678                               path, query, fragment);
  679           checkPath(s, scheme, path);
  680           new Parser(s).parse(true);
  681       }
  682   
  683       /**
  684        * Constructs a hierarchical URI from the given components.
  685        *
  686        * <p> If a scheme is given then the path, if also given, must either be
  687        * empty or begin with a slash character (<tt>'/'</tt>).  Otherwise a
  688        * component of the new URI may be left undefined by passing <tt>null</tt>
  689        * for the corresponding parameter.
  690        *
  691        * <p> This constructor first builds a URI string from the given components
  692        * according to the rules specified in <a
  693        * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
  694        * section&nbsp;5.2, step&nbsp;7: </p>
  695        *
  696        * <ol>
  697        *
  698        *   <li><p> Initially, the result string is empty.  </p></li>
  699        *
  700        *   <li><p> If a scheme is given then it is appended to the result,
  701        *   followed by a colon character (<tt>':'</tt>).  </p></li>
  702        *
  703        *   <li><p> If an authority is given then the string <tt>"//"</tt> is
  704        *   appended, followed by the authority.  If the authority contains a
  705        *   literal IPv6 address then the address must be enclosed in square
  706        *   brackets (<tt>'['</tt> and <tt>']'</tt>).  Any character not in the
  707        *   <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
  708        *   categories, and not equal to the commercial-at character
  709        *   (<tt>'@'</tt>), is <a href="#quote">quoted</a>.  </p></li>
  710        *
  711        *   <li><p> If a path is given then it is appended.  Any character not in
  712        *   the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, or <i>other</i>
  713        *   categories, and not equal to the slash character (<tt>'/'</tt>) or the
  714        *   commercial-at character (<tt>'@'</tt>), is quoted.  </p></li>
  715        *
  716        *   <li><p> If a query is given then a question-mark character
  717        *   (<tt>'?'</tt>) is appended, followed by the query.  Any character that
  718        *   is not a <a href="#legal-chars">legal URI character</a> is quoted.
  719        *   </p></li>
  720        *
  721        *   <li><p> Finally, if a fragment is given then a hash character
  722        *   (<tt>'#'</tt>) is appended, followed by the fragment.  Any character
  723        *   that is not a legal URI character is quoted.  </p></li>
  724        *
  725        * </ol>
  726        *
  727        * <p> The resulting URI string is then parsed as if by invoking the {@link
  728        * #URI(String)} constructor and then invoking the {@link
  729        * #parseServerAuthority()} method upon the result; this may cause a {@link
  730        * URISyntaxException} to be thrown.  </p>
  731        *
  732        * @param   scheme     Scheme name
  733        * @param   authority  Authority
  734        * @param   path       Path
  735        * @param   query      Query
  736        * @param   fragment   Fragment
  737        *
  738        * @throws URISyntaxException
  739        *         If both a scheme and a path are given but the path is relative,
  740        *         if the URI string constructed from the given components violates
  741        *         RFC&nbsp;2396, or if the authority component of the string is
  742        *         present but cannot be parsed as a server-based authority
  743        */
  744       public URI(String scheme,
  745                  String authority,
  746                  String path, String query, String fragment)
  747           throws URISyntaxException
  748       {
  749           String s = toString(scheme, null,
  750                               authority, null, null, -1,
  751                               path, query, fragment);
  752           checkPath(s, scheme, path);
  753           new Parser(s).parse(false);
  754       }
  755   
  756       /**
  757        * Constructs a hierarchical URI from the given components.
  758        *
  759        * <p> A component may be left undefined by passing <tt>null</tt>.
  760        *
  761        * <p> This convenience constructor works as if by invoking the
  762        * seven-argument constructor as follows:
  763        *
  764        * <blockquote><tt>
  765        * new&nbsp;{@link #URI(String, String, String, int, String, String, String)
  766        * URI}(scheme,&nbsp;null,&nbsp;host,&nbsp;-1,&nbsp;path,&nbsp;null,&nbsp;fragment);
  767        * </tt></blockquote>
  768        *
  769        * @param   scheme    Scheme name
  770        * @param   host      Host name
  771        * @param   path      Path
  772        * @param   fragment  Fragment
  773        *
  774        * @throws  URISyntaxException
  775        *          If the URI string constructed from the given components
  776        *          violates RFC&nbsp;2396
  777        */
  778       public URI(String scheme, String host, String path, String fragment)
  779           throws URISyntaxException
  780       {
                // Pure delegation to the seven-argument constructor: user information
                // and query are left undefined (null) and the port undefined (-1).
  781           this(scheme, null, host, -1, path, null, fragment);
  782       }
  783   
  784       /**
  785        * Constructs a URI from the given components.
  786        *
  787        * <p> A component may be left undefined by passing <tt>null</tt>.
  788        *
  789        * <p> This constructor first builds a URI in string form using the given
  790        * components as follows:  </p>
  791        *
  792        * <ol>
  793        *
  794        *   <li><p> Initially, the result string is empty.  </p></li>
  795        *
  796        *   <li><p> If a scheme is given then it is appended to the result,
  797        *   followed by a colon character (<tt>':'</tt>).  </p></li>
  798        *
  799        *   <li><p> If a scheme-specific part is given then it is appended.  Any
  800        *   character that is not a <a href="#legal-chars">legal URI character</a>
  801        *   is <a href="#quote">quoted</a>.  </p></li>
  802        *
  803        *   <li><p> Finally, if a fragment is given then a hash character
  804        *   (<tt>'#'</tt>) is appended to the string, followed by the fragment.
  805        *   Any character that is not a legal URI character is quoted.  </p></li>
  806        *
  807        * </ol>
  808        *
  809        * <p> The resulting URI string is then parsed in order to create the new
  810        * URI instance as if by invoking the {@link #URI(String)} constructor;
  811        * this may cause a {@link URISyntaxException} to be thrown.  </p>
  812        *
  813        * @param   scheme    Scheme name
  814        * @param   ssp       Scheme-specific part
  815        * @param   fragment  Fragment
  816        *
  817        * @throws  URISyntaxException
  818        *          If the URI string constructed from the given components
  819        *          violates RFC&nbsp;2396
  820        */
  821       public URI(String scheme, String ssp, String fragment)
  822           throws URISyntaxException
  823       {
  824           new Parser(toString(scheme, ssp,
  825                               null, null, null, -1,
  826                               null, null, fragment))
  827               .parse(false);
  828       }
  829   
  830       /**
  831        * Creates a URI by parsing the given string.
  832        *
  833        * <p> This convenience factory method works as if by invoking the {@link
  834        * #URI(String)} constructor; any {@link URISyntaxException} thrown by the
  835        * constructor is caught and wrapped in a new {@link
  836        * IllegalArgumentException} object, which is then thrown.
  837        *
  838        * <p> This method is provided for use in situations where it is known that
  839        * the given string is a legal URI, for example for URI constants declared
  840        * within a program, and so it would be considered a programming error
  841        * for the string not to parse as such.  The constructors, which throw
  842        * {@link URISyntaxException} directly, should be used in situations where a
  843        * URI is being constructed from user input or from some other source that
  844        * may be prone to errors.  </p>
  845        *
  846        * @param  str   The string to be parsed into a URI
  847        * @return The new URI
  848        *
  849        * @throws  NullPointerException
  850        *          If <tt>str</tt> is <tt>null</tt>
  851        *
  852        * @throws  IllegalArgumentException
  853        *          If the given string violates RFC&nbsp;2396
  854        */
  855       public static URI create(String str) {
  856           try {
  857               return new URI(str);
  858           } catch (URISyntaxException x) {
  859               // Carry the parser's diagnostic into the unchecked exception so that
  860               // getMessage() says why the string was rejected; x stays as the cause.
  861               throw new IllegalArgumentException(x.getMessage(), x);
  862           }
  863       }
  864   
  865   
  866       // -- Operations --
  867   
  868       /**
  869        * Attempts to parse this URI's authority component, if defined, into
  870        * user-information, host, and port components.
  871        *
  872        * <p> If this URI's authority component has already been recognized as
  873        * being server-based then it will already have been parsed into
  874        * user-information, host, and port components.  In this case, or if this
  875        * URI has no authority component, this method simply returns this URI.
  876        *
  877        * <p> Otherwise this method attempts once more to parse the authority
  878        * component into user-information, host, and port components, and throws
  879        * an exception describing why the authority component could not be parsed
  880        * in that way.
  881        *
  882        * <p> This method is provided because the generic URI syntax specified in
  883        * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>
  884        * cannot always distinguish a malformed server-based authority from a
  885        * legitimate registry-based authority.  It must therefore treat some
  886        * instances of the former as instances of the latter.  The authority
  887        * component in the URI string <tt>"//foo:bar"</tt>, for example, is not a
  888        * legal server-based authority but it is legal as a registry-based
  889        * authority.
  890        *
  891        * <p> In many common situations, for example when working with URIs that are
  892        * known to be either URNs or URLs, the hierarchical URIs being used will
  893        * always be server-based.  They therefore must either be parsed as such or
  894        * treated as an error.  In these cases a statement such as
  895        *
  896        * <blockquote>
  897        * <tt>URI </tt><i>u</i><tt> = new URI(str).parseServerAuthority();</tt>
  898        * </blockquote>
  899        *
  900        * <p> can be used to ensure that <i>u</i> always refers to a URI that, if
  901        * it has an authority component, has a server-based authority with proper
  902        * user-information, host, and port components.  Invoking this method also
  903        * ensures that if the authority could not be parsed in that way then an
  904        * appropriate diagnostic message can be issued based upon the exception
  905        * that is thrown. </p>
  906        *
  907        * @return  A URI whose authority field has been parsed
  908        *          as a server-based authority
  909        *
  910        * @throws  URISyntaxException
  911        *          If the authority component of this URI is defined
  912        *          but cannot be parsed as a server-based authority
  913        *          according to RFC&nbsp;2396
  914        */
  915       public URI parseServerAuthority()
  916           throws URISyntaxException
  917       {
  918           // Failure details of the first parse are not cached (that would need more
  919           // fields or a more obscure representation); the string is just re-parsed.
  920           final boolean settled = (host != null) || (authority == null);
  921           if (!settled) {
  922               defineString();
  923               new Parser(string).parse(true);
  924           }
  925           return this;
  926       }
  927   
  928       /**
  929        * Normalizes this URI's path.
  930        *
  931        * <p> If this URI is opaque, or if its path is already in normal form,
  932        * then this URI is returned.  Otherwise a new URI is constructed that is
  933        * identical to this URI except that its path is computed by normalizing
  934        * this URI's path in a manner consistent with <a
  935        * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
  936        * section&nbsp;5.2, step&nbsp;6, sub-steps&nbsp;c through&nbsp;f; that is:
  937        * </p>
  938        *
  939        * <ol>
  940        *
  941        *   <li><p> All <tt>"."</tt> segments are removed. </p></li>
  942        *
  943        *   <li><p> If a <tt>".."</tt> segment is preceded by a non-<tt>".."</tt>
  944        *   segment then both of these segments are removed.  This step is
  945        *   repeated until it is no longer applicable. </p></li>
  946        *
  947        *   <li><p> If the path is relative, and if its first segment contains a
  948        *   colon character (<tt>':'</tt>), then a <tt>"."</tt> segment is
  949        *   prepended.  This prevents a relative URI with a path such as
  950        *   <tt>"a:b/c/d"</tt> from later being re-parsed as an opaque URI with a
  951        *   scheme of <tt>"a"</tt> and a scheme-specific part of <tt>"b/c/d"</tt>.
  952        *   <b><i>(Deviation from RFC&nbsp;2396)</i></b> </p></li>
  953        *
  954        * </ol>
  955        *
  956        * <p> A normalized path will begin with one or more <tt>".."</tt> segments
  957        * if there were insufficient non-<tt>".."</tt> segments preceding them to
  958        * allow their removal.  A normalized path will begin with a <tt>"."</tt>
  959        * segment if one was inserted by step 3 above.  Otherwise, a normalized
  960        * path will not contain any <tt>"."</tt> or <tt>".."</tt> segments. </p>
  961        *
  962        * @return  A URI equivalent to this URI,
  963        *          but whose path is in normal form
  964        */
  965       public URI normalize() {
                // Forwards to the one-argument normalize overload (declared outside
                // this chunk), passing this URI as the subject.
  966           return normalize(this);
  967       }
  968   
  969       /**
  970        * Resolves the given URI against this URI.
  971        *
  972        * <p> If the given URI is already absolute, or if this URI is opaque, then
  973        * the given URI is returned.
  974        *
  975        * <p><a name="resolve-frag"></a> If the given URI's fragment component is
  976        * defined, its path component is empty, and its scheme, authority, and
  977        * query components are undefined, then a URI with the given fragment but
  978        * with all other components equal to those of this URI is returned.  This
  979        * allows a URI representing a standalone fragment reference, such as
  980        * <tt>"#foo"</tt>, to be usefully resolved against a base URI.
  981        *
  982        * <p> Otherwise this method constructs a new hierarchical URI in a manner
  983        * consistent with <a
  984        * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
  985        * section&nbsp;5.2; that is: </p>
  986        *
  987        * <ol>
  988        *
  989        *   <li><p> A new URI is constructed with this URI's scheme and the given
  990        *   URI's query and fragment components. </p></li>
  991        *
  992        *   <li><p> If the given URI has an authority component then the new URI's
  993        *   authority and path are taken from the given URI. </p></li>
  994        *
  995        *   <li><p> Otherwise the new URI's authority component is copied from
  996        *   this URI, and its path is computed as follows: </p></li>
  997        *
  998        *   <ol type=a>
  999        *
 1000        *     <li><p> If the given URI's path is absolute then the new URI's path
 1001        *     is taken from the given URI. </p></li>
 1002        *
 1003        *     <li><p> Otherwise the given URI's path is relative, and so the new
 1004        *     URI's path is computed by resolving the path of the given URI
 1005        *     against the path of this URI.  This is done by concatenating all but
 1006        *     the last segment of this URI's path, if any, with the given URI's
 1007        *     path and then normalizing the result as if by invoking the {@link
 1008        *     #normalize() normalize} method. </p></li>
 1009        *
 1010        *   </ol>
 1011        *
 1012        * </ol>
 1013        *
 1014        * <p> The result of this method is absolute if, and only if, either this
 1015        * URI is absolute or the given URI is absolute.  </p>
 1016        *
 1017        * @param  uri  The URI to be resolved against this URI
 1018        * @return The resulting URI
 1019        *
 1020        * @throws  NullPointerException
 1021        *          If <tt>uri</tt> is <tt>null</tt>
 1022        */
 1023       public URI resolve(URI uri) {
                // Forwards to the two-argument resolve overload (declared outside this
                // chunk) with this URI first and the given URI second.
 1024           return resolve(this, uri);
 1025       }
 1026   
 1027       /**
 1028        * Constructs a new URI by parsing the given string and then resolving it
 1029        * against this URI.
 1030        *
 1031        * <p> This convenience method works as if invoking it were equivalent to
 1032        * evaluating the expression <tt>{@link #resolve(java.net.URI)
 1033        * resolve}(URI.{@link #create(String) create}(str))</tt>. </p>
 1034        *
 1035        * @param  str   The string to be parsed into a URI
 1036        * @return The resulting URI
 1037        *
 1038        * @throws  NullPointerException
 1039        *          If <tt>str</tt> is <tt>null</tt>
 1040        *
 1041        * @throws  IllegalArgumentException
 1042        *          If the given string violates RFC&nbsp;2396
 1043        */
 1044       public URI resolve(String str) {
                // URI.create reports a malformed string as an unchecked
                // IllegalArgumentException, so no checked exception escapes from here.
 1045           return resolve(URI.create(str));
 1046       }
 1047   
 1048       /**
 1049        * Relativizes the given URI against this URI.
 1050        *
 1051        * <p> The relativization of the given URI against this URI is computed as
 1052        * follows: </p>
 1053        *
 1054        * <ol>
 1055        *
 1056        *   <li><p> If either this URI or the given URI is opaque, or if the
 1057        *   scheme and authority components of the two URIs are not identical, or
 1058        *   if the path of this URI is not a prefix of the path of the given URI,
 1059        *   then the given URI is returned. </p></li>
 1060        *
 1061        *   <li><p> Otherwise a new relative hierarchical URI is constructed with
 1062        *   query and fragment components taken from the given URI and with a path
 1063        *   component computed by removing this URI's path from the beginning of
 1064        *   the given URI's path. </p></li>
 1065        *
 1066        * </ol>
 1067        *
 1068        * @param  uri  The URI to be relativized against this URI
 1069        * @return The resulting URI
 1070        *
 1071        * @throws  NullPointerException
 1072        *          If <tt>uri</tt> is <tt>null</tt>
 1073        */
 1074       public URI relativize(URI uri) {
                // Forwards to the two-argument relativize overload (declared outside
                // this chunk) with this URI first and the given URI second.
 1075           return relativize(this, uri);
 1076       }
 1077   
 1078       /**
 1079        * Constructs a URL from this URI.
 1080        *
 1081        * <p> This convenience method works as if invoking it were equivalent to
 1082        * evaluating the expression <tt>new&nbsp;URL(this.toString())</tt> after
 1083        * first checking that this URI is absolute. </p>
 1084        *
 1085        * @return  A URL constructed from this URI
 1086        *
 1087        * @throws  IllegalArgumentException
 1088        *          If this URI is not absolute
 1089        *
 1090        * @throws  MalformedURLException
 1091        *          If a protocol handler for the URL could not be found,
 1092        *          or if some other error occurred while constructing the URL
 1093        */
 1094       public URL toURL()
 1095           throws MalformedURLException {
 1096           if (isAbsolute())
 1097               return new URL(toString());  // a scheme is present: convert via string form
 1098           throw new IllegalArgumentException("URI is not absolute");
 1099       }
 1100   
 1101       // -- Component access methods --
 1102   
 1103       /**
 1104        * Returns the scheme component of this URI.
 1105        *
 1106        * <p> The scheme component of a URI, if defined, only contains characters
 1107        * in the <i>alphanum</i> category and in the string <tt>"-.+"</tt>.  A
 1108        * scheme always starts with an <i>alpha</i> character. <p>
 1109        *
 1110        * The scheme component of a URI cannot contain escaped octets, hence this
 1111        * method does not perform any decoding.
 1112        *
 1113        * @return  The scheme component of this URI,
 1114        *          or <tt>null</tt> if the scheme is undefined
 1115        */
 1116       public String getScheme() {
                // Returned exactly as stored; no decoding step is applied.
 1117           return scheme;
 1118       }
 1119   
 1120       /**
 1121        * Tells whether or not this URI is absolute.
 1122        *
 1123        * <p> A URI is absolute if, and only if, it has a scheme component. </p>
 1124        *
 1125        * @return  <tt>true</tt> if, and only if, this URI is absolute
 1126        */
 1127       public boolean isAbsolute() {
                // Absolute simply means that a scheme component is present.
 1128           return scheme != null;
 1129       }
 1130   
 1131       /**
 1132        * Tells whether or not this URI is opaque.
 1133        *
 1134        * <p> A URI is opaque if, and only if, it is absolute and its
 1135        * scheme-specific part does not begin with a slash character ('/').
 1136        * An opaque URI has a scheme, a scheme-specific part, and possibly
 1137        * a fragment; all other components are undefined. </p>
 1138        *
 1139        * @return  <tt>true</tt> if, and only if, this URI is opaque
 1140        */
 1141       public boolean isOpaque() {
                // NOTE(review): relies on the path field being null exactly when the
                // URI is opaque -- presumably guaranteed by the parser; confirm.
 1142           return path == null;
 1143       }
 1144   
 1145       /**
 1146        * Returns the raw scheme-specific part of this URI.  The scheme-specific
 1147        * part is never undefined, though it may be empty.
 1148        *
 1149        * <p> The scheme-specific part of a URI only contains legal URI
 1150        * characters. </p>
 1151        *
 1152        * @return  The raw scheme-specific part of this URI
 1153        *          (never <tt>null</tt>)
 1154        */
 1155       public String getRawSchemeSpecificPart() {
                // defineSchemeSpecificPart() is declared outside this chunk; presumably
                // it fills in schemeSpecificPart on demand so that the field read below
                // is never null -- confirm against its definition.
 1156           defineSchemeSpecificPart();
 1157           return schemeSpecificPart;
 1158       }
 1159   
 1160       /**
 1161        * Returns the decoded scheme-specific part of this URI.
 1162        *
 1163        * <p> The string returned by this method is equal to that returned by the
 1164        * {@link #getRawSchemeSpecificPart() getRawSchemeSpecificPart} method
 1165        * except that all sequences of escaped octets are <a
 1166        * href="#decode">decoded</a>.  </p>
 1167        *
 1168        * @return  The decoded scheme-specific part of this URI
 1169        *          (never <tt>null</tt>)
 1170        */
 1171       public String getSchemeSpecificPart() {
 1172           if (decodedSchemeSpecificPart != null)
 1173               return decodedSchemeSpecificPart;  // decoded on an earlier call
 1174           return decodedSchemeSpecificPart = decode(getRawSchemeSpecificPart());
 1175       }
 1176   
 1177       /**
 1178        * Returns the raw authority component of this URI.
 1179        *
 1180        * <p> The authority component of a URI, if defined, only contains the
 1181        * commercial-at character (<tt>'@'</tt>) and characters in the
 1182        * <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and <i>other</i>
 1183        * categories.  If the authority is server-based then it is further
 1184        * constrained to have valid user-information, host, and port
 1185        * components. </p>
 1186        *
 1187        * @return  The raw authority component of this URI,
 1188        *          or <tt>null</tt> if the authority is undefined
 1189        */
 1190       public String getRawAuthority() {
                // Returned exactly as stored, without decoding.
 1191           return authority;
 1192       }
 1193   
 1194       /**
 1195        * Returns the decoded authority component of this URI.
 1196        *
 1197        * <p> The string returned by this method is equal to that returned by the
 1198        * {@link #getRawAuthority() getRawAuthority} method except that all
 1199        * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
 1200        *
 1201        * @return  The decoded authority component of this URI,
 1202        *          or <tt>null</tt> if the authority is undefined
 1203        */
 1204       public String getAuthority() {
 1205           if (decodedAuthority != null)
 1206               return decodedAuthority;  // decoded on an earlier call
 1207           return decodedAuthority = decode(authority);
 1208       }
 1209   
 1210       /**
 1211        * Returns the raw user-information component of this URI.
 1212        *
 1213        * <p> The user-information component of a URI, if defined, only contains
 1214        * characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>, and
 1215        * <i>other</i> categories. </p>
 1216        *
 1217        * @return  The raw user-information component of this URI,
 1218        *          or <tt>null</tt> if the user information is undefined
 1219        */
 1220       public String getRawUserInfo() {
                // Returned exactly as stored, without decoding.
 1221           return userInfo;
 1222       }
 1223   
 1224       /**
 1225        * Returns the decoded user-information component of this URI.
 1226        *
 1227        * <p> The string returned by this method is equal to that returned by the
 1228        * {@link #getRawUserInfo() getRawUserInfo} method except that all
 1229        * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
 1230        *
 1231        * @return  The decoded user-information component of this URI,
 1232        *          or <tt>null</tt> if the user information is undefined
 1233        */
 1234       public String getUserInfo() {
 1235           if ((decodedUserInfo != null) || (userInfo == null))
 1236               return decodedUserInfo;  // already decoded, or nothing to decode
 1237           return decodedUserInfo = decode(userInfo);
 1238       }
 1239   
 1240       /**
 1241        * Returns the host component of this URI.
 1242        *
 1243        * <p> The host component of a URI, if defined, will have one of the
 1244        * following forms: </p>
 1245        *
 1246        * <ul type=disc>
 1247        *
 1248        *   <li><p> A domain name consisting of one or more <i>labels</i>
 1249        *   separated by period characters (<tt>'.'</tt>), optionally followed by
 1250        *   a period character.  Each label consists of <i>alphanum</i> characters
 1251        *   as well as hyphen characters (<tt>'-'</tt>), though hyphens never
 1252        *   occur as the first or last characters in a label. The rightmost
 1253        *   label of a domain name consisting of two or more labels, begins
 1254        *   with an <i>alpha</i> character. </li>
 1255        *
 1256        *   <li><p> A dotted-quad IPv4 address of the form
 1257        *   <i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+.</tt><i>digit</i><tt>+</tt>,
 1258        *   where no <i>digit</i> sequence is longer than three characters and no
 1259        *   sequence has a value larger than 255. </p></li>
 1260        *
 1261        *   <li><p> An IPv6 address enclosed in square brackets (<tt>'['</tt> and
 1262        *   <tt>']'</tt>) and consisting of hexadecimal digits, colon characters
 1263        *   (<tt>':'</tt>), and possibly an embedded IPv4 address.  The full
 1264        *   syntax of IPv6 addresses is specified in <a
 1265        *   href="http://www.ietf.org/rfc/rfc2373.txt"><i>RFC&nbsp;2373: IPv6
 1266        *   Addressing Architecture</i></a>.  </p></li>
 1267        *
 1268        * </ul>
 1269        *
 1270        * The host component of a URI cannot contain escaped octets, hence this
 1271        * method does not perform any decoding.
 1272        *
 1273        * @return  The host component of this URI,
 1274        *          or <tt>null</tt> if the host is undefined
 1275        */
 1276       public String getHost() {
 1277           return host;
 1278       }
 1279   
 1280       /**
 1281        * Returns the port number of this URI.
 1282        *
 1283        * <p> The port component of a URI, if defined, is a non-negative
 1284        * integer. </p>
 1285        *
 1286        * @return  The port component of this URI,
 1287        *          or <tt>-1</tt> if the port is undefined
 1288        */
 1289       public int getPort() {
 1290           return port;
 1291       }
 1292   
 1293       /**
 1294        * Returns the raw path component of this URI.
 1295        *
 1296        * <p> The path component of a URI, if defined, only contains the slash
 1297        * character (<tt>'/'</tt>), the commercial-at character (<tt>'@'</tt>),
 1298        * and characters in the <i>unreserved</i>, <i>punct</i>, <i>escaped</i>,
 1299        * and <i>other</i> categories. </p>
 1300        *
 1301        * @return  The path component of this URI,
 1302        *          or <tt>null</tt> if the path is undefined
 1303        */
 1304       public String getRawPath() {
 1305           return path;
 1306       }
 1307   
 1308       /**
 1309        * Returns the decoded path component of this URI.
 1310        *
 1311        * <p> The string returned by this method is equal to that returned by the
 1312        * {@link #getRawPath() getRawPath} method except that all sequences of
 1313        * escaped octets are <a href="#decode">decoded</a>.  </p>
 1314        *
 1315        * @return  The decoded path component of this URI,
 1316        *          or <tt>null</tt> if the path is undefined
 1317        */
 1318       public String getPath() {
 1319           if ((decodedPath == null) && (path != null))
 1320               decodedPath = decode(path);
 1321           return decodedPath;
 1322       }
 1323   
 1324       /**
 1325        * Returns the raw query component of this URI.
 1326        *
 1327        * <p> The query component of a URI, if defined, only contains legal URI
 1328        * characters. </p>
 1329        *
 1330        * @return  The raw query component of this URI,
 1331        *          or <tt>null</tt> if the query is undefined
 1332        */
 1333       public String getRawQuery() {
 1334           return query;
 1335       }
 1336   
 1337       /**
 1338        * Returns the decoded query component of this URI.
 1339        *
 1340        * <p> The string returned by this method is equal to that returned by the
 1341        * {@link #getRawQuery() getRawQuery} method except that all sequences of
 1342        * escaped octets are <a href="#decode">decoded</a>.  </p>
 1343        *
 1344        * @return  The decoded query component of this URI,
 1345        *          or <tt>null</tt> if the query is undefined
 1346        */
 1347       public String getQuery() {
 1348           if ((decodedQuery == null) && (query != null))
 1349               decodedQuery = decode(query);
 1350           return decodedQuery;
 1351       }
 1352   
 1353       /**
 1354        * Returns the raw fragment component of this URI.
 1355        *
 1356        * <p> The fragment component of a URI, if defined, only contains legal URI
 1357        * characters. </p>
 1358        *
 1359        * @return  The raw fragment component of this URI,
 1360        *          or <tt>null</tt> if the fragment is undefined
 1361        */
 1362       public String getRawFragment() {
 1363           return fragment;
 1364       }
 1365   
 1366       /**
 1367        * Returns the decoded fragment component of this URI.
 1368        *
 1369        * <p> The string returned by this method is equal to that returned by the
 1370        * {@link #getRawFragment() getRawFragment} method except that all
 1371        * sequences of escaped octets are <a href="#decode">decoded</a>.  </p>
 1372        *
 1373        * @return  The decoded fragment component of this URI,
 1374        *          or <tt>null</tt> if the fragment is undefined
 1375        */
 1376       public String getFragment() {
 1377           if ((decodedFragment == null) && (fragment != null))
 1378               decodedFragment = decode(fragment);
 1379           return decodedFragment;
 1380       }
 1381   
 1382   
 1383       // -- Equality, comparison, hash code, toString, and serialization --
 1384   
 1385       /**
 1386        * Tests this URI for equality with another object.
 1387        *
 1388        * <p> If the given object is not a URI then this method immediately
 1389        * returns <tt>false</tt>.
 1390        *
 1391        * <p> For two URIs to be considered equal requires that either both are
 1392        * opaque or both are hierarchical.  Their schemes must either both be
 1393        * undefined or else be equal without regard to case. Their fragments
 1394        * must either both be undefined or else be equal.
 1395        *
 1396        * <p> For two opaque URIs to be considered equal, their scheme-specific
 1397        * parts must be equal.
 1398        *
 1399        * <p> For two hierarchical URIs to be considered equal, their paths must
 1400        * be equal and their queries must either both be undefined or else be
 1401        * equal.  Their authorities must either both be undefined, or both be
 1402        * registry-based, or both be server-based.  If their authorities are
 1403        * defined and are registry-based, then they must be equal.  If their
 1404        * authorities are defined and are server-based, then their hosts must be
 1405        * equal without regard to case, their port numbers must be equal, and
 1406        * their user-information components must be equal.
 1407        *
 1408        * <p> When testing the user-information, path, query, fragment, authority,
 1409        * or scheme-specific parts of two URIs for equality, the raw forms rather
 1410        * than the encoded forms of these components are compared and the
 1411        * hexadecimal digits of escaped octets are compared without regard to
 1412        * case.
 1413        *
 1414        * <p> This method satisfies the general contract of the {@link
 1415        * java.lang.Object#equals(Object) Object.equals} method. </p>
 1416        *
 1417        * @param   ob   The object to which this object is to be compared
 1418        *
 1419        * @return  <tt>true</tt> if, and only if, the given object is a URI that
 1420        *          is identical to this URI
 1421        */
 1422       public boolean equals(Object ob) {
 1423           if (ob == this)
 1424               return true;
 1425           if (!(ob instanceof URI))
 1426               return false;
 1427           URI that = (URI)ob;
 1428           if (this.isOpaque() != that.isOpaque()) return false;
 1429           if (!equalIgnoringCase(this.scheme, that.scheme)) return false;
 1430           if (!equal(this.fragment, that.fragment)) return false;
 1431   
 1432           // Opaque
 1433           if (this.isOpaque())
 1434               return equal(this.schemeSpecificPart, that.schemeSpecificPart);
 1435   
 1436           // Hierarchical
 1437           if (!equal(this.path, that.path)) return false;
 1438           if (!equal(this.query, that.query)) return false;
 1439   
 1440           // Authorities
 1441           if (this.authority == that.authority) return true;
 1442           if (this.host != null) {
 1443               // Server-based
 1444               if (!equal(this.userInfo, that.userInfo)) return false;
 1445               if (!equalIgnoringCase(this.host, that.host)) return false;
 1446               if (this.port != that.port) return false;
 1447           } else if (this.authority != null) {
 1448               // Registry-based
 1449               if (!equal(this.authority, that.authority)) return false;
 1450           } else if (this.authority != that.authority) {
 1451               return false;
 1452           }
 1453   
 1454           return true;
 1455       }
 1456   
 1457       /**
 1458        * Returns a hash-code value for this URI.  The hash code is based upon all
 1459        * of the URI's components, and satisfies the general contract of the
 1460        * {@link java.lang.Object#hashCode() Object.hashCode} method.
 1461        *
 1462        * @return  A hash-code value for this URI
 1463        */
 1464       public int hashCode() {
 1465           if (hash != 0)
 1466               return hash;
 1467           int h = hashIgnoringCase(0, scheme);
 1468           h = hash(h, fragment);
 1469           if (isOpaque()) {
 1470               h = hash(h, schemeSpecificPart);
 1471           } else {
 1472               h = hash(h, path);
 1473               h = hash(h, query);
 1474               if (host != null) {
 1475                   h = hash(h, userInfo);
 1476                   h = hashIgnoringCase(h, host);
 1477                   h += 1949 * port;
 1478               } else {
 1479                   h = hash(h, authority);
 1480               }
 1481           }
 1482           hash = h;
 1483           return h;
 1484       }
 1485   
 1486       /**
 1487        * Compares this URI to another object, which must be a URI.
 1488        *
 1489        * <p> When comparing corresponding components of two URIs, if one
 1490        * component is undefined but the other is defined then the first is
 1491        * considered to be less than the second.  Unless otherwise noted, string
 1492        * components are ordered according to their natural, case-sensitive
 1493        * ordering as defined by the {@link java.lang.String#compareTo(Object)
 1494        * String.compareTo} method.  String components that are subject to
 1495        * encoding are compared by comparing their raw forms rather than their
 1496        * encoded forms.
 1497        *
 1498        * <p> The ordering of URIs is defined as follows: </p>
 1499        *
 1500        * <ul type=disc>
 1501        *
 1502        *   <li><p> Two URIs with different schemes are ordered according to the
 1503        *   ordering of their schemes, without regard to case. </p></li>
 1504        *
 1505        *   <li><p> A hierarchical URI is considered to be less than an opaque URI
 1506        *   with an identical scheme. </p></li>
 1507        *
 1508        *   <li><p> Two opaque URIs with identical schemes are ordered according
 1509        *   to the ordering of their scheme-specific parts. </p></li>
 1510        *
 1511        *   <li><p> Two opaque URIs with identical schemes and scheme-specific
 1512        *   parts are ordered according to the ordering of their
 1513        *   fragments. </p></li>
 1514        *
 1515        *   <li><p> Two hierarchical URIs with identical schemes are ordered
 1516        *   according to the ordering of their authority components: </p></li>
 1517        *
 1518        *   <ul type=disc>
 1519        *
 1520        *     <li><p> If both authority components are server-based then the URIs
 1521        *     are ordered according to their user-information components; if these
 1522        *     components are identical then the URIs are ordered according to the
 1523        *     ordering of their hosts, without regard to case; if the hosts are
 1524        *     identical then the URIs are ordered according to the ordering of
 1525        *     their ports. </p></li>
 1526        *
 1527        *     <li><p> If one or both authority components are registry-based then
 1528        *     the URIs are ordered according to the ordering of their authority
 1529        *     components. </p></li>
 1530        *
 1531        *   </ul>
 1532        *
 1533        *   <li><p> Finally, two hierarchical URIs with identical schemes and
 1534        *   authority components are ordered according to the ordering of their
 1535        *   paths; if their paths are identical then they are ordered according to
 1536        *   the ordering of their queries; if the queries are identical then they
 1537        *   are ordered according to the order of their fragments. </p></li>
 1538        *
 1539        * </ul>
 1540        *
 1541        * <p> This method satisfies the general contract of the {@link
 1542        * java.lang.Comparable#compareTo(Object) Comparable.compareTo}
 1543        * method. </p>
 1544        *
 1545        * @param   that
 1546        *          The object to which this URI is to be compared
 1547        *
 1548        * @return  A negative integer, zero, or a positive integer as this URI is
 1549        *          less than, equal to, or greater than the given URI
 1550        *
 1551        * @throws  ClassCastException
 1552        *          If the given object is not a URI
 1553        */
 1554       public int compareTo(URI that) {
 1555           int c;
 1556   
 1557           if ((c = compareIgnoringCase(this.scheme, that.scheme)) != 0)
 1558               return c;
 1559   
 1560           if (this.isOpaque()) {
 1561               if (that.isOpaque()) {
 1562                   // Both opaque
 1563                   if ((c = compare(this.schemeSpecificPart,
 1564                                    that.schemeSpecificPart)) != 0)
 1565                       return c;
 1566                   return compare(this.fragment, that.fragment);
 1567               }
 1568               return +1;                  // Opaque > hierarchical
 1569           } else if (that.isOpaque()) {
 1570               return -1;                  // Hierarchical < opaque
 1571           }
 1572   
 1573           // Hierarchical
 1574           if ((this.host != null) && (that.host != null)) {
 1575               // Both server-based
 1576               if ((c = compare(this.userInfo, that.userInfo)) != 0)
 1577                   return c;
 1578               if ((c = compareIgnoringCase(this.host, that.host)) != 0)
 1579                   return c;
 1580               if ((c = this.port - that.port) != 0)
 1581                   return c;
 1582           } else {
 1583               // If one or both authorities are registry-based then we simply
 1584               // compare them in the usual, case-sensitive way.  If one is
 1585               // registry-based and one is server-based then the strings are
 1586               // guaranteed to be unequal, hence the comparison will never return
 1587               // zero and the compareTo and equals methods will remain
 1588               // consistent.
 1589               if ((c = compare(this.authority, that.authority)) != 0) return c;
 1590           }
 1591   
 1592           if ((c = compare(this.path, that.path)) != 0) return c;
 1593           if ((c = compare(this.query, that.query)) != 0) return c;
 1594           return compare(this.fragment, that.fragment);
 1595       }
 1596   
 1597       /**
 1598        * Returns the content of this URI as a string.
 1599        *
 1600        * <p> If this URI was created by invoking one of the constructors in this
 1601        * class then a string equivalent to the original input string, or to the
 1602        * string computed from the originally-given components, as appropriate, is
 1603        * returned.  Otherwise this URI was created by normalization, resolution,
 1604        * or relativization, and so a string is constructed from this URI's
 1605        * components according to the rules specified in <a
 1606        * href="http://www.ietf.org/rfc/rfc2396.txt">RFC&nbsp;2396</a>,
 1607        * section&nbsp;5.2, step&nbsp;7. </p>
 1608        *
 1609        * @return  The string form of this URI
 1610        */
 1611       public String toString() {
 1612           defineString();
 1613           return string;
 1614       }
 1615   
 1616       /**
 1617        * Returns the content of this URI as a US-ASCII string.
 1618        *
 1619        * <p> If this URI does not contain any characters in the <i>other</i>
 1620        * category then an invocation of this method will return the same value as
 1621        * an invocation of the {@link #toString() toString} method.  Otherwise
 1622        * this method works as if by invoking that method and then <a
 1623        * href="#encode">encoding</a> the result.  </p>
 1624        *
 1625        * @return  The string form of this URI, encoded as needed
 1626        *          so that it only contains characters in the US-ASCII
 1627        *          charset
 1628        */
 1629       public String toASCIIString() {
 1630           defineString();
 1631           return encode(string);
 1632       }
 1633   
 1634   
 1635       // -- Serialization support --
 1636   
 1637       /**
 1638        * Saves the content of this URI to the given serial stream.
 1639        *
 1640        * <p> The only serializable field of a URI instance is its <tt>string</tt>
 1641        * field.  That field is given a value, if it does not have one already,
 1642        * and then the {@link java.io.ObjectOutputStream#defaultWriteObject()}
 1643        * method of the given object-output stream is invoked. </p>
 1644        *
 1645        * @param  os  The object-output stream to which this object
 1646        *             is to be written
 1647        */
 1648       private void writeObject(ObjectOutputStream os)
 1649           throws IOException
 1650       {
 1651           defineString();
 1652           os.defaultWriteObject();        // Writes the string field only
 1653       }
 1654   
 1655       /**
 1656        * Reconstitutes a URI from the given serial stream.
 1657        *
 1658        * <p> The {@link java.io.ObjectInputStream#defaultReadObject()} method is
 1659        * invoked to read the value of the <tt>string</tt> field.  The result is
 1660        * then parsed in the usual way.
 1661        *
 1662        * @param  is  The object-input stream from which this object
 1663        *             is being read
 1664        */
 1665       private void readObject(ObjectInputStream is)
 1666           throws ClassNotFoundException, IOException
 1667       {
 1668           port = -1;                      // Argh
 1669           is.defaultReadObject();
 1670           try {
 1671               new Parser(string).parse(false);
 1672           } catch (URISyntaxException x) {
 1673               IOException y = new InvalidObjectException("Invalid URI");
 1674               y.initCause(x);
 1675               throw y;
 1676           }
 1677       }
 1678   
 1679   
 1680       // -- End of public methods --
 1681   
 1682   
 1683       // -- Utility methods for string-field comparison and hashing --
 1684   
 1685       // These methods return appropriate values for null string arguments,
 1686       // thereby simplifying the equals, hashCode, and compareTo methods.
 1687       //
 1688       // The case-ignoring methods should only be applied to strings whose
 1689       // characters are all known to be US-ASCII.  Because of this restriction,
 1690       // these methods are faster than the similar methods in the String class.
 1691   
 1692       // US-ASCII only
 1693       private static int toLower(char c) {
 1694           if ((c >= 'A') && (c <= 'Z'))
 1695               return c + ('a' - 'A');
 1696           return c;
 1697       }
 1698   
 1699       private static boolean equal(String s, String t) {
 1700           if (s == t) return true;
 1701           if ((s != null) && (t != null)) {
 1702               if (s.length() != t.length())
 1703                   return false;
 1704               if (s.indexOf('%') < 0)
 1705                   return s.equals(t);
 1706               int n = s.length();
 1707               for (int i = 0; i < n;) {
 1708                   char c = s.charAt(i);
 1709                   char d = t.charAt(i);
 1710                   if (c != '%') {
 1711                       if (c != d)
 1712                           return false;
 1713                       i++;
 1714                       continue;
 1715                   }
 1716                   i++;
 1717                   if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
 1718                       return false;
 1719                   i++;
 1720                   if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
 1721                       return false;
 1722                   i++;
 1723               }
 1724               return true;
 1725           }
 1726           return false;
 1727       }
 1728   
 1729       // US-ASCII only
 1730       private static boolean equalIgnoringCase(String s, String t) {
 1731           if (s == t) return true;
 1732           if ((s != null) && (t != null)) {
 1733               int n = s.length();
 1734               if (t.length() != n)
 1735                   return false;
 1736               for (int i = 0; i < n; i++) {
 1737                   if (toLower(s.charAt(i)) != toLower(t.charAt(i)))
 1738                       return false;
 1739               }
 1740               return true;
 1741           }
 1742           return false;
 1743       }
 1744   
 1745       private static int hash(int hash, String s) {
 1746           if (s == null) return hash;
 1747           return hash * 127 + s.hashCode();
 1748       }
 1749   
 1750       // US-ASCII only
 1751       private static int hashIgnoringCase(int hash, String s) {
 1752           if (s == null) return hash;
 1753           int h = hash;
 1754           int n = s.length();
 1755           for (int i = 0; i < n; i++)
 1756               h = 31 * h + toLower(s.charAt(i));
 1757           return h;
 1758       }
 1759   
 1760       private static int compare(String s, String t) {
 1761           if (s == t) return 0;
 1762           if (s != null) {
 1763               if (t != null)
 1764                   return s.compareTo(t);
 1765               else
 1766                   return +1;
 1767           } else {
 1768               return -1;
 1769           }
 1770       }
 1771   
 1772       // US-ASCII only
 1773       private static int compareIgnoringCase(String s, String t) {
 1774           if (s == t) return 0;
 1775           if (s != null) {
 1776               if (t != null) {
 1777                   int sn = s.length();
 1778                   int tn = t.length();
 1779                   int n = sn < tn ? sn : tn;
 1780                   for (int i = 0; i < n; i++) {
 1781                       int c = toLower(s.charAt(i)) - toLower(t.charAt(i));
 1782                       if (c != 0)
 1783                           return c;
 1784                   }
 1785                   return sn - tn;
 1786               }
 1787               return +1;
 1788           } else {
 1789               return -1;
 1790           }
 1791       }
 1792   
 1793   
 1794       // -- String construction --
 1795   
 1796       // If a scheme is given then the path, if given, must be absolute
 1797       //
 1798       private static void checkPath(String s, String scheme, String path)
 1799           throws URISyntaxException
 1800       {
 1801           if (scheme != null) {
 1802               if ((path != null)
 1803                   && ((path.length() > 0) && (path.charAt(0) != '/')))
 1804                   throw new URISyntaxException(s,
 1805                                                "Relative path in absolute URI");
 1806           }
 1807       }
 1808   
 1809       private void appendAuthority(StringBuffer sb,
 1810                                    String authority,
 1811                                    String userInfo,
 1812                                    String host,
 1813                                    int port)
 1814       {
 1815           if (host != null) {
 1816               sb.append("//");
 1817               if (userInfo != null) {
 1818                   sb.append(quote(userInfo, L_USERINFO, H_USERINFO));
 1819                   sb.append('@');
 1820               }
 1821               boolean needBrackets = ((host.indexOf(':') >= 0)
 1822                                       && !host.startsWith("[")
 1823                                       && !host.endsWith("]"));
 1824               if (needBrackets) sb.append('[');
 1825               sb.append(host);
 1826               if (needBrackets) sb.append(']');
 1827               if (port != -1) {
 1828                   sb.append(':');
 1829                   sb.append(port);
 1830               }
 1831           } else if (authority != null) {
 1832               sb.append("//");
 1833               if (authority.startsWith("[")) {
 1834                   int end = authority.indexOf("]");
 1835                   if (end != -1 && authority.indexOf(":")!=-1) {
 1836                       String doquote, dontquote;
 1837                       if (end == authority.length()) {
 1838                           dontquote = authority;
 1839                           doquote = "";
 1840                       } else {
 1841                           dontquote = authority.substring(0,end+1);
 1842                           doquote = authority.substring(end+1);
 1843                       }
 1844                       sb.append (dontquote);
 1845                       sb.append(quote(doquote,
 1846                               L_REG_NAME | L_SERVER,
 1847                               H_REG_NAME | H_SERVER));
 1848                   }
 1849               } else {
 1850                   sb.append(quote(authority,
 1851                               L_REG_NAME | L_SERVER,
 1852                               H_REG_NAME | H_SERVER));
 1853               }
 1854           }
 1855       }
 1856   
 1857       private void appendSchemeSpecificPart(StringBuffer sb,
 1858                                             String opaquePart,
 1859                                             String authority,
 1860                                             String userInfo,
 1861                                             String host,
 1862                                             int port,
 1863                                             String path,
 1864                                             String query)
 1865       {
 1866           if (opaquePart != null) {
 1867               /* check if SSP begins with an IPv6 address
 1868                * because we must not quote a literal IPv6 address
 1869                */
 1870               if (opaquePart.startsWith("//[")) {
 1871                   int end =  opaquePart.indexOf("]");
 1872                   if (end != -1 && opaquePart.indexOf(":")!=-1) {
 1873                       String doquote, dontquote;
 1874                       if (end == opaquePart.length()) {
 1875                           dontquote = opaquePart;
 1876                           doquote = "";
 1877                       } else {
 1878                           dontquote = opaquePart.substring(0,end+1);
 1879                           doquote = opaquePart.substring(end+1);
 1880                       }
 1881                       sb.append (dontquote);
 1882                       sb.append(quote(doquote, L_URIC, H_URIC));
 1883                   }
 1884               } else {
 1885                   sb.append(quote(opaquePart, L_URIC, H_URIC));
 1886               }
 1887           } else {
 1888               appendAuthority(sb, authority, userInfo, host, port);
 1889               if (path != null)
 1890                   sb.append(quote(path, L_PATH, H_PATH));
 1891               if (query != null) {
 1892                   sb.append('?');
 1893                   sb.append(quote(query, L_URIC, H_URIC));
 1894               }
 1895           }
 1896       }
 1897   
 1898       private void appendFragment(StringBuffer sb, String fragment) {
 1899           if (fragment != null) {
 1900               sb.append('#');
 1901               sb.append(quote(fragment, L_URIC, H_URIC));
 1902           }
 1903       }
 1904   
 1905       private String toString(String scheme,
 1906                               String opaquePart,
 1907                               String authority,
 1908                               String userInfo,
 1909                               String host,
 1910                               int port,
 1911                               String path,
 1912                               String query,
 1913                               String fragment)
 1914       {
 1915           StringBuffer sb = new StringBuffer();
 1916           if (scheme != null) {
 1917               sb.append(scheme);
 1918               sb.append(':');
 1919           }
 1920           appendSchemeSpecificPart(sb, opaquePart,
 1921                                    authority, userInfo, host, port,
 1922                                    path, query);
 1923           appendFragment(sb, fragment);
 1924           return sb.toString();
 1925       }
 1926   
 1927       private void defineSchemeSpecificPart() {
 1928           if (schemeSpecificPart != null) return;
 1929           StringBuffer sb = new StringBuffer();
 1930           appendSchemeSpecificPart(sb, null, getAuthority(), getUserInfo(),
 1931                                    host, port, getPath(), getQuery());
 1932           if (sb.length() == 0) return;
 1933           schemeSpecificPart = sb.toString();
 1934       }
 1935   
 1936       private void defineString() {
 1937           if (string != null) return;
 1938   
 1939           StringBuffer sb = new StringBuffer();
 1940           if (scheme != null) {
 1941               sb.append(scheme);
 1942               sb.append(':');
 1943           }
 1944           if (isOpaque()) {
 1945               sb.append(schemeSpecificPart);
 1946           } else {
 1947               if (host != null) {
 1948                   sb.append("//");
 1949                   if (userInfo != null) {
 1950                       sb.append(userInfo);
 1951                       sb.append('@');
 1952                   }
 1953                   boolean needBrackets = ((host.indexOf(':') >= 0)
 1954                                       && !host.startsWith("[")
 1955                                       && !host.endsWith("]"));
 1956                   if (needBrackets) sb.append('[');
 1957                   sb.append(host);
 1958                   if (needBrackets) sb.append(']');
 1959                   if (port != -1) {
 1960                       sb.append(':');
 1961                       sb.append(port);
 1962                   }
 1963               } else if (authority != null) {
 1964                   sb.append("//");
 1965                   sb.append(authority);
 1966               }
 1967               if (path != null)
 1968                   sb.append(path);
 1969               if (query != null) {
 1970                   sb.append('?');
 1971                   sb.append(query);
 1972               }
 1973           }
 1974           if (fragment != null) {
 1975               sb.append('#');
 1976               sb.append(fragment);
 1977           }
 1978           string = sb.toString();
 1979       }
 1980   
 1981   
 1982       // -- Normalization, resolution, and relativization --
 1983   
 1984       // RFC2396 5.2 (6)
 1985       private static String resolvePath(String base, String child,
 1986                                         boolean absolute)
 1987       {
 1988           int i = base.lastIndexOf('/');
 1989           int cn = child.length();
 1990           String path = "";
 1991   
 1992           if (cn == 0) {
 1993               // 5.2 (6a)
 1994               if (i >= 0)
 1995                   path = base.substring(0, i + 1);
 1996           } else {
 1997               StringBuffer sb = new StringBuffer(base.length() + cn);
 1998               // 5.2 (6a)
 1999               if (i >= 0)
 2000                   sb.append(base.substring(0, i + 1));
 2001               // 5.2 (6b)
 2002               sb.append(child);
 2003               path = sb.toString();
 2004           }
 2005   
 2006           // 5.2 (6c-f)
 2007           String np = normalize(path);
 2008   
 2009           // 5.2 (6g): If the result is absolute but the path begins with "../",
 2010           // then we simply leave the path as-is
 2011   
 2012           return np;
 2013       }
 2014   
 2015       // RFC2396 5.2
 2016       private static URI resolve(URI base, URI child) {
 2017           // check if child if opaque first so that NPE is thrown
 2018           // if child is null.
 2019           if (child.isOpaque() || base.isOpaque())
 2020               return child;
 2021   
 2022           // 5.2 (2): Reference to current document (lone fragment)
 2023           if ((child.scheme == null) && (child.authority == null)
 2024               && child.path.equals("") && (child.fragment != null)
 2025               && (child.query == null)) {
 2026               if ((base.fragment != null)
 2027                   && child.fragment.equals(base.fragment)) {
 2028                   return base;
 2029               }
 2030               URI ru = new URI();
 2031               ru.scheme = base.scheme;
 2032               ru.authority = base.authority;
 2033               ru.userInfo = base.userInfo;
 2034               ru.host = base.host;
 2035               ru.port = base.port;
 2036               ru.path = base.path;
 2037               ru.fragment = child.fragment;
 2038               ru.query = base.query;
 2039               return ru;
 2040           }
 2041   
 2042           // 5.2 (3): Child is absolute
 2043           if (child.scheme != null)
 2044               return child;
 2045   
 2046           URI ru = new URI();             // Resolved URI
 2047           ru.scheme = base.scheme;
 2048           ru.query = child.query;
 2049           ru.fragment = child.fragment;
 2050   
 2051           // 5.2 (4): Authority
 2052           if (child.authority == null) {
 2053               ru.authority = base.authority;
 2054               ru.host = base.host;
 2055               ru.userInfo = base.userInfo;
 2056               ru.port = base.port;
 2057   
 2058               String cp = (child.path == null) ? "" : child.path;
 2059               if ((cp.length() > 0) && (cp.charAt(0) == '/')) {
 2060                   // 5.2 (5): Child path is absolute
 2061                   ru.path = child.path;
 2062               } else {
 2063                   // 5.2 (6): Resolve relative path
 2064                   ru.path = resolvePath(base.path, cp, base.isAbsolute());
 2065               }
 2066           } else {
 2067               ru.authority = child.authority;
 2068               ru.host = child.host;
 2069               ru.userInfo = child.userInfo;
 2070               ru.host = child.host;
 2071               ru.port = child.port;
 2072               ru.path = child.path;
 2073           }
 2074   
 2075           // 5.2 (7): Recombine (nothing to do here)
 2076           return ru;
 2077       }
 2078   
 2079       // If the given URI's path is normal then return the URI;
 2080       // o.w., return a new URI containing the normalized path.
 2081       //
 2082       private static URI normalize(URI u) {
 2083           if (u.isOpaque() || (u.path == null) || (u.path.length() == 0))
 2084               return u;
 2085   
 2086           String np = normalize(u.path);
 2087           if (np == u.path)
 2088               return u;
 2089   
 2090           URI v = new URI();
 2091           v.scheme = u.scheme;
 2092           v.fragment = u.fragment;
 2093           v.authority = u.authority;
 2094           v.userInfo = u.userInfo;
 2095           v.host = u.host;
 2096           v.port = u.port;
 2097           v.path = np;
 2098           v.query = u.query;
 2099           return v;
 2100       }
 2101   
 2102       // If both URIs are hierarchical, their scheme and authority components are
 2103       // identical, and the base path is a prefix of the child's path, then
 2104       // return a relative URI that, when resolved against the base, yields the
 2105       // child; otherwise, return the child.
 2106       //
 2107       private static URI relativize(URI base, URI child) {
 2108           // check if child if opaque first so that NPE is thrown
 2109           // if child is null.
 2110           if (child.isOpaque() || base.isOpaque())
 2111               return child;
 2112           if (!equalIgnoringCase(base.scheme, child.scheme)
 2113               || !equal(base.authority, child.authority))
 2114               return child;
 2115   
 2116           String bp = normalize(base.path);
 2117           String cp = normalize(child.path);
 2118           if (!bp.equals(cp)) {
 2119               if (!bp.endsWith("/"))
 2120                   bp = bp + "/";
 2121               if (!cp.startsWith(bp))
 2122                   return child;
 2123           }
 2124   
 2125           URI v = new URI();
 2126           v.path = cp.substring(bp.length());
 2127           v.query = child.query;
 2128           v.fragment = child.fragment;
 2129           return v;
 2130       }
 2131   
 2132   
 2133   
 2134       // -- Path normalization --
 2135   
 2136       // The following algorithm for path normalization avoids the creation of a
 2137       // string object for each segment, as well as the use of a string buffer to
 2138       // compute the final result, by using a single char array and editing it in
 2139       // place.  The array is first split into segments, replacing each slash
 2140       // with '\0' and creating a segment-index array, each element of which is
 2141       // the index of the first char in the corresponding segment.  We then walk
 2142       // through both arrays, removing ".", "..", and other segments as necessary
 2143       // by setting their entries in the index array to -1.  Finally, the two
 2144       // arrays are used to rejoin the segments and compute the final result.
 2145       //
 2146       // This code is based upon src/solaris/native/java/io/canonicalize_md.c
 2147   
 2148   
 2149       // Check the given path to see if it might need normalization.  A path
 2150       // might need normalization if it contains duplicate slashes, a "."
 2151       // segment, or a ".." segment.  Return -1 if no further normalization is
 2152       // possible, otherwise return the number of segments found.
 2153       //
 2154       // This method takes a string argument rather than a char array so that
 2155       // this test can be performed without invoking path.toCharArray().
 2156       //
 2157       static private int needsNormalization(String path) {
 2158           boolean normal = true;
 2159           int ns = 0;                     // Number of segments
 2160           int end = path.length() - 1;    // Index of last char in path
 2161           int p = 0;                      // Index of next char in path
 2162   
 2163           // Skip initial slashes
 2164           while (p <= end) {
 2165               if (path.charAt(p) != '/') break;
 2166               p++;
 2167           }
 2168           if (p > 1) normal = false;
 2169   
 2170           // Scan segments
 2171           while (p <= end) {
 2172   
 2173               // Looking at "." or ".." ?
 2174               if ((path.charAt(p) == '.')
 2175                   && ((p == end)
 2176                       || ((path.charAt(p + 1) == '/')
 2177                           || ((path.charAt(p + 1) == '.')
 2178                               && ((p + 1 == end)
 2179                                   || (path.charAt(p + 2) == '/')))))) {
 2180                   normal = false;
 2181               }
 2182               ns++;
 2183   
 2184               // Find beginning of next segment
 2185               while (p <= end) {
 2186                   if (path.charAt(p++) != '/')
 2187                       continue;
 2188   
 2189                   // Skip redundant slashes
 2190                   while (p <= end) {
 2191                       if (path.charAt(p) != '/') break;
 2192                       normal = false;
 2193                       p++;
 2194                   }
 2195   
 2196                   break;
 2197               }
 2198           }
 2199   
 2200           return normal ? -1 : ns;
 2201       }
 2202   
 2203   
 2204       // Split the given path into segments, replacing slashes with nulls and
 2205       // filling in the given segment-index array.
 2206       //
 2207       // Preconditions:
 2208       //   segs.length == Number of segments in path
 2209       //
 2210       // Postconditions:
 2211       //   All slashes in path replaced by '\0'
 2212       //   segs[i] == Index of first char in segment i (0 <= i < segs.length)
 2213       //
 2214       static private void split(char[] path, int[] segs) {
 2215           int end = path.length - 1;      // Index of last char in path
 2216           int p = 0;                      // Index of next char in path
 2217           int i = 0;                      // Index of current segment
 2218   
 2219           // Skip initial slashes
 2220           while (p <= end) {
 2221               if (path[p] != '/') break;
 2222               path[p] = '\0';
 2223               p++;
 2224           }
 2225   
 2226           while (p <= end) {
 2227   
 2228               // Note start of segment
 2229               segs[i++] = p++;
 2230   
 2231               // Find beginning of next segment
 2232               while (p <= end) {
 2233                   if (path[p++] != '/')
 2234                       continue;
 2235                   path[p - 1] = '\0';
 2236   
 2237                   // Skip redundant slashes
 2238                   while (p <= end) {
 2239                       if (path[p] != '/') break;
 2240                       path[p++] = '\0';
 2241                   }
 2242                   break;
 2243               }
 2244           }
 2245   
 2246           if (i != segs.length)
 2247               throw new InternalError();  // ASSERT
 2248       }
 2249   
 2250   
 2251       // Join the segments in the given path according to the given segment-index
 2252       // array, ignoring those segments whose index entries have been set to -1,
 2253       // and inserting slashes as needed.  Return the length of the resulting
 2254       // path.
 2255       //
 2256       // Preconditions:
 2257       //   segs[i] == -1 implies segment i is to be ignored
 2258       //   path computed by split, as above, with '\0' having replaced '/'
 2259       //
 2260       // Postconditions:
 2261       //   path[0] .. path[return value] == Resulting path
 2262       //
 2263       static private int join(char[] path, int[] segs) {
 2264           int ns = segs.length;           // Number of segments
 2265           int end = path.length - 1;      // Index of last char in path
 2266           int p = 0;                      // Index of next path char to write
 2267   
 2268           if (path[p] == '\0') {
 2269               // Restore initial slash for absolute paths
 2270               path[p++] = '/';
 2271           }
 2272   
 2273           for (int i = 0; i < ns; i++) {
 2274               int q = segs[i];            // Current segment
 2275               if (q == -1)
 2276                   // Ignore this segment
 2277                   continue;
 2278   
 2279               if (p == q) {
 2280                   // We're already at this segment, so just skip to its end
 2281                   while ((p <= end) && (path[p] != '\0'))
 2282                       p++;
 2283                   if (p <= end) {
 2284                       // Preserve trailing slash
 2285                       path[p++] = '/';
 2286                   }
 2287               } else if (p < q) {
 2288                   // Copy q down to p
 2289                   while ((q <= end) && (path[q] != '\0'))
 2290                       path[p++] = path[q++];
 2291                   if (q <= end) {
 2292                       // Preserve trailing slash
 2293                       path[p++] = '/';
 2294                   }
 2295               } else
 2296                   throw new InternalError(); // ASSERT false
 2297           }
 2298   
 2299           return p;
 2300       }
 2301   
 2302   
 2303       // Remove "." segments from the given path, and remove segment pairs
 2304       // consisting of a non-".." segment followed by a ".." segment.
 2305       //
 2306       private static void removeDots(char[] path, int[] segs) {
 2307           int ns = segs.length;
 2308           int end = path.length - 1;
 2309   
 2310           for (int i = 0; i < ns; i++) {
 2311               int dots = 0;               // Number of dots found (0, 1, or 2)
 2312   
 2313               // Find next occurrence of "." or ".."
 2314               do {
 2315                   int p = segs[i];
 2316                   if (path[p] == '.') {
 2317                       if (p == end) {
 2318                           dots = 1;
 2319                           break;
 2320                       } else if (path[p + 1] == '\0') {
 2321                           dots = 1;
 2322                           break;
 2323                       } else if ((path[p + 1] == '.')
 2324                                  && ((p + 1 == end)
 2325                                      || (path[p + 2] == '\0'))) {
 2326                           dots = 2;
 2327                           break;
 2328                       }
 2329                   }
 2330                   i++;
 2331               } while (i < ns);
 2332               if ((i > ns) || (dots == 0))
 2333                   break;
 2334   
 2335               if (dots == 1) {
 2336                   // Remove this occurrence of "."
 2337                   segs[i] = -1;
 2338               } else {
 2339                   // If there is a preceding non-".." segment, remove both that
 2340                   // segment and this occurrence of ".."; otherwise, leave this
 2341                   // ".." segment as-is.
 2342                   int j;
 2343                   for (j = i - 1; j >= 0; j--) {
 2344                       if (segs[j] != -1) break;
 2345                   }
 2346                   if (j >= 0) {
 2347                       int q = segs[j];
 2348                       if (!((path[q] == '.')
 2349                             && (path[q + 1] == '.')
 2350                             && (path[q + 2] == '\0'))) {
 2351                           segs[i] = -1;
 2352                           segs[j] = -1;
 2353                       }
 2354                   }
 2355               }
 2356           }
 2357       }
 2358   
 2359   
 2360       // DEVIATION: If the normalized path is relative, and if the first
 2361       // segment could be parsed as a scheme name, then prepend a "." segment
 2362       //
 2363       private static void maybeAddLeadingDot(char[] path, int[] segs) {
 2364   
 2365           if (path[0] == '\0')
 2366               // The path is absolute
 2367               return;
 2368   
 2369           int ns = segs.length;
 2370           int f = 0;                      // Index of first segment
 2371           while (f < ns) {
 2372               if (segs[f] >= 0)
 2373                   break;
 2374               f++;
 2375           }
 2376           if ((f >= ns) || (f == 0))
 2377               // The path is empty, or else the original first segment survived,
 2378               // in which case we already know that no leading "." is needed
 2379               return;
 2380   
 2381           int p = segs[f];
 2382           while ((p < path.length) && (path[p] != ':') && (path[p] != '\0')) p++;
 2383           if (p >= path.length || path[p] == '\0')
 2384               // No colon in first segment, so no "." needed
 2385               return;
 2386   
 2387           // At this point we know that the first segment is unused,
 2388           // hence we can insert a "." segment at that position
 2389           path[0] = '.';
 2390           path[1] = '\0';
 2391           segs[0] = 0;
 2392       }
 2393   
 2394   
 2395       // Normalize the given path string.  A normal path string has no empty
 2396       // segments (i.e., occurrences of "//"), no segments equal to ".", and no
 2397       // segments equal to ".." that are preceded by a segment not equal to "..".
 2398       // In contrast to Unix-style pathname normalization, for URI paths we
 2399       // always retain trailing slashes.
 2400       //
 2401       private static String normalize(String ps) {
 2402   
 2403           // Does this path need normalization?
 2404           int ns = needsNormalization(ps);        // Number of segments
 2405           if (ns < 0)
 2406               // Nope -- just return it
 2407               return ps;
 2408   
 2409           char[] path = ps.toCharArray();         // Path in char-array form
 2410   
 2411           // Split path into segments
 2412           int[] segs = new int[ns];               // Segment-index array
 2413           split(path, segs);
 2414   
 2415           // Remove dots
 2416           removeDots(path, segs);
 2417   
 2418           // Prevent scheme-name confusion
 2419           maybeAddLeadingDot(path, segs);
 2420   
 2421           // Join the remaining segments and return the result
 2422           String s = new String(path, 0, join(path, segs));
 2423           if (s.equals(ps)) {
 2424               // string was already normalized
 2425               return ps;
 2426           }
 2427           return s;
 2428       }
 2429   
 2430   
 2431   
 2432       // -- Character classes for parsing --
 2433   
 2434       // RFC2396 precisely specifies which characters in the US-ASCII charset are
 2435       // permissible in the various components of a URI reference.  We here
 2436       // define a set of mask pairs to aid in enforcing these restrictions.  Each
 2437       // mask pair consists of two longs, a low mask and a high mask.  Taken
 2438       // together they represent a 128-bit mask, where bit i is set iff the
 2439       // character with value i is permitted.
 2440       //
 2441       // This approach is more efficient than sequentially searching arrays of
 2442       // permitted characters.  It could be made still more efficient by
 2443       // precompiling the mask information so that a character's presence in a
 2444       // given mask could be determined by a single table lookup.
 2445   
 2446       // Compute the low-order mask for the characters in the given string
 2447       private static long lowMask(String chars) {
 2448           int n = chars.length();
 2449           long m = 0;
 2450           for (int i = 0; i < n; i++) {
 2451               char c = chars.charAt(i);
 2452               if (c < 64)
 2453                   m |= (1L << c);
 2454           }
 2455           return m;
 2456       }
 2457   
 2458       // Compute the high-order mask for the characters in the given string
 2459       private static long highMask(String chars) {
 2460           int n = chars.length();
 2461           long m = 0;
 2462           for (int i = 0; i < n; i++) {
 2463               char c = chars.charAt(i);
 2464               if ((c >= 64) && (c < 128))
 2465                   m |= (1L << (c - 64));
 2466           }
 2467           return m;
 2468       }
 2469   
 2470       // Compute a low-order mask for the characters
 2471       // between first and last, inclusive
 2472       private static long lowMask(char first, char last) {
 2473           long m = 0;
 2474           int f = Math.max(Math.min(first, 63), 0);
 2475           int l = Math.max(Math.min(last, 63), 0);
 2476           for (int i = f; i <= l; i++)
 2477               m |= 1L << i;
 2478           return m;
 2479       }
 2480   
 2481       // Compute a high-order mask for the characters
 2482       // between first and last, inclusive
 2483       private static long highMask(char first, char last) {
 2484           long m = 0;
 2485           int f = Math.max(Math.min(first, 127), 64) - 64;
 2486           int l = Math.max(Math.min(last, 127), 64) - 64;
 2487           for (int i = f; i <= l; i++)
 2488               m |= 1L << i;
 2489           return m;
 2490       }
 2491   
 2492       // Tell whether the given character is permitted by the given mask pair
 2493       private static boolean match(char c, long lowMask, long highMask) {
 2494           if (c < 64)
 2495               return ((1L << c) & lowMask) != 0;
 2496           if (c < 128)
 2497               return ((1L << (c - 64)) & highMask) != 0;
 2498           return false;
 2499       }
 2500   
 2501       // Character-class masks, in reverse order from RFC2396 because
 2502       // initializers for static fields cannot make forward references.
            //
            // Every class is a pair of longs: L_XXX carries the bits for the
            // characters 0-63 and H_XXX the bits for the characters 64-127, which
            // is the split used by lowMask, highMask and match.  Where a string is
            // passed to both lowMask and highMask, each call keeps only the
            // characters belonging to its own half.
 2503   
 2504       // digit    = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" |
 2505       //            "8" | "9"
            // (all digits are below 64, so the high half is empty)
 2506       private static final long L_DIGIT = lowMask('0', '9');
 2507       private static final long H_DIGIT = 0L;
 2508   
 2509       // upalpha  = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" |
 2510       //            "J" | "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" |
 2511       //            "S" | "T" | "U" | "V" | "W" | "X" | "Y" | "Z"
            // (all letters are 64 or above, so the low half is empty)
 2512       private static final long L_UPALPHA = 0L;
 2513       private static final long H_UPALPHA = highMask('A', 'Z');
 2514   
 2515       // lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" |
 2516       //            "j" | "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" |
 2517       //            "s" | "t" | "u" | "v" | "w" | "x" | "y" | "z"
 2518       private static final long L_LOWALPHA = 0L;
 2519       private static final long H_LOWALPHA = highMask('a', 'z');
 2520   
 2521       // alpha         = lowalpha | upalpha
 2522       private static final long L_ALPHA = L_LOWALPHA | L_UPALPHA;
 2523       private static final long H_ALPHA = H_LOWALPHA | H_UPALPHA;
 2524   
 2525       // alphanum      = alpha | digit
 2526       private static final long L_ALPHANUM = L_DIGIT | L_ALPHA;
 2527       private static final long H_ALPHANUM = H_DIGIT | H_ALPHA;
 2528   
 2529       // hex           = digit | "A" | "B" | "C" | "D" | "E" | "F" |
 2530       //                         "a" | "b" | "c" | "d" | "e" | "f"
 2531       private static final long L_HEX = L_DIGIT;
 2532       private static final long H_HEX = highMask('A', 'F') | highMask('a', 'f');
 2533   
 2534       // mark          = "-" | "_" | "." | "!" | "~" | "*" | "'" |
 2535       //                 "(" | ")"
            // ("_" and "~" fall into the high half, the other marks into the low)
 2536       private static final long L_MARK = lowMask("-_.!~*'()");
 2537       private static final long H_MARK = highMask("-_.!~*'()");
 2538   
 2539       // unreserved    = alphanum | mark
 2540       private static final long L_UNRESERVED = L_ALPHANUM | L_MARK;
 2541       private static final long H_UNRESERVED = H_ALPHANUM | H_MARK;
 2542   
 2543       // reserved      = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
 2544       //                 "$" | "," | "[" | "]"
 2545       // Added per RFC2732: "[", "]"
            // ("@", "[" and "]" fall into the high half, the rest into the low)
 2546       private static final long L_RESERVED = lowMask(";/?:@&=+$,[]");
 2547       private static final long H_RESERVED = highMask(";/?:@&=+$,[]");
 2548   
 2549       // The zero'th bit is used to indicate that escape pairs and non-US-ASCII
 2550       // characters are allowed; this is handled by the scanEscape method below.
            // Going by the mask layout this bit would otherwise stand for the
            // character with value 0.  quote() also tests it, via
            // (lowMask & L_ESCAPED), to decide how non-ASCII characters are treated.
 2551       private static final long L_ESCAPED = 1L;
 2552       private static final long H_ESCAPED = 0L;
 2553   
 2554       // uric          = reserved | unreserved | escaped
 2555       private static final long L_URIC = L_RESERVED | L_UNRESERVED | L_ESCAPED;
 2556       private static final long H_URIC = H_RESERVED | H_UNRESERVED | H_ESCAPED;
 2557   
 2558       // pchar         = unreserved | escaped |
 2559       //                 ":" | "@" | "&" | "=" | "+" | "$" | ","
 2560       private static final long L_PCHAR
 2561           = L_UNRESERVED | L_ESCAPED | lowMask(":@&=+$,");
 2562       private static final long H_PCHAR
 2563           = H_UNRESERVED | H_ESCAPED | highMask(":@&=+$,");
 2564   
 2565       // All valid path characters
            // (pchar plus the parameter separator ";" and the segment separator "/")
 2566       private static final long L_PATH = L_PCHAR | lowMask(";/");
 2567       private static final long H_PATH = H_PCHAR | highMask(";/");
 2568   
 2569       // Dash, for use in domainlabel and toplabel
            // ("-" is below 64, so H_DASH evaluates to 0)
 2570       private static final long L_DASH = lowMask("-");
 2571       private static final long H_DASH = highMask("-");
 2572   
 2573       // Dot, for use in hostnames
            // ("." is below 64, so H_DOT evaluates to 0)
 2574       private static final long L_DOT = lowMask(".");
 2575       private static final long H_DOT = highMask(".");
 2576   
 2577       // userinfo      = *( unreserved | escaped |
 2578       //                    ";" | ":" | "&" | "=" | "+" | "$" | "," )
 2579       private static final long L_USERINFO
 2580           = L_UNRESERVED | L_ESCAPED | lowMask(";:&=+$,");
 2581       private static final long H_USERINFO
 2582           = H_UNRESERVED | H_ESCAPED | highMask(";:&=+$,");
 2583   
 2584       // reg_name      = 1*( unreserved | escaped | "$" | "," |
 2585       //                     ";" | ":" | "@" | "&" | "=" | "+" )
 2586       private static final long L_REG_NAME
 2587           = L_UNRESERVED | L_ESCAPED | lowMask("$,;:@&=+");
 2588       private static final long H_REG_NAME
 2589           = H_UNRESERVED | H_ESCAPED | highMask("$,;:@&=+");
 2590   
 2591       // All valid characters for server-based authorities
            // (the userinfo masks already contain the alphanumerics, "-", "." and
            // ":", so the only characters the extra terms add are "@", "[" and "]")
 2592       private static final long L_SERVER
 2593           = L_USERINFO | L_ALPHANUM | L_DASH | lowMask(".:@[]");
 2594       private static final long H_SERVER
 2595           = H_USERINFO | H_ALPHANUM | H_DASH | highMask(".:@[]");
 2596   
 2597       // Special case of server authority that represents an IPv6 address
 2598       // In this case, a % does not signify an escape sequence
            // ("%" is below 64, so only the low half gains a bit; likewise "[" is
            // above 63, so L_LEFT_BRACKET evaluates to 0)
 2599       private static final long L_SERVER_PERCENT
 2600           = L_SERVER | lowMask("%");
 2601       private static final long H_SERVER_PERCENT
 2602           = H_SERVER | highMask("%");
 2603       private static final long L_LEFT_BRACKET = lowMask("[");
 2604       private static final long H_LEFT_BRACKET = highMask("[");
 2605   
 2606       // scheme        = alpha *( alpha | digit | "+" | "-" | "." )
 2607       private static final long L_SCHEME = L_ALPHA | L_DIGIT | lowMask("+-.");
 2608       private static final long H_SCHEME = H_ALPHA | H_DIGIT | highMask("+-.");
 2609   
 2610       // uric_no_slash = unreserved | escaped | ";" | "?" | ":" | "@" |
 2611       //                 "&" | "=" | "+" | "$" | ","
 2612       private static final long L_URIC_NO_SLASH
 2613           = L_UNRESERVED | L_ESCAPED | lowMask(";?:@&=+$,");
 2614       private static final long H_URIC_NO_SLASH
 2615           = H_UNRESERVED | H_ESCAPED | highMask(";?:@&=+$,");
 2616   
 2617   
 2618       // -- Escaping and encoding --
 2619   
 2620       private final static char[] hexDigits = {
 2621           '0', '1', '2', '3', '4', '5', '6', '7',
 2622           '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
 2623       };
 2624   
 2625       private static void appendEscape(StringBuffer sb, byte b) {
 2626           sb.append('%');
 2627           sb.append(hexDigits[(b >> 4) & 0x0f]);
 2628           sb.append(hexDigits[(b >> 0) & 0x0f]);
 2629       }
 2630   
 2631       private static void appendEncoded(StringBuffer sb, char c) {
 2632           ByteBuffer bb = null;
 2633           try {
 2634               bb = ThreadLocalCoders.encoderFor("UTF-8")
 2635                   .encode(CharBuffer.wrap("" + c));
 2636           } catch (CharacterCodingException x) {
 2637               assert false;
 2638           }
 2639           while (bb.hasRemaining()) {
 2640               int b = bb.get() & 0xff;
 2641               if (b >= 0x80)
 2642                   appendEscape(sb, (byte)b);
 2643               else
 2644                   sb.append((char)b);
 2645           }
 2646       }
 2647   
 2648       // Quote any characters in s that are not permitted
 2649       // by the given mask pair
 2650       //
 2651       private static String quote(String s, long lowMask, long highMask) {
 2652           int n = s.length();
 2653           StringBuffer sb = null;
 2654           boolean allowNonASCII = ((lowMask & L_ESCAPED) != 0);
 2655           for (int i = 0; i < s.length(); i++) {
 2656               char c = s.charAt(i);
 2657               if (c < '\u0080') {
 2658                   if (!match(c, lowMask, highMask)) {
 2659                       if (sb == null) {
 2660                           sb = new StringBuffer();
 2661                           sb.append(s.substring(0, i));
 2662                       }
 2663                       appendEscape(sb, (byte)c);
 2664                   } else {
 2665                       if (sb != null)
 2666                           sb.append(c);
 2667                   }
 2668               } else if (allowNonASCII
 2669                          && (Character.isSpaceChar(c)
 2670                              || Character.isISOControl(c))) {
 2671                   if (sb == null) {
 2672                       sb = new StringBuffer();
 2673                       sb.append(s.substring(0, i));
 2674                   }
 2675                   appendEncoded(sb, c);
 2676               } else {
 2677                   if (sb != null)
 2678                       sb.append(c);
 2679               }
 2680           }
 2681           return (sb == null) ? s : sb.toString();
 2682       }
 2683   
 2684       // Encodes all characters >= \u0080 into escaped, normalized UTF-8 octets,
 2685       // assuming that s is otherwise legal
 2686       //
 2687       private static String encode(String s) {
 2688           int n = s.length();
 2689           if (n == 0)
 2690               return s;
 2691   
 2692           // First check whether we actually need to encode
 2693           for (int i = 0;;) {
 2694               if (s.charAt(i) >= '\u0080')
 2695                   break;
 2696               if (++i >= n)
 2697                   return s;
 2698           }
 2699   
 2700           String ns = Normalizer.normalize(s, Normalizer.Form.NFC);
 2701           ByteBuffer bb = null;
 2702           try {
 2703               bb = ThreadLocalCoders.encoderFor("UTF-8")
 2704                   .encode(CharBuffer.wrap(ns));
 2705           } catch (CharacterCodingException x) {
 2706               assert false;
 2707           }
 2708   
 2709           StringBuffer sb = new StringBuffer();
 2710           while (bb.hasRemaining()) {
 2711               int b = bb.get() & 0xff;
 2712               if (b >= 0x80)
 2713                   appendEscape(sb, (byte)b);
 2714               else
 2715                   sb.append((char)b);
 2716           }
 2717           return sb.toString();
 2718       }
 2719   
 2720       private static int decode(char c) {
 2721           if ((c >= '0') && (c <= '9'))
 2722               return c - '0';
 2723           if ((c >= 'a') && (c <= 'f'))
 2724               return c - 'a' + 10;
 2725           if ((c >= 'A') && (c <= 'F'))
 2726               return c - 'A' + 10;
 2727           assert false;
 2728           return -1;
 2729       }
 2730   
 2731       private static byte decode(char c1, char c2) {
 2732           return (byte)(  ((decode(c1) & 0xf) << 4)
 2733                         | ((decode(c2) & 0xf) << 0));
 2734       }
 2735   
 2736       // Evaluates all escapes in s, applying UTF-8 decoding if needed.  Assumes
 2737       // that escapes are well-formed syntactically, i.e., of the form %XX.  If a
 2738       // sequence of escaped octets is not valid UTF-8 then the erroneous octets
 2739       // are replaced with '\uFFFD'.
 2740       // Exception: any "%" found between "[]" is left alone. It is an IPv6 literal
 2741       //            with a scope_id
 2742       //
 2743       private static String decode(String s) {
 2744           if (s == null)
 2745               return s;
 2746           int n = s.length();
 2747           if (n == 0)
 2748               return s;
 2749           if (s.indexOf('%') < 0)
 2750               return s;
 2751   
 2752           StringBuffer sb = new StringBuffer(n);
 2753           ByteBuffer bb = ByteBuffer.allocate(n);
 2754           CharBuffer cb = CharBuffer.allocate(n);
 2755           CharsetDecoder dec = ThreadLocalCoders.decoderFor("UTF-8")
 2756               .onMalformedInput(CodingErrorAction.REPLACE)
 2757               .onUnmappableCharacter(CodingErrorAction.REPLACE);
 2758   
 2759           // This is not horribly efficient, but it will do for now
 2760           char c = s.charAt(0);
 2761           boolean betweenBrackets = false;
 2762   
 2763           for (int i = 0; i < n;) {
 2764               assert c == s.charAt(i);    // Loop invariant
 2765               if (c == '[') {
 2766                   betweenBrackets = true;
 2767               } else if (betweenBrackets && c == ']') {
 2768                   betweenBrackets = false;
 2769               }
 2770               if (c != '%' || betweenBrackets) {
 2771                   sb.append(c);
 2772                   if (++i >= n)
 2773                       break;
 2774                   c = s.charAt(i);
 2775                   continue;
 2776               }
 2777               bb.clear();
 2778               int ui = i;
 2779               for (;;) {
 2780                   assert (n - i >= 2);
 2781                   bb.put(decode(s.charAt(++i), s.charAt(++i)));
 2782                   if (++i >= n)
 2783                       break;
 2784                   c = s.charAt(i);
 2785                   if (c != '%')
 2786                       break;
 2787               }
 2788               bb.flip();
 2789               cb.clear();
 2790               dec.reset();
 2791               CoderResult cr = dec.decode(bb, cb, true);
 2792               assert cr.isUnderflow();
 2793               cr = dec.flush(cb);
 2794               assert cr.isUnderflow();
 2795               sb.append(cb.flip().toString());
 2796           }
 2797   
 2798           return sb.toString();
 2799       }
 2800   
 2801   
 2802       // -- Parsing --
 2803   
 2804       // For convenience we wrap the input URI string in a new instance of the
 2805       // following internal class.  This saves always having to pass the input
 2806       // string as an argument to each internal scan/parse method.
 2807   
 2808       private class Parser {
 2809   
 2810           private String input;           // URI input string
 2811           private boolean requireServerAuthority = false;
 2812   
 2813           Parser(String s) {
 2814               input = s;
 2815               string = s;
 2816           }
 2817   
 2818           // -- Methods for throwing URISyntaxException in various ways --
 2819   
 2820           private void fail(String reason) throws URISyntaxException {
 2821               throw new URISyntaxException(input, reason);
 2822           }
 2823   
 2824           private void fail(String reason, int p) throws URISyntaxException {
 2825               throw new URISyntaxException(input, reason, p);
 2826           }
 2827   
 2828           private void failExpecting(String expected, int p)
 2829               throws URISyntaxException
 2830           {
 2831               fail("Expected " + expected, p);
 2832           }
 2833   
 2834           private void failExpecting(String expected, String prior, int p)
 2835               throws URISyntaxException
 2836           {
 2837               fail("Expected " + expected + " following " + prior, p);
 2838           }
 2839   
 2840   
 2841           // -- Simple access to the input string --
 2842   
 2843           // Return a substring of the input string
 2844           //
 2845           private String substring(int start, int end) {
 2846               return input.substring(start, end);
 2847           }
 2848   
 2849           // Return the char at position p,
 2850           // assuming that p < input.length()
 2851           //
 2852           private char charAt(int p) {
 2853               return input.charAt(p);
 2854           }
 2855   
 2856           // Tells whether start < end and, if so, whether charAt(start) == c
 2857           //
 2858           private boolean at(int start, int end, char c) {
 2859               return (start < end) && (charAt(start) == c);
 2860           }
 2861   
 2862           // Tells whether start + s.length() <= end and, if so,
 2863           // whether the chars at the start position match s exactly
 2864           //
 2865           private boolean at(int start, int end, String s) {
 2866               int p = start;
 2867               int sn = s.length();
 2868               if (sn > end - p)
 2869                   return false;
 2870               int i = 0;
 2871               while (i < sn) {
 2872                   if (charAt(p++) != s.charAt(i)) {
 2873                       break;
 2874                   }
 2875                   i++;
 2876               }
 2877               return (i == sn);
 2878           }
 2879   
 2880   
 2881           // -- Scanning --
 2882   
 2883           // The various scan and parse methods that follow use a uniform
 2884           // convention of taking the current start position and end index as
 2885           // their first two arguments.  The start is inclusive while the end is
 2886           // exclusive, just as in the String class, i.e., a start/end pair
 2887           // denotes the half-open interval [start, end) of the input string.
 2888           //
 2889           // These methods never proceed past the end position.  They may return
 2890           // -1 to indicate outright failure, but more often they simply return
 2891           // the position of the first char after the last char scanned.  Thus
 2892           // a typical idiom is
 2893           //
 2894           //     int p = start;
 2895           //     int q = scan(p, end, ...);
 2896           //     if (q > p)
 2897           //         // We scanned something
 2898           //         ...;
 2899           //     else if (q == p)
 2900           //         // We scanned nothing
 2901           //         ...;
 2902           //     else if (q == -1)
 2903           //         // Something went wrong
 2904           //         ...;
 2905   
 2906   
 2907           // Scan a specific char: If the char at the given start position is
 2908           // equal to c, return the index of the next char; otherwise, return the
 2909           // start position.
 2910           //
 2911           private int scan(int start, int end, char c) {
 2912               if ((start < end) && (charAt(start) == c))
 2913                   return start + 1;
 2914               return start;
 2915           }
 2916   
 2917           // Scan forward from the given start position.  Stop at the first char
 2918           // in the err string (in which case -1 is returned), or the first char
 2919           // in the stop string (in which case the index of that char is
 2920           // returned), or the end position (in which case the end position
 2921           // is returned).  Returns the start position if the char at start is
 2922           // in the stop string or if start >= end.
 2923           //
 2924           private int scan(int start, int end, String err, String stop) {
 2925               int p = start;
 2926               while (p < end) {
 2927                   char c = charAt(p);
 2928                   if (err.indexOf(c) >= 0)
 2929                       return -1;
 2930                   if (stop.indexOf(c) >= 0)
 2931                       break;
 2932                   p++;
 2933               }
 2934               return p;
 2935           }
 2936   
 2937           // Scan a potential escape sequence, starting at the given position,
 2938           // with the given first char (i.e., charAt(start) == first).
 2939           //
 2940           // This method assumes that if escapes are allowed then visible
 2941           // non-US-ASCII chars are also allowed.
 2942           //
 2943           private int scanEscape(int start, int n, char first)
 2944               throws URISyntaxException
 2945           {
 2946               int p = start;
 2947               char c = first;
 2948               if (c == '%') {
 2949                   // Process escape pair
 2950                   if ((p + 3 <= n)
 2951                       && match(charAt(p + 1), L_HEX, H_HEX)
 2952                       && match(charAt(p + 2), L_HEX, H_HEX)) {
 2953                       return p + 3;
 2954                   }
 2955                   fail("Malformed escape pair", p);
 2956               } else if ((c > 128)
 2957                          && !Character.isSpaceChar(c)
 2958                          && !Character.isISOControl(c)) {
 2959                   // Allow unescaped but visible non-US-ASCII chars
 2960                   return p + 1;
 2961               }
 2962               return p;
 2963           }
 2964   
 2965           // Scan chars that match the given mask pair
 2966           //
 2967           private int scan(int start, int n, long lowMask, long highMask)
 2968               throws URISyntaxException
 2969           {
 2970               int p = start;
 2971               while (p < n) {
 2972                   char c = charAt(p);
 2973                   if (match(c, lowMask, highMask)) {
 2974                       p++;
 2975                       continue;
 2976                   }
 2977                   if ((lowMask & L_ESCAPED) != 0) {
 2978                       int q = scanEscape(p, n, c);
 2979                       if (q > p) {
 2980                           p = q;
 2981                           continue;
 2982                       }
 2983                   }
 2984                   break;
 2985               }
 2986               return p;
 2987           }
 2988   
 2989           // Check that each of the chars in [start, end) matches the given mask
 2990           //
 2991           private void checkChars(int start, int end,
 2992                                   long lowMask, long highMask,
 2993                                   String what)
 2994               throws URISyntaxException
 2995           {
 2996               int p = scan(start, end, lowMask, highMask);
 2997               if (p < end)
 2998                   fail("Illegal character in " + what, p);
 2999           }
 3000   
 3001           // Check that the char at position p matches the given mask
 3002           //
 3003           private void checkChar(int p,
 3004                                  long lowMask, long highMask,
 3005                                  String what)
 3006               throws URISyntaxException
 3007           {
 3008               checkChars(p, p + 1, lowMask, highMask, what);
 3009           }
 3010   
 3011   
 3012           // -- Parsing --
 3013   
 3014           // [<scheme>:]<scheme-specific-part>[#<fragment>]
 3015           //
 3016           void parse(boolean rsa) throws URISyntaxException {
 3017               requireServerAuthority = rsa;
 3018               int ssp;                    // Start of scheme-specific part
 3019               int n = input.length();
 3020               int p = scan(0, n, "/?#", ":");
 3021               if ((p >= 0) && at(p, n, ':')) {
 3022                   if (p == 0)
 3023                       failExpecting("scheme name", 0);
 3024                   checkChar(0, L_ALPHA, H_ALPHA, "scheme name");
 3025                   checkChars(1, p, L_SCHEME, H_SCHEME, "scheme name");
 3026                   scheme = substring(0, p);
 3027                   p++;                    // Skip ':'
 3028                   ssp = p;
 3029                   if (at(p, n, '/')) {
 3030                       p = parseHierarchical(p, n);
 3031                   } else {
 3032                       int q = scan(p, n, "", "#");
 3033                       if (q <= p)
 3034                           failExpecting("scheme-specific part", p);
 3035                       checkChars(p, q, L_URIC, H_URIC, "opaque part");
 3036                       p = q;
 3037                   }
 3038               } else {
 3039                   ssp = 0;
 3040                   p = parseHierarchical(0, n);
 3041               }
 3042               schemeSpecificPart = substring(ssp, p);
 3043               if (at(p, n, '#')) {
 3044                   checkChars(p + 1, n, L_URIC, H_URIC, "fragment");
 3045                   fragment = substring(p + 1, n);
 3046                   p = n;
 3047               }
 3048               if (p < n)
 3049                   fail("end of URI", p);
 3050           }
 3051   
 3052           // [//authority]<path>[?<query>]
 3053           //
 3054           // DEVIATION from RFC2396: We allow an empty authority component as
 3055           // long as it's followed by a non-empty path, query component, or
 3056           // fragment component.  This is so that URIs such as "file:///foo/bar"
 3057           // will parse.  This seems to be the intent of RFC2396, though the
 3058           // grammar does not permit it.  If the authority is empty then the
 3059           // userInfo, host, and port components are undefined.
 3060           //
 3061           // DEVIATION from RFC2396: We allow empty relative paths.  This seems
 3062           // to be the intent of RFC2396, but the grammar does not permit it.
 3063           // The primary consequence of this deviation is that "#f" parses as a
 3064           // relative URI with an empty path.
 3065           //
 3066           private int parseHierarchical(int start, int n)
 3067               throws URISyntaxException
 3068           {
 3069               int p = start;
 3070               if (at(p, n, '/') && at(p + 1, n, '/')) {
 3071                   p += 2;
 3072                   int q = scan(p, n, "", "/?#");
 3073                   if (q > p) {
 3074                       p = parseAuthority(p, q);
 3075                   } else if (q < n) {
 3076                       // DEVIATION: Allow empty authority prior to non-empty
 3077                       // path, query component or fragment identifier
 3078                   } else
 3079                       failExpecting("authority", p);
 3080               }
 3081               int q = scan(p, n, "", "?#"); // DEVIATION: May be empty
 3082               checkChars(p, q, L_PATH, H_PATH, "path");
 3083               path = substring(p, q);
 3084               p = q;
 3085               if (at(p, n, '?')) {
 3086                   p++;
 3087                   q = scan(p, n, "", "#");
 3088                   checkChars(p, q, L_URIC, H_URIC, "query");
 3089                   query = substring(p, q);
 3090                   p = q;
 3091               }
 3092               return p;
 3093           }
 3094   
 3095           // authority     = server | reg_name
 3096           //
 3097           // Ambiguity: An authority that is a registry name rather than a server
 3098           // might have a prefix that parses as a server.  We use the fact that
 3099           // the authority component is always followed by '/' or the end of the
 3100           // input string to resolve this: If the complete authority did not
 3101           // parse as a server then we try to parse it as a registry name.
 3102           //
 3103           private int parseAuthority(int start, int n)
 3104               throws URISyntaxException
 3105           {
 3106               int p = start;
 3107               int q = p;
 3108               URISyntaxException ex = null;
 3109   
 3110               boolean serverChars;
 3111               boolean regChars;
 3112   
 3113               if (scan(p, n, "", "]") > p) {
 3114                   // contains a literal IPv6 address, therefore % is allowed
 3115                   serverChars = (scan(p, n, L_SERVER_PERCENT, H_SERVER_PERCENT) == n);
 3116               } else {
 3117                   serverChars = (scan(p, n, L_SERVER, H_SERVER) == n);
 3118               }
 3119               regChars = (scan(p, n, L_REG_NAME, H_REG_NAME) == n);
 3120   
 3121               if (regChars && !serverChars) {
 3122                   // Must be a registry-based authority
 3123                   authority = substring(p, n);
 3124                   return n;
 3125               }
 3126   
 3127               if (serverChars) {
 3128                   // Might be (probably is) a server-based authority, so attempt
 3129                   // to parse it as such.  If the attempt fails, try to treat it
 3130                   // as a registry-based authority.
 3131                   try {
 3132                       q = parseServer(p, n);
 3133                       if (q < n)
 3134                           failExpecting("end of authority", q);
 3135                       authority = substring(p, n);
 3136                   } catch (URISyntaxException x) {
 3137                       // Undo results of failed parse
 3138                       userInfo = null;
 3139                       host = null;
 3140                       port = -1;
 3141                       if (requireServerAuthority) {
 3142                           // If we're insisting upon a server-based authority,
 3143                           // then just re-throw the exception
 3144                           throw x;
 3145                       } else {
 3146                           // Save the exception in case it doesn't parse as a
 3147                           // registry either
 3148                           ex = x;
 3149                           q = p;
 3150                       }
 3151                   }
 3152               }
 3153   
 3154               if (q < n) {
 3155                   if (regChars) {
 3156                       // Registry-based authority
 3157                       authority = substring(p, n);
 3158                   } else if (ex != null) {
 3159                       // Re-throw exception; it was probably due to
 3160                       // a malformed IPv6 address
 3161                       throw ex;
 3162                   } else {
 3163                       fail("Illegal character in authority", q);
 3164                   }
 3165               }
 3166   
 3167               return n;
 3168           }
 3169   
 3170   
 3171           // [<userinfo>@]<host>[:<port>]
 3172           //
 3173           private int parseServer(int start, int n)
 3174               throws URISyntaxException
 3175           {
 3176               int p = start;
 3177               int q;
 3178   
 3179               // userinfo
 3180               q = scan(p, n, "/?#", "@");
 3181               if ((q >= p) && at(q, n, '@')) {
 3182                   checkChars(p, q, L_USERINFO, H_USERINFO, "user info");
 3183                   userInfo = substring(p, q);
 3184                   p = q + 1;              // Skip '@'
 3185               }
 3186   
 3187               // hostname, IPv4 address, or IPv6 address
 3188               if (at(p, n, '[')) {
 3189                   // DEVIATION from RFC2396: Support IPv6 addresses, per RFC2732
 3190                   p++;
 3191                   q = scan(p, n, "/?#", "]");
 3192                   if ((q > p) && at(q, n, ']')) {
 3193                       // look for a "%" scope id
 3194                       int r = scan (p, q, "", "%");
 3195                       if (r > p) {
 3196                           parseIPv6Reference(p, r);
 3197                           if (r+1 == q) {
 3198                               fail ("scope id expected");
 3199                           }
 3200                           checkChars (r+1, q, L_ALPHANUM, H_ALPHANUM,
 3201                                                   "scope id");
 3202                       } else {
 3203                           parseIPv6Reference(p, q);
 3204                       }
 3205                       host = substring(p-1, q+1);
 3206                       p = q + 1;
 3207                   } else {
 3208                       failExpecting("closing bracket for IPv6 address", q);
 3209                   }
 3210               } else {
 3211                   q = parseIPv4Address(p, n);
 3212                   if (q <= p)
 3213                       q = parseHostname(p, n);
 3214                   p = q;
 3215               }
 3216   
 3217               // port
 3218               if (at(p, n, ':')) {
 3219                   p++;
 3220                   q = scan(p, n, "", "/");
 3221                   if (q > p) {
 3222                       checkChars(p, q, L_DIGIT, H_DIGIT, "port number");
 3223                       try {
 3224                           port = Integer.parseInt(substring(p, q));
 3225                       } catch (NumberFormatException x) {
 3226                           fail("Malformed port number", p);
 3227                       }
 3228                       p = q;
 3229                   }
 3230               }
 3231               if (p < n)
 3232                   failExpecting("port number", p);
 3233   
 3234               return p;
 3235           }
 3236   
 3237           // Scan a string of decimal digits whose value fits in a byte
 3238           //
 3239           private int scanByte(int start, int n)
 3240               throws URISyntaxException
 3241           {
 3242               int p = start;
 3243               int q = scan(p, n, L_DIGIT, H_DIGIT);
 3244               if (q <= p) return q;
 3245               if (Integer.parseInt(substring(p, q)) > 255) return p;
 3246               return q;
 3247           }
 3248   
 3249           // Scan an IPv4 address.
 3250           //
 3251           // If the strict argument is true then we require that the given
 3252           // interval contain nothing besides an IPv4 address; if it is false
 3253           // then we only require that it start with an IPv4 address.
 3254           //
 3255           // If the interval does not contain or start with (depending upon the
 3256           // strict argument) legal IPv4 address characters then we return -1
 3257           // immediately; otherwise we insist that these characters parse as a
 3258           // legal IPv4 address and throw an exception on failure.
 3259           //
 3260           // We assume that any string of decimal digits and dots must be an IPv4
 3261           // address.  It won't parse as a hostname anyway, so making that
 3262           // assumption here allows more meaningful exceptions to be thrown.
 3263           //
 3264           private int scanIPv4Address(int start, int n, boolean strict)
 3265               throws URISyntaxException
 3266           {
 3267               int p = start;
 3268               int q;
 3269               int m = scan(p, n, L_DIGIT | L_DOT, H_DIGIT | H_DOT);
 3270               if ((m <= p) || (strict && (m != n)))
 3271                   return -1;
 3272               for (;;) {
 3273                   // Per RFC2732: At most three digits per byte
 3274                   // Further constraint: Each element fits in a byte
 3275                   if ((q = scanByte(p, m)) <= p) break;   p = q;
 3276                   if ((q = scan(p, m, '.')) <= p) break;  p = q;
 3277                   if ((q = scanByte(p, m)) <= p) break;   p = q;
 3278                   if ((q = scan(p, m, '.')) <= p) break;  p = q;
 3279                   if ((q = scanByte(p, m)) <= p) break;   p = q;
 3280                   if ((q = scan(p, m, '.')) <= p) break;  p = q;
 3281                   if ((q = scanByte(p, m)) <= p) break;   p = q;
 3282                   if (q < m) break;
 3283                   return q;
 3284               }
 3285               fail("Malformed IPv4 address", q);
 3286               return -1;
 3287           }
 3288   
 3289           // Take an IPv4 address: Throw an exception if the given interval
 3290           // contains anything except an IPv4 address
 3291           //
 3292           private int takeIPv4Address(int start, int n, String expected)
 3293               throws URISyntaxException
 3294           {
 3295               int p = scanIPv4Address(start, n, true);
 3296               if (p <= start)
 3297                   failExpecting(expected, start);
 3298               return p;
 3299           }
 3300   
 3301           // Attempt to parse an IPv4 address, returning -1 on failure but
 3302           // allowing the given interval to contain [:<characters>] after
 3303           // the IPv4 address.
 3304           //
 3305           private int parseIPv4Address(int start, int n) {
 3306               int p;
 3307   
 3308               try {
 3309                   p = scanIPv4Address(start, n, false);
 3310               } catch (URISyntaxException x) {
 3311                   return -1;
 3312               } catch (NumberFormatException nfe) {
 3313                   return -1;
 3314               }
 3315   
 3316               if (p > start && p < n) {
 3317                   // IPv4 address is followed by something - check that
 3318                   // it's a ":" as this is the only valid character to
 3319                   // follow an address.
 3320                   if (charAt(p) != ':') {
 3321                       p = -1;
 3322                   }
 3323               }
 3324   
 3325               if (p > start)
 3326                   host = substring(start, p);
 3327   
 3328               return p;
 3329           }
 3330   
 3331           // hostname      = domainlabel [ "." ] | 1*( domainlabel "." ) toplabel [ "." ]
 3332           // domainlabel   = alphanum | alphanum *( alphanum | "-" ) alphanum
 3333           // toplabel      = alpha | alpha *( alphanum | "-" ) alphanum
 3334           //
 3335           private int parseHostname(int start, int n)
 3336               throws URISyntaxException
 3337           {
 3338               int p = start;
 3339               int q;
 3340               int l = -1;                 // Start of last parsed label
 3341   
 3342               do {
 3343                   // domainlabel = alphanum [ *( alphanum | "-" ) alphanum ]
 3344                   q = scan(p, n, L_ALPHANUM, H_ALPHANUM);
 3345                   if (q <= p)
 3346                       break;
 3347                   l = p;
 3348                   if (q > p) {
 3349                       p = q;
 3350                       q = scan(p, n, L_ALPHANUM | L_DASH, H_ALPHANUM | H_DASH);
 3351                       if (q > p) {
 3352                           if (charAt(q - 1) == '-')
 3353                               fail("Illegal character in hostname", q - 1);
 3354                           p = q;
 3355                       }
 3356                   }
 3357                   q = scan(p, n, '.');
 3358                   if (q <= p)
 3359                       break;
 3360                   p = q;
 3361               } while (p < n);
 3362   
 3363               if ((p < n) && !at(p, n, ':'))
 3364                   fail("Illegal character in hostname", p);
 3365   
 3366               if (l < 0)
 3367                   failExpecting("hostname", start);
 3368   
 3369               // for a fully qualified hostname check that the rightmost
 3370               // label starts with an alpha character.
 3371               if (l > start && !match(charAt(l), L_ALPHA, H_ALPHA)) {
 3372                   fail("Illegal character in hostname", l);
 3373               }
 3374   
 3375               host = substring(start, p);
 3376               return p;
 3377           }
 3378   
 3379   
 3380           // IPv6 address parsing, from RFC2373: IPv6 Addressing Architecture
 3381           //
 3382           // Bug: The grammar in RFC2373 Appendix B does not allow addresses of
 3383           // the form ::12.34.56.78, which are clearly shown in the examples
 3384           // earlier in the document.  Here is the original grammar:
 3385           //
 3386           //   IPv6address = hexpart [ ":" IPv4address ]
 3387           //   hexpart     = hexseq | hexseq "::" [ hexseq ] | "::" [ hexseq ]
 3388           //   hexseq      = hex4 *( ":" hex4)
 3389           //   hex4        = 1*4HEXDIG
 3390           //
 3391           // We therefore use the following revised grammar:
 3392           //
 3393           //   IPv6address = hexseq [ ":" IPv4address ]
 3394           //                 | hexseq [ "::" [ hexpost ] ]
 3395           //                 | "::" [ hexpost ]
 3396           //   hexpost     = hexseq | hexseq ":" IPv4address | IPv4address
 3397           //   hexseq      = hex4 *( ":" hex4)
 3398           //   hex4        = 1*4HEXDIG
 3399           //
 3400           // This covers all and only the following cases:
 3401           //
 3402           //   hexseq
 3403           //   hexseq : IPv4address
 3404           //   hexseq ::
 3405           //   hexseq :: hexseq
 3406           //   hexseq :: hexseq : IPv4address
 3407           //   hexseq :: IPv4address
 3408           //   :: hexseq
 3409           //   :: hexseq : IPv4address
 3410           //   :: IPv4address
 3411           //   ::
 3412           //
 3413           // Additionally we constrain the IPv6 address as follows :-
 3414           //
 3415           //  i.  IPv6 addresses without compressed zeros should contain
 3416           //      exactly 16 bytes.
 3417           //
 3418           //  ii. IPv6 addresses with compressed zeros should contain
 3419           //      less than 16 bytes.
 3420   
 3421           private int ipv6byteCount = 0;
 3422   
 3423           private int parseIPv6Reference(int start, int n)
 3424               throws URISyntaxException
 3425           {
 3426               int p = start;
 3427               int q;
 3428               boolean compressedZeros = false;
 3429   
 3430               q = scanHexSeq(p, n);
 3431   
 3432               if (q > p) {
 3433                   p = q;
 3434                   if (at(p, n, "::")) {
 3435                       compressedZeros = true;
 3436                       p = scanHexPost(p + 2, n);
 3437                   } else if (at(p, n, ':')) {
 3438                       p = takeIPv4Address(p + 1,  n, "IPv4 address");
 3439                       ipv6byteCount += 4;
 3440                   }
 3441               } else if (at(p, n, "::")) {
 3442                   compressedZeros = true;
 3443                   p = scanHexPost(p + 2, n);
 3444               }
 3445               if (p < n)
 3446                   fail("Malformed IPv6 address", start);
 3447               if (ipv6byteCount > 16)
 3448                   fail("IPv6 address too long", start);
 3449               if (!compressedZeros && ipv6byteCount < 16)
 3450                   fail("IPv6 address too short", start);
 3451               if (compressedZeros && ipv6byteCount == 16)
 3452                   fail("Malformed IPv6 address", start);
 3453   
 3454               return p;
 3455           }
 3456   
 3457           private int scanHexPost(int start, int n)
 3458               throws URISyntaxException
 3459           {
 3460               int p = start;
 3461               int q;
 3462   
 3463               if (p == n)
 3464                   return p;
 3465   
 3466               q = scanHexSeq(p, n);
 3467               if (q > p) {
 3468                   p = q;
 3469                   if (at(p, n, ':')) {
 3470                       p++;
 3471                       p = takeIPv4Address(p, n, "hex digits or IPv4 address");
 3472                       ipv6byteCount += 4;
 3473                   }
 3474               } else {
 3475                   p = takeIPv4Address(p, n, "hex digits or IPv4 address");
 3476                   ipv6byteCount += 4;
 3477               }
 3478               return p;
 3479           }
 3480   
 3481           // Scan a hex sequence; return -1 if one could not be scanned
 3482           //
 3483           private int scanHexSeq(int start, int n)
 3484               throws URISyntaxException
 3485           {
 3486               int p = start;
 3487               int q;
 3488   
 3489               q = scan(p, n, L_HEX, H_HEX);
 3490               if (q <= p)
 3491                   return -1;
 3492               if (at(q, n, '.'))          // Beginning of IPv4 address
 3493                   return -1;
 3494               if (q > p + 4)
 3495                   fail("IPv6 hexadecimal digit sequence too long", p);
 3496               ipv6byteCount += 2;
 3497               p = q;
 3498               while (p < n) {
 3499                   if (!at(p, n, ':'))
 3500                       break;
 3501                   if (at(p + 1, n, ':'))
 3502                       break;              // "::"
 3503                   p++;
 3504                   q = scan(p, n, L_HEX, H_HEX);
 3505                   if (q <= p)
 3506                       failExpecting("digits for an IPv6 address", p);
 3507                   if (at(q, n, '.')) {    // Beginning of IPv4 address
 3508                       p--;
 3509                       break;
 3510                   }
 3511                   if (q > p + 4)
 3512                       fail("IPv6 hexadecimal digit sequence too long", p);
 3513                   ipv6byteCount += 2;
 3514                   p = q;
 3515               }
 3516   
 3517               return p;
 3518           }
 3519   
 3520       }
 3521   
 3522   }

Save This Page
Home » openjdk-7 » java » net » [javadoc | source]