Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: java/net/URI.java


1   /* URI.java -- An URI class
2      Copyright (C) 2002, 2004, 2005  Free Software Foundation, Inc.
3   
4   This file is part of GNU Classpath.
5   
6   GNU Classpath is free software; you can redistribute it and/or modify
7   it under the terms of the GNU General Public License as published by
8   the Free Software Foundation; either version 2, or (at your option)
9   any later version.
10  
11  GNU Classpath is distributed in the hope that it will be useful, but
12  WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  General Public License for more details.
15  
16  You should have received a copy of the GNU General Public License
17  along with GNU Classpath; see the file COPYING.  If not, write to the
18  Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19  02110-1301 USA.
20  
21  Linking this library statically or dynamically with other modules is
22  making a combined work based on this library.  Thus, the terms and
23  conditions of the GNU General Public License cover the whole
24  combination.
25  
26  As a special exception, the copyright holders of this library give you
27  permission to link this library with independent modules to produce an
28  executable, regardless of the license terms of these independent
29  modules, and to copy and distribute the resulting executable under
30  terms of your choice, provided that you also meet, for each linked
31  independent module, the terms and conditions of the license of that
32  module.  An independent module is a module which is not derived from
33  or based on this library.  If you modify this library, you may extend
34  this exception to your version of the library, but you are not
35  obligated to do so.  If you do not wish to do so, delete this
36  exception statement from your version. */
37  
38  
39  package java.net;
40  
41  import java.io.IOException;
42  import java.io.ObjectInputStream;
43  import java.io.ObjectOutputStream;
44  import java.io.Serializable;
45  import java.util.regex.Matcher;
46  import java.util.regex.Pattern;
47  
48  /**
49   * <p>
50   * A URI instance represents that defined by 
51   * <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC3986</a>,
52   * with some deviations.
53   * </p>
54   * <p>
55   * At its highest level, a URI consists of:
56   * </p>
57   * <code>[<em>scheme</em><strong>:</strong>]<em>scheme-specific-part</em>
58   * [<strong>#</strong><em>fragment</em>]</code>
59   * </p>
60   * <p>
61   * where <strong>#</strong> and <strong>:</strong> are literal characters,
62   * and those parts enclosed in square brackets are optional.
63   * </p>
64   * <p>
65   * There are two main types of URI.  An <em>opaque</em> URI is one
66   * which just consists of the above three parts, and is not further
67   * defined.  An example of such a URI would be a <em>mailto:</em> URI.
68   * In contrast, <em>hierarchical</em> URIs give further definition
69   * to the scheme-specific part, so as to represent some part of a hierarchical
70   * structure.
71   * </p>
72   * <p>
73   * <code>[<strong>//</strong><em>authority</em>][<em>path</em>]
74   * [<strong>?</strong><em>query</em>]</code>
75   * </p>
76   * <p>
77   * with <strong>/</strong> and <strong>?</strong> being literal characters.
78   * When server-based, the authority section is further subdivided into:
79   * </p>
80   * <p>
81   * <code>[<em>user-info</em><strong>@</strong>]<em>host</em>
82   * [<strong>:</strong><em>port</em>]</code>
83   * </p>
84   * <p>
85   * with <strong>@</strong> and <strong>:</strong> as literal characters.
86   * Authority sections that are not server-based are said to be registry-based.
87   * </p>
88   * <p>
89   * Hierarchical URIs can be either relative or absolute.  The scheme-specific
90   * part of an absolute hierarchical URI always starts with a `<strong>/</strong>',
91   * while relative URIs don't specify a scheme.  Opaque URIs are always absolute.
92   * </p>
93   * <p>
94   * Each part of the URI may have one of three states: undefined, empty
95   * or containing some content.  The former two of these are represented
96   * by <code>null</code> and the empty string in Java, respectively.
97   * The scheme-specific part may never be undefined.  It also follows from
98   * this that the path sub-part may also not be undefined, so as to ensure
99   * the former.
100  * </p>
101  * <h2>Character Escaping and Quoting</h2>
102  * <p>
103  * The characters that can be used within a valid URI are restricted.
104  * There are two main classes of characters which can't be used as is
105  * within the URI:
106  * </p>
107  * <ol>
108  * <li><strong>Characters outside the US-ASCII character set</strong>.
109  * These have to be <strong>escaped</strong> in order to create
110  * an RFC-compliant URI; this means replacing the character with the
111  * appropriate hexadecimal value, preceded by a `%'.</li>
112  * <li><strong>Illegal characters</strong> (e.g. space characters,
113  * control characters) are quoted, which results in them being encoded
114  * in the same way as non-US-ASCII characters.</li>
115  * </ol>
116  * <p>
117  * The set of valid characters differs depending on the section of the URI:
118  * </p>
119  * <ul>
120  * <li><strong>Scheme</strong>: Must be an alphanumeric, `-', `.' or '+'.</li>
121  * <li><strong>Authority</strong>:Composed of the username, host, port, `@'
122  * and `:'.</li>
123  * <li><strong>Username</strong>: Allows unreserved or percent-encoded
124  * characters, sub-delimiters and `:'.</li>
125  * <li><strong>Host</strong>: Allows unreserved or percent-encoded
126  * characters, sub-delimiters and square brackets (`[' and `]') for IPv6
127  * addresses.</li>
128  * <li><strong>Port</strong>: Digits only.</li>
129  * <li><strong>Path</strong>: Allows the path characters and `/'.
130  * <li><strong>Query</strong>: Allows the path characters, `?' and '/'.
131  * <li><strong>Fragment</strong>: Allows the path characters, `?' and '/'.
132  * </ul>
133  * <p>
134  * These definitions reference the following sets of characters:
135  * </p>
136  * <ul>
137  * <li><strong>Unreserved characters</strong>: The alphanumerics plus
138  * `-', `.', `_', and `~'.</li>
139  * <li><strong>Sub-delimiters</strong>: `!', `$', `&', `(', `)', `*',
140  * `+', `,', `;', `=' and the single-quote itself.</li>
141  * <li><strong>Path characters</strong>: Unreserved and percent-encoded
142  * characters and the sub-delimiters along with `@' and `:'.</li>
143  * </ul>
144  * <p>
145  * The constructors and accessor methods allow the use and retrieval of
146  * URI components which contain non-US-ASCII characters directly.
147  * They are only escaped when the <code>toASCIIString()</code> method
148  * is used.  In contrast, illegal characters are always quoted, with the
149  * exception of the return values of the non-raw accessors.
150  * </p>
151  *
152  * @author Ito Kazumitsu (ito.kazumitsu@hitachi-cable.co.jp)
153  * @author Dalibor Topic (robilad@kaffe.org)
154  * @author Michael Koch (konqueror@gmx.de)
155  * @author Andrew John Hughes (gnu_andrew@member.fsf.org)
156  * @since 1.4
157  */
158 public final class URI 
159   implements Comparable, Serializable
160 {
161   /**
162    * For serialization compatability.
163    */
164   static final long serialVersionUID = -6052424284110960213L;
165 
166   /**
167    * Regular expression for parsing URIs.
168    *
169    * Taken from RFC 2396, Appendix B.
170    * This expression doesn't parse IPv6 addresses.
171    */
172   private static final String URI_REGEXP =
173     "^(([^:/?#]+):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?";
174 
175   /**
176    * Regular expression for parsing the authority segment.
177    */
178   private static final String AUTHORITY_REGEXP =
179     "(([^?#]*)@)?([^?#:]*)(:([0-9]*))?";
180 
181   /**
182    * Valid characters (taken from rfc2396/3986)
183    */
184   private static final String RFC2396_DIGIT = "0123456789";
185   private static final String RFC2396_LOWALPHA = "abcdefghijklmnopqrstuvwxyz";
186   private static final String RFC2396_UPALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
187   private static final String RFC2396_ALPHA =
188     RFC2396_LOWALPHA + RFC2396_UPALPHA;
189   private static final String RFC2396_ALPHANUM = RFC2396_DIGIT + RFC2396_ALPHA;
190   private static final String RFC3986_UNRESERVED = RFC2396_ALPHANUM + "-._~";
191   private static final String RFC3986_SUBDELIMS = "!$&'()*+,;=";
192   private static final String RFC3986_REG_NAME =
193     RFC3986_UNRESERVED + RFC3986_SUBDELIMS + "%";
194   private static final String RFC3986_PCHAR = RFC3986_UNRESERVED + 
195     RFC3986_SUBDELIMS + ":@%";
196   private static final String RFC3986_SEGMENT = RFC3986_PCHAR;
197   private static final String RFC3986_PATH_SEGMENTS = RFC3986_SEGMENT + "/";
198   private static final String RFC3986_SSP = RFC3986_PCHAR + "?/";
199   private static final String RFC3986_HOST = RFC3986_REG_NAME + "[]";
200   private static final String RFC3986_USERINFO = RFC3986_REG_NAME + ":";
201 
202   /**
203    * Index of scheme component in parsed URI.
204    */
205   private static final int SCHEME_GROUP = 2;
206 
207   /**
208    * Index of scheme-specific-part in parsed URI.
209    */
210   private static final int SCHEME_SPEC_PART_GROUP = 3;
211 
212   /**
213    * Index of authority component in parsed URI.
214    */
215   private static final int AUTHORITY_GROUP = 5;
216 
217   /**
218    * Index of path component in parsed URI.
219    */
220   private static final int PATH_GROUP = 6;
221 
222   /**
223    * Index of query component in parsed URI.
224    */
225   private static final int QUERY_GROUP = 8;
226 
227   /**
228    * Index of fragment component in parsed URI.
229    */
230   private static final int FRAGMENT_GROUP = 10;
231   
232   /**
233    * Index of userinfo component in parsed authority section.
234    */
235   private static final int AUTHORITY_USERINFO_GROUP = 2;
236 
237   /**
238    * Index of host component in parsed authority section.
239    */
240   private static final int AUTHORITY_HOST_GROUP = 3;
241 
242   /**
243    * Index of port component in parsed authority section.
244    */
245   private static final int AUTHORITY_PORT_GROUP = 5;
246 
247   /**
248    * The compiled version of the URI regular expression.
249    */
250   private static final Pattern URI_PATTERN;
251 
252   /**
253    * The compiled version of the authority regular expression.
254    */
255   private static final Pattern AUTHORITY_PATTERN;
256 
257   /**
258    * The set of valid hexadecimal characters.
259    */
260   private static final String HEX = "0123456789ABCDEF";
261 
262   private transient String scheme;
263   private transient String rawSchemeSpecificPart;
264   private transient String schemeSpecificPart;
265   private transient String rawAuthority;
266   private transient String authority;
267   private transient String rawUserInfo;
268   private transient String userInfo;
269   private transient String rawHost;
270   private transient String host;
271   private transient int port = -1;
272   private transient String rawPath;
273   private transient String path;
274   private transient String rawQuery;
275   private transient String query;
276   private transient String rawFragment;
277   private transient String fragment;
278   private String string;
279 
280   /**
281    * Static initializer to pre-compile the regular expressions.
282    */
283   static
284   {
285     URI_PATTERN = Pattern.compile(URI_REGEXP);
286     AUTHORITY_PATTERN = Pattern.compile(AUTHORITY_REGEXP);
287   }
288 
289   private void readObject(ObjectInputStream is)
290     throws ClassNotFoundException, IOException
291   {
292     this.string = (String) is.readObject();
293     try
294       {
295   parseURI(this.string);
296       }
297     catch (URISyntaxException x)
298       {
299   // Should not happen.
300   throw new RuntimeException(x);
301       }
302   }
303 
304   private void writeObject(ObjectOutputStream os) throws IOException
305   {
306     if (string == null)
307       string = toString(); 
308     os.writeObject(string);
309   }
310 
311   /**
312    * <p>
313    * Returns the string content of the specified group of the supplied
314    * matcher.  The returned value is modified according to the following:
315    * </p>
316    * <ul>
317    * <li>If the resulting string has a length greater than 0, then
318    * that string is returned.</li>
319    * <li>If a string of zero length, is matched, then the content
320    * of the preceding group is considered.  If this is also an empty
321    * string, then <code>null</code> is returned to indicate an undefined
322    * value.  Otherwise, the value is truly the empty string and this is
323    * the returned value.</li>
324    * </ul>
325    * <p>
326    * This method is used for matching against all parts of the URI
327    * that may be either undefined or empty (i.e. all those but the
328    * scheme-specific part and the path).  In each case, the preceding
329    * group is the content of the original group, along with some
330    * additional distinguishing feature.  For example, the preceding
331    * group for the query includes the preceding question mark,
332    * while that of the fragment includes the hash symbol.  The presence
333    * of these features enables disambiguation between the two cases
334    * of a completely unspecified value and a simple non-existant value.
335    * The scheme differs in that it will never return an empty string;
336    * the delimiter follows the scheme rather than preceding it, so
337    * it becomes part of the following section.  The same is true
338    * of the user information.
339    * </p>
340    *
341    * @param match the matcher, which contains the results of the URI
342    *              matched against the URI regular expression.
343    * @return either the matched content, <code>null</code> for undefined
344    *         values, or an empty string for a URI part with empty content.
345    */
346   private static String getURIGroup(Matcher match, int group)
347   {
348     String matched = match.group(group);
349     return matched.length() == 0 
350       ? ((match.group(group - 1).length() == 0) ? null : "") : matched;
351   }
352 
353   /**
354    * Sets fields of this URI by parsing the given string.
355    *
356    * @param str The string to parse
357    *
358    * @exception URISyntaxException If the given string violates RFC 2396
359    */
360   private void parseURI(String str) throws URISyntaxException
361   {
362     Matcher matcher = URI_PATTERN.matcher(str);
363     
364     if (matcher.matches())
365       {
366   scheme = getURIGroup(matcher, SCHEME_GROUP);
367   rawSchemeSpecificPart = matcher.group(SCHEME_SPEC_PART_GROUP);
368   schemeSpecificPart = unquote(rawSchemeSpecificPart);
369   if (!isOpaque())
370     {
371       rawAuthority = getURIGroup(matcher, AUTHORITY_GROUP);
372       rawPath = matcher.group(PATH_GROUP);
373       rawQuery = getURIGroup(matcher, QUERY_GROUP);
374     }
375   rawFragment = getURIGroup(matcher, FRAGMENT_GROUP);
376       }
377     else
378       throw new URISyntaxException(str,
379            "doesn't match URI regular expression");
380     parseServerAuthority();
381 
382     // We must eagerly unquote the parts, because this is the only time
383     // we may throw an exception.
384     authority = unquote(rawAuthority);
385     userInfo = unquote(rawUserInfo);
386     host = unquote(rawHost);
387     path = unquote(rawPath);
388     query = unquote(rawQuery);
389     fragment = unquote(rawFragment);
390   }
391 
392   /**
393    * Unquote "%" + hex quotes characters
394    *
395    * @param str The string to unquote or null.
396    *
397    * @return The unquoted string or null if str was null.
398    *
399    * @exception URISyntaxException If the given string contains invalid
400    * escape sequences.
401    */
402   private static String unquote(String str) throws URISyntaxException
403   {
404     if (str == null)
405       return null;
406     byte[] buf = new byte[str.length()];
407     int pos = 0;
408     for (int i = 0; i < str.length(); i++)
409       {
410   char c = str.charAt(i);
411   if (c == '%')
412     {
413       if (i + 2 >= str.length())
414         throw new URISyntaxException(str, "Invalid quoted character");
415       int hi = Character.digit(str.charAt(++i), 16);
416       int lo = Character.digit(str.charAt(++i), 16);
417       if (lo < 0 || hi < 0)
418         throw new URISyntaxException(str, "Invalid quoted character");
419       buf[pos++] = (byte) (hi * 16 + lo);
420     }
421   else
422     buf[pos++] = (byte) c;
423       }
424     try
425       {
426   return new String(buf, 0, pos, "utf-8");
427       }
428     catch (java.io.UnsupportedEncodingException x2)
429       {
430   throw (Error) new InternalError().initCause(x2);
431       }
432   }
433 
434   /**
435    * Quote characters illegal in URIs in given string.
436    *
437    * Replace illegal characters by encoding their UTF-8
438    * representation as "%" + hex code for each resulting
439    * UTF-8 character.
440    *
441    * @param str The string to quote
442    *
443    * @return The quoted string.
444    */
445   private static String quote(String str)
446   {
447     return quote(str, RFC3986_SSP);
448   }
449 
450   /**
451    * Quote characters illegal in URI authorities in given string.
452    *
453    * Replace illegal characters by encoding their UTF-8
454    * representation as "%" + hex code for each resulting
455    * UTF-8 character.
456    *
457    * @param str The string to quote
458    *
459    * @return The quoted string.
460    */
461   private static String quoteAuthority(String str)
462   {
463     // Technically, we should be using RFC2396_AUTHORITY, but
464     // it contains no additional characters.
465     return quote(str, RFC3986_REG_NAME);
466   }
467 
468   /**
469    * Quotes the characters in the supplied string that are not part of
470    * the specified set of legal characters.
471    *
472    * @param str the string to quote
473    * @param legalCharacters the set of legal characters
474    *
475    * @return the quoted string.
476    */
477   private static String quote(String str, String legalCharacters)
478   {
479     StringBuffer sb = new StringBuffer(str.length());
480     for (int i = 0; i < str.length(); i++)
481       {
482   char c = str.charAt(i);
483   if (legalCharacters.indexOf(c) == -1)
484     {
485       if (c <= 127)
486         {
487     sb.append('%');
488     sb.append(HEX.charAt(c / 16));
489     sb.append(HEX.charAt(c % 16));
490         }
491     }
492   else
493     sb.append(c);
494       }
495     return sb.toString();
496   }
497 
498   /**
499    * Quote characters illegal in URI hosts in given string.
500    *
501    * Replace illegal characters by encoding their UTF-8
502    * representation as "%" + hex code for each resulting
503    * UTF-8 character.
504    *
505    * @param str The string to quote
506    *
507    * @return The quoted string.
508    */
509   private static String quoteHost(String str)
510   {
511     return quote(str, RFC3986_HOST);
512   }
513 
514   /**
515    * Quote characters illegal in URI paths in given string.
516    *
517    * Replace illegal characters by encoding their UTF-8
518    * representation as "%" + hex code for each resulting
519    * UTF-8 character.
520    *
521    * @param str The string to quote
522    *
523    * @return The quoted string.
524    */
525   private static String quotePath(String str)
526   {
527     // Technically, we should be using RFC2396_PATH, but
528     // it contains no additional characters.
529     return quote(str, RFC3986_PATH_SEGMENTS);
530   }
531 
532   /**
533    * Quote characters illegal in URI user infos in given string.
534    *
535    * Replace illegal characters by encoding their UTF-8
536    * representation as "%" + hex code for each resulting
537    * UTF-8 character.
538    *
539    * @param str The string to quote
540    *
541    * @return The quoted string.
542    */
543   private static String quoteUserInfo(String str)
544   {
545     return quote(str, RFC3986_USERINFO);
546   }
547 
/**
 * Creates a URI by parsing the given string.
 *
 * @param str The string to create the URI from
 *
 * @exception URISyntaxException If the given string cannot be parsed
 * as a URI
 * @exception NullPointerException If str is null
 */
public URI(String str) throws URISyntaxException
{
  // Record the text before parsing: authority parsing reports its
  // errors against this field.
  string = str;
  parseURI(string);
}
561 
/**
 * Creates a URI from the given components.  The components are quoted
 * where necessary, joined into a single string and then parsed.  A
 * <code>null</code> component (or a port of -1) is left out, and the
 * "//" prefix is only emitted when user information, host or port is
 * given.
 *
 * @param scheme The scheme name
 * @param userInfo The username and authorization info
 * @param host The hostname
 * @param port The port number, or -1 for none
 * @param path The path
 * @param query The query
 * @param fragment The fragment
 *
 * @exception URISyntaxException If the assembled string cannot be
 * parsed as a URI
 */
public URI(String scheme, String userInfo, String host, int port,
           String path, String query, String fragment)
  throws URISyntaxException
{
  this((scheme != null ? scheme + ":" : "")
       + ((userInfo != null || host != null || port != -1) ? "//" : "")
       + (userInfo != null ? quoteUserInfo(userInfo) + "@" : "")
       + (host != null ? quoteHost(host) : "")
       + (port != -1 ? ":" + port : "")
       + (path != null ? quotePath(path) : "")
       + (query != null ? "?" + quote(query) : "")
       + (fragment != null ? "#" + quote(fragment) : ""));
}
588 
/**
 * Creates a URI from the given components.  The components are quoted
 * where necessary, joined into a single string and then parsed; a
 * <code>null</code> component is left out.
 *
 * @param scheme The scheme name
 * @param authority The authority
 * @param path The path
 * @param query The query
 * @param fragment The fragment
 *
 * @exception URISyntaxException If the assembled string cannot be
 * parsed as a URI
 */
public URI(String scheme, String authority, String path, String query,
           String fragment) throws URISyntaxException
{
  this((scheme != null ? scheme + ":" : "")
       + (authority != null ? "//" + quoteAuthority(authority) : "")
       + (path != null ? quotePath(path) : "")
       + (query != null ? "?" + quote(query) : "")
       + (fragment != null ? "#" + quote(fragment) : ""));
}
609 
/**
 * Creates a URI from the given components.  This is a shorthand for
 * the seven-argument constructor with no user information, no port
 * (-1) and no query.
 *
 * @param scheme The scheme name
 * @param host The hostname
 * @param path The path
 * @param fragment The fragment
 *
 * @exception URISyntaxException If the assembled string cannot be
 * parsed as a URI
 */
public URI(String scheme, String host, String path, String fragment)
  throws URISyntaxException
{
  this(scheme,
       null,      // no user information
       host,
       -1,        // no port
       path,
       null,      // no query
       fragment);
}
625 
/**
 * Creates a URI from the given components.  The scheme-specific part
 * and the fragment are quoted where necessary, everything is joined
 * into a single string and then parsed; a <code>null</code> component
 * is left out.
 *
 * @param scheme The scheme name
 * @param ssp The scheme specific part
 * @param fragment The fragment
 *
 * @exception URISyntaxException If the assembled string cannot be
 * parsed as a URI
 */
public URI(String scheme, String ssp, String fragment)
  throws URISyntaxException
{
  this((scheme != null ? scheme + ":" : "")
       + (ssp != null ? quote(ssp) : "")
       + (fragment != null ? "#" + quote(fragment) : ""));
}
642 
643   /**
644    * Create an URI from the given string
645    *
646    * @param str The string to create the URI from
647    *
648    * @exception IllegalArgumentException If the given string violates RFC 2396
649    * @exception NullPointerException If str is null
650    */
651   public static URI create(String str)
652   {
653     try
654       {
655   return new URI(str);
656       }
657     catch (URISyntaxException e)
658       {
659   throw (IllegalArgumentException) new IllegalArgumentException()
660         .initCause(e);
661       }
662   }
663 
664   /**
665    * Attempts to parse this URI's authority component, if defined,
666    * into user-information, host, and port components.  The purpose
667    * of this method was to disambiguate between some authority sections,
668    * which form invalid server-based authories, but valid registry
669    * based authorities.  In the updated RFC 3986, the authority section
670    * is defined differently, with registry-based authorities part of
671    * the host section.  Thus, this method is now simply an explicit
672    * way of parsing any authority section.
673    *
674    * @return the URI, with the authority section parsed into user
675    *         information, host and port components.
676    * @throws URISyntaxException if the given string violates RFC 2396
677    */
678   public URI parseServerAuthority() throws URISyntaxException
679   {
680     if (rawAuthority != null)
681       {
682   Matcher matcher = AUTHORITY_PATTERN.matcher(rawAuthority);
683 
684   if (matcher.matches())
685     {
686       rawUserInfo = getURIGroup(matcher, AUTHORITY_USERINFO_GROUP);
687       rawHost = getURIGroup(matcher, AUTHORITY_HOST_GROUP);
688       
689       String portStr = getURIGroup(matcher, AUTHORITY_PORT_GROUP);
690       
691       if (portStr != null)
692         try
693     {
694       port = Integer.parseInt(portStr);
695     }
696         catch (NumberFormatException e)
697     {
698       URISyntaxException use =
699         new URISyntaxException
700           (string, "doesn't match URI regular expression");
701       use.initCause(e);
702       throw use;
703     }
704     }
705   else
706     throw new URISyntaxException(string,
707                "doesn't match URI regular expression");
708       }
709     return this;
710   }
711 
712   /**
713    * <p>
714    * Returns a normalized version of the URI.  If the URI is opaque,
715    * or its path is already in normal form, then this URI is simply
716    * returned.  Otherwise, the following transformation of the path
717    * element takes place:
718    * </p>
719    * <ol>
720    * <li>All `.' segments are removed.</li>
721    * <li>Each `..' segment which can be paired with a prior non-`..' segment
722    * is removed along with the preceding segment.</li>
723    * <li>A `.' segment is added to the front if the first segment contains
724    * a colon (`:').  This is a deviation from the RFC, which prevents
725    * confusion between the path and the scheme.</li>
726    * </ol>
727    * <p>
728    * The resulting URI will be free of `.' and `..' segments, barring those
729    * that were prepended or which couldn't be paired, respectively.
730    * </p>
731    *
732    * @return the normalized URI.
733    */
734   public URI normalize()
735   {
736     if (isOpaque() || path.indexOf("/./") == -1 && path.indexOf("/../") == -1)
737       return this;
738     try
739       {
740   return new URI(scheme, authority, normalizePath(path), query,
741            fragment);
742       }
743     catch (URISyntaxException e)
744       {
745   throw (Error) new InternalError("Normalized URI variant could not "+
746           "be constructed").initCause(e);
747       }
748   }
749 
750   /**
751    * <p>
752    * Normalize the given path.  The following transformation takes place:
753    * </p>
754    * <ol>
755    * <li>All `.' segments are removed.</li>
756    * <li>Each `..' segment which can be paired with a prior non-`..' segment
757    * is removed along with the preceding segment.</li>
758    * <li>A `.' segment is added to the front if the first segment contains
759    * a colon (`:').  This is a deviation from the RFC, which prevents
760    * confusion between the path and the scheme.</li>
761    * </ol>
762    * <p>
763    * The resulting URI will be free of `.' and `..' segments, barring those
764    * that were prepended or which couldn't be paired, respectively.
765    * </p>
766    * 
767    * @param relativePath the relative path to be normalized.
768    * @return the normalized path.
769    */
770   private String normalizePath(String relativePath)
771   {
772     /* 
773        This follows the algorithm in section 5.2.4. of RFC3986,
774        but doesn't modify the input buffer.
775     */
776     StringBuffer input = new StringBuffer(relativePath);
777     StringBuffer output = new StringBuffer();
778     int start = 0;
779     while (start < input.length())
780       {
781   /* A */
782   if (input.indexOf("../",start) == start)
783     {
784       start += 3;
785       continue;
786     }
787   if (input.indexOf("./",start) == start)
788     {
789       start += 2;
790       continue;
791     }
792   /* B */
793   if (input.indexOf("/./",start) == start)
794     {
795       start += 2;
796       continue;
797     }
798   if (input.indexOf("/.",start) == start
799       && input.charAt(start + 2) != '.')
800     {
801       start += 1;
802       input.setCharAt(start,'/');
803       continue;
804     }
805   /* C */
806   if (input.indexOf("/../",start) == start)
807     {
808       start += 3;
809       removeLastSegment(output);
810       continue;
811     }
812   if (input.indexOf("/..",start) == start)
813     {
814       start += 2;
815       input.setCharAt(start,'/');
816       removeLastSegment(output);
817       continue;
818     }
819   /* D */
820   if (start == input.length() - 1 && input.indexOf(".",start) == start)
821     {
822       input.delete(0,1);
823       continue;
824     }
825   if (start == input.length() - 2 && input.indexOf("..",start) == start)
826     {
827       input.delete(0,2);
828       continue;
829     }
830   /* E */
831   int indexOfSlash = input.indexOf("/",start);
832   while (indexOfSlash == start)
833     {
834       output.append("/");
835       ++start;
836       indexOfSlash = input.indexOf("/",start);
837     }
838   if (indexOfSlash == -1)
839     indexOfSlash = input.length();
840   output.append(input.substring(start, indexOfSlash));
841         start = indexOfSlash;
842       }
843     return output.toString();
844   }
845 
846   /**
847    * Removes the last segment of the path from the specified buffer.
848    *
849    * @param buffer the buffer containing the path.
850    */
851   private void removeLastSegment(StringBuffer buffer)
852   {
853     int lastSlash = buffer.lastIndexOf("/");
854     if (lastSlash == -1)
855       buffer.setLength(0);
856     else
857       buffer.setLength(lastSlash);
858   }
859 
860   /**
861    * Resolves the given URI against this URI
862    *
863    * @param uri The URI to resolve against this URI
864    *
865    * @return The resulting URI, or null when it couldn't be resolved
866    * for some reason.
867    *
868    * @throws NullPointerException if uri is null
869    */
870   public URI resolve(URI uri)
871   {
872     if (uri.isAbsolute())
873       return uri;
874     if (uri.isOpaque())
875       return uri;
876 
877     String scheme = uri.getScheme();
878     String schemeSpecificPart = uri.getSchemeSpecificPart();
879     String authority = uri.getAuthority();
880     String path = uri.getPath();
881     String query = uri.getQuery();
882     String fragment = uri.getFragment();
883 
884     try
885       {
886   if (fragment != null && path != null && path.equals("")
887       && scheme == null && authority == null && query == null)
888     return new URI(this.scheme, this.schemeSpecificPart, fragment);
889 
890   if (authority == null)
891     {
892       authority = this.authority;
893       if (path == null)
894         path = "";
895       if (! (path.startsWith("/")))
896         {
897     StringBuffer basepath = new StringBuffer(this.path);
898     int i = this.path.lastIndexOf('/');
899 
900     if (i >= 0)
901       basepath.delete(i + 1, basepath.length());
902 
903     basepath.append(path);
904     path = normalizePath(basepath.toString());
905         }
906     }
907   return new URI(this.scheme, authority, path, query, fragment);
908       }
909     catch (URISyntaxException e)
910       {
911   throw (Error) new InternalError("Resolved URI variant could not "+
912           "be constructed").initCause(e);
913       }
914   }
915 
916   /**
917    * Resolves the given URI string against this URI
918    *
919    * @param str The URI as string to resolve against this URI
920    *
921    * @return The resulting URI
922    *
923    * @throws IllegalArgumentException If the given URI string
924    * violates RFC 2396
925    * @throws NullPointerException If uri is null
926    */
927   public URI resolve(String str) throws IllegalArgumentException
928   {
929     return resolve(create(str));
930   }
931 
932   /**
933    * <p>
934    * Relativizes the given URI against this URI.  The following
935    * algorithm is used:
936    * </p>
937    * <ul>
938    * <li>If either URI is opaque, the given URI is returned.</li>
939    * <li>If the schemes of the URIs differ, the given URI is returned.</li>
940    * <li>If the authority components of the URIs differ, then the given
941    * URI is returned.</li>
942    * <li>If the path of this URI is not a prefix of the supplied URI,
943    * then the given URI is returned.</li>
944    * <li>If all the above conditions hold, a new URI is created using the
945    * query and fragment components of the given URI, along with a path
946    * computed by removing the path of this URI from the start of the path
947    * of the supplied URI.</li>
948    * </ul>
949    *
950    * @param uri the URI to relativize agsint this URI
951    * @return the resulting URI
952    * @throws NullPointerException if the uri is null
953    */
954   public URI relativize(URI uri)
955   {
956     if (isOpaque() || uri.isOpaque())
957       return uri;
958     if (scheme == null && uri.getScheme() != null)
959       return uri;
960     if (scheme != null && !(scheme.equals(uri.getScheme())))
961       return uri;
962     if (rawAuthority == null && uri.getRawAuthority() != null)
963       return uri;
964     if (rawAuthority != null && !(rawAuthority.equals(uri.getRawAuthority())))
965       return uri;
966     if (!(uri.getRawPath().startsWith(rawPath)))
967       return uri;
968     try
969       {
970   return new URI(null, null, 
971            uri.getRawPath().substring(rawPath.length()),
972            uri.getRawQuery(), uri.getRawFragment());
973       }
974     catch (URISyntaxException e)
975       {
976   throw (Error) new InternalError("Relativized URI variant could not "+
977           "be constructed").initCause(e);       
978       }
979   }
980 
981   /**
982    * Creates an URL from an URI
983    *
984    * @throws MalformedURLException If a protocol handler for the URL could
985    * not be found, or if some other error occurred while constructing the URL
986    * @throws IllegalArgumentException If the URI is not absolute
987    */
988   public URL toURL() throws IllegalArgumentException, MalformedURLException
989   {
990     if (isAbsolute())
991       return new URL(this.toString());
992 
993     throw new IllegalArgumentException("not absolute");
994   }
995 
996   /**
997    * Returns the scheme of the URI
998    */
999   public String getScheme()
1000  {
1001    return scheme;
1002  }
1003
1004  /**
1005   * Tells whether this URI is absolute or not
1006   */
1007  public boolean isAbsolute()
1008  {
1009    return scheme != null;
1010  }
1011
1012  /**
1013   * Tell whether this URI is opaque or not
1014   */
1015  public boolean isOpaque()
1016  {
1017    return ((scheme != null) && ! (schemeSpecificPart.startsWith("/")));
1018  }
1019
1020  /**
1021   * Returns the raw scheme specific part of this URI.
1022   * The scheme-specific part is never undefined, though it may be empty
1023   */
1024  public String getRawSchemeSpecificPart()
1025  {
1026    return rawSchemeSpecificPart;
1027  }
1028
1029  /**
1030   * Returns the decoded scheme specific part of this URI.
1031   */
1032  public String getSchemeSpecificPart()
1033  {
1034    return schemeSpecificPart;
1035  }
1036
1037  /**
1038   * Returns the raw authority part of this URI
1039   */
1040  public String getRawAuthority()
1041  {
1042    return rawAuthority;
1043  }
1044
1045  /**
1046   * Returns the decoded authority part of this URI
1047   */
1048  public String getAuthority()
1049  {
1050    return authority;
1051  }
1052
1053  /**
1054   * Returns the raw user info part of this URI
1055   */
1056  public String getRawUserInfo()
1057  {
1058    return rawUserInfo;
1059  }
1060
1061  /**
1062   * Returns the decoded user info part of this URI
1063   */
1064  public String getUserInfo()
1065  {
1066    return userInfo;
1067  }
1068
1069  /**
1070   * Returns the hostname of the URI
1071   */
1072  public String getHost()
1073  {
1074    return host;
1075  }
1076
1077  /**
1078   * Returns the port number of the URI
1079   */
1080  public int getPort()
1081  {
1082    return port;
1083  }
1084
1085  /**
1086   * Returns the raw path part of this URI
1087   */
1088  public String getRawPath()
1089  {
1090    return rawPath;
1091  }
1092
1093  /**
1094   * Returns the path of the URI
1095   */
1096  public String getPath()
1097  {
1098    return path;
1099  }
1100
1101  /**
1102   * Returns the raw query part of this URI
1103   */
1104  public String getRawQuery()
1105  {
1106    return rawQuery;
1107  }
1108
1109  /**
1110   * Returns the query of the URI
1111   */
1112  public String getQuery()
1113  {
1114    return query;
1115  }
1116
1117  /**
1118   * Return the raw fragment part of this URI
1119   */
1120  public String getRawFragment()
1121  {
1122    return rawFragment;
1123  }
1124
1125  /**
1126   * Returns the fragment of the URI
1127   */
1128  public String getFragment()
1129  {
1130    return fragment;
1131  }
1132
1133  /**
1134   * <p> 
1135   * Compares the URI with the given object for equality.  If the
1136   * object is not a <code>URI</code>, then the method returns false.
1137   * Otherwise, the following criteria are observed:
1138   * </p>
1139   * <ul>
1140   * <li>The scheme of the URIs must either be null (undefined) in both cases,
1141   * or equal, ignorant of case.</li>
1142   * <li>The raw fragment of the URIs must either be null (undefined) in both
1143   * cases, or equal, ignorant of case.</li>
1144   * <li>Both URIs must be of the same type (opaque or hierarchial)</li>
1145   * <li><strong>For opaque URIs:</strong></li>
1146   * <ul>
1147   * <li>The raw scheme-specific parts must be equal.</li>
1148   * </ul>
1149   * <li>For hierarchical URIs:</li>
1150   * <ul>
1151   * <li>The raw paths must be equal, ignorant of case.</li>
1152   * <li>The raw queries are either both undefined or both equal, ignorant
1153   * of case.</li>
1154   * <li>The raw authority sections are either both undefined or:</li>
1155   * <li><strong>For registry-based authorities:</strong></li>
1156   * <ul><li>they are equal.</li></ul>
1157   * <li><strong>For server-based authorities:</strong></li>
1158   * <ul>
1159   * <li>the hosts are equal, ignoring case</li>
1160   * <li>the ports are equal</li>
1161   * <li>the user information components are equal</li>
1162   * </ul>
1163   * </ul>
1164   * </ul>
1165   *
1166   * @param obj the obj to compare the URI with.
1167   * @return <code>true</code> if the objects are equal, according to
1168   *         the specification above.
1169   */
1170  public boolean equals(Object obj)
1171  {
1172    if (!(obj instanceof URI))
1173      return false;
1174    URI uriObj = (URI) obj;
1175    if (scheme == null)
1176      {
1177  if (uriObj.getScheme() != null)
1178    return false;
1179      }
1180    else
1181      if (!(scheme.equalsIgnoreCase(uriObj.getScheme())))
1182  return false;
1183    if (rawFragment == null)
1184      {
1185  if (uriObj.getRawFragment() != null)
1186    return false;
1187      }
1188    else
1189      if (!(rawFragment.equalsIgnoreCase(uriObj.getRawFragment())))
1190  return false;
1191    boolean opaqueThis = isOpaque();
1192    boolean opaqueObj = uriObj.isOpaque();
1193    if (opaqueThis && opaqueObj)
1194      return rawSchemeSpecificPart.equals(uriObj.getRawSchemeSpecificPart());
1195    else if (!opaqueThis && !opaqueObj)
1196      {
1197  boolean common = rawPath.equalsIgnoreCase(uriObj.getRawPath())
1198    && ((rawQuery == null && uriObj.getRawQuery() == null)
1199        || rawQuery.equalsIgnoreCase(uriObj.getRawQuery()));
1200  if (rawAuthority == null && uriObj.getRawAuthority() == null)
1201    return common;
1202  if (host == null)
1203    return common 
1204      && rawAuthority.equalsIgnoreCase(uriObj.getRawAuthority());
1205  return common 
1206    && host.equalsIgnoreCase(uriObj.getHost())
1207    && port == uriObj.getPort()
1208    && (rawUserInfo == null ?
1209        uriObj.getRawUserInfo() == null :
1210        rawUserInfo.equalsIgnoreCase(uriObj.getRawUserInfo()));
1211      }
1212    else
1213      return false;
1214  }
1215
1216  /**
1217   * Computes the hashcode of the URI
1218   */
1219  public int hashCode()
1220  {
1221    return (getScheme() == null ? 0 : 13 * getScheme().hashCode())
1222      + 17 * getRawSchemeSpecificPart().hashCode()
1223      + (getRawFragment() == null ? 0 : 21 + getRawFragment().hashCode());
1224  }
1225
1226  /**
1227   * Compare the URI with another object that must also be a URI.
1228   * Undefined components are taken to be less than any other component.
1229   * The following criteria are observed:
1230   * </p>
1231   * <ul>
1232   * <li>Two URIs with different schemes are compared according to their
1233   * scheme, regardless of case.</li>
1234   * <li>A hierarchical URI is less than an opaque URI with the same
1235   * scheme.</li>
1236   * <li><strong>For opaque URIs:</strong></li>
1237   * <ul>
1238   * <li>URIs with differing scheme-specific parts are ordered according
1239   * to the ordering of the scheme-specific part.</li>
1240   * <li>URIs with the same scheme-specific part are ordered by the
1241   * raw fragment.</li>
1242   * </ul>
1243   * <li>For hierarchical URIs:</li>
1244   * <ul>
1245   * <li>URIs are ordered according to their raw authority sections,
1246   * if they are unequal.</li>
1247   * <li><strong>For registry-based authorities:</strong></li>
1248   * <ul><li>they are ordered according to the ordering of the authority
1249   * component.</li></ul>
1250   * <li><strong>For server-based authorities:</strong></li>
1251   * <ul>
1252   * <li>URIs are ordered according to the raw user information.</li>
1253   * <li>URIs with the same user information are ordered by the host,
1254   * ignoring case.</li>
1255   * <lI>URIs with the same host are ordered by the port.</li>
1256   * </ul>
1257   * <li>URIs with the same authority section are ordered by the raw path.</li>
1258   * <li>URIs with the same path are ordered by their raw query.</li>
1259   * <li>URIs with the same query are ordered by their raw fragments.</li>
1260   * </ul>
1261   * </ul>
1262   *
1263   * @param obj This object to compare this URI with
1264   * @return a negative integer, zero or a positive integer depending
1265   *         on whether this URI is less than, equal to or greater
1266   *         than that supplied, respectively.
1267   * @throws ClassCastException if the given object is not a URI
1268   */
1269  public int compareTo(Object obj) 
1270    throws ClassCastException
1271  {
1272    URI uri = (URI) obj;
1273    if (scheme == null && uri.getScheme() != null)
1274      return -1;
1275    if (scheme != null)
1276      {
1277  int sCompare = scheme.compareToIgnoreCase(uri.getScheme()); 
1278  if (sCompare != 0)
1279    return sCompare;
1280      }
1281    boolean opaqueThis = isOpaque();
1282    boolean opaqueObj = uri.isOpaque();
1283    if (opaqueThis && !opaqueObj)
1284      return 1;
1285    if (!opaqueThis && opaqueObj)
1286      return -1;
1287    if (opaqueThis)
1288      {
1289  int ssCompare = 
1290    rawSchemeSpecificPart.compareTo(uri.getRawSchemeSpecificPart());
1291  if (ssCompare == 0)
1292    return compareFragments(uri);
1293  else
1294    return ssCompare;
1295      }
1296    if (rawAuthority == null && uri.getRawAuthority() != null)
1297      return -1;
1298    if (rawAuthority != null)
1299      {
1300  int aCompare = rawAuthority.compareTo(uri.getRawAuthority());
1301  if (aCompare != 0)
1302    {
1303      if (host == null)
1304        return aCompare;
1305      if (rawUserInfo == null && uri.getRawUserInfo() != null)
1306        return -1;
1307      int uCompare = rawUserInfo.compareTo(uri.getRawUserInfo());
1308      if (uCompare != 0)
1309        return uCompare;
1310      if (host == null && uri.getHost() != null)
1311        return -1;
1312      int hCompare = host.compareTo(uri.getHost());
1313      if (hCompare != 0)
1314        return hCompare;
1315      return new Integer(port).compareTo(new Integer(uri.getPort()));
1316    }
1317      }
1318    if (rawPath == null && uri.getRawPath() != null)
1319      return -1;
1320    if (rawPath != null)
1321      {
1322  int pCompare = rawPath.compareTo(uri.getRawPath()); 
1323  if (pCompare != 0)
1324    return pCompare;
1325      }
1326    if (rawQuery == null && uri.getRawQuery() != null)
1327      return -1;
1328    if (rawQuery != null)
1329      {
1330  int qCompare = rawQuery.compareTo(uri.getRawQuery());
1331  if (qCompare != 0)
1332    return qCompare;
1333      }
1334    return compareFragments(uri);
1335  }
1336
1337  /**
1338   * Compares the fragment of this URI with that of the supplied URI.
1339   *
1340   * @param uri the URI to compare with this one.
1341   * @return a negative integer, zero or a positive integer depending
1342   *         on whether this uri's fragment is less than, equal to
1343   *         or greater than the fragment of the uri supplied, respectively.
1344   */
1345  private int compareFragments(URI uri)
1346  {
1347    if (rawFragment == null && uri.getRawFragment() != null)
1348      return -1;
1349    else if (rawFragment == null)
1350      return 0;
1351    else
1352      return rawFragment.compareTo(uri.getRawFragment());
1353  }
1354
1355  /**
1356   * Returns the URI as a String.  If the URI was created using a constructor,
1357   * then this will be the same as the original input string.
1358   *
1359   * @return a string representation of the URI.
1360   */
1361  public String toString()
1362  {
1363    return (scheme == null ? "" : scheme + ":")
1364      + rawSchemeSpecificPart
1365      + (rawFragment == null ? "" : "#" + rawFragment);
1366  }
1367
1368  /**
1369   * Returns the URI as US-ASCII string.  This is the same as the result
1370   * from <code>toString()</code> for URIs that don't contain any non-US-ASCII
1371   * characters.  Otherwise, the non-US-ASCII characters are replaced
1372   * by their percent-encoded representations.
1373   *
1374   * @return a string representation of the URI, containing only US-ASCII
1375   *         characters.
1376   */
1377  public String toASCIIString()
1378  {
1379    String strRep = toString();
1380    boolean inNonAsciiBlock = false;
1381    StringBuffer buffer = new StringBuffer();
1382    StringBuffer encBuffer = null;
1383    for (int i = 0; i < strRep.length(); i++)
1384      {
1385  char c = strRep.charAt(i);
1386  if (c <= 127)
1387    {
1388      if (inNonAsciiBlock)
1389        {
1390    buffer.append(escapeCharacters(encBuffer.toString()));
1391    inNonAsciiBlock = false;
1392        }
1393      buffer.append(c);
1394    }
1395  else
1396    {
1397      if (!inNonAsciiBlock)
1398        {
1399    encBuffer = new StringBuffer();
1400    inNonAsciiBlock = true;
1401        }
1402      encBuffer.append(c);
1403    }
1404      }
1405    return buffer.toString();
1406  }
1407
1408  /**
1409   * Converts the non-ASCII characters in the supplied string
1410   * to their equivalent percent-encoded representations.
1411   * That is, they are replaced by "%" followed by their hexadecimal value.
1412   *
1413   * @param str a string including non-ASCII characters.
1414   * @return the string with the non-ASCII characters converted to their
1415   *         percent-encoded representations.
1416   */
1417  private static String escapeCharacters(String str)
1418  {
1419    try
1420      {
1421  StringBuffer sb = new StringBuffer(); 
1422  // this is far from optimal, but it works
1423  byte[] utf8 = str.getBytes("utf-8");
1424  for (int j = 0; j < utf8.length; j++)
1425    {
1426      sb.append('%');
1427      sb.append(HEX.charAt((utf8[j] & 0xff) / 16));
1428      sb.append(HEX.charAt((utf8[j] & 0xff) % 16));
1429    }
1430  return sb.toString();
1431      }
1432    catch (java.io.UnsupportedEncodingException x)
1433      {
1434  throw (Error) new InternalError("Escaping error").initCause(x);
1435      }
1436  }
1437
1438}