Save This Page
Home » iText-src-2.1.3 » com.lowagie » text » pdf » [javadoc | source]
    1   /*
    2    * Copyright 2003 Paulo Soares
    3    *
    4    * The contents of this file are subject to the Mozilla Public License Version 1.1
    5    * (the "License"); you may not use this file except in compliance with the License.
    6    * You may obtain a copy of the License at http://www.mozilla.org/MPL/
    7    *
    8    * Software distributed under the License is distributed on an "AS IS" basis,
    9    * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
   10    * for the specific language governing rights and limitations under the License.
   11    *
   12    * The Original Code is 'iText, a free JAVA-PDF library'.
   13    *
   14    * The Initial Developer of the Original Code is Bruno Lowagie. Portions created by
   15    * the Initial Developer are Copyright (C) 1999, 2000, 2001, 2002 by Bruno Lowagie.
   16    * All Rights Reserved.
   17    * Co-Developer of the code is Paulo Soares. Portions created by the Co-Developer
   18    * are Copyright (C) 2000, 2001, 2002 by Paulo Soares. All Rights Reserved.
   19    *
   20    * Contributor(s): all the names of the contributors are added in the source code
   21    * where applicable.
   22    *
   23    * Alternatively, the contents of this file may be used under the terms of the
   24    * LGPL license (the "GNU LIBRARY GENERAL PUBLIC LICENSE"), in which case the
   25    * provisions of LGPL are applicable instead of those above.  If you wish to
   26    * allow use of your version of this file only under the terms of the LGPL
   27    * License and not to allow others to use your version of this file under
   28    * the MPL, indicate your decision by deleting the provisions above and
   29    * replace them with the notice and other provisions required by the LGPL.
   30    * If you do not delete the provisions above, a recipient may use your version
   31    * of this file under either the MPL or the GNU LIBRARY GENERAL PUBLIC LICENSE.
   32    *
   33    * This library is free software; you can redistribute it and/or modify it
   34    * under the terms of the MPL as stated above or under the terms of the GNU
   35    * Library General Public License as published by the Free Software Foundation;
   36    * either version 2 of the License, or any later version.
   37    *
   38    * This library is distributed in the hope that it will be useful, but WITHOUT
   39    * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   40    * FOR A PARTICULAR PURPOSE. See the GNU Library general Public License for more
   41    * details.
   42    *
   43    * If you didn't download this code from the following link, you should check if
   44    * you aren't using an obsolete version:
   45    * http://www.lowagie.com/iText/
   46    */
   47   package com.lowagie.text.pdf;
   48   
import java.io.*;
import java.util.HashMap;
import java.util.Stack;
   52   
   53   /**
   54    * A simple XML and HTML parser.  This parser is, like the SAX parser,
   55    * an event based parser, but with much less functionality.
   56    * <p>
   57    * The parser can:
   58    * <p>
   59    * <ul>
   60    * <li>It recognizes the encoding used
   61    * <li>It recognizes all the elements' start tags and end tags
   62    * <li>It lists attributes, where attribute values can be enclosed in single or double quotes
   63    * <li>It recognizes the <code>&lt;![CDATA[ ... ]]&gt;</code> construct
   64    * <li>It recognizes the standard entities: &amp;amp;, &amp;lt;, &amp;gt;, &amp;quot;, and &amp;apos;, as well as numeric entities
   65    * <li>It maps lines ending in <code>\r\n</code> and <code>\r</code> to <code>\n</code> on input, in accordance with the XML Specification, Section 2.11
   66    * </ul>
   67    * <p>
   68    * The code is based on <A HREF="http://www.javaworld.com/javaworld/javatips/javatip128/">
   69    * http://www.javaworld.com/javaworld/javatips/javatip128/</A> with some extra
   70    * code from XERCES to recognize the encoding.
   71    */
   72   public class SimpleXMLParser {
    /** Maps IANA encoding names (upper-case String keys) to Java encoding names (String values). */
    private static final HashMap fIANA2JavaMap = new HashMap();
    /** Maps entity names (String keys) to Character values; read by decodeEntity. */
    private static final HashMap entityMap = new HashMap();
   75       
   76       private static int popMode(Stack st) {
   77           if(!st.empty())
   78               return ((Integer)st.pop()).intValue();
   79           else
   80               return PRE;
   81       }
   82       
    // States of the character-driven state machine in parse().
    // The numeric values are not in declaration order.
    private final static int
    TEXT = 1,               // between tags, collecting text
    ENTITY = 2,             // inside an entity reference, after the '&'
    OPEN_TAG = 3,           // inside "<foo ..." before the tag name is complete
    CLOSE_TAG = 4,          // inside a closing tag "</foo>"
    START_TAG = 5,          // a '<' was just read; the kind of tag is still unknown
    ATTRIBUTE_LVALUE = 6,   // reading an attribute name
    ATTRIBUTE_EQUAL = 9,    // attribute name finished, waiting for '='
    ATTRIBUTE_RVALUE = 10,  // '=' was read, waiting for the attribute value
    QUOTE = 7,              // inside an attribute value
    IN_TAG = 8,             // inside a start tag, between attributes
    SINGLE_TAG = 12,        // the '/' of "<foo/>" was read, expecting '>'
    COMMENT = 13,           // inside "<!-- ... -->"
    DONE = 11,              // the root element has been closed
    DOCTYPE = 14,           // inside "<? ... ?>" or "<!DOCTYPE ... >"
    PRE = 15,               // outside the root element
    CDATA = 16;             // inside a CDATA section
  100       
    /** Private so that the class cannot be instantiated from outside. */
    private SimpleXMLParser() {
    }
  103       
  104       /**
  105        * Parses the XML document firing the events to the handler.
  106        * @param doc the document handler
  107        * @param in the document. The encoding is deduced from the stream. The stream is not closed
  108        * @throws IOException on error
  109        */    
  110       public static void parse(SimpleXMLDocHandler doc, InputStream in) throws IOException {
  111           byte b4[] = new byte[4];
  112           int count = in.read(b4);
  113           if (count != 4)
  114               throw new IOException("Insufficient length.");
  115           String encoding = getEncodingName(b4);
  116           String decl = null;
  117           if (encoding.equals("UTF-8")) {
  118               StringBuffer sb = new StringBuffer();
  119               int c;
  120               while ((c = in.read()) != -1) {
  121                   if (c == '>')
  122                       break;
  123                   sb.append((char)c);
  124               }
  125               decl = sb.toString();
  126           }
  127           else if (encoding.equals("CP037")) {
  128               ByteArrayOutputStream bi = new ByteArrayOutputStream();
  129               int c;
  130               while ((c = in.read()) != -1) {
  131                   if (c == 0x6e) // that's '>' in ebcdic
  132                       break;
  133                   bi.write(c);
  134               }
  135               decl = new String(bi.toByteArray(), "CP037");
  136           }
  137           if (decl != null) {
  138               decl = getDeclaredEncoding(decl);
  139               if (decl != null)
  140                   encoding = decl;
  141           }
  142           parse(doc, new InputStreamReader(in, getJavaEncoding(encoding)));
  143       }
  144       
  145       private static String getDeclaredEncoding(String decl) {
  146           if (decl == null)
  147               return null;
  148           int idx = decl.indexOf("encoding");
  149           if (idx < 0)
  150               return null;
  151           int idx1 = decl.indexOf('"', idx);
  152           int idx2 = decl.indexOf('\'', idx);
  153           if (idx1 == idx2)
  154               return null;
  155           if ((idx1 < 0 && idx2 > 0) || (idx2 > 0 && idx2 < idx1)) {
  156               int idx3 = decl.indexOf('\'', idx2 + 1);
  157               if (idx3 < 0)
  158                   return null;
  159               return decl.substring(idx2 + 1, idx3);
  160           }
  161           if ((idx2 < 0 && idx1 > 0) || (idx1 > 0 && idx1 < idx2)) {
  162               int idx3 = decl.indexOf('"', idx1 + 1);
  163               if (idx3 < 0)
  164                   return null;
  165               return decl.substring(idx1 + 1, idx3);
  166           }
  167           return null;
  168       }
  169       
  170       /**
  171        * Gets the java encoding from the IANA encoding. If the encoding cannot be found
  172        * it returns the input.
  173        * @param iana the IANA encoding
  174        * @return the java encoding
  175        */    
  176       public static String getJavaEncoding(String iana) {
  177           String IANA = iana.toUpperCase();
  178           String jdec = (String)fIANA2JavaMap.get(IANA);
  179           if (jdec == null)
  180               jdec = iana;
  181           return jdec;
  182       }
  183       
    /**
     * Parses the XML document firing the events to the handler.
     * Delegates to the four-argument <CODE>parse</CODE> with no comment handler
     * and with the HTML mode switched off.
     * @param doc the document handler
     * @param r the document. The encoding is already resolved
     * @throws IOException on error
     */
    public static void parse(SimpleXMLDocHandler doc,Reader r) throws IOException {
        parse(doc, null, r, false);
    }
  187       
  188       /**
  189        * Parses the XML document firing the events to the handler.
  190        * @param doc the document handler
  191        * @param r the document. The encoding is already resolved. The reader is not closed
  192        * @throws IOException on error
  193        */
  194       public static void parse(SimpleXMLDocHandler doc, SimpleXMLDocHandlerComment comment, Reader r, boolean html) throws IOException {
  195           BufferedReader reader;
  196           if (r instanceof BufferedReader)
  197               reader = (BufferedReader)r;
  198           else
  199               reader = new BufferedReader(r);
  200           Stack st = new Stack();
  201           int depth = 0;
  202           int mode = PRE;
  203           int c = 0;
  204           int quotec = '"';
  205           depth = 0;
  206           StringBuffer sb = new StringBuffer();
  207           StringBuffer etag = new StringBuffer();
  208           String tagName = null;
  209           String lvalue = null;
  210           String rvalue = null;
  211           HashMap attrs = null;
  212           st = new Stack();
  213           doc.startDocument();
  214           int line=1, col=0;
  215           boolean eol = false;
  216           if (html)
  217               mode = TEXT;
  218           int pushBack = -1;
  219           while(true) {
  220               if (pushBack != -1) {
  221                   c = pushBack;
  222                   pushBack = -1;
  223               }
  224               else
  225                   c = reader.read();
  226               if (c == -1)
  227                   break;
  228               
  229               // We need to map \r, \r\n, and \n to \n
  230               // See XML spec section 2.11
  231               if(c == '\n' && eol) {
  232                   eol = false;
  233                   continue;
  234               } else if(eol) {
  235                   eol = false;
  236               } else if(c == '\n') {
  237                   line++;
  238                   col=0;
  239               } else if(c == '\r') {
  240                   eol = true;
  241                   c = '\n';
  242                   line++;
  243                   col=0;
  244               } else {
  245                   col++;
  246               }
  247               
  248               if(mode == DONE) {
  249                   doc.endDocument();
  250                   return;
  251                   
  252                   // We are between tags collecting text.
  253               } else if(mode == TEXT) {
  254                   if(c == '<') {
  255                       st.push(new Integer(mode));
  256                       mode = START_TAG;
  257                       if(sb.length() > 0) {
  258                           doc.text(sb.toString());
  259                           sb.setLength(0);
  260                       }
  261                   } else if(c == '&') {
  262                       st.push(new Integer(mode));
  263                       mode = ENTITY;
  264                       etag.setLength(0);
  265                   } else
  266                       sb.append((char)c);
  267                   
  268                   // we are processing a closing tag: e.g. </foo>
  269               } else if(mode == CLOSE_TAG) {
  270                   if(c == '>') {
  271                       mode = popMode(st);
  272                       tagName = sb.toString();
  273                       if (html)
  274                           tagName = tagName.toLowerCase();
  275                       sb.setLength(0);
  276                       depth--;
  277                       if(!html && depth==0)
  278                           mode = DONE;
  279                      doc.endElement(tagName);
  280                   } else {
  281                       if (!Character.isWhitespace((char)c))
  282                           sb.append((char)c);
  283                   }
  284                   
  285                   // we are processing CDATA
  286               } else if(mode == CDATA) {
  287                   if(c == '>'
  288                   && sb.toString().endsWith("]]")) {
  289                       sb.setLength(sb.length()-2);
  290                       doc.text(sb.toString());
  291                       sb.setLength(0);
  292                       mode = popMode(st);
  293                   } else
  294                       sb.append((char)c);
  295                   
  296                   // we are processing a comment.  We are inside
  297                   // the <!-- .... --> looking for the -->.
  298               } else if(mode == COMMENT) {
  299                   if(c == '>'
  300                   && sb.toString().endsWith("--")) {
  301                       if (comment != null) {
  302                           sb.setLength(sb.length() - 2);
  303                           comment.comment(sb.toString());
  304                       }
  305                       sb.setLength(0);
  306                       mode = popMode(st);
  307                   } else
  308                       sb.append((char)c);
  309                   
  310                   // We are outside the root tag element
  311               } else if(mode == PRE) {
  312                   if(c == '<') {
  313                       mode = TEXT;
  314                       st.push(new Integer(mode));
  315                       mode = START_TAG;
  316                   }
  317                   
  318                   // We are inside one of these <? ... ?>
  319                   // or one of these <!DOCTYPE ... >
  320               } else if(mode == DOCTYPE) {
  321                   if(c == '>') {
  322                       mode = popMode(st);
  323                       if(mode == TEXT) mode = PRE;
  324                   }
  325                   
  326                   // we have just seen a < and
  327                   // are wondering what we are looking at
  328                   // <foo>, </foo>, <!-- ... --->, etc.
  329               } else if(mode == START_TAG) {
  330                   mode = popMode(st);
  331                   if(c == '/') {
  332                       st.push(new Integer(mode));
  333                       mode = CLOSE_TAG;
  334                   } else if (c == '?') {
  335                       mode = DOCTYPE;
  336                   } else {
  337                       st.push(new Integer(mode));
  338                       mode = OPEN_TAG;
  339                       tagName = null;
  340                       attrs = new HashMap();
  341                       sb.append((char)c);
  342                   }
  343                   
  344                   // we are processing an entity, e.g. &lt;, &#187;, etc.
  345               } else if(mode == ENTITY) {
  346                   if(c == ';') {
  347                       mode = popMode(st);
  348                       String cent = etag.toString();
  349                       etag.setLength(0);
  350                       if(cent.startsWith("#x")) {
  351                           try {
  352                               char ci = (char)Integer.parseInt(cent.substring(2),16);
  353                               sb.append(ci);
  354                           }
  355                           catch (Exception es) {
  356                               sb.append('&').append(cent).append(';');
  357                           }
  358                       }
  359                       else if(cent.startsWith("#")) {
  360                           try {
  361                               char ci = (char)Integer.parseInt(cent.substring(1));
  362                               sb.append(ci);
  363                           }
  364                           catch (Exception es) {
  365                               sb.append('&').append(cent).append(';');
  366                           }
  367                       }
  368                       else {
  369                           char ce = decodeEntity(cent);
  370                           if (ce == '\0')
  371                               sb.append('&').append(cent).append(';');
  372                           else
  373                           sb.append(ce);
  374                       }
  375                   } else if ((c != '#' && (c < '0' || c > '9') && (c < 'a' || c > 'z')
  376                       && (c < 'A' || c > 'Z')) || etag.length() >= 7) {
  377                       mode = popMode(st);
  378                       pushBack = c;
  379                       sb.append('&').append(etag.toString());
  380                       etag.setLength(0);
  381                   }
  382                   else {
  383                       etag.append((char)c);
  384                   }
  385                   
  386                   // we have just seen something like this:
  387                   // <foo a="b"/
  388                   // and are looking for the final >.
  389               } else if(mode == SINGLE_TAG) {
  390                   if(tagName == null)
  391                       tagName = sb.toString();
  392                   if (html)
  393                       tagName = tagName.toLowerCase();
  394                   if(c != '>')
  395                       exc("Expected > for tag: <"+tagName+"/>",line,col);
  396                   doc.startElement(tagName,attrs);
  397                   doc.endElement(tagName);
  398                   if(!html && depth==0) {
  399                       doc.endDocument();
  400                       return;
  401                   }
  402                   sb.setLength(0);
  403                   attrs = new HashMap();
  404                   tagName = null;
  405                   mode = popMode(st);
  406                   
  407                   // we are processing something
  408                   // like this <foo ... >.  It could
  409                   // still be a <!-- ... --> or something.
  410               } else if(mode == OPEN_TAG) {
  411                   if(c == '>') {
  412                       if(tagName == null)
  413                           tagName = sb.toString();
  414                       if (html)
  415                           tagName = tagName.toLowerCase();
  416                       sb.setLength(0);
  417                       depth++;
  418                       doc.startElement(tagName,attrs);
  419                       tagName = null;
  420                       attrs = new HashMap();
  421                       mode = popMode(st);
  422                   } else if(c == '/') {
  423                       mode = SINGLE_TAG;
  424                   } else if(c == '-' && sb.toString().equals("!-")) {
  425                       mode = COMMENT;
  426                       sb.setLength(0);
  427                   } else if(c == '[' && sb.toString().equals("![CDATA")) {
  428                       mode = CDATA;
  429                       sb.setLength(0);
  430                   } else if(c == 'E' && sb.toString().equals("!DOCTYP")) {
  431                       sb.setLength(0);
  432                       mode = DOCTYPE;
  433                   } else if(Character.isWhitespace((char)c)) {
  434                       tagName = sb.toString();
  435                       if (html)
  436                           tagName = tagName.toLowerCase();
  437                       sb.setLength(0);
  438                       mode = IN_TAG;
  439                   } else {
  440                       sb.append((char)c);
  441                   }
  442                   
  443                   // We are processing the quoted right-hand side
  444                   // of an element's attribute.
  445               } else if(mode == QUOTE) {
  446                   if (html && quotec == ' ' && c == '>') {
  447                       rvalue = sb.toString();
  448                       sb.setLength(0);
  449                       attrs.put(lvalue,rvalue);
  450                       mode = popMode(st);
  451                       doc.startElement(tagName,attrs);
  452                       depth++;
  453                       tagName = null;
  454                       attrs = new HashMap();
  455                   }
  456                   else if (html && quotec == ' ' && Character.isWhitespace((char)c)) {
  457                       rvalue = sb.toString();
  458                       sb.setLength(0);
  459                       attrs.put(lvalue,rvalue);
  460                       mode = IN_TAG;
  461                   }
  462                   else if (html && quotec == ' ') {
  463                       sb.append((char)c);
  464                   }
  465                   else if(c == quotec) {
  466                       rvalue = sb.toString();
  467                       sb.setLength(0);
  468                       attrs.put(lvalue,rvalue);
  469                       mode = IN_TAG;
  470                       // See section the XML spec, section 3.3.3
  471                       // on normalization processing.
  472                   } else if(" \r\n\u0009".indexOf(c)>=0) {
  473                       sb.append(' ');
  474                   } else if(c == '&') {
  475                       st.push(new Integer(mode));
  476                       mode = ENTITY;
  477                       etag.setLength(0);
  478                   } else {
  479                       sb.append((char)c);
  480                   }
  481                   
  482               } else if(mode == ATTRIBUTE_RVALUE) {
  483                   if(c == '"' || c == '\'') {
  484                       quotec = c;
  485                       mode = QUOTE;
  486                   } else if(Character.isWhitespace((char)c)) {
  487                       ;
  488                   } else if (html && c == '>') {
  489                       attrs.put(lvalue,sb.toString());
  490                       sb.setLength(0);
  491                       mode = popMode(st);
  492                       doc.startElement(tagName,attrs);
  493                       depth++;
  494                       tagName = null;
  495                       attrs = new HashMap();
  496                   } else if (html) {
  497                       sb.append((char)c);
  498                       quotec = ' ';
  499                       mode = QUOTE;
  500                   } else {
  501                       exc("Error in attribute processing",line,col);
  502                   }
  503                   
  504               } else if(mode == ATTRIBUTE_LVALUE) {
  505                   if(Character.isWhitespace((char)c)) {
  506                       lvalue = sb.toString();
  507                       if (html)
  508                           lvalue = lvalue.toLowerCase();
  509                       sb.setLength(0);
  510                       mode = ATTRIBUTE_EQUAL;
  511                   } else if(c == '=') {
  512                       lvalue = sb.toString();
  513                       if (html)
  514                           lvalue = lvalue.toLowerCase();
  515                       sb.setLength(0);
  516                       mode = ATTRIBUTE_RVALUE;
  517                   } else if (html && c == '>') {
  518                       sb.setLength(0);
  519                       mode = popMode(st);
  520                       doc.startElement(tagName,attrs);
  521                       depth++;
  522                       tagName = null;
  523                       attrs = new HashMap();
  524                   } else {
  525                       sb.append((char)c);
  526                   }
  527                   
  528               } else if(mode == ATTRIBUTE_EQUAL) {
  529                   if(c == '=') {
  530                       mode = ATTRIBUTE_RVALUE;
  531                   } else if(Character.isWhitespace((char)c)) {
  532                       ;
  533                   } else if (html && c == '>') {
  534                       sb.setLength(0);
  535                       mode = popMode(st);
  536                       doc.startElement(tagName,attrs);
  537                       depth++;
  538                       tagName = null;
  539                       attrs = new HashMap();
  540                   } else if (html && c == '/') {
  541                       sb.setLength(0);
  542                       mode = SINGLE_TAG;
  543                   } else if (html) {
  544                       sb.setLength(0);
  545                       sb.append((char)c);
  546                       mode = ATTRIBUTE_LVALUE;
  547                   } else {
  548                       exc("Error in attribute processing.",line,col);
  549                   }
  550                   
  551               } else if(mode == IN_TAG) {
  552                   if(c == '>') {
  553                       mode = popMode(st);
  554                       doc.startElement(tagName,attrs);
  555                       depth++;
  556                       tagName = null;
  557                       attrs = new HashMap();
  558                   } else if(c == '/') {
  559                       mode = SINGLE_TAG;
  560                   } else if(Character.isWhitespace((char)c)) {
  561                       ;
  562                   } else {
  563                       mode = ATTRIBUTE_LVALUE;
  564                       sb.append((char)c);
  565                   }
  566               }
  567           }
  568           if(html || mode == DONE) {
  569               if (html && mode == TEXT)
  570                   doc.text(sb.toString());
  571               doc.endDocument();
  572           }
  573           else
  574               exc("missing end tag",line,col);
  575       }
  576       private static void exc(String s,int line,int col) throws IOException {
  577           throw new IOException(s+" near line "+line+", column "+col);
  578       }
  579       
  580       /**
  581        * Escapes a string with the appropriated XML codes.
  582        * @param s the string to be escaped
  583        * @param onlyASCII codes above 127 will always be escaped with &amp;#nn; if <CODE>true</CODE>
  584        * @return the escaped string
  585        */    
  586       public static String escapeXML(String s, boolean onlyASCII) {
  587           char cc[] = s.toCharArray();
  588           int len = cc.length;
  589           StringBuffer sb = new StringBuffer();
  590           for (int k = 0; k < len; ++k) {
  591               int c = cc[k];
  592               switch (c) {
  593                   case '<':
  594                       sb.append("&lt;");
  595                       break;
  596                   case '>':
  597                       sb.append("&gt;");
  598                       break;
  599                   case '&':
  600                       sb.append("&amp;");
  601                       break;
  602                   case '"':
  603                       sb.append("&quot;");
  604                       break;
  605                   case '\'':
  606                       sb.append("&apos;");
  607                       break;
  608                   default:
  609                       if (onlyASCII && c > 127)
  610                           sb.append("&#").append(c).append(";");
  611                       else
  612                           sb.append((char)c);
  613               }
  614           }
  615           return sb.toString();
  616       }
  617       
  618       public static char decodeEntity(String s) {
  619           Character c = (Character)entityMap.get(s);
  620           if (c == null)
  621               return '\0';
  622           else
  623               return c.charValue();
  624       }
  625       
  626       private static String getEncodingName(byte[] b4) {
  627           
  628           // UTF-16, with BOM
  629           int b0 = b4[0] & 0xFF;
  630           int b1 = b4[1] & 0xFF;
  631           if (b0 == 0xFE && b1 == 0xFF) {
  632               // UTF-16, big-endian
  633               return "UTF-16BE";
  634           }
  635           if (b0 == 0xFF && b1 == 0xFE) {
  636               // UTF-16, little-endian
  637               return "UTF-16LE";
  638           }
  639           
  640           // UTF-8 with a BOM
  641           int b2 = b4[2] & 0xFF;
  642           if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
  643               return "UTF-8";
  644           }
  645           
  646           // other encodings
  647           int b3 = b4[3] & 0xFF;
  648           if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
  649               // UCS-4, big endian (1234)
  650               return "ISO-10646-UCS-4";
  651           }
  652           if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
  653               // UCS-4, little endian (4321)
  654               return "ISO-10646-UCS-4";
  655           }
  656           if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
  657               // UCS-4, unusual octet order (2143)
  658               // REVISIT: What should this be?
  659               return "ISO-10646-UCS-4";
  660           }
  661           if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
  662               // UCS-4, unusual octect order (3412)
  663               // REVISIT: What should this be?
  664               return "ISO-10646-UCS-4";
  665           }
  666           if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
  667               // UTF-16, big-endian, no BOM
  668               // (or could turn out to be UCS-2...
  669               // REVISIT: What should this be?
  670               return "UTF-16BE";
  671           }
  672           if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
  673               // UTF-16, little-endian, no BOM
  674               // (or could turn out to be UCS-2...
  675               return "UTF-16LE";
  676           }
  677           if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
  678               // EBCDIC
  679               // a la xerces1, return CP037 instead of EBCDIC here
  680               return "CP037";
  681           }
  682           
  683           // default encoding
  684           return "UTF-8";
  685       }
  686   
  687       static {        
  688           // add IANA to Java encoding mappings.
  689           fIANA2JavaMap.put("BIG5", "Big5");
  690           fIANA2JavaMap.put("CSBIG5", "Big5");
  691           fIANA2JavaMap.put("CP037", "CP037");
  692           fIANA2JavaMap.put("IBM037", "CP037");
  693           fIANA2JavaMap.put("CSIBM037", "CP037");
  694           fIANA2JavaMap.put("EBCDIC-CP-US", "CP037");
  695           fIANA2JavaMap.put("EBCDIC-CP-CA", "CP037");
  696           fIANA2JavaMap.put("EBCDIC-CP-NL", "CP037");
  697           fIANA2JavaMap.put("EBCDIC-CP-WT", "CP037");
  698           fIANA2JavaMap.put("IBM277", "CP277");
  699           fIANA2JavaMap.put("CP277", "CP277");
  700           fIANA2JavaMap.put("CSIBM277", "CP277");
  701           fIANA2JavaMap.put("EBCDIC-CP-DK", "CP277");
  702           fIANA2JavaMap.put("EBCDIC-CP-NO", "CP277");
  703           fIANA2JavaMap.put("IBM278", "CP278");
  704           fIANA2JavaMap.put("CP278", "CP278");
  705           fIANA2JavaMap.put("CSIBM278", "CP278");
  706           fIANA2JavaMap.put("EBCDIC-CP-FI", "CP278");
  707           fIANA2JavaMap.put("EBCDIC-CP-SE", "CP278");
  708           fIANA2JavaMap.put("IBM280", "CP280");
  709           fIANA2JavaMap.put("CP280", "CP280");
  710           fIANA2JavaMap.put("CSIBM280", "CP280");
  711           fIANA2JavaMap.put("EBCDIC-CP-IT", "CP280");
  712           fIANA2JavaMap.put("IBM284", "CP284");
  713           fIANA2JavaMap.put("CP284", "CP284");
  714           fIANA2JavaMap.put("CSIBM284", "CP284");
  715           fIANA2JavaMap.put("EBCDIC-CP-ES", "CP284");
  716           fIANA2JavaMap.put("EBCDIC-CP-GB", "CP285");
  717           fIANA2JavaMap.put("IBM285", "CP285");
  718           fIANA2JavaMap.put("CP285", "CP285");
  719           fIANA2JavaMap.put("CSIBM285", "CP285");
  720           fIANA2JavaMap.put("EBCDIC-CP-FR", "CP297");
  721           fIANA2JavaMap.put("IBM297", "CP297");
  722           fIANA2JavaMap.put("CP297", "CP297");
  723           fIANA2JavaMap.put("CSIBM297", "CP297");
  724           fIANA2JavaMap.put("EBCDIC-CP-AR1", "CP420");
  725           fIANA2JavaMap.put("IBM420", "CP420");
  726           fIANA2JavaMap.put("CP420", "CP420");
  727           fIANA2JavaMap.put("CSIBM420", "CP420");
  728           fIANA2JavaMap.put("EBCDIC-CP-HE", "CP424");
  729           fIANA2JavaMap.put("IBM424", "CP424");
  730           fIANA2JavaMap.put("CP424", "CP424");
  731           fIANA2JavaMap.put("CSIBM424", "CP424");
  732           fIANA2JavaMap.put("EBCDIC-CP-CH", "CP500");
  733           fIANA2JavaMap.put("IBM500", "CP500");
  734           fIANA2JavaMap.put("CP500", "CP500");
  735           fIANA2JavaMap.put("CSIBM500", "CP500");
  736           fIANA2JavaMap.put("EBCDIC-CP-CH", "CP500");
  737           fIANA2JavaMap.put("EBCDIC-CP-BE", "CP500");
  738           fIANA2JavaMap.put("IBM868", "CP868");
  739           fIANA2JavaMap.put("CP868", "CP868");
  740           fIANA2JavaMap.put("CSIBM868", "CP868");
  741           fIANA2JavaMap.put("CP-AR", "CP868");
  742           fIANA2JavaMap.put("IBM869", "CP869");
  743           fIANA2JavaMap.put("CP869", "CP869");
  744           fIANA2JavaMap.put("CSIBM869", "CP869");
  745           fIANA2JavaMap.put("CP-GR", "CP869");
  746           fIANA2JavaMap.put("IBM870", "CP870");
  747           fIANA2JavaMap.put("CP870", "CP870");
  748           fIANA2JavaMap.put("CSIBM870", "CP870");
  749           fIANA2JavaMap.put("EBCDIC-CP-ROECE", "CP870");
  750           fIANA2JavaMap.put("EBCDIC-CP-YU", "CP870");
  751           fIANA2JavaMap.put("IBM871", "CP871");
  752           fIANA2JavaMap.put("CP871", "CP871");
  753           fIANA2JavaMap.put("CSIBM871", "CP871");
  754           fIANA2JavaMap.put("EBCDIC-CP-IS", "CP871");
  755           fIANA2JavaMap.put("IBM918", "CP918");
  756           fIANA2JavaMap.put("CP918", "CP918");
  757           fIANA2JavaMap.put("CSIBM918", "CP918");
  758           fIANA2JavaMap.put("EBCDIC-CP-AR2", "CP918");
  759           fIANA2JavaMap.put("EUC-JP", "EUCJIS");
  760           fIANA2JavaMap.put("CSEUCPkdFmtJapanese", "EUCJIS");
  761           fIANA2JavaMap.put("EUC-KR", "KSC5601");
  762           fIANA2JavaMap.put("GB2312", "GB2312");
  763           fIANA2JavaMap.put("CSGB2312", "GB2312");
  764           fIANA2JavaMap.put("ISO-2022-JP", "JIS");
  765           fIANA2JavaMap.put("CSISO2022JP", "JIS");
  766           fIANA2JavaMap.put("ISO-2022-KR", "ISO2022KR");
  767           fIANA2JavaMap.put("CSISO2022KR", "ISO2022KR");
  768           fIANA2JavaMap.put("ISO-2022-CN", "ISO2022CN");
  769           
  770           fIANA2JavaMap.put("X0201", "JIS0201");
  771           fIANA2JavaMap.put("CSISO13JISC6220JP", "JIS0201");
  772           fIANA2JavaMap.put("X0208", "JIS0208");
  773           fIANA2JavaMap.put("ISO-IR-87", "JIS0208");
  774           fIANA2JavaMap.put("X0208dbiJIS_X0208-1983", "JIS0208");
  775           fIANA2JavaMap.put("CSISO87JISX0208", "JIS0208");
  776           fIANA2JavaMap.put("X0212", "JIS0212");
  777           fIANA2JavaMap.put("ISO-IR-159", "JIS0212");
  778           fIANA2JavaMap.put("CSISO159JISX02121990", "JIS0212");
  779           fIANA2JavaMap.put("SHIFT_JIS", "SJIS");
  780           fIANA2JavaMap.put("CSSHIFT_JIS", "SJIS");
  781           fIANA2JavaMap.put("MS_Kanji", "SJIS");
  782           
  783           // Add support for Cp1252 and its friends
  784           fIANA2JavaMap.put("WINDOWS-1250", "Cp1250");
  785           fIANA2JavaMap.put("WINDOWS-1251", "Cp1251");
  786           fIANA2JavaMap.put("WINDOWS-1252", "Cp1252");
  787           fIANA2JavaMap.put("WINDOWS-1253", "Cp1253");
  788           fIANA2JavaMap.put("WINDOWS-1254", "Cp1254");
  789           fIANA2JavaMap.put("WINDOWS-1255", "Cp1255");
  790           fIANA2JavaMap.put("WINDOWS-1256", "Cp1256");
  791           fIANA2JavaMap.put("WINDOWS-1257", "Cp1257");
  792           fIANA2JavaMap.put("WINDOWS-1258", "Cp1258");
  793           fIANA2JavaMap.put("TIS-620", "TIS620");
  794           
  795           fIANA2JavaMap.put("ISO-8859-1", "ISO8859_1");
  796           fIANA2JavaMap.put("ISO-IR-100", "ISO8859_1");
  797           fIANA2JavaMap.put("ISO_8859-1", "ISO8859_1");
  798           fIANA2JavaMap.put("LATIN1", "ISO8859_1");
  799           fIANA2JavaMap.put("CSISOLATIN1", "ISO8859_1");
  800           fIANA2JavaMap.put("L1", "ISO8859_1");
  801           fIANA2JavaMap.put("IBM819", "ISO8859_1");
  802           fIANA2JavaMap.put("CP819", "ISO8859_1");
  803           
  804           fIANA2JavaMap.put("ISO-8859-2", "ISO8859_2");
  805           fIANA2JavaMap.put("ISO-IR-101", "ISO8859_2");
  806           fIANA2JavaMap.put("ISO_8859-2", "ISO8859_2");
  807           fIANA2JavaMap.put("LATIN2", "ISO8859_2");
  808           fIANA2JavaMap.put("CSISOLATIN2", "ISO8859_2");
  809           fIANA2JavaMap.put("L2", "ISO8859_2");
  810           
  811           fIANA2JavaMap.put("ISO-8859-3", "ISO8859_3");
  812           fIANA2JavaMap.put("ISO-IR-109", "ISO8859_3");
  813           fIANA2JavaMap.put("ISO_8859-3", "ISO8859_3");
  814           fIANA2JavaMap.put("LATIN3", "ISO8859_3");
  815           fIANA2JavaMap.put("CSISOLATIN3", "ISO8859_3");
  816           fIANA2JavaMap.put("L3", "ISO8859_3");
  817           
  818           fIANA2JavaMap.put("ISO-8859-4", "ISO8859_4");
  819           fIANA2JavaMap.put("ISO-IR-110", "ISO8859_4");
  820           fIANA2JavaMap.put("ISO_8859-4", "ISO8859_4");
  821           fIANA2JavaMap.put("LATIN4", "ISO8859_4");
  822           fIANA2JavaMap.put("CSISOLATIN4", "ISO8859_4");
  823           fIANA2JavaMap.put("L4", "ISO8859_4");
  824           
  825           fIANA2JavaMap.put("ISO-8859-5", "ISO8859_5");
  826           fIANA2JavaMap.put("ISO-IR-144", "ISO8859_5");
  827           fIANA2JavaMap.put("ISO_8859-5", "ISO8859_5");
  828           fIANA2JavaMap.put("CYRILLIC", "ISO8859_5");
  829           fIANA2JavaMap.put("CSISOLATINCYRILLIC", "ISO8859_5");
  830           
  831           fIANA2JavaMap.put("ISO-8859-6", "ISO8859_6");
  832           fIANA2JavaMap.put("ISO-IR-127", "ISO8859_6");
  833           fIANA2JavaMap.put("ISO_8859-6", "ISO8859_6");
  834           fIANA2JavaMap.put("ECMA-114", "ISO8859_6");
  835           fIANA2JavaMap.put("ASMO-708", "ISO8859_6");
  836           fIANA2JavaMap.put("ARABIC", "ISO8859_6");
  837           fIANA2JavaMap.put("CSISOLATINARABIC", "ISO8859_6");
  838           
  839           fIANA2JavaMap.put("ISO-8859-7", "ISO8859_7");
  840           fIANA2JavaMap.put("ISO-IR-126", "ISO8859_7");
  841           fIANA2JavaMap.put("ISO_8859-7", "ISO8859_7");
  842           fIANA2JavaMap.put("ELOT_928", "ISO8859_7");
  843           fIANA2JavaMap.put("ECMA-118", "ISO8859_7");
  844           fIANA2JavaMap.put("GREEK", "ISO8859_7");
  845           fIANA2JavaMap.put("CSISOLATINGREEK", "ISO8859_7");
  846           fIANA2JavaMap.put("GREEK8", "ISO8859_7");
  847           
  848           fIANA2JavaMap.put("ISO-8859-8", "ISO8859_8");
  849           fIANA2JavaMap.put("ISO-8859-8-I", "ISO8859_8"); // added since this encoding only differs w.r.t. presentation
  850           fIANA2JavaMap.put("ISO-IR-138", "ISO8859_8");
  851           fIANA2JavaMap.put("ISO_8859-8", "ISO8859_8");
  852           fIANA2JavaMap.put("HEBREW", "ISO8859_8");
  853           fIANA2JavaMap.put("CSISOLATINHEBREW", "ISO8859_8");
  854           
  855           fIANA2JavaMap.put("ISO-8859-9", "ISO8859_9");
  856           fIANA2JavaMap.put("ISO-IR-148", "ISO8859_9");
  857           fIANA2JavaMap.put("ISO_8859-9", "ISO8859_9");
  858           fIANA2JavaMap.put("LATIN5", "ISO8859_9");
  859           fIANA2JavaMap.put("CSISOLATIN5", "ISO8859_9");
  860           fIANA2JavaMap.put("L5", "ISO8859_9");
  861           
  862           fIANA2JavaMap.put("KOI8-R", "KOI8_R");
  863           fIANA2JavaMap.put("CSKOI8-R", "KOI8_R");
  864           fIANA2JavaMap.put("US-ASCII", "ASCII");
  865           fIANA2JavaMap.put("ISO-IR-6", "ASCII");
  866           fIANA2JavaMap.put("ANSI_X3.4-1986", "ASCII");
  867           fIANA2JavaMap.put("ISO_646.IRV:1991", "ASCII");
  868           fIANA2JavaMap.put("ASCII", "ASCII");
  869           fIANA2JavaMap.put("CSASCII", "ASCII");
  870           fIANA2JavaMap.put("ISO646-US", "ASCII");
  871           fIANA2JavaMap.put("US", "ASCII");
  872           fIANA2JavaMap.put("IBM367", "ASCII");
  873           fIANA2JavaMap.put("CP367", "ASCII");
  874           fIANA2JavaMap.put("UTF-8", "UTF8");
  875           fIANA2JavaMap.put("UTF-16", "Unicode");
  876           fIANA2JavaMap.put("UTF-16BE", "UnicodeBig");
  877           fIANA2JavaMap.put("UTF-16LE", "UnicodeLittle");
  878   
  879           entityMap.put("nbsp", new Character('\u00a0')); // no-break space = non-breaking space, U+00A0 ISOnum
  880           entityMap.put("iexcl", new Character('\u00a1')); // inverted exclamation mark, U+00A1 ISOnum
  881           entityMap.put("cent", new Character('\u00a2')); // cent sign, U+00A2 ISOnum
  882           entityMap.put("pound", new Character('\u00a3')); // pound sign, U+00A3 ISOnum
  883           entityMap.put("curren", new Character('\u00a4')); // currency sign, U+00A4 ISOnum
  884           entityMap.put("yen", new Character('\u00a5')); // yen sign = yuan sign, U+00A5 ISOnum
  885           entityMap.put("brvbar", new Character('\u00a6')); // broken bar = broken vertical bar, U+00A6 ISOnum
  886           entityMap.put("sect", new Character('\u00a7')); // section sign, U+00A7 ISOnum
  887           entityMap.put("uml", new Character('\u00a8')); // diaeresis = spacing diaeresis, U+00A8 ISOdia
  888           entityMap.put("copy", new Character('\u00a9')); // copyright sign, U+00A9 ISOnum
  889           entityMap.put("ordf", new Character('\u00aa')); // feminine ordinal indicator, U+00AA ISOnum
  890           entityMap.put("laquo", new Character('\u00ab')); // left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum
  891           entityMap.put("not", new Character('\u00ac')); // not sign, U+00AC ISOnum
  892           entityMap.put("shy", new Character('\u00ad')); // soft hyphen = discretionary hyphen, U+00AD ISOnum
  893           entityMap.put("reg", new Character('\u00ae')); // registered sign = registered trade mark sign, U+00AE ISOnum
  894           entityMap.put("macr", new Character('\u00af')); // macron = spacing macron = overline = APL overbar, U+00AF ISOdia
  895           entityMap.put("deg", new Character('\u00b0')); // degree sign, U+00B0 ISOnum
  896           entityMap.put("plusmn", new Character('\u00b1')); // plus-minus sign = plus-or-minus sign, U+00B1 ISOnum
  897           entityMap.put("sup2", new Character('\u00b2')); // superscript two = superscript digit two = squared, U+00B2 ISOnum
  898           entityMap.put("sup3", new Character('\u00b3')); // superscript three = superscript digit three = cubed, U+00B3 ISOnum
  899           entityMap.put("acute", new Character('\u00b4')); // acute accent = spacing acute, U+00B4 ISOdia
  900           entityMap.put("micro", new Character('\u00b5')); // micro sign, U+00B5 ISOnum
  901           entityMap.put("para", new Character('\u00b6')); // pilcrow sign = paragraph sign, U+00B6 ISOnum
  902           entityMap.put("middot", new Character('\u00b7')); // middle dot = Georgian comma = Greek middle dot, U+00B7 ISOnum
  903           entityMap.put("cedil", new Character('\u00b8')); // cedilla = spacing cedilla, U+00B8 ISOdia
  904           entityMap.put("sup1", new Character('\u00b9')); // superscript one = superscript digit one, U+00B9 ISOnum
  905           entityMap.put("ordm", new Character('\u00ba')); // masculine ordinal indicator, U+00BA ISOnum
  906           entityMap.put("raquo", new Character('\u00bb')); // right-pointing double angle quotation mark = right pointing guillemet, U+00BB ISOnum
  907           entityMap.put("frac14", new Character('\u00bc')); // vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum
  908           entityMap.put("frac12", new Character('\u00bd')); // vulgar fraction one half = fraction one half, U+00BD ISOnum
  909           entityMap.put("frac34", new Character('\u00be')); // vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum
  910           entityMap.put("iquest", new Character('\u00bf')); // inverted question mark = turned question mark, U+00BF ISOnum
  911           entityMap.put("Agrave", new Character('\u00c0')); // latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1
  912           entityMap.put("Aacute", new Character('\u00c1')); // latin capital letter A with acute, U+00C1 ISOlat1
  913           entityMap.put("Acirc", new Character('\u00c2')); // latin capital letter A with circumflex, U+00C2 ISOlat1
  914           entityMap.put("Atilde", new Character('\u00c3')); // latin capital letter A with tilde, U+00C3 ISOlat1
  915           entityMap.put("Auml", new Character('\u00c4')); // latin capital letter A with diaeresis, U+00C4 ISOlat1
  916           entityMap.put("Aring", new Character('\u00c5')); // latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1
  917           entityMap.put("AElig", new Character('\u00c6')); // latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1
  918           entityMap.put("Ccedil", new Character('\u00c7')); // latin capital letter C with cedilla, U+00C7 ISOlat1
  919           entityMap.put("Egrave", new Character('\u00c8')); // latin capital letter E with grave, U+00C8 ISOlat1
  920           entityMap.put("Eacute", new Character('\u00c9')); // latin capital letter E with acute, U+00C9 ISOlat1
  921           entityMap.put("Ecirc", new Character('\u00ca')); // latin capital letter E with circumflex, U+00CA ISOlat1
  922           entityMap.put("Euml", new Character('\u00cb')); // latin capital letter E with diaeresis, U+00CB ISOlat1
  923           entityMap.put("Igrave", new Character('\u00cc')); // latin capital letter I with grave, U+00CC ISOlat1
  924           entityMap.put("Iacute", new Character('\u00cd')); // latin capital letter I with acute, U+00CD ISOlat1
  925           entityMap.put("Icirc", new Character('\u00ce')); // latin capital letter I with circumflex, U+00CE ISOlat1
  926           entityMap.put("Iuml", new Character('\u00cf')); // latin capital letter I with diaeresis, U+00CF ISOlat1
  927           entityMap.put("ETH", new Character('\u00d0')); // latin capital letter ETH, U+00D0 ISOlat1
  928           entityMap.put("Ntilde", new Character('\u00d1')); // latin capital letter N with tilde, U+00D1 ISOlat1
  929           entityMap.put("Ograve", new Character('\u00d2')); // latin capital letter O with grave, U+00D2 ISOlat1
  930           entityMap.put("Oacute", new Character('\u00d3')); // latin capital letter O with acute, U+00D3 ISOlat1
  931           entityMap.put("Ocirc", new Character('\u00d4')); // latin capital letter O with circumflex, U+00D4 ISOlat1
  932           entityMap.put("Otilde", new Character('\u00d5')); // latin capital letter O with tilde, U+00D5 ISOlat1
  933           entityMap.put("Ouml", new Character('\u00d6')); // latin capital letter O with diaeresis, U+00D6 ISOlat1
  934           entityMap.put("times", new Character('\u00d7')); // multiplication sign, U+00D7 ISOnum
  935           entityMap.put("Oslash", new Character('\u00d8')); // latin capital letter O with stroke = latin capital letter O slash, U+00D8 ISOlat1
  936           entityMap.put("Ugrave", new Character('\u00d9')); // latin capital letter U with grave, U+00D9 ISOlat1
  937           entityMap.put("Uacute", new Character('\u00da')); // latin capital letter U with acute, U+00DA ISOlat1
  938           entityMap.put("Ucirc", new Character('\u00db')); // latin capital letter U with circumflex, U+00DB ISOlat1
  939           entityMap.put("Uuml", new Character('\u00dc')); // latin capital letter U with diaeresis, U+00DC ISOlat1
  940           entityMap.put("Yacute", new Character('\u00dd')); // latin capital letter Y with acute, U+00DD ISOlat1
  941           entityMap.put("THORN", new Character('\u00de')); // latin capital letter THORN, U+00DE ISOlat1
  942           entityMap.put("szlig", new Character('\u00df')); // latin small letter sharp s = ess-zed, U+00DF ISOlat1
  943           entityMap.put("agrave", new Character('\u00e0')); // latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
  944           entityMap.put("aacute", new Character('\u00e1')); // latin small letter a with acute, U+00E1 ISOlat1
  945           entityMap.put("acirc", new Character('\u00e2')); // latin small letter a with circumflex, U+00E2 ISOlat1
  946           entityMap.put("atilde", new Character('\u00e3')); // latin small letter a with tilde, U+00E3 ISOlat1
  947           entityMap.put("auml", new Character('\u00e4')); // latin small letter a with diaeresis, U+00E4 ISOlat1
  948           entityMap.put("aring", new Character('\u00e5')); // latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1
  949           entityMap.put("aelig", new Character('\u00e6')); // latin small letter ae = latin small ligature ae, U+00E6 ISOlat1
  950           entityMap.put("ccedil", new Character('\u00e7')); // latin small letter c with cedilla, U+00E7 ISOlat1
  951           entityMap.put("egrave", new Character('\u00e8')); // latin small letter e with grave, U+00E8 ISOlat1
  952           entityMap.put("eacute", new Character('\u00e9')); // latin small letter e with acute, U+00E9 ISOlat1
  953           entityMap.put("ecirc", new Character('\u00ea')); // latin small letter e with circumflex, U+00EA ISOlat1
  954           entityMap.put("euml", new Character('\u00eb')); // latin small letter e with diaeresis, U+00EB ISOlat1
  955           entityMap.put("igrave", new Character('\u00ec')); // latin small letter i with grave, U+00EC ISOlat1
  956           entityMap.put("iacute", new Character('\u00ed')); // latin small letter i with acute, U+00ED ISOlat1
  957           entityMap.put("icirc", new Character('\u00ee')); // latin small letter i with circumflex, U+00EE ISOlat1
  958           entityMap.put("iuml", new Character('\u00ef')); // latin small letter i with diaeresis, U+00EF ISOlat1
  959           entityMap.put("eth", new Character('\u00f0')); // latin small letter eth, U+00F0 ISOlat1
  960           entityMap.put("ntilde", new Character('\u00f1')); // latin small letter n with tilde, U+00F1 ISOlat1
  961           entityMap.put("ograve", new Character('\u00f2')); // latin small letter o with grave, U+00F2 ISOlat1
  962           entityMap.put("oacute", new Character('\u00f3')); // latin small letter o with acute, U+00F3 ISOlat1
  963           entityMap.put("ocirc", new Character('\u00f4')); // latin small letter o with circumflex, U+00F4 ISOlat1
  964           entityMap.put("otilde", new Character('\u00f5')); // latin small letter o with tilde, U+00F5 ISOlat1
  965           entityMap.put("ouml", new Character('\u00f6')); // latin small letter o with diaeresis, U+00F6 ISOlat1
  966           entityMap.put("divide", new Character('\u00f7')); // division sign, U+00F7 ISOnum
  967           entityMap.put("oslash", new Character('\u00f8')); // latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1
  968           entityMap.put("ugrave", new Character('\u00f9')); // latin small letter u with grave, U+00F9 ISOlat1
  969           entityMap.put("uacute", new Character('\u00fa')); // latin small letter u with acute, U+00FA ISOlat1
  970           entityMap.put("ucirc", new Character('\u00fb')); // latin small letter u with circumflex, U+00FB ISOlat1
  971           entityMap.put("uuml", new Character('\u00fc')); // latin small letter u with diaeresis, U+00FC ISOlat1
  972           entityMap.put("yacute", new Character('\u00fd')); // latin small letter y with acute, U+00FD ISOlat1
  973           entityMap.put("thorn", new Character('\u00fe')); // latin small letter thorn, U+00FE ISOlat1
  974           entityMap.put("yuml", new Character('\u00ff')); // latin small letter y with diaeresis, U+00FF ISOlat1
  975           // Latin Extended-B
  976           entityMap.put("fnof", new Character('\u0192')); // latin small f with hook = function = florin, U+0192 ISOtech
  977           // Greek
  978           entityMap.put("Alpha", new Character('\u0391')); // greek capital letter alpha, U+0391
  979           entityMap.put("Beta", new Character('\u0392')); // greek capital letter beta, U+0392
  980           entityMap.put("Gamma", new Character('\u0393')); // greek capital letter gamma, U+0393 ISOgrk3
  981           entityMap.put("Delta", new Character('\u0394')); // greek capital letter delta, U+0394 ISOgrk3
  982           entityMap.put("Epsilon", new Character('\u0395')); // greek capital letter epsilon, U+0395
  983           entityMap.put("Zeta", new Character('\u0396')); // greek capital letter zeta, U+0396
  984           entityMap.put("Eta", new Character('\u0397')); // greek capital letter eta, U+0397
  985           entityMap.put("Theta", new Character('\u0398')); // greek capital letter theta, U+0398 ISOgrk3
  986           entityMap.put("Iota", new Character('\u0399')); // greek capital letter iota, U+0399
  987           entityMap.put("Kappa", new Character('\u039a')); // greek capital letter kappa, U+039A
  988           entityMap.put("Lambda", new Character('\u039b')); // greek capital letter lambda, U+039B ISOgrk3
  989           entityMap.put("Mu", new Character('\u039c')); // greek capital letter mu, U+039C
  990           entityMap.put("Nu", new Character('\u039d')); // greek capital letter nu, U+039D
  991           entityMap.put("Xi", new Character('\u039e')); // greek capital letter xi, U+039E ISOgrk3
  992           entityMap.put("Omicron", new Character('\u039f')); // greek capital letter omicron, U+039F
  993           entityMap.put("Pi", new Character('\u03a0')); // greek capital letter pi, U+03A0 ISOgrk3
  994           entityMap.put("Rho", new Character('\u03a1')); // greek capital letter rho, U+03A1
  995           // there is no Sigmaf, and no U+03A2 character either
  996           entityMap.put("Sigma", new Character('\u03a3')); // greek capital letter sigma, U+03A3 ISOgrk3
  997           entityMap.put("Tau", new Character('\u03a4')); // greek capital letter tau, U+03A4
  998           entityMap.put("Upsilon", new Character('\u03a5')); // greek capital letter upsilon, U+03A5 ISOgrk3
  999           entityMap.put("Phi", new Character('\u03a6')); // greek capital letter phi, U+03A6 ISOgrk3
 1000           entityMap.put("Chi", new Character('\u03a7')); // greek capital letter chi, U+03A7
 1001           entityMap.put("Psi", new Character('\u03a8')); // greek capital letter psi, U+03A8 ISOgrk3
 1002           entityMap.put("Omega", new Character('\u03a9')); // greek capital letter omega, U+03A9 ISOgrk3
 1003           entityMap.put("alpha", new Character('\u03b1')); // greek small letter alpha, U+03B1 ISOgrk3
 1004           entityMap.put("beta", new Character('\u03b2')); // greek small letter beta, U+03B2 ISOgrk3
 1005           entityMap.put("gamma", new Character('\u03b3')); // greek small letter gamma, U+03B3 ISOgrk3
 1006           entityMap.put("delta", new Character('\u03b4')); // greek small letter delta, U+03B4 ISOgrk3
 1007           entityMap.put("epsilon", new Character('\u03b5')); // greek small letter epsilon, U+03B5 ISOgrk3
 1008           entityMap.put("zeta", new Character('\u03b6')); // greek small letter zeta, U+03B6 ISOgrk3
 1009           entityMap.put("eta", new Character('\u03b7')); // greek small letter eta, U+03B7 ISOgrk3
 1010           entityMap.put("theta", new Character('\u03b8')); // greek small letter theta, U+03B8 ISOgrk3
 1011           entityMap.put("iota", new Character('\u03b9')); // greek small letter iota, U+03B9 ISOgrk3
 1012           entityMap.put("kappa", new Character('\u03ba')); // greek small letter kappa, U+03BA ISOgrk3
 1013           entityMap.put("lambda", new Character('\u03bb')); // greek small letter lambda, U+03BB ISOgrk3
 1014           entityMap.put("mu", new Character('\u03bc')); // greek small letter mu, U+03BC ISOgrk3
 1015           entityMap.put("nu", new Character('\u03bd')); // greek small letter nu, U+03BD ISOgrk3
 1016           entityMap.put("xi", new Character('\u03be')); // greek small letter xi, U+03BE ISOgrk3
 1017           entityMap.put("omicron", new Character('\u03bf')); // greek small letter omicron, U+03BF NEW
 1018           entityMap.put("pi", new Character('\u03c0')); // greek small letter pi, U+03C0 ISOgrk3
 1019           entityMap.put("rho", new Character('\u03c1')); // greek small letter rho, U+03C1 ISOgrk3
 1020           entityMap.put("sigmaf", new Character('\u03c2')); // greek small letter final sigma, U+03C2 ISOgrk3
 1021           entityMap.put("sigma", new Character('\u03c3')); // greek small letter sigma, U+03C3 ISOgrk3
 1022           entityMap.put("tau", new Character('\u03c4')); // greek small letter tau, U+03C4 ISOgrk3
 1023           entityMap.put("upsilon", new Character('\u03c5')); // greek small letter upsilon, U+03C5 ISOgrk3
 1024           entityMap.put("phi", new Character('\u03c6')); // greek small letter phi, U+03C6 ISOgrk3
 1025           entityMap.put("chi", new Character('\u03c7')); // greek small letter chi, U+03C7 ISOgrk3
 1026           entityMap.put("psi", new Character('\u03c8')); // greek small letter psi, U+03C8 ISOgrk3
 1027           entityMap.put("omega", new Character('\u03c9')); // greek small letter omega, U+03C9 ISOgrk3
 1028           entityMap.put("thetasym", new Character('\u03d1')); // greek small letter theta symbol, U+03D1 NEW
 1029           entityMap.put("upsih", new Character('\u03d2')); // greek upsilon with hook symbol, U+03D2 NEW
 1030           entityMap.put("piv", new Character('\u03d6')); // greek pi symbol, U+03D6 ISOgrk3
 1031           // General Punctuation
 1032           entityMap.put("bull", new Character('\u2022')); // bullet = black small circle, U+2022 ISOpub
 1033           // bullet is NOT the same as bullet operator, U+2219
 1034           entityMap.put("hellip", new Character('\u2026')); // horizontal ellipsis = three dot leader, U+2026 ISOpub
 1035           entityMap.put("prime", new Character('\u2032')); // prime = minutes = feet, U+2032 ISOtech
 1036           entityMap.put("Prime", new Character('\u2033')); // double prime = seconds = inches, U+2033 ISOtech
 1037           entityMap.put("oline", new Character('\u203e')); // overline = spacing overscore, U+203E NEW
 1038           entityMap.put("frasl", new Character('\u2044')); // fraction slash, U+2044 NEW
 1039           // Letterlike Symbols
 1040           entityMap.put("weierp", new Character('\u2118')); // script capital P = power set = Weierstrass p, U+2118 ISOamso
 1041           entityMap.put("image", new Character('\u2111')); // blackletter capital I = imaginary part, U+2111 ISOamso
 1042           entityMap.put("real", new Character('\u211c')); // blackletter capital R = real part symbol, U+211C ISOamso
 1043           entityMap.put("trade", new Character('\u2122')); // trade mark sign, U+2122 ISOnum
 1044           entityMap.put("alefsym", new Character('\u2135')); // alef symbol = first transfinite cardinal, U+2135 NEW
 1045           // alef symbol is NOT the same as hebrew letter alef,
 1046           // U+05D0 although the same glyph could be used to depict both characters
 1047           // Arrows
 1048           entityMap.put("larr", new Character('\u2190')); // leftwards arrow, U+2190 ISOnum
 1049           entityMap.put("uarr", new Character('\u2191')); // upwards arrow, U+2191 ISOnum
 1050           entityMap.put("rarr", new Character('\u2192')); // rightwards arrow, U+2192 ISOnum
 1051           entityMap.put("darr", new Character('\u2193')); // downwards arrow, U+2193 ISOnum
 1052           entityMap.put("harr", new Character('\u2194')); // left right arrow, U+2194 ISOamsa
 1053           entityMap.put("crarr", new Character('\u21b5')); // downwards arrow with corner leftwards = carriage return, U+21B5 NEW
 1054           entityMap.put("lArr", new Character('\u21d0')); // leftwards double arrow, U+21D0 ISOtech
 1055           // ISO 10646 does not say that lArr is the same as the 'is implied by' arrow
 1056           // but also does not have any other character for that function. So lArr can
 1057           // be used for 'is implied by' as ISOtech suggests
 1058           entityMap.put("uArr", new Character('\u21d1')); // upwards double arrow, U+21D1 ISOamsa
 1059           entityMap.put("rArr", new Character('\u21d2')); // rightwards double arrow, U+21D2 ISOtech
 1060           // ISO 10646 does not say this is the 'implies' character but does not have 
 1061           // another character with this function, so
 1062           // rArr can be used for 'implies' as ISOtech suggests
 1063           entityMap.put("dArr", new Character('\u21d3')); // downwards double arrow, U+21D3 ISOamsa
 1064           entityMap.put("hArr", new Character('\u21d4')); // left right double arrow, U+21D4 ISOamsa
 1065           // Mathematical Operators
 1066           entityMap.put("forall", new Character('\u2200')); // for all, U+2200 ISOtech
 1067           entityMap.put("part", new Character('\u2202')); // partial differential, U+2202 ISOtech
 1068           entityMap.put("exist", new Character('\u2203')); // there exists, U+2203 ISOtech
 1069           entityMap.put("empty", new Character('\u2205')); // empty set = null set = diameter, U+2205 ISOamso
 1070           entityMap.put("nabla", new Character('\u2207')); // nabla = backward difference, U+2207 ISOtech
 1071           entityMap.put("isin", new Character('\u2208')); // element of, U+2208 ISOtech
 1072           entityMap.put("notin", new Character('\u2209')); // not an element of, U+2209 ISOtech
 1073           entityMap.put("ni", new Character('\u220b')); // contains as member, U+220B ISOtech
 1074           // should there be a more memorable name than 'ni'?
 1075           entityMap.put("prod", new Character('\u220f')); // n-ary product = product sign, U+220F ISOamsb
 1076           // prod is NOT the same character as U+03A0 'greek capital letter pi' though
 1077           // the same glyph might be used for both
 1078           entityMap.put("sum", new Character('\u2211')); // n-ary sumation, U+2211 ISOamsb
 1079           // sum is NOT the same character as U+03A3 'greek capital letter sigma'
 1080           // though the same glyph might be used for both
 1081           entityMap.put("minus", new Character('\u2212')); // minus sign, U+2212 ISOtech
 1082           entityMap.put("lowast", new Character('\u2217')); // asterisk operator, U+2217 ISOtech
 1083           entityMap.put("radic", new Character('\u221a')); // square root = radical sign, U+221A ISOtech
 1084           entityMap.put("prop", new Character('\u221d')); // proportional to, U+221D ISOtech
 1085           entityMap.put("infin", new Character('\u221e')); // infinity, U+221E ISOtech
 1086           entityMap.put("ang", new Character('\u2220')); // angle, U+2220 ISOamso
 1087           entityMap.put("and", new Character('\u2227')); // logical and = wedge, U+2227 ISOtech
 1088           entityMap.put("or", new Character('\u2228')); // logical or = vee, U+2228 ISOtech
 1089           entityMap.put("cap", new Character('\u2229')); // intersection = cap, U+2229 ISOtech
 1090           entityMap.put("cup", new Character('\u222a')); // union = cup, U+222A ISOtech
 1091           entityMap.put("int", new Character('\u222b')); // integral, U+222B ISOtech
 1092           entityMap.put("there4", new Character('\u2234')); // therefore, U+2234 ISOtech
 1093           entityMap.put("sim", new Character('\u223c')); // tilde operator = varies with = similar to, U+223C ISOtech
 1094           // tilde operator is NOT the same character as the tilde, U+007E,
 1095           // although the same glyph might be used to represent both
 1096           entityMap.put("cong", new Character('\u2245')); // approximately equal to, U+2245 ISOtech
 1097           entityMap.put("asymp", new Character('\u2248')); // almost equal to = asymptotic to, U+2248 ISOamsr
 1098           entityMap.put("ne", new Character('\u2260')); // not equal to, U+2260 ISOtech
 1099           entityMap.put("equiv", new Character('\u2261')); // identical to, U+2261 ISOtech
 1100           entityMap.put("le", new Character('\u2264')); // less-than or equal to, U+2264 ISOtech
 1101           entityMap.put("ge", new Character('\u2265')); // greater-than or equal to, U+2265 ISOtech
 1102           entityMap.put("sub", new Character('\u2282')); // subset of, U+2282 ISOtech
 1103           entityMap.put("sup", new Character('\u2283')); // superset of, U+2283 ISOtech
 1104           // note that nsup, 'not a superset of, U+2285' is not covered by the Symbol 
 1105           // font encoding and is not included. Should it be, for symmetry?
 1106           // It is in ISOamsn
 1107           entityMap.put("nsub", new Character('\u2284')); // not a subset of, U+2284 ISOamsn
 1108           entityMap.put("sube", new Character('\u2286')); // subset of or equal to, U+2286 ISOtech
 1109           entityMap.put("supe", new Character('\u2287')); // superset of or equal to, U+2287 ISOtech
 1110           entityMap.put("oplus", new Character('\u2295')); // circled plus = direct sum, U+2295 ISOamsb
 1111           entityMap.put("otimes", new Character('\u2297')); // circled times = vector product, U+2297 ISOamsb
 1112           entityMap.put("perp", new Character('\u22a5')); // up tack = orthogonal to = perpendicular, U+22A5 ISOtech
 1113           entityMap.put("sdot", new Character('\u22c5')); // dot operator, U+22C5 ISOamsb
 1114           // dot operator is NOT the same character as U+00B7 middle dot
 1115           // Miscellaneous Technical
 1116           entityMap.put("lceil", new Character('\u2308')); // left ceiling = apl upstile, U+2308 ISOamsc
 1117           entityMap.put("rceil", new Character('\u2309')); // right ceiling, U+2309 ISOamsc
 1118           entityMap.put("lfloor", new Character('\u230a')); // left floor = apl downstile, U+230A ISOamsc
 1119           entityMap.put("rfloor", new Character('\u230b')); // right floor, U+230B ISOamsc
 1120           entityMap.put("lang", new Character('\u2329')); // left-pointing angle bracket = bra, U+2329 ISOtech
 1121           // lang is NOT the same character as U+003C 'less than' 
 1122           // or U+2039 'single left-pointing angle quotation mark'
 1123           entityMap.put("rang", new Character('\u232a')); // right-pointing angle bracket = ket, U+232A ISOtech
 1124           // rang is NOT the same character as U+003E 'greater than' 
 1125           // or U+203A 'single right-pointing angle quotation mark'
 1126           // Geometric Shapes
 1127           entityMap.put("loz", new Character('\u25ca')); // lozenge, U+25CA ISOpub
 1128           // Miscellaneous Symbols
 1129           entityMap.put("spades", new Character('\u2660')); // black spade suit, U+2660 ISOpub
 1130           // black here seems to mean filled as opposed to hollow
 1131           entityMap.put("clubs", new Character('\u2663')); // black club suit = shamrock, U+2663 ISOpub
 1132           entityMap.put("hearts", new Character('\u2665')); // black heart suit = valentine, U+2665 ISOpub
 1133           entityMap.put("diams", new Character('\u2666')); // black diamond suit, U+2666 ISOpub
 1134           // C0 Controls and Basic Latin
 1135           entityMap.put("quot", new Character('\u0022')); // quotation mark = APL quote, U+0022 ISOnum
 1136           entityMap.put("amp", new Character('\u0026')); // ampersand, U+0026 ISOnum
 1137           entityMap.put("apos", new Character('\''));
 1138           entityMap.put("lt", new Character('\u003c')); // less-than sign, U+003C ISOnum
 1139           entityMap.put("gt", new Character('\u003e')); // greater-than sign, U+003E ISOnum
 1140           // Latin Extended-A
 1141           entityMap.put("OElig", new Character('\u0152')); // latin capital ligature OE, U+0152 ISOlat2
 1142           entityMap.put("oelig", new Character('\u0153')); // latin small ligature oe, U+0153 ISOlat2
 1143           // ligature is a misnomer, this is a separate character in some languages
 1144           entityMap.put("Scaron", new Character('\u0160')); // latin capital letter S with caron, U+0160 ISOlat2
 1145           entityMap.put("scaron", new Character('\u0161')); // latin small letter s with caron, U+0161 ISOlat2
 1146           entityMap.put("Yuml", new Character('\u0178')); // latin capital letter Y with diaeresis, U+0178 ISOlat2
 1147           // Spacing Modifier Letters
 1148           entityMap.put("circ", new Character('\u02c6')); // modifier letter circumflex accent, U+02C6 ISOpub
 1149           entityMap.put("tilde", new Character('\u02dc')); // small tilde, U+02DC ISOdia
 1150           // General Punctuation
 1151           entityMap.put("ensp", new Character('\u2002')); // en space, U+2002 ISOpub
 1152           entityMap.put("emsp", new Character('\u2003')); // em space, U+2003 ISOpub
 1153           entityMap.put("thinsp", new Character('\u2009')); // thin space, U+2009 ISOpub
 1154           entityMap.put("zwnj", new Character('\u200c')); // zero width non-joiner, U+200C NEW RFC 2070
 1155           entityMap.put("zwj", new Character('\u200d')); // zero width joiner, U+200D NEW RFC 2070
 1156           entityMap.put("lrm", new Character('\u200e')); // left-to-right mark, U+200E NEW RFC 2070
 1157           entityMap.put("rlm", new Character('\u200f')); // right-to-left mark, U+200F NEW RFC 2070
 1158           entityMap.put("ndash", new Character('\u2013')); // en dash, U+2013 ISOpub
 1159           entityMap.put("mdash", new Character('\u2014')); // em dash, U+2014 ISOpub
 1160           entityMap.put("lsquo", new Character('\u2018')); // left single quotation mark, U+2018 ISOnum
 1161           entityMap.put("rsquo", new Character('\u2019')); // right single quotation mark, U+2019 ISOnum
 1162           entityMap.put("sbquo", new Character('\u201a')); // single low-9 quotation mark, U+201A NEW
 1163           entityMap.put("ldquo", new Character('\u201c')); // left double quotation mark, U+201C ISOnum
 1164           entityMap.put("rdquo", new Character('\u201d')); // right double quotation mark, U+201D ISOnum
 1165           entityMap.put("bdquo", new Character('\u201e')); // double low-9 quotation mark, U+201E NEW
 1166           entityMap.put("dagger", new Character('\u2020')); // dagger, U+2020 ISOpub
 1167           entityMap.put("Dagger", new Character('\u2021')); // double dagger, U+2021 ISOpub
 1168           entityMap.put("permil", new Character('\u2030')); // per mille sign, U+2030 ISOtech
 1169           entityMap.put("lsaquo", new Character('\u2039')); // single left-pointing angle quotation mark, U+2039 ISO proposed
 1170           // lsaquo is proposed but not yet ISO standardized
 1171           entityMap.put("rsaquo", new Character('\u203a')); // single right-pointing angle quotation mark, U+203A ISO proposed
 1172           // rsaquo is proposed but not yet ISO standardized
 1173           entityMap.put("euro", new Character('\u20ac')); // euro sign, U+20AC NEW
 1174       
 1175       
 1176       }
 1177   }

Save This Page
Home » iText-src-2.1.3 » com.lowagie » text » pdf » [javadoc | source]