Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/enhydra/xml/io/MkEncodingsTable.java


1   /*
2    * Enhydra Java Application Server Project
3    * 
4    * The contents of this file are subject to the Enhydra Public License
5    * Version 1.1 (the "License"); you may not use this file except in
6    * compliance with the License. You may obtain a copy of the License on
7    * the Enhydra web site ( http://www.enhydra.org/ ).
8    * 
9    * Software distributed under the License is distributed on an "AS IS"
10   * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See 
11   * the License for the specific terms governing rights and limitations
12   * under the License.
13   * 
14   * The Initial Developer of the Enhydra Application Server is Lutris
15   * Technologies, Inc. The Enhydra Application Server and portions created
16   * by Lutris Technologies, Inc. are Copyright Lutris Technologies, Inc.
17   * All Rights Reserved.
18   * 
19   * Contributor(s):
20   * 
21   * $Id: MkEncodingsTable.java,v 1.1.2.3 2001/02/12 04:43:31 markd Exp $
22   */
23  package org.enhydra.xml.io;
24  
25  import java.util.HashMap;
26  import java.util.HashSet;
27  import java.util.ArrayList;
28  import java.util.Iterator;
29  import java.io.BufferedReader;
30  import java.io.BufferedWriter;
31  import java.io.PrintWriter;
32  import java.io.FileReader;
33  import java.io.FileWriter;
34  import java.io.IOException;
35  
36  //FIXME: Need to convert the file to XML.
37  
38  /**
39   * Generate a file contain character encodings by parsing
40   * the IANA Charset Registry, obtained from:
41   * <br>
42   * <a href="ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets">
43   *   <tt>ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets</tt></a>
44   * <br>
45   * This is a development-time tool, with special hacks to make up for
46   * various java encoding names not in the table.
47   * The resulting file has the format:
48   * <pre>
49   * name bits mime-name alias1 alias2 ...
50   * </pre>
51   * Where bits is 7, 8 or 16 and mime-name can be "null" if there is none
52   * defined.  This file will be converted to XML in a future release.
53   */
54  public final class MkEncodingsTable {
55      /**
56       * Resource name of IANA Charset Registry file.
57       */
58      private final String CHAR_SET_REGISTRY = "character-sets";
59  
60      /**
61       * Character set table that is created
62       */
63      private final String CHAR_SET_TABLE = "character-sets.tbl";
64  
65      /**
66       * Labels in registry file.
67       */
68      private final String REG_NAME_FIELD = "Name:";
69      private final String REG_ALIAS_FIELD = "Alias:";
70  
71      //FIXME: these encodings lists are not complete..
72      /**
73       * 7 bit encoding names.
74       */
75      private static final String[] ENCODINGS_7BIT = {
76          "ANSI_X3.4-1968",
77          "T.61-7bit"
78      };
79  
80      /**
81       * 8 bit encoding names.
82       */
83      private static final String[] ENCODINGS_8BIT = {
84          "T.61-8bit",
85          "UNKNOWN-8BIT",
86          "PC8-Danish-Norwegian",
87          "PC8-Turkish",
88          "ISO_8859-1:1987",
89          "ISO_8859-2:1987",
90          "ISO_8859-3:1988",
91          "ISO_8859-4:1988",
92          "ISO_8859-6:1987",
93          "ISO_8859-6-E",
94          "ISO_8859-6-I",
95          "ISO_8859-7:1987",
96          "ISO_8859-8:1988",
97          "ISO_8859-8-E",
98          "ISO_8859-8-I",
99          "ISO_8859-5:1988",
100         "ISO_8859-9:1989",
101         "ISO_8859-supp",
102         "ISO-8859-10",
103         "ISO-8859-15",
104         "ISO-8859-1-Windows-3.0-Latin-1",
105         "ISO-8859-1-Windows-3.1-Latin-1",
106         "ISO-8859-2-Windows-Latin-2",
107         "ISO-8859-9-Windows-Latin-5",
108         "latin-greek",
109         "Latin-greek-1"
110     };
111 
112     /**
113      * Tables of known 7 & 8 bit encodings.
114      */
115     private static final HashSet f7BitEncodings = new HashSet();
116     private static final HashSet f8BitEncodings = new HashSet();
117 
118     /**
119      * Pattern indicating the preferred MIME name.
120      */
121     private static final String REG_MIME_PREFERRED = "preferred MIME name";
122 
123     /**
124      * Table of aliases to add.
125      */
126     private static final String[][] HACKED_ALIASES = {
127         {"UTF-8", "UTF8"},
128         {"ANSI_X3.4-1968", "646"},
129     };
130 
131     /**
132      * Table of prefix conversions.  An alias is created for names/aliases
133      * that match the first prefix, with the second subsitutued.
134      */
135     private static final String[][] HACKED_PREFIXES = {
136         {"windows-", "Cp"},
137         {"ISO_8859-", "ISO8859-"},
138         {"ISO-8859-", "ISO8859_"}
139     };
140 
141     /**
142      * Extra entries to output, with no other hacks available to get them.
143      */
144     private static String[] EXTRA_ENTRIES = {
145         "UnicodeBig 16 null",
146         "UnicodeBigUnmarked 16 null",
147         "UnicodeLittle 16 null",
148         "UnicodeLittleUnmarked 16 null",
149         "UTF-16 16 null UTF16"
150     };
151 
152     /**
153      * Class initializer.
154      */
155     static {
156         for (int idx = 0; idx < ENCODINGS_7BIT.length; idx++) {
157             f7BitEncodings.add(ENCODINGS_7BIT[idx]);
158         }
159         for (int idx = 0; idx < ENCODINGS_8BIT.length; idx++) {
160             f8BitEncodings.add(ENCODINGS_8BIT[idx]);
161         }
162     }
163 
164     /**
165      * Generate an error about parsing a line in registry.
166      */
167     private void ianaParseError(String msg,
168                                 String line) {
169         throw new XMLIOError(msg + "; parsing line in " + CHAR_SET_REGISTRY
170                              + "\"" + line + "\"");
171     }
172 
173 
174     /**
175      * Get the encoding size.  Returning 7, 8, or 16.  This makes a guess
176      * based on some encoded knowledge.  If not known, returns 16.
177      */
178     private int getCharSize(String encoding) {
179         if (f7BitEncodings.contains(encoding)) {
180             return 7;
181         } else if (f8BitEncodings.contains(encoding)) {
182             return 8;
183         } else {
184             return 16;
185         }
186     }
187 
188     /**
189      * Extract a encoding name out of a Name: or Alias: line.  Returns
190      * null if empty.
191      */
192     private String parseName(String line) {
193         int len = line.length();
194 
195         // Get next char after index.
196         int startIdx = line.indexOf(':');
197         if (startIdx < 0) {
198             ianaParseError("no `:' found", line);
199         }
200         startIdx++;
201 
202         // Skip spaces
203         while ((startIdx < len) && (line.charAt(startIdx) == ' ')) {
204             startIdx++;
205         }
206         
207         // Find end
208         int endIdx = startIdx;
209         while ((endIdx < len) && (line.charAt(endIdx) != ' ')) {
210             endIdx++;
211         }
212         if (endIdx <= startIdx) {
213             return null;
214         } else {
215             return line.substring(startIdx, endIdx).intern();
216         }
217     }
218 
219     /**
220      * Determine if a line contains the preferred MIME encoding.
221      */
222     private boolean isMimePreferredEntry(String line) {
223         return (line.indexOf(REG_MIME_PREFERRED) >= 0);
224     }
225 
226     /**
227      * Add a alias to the list of aliases, if its not null or not already
228      * there.
229      */
230     private void addAlias(ArrayList aliases,
231                           String alias) {
232         if ((alias != null) && !aliases.contains(alias)) {
233             aliases.add(alias);
234         }
235     }
236 
237     /**
238      * Do special hacked mapping of name/aliases to other aliases.  This
239      * handles alisas not in registry
240      */
241     private void makeHackedAliases(ArrayList aliases,
242                                    String name) {
243         // Hacks based on alias.
244         for (int idx = 0; idx < HACKED_ALIASES.length; idx++) {
245             String[] mapping = HACKED_ALIASES[idx];
246             if (name.equals(mapping[0])) {
247                 addAlias(aliases, mapping[1]);
248             }
249         }
250 
251         // Hacks based on prefix.
252         for (int idx = 0; idx < HACKED_PREFIXES.length; idx++) {
253             String[] mapping = HACKED_PREFIXES[idx];
254             if (name.startsWith(mapping[0])) {
255                 addAlias(aliases,
256                          mapping[1] + name.substring(mapping[0].length()));
257             }
258         }
259     }
260 
261     /**
262      * Scan the input stream for the next encoding entry and parse that
263      * entry and write a record.
264      */
265     private boolean parseCharSetEntry(BufferedReader in,
266                                       PrintWriter out) throws IOException {
267         ArrayList aliases = new ArrayList();
268         String mimePreferred = null;
269         String line = null;
270 
271         // Scan for next Name: entry
272         while ((line = in.readLine()) != null) {
273             if (line.startsWith(REG_NAME_FIELD)) {
274                 break;
275             }
276         }
277         if (line == null) {
278             return false; // EOF
279         }
280         String name = parseName(line);
281         if (name == null) {
282             ianaParseError("no name parsed", line);
283         }
284         if (isMimePreferredEntry(line)){ 
285             mimePreferred = name;
286         }
287 
288         // Handle stuff missing from registry
289         makeHackedAliases(aliases, name);
290         
291         // Parse Alias: entries, scanning until a blank line or EOF.
292         while (((line = in.readLine()) != null)
293                && (line.trim().length() > 0)) {
294             if (line.startsWith(REG_ALIAS_FIELD)) {
295                 String alias = parseName(line);
296                 if (alias != null) {
297                     addAlias(aliases, alias);
298                     makeHackedAliases(aliases, alias);
299                     if (isMimePreferredEntry(line)){ 
300                         mimePreferred = alias;
301                     }
302                 }
303             }
304         }
305         
306         // output entry
307         out.print(name);
308         out.print(' ');
309         out.print(getCharSize(name));
310         out.print(' ');
311         out.print(mimePreferred);
312         int len = aliases.size();
313         for (int idx = 0; idx < len; idx++) {
314             out.print(' ');
315             out.print(aliases.get(idx));
316         }
317         out.println();
318         return true;
319     }
320 
321     /**
322      * Parse the registry file.
323      */
324     private void parseIanaRegistry(BufferedReader in,
325                                    PrintWriter out) throws IOException {
326         while (parseCharSetEntry(in, out)) {
327             // Looping till eof
328         }
329     }
330 
331     /**
332      * Parse the registry file.
333      */
334     private void parseIanaRegistry() throws IOException {
335         BufferedReader in = new BufferedReader(new FileReader(CHAR_SET_REGISTRY));
336         PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(CHAR_SET_TABLE)));
337         parseIanaRegistry(in, out);
338 
339         for (int i = 0; i < EXTRA_ENTRIES.length; i++) {
340             out.println(EXTRA_ENTRIES[i]);
341         }
342         out.close();
343         in.close();
344     }
345 
346     /**
347      * Entry
348      */
349     public static void main(String[] args) throws IOException {
350         new MkEncodingsTable().parseIanaRegistry();
351     }
352 }