Source code: org/enhydra/xml/io/MkEncodingsTable.java
1 /*
2 * Enhydra Java Application Server Project
3 *
4 * The contents of this file are subject to the Enhydra Public License
5 * Version 1.1 (the "License"); you may not use this file except in
6 * compliance with the License. You may obtain a copy of the License on
7 * the Enhydra web site ( http://www.enhydra.org/ ).
8 *
9 * Software distributed under the License is distributed on an "AS IS"
10 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
11 * the License for the specific terms governing rights and limitations
12 * under the License.
13 *
14 * The Initial Developer of the Enhydra Application Server is Lutris
15 * Technologies, Inc. The Enhydra Application Server and portions created
16 * by Lutris Technologies, Inc. are Copyright Lutris Technologies, Inc.
17 * All Rights Reserved.
18 *
19 * Contributor(s):
20 *
21 * $Id: MkEncodingsTable.java,v 1.1.2.3 2001/02/12 04:43:31 markd Exp $
22 */
23 package org.enhydra.xml.io;
24
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.ArrayList;
28 import java.util.Iterator;
29 import java.io.BufferedReader;
30 import java.io.BufferedWriter;
31 import java.io.PrintWriter;
32 import java.io.FileReader;
33 import java.io.FileWriter;
34 import java.io.IOException;
35
36 //FIXME: Need to convert the generated file to XML.
37
38 /**
39 * Generate a file contain character encodings by parsing
40 * the IANA Charset Registry, obtained from:
41 * <br>
42 * <a href="ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets">
43 * <tt>ftp://ftp.isi.edu/in-notes/iana/assignments/character-sets</tt></a>
44 * <br>
45 * This is a development-time tool, with special hacks to make up for
46 * various java encoding names not in the table.
47 * The resulting file has the format:
48 * <pre>
49 * name bits mime-name alias1 alias2 ...
50 * </pre>
51 * Where bits is 7, 8 or 16 and mime-name can be "null" if there is none
52 * defined. This file will be converted to XML in a future release.
53 */
54 public final class MkEncodingsTable {
55 /**
56 * Resource name of IANA Charset Registry file.
57 */
58 private final String CHAR_SET_REGISTRY = "character-sets";
59
60 /**
61 * Character set table that is created
62 */
63 private final String CHAR_SET_TABLE = "character-sets.tbl";
64
65 /**
66 * Labels in registry file.
67 */
68 private final String REG_NAME_FIELD = "Name:";
69 private final String REG_ALIAS_FIELD = "Alias:";
70
71 //FIXME: these encodings lists are not complete..
72 /**
73 * 7 bit encoding names.
74 */
75 private static final String[] ENCODINGS_7BIT = {
76 "ANSI_X3.4-1968",
77 "T.61-7bit"
78 };
79
80 /**
81 * 8 bit encoding names.
82 */
83 private static final String[] ENCODINGS_8BIT = {
84 "T.61-8bit",
85 "UNKNOWN-8BIT",
86 "PC8-Danish-Norwegian",
87 "PC8-Turkish",
88 "ISO_8859-1:1987",
89 "ISO_8859-2:1987",
90 "ISO_8859-3:1988",
91 "ISO_8859-4:1988",
92 "ISO_8859-6:1987",
93 "ISO_8859-6-E",
94 "ISO_8859-6-I",
95 "ISO_8859-7:1987",
96 "ISO_8859-8:1988",
97 "ISO_8859-8-E",
98 "ISO_8859-8-I",
99 "ISO_8859-5:1988",
100 "ISO_8859-9:1989",
101 "ISO_8859-supp",
102 "ISO-8859-10",
103 "ISO-8859-15",
104 "ISO-8859-1-Windows-3.0-Latin-1",
105 "ISO-8859-1-Windows-3.1-Latin-1",
106 "ISO-8859-2-Windows-Latin-2",
107 "ISO-8859-9-Windows-Latin-5",
108 "latin-greek",
109 "Latin-greek-1"
110 };
111
112 /**
113 * Tables of known 7 & 8 bit encodings.
114 */
115 private static final HashSet f7BitEncodings = new HashSet();
116 private static final HashSet f8BitEncodings = new HashSet();
117
118 /**
119 * Pattern indicating the preferred MIME name.
120 */
121 private static final String REG_MIME_PREFERRED = "preferred MIME name";
122
123 /**
124 * Table of aliases to add.
125 */
126 private static final String[][] HACKED_ALIASES = {
127 {"UTF-8", "UTF8"},
128 {"ANSI_X3.4-1968", "646"},
129 };
130
131 /**
132 * Table of prefix conversions. An alias is created for names/aliases
133 * that match the first prefix, with the second subsitutued.
134 */
135 private static final String[][] HACKED_PREFIXES = {
136 {"windows-", "Cp"},
137 {"ISO_8859-", "ISO8859-"},
138 {"ISO-8859-", "ISO8859_"}
139 };
140
141 /**
142 * Extra entries to output, with no other hacks available to get them.
143 */
144 private static String[] EXTRA_ENTRIES = {
145 "UnicodeBig 16 null",
146 "UnicodeBigUnmarked 16 null",
147 "UnicodeLittle 16 null",
148 "UnicodeLittleUnmarked 16 null",
149 "UTF-16 16 null UTF16"
150 };
151
152 /**
153 * Class initializer.
154 */
155 static {
156 for (int idx = 0; idx < ENCODINGS_7BIT.length; idx++) {
157 f7BitEncodings.add(ENCODINGS_7BIT[idx]);
158 }
159 for (int idx = 0; idx < ENCODINGS_8BIT.length; idx++) {
160 f8BitEncodings.add(ENCODINGS_8BIT[idx]);
161 }
162 }
163
164 /**
165 * Generate an error about parsing a line in registry.
166 */
167 private void ianaParseError(String msg,
168 String line) {
169 throw new XMLIOError(msg + "; parsing line in " + CHAR_SET_REGISTRY
170 + "\"" + line + "\"");
171 }
172
173
174 /**
175 * Get the encoding size. Returning 7, 8, or 16. This makes a guess
176 * based on some encoded knowledge. If not known, returns 16.
177 */
178 private int getCharSize(String encoding) {
179 if (f7BitEncodings.contains(encoding)) {
180 return 7;
181 } else if (f8BitEncodings.contains(encoding)) {
182 return 8;
183 } else {
184 return 16;
185 }
186 }
187
188 /**
189 * Extract a encoding name out of a Name: or Alias: line. Returns
190 * null if empty.
191 */
192 private String parseName(String line) {
193 int len = line.length();
194
195 // Get next char after index.
196 int startIdx = line.indexOf(':');
197 if (startIdx < 0) {
198 ianaParseError("no `:' found", line);
199 }
200 startIdx++;
201
202 // Skip spaces
203 while ((startIdx < len) && (line.charAt(startIdx) == ' ')) {
204 startIdx++;
205 }
206
207 // Find end
208 int endIdx = startIdx;
209 while ((endIdx < len) && (line.charAt(endIdx) != ' ')) {
210 endIdx++;
211 }
212 if (endIdx <= startIdx) {
213 return null;
214 } else {
215 return line.substring(startIdx, endIdx).intern();
216 }
217 }
218
219 /**
220 * Determine if a line contains the preferred MIME encoding.
221 */
222 private boolean isMimePreferredEntry(String line) {
223 return (line.indexOf(REG_MIME_PREFERRED) >= 0);
224 }
225
226 /**
227 * Add a alias to the list of aliases, if its not null or not already
228 * there.
229 */
230 private void addAlias(ArrayList aliases,
231 String alias) {
232 if ((alias != null) && !aliases.contains(alias)) {
233 aliases.add(alias);
234 }
235 }
236
237 /**
238 * Do special hacked mapping of name/aliases to other aliases. This
239 * handles alisas not in registry
240 */
241 private void makeHackedAliases(ArrayList aliases,
242 String name) {
243 // Hacks based on alias.
244 for (int idx = 0; idx < HACKED_ALIASES.length; idx++) {
245 String[] mapping = HACKED_ALIASES[idx];
246 if (name.equals(mapping[0])) {
247 addAlias(aliases, mapping[1]);
248 }
249 }
250
251 // Hacks based on prefix.
252 for (int idx = 0; idx < HACKED_PREFIXES.length; idx++) {
253 String[] mapping = HACKED_PREFIXES[idx];
254 if (name.startsWith(mapping[0])) {
255 addAlias(aliases,
256 mapping[1] + name.substring(mapping[0].length()));
257 }
258 }
259 }
260
261 /**
262 * Scan the input stream for the next encoding entry and parse that
263 * entry and write a record.
264 */
265 private boolean parseCharSetEntry(BufferedReader in,
266 PrintWriter out) throws IOException {
267 ArrayList aliases = new ArrayList();
268 String mimePreferred = null;
269 String line = null;
270
271 // Scan for next Name: entry
272 while ((line = in.readLine()) != null) {
273 if (line.startsWith(REG_NAME_FIELD)) {
274 break;
275 }
276 }
277 if (line == null) {
278 return false; // EOF
279 }
280 String name = parseName(line);
281 if (name == null) {
282 ianaParseError("no name parsed", line);
283 }
284 if (isMimePreferredEntry(line)){
285 mimePreferred = name;
286 }
287
288 // Handle stuff missing from registry
289 makeHackedAliases(aliases, name);
290
291 // Parse Alias: entries, scanning until a blank line or EOF.
292 while (((line = in.readLine()) != null)
293 && (line.trim().length() > 0)) {
294 if (line.startsWith(REG_ALIAS_FIELD)) {
295 String alias = parseName(line);
296 if (alias != null) {
297 addAlias(aliases, alias);
298 makeHackedAliases(aliases, alias);
299 if (isMimePreferredEntry(line)){
300 mimePreferred = alias;
301 }
302 }
303 }
304 }
305
306 // output entry
307 out.print(name);
308 out.print(' ');
309 out.print(getCharSize(name));
310 out.print(' ');
311 out.print(mimePreferred);
312 int len = aliases.size();
313 for (int idx = 0; idx < len; idx++) {
314 out.print(' ');
315 out.print(aliases.get(idx));
316 }
317 out.println();
318 return true;
319 }
320
321 /**
322 * Parse the registry file.
323 */
324 private void parseIanaRegistry(BufferedReader in,
325 PrintWriter out) throws IOException {
326 while (parseCharSetEntry(in, out)) {
327 // Looping till eof
328 }
329 }
330
331 /**
332 * Parse the registry file.
333 */
334 private void parseIanaRegistry() throws IOException {
335 BufferedReader in = new BufferedReader(new FileReader(CHAR_SET_REGISTRY));
336 PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(CHAR_SET_TABLE)));
337 parseIanaRegistry(in, out);
338
339 for (int i = 0; i < EXTRA_ENTRIES.length; i++) {
340 out.println(EXTRA_ENTRIES[i]);
341 }
342 out.close();
343 in.close();
344 }
345
346 /**
347 * Entry
348 */
349 public static void main(String[] args) throws IOException {
350 new MkEncodingsTable().parseIanaRegistry();
351 }
352 }