Source code: org/htmlparser/util/Generate.java
1 // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/util/Generate.java,v 1.2 2004/02/11 02:16:59 woolfel Exp $
2 /*
3 * ====================================================================
4 * Copyright 2002-2004 The Apache Software Foundation.
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 */
19
20 // The developers of JMeter and Apache are grateful to the developers
21 // of HTMLParser for giving Apache Software Foundation a non-exclusive
22 // license. The performance benefits of HTMLParser are clear and the
23 // users of JMeter will benefit from the hard work of the HTMLParser
24 // team. For detailed information about HTMLParser, the project is
25 // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26 //
27 // HTMLParser was originally created by Somik Raha in 2000. Since then
28 // a healthy community of users has formed and helped refine the
29 // design so that it is able to tackle the difficult task of parsing
30 // dirty HTML. Derrick Oswald is the current lead developer and was kind
31 // enough to assist JMeter.
32 //
33 // This class was contributed by
34 // Derrick Oswald
35 //
36
37 package org.htmlparser.util;
38
39 import org.htmlparser.Node;
40 import org.htmlparser.Parser;
41 import org.htmlparser.RemarkNode;
42 import org.htmlparser.StringNode;
43 import org.htmlparser.tags.EndTag;
44 import org.htmlparser.tags.LinkTag;
45 import org.htmlparser.tags.Tag;
46
47 /**
48 * Create a character reference translation class source file.
49 * Usage:
50 * <pre>
51 * java -classpath .:lib/htmlparser.jar Generate > Translate.java
52 * </pre>
53 * Derived from HTMLStringFilter.java provided as an example with the
54 * htmlparser.jar file available at
55 * <a href="http://htmlparser.sourceforge.net">htmlparser.sourceforge.net</a>
56 * written by Somik Raha (
57 * <a href='mailto:somik@industriallogic.com?
58 * subject=htmlparser'>somik@industriallogic.com</a>
59 * <a href="http://industriallogic.com">http://industriallogic.com</a>).
60 * @author <a href='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>
61 */
62 public class Generate
63 {
/**
 * Parser used to read the entity definition page.
 */
protected Parser parser;

/**
 * Line separator taken from the <code>line.separator</code> system
 * property, with a plain newline as the fallback when it is not set.
 */
protected static final String nl = System.getProperty("line.separator", "\n");

/**
 * Create a Generate object.
 * Builds a new <code>Parser</code> for
 * <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>
 * and asks it to register its scanners.
 * @throws ParserException Propagated from creating the parser or
 * registering its scanners.
 */
public Generate() throws ParserException
{
    final String source = "http://www.w3.org/TR/REC-html40/sgml/entities.html";

    parser = new Parser(source);
    parser.registerScanners();
}
87
88 /**
89 * Translate character references.
90 * After generating the Translate class we could use it
91 * to do this job, but that would involve a bootstrap
92 * problem, so this method does the reference conversion
93 * for a very tiny subset (enough to understand the w3.org
94 * page).
95 * @param string The raw string.
96 * @return The string with character references fixed.
97 */
98 public String translate(String string)
99 {
100 int index;
101 int amp;
102 StringBuffer ret;
103
104 ret = new StringBuffer(4096);
105
106 index = 0;
107 while ((index < string.length())
108 && (-1 != (amp = string.indexOf('&', index))))
109 {
110 // include the part before the special character
111 ret.append(string.substring(index, amp));
112 if (string.startsWith(" ", amp))
113 {
114 ret.append(" ");
115 index = amp + 6;
116 }
117 else if (string.startsWith("<", amp))
118 {
119 ret.append("<");
120 index = amp + 4;
121 }
122 else if (string.startsWith(">", amp))
123 {
124 ret.append(">");
125 index = amp + 4;
126 }
127 else if (string.startsWith("&", amp))
128 {
129 ret.append("&");
130 index = amp + 5;
131 }
132 else if (string.startsWith(""e;", amp))
133 {
134 ret.append("\"");
135 index = amp + 7;
136 }
137 else if (string.startsWith("÷", amp))
138 {
139 ret.append('\u00F7');
140 index = amp + 8;
141 }
142 else if (string.startsWith("©", amp))
143 {
144 ret.append('\u00A9');
145 index = amp + 6;
146 }
147 else
148 {
149 System.out.println(
150 "unknown special character starting with "
151 + string.substring(amp, amp + 7));
152 ret.append("&");
153 index = amp + 1;
154 }
155 }
156 ret.append(string.substring(index));
157
158 return (ret.toString());
159 }
160
161 /**
162 * Pull out text elements from the HTML.
163 */
164 public void parse() throws ParserException
165 {
166 Node node;
167 StringBuffer buffer = new StringBuffer(4096);
168
169 // Run through an enumeration of html elements, and pick up
170 // only those that are plain string.
171 for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
172 {
173 node = e.nextNode();
174
175 if (node instanceof StringNode)
176 {
177 // Node is a plain string
178 // Cast it to an HTMLStringNode
179 StringNode stringNode = (StringNode) node;
180 // Retrieve the data from the object
181 buffer.append(stringNode.getText());
182 }
183 else if (node instanceof LinkTag)
184 {
185 // Node is a link
186 // Cast it to an HTMLLinkTag
187 LinkTag linkNode = (LinkTag) node;
188 // Retrieve the data from the object and print it
189 buffer.append(linkNode.getLinkText());
190 }
191 else if (node instanceof Tag)
192 {
193 String contents = ((Tag) node).getText();
194 if (contents.equals("BR") || contents.equals("P"))
195 buffer.append(nl);
196 }
197 else if (node instanceof EndTag)
198 {
199 String contents = ((EndTag) node).getText();
200 if (contents.equals("BR") || contents.equals("P"))
201 buffer.append(nl);
202 }
203 else if (node instanceof RemarkNode)
204 {
205 }
206 else
207 {
208 System.out.println();
209 System.out.println(node.toString());
210 }
211 }
212
213 String text = translate(buffer.toString());
214 sgml(text);
215 }
216
217 /**
218 * Find the lowest index of whitespace (space or newline).
219 * @param string The string to look in.
220 * @param index Where to start looking.
221 * @return -1 if there is no whitespace, the minimum index otherwise.
222 */
223 public int indexOfWhitespace(String string, int index)
224 {
225 int space;
226 int cr;
227 int ret;
228
229 space = string.indexOf(" ", index);
230 cr = string.indexOf(nl, index);
231 if (-1 == space)
232 ret = cr;
233 else if (-1 == cr)
234 ret = space;
235 else
236 ret = Math.min(space, cr);
237
238 return (ret);
239 }
240
241 /**
242 * Rewrite the comment string.
243 * In the sgml table, the comments are of the form:
244 * <pre>
245 * -- latin capital letter I with diaeresis,
246 * U+00CF ISOlat1
247 * </pre>
248 * so we just want to make a one-liner without the spaces and newlines.
249 * @param string The raw comment.
250 * @return The single line comment.
251 */
252 public String pack(String string)
253 {
254 int index;
255 int spaces;
256 StringBuffer ret;
257
258 ret = new StringBuffer(string.length());
259
260 if (string.startsWith("-- "))
261 string = string.substring(3);
262 // remove doublespaces
263 index = 0;
264 while ((index < string.length())
265 && (-1 != (spaces = indexOfWhitespace(string, index))))
266 {
267 ret.append(string.substring(index, spaces));
268 ret.append(" ");
269 while ((spaces < string.length())
270 && Character.isWhitespace(string.charAt(spaces)))
271 spaces++;
272 index = spaces;
273 }
274 if (index < string.length())
275 ret.append(string.substring(index));
276
277 return (ret.toString());
278 }
279
280 /**
281 * Pretty up a comment string.
282 * @param string The comment to operate on.
283 * @return The beautiful comment string.
284 */
285 public String pretty(String string)
286 {
287 int index;
288 int spaces;
289 StringBuffer ret;
290
291 ret = new StringBuffer(string.length());
292
293 // newline instead of doublespaces
294 index = 0;
295 while ((index < string.length())
296 && (-1 != (spaces = string.indexOf(" ", index))))
297 {
298 ret.append(" // " + string.substring(index, spaces));
299 if (!string.substring(index, spaces).endsWith(nl))
300 ret.append(nl);
301 while ((spaces < string.length())
302 && Character.isWhitespace(string.charAt(spaces)))
303 spaces++;
304 index = spaces;
305 }
306 if (index < string.length())
307 ret.append(" // " + string.substring(index));
308
309 return (ret.toString());
310 }
311
312 /**
313 * Pad a string on the left with the given character to the length specified.
314 * @param string The string to pad
315 * @param character The character to pad with.
316 * @param length The size to pad to.
317 * @return The padded string.
318 */
319 public String pad(String string, char character, int length)
320 {
321 StringBuffer ret;
322
323 ret = new StringBuffer(length);
324 ret.append(string);
325 while (length > ret.length())
326 ret.insert(0, character);
327
328 return (ret.toString());
329 }
330
331 /**
332 * Convert the textual representation of the numeric character reference to a character.
333 * @param string The numeric character reference (in quotes).
334 * @return The character represented by the numeric character reference.
335 *
336 */
337 public String unicode(String string)
338 {
339 int code;
340
341 if (string.startsWith("\"&#") && string.endsWith(";\""))
342 {
343 string = string.substring(3, string.length() - 2);
344 try
345 {
346 code = Integer.parseInt(string);
347 string =
348 "new Character ('\\u"
349 + pad(Integer.toHexString(code), '0', 4)
350 + "')";
351 }
352 catch (Exception e)
353 {
354 e.printStackTrace();
355 }
356 return (string);
357 }
358 else
359 return (string);
360 }
361
362 /**
363 * Parse the sgml declaration for character entity reference
364 * name, equivalent numeric character reference and a comment.
365 * Emit a java hash table 'put' with the name as the key, the
366 * numeric character as the value and comment the insertion
367 * with the comment.
368 * @param string The contents of the sgml declaration.
369 */
370 public void extract(String string)
371 {
372 int space;
373 String token;
374 String code;
375 int comment;
376 String description;
377
378 if (string.startsWith("<!--"))
379 System.out.println(
380 pretty(string.substring(4, string.length() - 3).trim()));
381 else if (string.startsWith("<!ENTITY"))
382 {
383 string = string.substring(8, string.length() - 3).trim();
384 if (-1 != (space = string.indexOf(" ")))
385 {
386 token = string.substring(0, space);
387 string = string.substring(space).trim();
388 if (string.startsWith("CDATA"))
389 {
390 string = string.substring(5).trim();
391 if (-1 != (space = string.indexOf(" ")))
392 {
393 code = string.substring(0, space).trim();
394 code = unicode(code);
395 string = string.substring(space).trim();
396 System
397 .out
398 .println(" mRefChar.put (\"" + token + "\","
399 // no token is larger than 8 characters - yet
400 +pad(code, ' ', code.length() + 9 - token.length())
401 + ");"
402 + " // "
403 + pack(string));
404 }
405 else
406 System.out.println(string);
407 }
408 else
409 System.out.println(string);
410 }
411 else
412 System.out.println(string);
413 }
414 else
415 System.out.println(string);
416 }
417
418 /**
419 * Extract special characters.
420 * Scan the string looking for substrings of the form:
421 * <pre>
422 * <!ENTITY nbsp CDATA "&#160;" -- no-break space = non-breaking space, U+00A0 ISOnum -->
423 * </pre>
424 * and emit a java definition for each.
425 * @param string The raw string from w3.org.
426 */
427 public void sgml(String string)
428 {
429 int index;
430 int begin;
431 int end;
432 StringBuffer ret;
433
434 ret = new StringBuffer(4096);
435
436 index = 0;
437 while (-1 != (begin = string.indexOf("<", index)))
438 {
439 if (-1 != (end = string.indexOf("-->", begin)))
440 {
441 extract(string.substring(begin, end + 3));
442 index = end + 3;
443 }
444 else
445 index = begin + 1;
446 }
447 }
448
449 /**
450 * Generator program.
451 * <pre>
452 * java -classpath .:lib/htmlparser.jar Generate > Translate.java
453 * </pre>
454 * @param args <em>Not used.</em>
455 */
456 public static void main(String[] args) throws ParserException
457 {
458 Generate filter = new Generate();
459 System.out.println("import java.util.Hashtable;");
460 System.out.println("import java.util.Iterator;");
461 System.out.println();
462 System.out.println("/**");
463 System.out.println(
464 " * Translate numeric character references and character entity references to unicode characters.");
465 System.out.println(
466 " * Based on tables found at <a href=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">");
467 System.out.println(
468 " * http://www.w3.org/TR/REC-html40/sgml/entities.html</a>");
469 System.out.println(
470 " * <p><b>Note: Do not edit! This class is created by the Generate class.</b>");
471 System.out.println(" * <p>Typical usage:");
472 System.out.println(" * <pre>");
473 System.out.println(
474 " * String s = Translate.decode (getTextFromHtmlPage ());");
475 System.out.println(" * </pre>");
476 System.out.println(
477 " * @author <a href='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>");
478 System.out.println(" */");
479 System.out.println("public class Translate");
480 System.out.println("{");
481 System.out.println(" /**");
482 System.out.println(
483 " * Table mapping entity reference kernel to character.");
484 System.out.println(
485 " * <p><code>String</code>-><code>Character</code>");
486 System.out.println(" */");
487 System.out.println(" protected static Hashtable mRefChar;");
488 System.out.println(" static");
489 System.out.println(" {");
490 System.out.println(" mRefChar = new Hashtable (1000);");
491 System.out.println();
492 filter.parse();
493 System.out.println(" }");
494 System.out.println();
495 System.out.println(" /**");
496 System.out.println(
497 " * Table mapping character to entity reference kernel.");
498 System.out.println(
499 " * <p><code>Character</code>-><code>String</code>");
500 System.out.println(" */");
501 System.out.println(" protected static Hashtable mCharRef;");
502 System.out.println(" static");
503 System.out.println(" {");
504 System.out.println(
505 " mCharRef = new Hashtable (mRefChar.size ());");
506 System.out.println();
507 System.out.println(
508 " Iterator iterator = mRefChar.keySet ().iterator ();");
509 System.out.println(" while (iterator.hasNext ())");
510 System.out.println(" {");
511 System.out.println(
512 " String key = (String)iterator.next ();");
513 System.out.println(
514 " Character character = (Character)mRefChar.get (key);");
515 System.out.println(" mCharRef.put (character, key);");
516 System.out.println(" }");
517 System.out.println(" }");
518 System.out.println();
519 System.out.println(" /**");
520 System.out.println(" * Private constructor.");
521 System.out.println(
522 " * This class is fully static and thread safe.");
523 System.out.println(" */");
524 System.out.println(" private Translate ()");
525 System.out.println(" {");
526 System.out.println(" }");
527 System.out.println();
528 System.out.println(" /**");
529 System.out.println(
530 " * Convert a reference to a unicode character.");
531 System.out.println(
532 " * Convert a single numeric character reference or character entity reference");
533 System.out.println(" * to a unicode character.");
534 System.out.println(
535 " * @param string The string to convert. Of the form &xxxx; or &#xxxx; with");
536 System.out.println(
537 " * or without the leading ampersand or trailing semi-colon.");
538 System.out.println(
539 " * @return The converted character or '\\0' (zero) if the string is an");
540 System.out.println(" * invalid reference.");
541 System.out.println(" */");
542 System.out.println(
543 " public static char convertToChar (String string)");
544 System.out.println(" {");
545 System.out.println(" int length;");
546 System.out.println(" Character item;");
547 System.out.println(" char ret;");
548 System.out.println();
549 System.out.println(" ret = 0;");
550 System.out.println();
551 System.out.println(" length = string.length ();");
552 System.out.println(" if (0 < length)");
553 System.out.println(" {");
554 System.out.println(" if ('&' == string.charAt (0))");
555 System.out.println(" {");
556 System.out.println(" string = string.substring (1);");
557 System.out.println(" length--;");
558 System.out.println(" }");
559 System.out.println(" if (0 < length)");
560 System.out.println(" {");
561 System.out.println(
562 " if (';' == string.charAt (length - 1))");
563 System.out.println(
564 " string = string.substring (0, --length);");
565 System.out.println(" if (0 < length)");
566 System.out.println(" {");
567 System.out.println(" if ('#' == string.charAt (0))");
568 System.out.println(" try");
569 System.out.println(" {");
570 System.out.println(
571 " ret = (char)Integer.parseInt (string.substring (1));");
572 System.out.println(" }");
573 System.out.println(
574 " catch (NumberFormatException nfe)");
575 System.out.println(" {");
576 System.out.println(
577 " /* failed conversion, return 0 */");
578 System.out.println(" }");
579 System.out.println(" else");
580 System.out.println(" {");
581 System.out.println(
582 " item = (Character)refChar.get (string);");
583 System.out.println(" if (null != item)");
584 System.out.println(
585 " ret = item.charValue ();");
586 System.out.println(" }");
587 System.out.println(" }");
588 System.out.println(" }");
589 System.out.println(" }");
590 System.out.println();
591 System.out.println(" return (ret);");
592 System.out.println(" }");
593 System.out.println();
594 System.out.println(" /**");
595 System.out.println(" * Decode a string containing references.");
596 System.out.println(
597 " * Change all numeric character reference and character entity references");
598 System.out.println(" * to unicode characters.");
599 System.out.println(" * @param string The string to translate.");
600 System.out.println(" */");
601 System.out.println(" public static String decode (String string)");
602 System.out.println(" {");
603 System.out.println(" int index;");
604 System.out.println(" int length;");
605 System.out.println(" int amp;");
606 System.out.println(" int semi;");
607 System.out.println(" String code;");
608 System.out.println(" char character;");
609 System.out.println(" StringBuffer ret;");
610 System.out.println();
611 System.out.println(
612 " ret = new StringBuffer (string.length ());");
613 System.out.println();
614 System.out.println(" index = 0;");
615 System.out.println(" length = string.length ();");
616 System.out.println(
617 " while ((index < length) && (-1 != (amp = string.indexOf ('&', index))))");
618 System.out.println(" {");
619 System.out.println(
620 " ret.append (string.substring (index, amp));");
621 System.out.println(" index = amp + 1;");
622 System.out.println(" if (amp < length - 1)");
623 System.out.println(" {");
624 System.out.println(" semi = string.indexOf (';', amp);");
625 System.out.println(" if (-1 != semi)");
626 System.out.println(
627 " code = string.substring (amp, semi + 1);");
628 System.out.println(" else");
629 System.out.println(
630 " code = string.substring (amp);");
631 System.out.println(
632 " if (0 != (character = convertToChar (code)))");
633 System.out.println(" index += code.length () - 1;");
634 System.out.println(" else");
635 System.out.println(" character = '&';");
636 System.out.println(" }");
637 System.out.println(" else");
638 System.out.println(" character = '&';");
639 System.out.println(" ret.append (character);");
640 System.out.println(" }");
641 System.out.println(" if (index < length)");
642 System.out.println(
643 " ret.append (string.substring (index));");
644 System.out.println();
645 System.out.println(" return (ret.toString ());");
646 System.out.println(" }");
647 System.out.println();
648 System.out.println(" /**");
649 System.out.println(
650 " * Convert a character to a character entity reference.");
651 System.out.println(
652 " * Convert a unicode character to a character entity reference of");
653 System.out.println(" * the form &xxxx;.");
654 System.out.println(" * @param character The character to convert.");
655 System.out.println(
656 " * @return The converted character or <code>null</code> if the character");
657 System.out.println(" * is not one of the known entity references.");
658 System.out.println(" */");
659 System.out.println(
660 " public static String convertToString (Character character)");
661 System.out.println(" {");
662 System.out.println(" StringBuffer buffer;");
663 System.out.println(" String ret;");
664 System.out.println();
665 System.out.println(
666 " if (null != (ret = (String)mCharRef.get (character)))");
667 System.out.println(" {");
668 System.out.println(
669 " buffer = new StringBuffer (ret.length () + 2);");
670 System.out.println(" buffer.append ('&');");
671 System.out.println(" buffer.append (ret);");
672 System.out.println(" buffer.append (';');");
673 System.out.println(" ret = buffer.toString ();");
674 System.out.println(" }");
675 System.out.println();
676 System.out.println(" return (ret);");
677 System.out.println(" }");
678 System.out.println();
679 System.out.println(" /**");
680 System.out.println(
681 " * Convert a character to a numeric character reference.");
682 System.out.println(
683 " * Convert a unicode character to a numeric character reference of");
684 System.out.println(" * the form &#xxxx;.");
685 System.out.println(" * @param character The character to convert.");
686 System.out.println(" * @return The converted character.");
687 System.out.println(" */");
688 System.out.println(
689 " public static String convertToString (int character)");
690 System.out.println(" {");
691 System.out.println(" StringBuffer ret;");
692 System.out.println();
693 System.out.println(
694 " ret = new StringBuffer (13); /* � */");
695 System.out.println(" ret.append (\"&#\");");
696 System.out.println(" ret.append (character);");
697 System.out.println(" ret.append (';');");
698 System.out.println();
699 System.out.println(" return (ret.toString ());");
700 System.out.println(" }");
701 System.out.println();
702 System.out.println(" /**");
703 System.out.println(" * Encode a string to use references.");
704 System.out.println(
705 " * Change all characters that are not ASCII to their numeric character");
706 System.out.println(" * reference or character entity reference.");
707 System.out.println(
708 " * This implementation is inefficient, allocating a new");
709 System.out.println(
710 " * <code>Character</code> for each character in the string,");
711 System.out.println(
712 " * but this class is primarily intended to decode strings");
713 System.out.println(
714 " * so efficiency and speed in the encoding was not a priority.");
715 System.out.println(" * @param string The string to translate.");
716 System.out.println(" */");
717 System.out.println(" public static String encode (String string)");
718 System.out.println(" {");
719 System.out.println(" int length;");
720 System.out.println(" char c;");
721 System.out.println(" Character character;");
722 System.out.println(" String value;");
723 System.out.println(" StringBuffer ret;");
724 System.out.println();
725 System.out.println(
726 " ret = new StringBuffer (string.length () * 6);");
727 System.out.println(" length = string.length ();");
728 System.out.println(" for (int i = 0; i < length; i++)");
729 System.out.println(" {");
730 System.out.println(" c = string.charAt (i);");
731 System.out.println(" character = new Character (c);");
732 System.out.println(
733 " if (null != (value = convertToString (character)))");
734 System.out.println(" ret.append (value);");
735 System.out.println(
736 " else if (!((c > 0x001F) && (c < 0x007F)))");
737 System.out.println(" {");
738 System.out.println(" value = convertToString (c);");
739 System.out.println(" ret.append (value);");
740 System.out.println(" }");
741 System.out.println(" else");
742 System.out.println(" ret.append (character);");
743 System.out.println(" }");
744 System.out.println();
745 System.out.println(" return (ret.toString ());");
746 System.out.println(" }");
747 System.out.println("}");
748 }
749 }