Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/htmlparser/util/Generate.java


1   // $Header: /home/cvs/jakarta-jmeter/src/htmlparser/org/htmlparser/util/Generate.java,v 1.2 2004/02/11 02:16:59 woolfel Exp $
2   /*
3    * ====================================================================
4    * Copyright 2002-2004 The Apache Software Foundation.
5    *
6    * Licensed under the Apache License, Version 2.0 (the "License");
7    * you may not use this file except in compliance with the License.
8    * You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   * 
18   */
19  
20  // The developers of JMeter and Apache are grateful to the developers
21  // of HTMLParser for giving Apache Software Foundation a non-exclusive
22  // license. The performance benefits of HTMLParser are clear and the
23  // users of JMeter will benefit from the hard work of the HTMLParser
24  // team. For detailed information about HTMLParser, the project is
25  // hosted on sourceforge at http://htmlparser.sourceforge.net/.
26  //
27  // HTMLParser was originally created by Somik Raha in 2000. Since then
28  // a healthy community of users has formed and helped refine the
29  // design so that it is able to tackle the difficult task of parsing
30  // dirty HTML. Derrick Oswald is the current lead developer and was kind
31  // enough to assist JMeter.
32  // 
33  // This class was contributed by 
34  // Derrick Oswald
35  //
36  
37  package org.htmlparser.util;
38  
39  import org.htmlparser.Node;
40  import org.htmlparser.Parser;
41  import org.htmlparser.RemarkNode;
42  import org.htmlparser.StringNode;
43  import org.htmlparser.tags.EndTag;
44  import org.htmlparser.tags.LinkTag;
45  import org.htmlparser.tags.Tag;
46  
47  /**
48   * Create a character reference translation class source file.
49   * Usage:
50   * <pre>
51   *     java -classpath .:lib/htmlparser.jar Generate > Translate.java
52   * </pre>
53   * Derived from HTMLStringFilter.java provided as an example with the
54   * htmlparser.jar file available at
55   * <a href="http://htmlparser.sourceforge.net">htmlparser.sourceforge.net</a>
56   * written by Somik Raha (
57   * <a href='mailto:somik@industriallogic.com?
58   * subject=htmlparser'>somik@industriallogic. com</a>
59   * <a href="http://industriallogic.com">http://industriallogic.com</a>).
60   * @author <a href='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>
61   */
62  public class Generate
63  {
64      /**
65       * The working parser.
66       */
67      protected Parser parser;
68  
69      /**
70       * The system specific line separator string.
71       */
72      protected static final String nl =
73          System.getProperty("line.separator", "\n");
74  
75      /**
76       * Create a Generate object.
77       * Sets up the generation by creating a new <code>Parser</code> pointed
78       * at <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a>
79       * with the standard scanners registered.
80       */
81      public Generate() throws ParserException
82      {
83          parser =
84              new Parser("http://www.w3.org/TR/REC-html40/sgml/entities.html");
85          parser.registerScanners();
86      }
87  
88      /**
89       * Translate character references.
90       * After generating the Translate class we could use it
91       * to do this job, but that would involve a bootstrap
92       * problem, so this method does the reference conversion
93       * for a very tiny subset (enough  to understand the w3.org
94       * page).
95       * @param string The raw string.
96       * @return The string with character references fixed.
97       */
98      public String translate(String string)
99      {
100         int index;
101         int amp;
102         StringBuffer ret;
103 
104         ret = new StringBuffer(4096);
105 
106         index = 0;
107         while ((index < string.length())
108             && (-1 != (amp = string.indexOf('&', index))))
109         {
110             // include the part before the special character
111             ret.append(string.substring(index, amp));
112             if (string.startsWith("&nbsp;", amp))
113             {
114                 ret.append(" ");
115                 index = amp + 6;
116             }
117             else if (string.startsWith("&lt;", amp))
118             {
119                 ret.append("<");
120                 index = amp + 4;
121             }
122             else if (string.startsWith("&gt;", amp))
123             {
124                 ret.append(">");
125                 index = amp + 4;
126             }
127             else if (string.startsWith("&amp;", amp))
128             {
129                 ret.append("&");
130                 index = amp + 5;
131             }
132             else if (string.startsWith("&quote;", amp))
133             {
134                 ret.append("\"");
135                 index = amp + 7;
136             }
137             else if (string.startsWith("&divide;", amp))
138             {
139                 ret.append('\u00F7');
140                 index = amp + 8;
141             }
142             else if (string.startsWith("&copy;", amp))
143             {
144                 ret.append('\u00A9');
145                 index = amp + 6;
146             }
147             else
148             {
149                 System.out.println(
150                     "unknown special character starting with "
151                         + string.substring(amp, amp + 7));
152                 ret.append("&");
153                 index = amp + 1;
154             }
155         }
156         ret.append(string.substring(index));
157 
158         return (ret.toString());
159     }
160 
161     /**
162      * Pull out text elements from the HTML.
163      */
164     public void parse() throws ParserException
165     {
166         Node node;
167         StringBuffer buffer = new StringBuffer(4096);
168 
169         // Run through an enumeration of html elements, and pick up
170         // only those that are plain string.
171         for (NodeIterator e = parser.elements(); e.hasMoreNodes();)
172         {
173             node = e.nextNode();
174 
175             if (node instanceof StringNode)
176             {
177                 // Node is a plain string
178                 // Cast it to an HTMLStringNode
179                 StringNode stringNode = (StringNode) node;
180                 // Retrieve the data from the object
181                 buffer.append(stringNode.getText());
182             }
183             else if (node instanceof LinkTag)
184             {
185                 // Node is a link
186                 // Cast it to an HTMLLinkTag
187                 LinkTag linkNode = (LinkTag) node;
188                 // Retrieve the data from the object and print it
189                 buffer.append(linkNode.getLinkText());
190             }
191             else if (node instanceof Tag)
192             {
193                 String contents = ((Tag) node).getText();
194                 if (contents.equals("BR") || contents.equals("P"))
195                     buffer.append(nl);
196             }
197             else if (node instanceof EndTag)
198             {
199                 String contents = ((EndTag) node).getText();
200                 if (contents.equals("BR") || contents.equals("P"))
201                     buffer.append(nl);
202             }
203             else if (node instanceof RemarkNode)
204             {
205             }
206             else
207             {
208                 System.out.println();
209                 System.out.println(node.toString());
210             }
211         }
212 
213         String text = translate(buffer.toString());
214         sgml(text);
215     }
216 
217     /**
218      * Find the lowest index of whitespace (space or newline).
219      * @param string The string to look in.
220      * @param index Where to start looking.
221      * @return -1 if there is no whitespace, the minimum index otherwise.
222      */
223     public int indexOfWhitespace(String string, int index)
224     {
225         int space;
226         int cr;
227         int ret;
228 
229         space = string.indexOf(" ", index);
230         cr = string.indexOf(nl, index);
231         if (-1 == space)
232             ret = cr;
233         else if (-1 == cr)
234             ret = space;
235         else
236             ret = Math.min(space, cr);
237 
238         return (ret);
239     }
240 
241     /**
242      * Rewrite the comment string.
243      * In the sgml table, the comments are of the form:
244      * <pre>
245      * -- latin capital letter I with diaeresis,
246      *             U+00CF ISOlat1
247      * </pre>
248      * so we just want to make a one-liner without the spaces and newlines.
249      * @param string The raw comment.
250      * @return The single line comment.
251      */
252     public String pack(String string)
253     {
254         int index;
255         int spaces;
256         StringBuffer ret;
257 
258         ret = new StringBuffer(string.length());
259 
260         if (string.startsWith("-- "))
261             string = string.substring(3);
262         // remove doublespaces
263         index = 0;
264         while ((index < string.length())
265             && (-1 != (spaces = indexOfWhitespace(string, index))))
266         {
267             ret.append(string.substring(index, spaces));
268             ret.append(" ");
269             while ((spaces < string.length())
270                 && Character.isWhitespace(string.charAt(spaces)))
271                 spaces++;
272             index = spaces;
273         }
274         if (index < string.length())
275             ret.append(string.substring(index));
276 
277         return (ret.toString());
278     }
279 
280     /**
281      * Pretty up a comment string.
282      * @param string The comment to operate on.
283      * @return The beautiful comment string.
284      */
285     public String pretty(String string)
286     {
287         int index;
288         int spaces;
289         StringBuffer ret;
290 
291         ret = new StringBuffer(string.length());
292 
293         // newline instead of doublespaces
294         index = 0;
295         while ((index < string.length())
296             && (-1 != (spaces = string.indexOf("  ", index))))
297         {
298             ret.append("        // " + string.substring(index, spaces));
299             if (!string.substring(index, spaces).endsWith(nl))
300                 ret.append(nl);
301             while ((spaces < string.length())
302                 && Character.isWhitespace(string.charAt(spaces)))
303                 spaces++;
304             index = spaces;
305         }
306         if (index < string.length())
307             ret.append("        // " + string.substring(index));
308 
309         return (ret.toString());
310     }
311 
312     /**
313      * Pad a string on the left with the given character to the length specified.
314      * @param string The string to pad
315      * @param character The character to pad with.
316      * @param length The size to pad to.
317      * @return The padded string.
318      */
319     public String pad(String string, char character, int length)
320     {
321         StringBuffer ret;
322 
323         ret = new StringBuffer(length);
324         ret.append(string);
325         while (length > ret.length())
326             ret.insert(0, character);
327 
328         return (ret.toString());
329     }
330 
331     /**
332      * Convert the textual representation of the numeric character reference to a character.
333      * @param string The numeric character reference (in quotes).
334      * @return The character represented by the numeric character reference.
335      *
336      */
337     public String unicode(String string)
338     {
339         int code;
340 
341         if (string.startsWith("\"&#") && string.endsWith(";\""))
342         {
343             string = string.substring(3, string.length() - 2);
344             try
345             {
346                 code = Integer.parseInt(string);
347                 string =
348                     "new Character ('\\u"
349                         + pad(Integer.toHexString(code), '0', 4)
350                         + "')";
351             }
352             catch (Exception e)
353             {
354                 e.printStackTrace();
355             }
356             return (string);
357         }
358         else
359             return (string);
360     }
361 
362     /**
363      * Parse the sgml declaration for character entity reference
364      * name, equivalent numeric character reference and a comment.
365      * Emit a java hash table 'put' with the name as the key, the
366      * numeric character as the value and comment the insertion
367      * with the comment.
368      * @param string The contents of the sgml declaration.
369      */
370     public void extract(String string)
371     {
372         int space;
373         String token;
374         String code;
375         int comment;
376         String description;
377 
378         if (string.startsWith("<!--"))
379             System.out.println(
380                 pretty(string.substring(4, string.length() - 3).trim()));
381         else if (string.startsWith("<!ENTITY"))
382         {
383             string = string.substring(8, string.length() - 3).trim();
384             if (-1 != (space = string.indexOf(" ")))
385             {
386                 token = string.substring(0, space);
387                 string = string.substring(space).trim();
388                 if (string.startsWith("CDATA"))
389                 {
390                     string = string.substring(5).trim();
391                     if (-1 != (space = string.indexOf(" ")))
392                     {
393                         code = string.substring(0, space).trim();
394                         code = unicode(code);
395                         string = string.substring(space).trim();
396                         System
397                             .out
398                             .println("        mRefChar.put (\"" + token + "\","
399                         // no token is larger than 8 characters - yet
400                         +pad(code, ' ', code.length() + 9 - token.length())
401                             + ");"
402                             + " // "
403                             + pack(string));
404                     }
405                     else
406                         System.out.println(string);
407                 }
408                 else
409                     System.out.println(string);
410             }
411             else
412                 System.out.println(string);
413         }
414         else
415             System.out.println(string);
416     }
417 
418     /**
419      * Extract special characters.
420      * Scan the string looking for substrings of the form:
421      * <pre>
422      * &lt;!ENTITY nbsp   CDATA "&amp;#160;" -- no-break space = non-breaking space, U+00A0 ISOnum --&gt;
423      * </pre>
424      * and emit a java definition for each.
425      * @param string The raw string from w3.org.
426      */
427     public void sgml(String string)
428     {
429         int index;
430         int begin;
431         int end;
432         StringBuffer ret;
433 
434         ret = new StringBuffer(4096);
435 
436         index = 0;
437         while (-1 != (begin = string.indexOf("<", index)))
438         {
439             if (-1 != (end = string.indexOf("-->", begin)))
440             {
441                 extract(string.substring(begin, end + 3));
442                 index = end + 3;
443             }
444             else
445                 index = begin + 1;
446         }
447     }
448 
449     /**
450      * Generator program.
451      * <pre>
452      *     java -classpath .:lib/htmlparser.jar Generate > Translate.java
453      * </pre>
454      * @param args <em>Not used.</em>
455      */
456     public static void main(String[] args) throws ParserException
457     {
458         Generate filter = new Generate();
459         System.out.println("import java.util.Hashtable;");
460         System.out.println("import java.util.Iterator;");
461         System.out.println();
462         System.out.println("/**");
463         System.out.println(
464             " * Translate numeric character references and character entity references to unicode characters.");
465         System.out.println(
466             " * Based on tables found at <a href=\"http://www.w3.org/TR/REC-html40/sgml/entities.html\">");
467         System.out.println(
468             " * http://www.w3.org/TR/REC-html40/sgml/entities.html</a>");
469         System.out.println(
470             " * <p><b>Note: Do not edit! This class is created by the Generate class.</b>");
471         System.out.println(" * <p>Typical usage:");
472         System.out.println(" * <pre>");
473         System.out.println(
474             " *      String s = Translate.decode (getTextFromHtmlPage ());");
475         System.out.println(" * </pre>");
476         System.out.println(
477             " * @author <a href='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a>");
478         System.out.println(" */");
479         System.out.println("public class Translate");
480         System.out.println("{");
481         System.out.println("    /**");
482         System.out.println(
483             "     * Table mapping entity reference kernel to character.");
484         System.out.println(
485             "     * <p><code>String</code>-><code>Character</code>");
486         System.out.println("     */");
487         System.out.println("    protected static Hashtable mRefChar;");
488         System.out.println("    static");
489         System.out.println("    {");
490         System.out.println("        mRefChar = new Hashtable (1000);");
491         System.out.println();
492         filter.parse();
493         System.out.println("    }");
494         System.out.println();
495         System.out.println("    /**");
496         System.out.println(
497             "     * Table mapping character to entity reference kernel.");
498         System.out.println(
499             "     * <p><code>Character</code>-><code>String</code>");
500         System.out.println("     */");
501         System.out.println("    protected static Hashtable mCharRef;");
502         System.out.println("    static");
503         System.out.println("    {");
504         System.out.println(
505             "        mCharRef = new Hashtable (mRefChar.size ());");
506         System.out.println();
507         System.out.println(
508             "        Iterator iterator = mRefChar.keySet ().iterator ();");
509         System.out.println("        while (iterator.hasNext ())");
510         System.out.println("        {");
511         System.out.println(
512             "            String key = (String)iterator.next ();");
513         System.out.println(
514             "            Character character = (Character)mRefChar.get (key);");
515         System.out.println("            mCharRef.put (character, key);");
516         System.out.println("        }");
517         System.out.println("    }");
518         System.out.println();
519         System.out.println("    /**");
520         System.out.println("     * Private constructor.");
521         System.out.println(
522             "     * This class is fully static and thread safe.");
523         System.out.println("     */");
524         System.out.println("    private Translate ()");
525         System.out.println("    {");
526         System.out.println("    }");
527         System.out.println();
528         System.out.println("    /**");
529         System.out.println(
530             "     * Convert a reference to a unicode character.");
531         System.out.println(
532             "     * Convert a single numeric character reference or character entity reference");
533         System.out.println("     * to a unicode character.");
534         System.out.println(
535             "     * @param string The string to convert. Of the form &xxxx; or &amp;#xxxx; with");
536         System.out.println(
537             "     * or without the leading ampersand or trailing semi-colon.");
538         System.out.println(
539             "     * @return The converted character or '\\0' (zero) if the string is an");
540         System.out.println("     * invalid reference.");
541         System.out.println("     */");
542         System.out.println(
543             "    public static char convertToChar (String string)");
544         System.out.println("    {");
545         System.out.println("        int length;");
546         System.out.println("        Character item;");
547         System.out.println("        char ret;");
548         System.out.println();
549         System.out.println("        ret = 0;");
550         System.out.println();
551         System.out.println("        length = string.length ();");
552         System.out.println("        if (0 < length)");
553         System.out.println("        {");
554         System.out.println("            if ('&' == string.charAt (0))");
555         System.out.println("            {");
556         System.out.println("                string = string.substring (1);");
557         System.out.println("                length--;");
558         System.out.println("            }");
559         System.out.println("            if (0 < length)");
560         System.out.println("            {");
561         System.out.println(
562             "                if (';' == string.charAt (length - 1))");
563         System.out.println(
564             "                    string = string.substring (0, --length);");
565         System.out.println("                if (0 < length)");
566         System.out.println("                {");
567         System.out.println("                    if ('#' == string.charAt (0))");
568         System.out.println("                        try");
569         System.out.println("                        {");
570         System.out.println(
571             "                            ret = (char)Integer.parseInt (string.substring (1));");
572         System.out.println("                        }");
573         System.out.println(
574             "                        catch (NumberFormatException nfe)");
575         System.out.println("                        {");
576         System.out.println(
577             "                            /* failed conversion, return 0 */");
578         System.out.println("                        }");
579         System.out.println("                    else");
580         System.out.println("                    {");
581         System.out.println(
582             "                        item = (Character)refChar.get (string);");
583         System.out.println("                        if (null != item)");
584         System.out.println(
585             "                            ret = item.charValue ();");
586         System.out.println("                    }");
587         System.out.println("                }");
588         System.out.println("            }");
589         System.out.println("        }");
590         System.out.println();
591         System.out.println("        return (ret);");
592         System.out.println("    }");
593         System.out.println();
594         System.out.println("    /**");
595         System.out.println("     * Decode a string containing references.");
596         System.out.println(
597             "     * Change all numeric character reference and character entity references");
598         System.out.println("     * to unicode characters.");
599         System.out.println("     * @param string The string to translate.");
600         System.out.println("     */");
601         System.out.println("    public static String decode (String string)");
602         System.out.println("    {");
603         System.out.println("        int index;");
604         System.out.println("        int length;");
605         System.out.println("        int amp;");
606         System.out.println("        int semi;");
607         System.out.println("        String code;");
608         System.out.println("        char character;");
609         System.out.println("        StringBuffer ret;");
610         System.out.println();
611         System.out.println(
612             "        ret = new StringBuffer (string.length ());");
613         System.out.println();
614         System.out.println("        index = 0;");
615         System.out.println("        length = string.length ();");
616         System.out.println(
617             "        while ((index < length) && (-1 != (amp = string.indexOf ('&', index))))");
618         System.out.println("        {");
619         System.out.println(
620             "            ret.append (string.substring (index, amp));");
621         System.out.println("            index = amp + 1;");
622         System.out.println("            if (amp < length - 1)");
623         System.out.println("            {");
624         System.out.println("                semi = string.indexOf (';', amp);");
625         System.out.println("                if (-1 != semi)");
626         System.out.println(
627             "                    code = string.substring (amp, semi + 1);");
628         System.out.println("                else");
629         System.out.println(
630             "                    code = string.substring (amp);");
631         System.out.println(
632             "                if (0 != (character = convertToChar (code)))");
633         System.out.println("                    index += code.length () - 1;");
634         System.out.println("                else");
635         System.out.println("                    character = '&';");
636         System.out.println("            }");
637         System.out.println("            else");
638         System.out.println("                character = '&';");
639         System.out.println("            ret.append (character);");
640         System.out.println("        }");
641         System.out.println("        if (index < length)");
642         System.out.println(
643             "            ret.append (string.substring (index));");
644         System.out.println();
645         System.out.println("        return (ret.toString ());");
646         System.out.println("    }");
647         System.out.println();
648         System.out.println("    /**");
649         System.out.println(
650             "     * Convert a character to a character entity reference.");
651         System.out.println(
652             "     * Convert a unicode character to a character entity reference of");
653         System.out.println("     * the form &xxxx;.");
654         System.out.println("     * @param character The character to convert.");
655         System.out.println(
656             "     * @return The converted character or <code>null</code> if the character");
657         System.out.println("     * is not one of the known entity references.");
658         System.out.println("     */");
659         System.out.println(
660             "    public static String convertToString (Character character)");
661         System.out.println("    {");
662         System.out.println("        StringBuffer buffer;");
663         System.out.println("        String ret;");
664         System.out.println();
665         System.out.println(
666             "        if (null != (ret = (String)mCharRef.get (character)))");
667         System.out.println("        {");
668         System.out.println(
669             "            buffer = new StringBuffer (ret.length () + 2);");
670         System.out.println("            buffer.append ('&');");
671         System.out.println("            buffer.append (ret);");
672         System.out.println("            buffer.append (';');");
673         System.out.println("            ret = buffer.toString ();");
674         System.out.println("        }");
675         System.out.println();
676         System.out.println("        return (ret);");
677         System.out.println("    }");
678         System.out.println();
679         System.out.println("    /**");
680         System.out.println(
681             "     * Convert a character to a numeric character reference.");
682         System.out.println(
683             "     * Convert a unicode character to a numeric character reference of");
684         System.out.println("     * the form &amp;#xxxx;.");
685         System.out.println("     * @param character The character to convert.");
686         System.out.println("     * @return The converted character.");
687         System.out.println("     */");
688         System.out.println(
689             "    public static String convertToString (int character)");
690         System.out.println("    {");
691         System.out.println("        StringBuffer ret;");
692         System.out.println();
693         System.out.println(
694             "        ret = new StringBuffer (13); /* &#2147483647; */");
695         System.out.println("        ret.append (\"&#\");");
696         System.out.println("        ret.append (character);");
697         System.out.println("        ret.append (';');");
698         System.out.println();
699         System.out.println("        return (ret.toString ());");
700         System.out.println("    }");
701         System.out.println();
702         System.out.println("    /**");
703         System.out.println("     * Encode a string to use references.");
704         System.out.println(
705             "     * Change all characters that are not ASCII to their numeric character");
706         System.out.println("     * reference or character entity reference.");
707         System.out.println(
708             "     * This implementation is inefficient, allocating a new");
709         System.out.println(
710             "     * <code>Character</code> for each character in the string,");
711         System.out.println(
712             "     * but this class is primarily intended to decode strings");
713         System.out.println(
714             "     * so efficiency and speed in the encoding was not a priority.");
715         System.out.println("     * @param string The string to translate.");
716         System.out.println("     */");
717         System.out.println("    public static String encode (String string)");
718         System.out.println("    {");
719         System.out.println("        int length;");
720         System.out.println("        char c;");
721         System.out.println("        Character character;");
722         System.out.println("        String value;");
723         System.out.println("        StringBuffer ret;");
724         System.out.println();
725         System.out.println(
726             "        ret = new StringBuffer (string.length () * 6);");
727         System.out.println("        length  = string.length ();");
728         System.out.println("        for (int i = 0; i < length; i++)");
729         System.out.println("        {");
730         System.out.println("            c = string.charAt (i);");
731         System.out.println("            character = new Character (c);");
732         System.out.println(
733             "            if (null != (value = convertToString (character)))");
734         System.out.println("                ret.append (value);");
735         System.out.println(
736             "            else if (!((c > 0x001F) && (c < 0x007F)))");
737         System.out.println("            {");
738         System.out.println("                value = convertToString (c);");
739         System.out.println("                ret.append (value);");
740         System.out.println("            }");
741         System.out.println("            else");
742         System.out.println("                ret.append (character);");
743         System.out.println("        }");
744         System.out.println();
745         System.out.println("        return (ret.toString ());");
746         System.out.println("    }");
747         System.out.println("}");
748     }
749 }