Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/arthurdo/parser/HtmlStreamTokenizer.java


1   /*
2    * Copyright (c) 1996, 2001 by Arthur Do <arthur@cs.stanford.edu>.
3    * All Rights Reserved.
4    *
5    * This program is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation; either version 2 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with this program; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   */
19  
20  package com.arthurdo.parser;
21  
22  import java.io.*;
23  import java.util.*;
24  
25  /**
26   * <p>HtmlStreamTokenizer is an HTML parser that is similar
27   * to the StreamTokenizer class but is specialized for
28   * HTML streams. This class is useful when you need to
29   * parse the structure of an HTML document.</p>
30   *
31   * <pre>
32   * import com.arthurdo.parser.*;
33   * <p>
34   * HtmlStreamTokenizer tok = new HtmlStreamTokenizer(inputstream);
35   * HtmlTag tag = new HtmlTag();
36   *
37   * while (tok.nextToken() != HtmlStreamTokenizer.TT_EOF) {
38   *  int ttype = tok.getTokenType();
39   *
40   *  if (ttype == HtmlStreamTokenizer.TT_TAG) {
41   *    tok.parseTag(tok.getStringValue(), tag);
42   *    System.out.println(&quot;tag: &quot; + tag.toString());
43   *  }
44   *  else if (ttype == HtmlStreamTokenizer.TT_TEXT) {
45   *    System.out.println(&quot;text: &quot; + tok.getStringValue());
46   *  }
47   *  else if (ttype == HtmlStreamTokenizer.TT_COMMENT) {
48   *    System.out.println(&quot;comment: &lt;!--&quot; + tok.getStringValue() + &quot;--&gt;&quot;);
49   *  }
50   * }
51   * </pre>
52   *
53   * <p>One of the motivations for designing <i>parseTag()</i> to take
54   * an HtmlTag argument rather than having <i>parseTag()</i> return
55   * a newly created HtmlTag is so you can create your own tag class
56   * derived from HtmlTag.
57   *
58   * <ul>
59   * <li> 02/09/98 Thomas Horster-Möller, fixed bug with counting
60   * newlines twice on character pushback.
61   * <li> 06/14/99 text is now returned as 'runs' instead of being
62   * broken up into words as in previous versions. You can use a StringTokenizer
63   * to break your text into words.
64   * </ul>
65   *
66   * @version 2.01 09/12/97
67   * @author Arthur Do <arthur@cs.stanford.edu>
68   * @see     com.arthurdo.parser.HtmlTag
69   * @see     com.arthurdo.parser.Table
70   */
71  public class HtmlStreamTokenizer
72  {
73    /**
74     * end of stream.
75     */
76      public static final int TT_EOF = -1;
77    /**
78     * text token.
79     */
80      public static final int TT_TEXT = -2;
81    /**
82     * tag token.
83     */
84      public static final int TT_TAG = -3;
85    /**
86     * comment token.
87     */
88    public static final int TT_COMMENT = -4;
89  
90    /**
91     * inside <! to provide support for doctypes with internal dtd, <![CDATA sections, and degenerate html comments
92     */
93    public static final int TT_BANGTAG = -5;
94  
95    /**
96     * entity reference token (&*;)
97     */
98    public static final int TT_ENTITYREFERENCE = -6;
99  
/**
 * Creates a tokenizer that reads from a byte stream. The stream is wrapped
 * in an InputStreamReader (default character encoding) and a BufferedReader.
 *
 * @deprecated  use HtmlStreamTokenizer(Reader) instead.
 *        This version of the constructor can lead to 10x slower code
 *        because of the InputStreamReader wrapper.
 * @param  in  input stream to tokenize
 */
public HtmlStreamTokenizer(InputStream in)
{
    this(new BufferedReader(new InputStreamReader(in)));
}
110 
/**
 * Creates a tokenizer that reads characters from the given reader and
 * starts out in the plain-text state.
 *
 * @param  in  Reader. The input is assumed to be buffered as needed;
 *             this class reads it one character at a time.
 */
public HtmlStreamTokenizer(Reader in)
{
    m_state = STATE_TEXT;
    m_in = in;
}
119 
120   /**
121    * @return  token type, one of the <b>TT_</b> defines
122    */
123   public final int getTokenType()
124   {
125     return m_ttype;
126   }
127 
128   /**
129    * @return  string value of the token
130    */
131   public final StringBuffer getStringValue()
132   {
133     return m_buf;
134   }
135 
136   /**
137    * @return string value of the token, including characters stripped off by the tokenizer
138    */
139   public final String getRawString()
140   {
141     switch (m_ttype)
142     {
143     case TT_TAG:
144       return "<" + m_buf.toString() + ">";
145     case TT_BANGTAG:
146       return "<!" + m_buf.toString() + ">";
147     case TT_COMMENT:
148       return "<!--" + m_buf.toString() + "-->";
149     case TT_ENTITYREFERENCE:
150       return "&" + m_buf.toString() + ";";
151     default:
152       return m_buf.toString();
153     }
154   }
155 
156   /**
157    * @deprecated  white space is now returned as TT_TEXT. This buffer is always
158    *        empty.
159    * @return  any white space accumulated since last call to nextToken
160    */
161   public final StringBuffer getWhiteSpace()
162   {
163     return m_whitespace;
164   }
165 
166   /**
167    * @return  current line number. Every time nextToken() sees a new
168    *      line character ('\n'), it increments the line number.
169    */
170   public int getLineNumber()
171   {
172     return m_lineno;
173   }
174 
175   /**
176   * @param char[] exitString CDATA mode will terminate when it encounters this string
177   * @param boolean pushbackExitString whether to parse the exit string again or not
178   *
179   * it'd be an error to call enterCDATAMode(exitString, true); getToken()==TT_CDATA; enterCDATAMode(differentExitString, true); 'cause the next getToken() call will parse differentExitString instead of exitString
180   */
181   public void enterCDATAMode(char[] exitString, boolean pushbackExitString)
182   {
183     m_cdata_end = exitString;
184     m_cdata = 0;
185     m_cdata_pushback = pushbackExitString;
186   }
187 
188   public boolean isCDATA() { return m_isCDTATA; }
189 
190   /**
191    * @return  the next token
192      * @exception  IOException  if error reading input stream.
193    */
194   public int nextToken()
195     throws IOException
196   {
197     m_buf.setLength(0);
198     m_whitespace.setLength(0);
199     int ltcount = 0;
200     m_isCDTATA = false;
201     boolean hasAmp = false;
202 
203     while (true)
204     {
205       int c;
206 
207       if (m_pushback != 0)
208       {
209         c = m_pushback;
210         if (c == '\n')
211           m_lineno--;    // don't count newline twice
212         m_pushback = 0;
213       }
214       else if (m_cdata < -1)
215         c = m_cdata_end[m_cdata++ + m_cdata_end.length + 1];
216       else
217       {
218         c = m_in.read();
219       }
220 
221       if (c < 0)
222       {
223         int state = m_state;
224         m_state = STATE_EOF;
225 
226         if (m_buf.length() > 0 && state == STATE_TEXT)
227         {
228           if (m_unescape && hasAmp)
229             unescape(m_buf);
230           return m_ttype = TT_TEXT;
231         }
232         else
233           return m_ttype = TT_EOF;
234       }
235 
236       if (c == '\n')
237         m_lineno++;
238 
239       switch (m_state)
240       {
241       case STATE_TEXT:
242         {
243           if (m_cdata > -1)
244           {
245             //we're in cdata mode
246             if ((Character.toUpperCase(m_cdata_end[m_cdata]) == c) ||
247               (Character.toLowerCase(m_cdata_end[m_cdata]) == c)) // support case sensitive exit strings
248             {
249               if (++m_cdata == m_cdata_end.length)
250               {
251                 if (m_cdata_pushback)
252                   m_cdata = -m_cdata_end.length -1;
253                 else
254                   m_cdata = -1;
255                 m_isCDTATA = true;
256                 return m_ttype = TT_TEXT;
257               }
258             }
259             else
260             {
261               if (m_cdata > 0)
262               {
263                 m_buf.append(m_cdata_end, 0, m_cdata);
264                 m_cdata = 0;
265               }
266               m_buf.append((char)c);
267             }
268 
269           }
270           else if (c == '<')
271           {
272             boolean inCDATApushback = m_cdata < -1;
273             int peek = inCDATApushback ? m_cdata_end[m_cdata++  + m_cdata_end.length + 1] : m_in.read();
274 
275             if (peek == '!')
276               m_state = STATE_BANGTAG;
277             else if (peek == '<')
278             {
279               // handle <<, some people use it in <pre>
280               m_buf.append("<<");
281               break;
282             }
283             else
284             {
285               m_pushback = peek;
286               if (inCDATApushback)
287                 --m_cdata;
288               m_state = STATE_TAG;
289             }
290 
291             if (m_buf.length() > 0)
292             {
293               if (m_unescape && hasAmp)
294                 unescape(m_buf);
295               return m_ttype = TT_TEXT;
296             }
297           }
298           /*
299           else if (isSpace(c))
300           {
301             m_pushback = c;
302             m_state = STATE_WS;
303             if (m_buf.length() > 0)
304             {
305               if (m_unescape && hasAmp)
306                 unescape(m_buf);
307               return m_ttype = TT_TEXT;
308             }
309           }
310           */
311           else
312           {
313             if (c == '&')
314             {
315               if (m_getEntities)
316               {
317                 m_state = STATE_ENTITYREF;
318                 return m_ttype = TT_TEXT;
319               }
320               else
321                 hasAmp = true;
322             }
323             m_buf.append((char)c);
324           }
325         }
326         break;
327 
328       case STATE_WS:
329         {
330           if (!isSpace(c))
331           {
332             m_pushback = c;
333             m_state = STATE_TEXT;
334           }
335           else
336           {
337             m_whitespace.append((char)c);
338           }
339         }
340         break;
341 
342       case STATE_TAG:
343         {
344           if (c == '>')
345           {
346             m_state = STATE_TEXT;
347             return m_ttype = TT_TAG;
348           }
349           else if (c == C_SINGLEQUOTE || c == C_DOUBLEQUOTE)
350           {
351             // handle quotes inside tag
352             m_tagquote = c;
353             m_buf.append((char)c);
354             m_state = STATE_TAG_QUOTE;
355           }
356           else
357           {
358             m_buf.append((char)c);
359           }
360         }
361         break;
362       case STATE_BANGTAG:
363         {
364           int buflen = m_buf.length();
365           if (c == '<')
366           {
367             ++ltcount;
368             m_buf.append((char)c);
369           }
370           else if (c == '>' && --ltcount < 0)
371           {
372             m_state = STATE_TEXT;
373             return m_ttype = TT_BANGTAG;
374           }
375             else if (c == '-' && buflen == 1 && m_buf.charAt(0) == '-')
376           {
377             // handle <!--
378             m_buf.setLength(0);
379             m_state = STATE_COMMENT;
380           }
381           else if (buflen == 6 && c == '[' && m_buf.toString().equals("[CDATA[") )
382           {
383             // handle <![CDATA[
384             m_buf.setLength(0);
385             enterCDATAMode(m_xmlcdata_end, false);
386             m_state = STATE_TEXT;
387           }
388           else
389           {
390             m_buf.append((char)c);
391           }
392         }
393         break;
394       case STATE_TAG_QUOTE:
395         {
396           // the only way out out of this state is to close the quote
397           // special case: some people forget to end quote in a tag
398           if (c == '>')
399           {
400             m_pushback = c;
401             m_state = STATE_TAG;
402           }
403           else
404           {
405             m_buf.append((char)c);
406             if (c == m_tagquote)
407             {
408               m_state = STATE_TAG;
409             }
410           }
411         }
412         break;
413 
414       case STATE_COMMENT:
415         {
416           if (c == '>' && m_comment >= 2)
417           {
418             m_buf.setLength(m_buf.length() - 2);
419             m_comment = 0;
420             m_state = STATE_TEXT;
421             return m_ttype = TT_COMMENT;
422           }
423           else if (c == '-')
424           {
425             m_comment++;
426           }
427           else
428           {
429             m_comment = 0;
430           }
431 
432           m_buf.append((char)c);
433         }
434         break;
435       case STATE_ENTITYREF:
436         {
437           if (c == ';' || c == '<' || (isPunct( (char) c) && c != '#') || isSpace(c)) //accept any of these as terminating the entity
438           {
439             if (c != ';')
440               m_pushback = c;
441             m_state = STATE_TEXT;
442             return m_ttype = TT_ENTITYREFERENCE;
443           }
444           m_buf.append((char)c);
445         }
446         break;
447       }
448     }
449   }
450 
451   /**
452    * The reason this function takes an HtmlTag argument rather than returning
453    * a newly created HtmlTag object is so that you can create your own
454    * tag class derived from HtmlTag if desired.
455    *
456    * @param  sbuf  text buffer to parse
457    * @param  tag  parse the text buffer and store the result in this object
458      * @exception  HtmlException  if malformed tag.
459    */
460   public void parseTag(StringBuffer sbuf, HtmlTag tag)
461     throws HtmlException
462   {
463     tag.reset();
464 
465     String buf = sbuf.toString();
466     int len = buf.length();
467     int idx = 0;
468     int begin = 0;
469 
470     // parse tag
471     while (idx < len && isSpace(buf.charAt(idx)))
472       idx++;
473 
474     if (idx == len)
475       throw new HtmlException("parse empty tag");
476 
477     if (buf.charAt(idx) == C_ENDTAG)
478     {
479       tag.setEndTag(true);
480       idx++;
481     }
482 
483     if (idx == len)
484       throw new HtmlException("parse empty tag");
485 
486     begin = idx;
487     // deal with empty tags like <img/>
488     while (idx < len && !isSpace(buf.charAt(idx)) && buf.charAt(idx) != C_EMPTY)
489       idx++;
490     String token = buf.substring(begin, idx);
491 
492     tag.setTag(token);
493 
494     parseParams(tag, buf, idx);
495   }
496 
497   /**
498    * Replaces HTML escape sequences with its character equivalent, e.g.
499    * <b>&amp;amp;copy;</b> becomes <b>&amp;copy;</b>.
500    *
501    * @param  buf  text buffer to unescape
502    * @return  a string with all HTML escape sequences removed
503    */
504   public static String unescape(String buf)
505   {
506     // quick check to see if there are any escape characters
507     if (buf.indexOf('&') == -1)
508       return buf;
509 
510     StringBuffer b = new StringBuffer(buf);
511     unescape(b);
512     return b.toString();
513   }
514 
515 
516   /**
517    * Replaces HTML escape sequences with its character equivalent, e.g.
518    * <b>&amp;copy;</b> becomes <b>&copy;</b>.
519    *
520    * @param  buf  will remove all HTML escape sequences from this buffer
521    */
522   public static void unescape(StringBuffer buf)
523   {
524     int len = buf.length();
525     int i = 0;
526     int r = i;
527     while (i<len)
528     {
529       char ch = buf.charAt(i);
530       if (ch == '&')
531       {
532         int saver = r;
533         String esc = "";
534         int j = i+1;
535         for (; j<len; j++)
536         {
537           buf.setCharAt(r++, ch);
538           ch = buf.charAt(j);
539           if (ch == ';' || ch == '<' || (isPunct(ch) && ch != '#') || isSpace(ch))
540           {
541             Character e = parseEscape(esc);
542             if (e != null)
543             {
544               // found escape sequence
545               // as opposed to false or unrecognized escape, e.g. AT&T.
546               r = saver;
547               char v = e.charValue();
548               buf.setCharAt(r++, v);
549             }
550             i = j;
551             // this handles things like &lt&gt
552             if (ch != '&')
553               i++;  // if not '&' then discard char
554             break;
555           }
556           esc += ch;
557         }
558         if (j == len)
559         {
560           Character e = parseEscape(esc);
561           if (e != null)
562           {
563             r = saver;
564             buf.setCharAt(r++, e.charValue());
565           }
566           break;
567         }
568       }
569       else
570       {
571         buf.setCharAt(r++, ch);
572         i++;
573       }
574     }
575     buf.setLength(r);
576   }
577 
// ---- tokenizer states (value of m_state) ----
private static final int STATE_EOF = -1;
private static final int STATE_COMMENT = -2;
private static final int STATE_TEXT = -3;
private static final int STATE_TAG = -4;
private static final int STATE_WS = -5;
private static final int STATE_TAG_QUOTE = -6;
private static final int STATE_BANGTAG = -7;
private static final int STATE_ENTITYREF = -8;

// ---- characters with a special meaning inside a tag ----
/*package*/ static final char C_ENDTAG = '/';
private static final char C_EMPTY = '/';  // XML char for empty tags
private static final char C_SINGLEQUOTE = '\'';
private static final char C_DOUBLEQUOTE = '"';

// ---- character classification table, indexed by character code ----
private static final int CTYPE_LEN = 256;
private static byte m_ctype[] = new byte[CTYPE_LEN];
private static final byte CT_WHITESPACE = 1;
private static final byte CT_DIGIT = 2;
private static final byte CT_ALPHA = 4;
private static final byte CT_QUOTE = 8;
private static final byte CT_COMMENT = 16;

// escape name (String) -> replacement (Character); filled by the static initializer
private static Hashtable m_escapes = new Hashtable();

// exit string used for <![CDATA[ sections
private static char[] m_xmlcdata_end = "]]>".toCharArray();

// ---- current token ----
private int m_ttype;                                    // type of the current token
private StringBuffer m_buf = new StringBuffer(128);     // text of the current token
private StringBuffer m_whitespace = new StringBuffer(); // see getWhiteSpace()

// ---- scanner state ----
//input reader appears to be an order of magnitude slower than inputstream!
private Reader m_in;
private int m_state = STATE_TEXT;
private int m_pushback = 0;     // one character to re-read; 0 means none
private int m_lineno = 1;
private int m_comment = 0;      // consecutive '-' seen inside a comment
private int m_tagquote;         // quote character that opened the current quoted run in a tag

// ---- CDATA mode ----
private char[] m_cdata_end = null;          // exit string
// -1: not in CDATA mode; >= 0: number of exit-string characters matched so
// far; < -1: the exit string is being replayed as input
private int m_cdata = -1;
private boolean m_cdata_pushback = false;   // replay the exit string once found
private boolean m_isCDTATA = false;         // current token ended at the exit string

// ---- options ----
private boolean m_unescape = false;
private boolean m_getEntities = false; //return TT_ENTITYREFERENCE
622 
623   static
624   {
625     int len = m_ctype.length;
626     for (int i = 0; i < len; i++)
627       m_ctype[i] = 0;
628 
629     m_ctype[' '] = CT_WHITESPACE;
630     m_ctype['\r'] = CT_WHITESPACE;
631     m_ctype['\n'] = CT_WHITESPACE;
632     m_ctype['\t'] = CT_WHITESPACE;
633     for (int i = 0x0E; i <= 0x1F; i++)
634       m_ctype[i] = CT_WHITESPACE;
635 
636     m_escapes.put(new String("Aacute"), new Character('\u00c1'));
637     m_escapes.put(new String("aacute"), new Character('\u00e1'));
638     m_escapes.put(new String("Acirc"), new Character('\u00c2'));
639     m_escapes.put(new String("acirc"), new Character('\u00e2'));
640     m_escapes.put(new String("AElig"), new Character('\u00c6'));
641     m_escapes.put(new String("aelig"), new Character('\u00e6'));
642     m_escapes.put(new String("Agrave"), new Character('\u00c0'));
643     m_escapes.put(new String("agrave"), new Character('\u00e0'));
644     m_escapes.put(new String("amp"), new Character('&'));
645     m_escapes.put(new String("aring"), new Character('\u00e5'));
646     m_escapes.put(new String("Atilde"), new Character('\u00c3'));
647     m_escapes.put(new String("atilde"), new Character('\u00e3'));
648     m_escapes.put(new String("Auml"), new Character('\u00c4'));
649     m_escapes.put(new String("auml"), new Character('\u00e4'));
650     m_escapes.put(new String("brvbar"), new Character('\u00a6'));
651     m_escapes.put(new String("Ccedil"), new Character('\u00c7'));
652     m_escapes.put(new String("ccedil"), new Character('\u00e7'));
653     m_escapes.put(new String("cent"), new Character('\u00a2'));
654     m_escapes.put(new String("copy"), new Character('\u00a9'));
655     m_escapes.put(new String("deg"), new Character('\u00b0'));
656     m_escapes.put(new String("Eacute"), new Character('\u00c9'));
657     m_escapes.put(new String("eacute"), new Character('\u00e9'));
658     m_escapes.put(new String("Ecirc"), new Character('\u00ca'));
659     m_escapes.put(new String("ecirc"), new Character('\u00ea'));
660     m_escapes.put(new String("Egrave"), new Character('\u00c8'));
661     m_escapes.put(new String("egrave"), new Character('\u00e8'));
662     m_escapes.put(new String("ETH"), new Character('\u00d0'));
663     m_escapes.put(new String("eth"), new Character('\u00f0'));
664     m_escapes.put(new String("Euml"), new Character('\u00cb'));
665     m_escapes.put(new String("euml"), new Character('\u00eb'));
666     m_escapes.put(new String("frac12"), new Character('\u00bd'));
667     m_escapes.put(new String("frac14"), new Character('\u00bc'));
668     m_escapes.put(new String("frac34"), new Character('\u00be'));
669     m_escapes.put(new String("gt"), new Character('>'));
670     m_escapes.put(new String("iacute"), new Character('\u00ed'));
671     m_escapes.put(new String("Icirc"), new Character('\u00ce'));
672     m_escapes.put(new String("icirc"), new Character('\u00ee'));
673     m_escapes.put(new String("iexcl"), new Character('\u00a1'));
674     m_escapes.put(new String("Igrave"), new Character('\u00cc'));
675     m_escapes.put(new String("igrave"), new Character('\u00ec'));
676     m_escapes.put(new String("iquest"), new Character('\u00bf'));
677     m_escapes.put(new String("Iuml"), new Character('\u00cf'));
678     m_escapes.put(new String("iuml"), new Character('\u00ef'));
679     m_escapes.put(new String("laquo"), new Character('\u00ab'));
680     m_escapes.put(new String("lt"), new Character('<'));
681     m_escapes.put(new String("middot"), new Character('\u00b7'));
682     m_escapes.put(new String("nbsp"), new Character('\u00A0'));
683     m_escapes.put(new String("not"), new Character('\u00ac'));
684     m_escapes.put(new String("Ntilde"), new Character('\u00d1'));
685     m_escapes.put(new String("ntilde"), new Character('\u00f1'));
686     m_escapes.put(new String("Oacute"), new Character('\u00d3'));
687     m_escapes.put(new String("oacute"), new Character('\u00f3'));
688     m_escapes.put(new String("Ocirc"), new Character('\u00d4'));
689     m_escapes.put(new String("ocirc"), new Character('\u00f4'));
690     m_escapes.put(new String("Ograve"), new Character('\u00d2'));
691     m_escapes.put(new String("ograve"), new Character('\u00f2'));
692     m_escapes.put(new String("Oslash"), new Character('\u00d8'));
693     m_escapes.put(new String("oslash"), new Character('\u00f8'));
694     m_escapes.put(new String("Otilde"), new Character('\u00d5'));
695     m_escapes.put(new String("otilde"), new Character('\u00f5'));
696     m_escapes.put(new String("Ouml"), new Character('\u00d6'));
697     m_escapes.put(new String("ouml"), new Character('\u00f6'));
698     m_escapes.put(new String("para"), new Character('\u00b6'));
699     m_escapes.put(new String("plusmn"), new Character('\u00b1'));
700     m_escapes.put(new String("pound"), new Character('\u00a3'));
701     m_escapes.put(new String("quot"), new Character('"'));
702     m_escapes.put(new String("reg"), new Character('\u00ae'));
703     m_escapes.put(new String("sect"), new Character('\u00a7'));
704     m_escapes.put(new String("sup1"), new Character('\u00b9'));
705     m_escapes.put(new String("sup2"), new Character('\u00b2'));
706     m_escapes.put(new String("sup3"), new Character('\u00b3'));
707     m_escapes.put(new String("szlig"), new Character('\u00df'));
708     m_escapes.put(new String("THORN"), new Character('\u00de'));
709     m_escapes.put(new String("thorn"), new Character('\u00fe'));
710     m_escapes.put(new String("Uacute"), new Character('\u00da'));
711     m_escapes.put(new String("uacute"), new Character('\u00fa'));
712     m_escapes.put(new String("Ucirc"), new Character('\u00db'));
713     m_escapes.put(new String("ucirc"), new Character('\u00fb'));
714     m_escapes.put(new String("Ugrave"), new Character('\u00d9'));
715     m_escapes.put(new String("ugrave"), new Character('\u00f9'));
716     m_escapes.put(new String("Uuml"), new Character('\u00dc'));
717     m_escapes.put(new String("uuml"), new Character('\u00fc'));
718     m_escapes.put(new String("Yacute"), new Character('\u00dd'));
719     m_escapes.put(new String("yacute"), new Character('\u00fd'));
720     m_escapes.put(new String("yen"), new Character('\u00a5'));
721     m_escapes.put(new String("yuml"), new Character('\u00ff'));
722   }
723 
724   private static boolean isSpace(int c)
725   {
726      return c >=0 && c < CTYPE_LEN ? (m_ctype[c] & CT_WHITESPACE) != 0: false;
727   }
728 
729   private static boolean isPunct(char c)
730   {
731     return !Character.isLetterOrDigit(c);
732   }
733 
734   public boolean isUnescaped()
735   {
736     return m_unescape;
737   }
738 
739   public void setUnescaped(boolean unescape)
740   {
741     m_unescape = unescape;
742   }
743 
744   private static Character parseEscape(String s)
745   {
746     int len = s.length();
747     if (len == 0)
748       return null;
749     Character ch = null;
750 
751     if (s.charAt(0) == '#')
752     {
753       if (len <= 1)
754         return null;
755 
756       int code = 0;
757       for (int i=1; i<len; i++)
758       {
759         if (!Character.isDigit(s.charAt(i)))
760           return null;
761         code = (code * 10) + Character.digit(s.charAt(i), 10);
762       }
763       ch = new Character((char)code);
764     }
765     else
766     {
767       ch = (Character)m_escapes.get(s);
768     }
769 
770     return ch;
771   }
772 
773   private void parseParams(HtmlTag tag, String buf, int idx)
774     throws HtmlException
775   {
776     int len = buf.length();
777     int begin = 0;
778 
779     if (len-1 >= idx)
780     {
781       int end = len - 1;
782       while (end > idx && isSpace(buf.charAt(end)))//remove trailing whitespace
783         end--;
784       //todo: tag.setWhitespaceAtEnd(buf.substring(end, len-1) );
785       if (buf.charAt(end) == C_EMPTY)
786       {
787         tag.setEmpty(true);
788         end--;
789       }
790       len = end + 1;
791     }
792 
793     while (idx < len)
794     {
795       begin = idx;
796       while (idx < len && isSpace(buf.charAt(idx)))//skip space before attribute name
797         idx++;
798 
799       if (idx == len)//at end
800         continue;
801 
802       String whitespaceBefore = buf.substring(begin, idx);
803 
804       begin = idx;
805       if (buf.charAt(idx) == C_DOUBLEQUOTE) //how often are attribute names quoted??
806       {
807         idx++;
808         while (idx < len && buf.charAt(idx) != C_DOUBLEQUOTE)//look for close quote
809           idx++;
810         if (idx == len)
811           continue;  // bad name
812         idx++;
813       }
814       else if (buf.charAt(idx) == C_SINGLEQUOTE) //how often are attribute names quoted??
815       {
816         idx++;
817         while (idx < len && buf.charAt(idx) != C_SINGLEQUOTE)//look for close quote
818           idx++;
819         if (idx == len)
820           continue;  // bad name
821         idx++;
822       }
823       else
824       {
825         //if not quoted look for whitespace or '=' to terminate attribute name
826         while (idx < len && !isSpace(buf.charAt(idx)) && buf.charAt(idx) != '=')
827           idx++;
828       }
829 
830       String name = buf.substring(begin, idx);
831 
832       begin = idx;
833       if (idx < len && isSpace(buf.charAt(idx)))//skip whitespace after attribute name
834       {
835         while (idx < len && isSpace(buf.charAt(idx)))
836           idx++;
837       }
838 
839       if (idx == len || buf.charAt(idx) != '=') //attribute name only, no value specified
840       {
841         // name with empty value
842         tag.setParam(name, name); //set the attribute name as the value (SGML tag minimalization rule)
843         tag.setWhitespace(name, whitespaceBefore, "");
844         continue;
845       }
846       idx++; //skip past the '='
847 
848       if (idx == len)
849         continue;
850 
851       if (isSpace(buf.charAt(idx)))
852       {
853         while (idx < len && isSpace(buf.charAt(idx)))//skip past whitespace after '='
854           idx++;
855 
856         // special case: if value is surrounded by quotes
857         // then it can have a space after the '='
858         //if (idx == len || (buf.charAt(idx) != C_DOUBLEQUOTE && buf.charAt(idx) != C_SINGLEQUOTE))
859         if (idx == len)
860         {
861           // name with empty value
862           tag.setParam(name, name); //set the attribute name as the value (SGML tag minimalization rule)
863           tag.setWhitespace(name, whitespaceBefore, buf.substring(begin, idx));
864           continue;
865         }
866       }
867 
868       char quote = buf.charAt(idx);
869       int includeQuote = (quote == C_DOUBLEQUOTE || quote == C_SINGLEQUOTE) ? 1 : 0;
870       String whitespaceAfter = buf.substring(begin, idx + includeQuote);
871 
872       begin = idx;
873       int end = begin;
874       if (quote == C_DOUBLEQUOTE)
875       {
876         idx++;
877         begin = idx;
878         while (idx < len && buf.charAt(idx) != C_DOUBLEQUOTE)
879           idx++;
880         if (idx == len)
881           continue;  // bad value
882         end = idx;
883         idx++;
884       }
885       else if (quote == C_SINGLEQUOTE)
886       {
887         idx++;
888         begin = idx;
889         while (idx < len && buf.charAt(idx) != C_SINGLEQUOTE)
890           idx++;
891         if (idx == len)
892           continue;  // bad value
893         end = idx;
894         idx++;
895       }
896       else
897       {//not quoted, whitespace terminates attribute value
898         while (idx < len && !isSpace(buf.charAt(idx)))
899           idx++;
900         end = idx;
901       }
902 
903       String value = buf.substring(begin, end);
904 
905       if (m_unescape)
906         value = unescape(value);
907 
908       tag.setParam(name, value);
909       tag.setWhitespace(name, whitespaceBefore, whitespaceAfter);
910     }
911   }
912 }
913