Source code: com/arthurdo/parser/HtmlStreamTokenizer.java
1 /*
2 * Copyright (c) 1996, 2001 by Arthur Do <arthur@cs.stanford.edu>.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 package com.arthurdo.parser;
21
22 import java.io.*;
23 import java.util.*;
24
25 /**
26 * <p>HtmlStreamTokenizer is an HTML parser that is similar
27 * to the StreamTokenizer class but is specialized for
28 * HTML streams. This class is useful when you need to
29 * parse the structure of an HTML document.</p>
30 *
31 * <pre>
32 * import com.arthurdo.parser.*;
33 * <p>
34 * HtmlStreamTokenizer tok = new HtmlStreamTokenizer(inputstream);
35 * HtmlTag tag = new HtmlTag();
36 *
37 * while (tok.nextToken() != HtmlStreamTokenizer.TT_EOF) {
38 * int ttype = tok.getTokenType();
39 *
40 * if (ttype == HtmlStreamTokenizer.TT_TAG) {
41 * tok.parseTag(tok.getStringValue(), tag);
42 * System.out.println("tag: " + tag.toString());
43 * }
44 * else if (ttype == HtmlStreamTokenizer.TT_TEXT) {
45 * System.out.println("text: " + tok.getStringValue());
46 * }
47 * else if (ttype == HtmlStreamTokenizer.TT_COMMENT) {
48 * System.out.println("comment: <!--" + tok.getStringValue() + "-->");
49 * }
50 * }
51 * </pre>
52 *
53 * <p>One of the motivations for designing <i>parseTag()</i> to take
54 * an HtmlTag argument rather than having <i>parseTag()</i> return
55 * a newly created HtmlTag is so you can create your own tag class
56 * derived from HtmlTag.
57 *
58 * <ul>
59 * <li> 02/09/98 Thomas Horster-Möller, fixed bug with counting
60 * newlines twice on character pushback.
61 * <li> 06/14/99 text is now returned as 'runs' instead of being
62 * broken up into words as in previous versions. You can use a StringTokenizer
63 * to break your text into words.
64 * </ul>
65 *
66 * @version 2.01 09/12/97
67 * @author Arthur Do <arthur@cs.stanford.edu>
68 * @see com.arthurdo.parser.HtmlTag
69 * @see com.arthurdo.parser.Table
70 */
71 public class HtmlStreamTokenizer
72 {
73 /**
74 * end of stream.
75 */
76 public static final int TT_EOF = -1;
77 /**
78 * text token.
79 */
80 public static final int TT_TEXT = -2;
81 /**
82 * tag token.
83 */
84 public static final int TT_TAG = -3;
85 /**
86 * comment token.
87 */
88 public static final int TT_COMMENT = -4;
89
90 /**
91 * inside <! to provide support for doctypes with internal dtd, <![CDATA sections, and degenerate html comments
92 */
93 public static final int TT_BANGTAG = -5;
94
95 /**
96 * entity reference token (&*;)
97 */
98 public static final int TT_ENTITYREFERENCE = -6;
99
/**
 * Creates a tokenizer that reads from a byte stream. The stream is
 * wrapped in an InputStreamReader (platform default character
 * encoding) and a BufferedReader.
 *
 * @deprecated use HtmlStreamTokenizer(Reader) instead.
 *      This version of the constructor can lead to 10x slower code
 *      because of the InputStreamReader wrapper.
 * @param in input stream
 */
public HtmlStreamTokenizer(InputStream in)
{
    this(new BufferedReader(new InputStreamReader(in)));
}
110
/**
 * Creates a tokenizer that reads characters from the given reader and
 * starts out in the text state.
 *
 * @param in Reader. The input is assumed to be buffered as needed;
 *      nextToken() reads it one character at a time.
 */
public HtmlStreamTokenizer(Reader in)
{
    this.m_state = STATE_TEXT;
    this.m_in = in;
}
119
120 /**
121 * @return token type, one of the <b>TT_</b> defines
122 */
123 public final int getTokenType()
124 {
125 return m_ttype;
126 }
127
128 /**
129 * @return string value of the token
130 */
131 public final StringBuffer getStringValue()
132 {
133 return m_buf;
134 }
135
136 /**
137 * @return string value of the token, including characters stripped off by the tokenizer
138 */
139 public final String getRawString()
140 {
141 switch (m_ttype)
142 {
143 case TT_TAG:
144 return "<" + m_buf.toString() + ">";
145 case TT_BANGTAG:
146 return "<!" + m_buf.toString() + ">";
147 case TT_COMMENT:
148 return "<!--" + m_buf.toString() + "-->";
149 case TT_ENTITYREFERENCE:
150 return "&" + m_buf.toString() + ";";
151 default:
152 return m_buf.toString();
153 }
154 }
155
156 /**
157 * @deprecated white space is now returned as TT_TEXT. This buffer is always
158 * empty.
159 * @return any white space accumulated since last call to nextToken
160 */
161 public final StringBuffer getWhiteSpace()
162 {
163 return m_whitespace;
164 }
165
166 /**
167 * @return current line number. Every time nextToken() sees a new
168 * line character ('\n'), it increments the line number.
169 */
170 public int getLineNumber()
171 {
172 return m_lineno;
173 }
174
175 /**
176 * @param char[] exitString CDATA mode will terminate when it encounters this string
177 * @param boolean pushbackExitString whether to parse the exit string again or not
178 *
179 * it'd be an error to call enterCDATAMode(exitString, true); getToken()==TT_CDATA; enterCDATAMode(differentExitString, true); 'cause the next getToken() call will parse differentExitString instead of exitString
180 */
181 public void enterCDATAMode(char[] exitString, boolean pushbackExitString)
182 {
183 m_cdata_end = exitString;
184 m_cdata = 0;
185 m_cdata_pushback = pushbackExitString;
186 }
187
188 public boolean isCDATA() { return m_isCDTATA; }
189
190 /**
191 * @return the next token
192 * @exception IOException if error reading input stream.
193 */
194 public int nextToken()
195 throws IOException
196 {
197 m_buf.setLength(0);
198 m_whitespace.setLength(0);
199 int ltcount = 0;
200 m_isCDTATA = false;
201 boolean hasAmp = false;
202
203 while (true)
204 {
205 int c;
206
207 if (m_pushback != 0)
208 {
209 c = m_pushback;
210 if (c == '\n')
211 m_lineno--; // don't count newline twice
212 m_pushback = 0;
213 }
214 else if (m_cdata < -1)
215 c = m_cdata_end[m_cdata++ + m_cdata_end.length + 1];
216 else
217 {
218 c = m_in.read();
219 }
220
221 if (c < 0)
222 {
223 int state = m_state;
224 m_state = STATE_EOF;
225
226 if (m_buf.length() > 0 && state == STATE_TEXT)
227 {
228 if (m_unescape && hasAmp)
229 unescape(m_buf);
230 return m_ttype = TT_TEXT;
231 }
232 else
233 return m_ttype = TT_EOF;
234 }
235
236 if (c == '\n')
237 m_lineno++;
238
239 switch (m_state)
240 {
241 case STATE_TEXT:
242 {
243 if (m_cdata > -1)
244 {
245 //we're in cdata mode
246 if ((Character.toUpperCase(m_cdata_end[m_cdata]) == c) ||
247 (Character.toLowerCase(m_cdata_end[m_cdata]) == c)) // support case sensitive exit strings
248 {
249 if (++m_cdata == m_cdata_end.length)
250 {
251 if (m_cdata_pushback)
252 m_cdata = -m_cdata_end.length -1;
253 else
254 m_cdata = -1;
255 m_isCDTATA = true;
256 return m_ttype = TT_TEXT;
257 }
258 }
259 else
260 {
261 if (m_cdata > 0)
262 {
263 m_buf.append(m_cdata_end, 0, m_cdata);
264 m_cdata = 0;
265 }
266 m_buf.append((char)c);
267 }
268
269 }
270 else if (c == '<')
271 {
272 boolean inCDATApushback = m_cdata < -1;
273 int peek = inCDATApushback ? m_cdata_end[m_cdata++ + m_cdata_end.length + 1] : m_in.read();
274
275 if (peek == '!')
276 m_state = STATE_BANGTAG;
277 else if (peek == '<')
278 {
279 // handle <<, some people use it in <pre>
280 m_buf.append("<<");
281 break;
282 }
283 else
284 {
285 m_pushback = peek;
286 if (inCDATApushback)
287 --m_cdata;
288 m_state = STATE_TAG;
289 }
290
291 if (m_buf.length() > 0)
292 {
293 if (m_unescape && hasAmp)
294 unescape(m_buf);
295 return m_ttype = TT_TEXT;
296 }
297 }
298 /*
299 else if (isSpace(c))
300 {
301 m_pushback = c;
302 m_state = STATE_WS;
303 if (m_buf.length() > 0)
304 {
305 if (m_unescape && hasAmp)
306 unescape(m_buf);
307 return m_ttype = TT_TEXT;
308 }
309 }
310 */
311 else
312 {
313 if (c == '&')
314 {
315 if (m_getEntities)
316 {
317 m_state = STATE_ENTITYREF;
318 return m_ttype = TT_TEXT;
319 }
320 else
321 hasAmp = true;
322 }
323 m_buf.append((char)c);
324 }
325 }
326 break;
327
328 case STATE_WS:
329 {
330 if (!isSpace(c))
331 {
332 m_pushback = c;
333 m_state = STATE_TEXT;
334 }
335 else
336 {
337 m_whitespace.append((char)c);
338 }
339 }
340 break;
341
342 case STATE_TAG:
343 {
344 if (c == '>')
345 {
346 m_state = STATE_TEXT;
347 return m_ttype = TT_TAG;
348 }
349 else if (c == C_SINGLEQUOTE || c == C_DOUBLEQUOTE)
350 {
351 // handle quotes inside tag
352 m_tagquote = c;
353 m_buf.append((char)c);
354 m_state = STATE_TAG_QUOTE;
355 }
356 else
357 {
358 m_buf.append((char)c);
359 }
360 }
361 break;
362 case STATE_BANGTAG:
363 {
364 int buflen = m_buf.length();
365 if (c == '<')
366 {
367 ++ltcount;
368 m_buf.append((char)c);
369 }
370 else if (c == '>' && --ltcount < 0)
371 {
372 m_state = STATE_TEXT;
373 return m_ttype = TT_BANGTAG;
374 }
375 else if (c == '-' && buflen == 1 && m_buf.charAt(0) == '-')
376 {
377 // handle <!--
378 m_buf.setLength(0);
379 m_state = STATE_COMMENT;
380 }
381 else if (buflen == 6 && c == '[' && m_buf.toString().equals("[CDATA[") )
382 {
383 // handle <![CDATA[
384 m_buf.setLength(0);
385 enterCDATAMode(m_xmlcdata_end, false);
386 m_state = STATE_TEXT;
387 }
388 else
389 {
390 m_buf.append((char)c);
391 }
392 }
393 break;
394 case STATE_TAG_QUOTE:
395 {
396 // the only way out out of this state is to close the quote
397 // special case: some people forget to end quote in a tag
398 if (c == '>')
399 {
400 m_pushback = c;
401 m_state = STATE_TAG;
402 }
403 else
404 {
405 m_buf.append((char)c);
406 if (c == m_tagquote)
407 {
408 m_state = STATE_TAG;
409 }
410 }
411 }
412 break;
413
414 case STATE_COMMENT:
415 {
416 if (c == '>' && m_comment >= 2)
417 {
418 m_buf.setLength(m_buf.length() - 2);
419 m_comment = 0;
420 m_state = STATE_TEXT;
421 return m_ttype = TT_COMMENT;
422 }
423 else if (c == '-')
424 {
425 m_comment++;
426 }
427 else
428 {
429 m_comment = 0;
430 }
431
432 m_buf.append((char)c);
433 }
434 break;
435 case STATE_ENTITYREF:
436 {
437 if (c == ';' || c == '<' || (isPunct( (char) c) && c != '#') || isSpace(c)) //accept any of these as terminating the entity
438 {
439 if (c != ';')
440 m_pushback = c;
441 m_state = STATE_TEXT;
442 return m_ttype = TT_ENTITYREFERENCE;
443 }
444 m_buf.append((char)c);
445 }
446 break;
447 }
448 }
449 }
450
451 /**
452 * The reason this function takes an HtmlTag argument rather than returning
453 * a newly created HtmlTag object is so that you can create your own
454 * tag class derived from HtmlTag if desired.
455 *
456 * @param sbuf text buffer to parse
457 * @param tag parse the text buffer and store the result in this object
458 * @exception HtmlException if malformed tag.
459 */
460 public void parseTag(StringBuffer sbuf, HtmlTag tag)
461 throws HtmlException
462 {
463 tag.reset();
464
465 String buf = sbuf.toString();
466 int len = buf.length();
467 int idx = 0;
468 int begin = 0;
469
470 // parse tag
471 while (idx < len && isSpace(buf.charAt(idx)))
472 idx++;
473
474 if (idx == len)
475 throw new HtmlException("parse empty tag");
476
477 if (buf.charAt(idx) == C_ENDTAG)
478 {
479 tag.setEndTag(true);
480 idx++;
481 }
482
483 if (idx == len)
484 throw new HtmlException("parse empty tag");
485
486 begin = idx;
487 // deal with empty tags like <img/>
488 while (idx < len && !isSpace(buf.charAt(idx)) && buf.charAt(idx) != C_EMPTY)
489 idx++;
490 String token = buf.substring(begin, idx);
491
492 tag.setTag(token);
493
494 parseParams(tag, buf, idx);
495 }
496
497 /**
498 * Replaces HTML escape sequences with its character equivalent, e.g.
499 * <b>&amp;copy;</b> becomes <b>&copy;</b>.
500 *
501 * @param buf text buffer to unescape
502 * @return a string with all HTML escape sequences removed
503 */
504 public static String unescape(String buf)
505 {
506 // quick check to see if there are any escape characters
507 if (buf.indexOf('&') == -1)
508 return buf;
509
510 StringBuffer b = new StringBuffer(buf);
511 unescape(b);
512 return b.toString();
513 }
514
515
516 /**
517 * Replaces HTML escape sequences with its character equivalent, e.g.
518 * <b>&copy;</b> becomes <b>©</b>.
519 *
520 * @param buf will remove all HTML escape sequences from this buffer
521 */
522 public static void unescape(StringBuffer buf)
523 {
524 int len = buf.length();
525 int i = 0;
526 int r = i;
527 while (i<len)
528 {
529 char ch = buf.charAt(i);
530 if (ch == '&')
531 {
532 int saver = r;
533 String esc = "";
534 int j = i+1;
535 for (; j<len; j++)
536 {
537 buf.setCharAt(r++, ch);
538 ch = buf.charAt(j);
539 if (ch == ';' || ch == '<' || (isPunct(ch) && ch != '#') || isSpace(ch))
540 {
541 Character e = parseEscape(esc);
542 if (e != null)
543 {
544 // found escape sequence
545 // as opposed to false or unrecognized escape, e.g. AT&T.
546 r = saver;
547 char v = e.charValue();
548 buf.setCharAt(r++, v);
549 }
550 i = j;
551 // this handles things like <>
552 if (ch != '&')
553 i++; // if not '&' then discard char
554 break;
555 }
556 esc += ch;
557 }
558 if (j == len)
559 {
560 Character e = parseEscape(esc);
561 if (e != null)
562 {
563 r = saver;
564 buf.setCharAt(r++, e.charValue());
565 }
566 break;
567 }
568 }
569 else
570 {
571 buf.setCharAt(r++, ch);
572 i++;
573 }
574 }
575 buf.setLength(r);
576 }
577
// ---- current token ----

// type of the current token, one of the TT_ constants
private int m_ttype;
// value of the current token; cleared at the start of every nextToken()
private StringBuffer m_buf = new StringBuffer(128);
// only filled in STATE_WS; see the deprecated getWhiteSpace()
private StringBuffer m_whitespace = new StringBuffer();

// ---- scanner state ----

// one character of look-ahead handed back to the scanner; 0 means none
private int m_pushback = 0;
// current line number, starting at 1
private int m_lineno = 1;
// number of '-' characters seen in a row while inside a comment
private int m_comment = 0;
// quote character that opened the quoted section of the current tag
private int m_tagquote;

// ---- CDATA mode ----

// string that ends CDATA mode
private char[] m_cdata_end = null;
// >= 0: in CDATA mode, index of the next m_cdata_end character to match;
//   -1: not in CDATA mode;
// < -1: replaying m_cdata_end after it has been matched
private int m_cdata = -1;
// whether m_cdata_end is replayed as input once it has been matched
private boolean m_cdata_pushback = false;
// whether the last text token was ended by the CDATA exit string
private boolean m_isCDTATA = false;
// exit string used for <![CDATA[ sections
private static char[] m_xmlcdata_end = "]]>".toCharArray();

// ---- scanner states ----

private static final int STATE_EOF = -1;
private static final int STATE_COMMENT = -2;
private static final int STATE_TEXT = -3;
private static final int STATE_TAG = -4;
private static final int STATE_WS = -5;
private static final int STATE_TAG_QUOTE = -6;
private static final int STATE_BANGTAG = -7;
private static final int STATE_ENTITYREF = -8;

private int m_state = STATE_TEXT;

// ---- input ----

private Reader m_in; //input reader appears to be an order of magnitude slower than inputstream!

// ---- special characters ----

/*package*/ static final char C_ENDTAG = '/';
private static final char C_EMPTY = '/'; // XML char for empty tags
private static final char C_SINGLEQUOTE = '\'';
private static final char C_DOUBLEQUOTE = '"';

// ---- character classification table ----

private static final int CTYPE_LEN = 256;
// one entry of CT_ flags per character code below CTYPE_LEN
private static byte m_ctype[] = new byte[CTYPE_LEN];
private static final byte CT_WHITESPACE = 1;
private static final byte CT_DIGIT = 2;
private static final byte CT_ALPHA = 4;
private static final byte CT_QUOTE = 8;
private static final byte CT_COMMENT = 16;

// ---- escape sequences ----

// entity name (String) -> character (Character)
private static Hashtable m_escapes = new Hashtable();
// whether text and attribute values get their escape sequences replaced
private boolean m_unescape = false;
private boolean m_getEntities = false; //return TT_ENTITYREFERENCE
622
623 static
624 {
625 int len = m_ctype.length;
626 for (int i = 0; i < len; i++)
627 m_ctype[i] = 0;
628
629 m_ctype[' '] = CT_WHITESPACE;
630 m_ctype['\r'] = CT_WHITESPACE;
631 m_ctype['\n'] = CT_WHITESPACE;
632 m_ctype['\t'] = CT_WHITESPACE;
633 for (int i = 0x0E; i <= 0x1F; i++)
634 m_ctype[i] = CT_WHITESPACE;
635
636 m_escapes.put(new String("Aacute"), new Character('\u00c1'));
637 m_escapes.put(new String("aacute"), new Character('\u00e1'));
638 m_escapes.put(new String("Acirc"), new Character('\u00c2'));
639 m_escapes.put(new String("acirc"), new Character('\u00e2'));
640 m_escapes.put(new String("AElig"), new Character('\u00c6'));
641 m_escapes.put(new String("aelig"), new Character('\u00e6'));
642 m_escapes.put(new String("Agrave"), new Character('\u00c0'));
643 m_escapes.put(new String("agrave"), new Character('\u00e0'));
644 m_escapes.put(new String("amp"), new Character('&'));
645 m_escapes.put(new String("aring"), new Character('\u00e5'));
646 m_escapes.put(new String("Atilde"), new Character('\u00c3'));
647 m_escapes.put(new String("atilde"), new Character('\u00e3'));
648 m_escapes.put(new String("Auml"), new Character('\u00c4'));
649 m_escapes.put(new String("auml"), new Character('\u00e4'));
650 m_escapes.put(new String("brvbar"), new Character('\u00a6'));
651 m_escapes.put(new String("Ccedil"), new Character('\u00c7'));
652 m_escapes.put(new String("ccedil"), new Character('\u00e7'));
653 m_escapes.put(new String("cent"), new Character('\u00a2'));
654 m_escapes.put(new String("copy"), new Character('\u00a9'));
655 m_escapes.put(new String("deg"), new Character('\u00b0'));
656 m_escapes.put(new String("Eacute"), new Character('\u00c9'));
657 m_escapes.put(new String("eacute"), new Character('\u00e9'));
658 m_escapes.put(new String("Ecirc"), new Character('\u00ca'));
659 m_escapes.put(new String("ecirc"), new Character('\u00ea'));
660 m_escapes.put(new String("Egrave"), new Character('\u00c8'));
661 m_escapes.put(new String("egrave"), new Character('\u00e8'));
662 m_escapes.put(new String("ETH"), new Character('\u00d0'));
663 m_escapes.put(new String("eth"), new Character('\u00f0'));
664 m_escapes.put(new String("Euml"), new Character('\u00cb'));
665 m_escapes.put(new String("euml"), new Character('\u00eb'));
666 m_escapes.put(new String("frac12"), new Character('\u00bd'));
667 m_escapes.put(new String("frac14"), new Character('\u00bc'));
668 m_escapes.put(new String("frac34"), new Character('\u00be'));
669 m_escapes.put(new String("gt"), new Character('>'));
670 m_escapes.put(new String("iacute"), new Character('\u00ed'));
671 m_escapes.put(new String("Icirc"), new Character('\u00ce'));
672 m_escapes.put(new String("icirc"), new Character('\u00ee'));
673 m_escapes.put(new String("iexcl"), new Character('\u00a1'));
674 m_escapes.put(new String("Igrave"), new Character('\u00cc'));
675 m_escapes.put(new String("igrave"), new Character('\u00ec'));
676 m_escapes.put(new String("iquest"), new Character('\u00bf'));
677 m_escapes.put(new String("Iuml"), new Character('\u00cf'));
678 m_escapes.put(new String("iuml"), new Character('\u00ef'));
679 m_escapes.put(new String("laquo"), new Character('\u00ab'));
680 m_escapes.put(new String("lt"), new Character('<'));
681 m_escapes.put(new String("middot"), new Character('\u00b7'));
682 m_escapes.put(new String("nbsp"), new Character('\u00A0'));
683 m_escapes.put(new String("not"), new Character('\u00ac'));
684 m_escapes.put(new String("Ntilde"), new Character('\u00d1'));
685 m_escapes.put(new String("ntilde"), new Character('\u00f1'));
686 m_escapes.put(new String("Oacute"), new Character('\u00d3'));
687 m_escapes.put(new String("oacute"), new Character('\u00f3'));
688 m_escapes.put(new String("Ocirc"), new Character('\u00d4'));
689 m_escapes.put(new String("ocirc"), new Character('\u00f4'));
690 m_escapes.put(new String("Ograve"), new Character('\u00d2'));
691 m_escapes.put(new String("ograve"), new Character('\u00f2'));
692 m_escapes.put(new String("Oslash"), new Character('\u00d8'));
693 m_escapes.put(new String("oslash"), new Character('\u00f8'));
694 m_escapes.put(new String("Otilde"), new Character('\u00d5'));
695 m_escapes.put(new String("otilde"), new Character('\u00f5'));
696 m_escapes.put(new String("Ouml"), new Character('\u00d6'));
697 m_escapes.put(new String("ouml"), new Character('\u00f6'));
698 m_escapes.put(new String("para"), new Character('\u00b6'));
699 m_escapes.put(new String("plusmn"), new Character('\u00b1'));
700 m_escapes.put(new String("pound"), new Character('\u00a3'));
701 m_escapes.put(new String("quot"), new Character('"'));
702 m_escapes.put(new String("reg"), new Character('\u00ae'));
703 m_escapes.put(new String("sect"), new Character('\u00a7'));
704 m_escapes.put(new String("sup1"), new Character('\u00b9'));
705 m_escapes.put(new String("sup2"), new Character('\u00b2'));
706 m_escapes.put(new String("sup3"), new Character('\u00b3'));
707 m_escapes.put(new String("szlig"), new Character('\u00df'));
708 m_escapes.put(new String("THORN"), new Character('\u00de'));
709 m_escapes.put(new String("thorn"), new Character('\u00fe'));
710 m_escapes.put(new String("Uacute"), new Character('\u00da'));
711 m_escapes.put(new String("uacute"), new Character('\u00fa'));
712 m_escapes.put(new String("Ucirc"), new Character('\u00db'));
713 m_escapes.put(new String("ucirc"), new Character('\u00fb'));
714 m_escapes.put(new String("Ugrave"), new Character('\u00d9'));
715 m_escapes.put(new String("ugrave"), new Character('\u00f9'));
716 m_escapes.put(new String("Uuml"), new Character('\u00dc'));
717 m_escapes.put(new String("uuml"), new Character('\u00fc'));
718 m_escapes.put(new String("Yacute"), new Character('\u00dd'));
719 m_escapes.put(new String("yacute"), new Character('\u00fd'));
720 m_escapes.put(new String("yen"), new Character('\u00a5'));
721 m_escapes.put(new String("yuml"), new Character('\u00ff'));
722 }
723
724 private static boolean isSpace(int c)
725 {
726 return c >=0 && c < CTYPE_LEN ? (m_ctype[c] & CT_WHITESPACE) != 0: false;
727 }
728
729 private static boolean isPunct(char c)
730 {
731 return !Character.isLetterOrDigit(c);
732 }
733
734 public boolean isUnescaped()
735 {
736 return m_unescape;
737 }
738
739 public void setUnescaped(boolean unescape)
740 {
741 m_unescape = unescape;
742 }
743
744 private static Character parseEscape(String s)
745 {
746 int len = s.length();
747 if (len == 0)
748 return null;
749 Character ch = null;
750
751 if (s.charAt(0) == '#')
752 {
753 if (len <= 1)
754 return null;
755
756 int code = 0;
757 for (int i=1; i<len; i++)
758 {
759 if (!Character.isDigit(s.charAt(i)))
760 return null;
761 code = (code * 10) + Character.digit(s.charAt(i), 10);
762 }
763 ch = new Character((char)code);
764 }
765 else
766 {
767 ch = (Character)m_escapes.get(s);
768 }
769
770 return ch;
771 }
772
773 private void parseParams(HtmlTag tag, String buf, int idx)
774 throws HtmlException
775 {
776 int len = buf.length();
777 int begin = 0;
778
779 if (len-1 >= idx)
780 {
781 int end = len - 1;
782 while (end > idx && isSpace(buf.charAt(end)))//remove trailing whitespace
783 end--;
784 //todo: tag.setWhitespaceAtEnd(buf.substring(end, len-1) );
785 if (buf.charAt(end) == C_EMPTY)
786 {
787 tag.setEmpty(true);
788 end--;
789 }
790 len = end + 1;
791 }
792
793 while (idx < len)
794 {
795 begin = idx;
796 while (idx < len && isSpace(buf.charAt(idx)))//skip space before attribute name
797 idx++;
798
799 if (idx == len)//at end
800 continue;
801
802 String whitespaceBefore = buf.substring(begin, idx);
803
804 begin = idx;
805 if (buf.charAt(idx) == C_DOUBLEQUOTE) //how often are attribute names quoted??
806 {
807 idx++;
808 while (idx < len && buf.charAt(idx) != C_DOUBLEQUOTE)//look for close quote
809 idx++;
810 if (idx == len)
811 continue; // bad name
812 idx++;
813 }
814 else if (buf.charAt(idx) == C_SINGLEQUOTE) //how often are attribute names quoted??
815 {
816 idx++;
817 while (idx < len && buf.charAt(idx) != C_SINGLEQUOTE)//look for close quote
818 idx++;
819 if (idx == len)
820 continue; // bad name
821 idx++;
822 }
823 else
824 {
825 //if not quoted look for whitespace or '=' to terminate attribute name
826 while (idx < len && !isSpace(buf.charAt(idx)) && buf.charAt(idx) != '=')
827 idx++;
828 }
829
830 String name = buf.substring(begin, idx);
831
832 begin = idx;
833 if (idx < len && isSpace(buf.charAt(idx)))//skip whitespace after attribute name
834 {
835 while (idx < len && isSpace(buf.charAt(idx)))
836 idx++;
837 }
838
839 if (idx == len || buf.charAt(idx) != '=') //attribute name only, no value specified
840 {
841 // name with empty value
842 tag.setParam(name, name); //set the attribute name as the value (SGML tag minimalization rule)
843 tag.setWhitespace(name, whitespaceBefore, "");
844 continue;
845 }
846 idx++; //skip past the '='
847
848 if (idx == len)
849 continue;
850
851 if (isSpace(buf.charAt(idx)))
852 {
853 while (idx < len && isSpace(buf.charAt(idx)))//skip past whitespace after '='
854 idx++;
855
856 // special case: if value is surrounded by quotes
857 // then it can have a space after the '='
858 //if (idx == len || (buf.charAt(idx) != C_DOUBLEQUOTE && buf.charAt(idx) != C_SINGLEQUOTE))
859 if (idx == len)
860 {
861 // name with empty value
862 tag.setParam(name, name); //set the attribute name as the value (SGML tag minimalization rule)
863 tag.setWhitespace(name, whitespaceBefore, buf.substring(begin, idx));
864 continue;
865 }
866 }
867
868 char quote = buf.charAt(idx);
869 int includeQuote = (quote == C_DOUBLEQUOTE || quote == C_SINGLEQUOTE) ? 1 : 0;
870 String whitespaceAfter = buf.substring(begin, idx + includeQuote);
871
872 begin = idx;
873 int end = begin;
874 if (quote == C_DOUBLEQUOTE)
875 {
876 idx++;
877 begin = idx;
878 while (idx < len && buf.charAt(idx) != C_DOUBLEQUOTE)
879 idx++;
880 if (idx == len)
881 continue; // bad value
882 end = idx;
883 idx++;
884 }
885 else if (quote == C_SINGLEQUOTE)
886 {
887 idx++;
888 begin = idx;
889 while (idx < len && buf.charAt(idx) != C_SINGLEQUOTE)
890 idx++;
891 if (idx == len)
892 continue; // bad value
893 end = idx;
894 idx++;
895 }
896 else
897 {//not quoted, whitespace terminates attribute value
898 while (idx < len && !isSpace(buf.charAt(idx)))
899 idx++;
900 end = idx;
901 }
902
903 String value = buf.substring(begin, end);
904
905 if (m_unescape)
906 value = unescape(value);
907
908 tag.setParam(name, value);
909 tag.setWhitespace(name, whitespaceBefore, whitespaceAfter);
910 }
911 }
912 }
913