Source code: com/arthurdo/parser/HtmlTag.java
1 /*
2 * Copyright (c) 1996, 2001 by Arthur Do <arthur@cs.stanford.edu>.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 package com.arthurdo.parser;
21
22 import java.util.*;
23
24 /**
25 * HtmlTag is a helper class to store parsed tag information.
26 *
27 * @version 2.01 09/12/97
28 * @author Arthur Do <arthur@cs.stanford.edu>
29 * @see com.arthurdo.parser.HtmlStreamTokenizer
30 */
31 public class HtmlTag
32 {
33 public HtmlTag()
34 {
35 }
36
37 public HtmlTag(HtmlTag orig)
38 {
39 m_tag = new String(orig.m_tag);
40 m_ttype = orig.m_ttype;
41 m_endtag = orig.m_endtag;
42 m_names = (Vector)orig.m_names.clone();
43 m_values = (Vector)orig.m_values.clone();
44 // m_params = (Hashtable)orig.m_params.clone();
45 // m_originalParamNames = (Hashtable)orig.m_originalParamNames.clone();
46 m_empty = orig.m_empty;
47 }
48
49 /**
50 * Sets the tag name.
51 *
52 * @param tag name of tag, e.g. "img"
53 * @exception HtmlException if malformed tag.
54 */
55 public void setTag(String tag)
56 throws HtmlException
57 {
58 try
59 {
60 m_tag = tag;
61 Object value = m_tags.get(tag.toUpperCase());
62 if (value != null)
63 m_ttype = ((Integer)value).intValue();
64 }
65 catch (StringIndexOutOfBoundsException e)
66 {
67 throw new HtmlException("invalid tag");
68 }
69 }
70
71 /**
72 * @return tag type, e.g. one of the <b>T_</b> constants.
73 */
74 public int getTagType()
75 {
76 return m_ttype;
77 }
78
79 /**
80 * @return tag name, the same name as passed to the constructor.
81 */
82 public String getTagString()
83 {
84 return m_tag;
85 }
86
87 /**
88 * @return this is an end tag or not, i.e. if the tag has a slash before the name.
89 */
90 public boolean isEndTag()
91 {
92 return m_endtag;
93 }
94
95 /**
96 * Looks up a tag param name and returns the associated
97 * value, if any. Try to use the predefined <b>P_</b> constants.
98 *
99 * @param name name of param
100 * @return the value associated with the name, or null.
101 */
102 public String getParam(String name)
103 {
104 final int idx = indexOfName(name);
105 if (idx != -1)
106 return (String)m_values.elementAt(idx);
107
108 return null;
109 }
110
111 /**
112 * Looks up a tag param name (by position)
113 *
114 * @param i The index of the param in the list (starting at 0).
115 * @return The name of the indexed param
116 */
117 public String getParamName(int i)
118 {
119 return (String)m_names.elementAt(i);
120 }
121
122 /**
123 * Looks up a tag param value (by position)
124 *
125 * @param i The index of the param in the list (starting at 0).
126 * @return The value of the indexed param
127 */
128 public String getParamValue(int i)
129 {
130 return (String)m_values.elementAt(i);
131 }
132
133 /**
134 * Looks up a tag param name and returns the associated
135 * value, if any. Try to use the predefined <b>P_</b> constants.
136 *
137 * @param name name of param, must be lowercase
138 * @return the integer value associated with the name.
139 * @exception NumberFormatException if value is not a number.
140 */
141 public int getIntParam(String name)
142 throws NumberFormatException
143 {
144 return Integer.parseInt(getParam(name));
145 }
146
147 /**
148 * Determines if tag has a particular parameter.
149 *
150 * @param name name of param, must be lowercase
151 * @return true if tag contains parameter, false otherwise.
152 */
153 public boolean hasParam(String name)
154 {
155 return getParam(name) != null;
156 }
157
158 /**
159 * Associates a param name with a value.
160 *
161 * @param name name of param
162 * @param value value associated with name
163 */
164 public void setParam(String name, String value)
165 {
166 m_names.addElement(name);
167 m_values.addElement(value);
168 }
169
170 public void setWhitespace(String name, String whitespaceBefore, String whitespaceAfter)
171 {
172 }
173
174 /**
175 * Remove association of a param name with a value.
176 *
177 * @param name name of param to remove
178 */
179 public void removeParam(String name)
180 {
181 final int idx = indexOfName(name);
182 if (idx != -1)
183 {
184 m_names.removeElementAt(idx);
185 m_values.removeElementAt(idx);
186 }
187 }
188
189 /**
190 * @return an enumeration of the parameter names.
191 */
192 public Enumeration getParamNames()
193 {
194 return m_names.elements();
195 }
196
197 /**
198 * @return an enumeration of the parameter values.
199 */
200 public Enumeration getParamValues()
201 {
202 return m_values.elements();
203 }
204
205 /**
206 * @return the number of params.
207 */
208 public int getParamCount()
209 {
210 return m_names.size();
211 }
212
213 /**
214 * An empty tag ends with a '/'.
215 *
216 * @return true if empty tag, false otherwise.
217 */
218 public boolean isEmpty()
219 {
220 return m_empty;
221 }
222
223 /**
224 * @return string representation of tag
225 */
226 public String toString()
227 {
228 StringBuffer tag = new StringBuffer();
229
230 tag.append('<');
231 if (isEndTag())
232 tag.append(HtmlStreamTokenizer.C_ENDTAG);
233 tag.append(getTagString());
234
235 final int size = m_names.size();
236 for (int i=0; i<size; i++)
237 {
238 String name = (String)m_names.elementAt(i);
239 tag.append(" " + name);
240 String value = (String)m_values.elementAt(i);
241 if (value.length() > 0)
242 tag.append("=\"" + value + "\"");
243 }
244 if (isEmpty())
245 tag.append(" /");
246 tag.append('>');
247
248 return tag.toString();
249 }
250
251 /**
252 * Reset tag to original state, as if it was just constructed.
253 */
254 public void reset()
255 {
256 m_tag = null;
257 m_ttype = T_UNKNOWN;
258 m_endtag = false;
259 m_names.removeAllElements();
260 m_values.removeAllElements();
261 m_empty = false;
262 }
263
264
265 public static final int T_UNKNOWN = 0;
266 public static final int T_A = 1;
267 public static final int T_ABBREV = 2;
268 public static final int T_ACRONYM = 3;
269 public static final int T_ADDRESS = 4;
270 public static final int T_APPLET = 5;
271 public static final int T_AREA = 6;
272 public static final int T_AU = 7;
273 public static final int T_B = 8;
274 public static final int T_BANNER = 9;
275 public static final int T_BASE = 10;
276 public static final int T_BASEFONT = 11;
277 public static final int T_BGSOUND = 12;
278 public static final int T_BIG = 13;
279 public static final int T_BLINK = 14;
280 public static final int T_BLOCKQUOTE = 15;
281 public static final int T_BODY = 16;
282 public static final int T_BR = 17;
283 public static final int T_CAPTION = 18;
284 public static final int T_CENTER = 19;
285 public static final int T_CITE = 20;
286 public static final int T_CODE = 21;
287 public static final int T_COL = 22;
288 public static final int T_COLGROUP = 23;
289 public static final int T_CREDIT = 24;
290 public static final int T_DD = 25;
291 public static final int T_DEL = 26;
292 public static final int T_DFN = 27;
293 public static final int T_DIR = 28;
294 public static final int T_DIV = 29;
295 public static final int T_DL = 30;
296 public static final int T_DT = 31;
297 public static final int T_EM = 32;
298 public static final int T_EMBED = 33;
299 public static final int T_FIG = 34;
300 public static final int T_FN = 35;
301 public static final int T_FONT = 36;
302 public static final int T_FORM = 37;
303 public static final int T_FRAME = 38;
304 public static final int T_FRAMESET = 39;
305 public static final int T_H1 = 40;
306 public static final int T_H2 = 41;
307 public static final int T_H3 = 42;
308 public static final int T_H4 = 43;
309 public static final int T_H5 = 44;
310 public static final int T_H6 = 45;
311 public static final int T_HEAD = 46;
312 public static final int T_HTML = 47;
313 public static final int T_HR = 48;
314 public static final int T_I = 49;
315 public static final int T_IMG = 50;
316 public static final int T_INPUT = 51;
317 public static final int T_INS = 52;
318 public static final int T_ISINDEX = 53;
319 public static final int T_KBD = 54;
320 public static final int T_LANG = 55;
321 public static final int T_LH = 56;
322 public static final int T_LI = 57;
323 public static final int T_LINK = 58;
324 public static final int T_MAP = 59;
325 public static final int T_MARQUEE = 60;
326 public static final int T_MENU = 61;
327 public static final int T_META = 62;
328 public static final int T_NEXTID = 63;
329 public static final int T_NOBR = 64;
330 public static final int T_NOEMBED = 65;
331 public static final int T_NOFRAME = 66;
332 public static final int T_NOFRAMES = 67;
333 public static final int T_NOTE = 68;
334 public static final int T_OBJECT = 69;
335 public static final int T_OL = 70;
336 public static final int T_OPTION = 71;
337 public static final int T_OVERLAY = 72;
338 public static final int T_P = 73;
339 public static final int T_PARAM = 74;
340 public static final int T_PERSON = 75;
341 public static final int T_PRE = 76;
342 public static final int T_Q = 77;
343 public static final int T_RANGE = 78;
344 public static final int T_S = 79;
345 public static final int T_SAMP = 80;
346 public static final int T_SCRIPT = 81;
347 public static final int T_SELECT = 82;
348 public static final int T_SMALL = 83;
349 public static final int T_SPOT = 84;
350 public static final int T_STRONG = 85;
351 public static final int T_STYLE = 86;
352 public static final int T_SUB = 87;
353 public static final int T_SUP = 88;
354 public static final int T_TAB = 89;
355 public static final int T_TABLE = 90;
356 public static final int T_TBODY = 91;
357 public static final int T_TD = 92;
358 public static final int T_TEXTAREA = 93;
359 public static final int T_TFOOT = 94;
360 public static final int T_TH = 95;
361 public static final int T_THEAD = 96;
362 public static final int T_TITLE = 97;
363 public static final int T_TR = 98;
364 public static final int T_TT = 99;
365 public static final int T_U = 100;
366 public static final int T_UL = 101;
367 public static final int T_VAR = 102;
368 public static final int T_WBR = 103;
369 public static final int T_IFRAME = 104;
370 /**
371 * <!DOCTYPE ...>
372 */
373 public static final int T__DOCTYPE = 105;
374
375 public static final String P_ALIGN = new String("align");
376 public static final String P_BACKGROUND = new String("background");
377 public static final String P_BORDER = new String("border");
378 public static final String P_CHECKED = new String("checked");
379 public static final String P_CLEAR = new String("clear");
380 public static final String P_CODE = new String("code");
381 public static final String P_COLS = new String("cols");
382 public static final String P_COLSPAN = new String("colspan");
383 public static final String P_FACE = new String("face");
384 public static final String P_HEIGHT = new String("height");
385 public static final String P_HREF = new String("href");
386 public static final String P_LANGUAGE = new String("language");
387 public static final String P_LOWSRC = new String("lowsrc");
388 public static final String P_MAXLENGTH = new String("maxlength");
389 public static final String P_MULTIPLE = new String("multiple");
390 public static final String P_NAME = new String("name");
391 public static final String P_ROWS = new String("rows");
392 public static final String P_ROWSPAN = new String("rowspan");
393 public static final String P_SIZE = new String("size");
394 public static final String P_SRC = new String("src");
395 public static final String P_TARGET = new String("target");
396 public static final String P_TYPE = new String("type");
397 public static final String P_VALUE = new String("value");
398 public static final String P_VALUETYPE = new String("valuetype");
399 public static final String P_WIDTH = new String("width");
400
401 public static final String P_CITE = new String("cite");
402 public static final String P_PROFILE = new String("profile");
403 public static final String P_ACTION = new String("action");
404 public static final String P_LONGDESC = new String("longdesc");
405 public static final String P_FOR = new String("for");
406 public static final String P_USEMAP = new String("usemap");
407 public static final String P_CODEBASE = new String("codebase");
408 public static final String P_DATA = new String("data");
409 public static final String P_ARCHIVE = new String("archive");
410 public static final String P_REL = new String("rel");
411 public static final String P_REV = new String("rev");
412
413 //////////////////////////////////////////////////////////////////////
414
415 /**
416 * Sets whether a tag is an end tag or not.
417 */
418 protected void setEndTag(boolean endtag)
419 {
420 m_endtag = endtag;
421 }
422
423 /**
424 * Sets whether a tag is empty or not. An empty tag ends with a '/'.
425 */
426 protected void setEmpty(boolean empty)
427 {
428 m_empty = empty;
429 }
430
431 private final int indexOfName(String name)
432 {
433 final int size = m_names.size();
434 for (int i=0; i<size; i++)
435 if (name.equalsIgnoreCase((String)m_names.elementAt(i)))
436 return i;
437
438 return -1;
439 }
440
441 private String m_tag = null;
442 private int m_ttype = T_UNKNOWN;
443 private boolean m_endtag = false;
444 private Vector m_names = new Vector();
445 private Vector m_values = new Vector();
446 private static Hashtable m_tags = new Hashtable();
447 private boolean m_empty = false;
448
449 static
450 {
451 m_tags.put(new String("A"), new Integer(T_A));
452 m_tags.put(new String("ABBREV"), new Integer(T_ABBREV));
453 m_tags.put(new String("ACRONYM"), new Integer(T_ACRONYM));
454 m_tags.put(new String("ADDRESS"), new Integer(T_ADDRESS));
455 m_tags.put(new String("APPLET"), new Integer(T_APPLET));
456 m_tags.put(new String("AREA"), new Integer(T_AREA));
457 m_tags.put(new String("AU"), new Integer(T_AU));
458 m_tags.put(new String("B"), new Integer(T_B));
459 m_tags.put(new String("BANNER"), new Integer(T_BANNER));
460 m_tags.put(new String("BASE"), new Integer(T_BASE));
461 m_tags.put(new String("BASEFONT"), new Integer(T_BASEFONT));
462 m_tags.put(new String("BGSOUND"), new Integer(T_BGSOUND));
463 m_tags.put(new String("BIG"), new Integer(T_BIG));
464 m_tags.put(new String("BLINK"), new Integer(T_BLINK));
465 m_tags.put(new String("BLOCKQUOTE"), new Integer(T_BLOCKQUOTE));
466 m_tags.put(new String("BODY"), new Integer(T_BODY));
467 m_tags.put(new String("BR"), new Integer(T_BR));
468 m_tags.put(new String("CAPTION"), new Integer(T_CAPTION));
469 m_tags.put(new String("CENTER"), new Integer(T_CENTER));
470 m_tags.put(new String("CITE"), new Integer(T_CITE));
471 m_tags.put(new String("CODE"), new Integer(T_CODE));
472 m_tags.put(new String("COL"), new Integer(T_COL));
473 m_tags.put(new String("COLGROUP"), new Integer(T_COLGROUP));
474 m_tags.put(new String("CREDIT"), new Integer(T_CREDIT));
475 m_tags.put(new String("DD"), new Integer(T_DD));
476 m_tags.put(new String("DEL"), new Integer(T_DEL));
477 m_tags.put(new String("DFN"), new Integer(T_DFN));
478 m_tags.put(new String("DIR"), new Integer(T_DIR));
479 m_tags.put(new String("DIV"), new Integer(T_DIV));
480 m_tags.put(new String("DL"), new Integer(T_DL));
481 m_tags.put(new String("!DOCTYPE"), new Integer(T__DOCTYPE));
482 m_tags.put(new String("DT"), new Integer(T_DT));
483 m_tags.put(new String("EM"), new Integer(T_EM));
484 m_tags.put(new String("EMBED"), new Integer(T_EMBED));
485 m_tags.put(new String("FIG"), new Integer(T_FIG));
486 m_tags.put(new String("FN"), new Integer(T_FN));
487 m_tags.put(new String("FONT"), new Integer(T_FONT));
488 m_tags.put(new String("FORM"), new Integer(T_FORM));
489 m_tags.put(new String("FRAME"), new Integer(T_FRAME));
490 m_tags.put(new String("FRAMESET"), new Integer(T_FRAMESET));
491 m_tags.put(new String("H1"), new Integer(T_H1));
492 m_tags.put(new String("H2"), new Integer(T_H2));
493 m_tags.put(new String("H3"), new Integer(T_H3));
494 m_tags.put(new String("H4"), new Integer(T_H4));
495 m_tags.put(new String("H5"), new Integer(T_H5));
496 m_tags.put(new String("H6"), new Integer(T_H6));
497 m_tags.put(new String("HEAD"), new Integer(T_HEAD));
498 m_tags.put(new String("HTML"), new Integer(T_HTML));
499 m_tags.put(new String("HR"), new Integer(T_HR));
500 m_tags.put(new String("I"), new Integer(T_I));
501 m_tags.put(new String("IMG"), new Integer(T_IMG));
502 m_tags.put(new String("INPUT"), new Integer(T_INPUT));
503 m_tags.put(new String("INS"), new Integer(T_INS));
504 m_tags.put(new String("ISINDEX"), new Integer(T_ISINDEX));
505 m_tags.put(new String("KBD"), new Integer(T_KBD));
506 m_tags.put(new String("LANG"), new Integer(T_LANG));
507 m_tags.put(new String("LH"), new Integer(T_LH));
508 m_tags.put(new String("LI"), new Integer(T_LI));
509 m_tags.put(new String("LINK"), new Integer(T_LINK));
510 m_tags.put(new String("MAP"), new Integer(T_MAP));
511 m_tags.put(new String("MARQUEE"), new Integer(T_MARQUEE));
512 m_tags.put(new String("MENU"), new Integer(T_MENU));
513 m_tags.put(new String("META"), new Integer(T_META));
514 m_tags.put(new String("NEXTID"), new Integer(T_NEXTID));
515 m_tags.put(new String("NOBR"), new Integer(T_NOBR));
516 m_tags.put(new String("NOEMBED"), new Integer(T_NOEMBED));
517 m_tags.put(new String("NOFRAME"), new Integer(T_NOFRAME));
518 m_tags.put(new String("NOFRAMES"), new Integer(T_NOFRAMES));
519 m_tags.put(new String("NOTE"), new Integer(T_NOTE));
520 m_tags.put(new String("OBJECT"), new Integer(T_OBJECT));
521 m_tags.put(new String("OL"), new Integer(T_OL));
522 m_tags.put(new String("OPTION"), new Integer(T_OPTION));
523 m_tags.put(new String("OVERLAY"), new Integer(T_OVERLAY));
524 m_tags.put(new String("P"), new Integer(T_P));
525 m_tags.put(new String("PARAM"), new Integer(T_PARAM));
526 m_tags.put(new String("PERSON"), new Integer(T_PERSON));
527 m_tags.put(new String("PRE"), new Integer(T_PRE));
528 m_tags.put(new String("Q"), new Integer(T_Q));
529 m_tags.put(new String("RANGE"), new Integer(T_RANGE));
530 m_tags.put(new String("S"), new Integer(T_S));
531 m_tags.put(new String("SAMP"), new Integer(T_SAMP));
532 m_tags.put(new String("SCRIPT"), new Integer(T_SCRIPT));
533 m_tags.put(new String("SELECT"), new Integer(T_SELECT));
534 m_tags.put(new String("SMALL"), new Integer(T_SMALL));
535 m_tags.put(new String("SPOT"), new Integer(T_SPOT));
536 m_tags.put(new String("STRONG"), new Integer(T_STRONG));
537 m_tags.put(new String("STYLE"), new Integer(T_STYLE));
538 m_tags.put(new String("SUB"), new Integer(T_SUB));
539 m_tags.put(new String("SUP"), new Integer(T_SUP));
540 m_tags.put(new String("TAB"), new Integer(T_TAB));
541 m_tags.put(new String("TABLE"), new Integer(T_TABLE));
542 m_tags.put(new String("TBODY"), new Integer(T_TBODY));
543 m_tags.put(new String("TD"), new Integer(T_TD));
544 m_tags.put(new String("TEXTAREA"), new Integer(T_TEXTAREA));
545 m_tags.put(new String("TFOOT"), new Integer(T_TFOOT));
546 m_tags.put(new String("TH"), new Integer(T_TH));
547 m_tags.put(new String("THEAD"), new Integer(T_THEAD));
548 m_tags.put(new String("TITLE"), new Integer(T_TITLE));
549 m_tags.put(new String("TR"), new Integer(T_TR));
550 m_tags.put(new String("TT"), new Integer(T_TT));
551 m_tags.put(new String("U"), new Integer(T_U));
552 m_tags.put(new String("UL"), new Integer(T_UL));
553 m_tags.put(new String("VAR"), new Integer(T_VAR));
554 m_tags.put(new String("WBR"), new Integer(T_WBR));
555
556 m_tags.put(new String("IFRAME"), new Integer(T_IFRAME));
557 }
558 }
559