Source code: com/arthurdo/parser/Table.java
1 /*
2 * Copyright (c) 1996, 2001 by Arthur Do <arthur@cs.stanford.edu>.
3 * All Rights Reserved.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 package com.arthurdo.parser;
21
22 import java.io.*;
23 import java.util.*;
24 import java.awt.*;
25
26 /**
27 * <p><i>Null Cells and Phantom Rows</i>
28 *
29 * <p>elementAt() can return null if
30 * there is no cell at the requested coordinate due to spans
31 * across areas where there are no cells, for example,
32 *
33 * <p><blockquote>
34 * <table><br>
35 * <tr><td rowspan=2>abc<td>def<br>
36 * </table>
37 * </blockquote>
38 *
39 * <p>In this case, calling elementAt(1, 1) would return
40 * null. Similarly, getRowTag() could return null if
41 * the requested index is a phantom row (a row that doesn't
42 * have any real cells). In the example above, the table
43 * has two rows due to the rowspan but row 1 is a phantom
44 * row and does not have a <tr> tag.
45 *
46 * <p><i>Implementation Note:</i> the table parser is very
47 * strict, that is, <TABLE>
48 * can only contain <TR>. <TR> can only contain
49 * <TD>. Any deviation from this nesting is considered
50 * bad data and will be thrown away by the parser. In a future
51 * release, we hope to be more forgiving.
52 *
53 * <p><ul>
54 * <li> 02/09/98 Dr. Jaron Collis, added support for <TH> and
55 * introduced a parseTable(Reader) convenience function.
56 * </ul>
57 *
58 * @version 0.9 01/32/98
59 * @author Arthur Do <arthur@cs.stanford.edu>
60 * @see com.arthurdo.parser.TableCell
61 */
62 public class Table
63 {
64 public Table()
65 {
66 }
67
68 /**
69 * @deprecated use parseTable(Reader) instead.
70 * This version of the constructor can lead to 10x slower code
71 * because of the InputStreamReader wrapper.
72 * @param data input stream
73 */
74 public void parseTable(InputStream in)
75 throws HtmlException, IOException
76 {
77 parseTable(new InputStreamReader(in));
78 }
79
80 /**
81 * @param data input stream
82 */
83 public void parseTable(Reader in)
84 throws HtmlException, IOException
85 {
86 HtmlStreamTokenizer tok = new HtmlStreamTokenizer(in);
87 HtmlTag tag = new HtmlTag();
88
89 while (tok.nextToken() != HtmlStreamTokenizer.TT_EOF)
90 {
91 int ttype = tok.getTokenType();
92 if (ttype == HtmlStreamTokenizer.TT_TAG)
93 {
94 tok.parseTag(tok.getStringValue(), tag);
95 if (tag.getTagType() == HtmlTag.T_TABLE && !tag.isEndTag())
96 parseTable(tok, new HtmlTag(tag));
97 }
98 }
99 }
100
101 public void parseTable(HtmlStreamTokenizer tokenizer, HtmlTag tableTag)
102 throws HtmlException, IOException
103 {
104 m_tok = tokenizer;
105 m_tableTag = tableTag;
106 HtmlTag tag = new HtmlTag();
107
108 while (nextToken() != HtmlStreamTokenizer.TT_EOF)
109 {
110 int ttype = getTokenType();
111 if (ttype == HtmlStreamTokenizer.TT_TAG)
112 {
113 try
114 {
115 m_tok.parseTag(getStringValue(), tag);
116 int tagtype = tag.getTagType();
117 boolean isEndTag = tag.isEndTag();
118
119 if (tagtype == HtmlTag.T_TR && !isEndTag)
120 {
121 m_rowTags.addElement(new HtmlTag(tag));
122 newRow();
123 if (!parseRow())
124 break;
125 }
126 else if (tagtype == HtmlTag.T_TABLE && isEndTag)
127 {
128 break;
129 }
130 else
131 {
132 // otherwise, data is considered bad and thrown away
133 //System.err.println("bad data " + m_tok.getLineNumber());
134 }
135 }
136 catch (HtmlException e)
137 {
138 }
139 }
140 }
141
142 m_tok = null;
143 m_pushback = false;
144 m_tokenType = 0;
145 m_stringValue = null;
146 m_whiteSpace = null;
147 m_cell = null;
148 m_row = null;
149
150 organizeRowCol();
151 }
152
153 /**
154 * @return the original <TABLE> tag for this table.
155 */
156 public HtmlTag getTableTag()
157 {
158 return m_tableTag;
159 }
160
161 /**
162 * @param row the row to get
163 * @return the original <TR> tag for this row or null
164 * if this is a phantom row, i.e. a row that doesn't
165 * have any real cells.
166 */
167 public HtmlTag getRowTag(int row)
168 {
169 if (row < m_rowTags.size())
170 return (HtmlTag)m_rowTags.elementAt(row);
171 return null;
172 }
173
174 /**
175 * @return the number of rows in this table.
176 */
177 public int getRows()
178 {
179 return m_elements.length;
180 }
181
182 /**
183 * @return the number of columns in this table.
184 */
185 public int getColumns()
186 {
187 return m_elements[0].length;
188 }
189
190 /**
191 * @param row row to get
192 * @param col column to get
193 * @return the cell located at the specified location in the table
194 * or null if there is no cell due to uneven spans.
195 */
196 public TableCell elementAt(int row, int col)
197 {
198 return m_elements[row][col];
199 }
200
201 private void organizeRowCol()
202 throws HtmlException
203 {
204 // calculate max number of columns
205 int maxcol = 0;
206 Vector rows = m_rows;
207 int sizey = rows.size();
208 for (int y=0; y<sizey; y++)
209 {
210 int col = 0;
211 Vector row = (Vector)rows.elementAt(y);
212 int sizex = row.size();
213 for (int x=0; x<sizex; x++)
214 {
215 TableCell cell = (TableCell)row.elementAt(x);
216 col += cell.getColSpan();
217 if (col > maxcol)
218 maxcol = col;
219 }
220 }
221 if (maxcol == 0)
222 throw new HtmlException("zero columns");
223 // rownum[] tracks the current row number for a particular column
224 int rownum[] = new int[maxcol];
225 // calculate max number of rows
226 int maxrow = 0;
227 for (int y=0; y<sizey; y++)
228 {
229 int col = 0;
230 Vector row = (Vector)rows.elementAt(y);
231 int sizex = row.size();
232 for (int x=0; x<sizex; x++)
233 {
234 while (y < rownum[col])
235 col++; // skip to a column that is not spanned in the current row
236 TableCell cell = (TableCell)row.elementAt(x);
237 int colspan = cell.getColSpan();
238 for (int i=0; i<colspan; i++)
239 {
240 int colnum = col+i;
241 rownum[colnum] += cell.getRowSpan();
242 if (rownum[colnum] > maxrow)
243 maxrow = rownum[colnum];
244 }
245 col += colspan;
246 }
247 }
248 if (maxrow == 0)
249 throw new HtmlException("zero rows");
250 for (int i=0; i<maxcol; i++)
251 rownum[i] = 0;
252
253 TableCell elements[][] = new TableCell[maxrow][maxcol];
254 // for each row
255 for (int y=0; y<sizey; y++)
256 {
257 int col = 0;
258 Vector row = (Vector)rows.elementAt(y);
259 int sizex = row.size();
260 // for each cell
261 for (int x=0; x<sizex; x++)
262 {
263 while (y < rownum[col])
264 col++; // skip to a column that is not spanned in the current row
265 TableCell cell = (TableCell)row.elementAt(x);
266 int r = rownum[col];
267 int c = col;
268 elements[r][c] = cell;
269 int colspan = cell.getColSpan();
270 // for each column this cell occupies
271 for (int i=0; i<colspan; i++)
272 {
273 int colnum = col+i;
274 int rowspan = cell.getRowSpan();
275 // for each row this cell occupies
276 for (int j=0; j<rowspan; j++)
277 {
278 if (i > 0 || j > 0)
279 // create a pseudo cell
280 elements[rownum[colnum] + j][colnum] = new TableCell(r, c);
281 }
282 // update current row number for this column
283 rownum[colnum] += rowspan;
284 }
285 col += colspan;
286 }
287 row.removeAllElements();
288 }
289 rows.removeAllElements();
290
291 m_elements = elements;
292 }
293
294 private boolean parseRow()
295 throws IOException
296 {
297 boolean continueParsing = false;
298
299 HtmlTag tag = new HtmlTag();
300 while (nextToken() != HtmlStreamTokenizer.TT_EOF)
301 {
302 int ttype = getTokenType();
303
304 if (ttype == HtmlStreamTokenizer.TT_TAG)
305 {
306 try
307 {
308 m_tok.parseTag(getStringValue(), tag);
309 int tagtype = tag.getTagType();
310 boolean isEndTag = tag.isEndTag();
311
312 if (tagtype == HtmlTag.T_TR)
313 {
314 if (!isEndTag)
315 pushBackToken();
316 // row ended, continue with next row
317 continueParsing = true;
318 break;
319 }
320 else if (tagtype == HtmlTag.T_TD || tagtype == HtmlTag.T_TH)
321 {
322 if (!isEndTag)
323 {
324 beginCell(tag);
325 if (!parseCol())
326 {
327 endCell();
328 continueParsing = false;
329 break;
330 }
331 endCell();
332 }
333 }
334 }
335 catch (HtmlException e)
336 {
337 }
338 }
339 }
340
341 return continueParsing;
342 }
343
344 private boolean parseCol()
345 throws IOException
346 {
347 boolean continueParsing = false;
348
349 HtmlTag tag = new HtmlTag();
350 while (nextToken() != HtmlStreamTokenizer.TT_EOF)
351 {
352 int ttype = getTokenType();
353
354 if (ttype == HtmlStreamTokenizer.TT_TAG)
355 {
356 try
357 {
358 m_tok.parseTag(getStringValue(), tag);
359 int tagtype = tag.getTagType();
360 boolean isEndTag = tag.isEndTag();
361
362 if (tagtype == HtmlTag.T_TR)
363 {
364 if (!isEndTag)
365 pushBackToken();
366 // column ended
367 continueParsing = true;
368 break;
369 }
370 else if (tagtype == HtmlTag.T_TD || tagtype == HtmlTag.T_TH)
371 {
372 if (!isEndTag)
373 pushBackToken();
374 // column ended
375 continueParsing = true;
376 break;
377 }
378 else if (tagtype == HtmlTag.T_TABLE)
379 {
380 if (isEndTag)
381 {
382 continueParsing = false;
383 break;
384 }
385 Table table = new Table();
386 table.parseTable(m_tok, new HtmlTag(tag));
387 addToCell(table);
388 }
389 else
390 {
391 addToCell(new HtmlTag(tag));
392 }
393 }
394 catch (HtmlException e)
395 {
396 addToCell("<" + getStringValue().toString() + ">");
397 }
398 }
399 else if (ttype == HtmlStreamTokenizer.TT_TEXT)
400 {
401 String obj = getWhiteSpace().toString();
402 obj += getStringValue().toString();
403 addToCell(obj);
404 }
405 else if (ttype == HtmlStreamTokenizer.TT_COMMENT)
406 {
407 // throw away
408 }
409 }
410
411 return continueParsing;
412 }
413
414 private void newRow()
415 {
416 Vector row = new Vector();
417 m_row = row;
418 m_rows.addElement(row);
419 }
420
421 private void beginCell(HtmlTag tag)
422 {
423 int rowspan = 1;
424 try
425 {
426 rowspan = tag.getIntParam(HtmlTag.P_ROWSPAN);
427 if (rowspan <= 0)
428 rowspan = 1;
429 }
430 catch (NumberFormatException e)
431 {
432 }
433 int colspan = 1;
434 try
435 {
436 colspan = tag.getIntParam(HtmlTag.P_COLSPAN);
437 if (colspan <= 0)
438 colspan = 1;
439 }
440 catch (NumberFormatException e)
441 {
442 }
443 TableCell cell = new TableCell(rowspan, colspan, new HtmlTag(tag));
444 m_cell = cell;
445 m_row.addElement(cell);
446 }
447
448 private void endCell()
449 {
450 m_cell = null;
451 }
452
453 private void addToCell(Object o)
454 {
455 m_cell.addElement(o);
456 }
457
458 private int nextToken()
459 throws IOException
460 {
461 if (m_pushback)
462 m_pushback = false;
463 else
464 {
465 m_tokenType = m_tok.nextToken();
466 m_stringValue = m_tok.getStringValue();
467 m_whiteSpace = m_tok.getWhiteSpace();
468 }
469
470 return m_tokenType;
471 }
472
473 private void pushBackToken()
474 throws IOException
475 {
476 if (m_pushback)
477 throw new IOException("only one token pushback supported");
478
479 m_tokenType = m_tok.getTokenType();
480 m_stringValue = m_tok.getStringValue();
481 m_whiteSpace = m_tok.getWhiteSpace();
482 m_pushback = true;
483 }
484
485 private int getTokenType()
486 {
487 return m_tokenType;
488 }
489
490 private StringBuffer getStringValue()
491 {
492 return m_stringValue;
493 }
494
495 private StringBuffer getWhiteSpace()
496 {
497 return m_whiteSpace;
498 }
499
500 private HtmlStreamTokenizer m_tok = null;
501 private boolean m_pushback = false;
502 private int m_tokenType = 0;
503 private StringBuffer m_stringValue = null;
504 private StringBuffer m_whiteSpace = null;
505 private TableCell m_cell = null;
506 private Vector m_row = null;
507 private Vector m_rows = new Vector();
508 private TableCell m_elements[][] = null;
509 private HtmlTag m_tableTag = null;
510 private Vector m_rowTags = new Vector();
511 }