Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/arthurdo/parser/Table.java


1   /*
2    * Copyright (c) 1996, 2001 by Arthur Do <arthur@cs.stanford.edu>.
3    * All Rights Reserved.
4    *
5    * This program is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation; either version 2 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13   * GNU General Public License for more details.
14   *
15   * You should have received a copy of the GNU General Public License
16   * along with this program; if not, write to the Free Software
17   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18   */
19  
20  package com.arthurdo.parser;
21  
22  import java.io.*;
23  import java.util.*;
24  import java.awt.*;
25  
26  /**
27   * <p><i>Null Cells and Phantom Rows</i>
28   *
29   * <p>elementAt() can return null if
30   * there is no cell at the requested coordinate due to spans
31   * across areas where there are no cells, for example,
32   *
33   * <p><blockquote>
34   * &lt;table&gt;<br>
35   * &lt;tr&gt;&lt;td rowspan=2&gt;abc&lt;td&gt;def<br>
36   * &lt;/table&gt;
37   * </blockquote>
38   *
39   * <p>In this case, calling elementAt(1, 1) would return
40   * null. Similarly, getRowTag() could return null if
41   * the requested index is a phantom row (a row that doesn't
42   * have any real cells). In the example above, the table
43   * has two rows due to the rowspan but row 1 is a phantom
44   * row and does not have a &lt;tr&gt; tag.
45   *
46   * <p><i>Implementation Note:</i> the table parser is very
47   * strict, that is, &lt;TABLE&gt;
48   * can only contain &lt;TR&gt;. &lt;TR&gt; can only contain
49   * &lt;TD&gt;. Any deviation from this nesting is considered
50   * bad data and will be thrown away by the parser. In a future
51   * release, we hope to be more forgiving.
52   *
53   * <p><ul>
54   * <li> 02/09/98 Dr. Jaron Collis, added support for <TH> and
55   * introduced a parseTable(Reader) convenience function.
56   * </ul>
57   *
58   * @version 0.9 01/32/98
59   * @author Arthur Do <arthur@cs.stanford.edu>
60   * @see     com.arthurdo.parser.TableCell
61   */
62  public class Table
63  {
64    public Table()
65    {
66    }
67  
68    /**
69     * @deprecated  use parseTable(Reader) instead.
70     *        This version of the constructor can lead to 10x slower code
71     *        because of the InputStreamReader wrapper.
72     * @param  data  input stream
73     */
74    public void parseTable(InputStream in)
75      throws HtmlException, IOException
76    {
77      parseTable(new InputStreamReader(in));
78    }
79  
80    /**
81     * @param  data  input stream
82     */
83    public void parseTable(Reader in)
84      throws HtmlException, IOException
85    {
86      HtmlStreamTokenizer tok = new HtmlStreamTokenizer(in);
87      HtmlTag tag = new HtmlTag();
88  
89      while (tok.nextToken() != HtmlStreamTokenizer.TT_EOF)
90      {
91        int ttype = tok.getTokenType();
92        if (ttype == HtmlStreamTokenizer.TT_TAG)
93        {
94          tok.parseTag(tok.getStringValue(), tag);
95          if (tag.getTagType() == HtmlTag.T_TABLE && !tag.isEndTag())
96            parseTable(tok, new HtmlTag(tag));
97        }
98      }
99    }
100 
101   public void parseTable(HtmlStreamTokenizer tokenizer, HtmlTag tableTag)
102     throws HtmlException, IOException
103   {
104     m_tok = tokenizer;
105     m_tableTag = tableTag;
106     HtmlTag tag = new HtmlTag();
107 
108     while (nextToken() != HtmlStreamTokenizer.TT_EOF)
109     {
110       int ttype = getTokenType();
111       if (ttype == HtmlStreamTokenizer.TT_TAG)
112       {
113         try
114         {
115           m_tok.parseTag(getStringValue(), tag);
116           int tagtype = tag.getTagType();
117           boolean isEndTag = tag.isEndTag();
118 
119           if (tagtype == HtmlTag.T_TR && !isEndTag)
120           {
121             m_rowTags.addElement(new HtmlTag(tag));
122             newRow();
123             if (!parseRow())
124               break;
125           }
126           else if (tagtype == HtmlTag.T_TABLE && isEndTag)
127           {
128             break;
129           }
130           else
131           {
132             // otherwise, data is considered bad and thrown away
133             //System.err.println("bad data " + m_tok.getLineNumber());
134           }
135         }
136         catch (HtmlException e)
137         {
138         }
139       }
140     }
141 
142     m_tok = null;
143     m_pushback = false;
144     m_tokenType = 0;
145     m_stringValue = null;
146     m_whiteSpace = null;
147     m_cell = null;
148     m_row = null;
149 
150     organizeRowCol();
151   }
152 
153   /**
154    * @return  the original &lt;TABLE&gt; tag for this table.
155    */
156   public HtmlTag getTableTag()
157   {
158     return m_tableTag;
159   }
160 
161   /**
162    * @param  row  the row to get
163    * @return  the original &lt;TR&gt; tag for this row or null
164    *      if this is a phantom row, i.e. a row that doesn't
165    *      have any real cells.
166    */
167   public HtmlTag getRowTag(int row)
168   {
169     if (row < m_rowTags.size())
170       return (HtmlTag)m_rowTags.elementAt(row);
171     return null;
172   }
173 
174   /**
175    * @return  the number of rows in this table.
176    */
177   public int getRows()
178   {
179     return m_elements.length;
180   }
181 
182   /**
183    * @return  the number of columns in this table.
184    */
185   public int getColumns()
186   {
187     return m_elements[0].length;
188   }
189 
190   /**
191    * @param  row  row to get
192    * @param  col  column to get
193    * @return  the cell located at the specified location in the table
194    *      or null if there is no cell due to uneven spans.
195    */
196   public TableCell elementAt(int row, int col)
197   {
198     return m_elements[row][col];
199   }
200 
201   private void organizeRowCol()
202     throws HtmlException
203   {
204     // calculate max number of columns
205     int maxcol = 0;
206     Vector rows = m_rows;
207     int sizey = rows.size();
208     for (int y=0; y<sizey; y++)
209     {
210       int col = 0;
211       Vector row = (Vector)rows.elementAt(y);
212       int sizex = row.size();
213       for (int x=0; x<sizex; x++)
214       {
215         TableCell cell = (TableCell)row.elementAt(x);
216         col += cell.getColSpan();
217         if (col > maxcol)
218           maxcol = col;
219       }
220     }
221     if (maxcol == 0)
222       throw new HtmlException("zero columns");
223     // rownum[] tracks the current row number for a particular column
224     int rownum[] = new int[maxcol];
225     // calculate max number of rows
226     int maxrow = 0;
227     for (int y=0; y<sizey; y++)
228     {
229       int col = 0;
230       Vector row = (Vector)rows.elementAt(y);
231       int sizex = row.size();
232       for (int x=0; x<sizex; x++)
233       {
234         while (y < rownum[col])
235           col++;  // skip to a column that is not spanned in the current row
236         TableCell cell = (TableCell)row.elementAt(x);
237         int colspan = cell.getColSpan();
238         for (int i=0; i<colspan; i++)
239         {
240           int colnum = col+i;
241           rownum[colnum] += cell.getRowSpan();
242           if (rownum[colnum] > maxrow)
243             maxrow = rownum[colnum];
244         }
245         col += colspan;
246       }
247     }
248     if (maxrow == 0)
249       throw new HtmlException("zero rows");
250     for (int i=0; i<maxcol; i++)
251       rownum[i] = 0;
252 
253     TableCell elements[][] = new TableCell[maxrow][maxcol];
254     // for each row
255     for (int y=0; y<sizey; y++)
256     {
257       int col = 0;
258       Vector row = (Vector)rows.elementAt(y);
259       int sizex = row.size();
260       // for each cell
261       for (int x=0; x<sizex; x++)
262       {
263         while (y < rownum[col])
264           col++;  // skip to a column that is not spanned in the current row
265         TableCell cell = (TableCell)row.elementAt(x);
266         int r = rownum[col];
267         int c = col;
268         elements[r][c] = cell;
269         int colspan = cell.getColSpan();
270         // for each column this cell occupies
271         for (int i=0; i<colspan; i++)
272         {
273           int colnum = col+i;
274           int rowspan = cell.getRowSpan();
275           // for each row this cell occupies
276           for (int j=0; j<rowspan; j++)
277           {
278             if (i > 0 || j > 0)
279               // create a pseudo cell
280               elements[rownum[colnum] + j][colnum] = new TableCell(r, c);
281           }
282           // update current row number for this column
283           rownum[colnum] += rowspan;
284         }
285         col += colspan;
286       }
287       row.removeAllElements();
288     }
289     rows.removeAllElements();
290 
291     m_elements = elements;
292   }
293 
294   private boolean parseRow()
295     throws IOException
296   {
297     boolean continueParsing = false;
298 
299     HtmlTag tag = new HtmlTag();
300     while (nextToken() != HtmlStreamTokenizer.TT_EOF)
301     {
302       int ttype = getTokenType();
303 
304       if (ttype == HtmlStreamTokenizer.TT_TAG)
305       {
306         try
307         {
308           m_tok.parseTag(getStringValue(), tag);
309           int tagtype = tag.getTagType();
310           boolean isEndTag = tag.isEndTag();
311 
312           if (tagtype == HtmlTag.T_TR)
313           {
314             if (!isEndTag)
315               pushBackToken();
316             // row ended, continue with next row
317             continueParsing = true;
318             break;
319           }
320           else if (tagtype == HtmlTag.T_TD || tagtype == HtmlTag.T_TH)
321           {
322             if (!isEndTag)
323             {
324               beginCell(tag);
325               if (!parseCol())
326               {
327                 endCell();
328                 continueParsing = false;
329                 break;
330               }
331               endCell();
332             }
333           }
334         }
335         catch (HtmlException e)
336         {
337         }
338       }
339     }
340 
341     return continueParsing;
342   }
343 
344   private boolean parseCol()
345     throws IOException
346   {
347     boolean continueParsing = false;
348 
349     HtmlTag tag = new HtmlTag();
350     while (nextToken() != HtmlStreamTokenizer.TT_EOF)
351     {
352       int ttype = getTokenType();
353 
354       if (ttype == HtmlStreamTokenizer.TT_TAG)
355       {
356         try
357         {
358           m_tok.parseTag(getStringValue(), tag);
359           int tagtype = tag.getTagType();
360           boolean isEndTag = tag.isEndTag();
361 
362           if (tagtype == HtmlTag.T_TR)
363           {
364             if (!isEndTag)
365               pushBackToken();
366             // column ended
367             continueParsing = true;
368             break;
369           }
370           else if (tagtype == HtmlTag.T_TD || tagtype == HtmlTag.T_TH)
371           {
372             if (!isEndTag)
373               pushBackToken();
374             // column ended
375             continueParsing = true;
376             break;
377           }
378           else if (tagtype == HtmlTag.T_TABLE)
379           {
380             if (isEndTag)
381             {
382               continueParsing = false;
383               break;
384             }
385             Table table = new Table();
386             table.parseTable(m_tok, new HtmlTag(tag));
387             addToCell(table);
388           }
389           else
390           {
391             addToCell(new HtmlTag(tag));
392           }
393         }
394         catch (HtmlException e)
395         {
396           addToCell("<" + getStringValue().toString() + ">");
397         }
398       }
399       else if (ttype == HtmlStreamTokenizer.TT_TEXT)
400       {
401         String obj = getWhiteSpace().toString();
402         obj += getStringValue().toString();
403         addToCell(obj);
404       }
405       else if (ttype == HtmlStreamTokenizer.TT_COMMENT)
406       {
407         // throw away
408       }
409     }
410 
411     return continueParsing;
412   }
413 
414   private void newRow()
415   {
416     Vector row = new Vector();
417     m_row = row;
418     m_rows.addElement(row);
419   }
420 
421   private void beginCell(HtmlTag tag)
422   {
423     int rowspan = 1;
424     try
425     {
426       rowspan = tag.getIntParam(HtmlTag.P_ROWSPAN);
427       if (rowspan <= 0)
428         rowspan = 1;
429     }
430     catch (NumberFormatException e)
431     {
432     }
433     int colspan = 1;
434     try
435     {
436       colspan = tag.getIntParam(HtmlTag.P_COLSPAN);
437       if (colspan <= 0)
438         colspan = 1;
439     }
440     catch (NumberFormatException e)
441     {
442     }
443     TableCell cell = new TableCell(rowspan, colspan, new HtmlTag(tag));
444     m_cell = cell;
445     m_row.addElement(cell);
446   }
447 
448   private void endCell()
449   {
450     m_cell = null;
451   }
452 
453   private void addToCell(Object o)
454   {
455     m_cell.addElement(o);
456   }
457 
458   private int nextToken()
459     throws IOException
460   {
461     if (m_pushback)
462       m_pushback = false;
463     else
464     {
465       m_tokenType = m_tok.nextToken();
466       m_stringValue = m_tok.getStringValue();
467       m_whiteSpace = m_tok.getWhiteSpace();
468     }
469 
470     return m_tokenType;
471   }
472 
473   private void pushBackToken()
474     throws IOException
475   {
476     if (m_pushback)
477       throw new IOException("only one token pushback supported");
478 
479     m_tokenType = m_tok.getTokenType();
480     m_stringValue = m_tok.getStringValue();
481     m_whiteSpace = m_tok.getWhiteSpace();
482     m_pushback = true;
483   }
484 
485   private int getTokenType()
486   {
487     return m_tokenType;
488   }
489 
490   private StringBuffer getStringValue()
491   {
492     return m_stringValue;
493   }
494 
495   private StringBuffer getWhiteSpace()
496   {
497     return m_whiteSpace;
498   }
499 
500   private HtmlStreamTokenizer m_tok = null;
501   private boolean m_pushback = false;
502   private int m_tokenType = 0;
503   private StringBuffer m_stringValue = null;
504   private StringBuffer m_whiteSpace = null;
505   private TableCell m_cell = null;
506   private Vector m_row = null;
507   private Vector m_rows = new Vector();
508   private TableCell m_elements[][] = null;
509   private HtmlTag m_tableTag = null;
510   private Vector m_rowTags = new Vector();
511 }