Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: cvu/html/HTMLNode.java


1   /*
2    * HTML Parser
3    * Copyright (C) 1997 David McNicol
4    *
5    * This program is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License as published by
7    * the Free Software Foundation; either version 2 of the License, or
8    * (at your option) any later version.
9    *
10   * This program is distributed in the hope that it will be useful,
11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13   * file COPYING for more details.
14   */
15  
16  package cvu.html;
17  
18  import java.util.Hashtable;
19  import java.util.Vector;
20  import java.util.Enumeration;
21  import java.io.DataOutputStream;
22  import java.io.IOException;
23  
24  /**
25   * This class represents a single node within an HTML tree. Each node
26   * has a name, zero or more attributes and possibly some content. Nodes
27   * can appear within the content of other nodes. <p>
28   * End tags do not appear since they only indicate 'end-of-content'. To
29   * prevent the system searching for the end of standalone tags, a dynamic
30   * list has been implemented. When the HTMLNode class is resolved
31   * a setup method is called adding a set of default standalone tags
32   * to the list. Standalone tags can then be added and removed dynamically
33   * using static method calls. <p>
34   * The list is the only way the internal code can tell
35   * whether a tag is standalone. If a problem occurs the tree structure
36   * would still be sound, but it would not be accurate, so while the form
37   * of the HTML would be conserved, searches would not operate correctly.
38   * @see HTMLTree
39   * @author <a href="http://www.strath.ac.uk/~ras97108/">David McNicol</a>
40   */
41  public class HTMLNode {
42  
43    private HTMLNode parent;    // Refers to this node's parent.
44    private String name;      // Stores the name of the HTML node.
45    private AttributeList attr; // List of element's attributes.
46    private Vector children;    // Stores the HTML node's children.
47    private boolean hidden;     // True if the node is not to be printed.
48  
49    /**
50     * Constructs a new HTMLNode.
51     * @param tag the TagToken representing the start of this node.
52     * @param standalone true if the tag does not have any content.
53     * @param src enumeration of tag tokens.
54     */
55    public HTMLNode (TagToken tag, HTMLNode parent, Enumeration src) {
56  
57      // Store the reference to the node's parent.
58      this.parent = parent;
59  
60      // Set the node to be unhidden by default.
61      hidden = false;
62  
63      // Check if the given tag is null.
64      if (tag != null) {
65  
66        // Store the node's name.
67        name = tag.getName();
68  
69        // Store the node's attribute list.
70        attr = tag.getAttributes();
71  
72        // Get the node's children if needed.
73        if (HTMLNode.isStandalone(name))
74          children = null;
75        else
76          children = parseChildren(src);
77      } else {
78  
79        // Otherwise, set the name and attributes to null.
80        name = null;
81        attr = null;
82  
83        // Get the node's children from the enumeration.
84        children = parseChildren(src);
85      }
86    }
87  
88    /**
89     * Constructs a new, detached HTMLNode with the specified name.
90     * @param name the name of the new node.
91     */
92    public HTMLNode (String name) {
93      
94      // Store the name of the node.  
95      this.name = name;
96  
97      // The node will have no parent till it is added to a tree.
98      parent = null;
99  
100     // Create a new attribute list.
101     attr = new AttributeList();
102 
103     // Create space for children if the node is not standalone.
104     if (HTMLNode.isStandalone(name))
105       children = null;
106     else
107       children = new Vector();
108   }
109 
110   /**
111    * Returns the name of this node.
112    */
113   public String getName () {
114     return name;
115   }
116 
117   /**
118    * Returns the node's parent node.
119    */
120   public HTMLNode getParent () {
121     return parent;
122   }
123 
124   /**
125    * Returns the node's children.
126    */
127   public Enumeration getChildren () {
128 
129     // Return nothing if the node has any children.
130     if (children == null) return null;
131 
132     return children.elements();
133   }
134 
135   /**
136    * Returns true if the node is currently hidden.
137    */
138   public boolean isHidden () {
139     return hidden;
140   }
141 
142   /**
143    * Hides the node.
144    */
145   public void hide () {
146     hidden = true;
147   }
148 
149   /**
150    * "Unhides" the node.
151    */
152   public void unhide () {
153     hidden = false;
154   }
155 
156   /**
157    * Returns the value of the attribute with the given name.
158    * @param name the name of the attribute.
159    */
160   public String getAttribute (String name) {
161     
162     // Check that the attribute list is there.
163     if (attr == null) return null;
164 
165     // Return the value associated with the attribute name.
166     return (String) attr.get(name);
167   }
168 
169   /**
170    * Returns an enumeration of attributes defined in this node.
171    */
172   public Enumeration getAttributes () {
173 
174     // Check that the attribute list has been defined.
175     if (attr == null) return null;
176 
177     // Return an enumeration of all of the attribute names.
178     return attr.names();
179   }
180 
181   /**
182    * Returns an attribute with all double quote characters
183    * escaped with a backslash.
184    * @param name the name of the attribute.
185    */
186   public String getQuotedAttribute (String name) {
187 
188     // Check that the attribute list is there.
189     if (attr == null) return null;
190 
191     // Return the quoted version.
192     return attr.getQuoted(name);
193   }
194 
195   /**
196    * Returns a string version of the attribute and its value.
197    * @param name the name of the attribute.
198    */
199   public String getAttributeToString (String name) {
200 
201     // Check that the attribute list is there.
202     if (attr == null) return null;
203 
204     // Return the string version.
205     return attr.toString(name);
206   }
207 
208   /**
209    * Returns a string version of the HTMLNode. If the node is 
210    * currently hidden then return an empty string.
211    */
212   public String toString () {
213 
214     StringBuffer sb;  // Stores the string to be returned.
215     Enumeration list; // List of node's attributes or children.
216 
217     // Get a new StringBuffer.
218     sb = new StringBuffer();
219 
220     if (! hidden) {
221 
222       // Write the opening of the tag.
223       sb.append('<');
224 
225       // Write the tag's name.
226       sb.append(name);
227 
228       // Check if there are any attributes.
229       if (attr != null && attr.size() > 0) {
230 
231         // Print string version of the attributes.
232         sb.append(" " + attr);
233       }
234 
235       // Finish off the tag.
236       sb.append('>');
237     }
238 
239     // Return if the node is standalone.
240     if (isStandalone(name)) return sb.toString();
241 
242     // Otherwise, check if the node has any children.
243     if (children != null && children.size() > 0) {
244 
245       // Get a list of all of the children.
246       list = children.elements();
247 
248       while (list.hasMoreElements()) {
249 
250         // Get the next node from the list.
251         Object o = list.nextElement();
252         
253         // Write it.
254         sb.append(o.toString());
255       }
256     }
257 
258     if (! hidden) {
259       // Write the end tag.
260       sb.append("</").append(name).append(">");
261     }
262 
263     // Return the string version.
264     return sb.toString();
265   }
266 
267   /**
268    * Sets the node's parent to the specified HTMLNode.
269    * @param parent the new parent.
270    */
271   public void setParent (HTMLNode parent) {
272     this.parent = parent;
273   }
274 
275   /**
276    * Returns true if an attribute with the given name exists.
277    * @param name the name of the attribute.
278    */
279   public boolean isAttribute (String name) {
280   
281     // Check that the attribute list is there.
282     if (attr == null) return false;
283 
284     // Check the table for an attribute with that name.
285     return attr.exists(name);
286   }
287 
288   /**
289    * Adds a new attribute to the node's attribute list with
290    * the specified value. If the attribute already exists the
291    * old value is overwritten.
292    * @param name the name of the attribute.
293    * @param value the value of the attribute.
294    */
295   public void addAttribute (String name, String value) {
296 
297     // Return if the attribute list is not there.
298     if (attr == null) return;
299 
300     // Otherwise, add the name/value pair to the list.
301     attr.set(name, value);
302   }
303 
304   /**
305    * Adds an object to the end of this node's content
306    * @param child the node to be added.
307    */
308   public void addChild (Object child) {
309 
310     // Return if the child is invalid.
311     if (child == null) return;
312 
313     // Check that this node has no children.
314     if (children == null) return;
315 
316     // Add the child if it is a string.
317     if (child instanceof String) {
318 
319       children.addElement(child);
320       return;
321     }
322 
323     // Add the child and set its parent if it is an HTMLNode.
324     if (child instanceof HTMLNode) {
325 
326       children.addElement(child);
327       ((HTMLNode) child).setParent(this);
328       return;
329     }
330   }
331 
332   /**
333    * Removes the specified HTMLNode from the current node's
334    * list of children.
335    * @param child the node to be removed.
336    */
337   public void removeChild (HTMLNode child) {
338 
339     // Return if the child is not defined properly
340     if (child == null) return;
341 
342     // Return if the list of children is not defined properly.
343     if (children == null) return;
344 
345     // Otherwise, remove the child if it is on the list.
346     children.removeElement(child);
347   }
348 
349   /**
350    * Adds an object to this node's content before
351    * the specified child node.
352    * @param child the object to be added.
353    * @param before the node before which the child will be placed.
354    */
355   public void addChildBefore (Object child, HTMLNode before) {
356 
357     int total; // Total number of child nodes.
358     int idx;   // Index of the 'before' node.
359 
360     // Return if the child is invalid.
361     if (child == null) return;
362 
363     // Return if this node has no children.
364     if (children == null) return;
365 
366     // Add the child at the beginning if the before node is
367     // invalid.
368     if (before == null) {
369 
370       addChild(child);
371       return;
372     }
373 
374     total = children.size();
375     idx = children.indexOf(before);
376   
377     // Add the child to the beginning if the 'before' node
378     // was not found.
379     if (idx < 0) idx = 0;
380 
381     // Return if the child is not of the right type.
382     if (! ((child instanceof String) ||
383       (child instanceof HTMLNode))) return;
384 
385     // Check if the 'before' node is the last node.
386     if (idx == total - 1) {
387 
388       // Add the child to the end of the list.
389       children.addElement(child);
390     } else {
391 
392       // Add the child before the 'before' node.
393       children.insertElementAt(child, idx);
394     }
395 
396     // If the child is an HTMLNode, set its parent.
397     if (child instanceof HTMLNode)
398       ((HTMLNode) child).setParent(this);
399   }
400 
401   /**
402    * Removes an attribute with the specified name from the
403    * attribute list.
404    * @param name the name of the attribute to remove.
405    */
406   public void removeAttribute (String name) {
407 
408     // Return if the attribute list is not there.
409     if (attr == null) return;
410 
411     // Otherwise, remove the attribute from the list.
412     attr.unset(name);
413   }
414 
415   /**
416    * Returns the node after this one in the parent's
417    * list of children.
418    */
419   public HTMLNode nextSibling () {
420 
421     // Return nothing if the node has no parent.
422     if (parent == null) return null;
423 
424     // Ask the parent to return the node after this one.
425     return parent.nextChild(this);
426   }
427 
428   /**
429    * Returns the node before this one in the parent's
430    * list of children.
431    */
432   public HTMLNode previousSibling () {
433 
434     // Return nothing if the node has no parent.
435     if (parent == null) return null;
436 
437     // Ask the parent to return the node before this one.
438     return parent.previousChild(this);
439   }
440 
441   /**
442    * Returns the first child of this node.
443    */
444   public HTMLNode firstChild () {
445 
446     Enumeration list; // Enumeration of this node's children.
447     Object curr;    // Current node from the list.
448 
449     // Return nothing if this node has no children.
450     if (children == null) return null;
451 
452     // Return the first child node.
453     list = children.elements();
454 
455     while (list.hasMoreElements()) {
456 
457       curr = list.nextElement();
458 
459       // Return the first HTMLNode in the list.
460       if (curr instanceof HTMLNode)
461         return (HTMLNode) curr;
462     }
463 
464     // Return nothing if there were no HTMLNodes in the list.
465     return null;
466   }
467 
468   /**
469    * Returns the HTMLNode after the specified one in this
470    * nodes content.
471    * @param child the HTMLNode before the one we want.
472    */
473   public HTMLNode nextChild (HTMLNode child) {
474     
475     Enumeration list;   // List of this node's children.
476     Object curr;     // Current object from the list.
477     boolean getNext = false; // True when child has been found.
478 
479     // Return nothing if this node has no children.
480     if (children == null) return null;
481 
482     // Get a list of this node's children
483     list = children.elements();
484 
485     while (list.hasMoreElements()) {
486 
487       curr = list.nextElement();
488 
489       // Check if we have found the specified child.
490       if (getNext) {
491 
492         // Return the next HTMLNode we encounter.
493         if (curr instanceof HTMLNode)
494           return (HTMLNode) curr;
495       } else {
496 
497         // Check if we have found the specified child.
498         if (curr == child) getNext = true;
499       }
500     }
501 
502     return null;
503   }
504 
505   /**
506    * Returns the HTMLNode before the specified one in this
507    * nodes content.
508    * @param child the HTMLNode after the one we want.
509    */
510   public HTMLNode previousChild (HTMLNode child) {
511     
512     Enumeration list;     // List of this node's children.
513     Object curr;       // Current object from the list.
514     HTMLNode prev = null;      // Stores last found HTMLNode.
515     boolean returnPrev = true; // True when child has been found.
516 
517     // Return nothing if this node has no children.
518     if (children == null) return null;
519 
520     // Get a list of this node's children
521     list = children.elements();
522 
523     while (list.hasMoreElements()) {
524 
525       curr = list.nextElement();
526 
527       // Check if we have found the specified child.
528       if (curr == child) return prev;
529 
530       // Check if curr is an HTMLNode.
531       if (curr instanceof HTMLNode) {
532 
533         // Make curr the previously found HTMLNode.
534         prev = (HTMLNode) curr;
535       }
536     }
537 
538     return null;
539   }
540 
541   /**
542    * Parses the contents of this HTML node from the enumeration
543    * of tokens provided.
544    * @param src an enumeration of tokens.
545    */
546   private Vector parseChildren (Enumeration src) {
547 
548     // Create a new Vector to store the contents.
549     Vector store = new Vector();
550 
551     // Loop round the enumeration of tokens.
552     while (src.hasMoreElements()) {
553 
554       // Get the next token from the enumeration.
555       Object token = src.nextElement();
556 
557       // Check if the token is simple text.
558       if (token instanceof TextToken) {
559 
560         // Cast the token into type TextToken.
561         TextToken text = (TextToken) token;
562 
563         // Add the text string to the vector.
564         store.addElement(text.getText());
565 
566         continue;
567       } 
568 
569       // Check if the token is a tag.
570       if (token instanceof TagToken) {
571 
572         // Cast the token into type TagToken.
573         TagToken tag = (TagToken) token;
574 
575         // Check if the token is an end tag.
576         if (tag.isEndTag()) {
577 
578           // Break if the end tags name matches.
579           if (name != null &&
580             name.equals(tag.getName())) break;
581 
582           // Otherwise ignore the end tag.
583           continue;
584         }
585 
586         // Otherwise make it into an HTMLNode.
587         HTMLNode he =
588           new HTMLNode(tag, this, src);
589 
590         // Add the node to the vector.
591         store.addElement(he);
592       }
593     }
594 
595     if (store.size() > 0)
596       return store;
597     else
598       return null;
599   }
600    
601   /**
602    * String of default node names which are standalone.
603    */
604   private static String[] defaultStandaloneList = {
605     "area", "base", "basefont", "bgsound", "br",
606     "col", "dd", "dl", "dt", "font", "frame",
607     "hr", "img", "input", "isindex", "li",
608     "link", "meta", "nextid", "option", "overlay", "p",
609     "param", "tab", "wbr", "!", "!--"
610   };
611 
612   // Full list of standalone names.
613   private static Vector standaloneList = null;
614 
615   // Load the default standalones into the list after class resolution.
616   static {
617     setupStandaloneList();
618   }
619 
620   /**
621    * Utility method which people can use to find out exactly
622    * which nodes are in the default standalone list. The default
623     * list is printed to the standard output.
624    */
625   public static void printDefaultStandaloneList () {
626     System.out.println(defaultStandaloneList);
627   }
628 
629   /**
630    * Adds the specified string to the standalone list.
631    * @param name the new standalone name.
632    */
633   public static void addStandalone (String name) {
634 
635     // Check if the list has been initialized first.
636     if (standaloneList == null) return;
637 
638     // Convert the String to lower case.
639     String lc = name.toLowerCase();
640 
641     // Check that the list does not have the String already.
642     if (standaloneList.contains(lc)) return;
643 
644     // Otherwise add the lowercase string to the list.
645     standaloneList.addElement(lc);
646   }
647 
648   /**
649    * Removes the specified string from the standalone list.
650    * @param name the standalone name to remove.
651    */
652   public static void removeStandalone (String name) {
653 
654     // Check if the standaloneList has been initialized first.
655     if (standaloneList == null) return;
656 
657     // Convert the String to lower case.
658     String lc = name.toLowerCase();
659 
660     // Remove the lowercase string from the list.
661     standaloneList.removeElement(lc);
662   }
663 
664   /**
665    * Checks the standalone list to see if it mentions the specified
666    * tag name and returns true if so.
667    * @param name the tag name to check against the list.
668    */
669   public static boolean isStandalone (String name) {
670 
671     // Check if the standaloneList has been initialized first.
672     if (standaloneList == null) return true;
673 
674     // Otherwise check the list to see if it contains the tag name.
675     return standaloneList.contains(name);
676   }
677 
678   /**
679    * Sets up the standalone vector at runtime using the list of
680    * default standalone tags. New standalone tags can then be added
681    * to the vector. <p>
682    * This method will only be executed once, since it is guarded
683    * by a private boolean variable.
684    */
685   private static void setupStandaloneList () {
686 
687     // Create a new vector to store the defaults.
688     standaloneList = new Vector(defaultStandaloneList.length);
689 
690     // Add all of the strings in the default list.
691     for (int i = 0; i < defaultStandaloneList.length; i++)
692       standaloneList.addElement(defaultStandaloneList[i]);
693   }
694 }