Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: com/port80/html/tidy/Clean.java


1   /*
2    * @(#)Clean.java   1.11 2000/08/16
3    *
4    */
5   
6   /**
7    *
8    * Clean up misuse of presentation markup
9    *
10   * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
11   * See Tidy.java for the copyright notice.
12   * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
13   * HTML Tidy Release 4 Aug 2000</a>
14   *
15   * @author  Dave Raggett <dsr@w3.org>
16   * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
17   * @version 1.0, 1999/05/22
18   * @version 1.0.1, 1999/05/29
19   * @version 1.1, 1999/06/18 Java Bean
20   * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
21   * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
22   * @version 1.4, 1999/09/04 DOM support
23   * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
24   * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
25   * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
26   * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
27   * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
28   * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
29   * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
30   */
31  package com.port80.html.tidy;
32  
33  /**
34    Filters from other formats such as Microsoft Word
35    often make excessive use of presentation markup such
36    as font tags, B, I, and the align attribute. By applying
37    a set of production rules, it is straight forward to
38    transform this to use CSS.
39  
40    Some rules replace some of the children of an element by
41    style properties on the element, e.g.
42  
43    <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
44  
45    Such rules are applied to the element's content and then
46    to the element itself until no more rules apply.
47    Having applied all the rules to an element, it will have
48    a style attribute with one or more properties. 
49  
50    Other rules strip the element they apply to, replacing
51    it by style properties on the contents, e.g.
52    
53    <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
54        
55    These rules are applied to an element before processing
56    its content and replace the current element by the first
57    element in the exposed content.
58  
59    After applying both sets of rules, you can replace the
60    style attribute by a class value and style rule in the
61    document head. To support this, an association of styles
62    and class names is built.
63  
64    A naive approach is to rely on string matching to test
65    when two property lists are the same. A better approach
66    would be to first sort the properties before matching.
67  */
68  public class Clean {
69  
70    private static final String NAME = "Clean";
71    private static final boolean FIXME = false;
72  
73    private int classNum = 1;
74    private TagTable tt;
75  
76    public Clean(TagTable tt) {
77      this.tt = tt;
78    }
79  
80    private StyleProp insertProperty(StyleProp props, String name, String value) {
81      StyleProp first, prev, prop;
82      int cmp;
83      prev = null;
84      first = props;
85      while (props != null) {
86        cmp = props.name.compareTo(name);
87        if (cmp == 0) {
88          /* this property is already defined, ignore new value */
89          return first;
90        }
91        if (cmp > 0) // props.name > name
92          {
93          /* insert before this */
94          prop = new StyleProp(name, value, props);
95          if (prev != null)
96            prev.next = prop;
97          else
98            first = prop;
99          return first;
100       }
101       prev = props;
102       props = props.next;
103     }
104     prop = new StyleProp(name, value);
105     if (prev != null)
106       prev.next = prop;
107     else
108       first = prop;
109     return first;
110   }
111 
112   /*
113    Create sorted linked list of properties from style string
114    It temporarily places nulls in place of ':' and ';' to
115    delimit the strings for the property name and value.
116    Some systems don't allow you to null literal strings,
117    so to avoid this, a copy is made first.
118   */
119   private StyleProp createProps(StyleProp prop, String style) {
120     int name_end;
121     int value_end;
122     int value_start = 0;
123     int name_start = 0;
124     boolean more;
125     name_start = 0;
126     while (name_start < style.length()) {
127       while (name_start < style.length() && style.charAt(name_start) == ' ')
128         ++name_start;
129       name_end = name_start;
130       while (name_end < style.length()) {
131         if (style.charAt(name_end) == ':') {
132           value_start = name_end + 1;
133           break;
134         }
135         ++name_end;
136       }
137       if (name_end >= style.length() || style.charAt(name_end) != ':')
138         break;
139       while (value_start < style.length() && style.charAt(value_start) == ' ')
140         ++value_start;
141       value_end = value_start;
142       more = false;
143       while (value_end < style.length()) {
144         if (style.charAt(value_end) == ';') {
145           more = true;
146           break;
147         }
148         ++value_end;
149       }
150       prop =
151         insertProperty(
152           prop,
153           style.substring(name_start, name_end),
154           style.substring(value_start, value_end));
155       if (more) {
156         name_start = value_end + 1;
157         continue;
158       }
159       break;
160     }
161     return prop;
162   }
163 
164   private String createPropString(StyleProp props) {
165     String style = "";
166     int len;
167     StyleProp prop;
168     /* compute length */
169     for (len = 0, prop = props; prop != null; prop = prop.next) {
170       len += prop.name.length() + 2;
171       len += prop.value.length() + 2;
172     }
173     for (prop = props; prop != null; prop = prop.next) {
174       style = style.concat(prop.name);
175       style = style.concat(": ");
176       style = style.concat(prop.value);
177       if (prop.next == null)
178         break;
179       style = style.concat("; ");
180     }
181     return style;
182   }
183 
184   /*
185     create string with merged properties
186   */
187   private String addProperty(String style, String property) {
188     StyleProp prop;
189     prop = createProps(null, style);
190     prop = createProps(prop, property);
191     style = createPropString(prop);
192     return style;
193   }
194 
195   private String gensymClass(String tag) {
196     String str;
197     str = "c" + classNum;
198     classNum++;
199     return str;
200   }
201 
202   private String findStyle(Lexer lexer, String tag, String properties) {
203     Style style;
204     for (style = lexer.styles; style != null; style = style.next) {
205       if (style.tag.equals(tag) && style.properties.equals(properties))
206         return style.tagClass;
207     }
208     style = new Style(tag, gensymClass(tag), properties, lexer.styles);
209     lexer.styles = style;
210     return style.tagClass;
211   }
212 
213   /*
214    Find style attribute in node, and replace it
215    by corresponding class attribute. Search for
216    class in style dictionary otherwise gensym
217    new class and add to dictionary.
218   
219    Assumes that node doesn't have a class attribute
220   */
221   private void style2Rule(Lexer lexer, Node node) {
222     AttVal styleattr, classattr;
223     String classname;
224     styleattr = node.getAttrByName("style");
225     if (styleattr != null) {
226       classname = findStyle(lexer, node.element, styleattr.value);
227       classattr = node.getAttrByName("class");
228       /*
229        if there already is a class attribute
230        then append class name after a space
231       */
232       if (classattr != null) {
233         classattr.value = classattr.value + " " + classname;
234         node.removeAttribute(styleattr);
235       } else /* reuse style attribute for class attribute */ {
236         styleattr.attribute = "class";
237         styleattr.value = classname;
238       }
239     }
240   }
241   private void addColorRule(Lexer lexer, String selector, String color) {
242     if (color != null) {
243       lexer.append(selector);
244       lexer.append(" { color: ");
245       lexer.append(color);
246       lexer.append(" }\n");
247     }
248   }
249   /*
250    move presentation attribs from body to style element
251   
252    background="foo" ->  body { background-image: url(foo) }
253    bgcolor="foo"    ->  body { background-color: foo }
254    text="foo"       ->  body { color: foo }
255    link="foo"       ->  :link { color: foo }
256    vlink="foo"      ->  :visited { color: foo }
257    alink="foo"      ->  :active { color: foo }
258   */
259   private void cleanBodyAttrs(Lexer lexer, Node body) {
260     AttVal attr;
261     String bgurl = null;
262     String bgcolor = null;
263     String color = null;
264     attr = body.getAttrByName("background");
265     if (attr != null) {
266       bgurl = attr.value;
267       attr.value = null;
268       body.removeAttribute(attr);
269     }
270     attr = body.getAttrByName("bgcolor");
271     if (attr != null) {
272       bgcolor = attr.value;
273       attr.value = null;
274       body.removeAttribute(attr);
275     }
276     attr = body.getAttrByName("text");
277     if (attr != null) {
278       color = attr.value;
279       attr.value = null;
280       body.removeAttribute(attr);
281     }
282     if (bgurl != null || bgcolor != null || color != null) {
283       lexer.append(" body {\n");
284       if (bgurl != null) {
285         lexer.append("  background-image: url(");
286         lexer.append(bgurl);
287         lexer.append(");\n");
288       }
289       if (bgcolor != null) {
290         lexer.append("  background-color: ");
291         lexer.append(bgcolor);
292         lexer.append(";\n");
293       }
294       if (color != null) {
295         lexer.append("  color: ");
296         lexer.append(color);
297         lexer.append(";\n");
298       }
299       lexer.append(" }\n");
300     }
301     attr = body.getAttrByName("link");
302     if (attr != null) {
303       addColorRule(lexer, " :link", attr.value);
304       body.removeAttribute(attr);
305     }
306     attr = body.getAttrByName("vlink");
307     if (attr != null) {
308       addColorRule(lexer, " :visited", attr.value);
309       body.removeAttribute(attr);
310     }
311     attr = body.getAttrByName("alink");
312     if (attr != null) {
313       addColorRule(lexer, " :active", attr.value);
314       body.removeAttribute(attr);
315     }
316   }
317   private boolean niceBody(Lexer lexer, Node doc) {
318     Node body = doc.findBody(lexer.configuration.getTagTable());
319     if (body != null) {
320       if (body.getAttrByName("background") != null
321         || body.getAttrByName("bgcolor") != null
322         || body.getAttrByName("text") != null
323         || body.getAttrByName("link") != null
324         || body.getAttrByName("vlink") != null
325         || body.getAttrByName("alink") != null) {
326         lexer.badLayout |= Report.USING_BODY;
327         return false;
328       }
329     }
330     return true;
331   }
332   /* create style element using rules from dictionary */
333   private void createStyleElement(Lexer lexer, Node doc) {
334     Node node, head, body;
335     Style style;
336     AttVal av;
337     if (lexer.styles == null && niceBody(lexer, doc))
338       return;
339     node = lexer.newNode(Node.StartTag, 0, 0, -1, "style");
340     node.implicit = true;
341     /* insert type attribute */
342     av = new AttVal(null, null, '"', "type", "text/css");
343     av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
344     node.attributes = av;
345     body = doc.findBody(lexer.configuration.getTagTable());
346     int txtstart = lexer.length();
347     if (body != null)
348       cleanBodyAttrs(lexer, body);
349     for (style = lexer.styles; style != null; style = style.next) {
350       lexer.append(' ');
351       lexer.append(style.tag);
352       lexer.append('.');
353       lexer.append(style.tagClass);
354       lexer.append(' ');
355       lexer.append('{');
356       lexer.append(style.properties);
357       lexer.append('}');
358       lexer.append('\n');
359     }
360     Node.insertNodeAtEnd(node, lexer.newNode(Node.TextNode, txtstart, lexer.length(), -1));
361     /*
362      now insert style element into document head
363     
364      doc is root node. search its children for html node
365      the head node should be first child of html node
366     */
367     head = doc.findHEAD(lexer.configuration.getTagTable());
368     if (head != null)
369       Node.insertNodeAtEnd(head, node);
370   }
371   /* ensure bidirectional links are consistent */
372   private void fixNodeLinks(Node node) {
373     Node child;
374     if (node.prev != null)
375       node.prev.next = node;
376     else
377       node.parent.content = node;
378     if (node.next != null)
379       node.next.prev = node;
380     else
381       node.parent.last = node;
382     for (child = node.content; child != null; child = child.next)
383       child.parent = node;
384   }
385   /*
386    used to strip child of node when
387    the node has one and only one child
388   */
389   private void stripOnlyChild(Node node) {
390     Node child;
391     child = node.content;
392     node.content = child.content;
393     node.last = child.last;
394     child.content = null;
395     for (child = node.content; child != null; child = child.next)
396       child.parent = node;
397   }
398 
399   /**
400    * Discard the container 'element'.
401    * Used to strip font start and end tags.
402    * @return The next node after 'element' in the new context.
403    */
404   private Node discardContainer(Node element) {
405     Node ret;
406     Node parent = element.parent;
407     if (element.content != null) {
408       // Promote element.content and replace the current element.
409       // <element><content>, ..., <last></element>
410       // to
411       // <element.prev><content>...<last><element.next>
412       element.last.next = element.next;
413       if (element.next != null) {
414         element.next.prev = element.last;
415         element.last.next = element.next;
416       } else
417         parent.last = element.last;
418       if (element.prev != null) {
419         element.content.prev = element.prev;
420         element.prev.next = element.content;
421       } else
422         parent.content = element.content;
423       for (Node node = element.content; node != null; node = node.next)
424         node.parent = parent;
425       ret = element.content;
426     } else {
427       if (element.next != null)
428         element.next.prev = element.prev;
429       else
430         parent.last = element.prev;
431       if (element.prev != null)
432         element.prev.next = element.next;
433       else
434         parent.content = element.next;
435       ret = element.next;
436     }
437     element.next = null;
438     element.content = null;
439     return ret;
440   }
441 
442   /**
443    * Merge two consecutive containers, eg. &lt;b>...&lt;/b>&lt;b>...&lt;/b>.
444    * Content of 'next' is appended to the content of 'element' and container 'next' is discarded.
445    * @return The next node to be processed in the new context.
446    */
447   private Node mergeContainers(Node element, Node next) {
448     Node ret;
449     if (element.content == null && next.content == null) {
450       // This should not occurs.
451       // But just in case, remove both element and next.
452       ret = next.next;
453       Node.removeNode(element);
454       Node.removeNode(next);
455       return ret;
456     } else if (element.content != null && next.content == null) {
457       // Should not occur either.
458       ret = element;
459       Node.removeNode(next);
460     } else if (element.content == null && next.content != null) {
461       ret = next;
462       Node.removeNode(element);
463     } else {
464       ret = element;
465       element.last.next = next.content;
466       next.content.prev=element.last;
467       element.last = next.last;
468       for (Node n = next.content; n != null; n = n.next) {
469         n.parent = element;
470       }
471       next.content = null;
472       next.last = null;
473       Node.removeNode(next);
474     }
475     return ret;
476   }
477 
478   /*
479    Add style property to element, creating style
480    attribute as needed and adding ; delimiter
481   */
482   private void addStyleProperty(Node node, String property) {
483     AttVal av;
484     for (av = node.attributes; av != null; av = av.next) {
485       if (av.attribute.equals("style"))
486         break;
487     }
488     /* if style attribute already exists then insert property */
489     if (av != null) {
490       String s;
491       s = addProperty(av.value, property);
492       av.value = s;
493     } else /* else create new style attribute */ {
494       av = new AttVal(node.attributes, null, '"', "style", property);
495       av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
496       node.attributes = av;
497     }
498   }
499   /*
500     Create new string that consists of the
501     combined style properties in s1 and s2
502   
503     To merge property lists, we build a linked
504     list of property/values and insert properties
505     into the list in order, merging values for
506     the same property name.
507   */
508   private String mergeProperties(String s1, String s2) {
509     String s;
510     StyleProp prop;
511     prop = createProps(null, s1);
512     prop = createProps(prop, s2);
513     s = createPropString(prop);
514     return s;
515   }
516   private void mergeStyles(Node node, Node child) {
517     AttVal av;
518     String s1, s2, style;
519     for (s2 = null, av = child.attributes; av != null; av = av.next) {
520       if (av.attribute.equals("style")) {
521         s2 = av.value;
522         break;
523       }
524     }
525     for (s1 = null, av = node.attributes; av != null; av = av.next) {
526       if (av.attribute.equals("style")) {
527         s1 = av.value;
528         break;
529       }
530     }
531     if (s1 != null) {
532       if (s2 != null) /* merge styles from both */ {
533         style = mergeProperties(s1, s2);
534         av.value = style;
535       }
536     } else if (s2 != null) /* copy style of child */ {
537       av = new AttVal(node.attributes, null, '"', "style", s2);
538       av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
539       node.attributes = av;
540     }
541   }
542   private String fontSize2Name(String size) {
543     /*
544     String[] sizes =
545     {
546         "50%",
547         "60%",
548         "80%",
549         null,
550         "120%",
551         "150%",
552         "200%"
553     };
554     */
555     String[] sizes = { "60%", "70%", "80%", null, "120%", "150%", "200%" };
556     String buf;
557     if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6') {
558       int n = size.charAt(0) - '0';
559       return sizes[n];
560     }
561     if (size.length() > 0 && size.charAt(0) == '-') {
562       if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
563         int n = size.charAt(1) - '0';
564         double x;
565         for (x = 1.0; n > 0; --n)
566           x *= 0.8;
567         x *= 100.0;
568         buf = "" + (int) x + "%";
569         return buf;
570       }
571       return "smaller"; /*"70%"; */
572     }
573     if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
574       int n = size.charAt(1) - '0';
575       double x;
576       for (x = 1.0; n > 0; --n)
577         x *= 1.2;
578       x *= 100.0;
579       buf = "" + (int) x + "%";
580       return buf;
581     }
582     return "larger"; /* "140%" */
583   }
584   private void addFontFace(Node node, String face) {
585     addStyleProperty(node, "font-family: " + face);
586   }
587   private void addFontSize(Node node, String size) {
588     String value;
589     if (size.equals("6") && node.tag == tt.tagP) {
590       node.element = "h1";
591       tt.findTag(node);
592       return;
593     }
594     if (size.equals("5") && node.tag == tt.tagP) {
595       node.element = "h2";
596       tt.findTag(node);
597       return;
598     }
599     if (size.equals("4") && node.tag == tt.tagP) {
600       node.element = "h3";
601       tt.findTag(node);
602       return;
603     }
604     value = fontSize2Name(size);
605     if (value != null) {
606       addStyleProperty(node, "font-size: " + value);
607     }
608   }
609   private void addFontColor(Node node, String color) {
610     addStyleProperty(node, "color: " + color);
611   }
612   private void addAlign(Node node, String align) {
613     /* force alignment value to lower case */
614     addStyleProperty(node, "text-align: " + align.toLowerCase());
615   }
616   /*
617    add style properties to node corresponding to
618    the font face, size and color attributes
619   */
620   private void addFontStyles(Node node, AttVal av) {
621     while (av != null) {
622       if (av.attribute.equals("face"))
623         addFontFace(node, av.value);
624       else if (av.attribute.equals("size"))
625         addFontSize(node, av.value);
626       else if (av.attribute.equals("color"))
627         addFontColor(node, av.value);
628       av = av.next;
629     }
630   }
631   /*
632       Symptom: <p align=center>
633       Action: <p style="text-align: center">
634   */
635   private void textAlign(Lexer lexer, Node node) {
636     AttVal av, prev;
637     prev = null;
638     for (av = node.attributes; av != null; av = av.next) {
639       if (av.attribute.equals("align")) {
640         if (prev != null)
641           prev.next = av.next;
642         else
643           node.attributes = av.next;
644         if (av.value != null) {
645           addAlign(node, av.value);
646         }
647         break;
648       }
649       prev = av;
650     }
651   }
652   /*
653      The clean up rules use the pnode argument to return the
654      next node when the original node has been deleted
655   */
656   /*
657       Symptom: <dir> <li> where <li> is only child
658       Action: coerce <dir> <li> to <div> with indent.
659   */
660   private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) {
661     Node child;
662     if (node.tag == tt.tagDir || node.tag == tt.tagUl || node.tag == tt.tagOl) {
663       child = node.content;
664       if (child == null)
665         return false;
666       /* check child has no peers */
667       if (child.next != null)
668         return false;
669       if (child.tag != tt.tagLi)
670         return false;
671       if (!child.implicit)
672         return false;
673       /* coerce dir to div */
674       node.tag = tt.tagDiv;
675       node.element = "div";
676       addStyleProperty(node, "margin-left: 2em");
677       stripOnlyChild(node);
678       return true;
679       //#if 0
680       //Node content;
681       //Node last;
682       //content = child.content;
683       //last = child.last;
684       //child.content = null;
685       /* adjust parent and set margin on contents of <li> */
686       //for (child = content; child != null; child = child.next)
687       //{
688       //    child.parent = node.parent;
689       //    addStyleProperty(child, "margin-left: 1em");
690       //}
691       /* hook first/last into sequence */
692       //if (content != null)
693       //{
694       //    content.prev = node.prev;
695       //    last.next = node.next;
696       //    fixNodeLinks(content);
697       //    fixNodeLinks(last);
698       //}
699       //node.next = null;
700       /* ensure that new node is cleaned */
701       //pnode.setObject(cleanNode(lexer, content));
702       //return true;
703       //#endif
704     }
705     return false;
706   }
707   /*
708       Symptom: <center>
709       Action: replace <center> by <div style="text-align: center">
710   */
711   private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) {
712     if (node.tag == tt.tagCenter) {
713       if (lexer.configuration.DropFontTags) {
714         if (node.content != null) {
715           Node last = node.last;
716           Node parent = node.parent;
717           pnode.setObject(discardContainer(node));
718           node = lexer.inferredTag("br");
719           if (last.next != null)
720             last.next.prev = node;
721           node.next = last.next;
722           last.next = node;
723           node.prev = last;
724           if (parent.last == last)
725             parent.last = node;
726           node.parent = parent;
727         } else {
728           Node prev = node.prev;
729           Node next = node.next;
730           Node parent = node.parent;
731           pnode.setObject(discardContainer(node));
732           node = lexer.inferredTag("br");
733           node.next = next;
734           node.prev = prev;
735           node.parent = parent;
736           if (next != null)
737             next.prev = node;
738           else
739             parent.last = node;
740           if (prev != null)
741             prev.next = node;
742           else
743             parent.content = node;
744         }
745         return true;
746       }
747       node.tag = tt.tagDiv;
748       node.element = "div";
749       addStyleProperty(node, "text-align: center");
750       return true;
751     }
752     return false;
753   }
754   /*
755       Symptom <div><div>...</div></div>
756       Action: merge the two divs
757   
758     This is useful after nested <dir>s used by Word
759     for indenting have been converted to <div>s
760   */
761   private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) {
762     Node child;
763     if (node.tag != tt.tagDiv)
764       return false;
765     child = node.content;
766     if (child == null)
767       return false;
768     if (child.tag != tt.tagDiv)
769       return false;
770     if (child.next != null)
771       return false;
772     mergeStyles(node, child);
773     stripOnlyChild(node);
774     return true;
775   }
776   /*
777       Symptom: <ul><li><ul>...</ul></li></ul>
778       Action: discard outer list
779   */
780   private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) {
781     Node child, list;
782     if (node.tag == tt.tagUl || node.tag == tt.tagOl) {
783       child = node.content;
784       if (child == null)
785         return false;
786       /* check child has no peers */
787       if (child.next != null)
788         return false;
789       list = child.content;
790       if (list == null)
791         return false;
792       if (list.tag != node.tag)
793         return false;
794       pnode.setObject(node.next);
795       /* move inner list node into position of outer node */
796       list.prev = node.prev;
797       list.next = node.next;
798       list.parent = node.parent;
799       fixNodeLinks(list);
800       /* get rid of outer ul and its li */
801       child.content = null;
802       node.content = null;
803       node.next = null;
804       /*
805         If prev node was a list the chances are this node
806         should be appended to that list. Word has no way of
807         recognizing nested lists and just uses indents
808       */
809       if (list.prev != null) {
810         node = list;
811         list = node.prev;
812         if (list.tag == tt.tagUl || list.tag == tt.tagOl) {
813           list.next = node.next;
814           if (list.next != null)
815             list.next.prev = list;
816           child = list.last; /* <li> */
817           node.parent = child;
818           node.next = null;
819           node.prev = child.last;
820           fixNodeLinks(node);
821         }
822       }
823       cleanNode(lexer, node);
824       return true;
825     }
826     return false;
827   }
828   /*
829       Symptom: the only child of a block-level element is a
830       presentation element such as B, I or FONT
831   
832       Action: add style "font-weight: bold" to the block and
833       strip the <b> element, leaving its children.
834   
835     example:
836   
837       <p>
838         <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
839       </p>
840   
841     becomes:
842   
843         <p style="font-weight: bold; font-family: Arial; font-size: 6">
844           Draft Recommended Practice
845         </p>
846   
847     This code also replaces the align attribute by a style attribute.
848     However, to avoid CSS problems with Navigator 4, this isn't done
849     for the elements: caption, tr and table
850   */
851   private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) {
852     Node child;
853     if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0) {
854       if (node.tag != tt.tagTable && node.tag != tt.tagTr && node.tag != tt.tagLi) {
855         /* check for align attribute */
856         if (node.tag != tt.tagCaption)
857           textAlign(lexer, node);
858         child = node.content;
859         if (child == null)
860           return false;
861         /* check child has no peers */
862         if (child.next != null)
863           return false;
864         if (child.tag == tt.tagB) {
865           mergeStyles(node, child);
866           addStyleProperty(node, "font-weight: bold");
867           stripOnlyChild(node);
868           return true;
869         }
870         if (child.tag == tt.tagI) {
871           mergeStyles(node, child);
872           addStyleProperty(node, "font-style: italic");
873           stripOnlyChild(node);
874           return true;
875         }
876         if (child.tag == tt.tagFont) {
877           mergeStyles(node, child);
878           addFontStyles(node, child.attributes);
879           stripOnlyChild(node);
880           return true;
881         }
882       }
883     }
884     return false;
885   }
886   /* the only child of table cell or an inline element such as em */
887   private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) {
888     Node child;
889     if (node.tag != tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0) {
890       child = node.content;
891       if (child == null)
892         return false;
893       /* check child has no peers */
894       if (child.next != null)
895         return false;
896       if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis) {
897         mergeStyles(node, child);
898         addStyleProperty(node, "font-weight: bold");
899         stripOnlyChild(node);
900         return true;
901       }
902       if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis) {
903         mergeStyles(node, child);
904         addStyleProperty(node, "font-style: italic");
905         stripOnlyChild(node);
906         return true;
907       }
908       if (child.tag == tt.tagFont) {
909         mergeStyles(node, child);
910         addFontStyles(node, child.attributes);
911         stripOnlyChild(node);
912         return true;
913       }
914     }
915     return false;
916   }
917   /*
918     Replace font elements by span elements, deleting
919     the font element's attributes and replacing them
920     by a single style attribute.
921   */
922   private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) {
923     AttVal av, style, next;
924     if (node.tag == tt.tagFont) {
925       if (lexer.configuration.DropFontTags) {
926         pnode.setObject(discardContainer(node));
927         return false;
928       }
929       /* if FONT is only child of parent element then leave alone */
930       if (node.parent.content == node && node.next == null)
931         return false;
932       addFontStyles(node, node.attributes);
933       /* extract style attribute and free the rest */
934       av = node.attributes;
935       style = null;
936       while (av != null) {
937         next = av.next;
938         if (av.attribute.equals("style")) {
939           av.next = null;
940           style = av;
941         }
942         av = next;
943       }
944       node.attributes = style;
945       node.tag = tt.tagSpan;
946       node.element = "span";
947       return true;
948     }
949     return false;
950   }
951   /*
952     Applies all matching rules to a node.
953   */
954   private Node cleanNode(Lexer lexer, Node node) {
955     Node next = null;
956     MutableObject o = new MutableObject();
957     boolean b = false;
958     for (next = node; node.isElement(); node = next) {
959       o.setObject(next);
960       b = dir2Div(lexer, node, o);
961       next = (Node) o.getObject();
962       if (b)
963         continue;
964       b = nestedList(lexer, node, o);
965       next = (Node) o.getObject();
966       if (b)
967         continue;
968       b = center2Div(lexer, node, o);
969       next = (Node) o.getObject();
970       if (b)
971         continue;
972       b = mergeDivs(lexer, node, o);
973       next = (Node) o.getObject();
974       if (b)
975         continue;
976       b = blockStyle(lexer, node, o);
977       next = (Node) o.getObject();
978       if (b)
979         continue;
980       b = inlineStyle(lexer, node, o);
981       next = (Node) o.getObject();
982       if (b)
983         continue;
984       b = font2Span(lexer, node, o);
985       next = (Node) o.getObject();
986       if (b)
987         continue;
988       break;
989     }
990     return next;
991   }
992   private Node createStyleProperties(Lexer lexer, Node node) {
993     Node child;
994     if (node.content != null) {
995       for (child = node.content; child != null; child = child.next) {
996         child = createStyleProperties(lexer, child);
997       }
998     }
999     return cleanNode(lexer, node);
1000  }
1001  private void defineStyleRules(Lexer lexer, Node node) {
1002    Node child;
1003    if (node.content != null) {
1004      for (child = node.content; child != null; child = child.next) {
1005        defineStyleRules(lexer, child);
1006      }
1007    }
1008    style2Rule(lexer, node);
1009  }
1010  public void cleanTree(Lexer lexer, Node doc) {
1011    doc = createStyleProperties(lexer, doc);
1012    if (!lexer.configuration.MakeClean) {
1013      defineStyleRules(lexer, doc);
1014      createStyleElement(lexer, doc);
1015    }
1016  }
1017
1018  /**
1019   * Simplifies <b><b> ... </b> ...</b> etc.
1020   * Also remove adjacent blocks eg. ... </i><i> ... etc.
1021   */
1022  public void nestedEmphasis(Node node) {
1023    Node next;
1024    while (node != null) {
1025      next = node.next;
1026      if ((node.tag == tt.tagB
1027        || node.tag == tt.tagI
1028        || node.tag == tt.tagEm
1029        || node.tag == tt.tagStrong)) {
1030        if (node.parent != null && node.parent.tag == node.tag) {
1031          /* strip redundant inner element */
1032          node = discardContainer(node);
1033          continue;
1034        }
1035        // <b>... </b><b> ... </b> case.
1036        // Note: Empty <b></b> should have be eliminated during parsing.
1037        if (next != null && next.tag == node.tag) {
1038          if (true)
1039            System.err.println(
1040              NAME + ".nestedEmphasis(): node=" + node + "\n\tnext=" + next);
1041          node = mergeContainers(node, next);
1042          continue;
1043        }
1044      }
1045      if (node.content != null) {
1046        nestedEmphasis(node.content);
1047      }
1048      node = next;
1049    }
1050  }
1051
1052  /* replace i by em and b by strong */
1053  public void emFromI(Node node) {
1054    while (node != null) {
1055      if (node.tag == tt.tagI) {
1056        node.element = tt.tagEm.name;
1057        node.tag = tt.tagEm;
1058      } else if (node.tag == tt.tagB) {
1059        node.element = tt.tagStrong.name;
1060        node.tag = tt.tagStrong;
1061      }
1062      if (node.content != null)
1063        emFromI(node.content);
1064      node = node.next;
1065    }
1066  }
1067  /*
1068   Some people use dir or ul without an li
1069   to indent the content. The pattern to
1070   look for is a list with a single implicit
1071   li. This is recursively replaced by an
1072   implicit blockquote.
1073  */
1074  public void list2BQ(Node node) {
1075    while (node != null) {
1076      if (node.content != null)
1077        list2BQ(node.content);
1078      if (node.tag != null
1079        && node.tag.parser == ParserImpl.getParseList()
1080        && node.hasOneChild()
1081        && node.content.implicit) {
1082        stripOnlyChild(node);
1083        node.element = tt.tagBlockquote.name;
1084        node.tag = tt.tagBlockquote;
1085        node.implicit = true;
1086      }
1087      node = node.next;
1088    }
1089  }
1090  /*
1091   Replace implicit blockquote by div with an indent
1092   taking care to reduce nested blockquotes to a single
1093   div with the indent set to match the nesting depth
1094  */
1095  public void bQ2Div(Node node) {
1096    int indent;
1097    String indent_buf;
1098    while (node != null) {
1099      if (node.tag == tt.tagBlockquote && node.implicit) {
1100        indent = 1;
1101        while (node.hasOneChild() && node.content.tag == tt.tagBlockquote && node.implicit) {
1102          ++indent;
1103          stripOnlyChild(node);
1104        }
1105        if (node.content != null)
1106          bQ2Div(node.content);
1107        indent_buf = "margin-left: " + (new Integer(2 * indent)).toString() + "em";
1108        node.element = tt.tagDiv.name;
1109        node.tag = tt.tagDiv;
1110        node.addAttribute("style", indent_buf);
1111      } else if (node.content != null)
1112        bQ2Div(node.content);
1113      node = node.next;
1114    }
1115  }
1116  /* node is <![if ...]> prune up to <![endif]> */
1117  public Node pruneSection(Lexer lexer, Node node) {
1118    for (;;) {
1119      /* discard node and returns next */
1120      node = Node.discardElement(node);
1121      if (node == null)
1122        return null;
1123      if (node.type == Node.SectionTag) {
1124        if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
1125          node = pruneSection(lexer, node);
1126          continue;
1127        }
1128        if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif")) {
1129          node = Node.discardElement(node);
1130          break;
1131        }
1132      }
1133    }
1134    return node;
1135  }
1136  public void dropSections(Lexer lexer, Node node) {
1137    while (node != null) {
1138      if (node.type == Node.SectionTag) {
1139        /* prune up to matching endif */
1140        if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
1141          node = pruneSection(lexer, node);
1142          continue;
1143        }
1144        /* discard others as well */
1145        node = Node.discardElement(node);
1146        continue;
1147      }
1148      if (node.content != null)
1149        dropSections(lexer, node.content);
1150      node = node.next;
1151    }
1152  }
1153  public void purgeAttributes(Node node) {
1154    AttVal attr = node.attributes;
1155    AttVal next = null;
1156    AttVal prev = null;
1157    while (attr != null) {
1158      next = attr.next;
1159      /* special check for class="Code" denoting pre text */
1160      if (attr.attribute != null
1161        && attr.value != null
1162        && attr.attribute.equals("class")
1163        && attr.value.equals("Code")) {
1164        prev = attr;
1165      } else if (
1166        attr.attribute != null
1167          && (attr.attribute.equals("class")
1168            || attr.attribute.equals("style")
1169            || attr.attribute.equals("lang")
1170            || attr.attribute.startsWith("x:")
1171            || ((attr.attribute.equals("height") || attr.attribute.equals("width"))
1172              && (node.tag == tt.tagTd
1173                || node.tag == tt.tagTr
1174                || node.tag == tt.tagTh)))) {
1175        if (prev != null)
1176          prev.next = next;
1177        else
1178          node.attributes = next;
1179      } else
1180        prev = attr;
1181      attr = next;
1182    }
1183  }
1184  /* Word2000 uses span excessively, so we strip span out */
1185  public Node stripSpan(Lexer lexer, Node span) {
1186    Node node;
1187    Node prev = null;
1188    Node content;
1189    /*
1190     deal with span elements that have content
1191     by splicing the content in place of the span
1192     after having processed it
1193    */
1194    cleanWord2000(lexer, span.content);
1195    content = span.content;
1196    if (span.prev != null)
1197      prev = span.prev;
1198    else if (content != null) {
1199      node = content;
1200      content = content.next;
1201      Node.removeNode(node);
1202      Node.insertNodeBeforeElement(span, node);
1203      prev = node;
1204    }
1205    while (content != null) {
1206      node = content;
1207      content = content.next;
1208      Node.removeNode(node);
1209      Node.insertNodeAfterElement(prev, node);
1210      prev = node;
1211    }
1212    if (span.next == null)
1213      span.parent.last = prev;
1214    node = span.next;
1215    span.content = null;
1216    Node.discardElement(span);
1217    return node;
1218  }
1219  /* map non-breaking spaces to regular spaces */
1220  private void normalizeSpaces(Lexer lexer, Node node) {
1221    while (node != null) {
1222      if (node.content != null)
1223        normalizeSpaces(lexer, node.content);
1224      if (node.type == Node.TextNode) {
1225        char c;
1226        for (int i = node.start; i < node.end; ++i) {
1227          c = node.textarray.charAt(i);
1228          if (c == 160) {
1229            node.textarray.setCharAt(i, ' ');
1230          }
1231        }
1232      }
1233      node = node.next;
1234    }
1235  }
1236  /*
1237   This is a major clean up to strip out all the extra stuff you get
1238   when you save as web page from Word 2000. It doesn't yet know what
1239   to do with VML tags, but these will appear as errors unless you
1240   declare them as new tags, such as o:p which needs to be declared
1241   as inline.
1242  */
1243  public void cleanWord2000(Lexer lexer, Node node) {
1244    /* used to a list from a sequence of bulletted p's */
1245    Node list = null;
1246    while (node != null) {
1247      /* discard Word's style verbiage */
1248      if (node.tag == tt.tagStyle || node.tag == tt.tagMeta || node.type == Node.CommentTag) {
1249        node = Node.discardElement(node);
1250        continue;
1251      }
1252      /* strip out all span tags Word scatters so liberally! */
1253      if (node.tag == tt.tagSpan) {
1254        node = stripSpan(lexer, node);
1255        continue;
1256      }
1257      /* get rid of Word's xmlns attributes */
1258      if (node.tag == tt.tagHtml) {
1259        /* check that it's a Word 2000 document */
1260        if (node.getAttrByName("xmlns:o") == null)
1261          return;
1262      }
1263      if (node.tag == tt.tagLink) {
1264        AttVal attr = node.getAttrByName("rel");
1265        if (attr != null && attr.value != null && attr.value.equals("File-List")) {
1266          node = Node.discardElement(node);
1267          continue;
1268        }
1269      }
1270      /* discard empty paragraphs */
1271      if (node.content == null && node.tag == tt.tagP) {
1272        node = Node.discardElement(node);
1273        continue;
1274      }
1275      if (node.tag == tt.tagP) {
1276        AttVal attr = node.getAttrByName("class");
1277        /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1278        if (attr != null && attr.value != null && attr.value.equals("MsoListBullet")) {
1279          Node.coerceNode(lexer, node, tt.tagLi);
1280          if (list == null || list.tag != tt.tagUl) {
1281            list = lexer.inferredTag("ul");
1282            Node.insertNodeBeforeElement(node, list);
1283          }
1284          purgeAttributes(node);
1285          if (node.content != null)
1286            cleanWord2000(lexer, node.content);
1287          /* remove node and append to contents of list */
1288          Node.removeNode(node);
1289          Node.insertNodeAtEnd(list, node);
1290          node = list.next;
1291        }
1292        /* map sequence of <p class="Code"> to <pre>...</pre> */
1293        else if (attr != null && attr.value != null && attr.value.equals("Code")) {
1294          Node br = lexer.newLineNode();
1295          normalizeSpaces(lexer, node);
1296          if (list == null || list.tag != tt.tagPre) {
1297            list = lexer.inferredTag("pre");
1298            Node.insertNodeBeforeElement(node, list);
1299          }
1300          /* remove node and append to contents of list */
1301          Node.removeNode(node);
1302          Node.insertNodeAtEnd(list, node);
1303          stripSpan(lexer, node);
1304          Node.insertNodeAtEnd(list, br);
1305          node = list.next;
1306        } else
1307          list = null;
1308      } else
1309        list = null;
1310      /* strip out style and class attributes */
1311      if (node.type == Node.StartTag || node.type == Node.StartEndTag)
1312        purgeAttributes(node);
1313      if (node.content != null)
1314        cleanWord2000(lexer, node.content);
1315      node = node.next;
1316    }
1317  }
1318  public boolean isWord2000(Node root, TagTable tt) {
1319    Node html = root.findHTML(tt);
1320    return (html != null && html.getAttrByName("xmlns:o") != null);
1321  }
1322}