Source code: com/port80/html/tidy/Clean.java
1 /*
2 * @(#)Clean.java 1.11 2000/08/16
3 *
4 */
5
6 /**
7 *
8 * Clean up misuse of presentation markup
9 *
10 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
11 * See Tidy.java for the copyright notice.
12 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
13 * HTML Tidy Release 4 Aug 2000</a>
14 *
15 * @author Dave Raggett <dsr@w3.org>
16 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
17 * @version 1.0, 1999/05/22
18 * @version 1.0.1, 1999/05/29
19 * @version 1.1, 1999/06/18 Java Bean
20 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
21 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
22 * @version 1.4, 1999/09/04 DOM support
23 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
24 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
25 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
26 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
27 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
28 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
29 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
30 */
31 package com.port80.html.tidy;
32
33 /**
34 Filters from other formats such as Microsoft Word
35 often make excessive use of presentation markup such
36 as font tags, B, I, and the align attribute. By applying
37 a set of production rules, it is straight forward to
38 transform this to use CSS.
39
40 Some rules replace some of the children of an element by
41 style properties on the element, e.g.
42
43 <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
44
45 Such rules are applied to the element's content and then
46 to the element itself until no more rules apply.
47 Having applied all the rules to an element, it will have
48 a style attribute with one or more properties.
49
50 Other rules strip the element they apply to, replacing
51 it by style properties on the contents, e.g.
52
53 <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
54
55 These rules are applied to an element before processing
56 its content and replace the current element by the first
57 element in the exposed content.
58
59 After applying both sets of rules, you can replace the
60 style attribute by a class value and style rule in the
61 document head. To support this, an association of styles
62 and class names is built.
63
64 A naive approach is to rely on string matching to test
65 when two property lists are the same. A better approach
66 would be to first sort the properties before matching.
67 */
68 public class Clean {
69
/** Simple name of this class. */
private static final String NAME = "Clean";

/** Compile-time switch, currently off. */
private static final boolean FIXME = false;

/** Number that the next generated CSS class name will carry; counting starts at 1. */
private int classNum = 1;

/** Tag table supplied by the creator of this cleaner. */
private TagTable tt;

/**
 * Creates a cleaner that resolves tags through the given table.
 *
 * @param tt tag table stored for later tag comparisons and lookups
 */
public Clean(TagTable tt) {
    this.tt = tt;
}
79
80 private StyleProp insertProperty(StyleProp props, String name, String value) {
81 StyleProp first, prev, prop;
82 int cmp;
83 prev = null;
84 first = props;
85 while (props != null) {
86 cmp = props.name.compareTo(name);
87 if (cmp == 0) {
88 /* this property is already defined, ignore new value */
89 return first;
90 }
91 if (cmp > 0) // props.name > name
92 {
93 /* insert before this */
94 prop = new StyleProp(name, value, props);
95 if (prev != null)
96 prev.next = prop;
97 else
98 first = prop;
99 return first;
100 }
101 prev = props;
102 props = props.next;
103 }
104 prop = new StyleProp(name, value);
105 if (prev != null)
106 prev.next = prop;
107 else
108 first = prop;
109 return first;
110 }
111
112 /*
113 Create sorted linked list of properties from style string
114 It temporarily places nulls in place of ':' and ';' to
115 delimit the strings for the property name and value.
116 Some systems don't allow you to null literal strings,
117 so to avoid this, a copy is made first.
118 */
119 private StyleProp createProps(StyleProp prop, String style) {
120 int name_end;
121 int value_end;
122 int value_start = 0;
123 int name_start = 0;
124 boolean more;
125 name_start = 0;
126 while (name_start < style.length()) {
127 while (name_start < style.length() && style.charAt(name_start) == ' ')
128 ++name_start;
129 name_end = name_start;
130 while (name_end < style.length()) {
131 if (style.charAt(name_end) == ':') {
132 value_start = name_end + 1;
133 break;
134 }
135 ++name_end;
136 }
137 if (name_end >= style.length() || style.charAt(name_end) != ':')
138 break;
139 while (value_start < style.length() && style.charAt(value_start) == ' ')
140 ++value_start;
141 value_end = value_start;
142 more = false;
143 while (value_end < style.length()) {
144 if (style.charAt(value_end) == ';') {
145 more = true;
146 break;
147 }
148 ++value_end;
149 }
150 prop =
151 insertProperty(
152 prop,
153 style.substring(name_start, name_end),
154 style.substring(value_start, value_end));
155 if (more) {
156 name_start = value_end + 1;
157 continue;
158 }
159 break;
160 }
161 return prop;
162 }
163
164 private String createPropString(StyleProp props) {
165 String style = "";
166 int len;
167 StyleProp prop;
168 /* compute length */
169 for (len = 0, prop = props; prop != null; prop = prop.next) {
170 len += prop.name.length() + 2;
171 len += prop.value.length() + 2;
172 }
173 for (prop = props; prop != null; prop = prop.next) {
174 style = style.concat(prop.name);
175 style = style.concat(": ");
176 style = style.concat(prop.value);
177 if (prop.next == null)
178 break;
179 style = style.concat("; ");
180 }
181 return style;
182 }
183
184 /*
185 create string with merged properties
186 */
187 private String addProperty(String style, String property) {
188 StyleProp prop;
189 prop = createProps(null, style);
190 prop = createProps(prop, property);
191 style = createPropString(prop);
192 return style;
193 }
194
195 private String gensymClass(String tag) {
196 String str;
197 str = "c" + classNum;
198 classNum++;
199 return str;
200 }
201
202 private String findStyle(Lexer lexer, String tag, String properties) {
203 Style style;
204 for (style = lexer.styles; style != null; style = style.next) {
205 if (style.tag.equals(tag) && style.properties.equals(properties))
206 return style.tagClass;
207 }
208 style = new Style(tag, gensymClass(tag), properties, lexer.styles);
209 lexer.styles = style;
210 return style.tagClass;
211 }
212
213 /*
214 Find style attribute in node, and replace it
215 by corresponding class attribute. Search for
216 class in style dictionary otherwise gensym
217 new class and add to dictionary.
218
219 Assumes that node doesn't have a class attribute
220 */
221 private void style2Rule(Lexer lexer, Node node) {
222 AttVal styleattr, classattr;
223 String classname;
224 styleattr = node.getAttrByName("style");
225 if (styleattr != null) {
226 classname = findStyle(lexer, node.element, styleattr.value);
227 classattr = node.getAttrByName("class");
228 /*
229 if there already is a class attribute
230 then append class name after a space
231 */
232 if (classattr != null) {
233 classattr.value = classattr.value + " " + classname;
234 node.removeAttribute(styleattr);
235 } else /* reuse style attribute for class attribute */ {
236 styleattr.attribute = "class";
237 styleattr.value = classname;
238 }
239 }
240 }
241 private void addColorRule(Lexer lexer, String selector, String color) {
242 if (color != null) {
243 lexer.append(selector);
244 lexer.append(" { color: ");
245 lexer.append(color);
246 lexer.append(" }\n");
247 }
248 }
249 /*
250 move presentation attribs from body to style element
251
252 background="foo" -> body { background-image: url(foo) }
253 bgcolor="foo" -> body { background-color: foo }
254 text="foo" -> body { color: foo }
255 link="foo" -> :link { color: foo }
256 vlink="foo" -> :visited { color: foo }
257 alink="foo" -> :active { color: foo }
258 */
259 private void cleanBodyAttrs(Lexer lexer, Node body) {
260 AttVal attr;
261 String bgurl = null;
262 String bgcolor = null;
263 String color = null;
264 attr = body.getAttrByName("background");
265 if (attr != null) {
266 bgurl = attr.value;
267 attr.value = null;
268 body.removeAttribute(attr);
269 }
270 attr = body.getAttrByName("bgcolor");
271 if (attr != null) {
272 bgcolor = attr.value;
273 attr.value = null;
274 body.removeAttribute(attr);
275 }
276 attr = body.getAttrByName("text");
277 if (attr != null) {
278 color = attr.value;
279 attr.value = null;
280 body.removeAttribute(attr);
281 }
282 if (bgurl != null || bgcolor != null || color != null) {
283 lexer.append(" body {\n");
284 if (bgurl != null) {
285 lexer.append(" background-image: url(");
286 lexer.append(bgurl);
287 lexer.append(");\n");
288 }
289 if (bgcolor != null) {
290 lexer.append(" background-color: ");
291 lexer.append(bgcolor);
292 lexer.append(";\n");
293 }
294 if (color != null) {
295 lexer.append(" color: ");
296 lexer.append(color);
297 lexer.append(";\n");
298 }
299 lexer.append(" }\n");
300 }
301 attr = body.getAttrByName("link");
302 if (attr != null) {
303 addColorRule(lexer, " :link", attr.value);
304 body.removeAttribute(attr);
305 }
306 attr = body.getAttrByName("vlink");
307 if (attr != null) {
308 addColorRule(lexer, " :visited", attr.value);
309 body.removeAttribute(attr);
310 }
311 attr = body.getAttrByName("alink");
312 if (attr != null) {
313 addColorRule(lexer, " :active", attr.value);
314 body.removeAttribute(attr);
315 }
316 }
317 private boolean niceBody(Lexer lexer, Node doc) {
318 Node body = doc.findBody(lexer.configuration.getTagTable());
319 if (body != null) {
320 if (body.getAttrByName("background") != null
321 || body.getAttrByName("bgcolor") != null
322 || body.getAttrByName("text") != null
323 || body.getAttrByName("link") != null
324 || body.getAttrByName("vlink") != null
325 || body.getAttrByName("alink") != null) {
326 lexer.badLayout |= Report.USING_BODY;
327 return false;
328 }
329 }
330 return true;
331 }
332 /* create style element using rules from dictionary */
333 private void createStyleElement(Lexer lexer, Node doc) {
334 Node node, head, body;
335 Style style;
336 AttVal av;
337 if (lexer.styles == null && niceBody(lexer, doc))
338 return;
339 node = lexer.newNode(Node.StartTag, 0, 0, -1, "style");
340 node.implicit = true;
341 /* insert type attribute */
342 av = new AttVal(null, null, '"', "type", "text/css");
343 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
344 node.attributes = av;
345 body = doc.findBody(lexer.configuration.getTagTable());
346 int txtstart = lexer.length();
347 if (body != null)
348 cleanBodyAttrs(lexer, body);
349 for (style = lexer.styles; style != null; style = style.next) {
350 lexer.append(' ');
351 lexer.append(style.tag);
352 lexer.append('.');
353 lexer.append(style.tagClass);
354 lexer.append(' ');
355 lexer.append('{');
356 lexer.append(style.properties);
357 lexer.append('}');
358 lexer.append('\n');
359 }
360 Node.insertNodeAtEnd(node, lexer.newNode(Node.TextNode, txtstart, lexer.length(), -1));
361 /*
362 now insert style element into document head
363
364 doc is root node. search its children for html node
365 the head node should be first child of html node
366 */
367 head = doc.findHEAD(lexer.configuration.getTagTable());
368 if (head != null)
369 Node.insertNodeAtEnd(head, node);
370 }
371 /* ensure bidirectional links are consistent */
372 private void fixNodeLinks(Node node) {
373 Node child;
374 if (node.prev != null)
375 node.prev.next = node;
376 else
377 node.parent.content = node;
378 if (node.next != null)
379 node.next.prev = node;
380 else
381 node.parent.last = node;
382 for (child = node.content; child != null; child = child.next)
383 child.parent = node;
384 }
385 /*
386 used to strip child of node when
387 the node has one and only one child
388 */
389 private void stripOnlyChild(Node node) {
390 Node child;
391 child = node.content;
392 node.content = child.content;
393 node.last = child.last;
394 child.content = null;
395 for (child = node.content; child != null; child = child.next)
396 child.parent = node;
397 }
398
399 /**
400 * Discard the container 'element'.
401 * Used to strip font start and end tags.
402 * @return The next node after 'element' in the new context.
403 */
404 private Node discardContainer(Node element) {
405 Node ret;
406 Node parent = element.parent;
407 if (element.content != null) {
408 // Promote element.content and replace the current element.
409 // <element><content>, ..., <last></element>
410 // to
411 // <element.prev><content>...<last><element.next>
412 element.last.next = element.next;
413 if (element.next != null) {
414 element.next.prev = element.last;
415 element.last.next = element.next;
416 } else
417 parent.last = element.last;
418 if (element.prev != null) {
419 element.content.prev = element.prev;
420 element.prev.next = element.content;
421 } else
422 parent.content = element.content;
423 for (Node node = element.content; node != null; node = node.next)
424 node.parent = parent;
425 ret = element.content;
426 } else {
427 if (element.next != null)
428 element.next.prev = element.prev;
429 else
430 parent.last = element.prev;
431 if (element.prev != null)
432 element.prev.next = element.next;
433 else
434 parent.content = element.next;
435 ret = element.next;
436 }
437 element.next = null;
438 element.content = null;
439 return ret;
440 }
441
442 /**
443 * Merge two consecutive containers, eg. <b>...</b><b>...</b>.
444 * Content of 'next' is appended to the content of 'element' and container 'next' is discarded.
445 * @return The next node to be processed in the new context.
446 */
447 private Node mergeContainers(Node element, Node next) {
448 Node ret;
449 if (element.content == null && next.content == null) {
450 // This should not occurs.
451 // But just in case, remove both element and next.
452 ret = next.next;
453 Node.removeNode(element);
454 Node.removeNode(next);
455 return ret;
456 } else if (element.content != null && next.content == null) {
457 // Should not occur either.
458 ret = element;
459 Node.removeNode(next);
460 } else if (element.content == null && next.content != null) {
461 ret = next;
462 Node.removeNode(element);
463 } else {
464 ret = element;
465 element.last.next = next.content;
466 next.content.prev=element.last;
467 element.last = next.last;
468 for (Node n = next.content; n != null; n = n.next) {
469 n.parent = element;
470 }
471 next.content = null;
472 next.last = null;
473 Node.removeNode(next);
474 }
475 return ret;
476 }
477
478 /*
479 Add style property to element, creating style
480 attribute as needed and adding ; delimiter
481 */
482 private void addStyleProperty(Node node, String property) {
483 AttVal av;
484 for (av = node.attributes; av != null; av = av.next) {
485 if (av.attribute.equals("style"))
486 break;
487 }
488 /* if style attribute already exists then insert property */
489 if (av != null) {
490 String s;
491 s = addProperty(av.value, property);
492 av.value = s;
493 } else /* else create new style attribute */ {
494 av = new AttVal(node.attributes, null, '"', "style", property);
495 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
496 node.attributes = av;
497 }
498 }
499 /*
500 Create new string that consists of the
501 combined style properties in s1 and s2
502
503 To merge property lists, we build a linked
504 list of property/values and insert properties
505 into the list in order, merging values for
506 the same property name.
507 */
508 private String mergeProperties(String s1, String s2) {
509 String s;
510 StyleProp prop;
511 prop = createProps(null, s1);
512 prop = createProps(prop, s2);
513 s = createPropString(prop);
514 return s;
515 }
516 private void mergeStyles(Node node, Node child) {
517 AttVal av;
518 String s1, s2, style;
519 for (s2 = null, av = child.attributes; av != null; av = av.next) {
520 if (av.attribute.equals("style")) {
521 s2 = av.value;
522 break;
523 }
524 }
525 for (s1 = null, av = node.attributes; av != null; av = av.next) {
526 if (av.attribute.equals("style")) {
527 s1 = av.value;
528 break;
529 }
530 }
531 if (s1 != null) {
532 if (s2 != null) /* merge styles from both */ {
533 style = mergeProperties(s1, s2);
534 av.value = style;
535 }
536 } else if (s2 != null) /* copy style of child */ {
537 av = new AttVal(node.attributes, null, '"', "style", s2);
538 av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
539 node.attributes = av;
540 }
541 }
542 private String fontSize2Name(String size) {
543 /*
544 String[] sizes =
545 {
546 "50%",
547 "60%",
548 "80%",
549 null,
550 "120%",
551 "150%",
552 "200%"
553 };
554 */
555 String[] sizes = { "60%", "70%", "80%", null, "120%", "150%", "200%" };
556 String buf;
557 if (size.length() > 0 && '0' <= size.charAt(0) && size.charAt(0) <= '6') {
558 int n = size.charAt(0) - '0';
559 return sizes[n];
560 }
561 if (size.length() > 0 && size.charAt(0) == '-') {
562 if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
563 int n = size.charAt(1) - '0';
564 double x;
565 for (x = 1.0; n > 0; --n)
566 x *= 0.8;
567 x *= 100.0;
568 buf = "" + (int) x + "%";
569 return buf;
570 }
571 return "smaller"; /*"70%"; */
572 }
573 if (size.length() > 1 && '0' <= size.charAt(1) && size.charAt(1) <= '6') {
574 int n = size.charAt(1) - '0';
575 double x;
576 for (x = 1.0; n > 0; --n)
577 x *= 1.2;
578 x *= 100.0;
579 buf = "" + (int) x + "%";
580 return buf;
581 }
582 return "larger"; /* "140%" */
583 }
584 private void addFontFace(Node node, String face) {
585 addStyleProperty(node, "font-family: " + face);
586 }
587 private void addFontSize(Node node, String size) {
588 String value;
589 if (size.equals("6") && node.tag == tt.tagP) {
590 node.element = "h1";
591 tt.findTag(node);
592 return;
593 }
594 if (size.equals("5") && node.tag == tt.tagP) {
595 node.element = "h2";
596 tt.findTag(node);
597 return;
598 }
599 if (size.equals("4") && node.tag == tt.tagP) {
600 node.element = "h3";
601 tt.findTag(node);
602 return;
603 }
604 value = fontSize2Name(size);
605 if (value != null) {
606 addStyleProperty(node, "font-size: " + value);
607 }
608 }
609 private void addFontColor(Node node, String color) {
610 addStyleProperty(node, "color: " + color);
611 }
612 private void addAlign(Node node, String align) {
613 /* force alignment value to lower case */
614 addStyleProperty(node, "text-align: " + align.toLowerCase());
615 }
616 /*
617 add style properties to node corresponding to
618 the font face, size and color attributes
619 */
620 private void addFontStyles(Node node, AttVal av) {
621 while (av != null) {
622 if (av.attribute.equals("face"))
623 addFontFace(node, av.value);
624 else if (av.attribute.equals("size"))
625 addFontSize(node, av.value);
626 else if (av.attribute.equals("color"))
627 addFontColor(node, av.value);
628 av = av.next;
629 }
630 }
631 /*
632 Symptom: <p align=center>
633 Action: <p style="text-align: center">
634 */
635 private void textAlign(Lexer lexer, Node node) {
636 AttVal av, prev;
637 prev = null;
638 for (av = node.attributes; av != null; av = av.next) {
639 if (av.attribute.equals("align")) {
640 if (prev != null)
641 prev.next = av.next;
642 else
643 node.attributes = av.next;
644 if (av.value != null) {
645 addAlign(node, av.value);
646 }
647 break;
648 }
649 prev = av;
650 }
651 }
652 /*
653 The clean up rules use the pnode argument to return the
654 next node when the original node has been deleted
655 */
656 /*
657 Symptom: <dir> <li> where <li> is only child
658 Action: coerce <dir> <li> to <div> with indent.
659 */
660 private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) {
661 Node child;
662 if (node.tag == tt.tagDir || node.tag == tt.tagUl || node.tag == tt.tagOl) {
663 child = node.content;
664 if (child == null)
665 return false;
666 /* check child has no peers */
667 if (child.next != null)
668 return false;
669 if (child.tag != tt.tagLi)
670 return false;
671 if (!child.implicit)
672 return false;
673 /* coerce dir to div */
674 node.tag = tt.tagDiv;
675 node.element = "div";
676 addStyleProperty(node, "margin-left: 2em");
677 stripOnlyChild(node);
678 return true;
679 //#if 0
680 //Node content;
681 //Node last;
682 //content = child.content;
683 //last = child.last;
684 //child.content = null;
685 /* adjust parent and set margin on contents of <li> */
686 //for (child = content; child != null; child = child.next)
687 //{
688 // child.parent = node.parent;
689 // addStyleProperty(child, "margin-left: 1em");
690 //}
691 /* hook first/last into sequence */
692 //if (content != null)
693 //{
694 // content.prev = node.prev;
695 // last.next = node.next;
696 // fixNodeLinks(content);
697 // fixNodeLinks(last);
698 //}
699 //node.next = null;
700 /* ensure that new node is cleaned */
701 //pnode.setObject(cleanNode(lexer, content));
702 //return true;
703 //#endif
704 }
705 return false;
706 }
707 /*
708 Symptom: <center>
709 Action: replace <center> by <div style="text-align: center">
710 */
711 private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) {
712 if (node.tag == tt.tagCenter) {
713 if (lexer.configuration.DropFontTags) {
714 if (node.content != null) {
715 Node last = node.last;
716 Node parent = node.parent;
717 pnode.setObject(discardContainer(node));
718 node = lexer.inferredTag("br");
719 if (last.next != null)
720 last.next.prev = node;
721 node.next = last.next;
722 last.next = node;
723 node.prev = last;
724 if (parent.last == last)
725 parent.last = node;
726 node.parent = parent;
727 } else {
728 Node prev = node.prev;
729 Node next = node.next;
730 Node parent = node.parent;
731 pnode.setObject(discardContainer(node));
732 node = lexer.inferredTag("br");
733 node.next = next;
734 node.prev = prev;
735 node.parent = parent;
736 if (next != null)
737 next.prev = node;
738 else
739 parent.last = node;
740 if (prev != null)
741 prev.next = node;
742 else
743 parent.content = node;
744 }
745 return true;
746 }
747 node.tag = tt.tagDiv;
748 node.element = "div";
749 addStyleProperty(node, "text-align: center");
750 return true;
751 }
752 return false;
753 }
754 /*
755 Symptom <div><div>...</div></div>
756 Action: merge the two divs
757
758 This is useful after nested <dir>s used by Word
759 for indenting have been converted to <div>s
760 */
761 private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) {
762 Node child;
763 if (node.tag != tt.tagDiv)
764 return false;
765 child = node.content;
766 if (child == null)
767 return false;
768 if (child.tag != tt.tagDiv)
769 return false;
770 if (child.next != null)
771 return false;
772 mergeStyles(node, child);
773 stripOnlyChild(node);
774 return true;
775 }
776 /*
777 Symptom: <ul><li><ul>...</ul></li></ul>
778 Action: discard outer list
779 */
780 private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) {
781 Node child, list;
782 if (node.tag == tt.tagUl || node.tag == tt.tagOl) {
783 child = node.content;
784 if (child == null)
785 return false;
786 /* check child has no peers */
787 if (child.next != null)
788 return false;
789 list = child.content;
790 if (list == null)
791 return false;
792 if (list.tag != node.tag)
793 return false;
794 pnode.setObject(node.next);
795 /* move inner list node into position of outer node */
796 list.prev = node.prev;
797 list.next = node.next;
798 list.parent = node.parent;
799 fixNodeLinks(list);
800 /* get rid of outer ul and its li */
801 child.content = null;
802 node.content = null;
803 node.next = null;
804 /*
805 If prev node was a list the chances are this node
806 should be appended to that list. Word has no way of
807 recognizing nested lists and just uses indents
808 */
809 if (list.prev != null) {
810 node = list;
811 list = node.prev;
812 if (list.tag == tt.tagUl || list.tag == tt.tagOl) {
813 list.next = node.next;
814 if (list.next != null)
815 list.next.prev = list;
816 child = list.last; /* <li> */
817 node.parent = child;
818 node.next = null;
819 node.prev = child.last;
820 fixNodeLinks(node);
821 }
822 }
823 cleanNode(lexer, node);
824 return true;
825 }
826 return false;
827 }
828 /*
829 Symptom: the only child of a block-level element is a
830 presentation element such as B, I or FONT
831
832 Action: add style "font-weight: bold" to the block and
833 strip the <b> element, leaving its children.
834
835 example:
836
837 <p>
838 <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
839 </p>
840
841 becomes:
842
843 <p style="font-weight: bold; font-family: Arial; font-size: 6">
844 Draft Recommended Practice
845 </p>
846
847 This code also replaces the align attribute by a style attribute.
848 However, to avoid CSS problems with Navigator 4, this isn't done
849 for the elements: caption, tr and table
850 */
851 private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) {
852 Node child;
853 if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) != 0) {
854 if (node.tag != tt.tagTable && node.tag != tt.tagTr && node.tag != tt.tagLi) {
855 /* check for align attribute */
856 if (node.tag != tt.tagCaption)
857 textAlign(lexer, node);
858 child = node.content;
859 if (child == null)
860 return false;
861 /* check child has no peers */
862 if (child.next != null)
863 return false;
864 if (child.tag == tt.tagB) {
865 mergeStyles(node, child);
866 addStyleProperty(node, "font-weight: bold");
867 stripOnlyChild(node);
868 return true;
869 }
870 if (child.tag == tt.tagI) {
871 mergeStyles(node, child);
872 addStyleProperty(node, "font-style: italic");
873 stripOnlyChild(node);
874 return true;
875 }
876 if (child.tag == tt.tagFont) {
877 mergeStyles(node, child);
878 addFontStyles(node, child.attributes);
879 stripOnlyChild(node);
880 return true;
881 }
882 }
883 }
884 return false;
885 }
886 /* the only child of table cell or an inline element such as em */
887 private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) {
888 Node child;
889 if (node.tag != tt.tagFont && (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) != 0) {
890 child = node.content;
891 if (child == null)
892 return false;
893 /* check child has no peers */
894 if (child.next != null)
895 return false;
896 if (child.tag == tt.tagB && lexer.configuration.LogicalEmphasis) {
897 mergeStyles(node, child);
898 addStyleProperty(node, "font-weight: bold");
899 stripOnlyChild(node);
900 return true;
901 }
902 if (child.tag == tt.tagI && lexer.configuration.LogicalEmphasis) {
903 mergeStyles(node, child);
904 addStyleProperty(node, "font-style: italic");
905 stripOnlyChild(node);
906 return true;
907 }
908 if (child.tag == tt.tagFont) {
909 mergeStyles(node, child);
910 addFontStyles(node, child.attributes);
911 stripOnlyChild(node);
912 return true;
913 }
914 }
915 return false;
916 }
917 /*
918 Replace font elements by span elements, deleting
919 the font element's attributes and replacing them
920 by a single style attribute.
921 */
922 private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) {
923 AttVal av, style, next;
924 if (node.tag == tt.tagFont) {
925 if (lexer.configuration.DropFontTags) {
926 pnode.setObject(discardContainer(node));
927 return false;
928 }
929 /* if FONT is only child of parent element then leave alone */
930 if (node.parent.content == node && node.next == null)
931 return false;
932 addFontStyles(node, node.attributes);
933 /* extract style attribute and free the rest */
934 av = node.attributes;
935 style = null;
936 while (av != null) {
937 next = av.next;
938 if (av.attribute.equals("style")) {
939 av.next = null;
940 style = av;
941 }
942 av = next;
943 }
944 node.attributes = style;
945 node.tag = tt.tagSpan;
946 node.element = "span";
947 return true;
948 }
949 return false;
950 }
951 /*
952 Applies all matching rules to a node.
953 */
954 private Node cleanNode(Lexer lexer, Node node) {
955 Node next = null;
956 MutableObject o = new MutableObject();
957 boolean b = false;
958 for (next = node; node.isElement(); node = next) {
959 o.setObject(next);
960 b = dir2Div(lexer, node, o);
961 next = (Node) o.getObject();
962 if (b)
963 continue;
964 b = nestedList(lexer, node, o);
965 next = (Node) o.getObject();
966 if (b)
967 continue;
968 b = center2Div(lexer, node, o);
969 next = (Node) o.getObject();
970 if (b)
971 continue;
972 b = mergeDivs(lexer, node, o);
973 next = (Node) o.getObject();
974 if (b)
975 continue;
976 b = blockStyle(lexer, node, o);
977 next = (Node) o.getObject();
978 if (b)
979 continue;
980 b = inlineStyle(lexer, node, o);
981 next = (Node) o.getObject();
982 if (b)
983 continue;
984 b = font2Span(lexer, node, o);
985 next = (Node) o.getObject();
986 if (b)
987 continue;
988 break;
989 }
990 return next;
991 }
992 private Node createStyleProperties(Lexer lexer, Node node) {
993 Node child;
994 if (node.content != null) {
995 for (child = node.content; child != null; child = child.next) {
996 child = createStyleProperties(lexer, child);
997 }
998 }
999 return cleanNode(lexer, node);
1000 }
1001 private void defineStyleRules(Lexer lexer, Node node) {
1002 Node child;
1003 if (node.content != null) {
1004 for (child = node.content; child != null; child = child.next) {
1005 defineStyleRules(lexer, child);
1006 }
1007 }
1008 style2Rule(lexer, node);
1009 }
1010 public void cleanTree(Lexer lexer, Node doc) {
1011 doc = createStyleProperties(lexer, doc);
1012 if (!lexer.configuration.MakeClean) {
1013 defineStyleRules(lexer, doc);
1014 createStyleElement(lexer, doc);
1015 }
1016 }
1017
1018 /**
1019 * Simplifies <b><b> ... </b> ...</b> etc.
1020 * Also remove adjacent blocks eg. ... </i><i> ... etc.
1021 */
1022 public void nestedEmphasis(Node node) {
1023 Node next;
1024 while (node != null) {
1025 next = node.next;
1026 if ((node.tag == tt.tagB
1027 || node.tag == tt.tagI
1028 || node.tag == tt.tagEm
1029 || node.tag == tt.tagStrong)) {
1030 if (node.parent != null && node.parent.tag == node.tag) {
1031 /* strip redundant inner element */
1032 node = discardContainer(node);
1033 continue;
1034 }
1035 // <b>... </b><b> ... </b> case.
1036 // Note: Empty <b></b> should have be eliminated during parsing.
1037 if (next != null && next.tag == node.tag) {
1038 if (true)
1039 System.err.println(
1040 NAME + ".nestedEmphasis(): node=" + node + "\n\tnext=" + next);
1041 node = mergeContainers(node, next);
1042 continue;
1043 }
1044 }
1045 if (node.content != null) {
1046 nestedEmphasis(node.content);
1047 }
1048 node = next;
1049 }
1050 }
1051
1052 /* replace i by em and b by strong */
1053 public void emFromI(Node node) {
1054 while (node != null) {
1055 if (node.tag == tt.tagI) {
1056 node.element = tt.tagEm.name;
1057 node.tag = tt.tagEm;
1058 } else if (node.tag == tt.tagB) {
1059 node.element = tt.tagStrong.name;
1060 node.tag = tt.tagStrong;
1061 }
1062 if (node.content != null)
1063 emFromI(node.content);
1064 node = node.next;
1065 }
1066 }
1067 /*
1068 Some people use dir or ul without an li
1069 to indent the content. The pattern to
1070 look for is a list with a single implicit
1071 li. This is recursively replaced by an
1072 implicit blockquote.
1073 */
1074 public void list2BQ(Node node) {
1075 while (node != null) {
1076 if (node.content != null)
1077 list2BQ(node.content);
1078 if (node.tag != null
1079 && node.tag.parser == ParserImpl.getParseList()
1080 && node.hasOneChild()
1081 && node.content.implicit) {
1082 stripOnlyChild(node);
1083 node.element = tt.tagBlockquote.name;
1084 node.tag = tt.tagBlockquote;
1085 node.implicit = true;
1086 }
1087 node = node.next;
1088 }
1089 }
1090 /*
1091 Replace implicit blockquote by div with an indent
1092 taking care to reduce nested blockquotes to a single
1093 div with the indent set to match the nesting depth
1094 */
1095 public void bQ2Div(Node node) {
1096 int indent;
1097 String indent_buf;
1098 while (node != null) {
1099 if (node.tag == tt.tagBlockquote && node.implicit) {
1100 indent = 1;
1101 while (node.hasOneChild() && node.content.tag == tt.tagBlockquote && node.implicit) {
1102 ++indent;
1103 stripOnlyChild(node);
1104 }
1105 if (node.content != null)
1106 bQ2Div(node.content);
1107 indent_buf = "margin-left: " + (new Integer(2 * indent)).toString() + "em";
1108 node.element = tt.tagDiv.name;
1109 node.tag = tt.tagDiv;
1110 node.addAttribute("style", indent_buf);
1111 } else if (node.content != null)
1112 bQ2Div(node.content);
1113 node = node.next;
1114 }
1115 }
1116 /* node is <![if ...]> prune up to <![endif]> */
1117 public Node pruneSection(Lexer lexer, Node node) {
1118 for (;;) {
1119 /* discard node and returns next */
1120 node = Node.discardElement(node);
1121 if (node == null)
1122 return null;
1123 if (node.type == Node.SectionTag) {
1124 if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
1125 node = pruneSection(lexer, node);
1126 continue;
1127 }
1128 if ((Lexer.getString(node.textarray, node.start, 5)).equals("endif")) {
1129 node = Node.discardElement(node);
1130 break;
1131 }
1132 }
1133 }
1134 return node;
1135 }
1136 public void dropSections(Lexer lexer, Node node) {
1137 while (node != null) {
1138 if (node.type == Node.SectionTag) {
1139 /* prune up to matching endif */
1140 if ((Lexer.getString(node.textarray, node.start, 2)).equals("if")) {
1141 node = pruneSection(lexer, node);
1142 continue;
1143 }
1144 /* discard others as well */
1145 node = Node.discardElement(node);
1146 continue;
1147 }
1148 if (node.content != null)
1149 dropSections(lexer, node.content);
1150 node = node.next;
1151 }
1152 }
1153 public void purgeAttributes(Node node) {
1154 AttVal attr = node.attributes;
1155 AttVal next = null;
1156 AttVal prev = null;
1157 while (attr != null) {
1158 next = attr.next;
1159 /* special check for class="Code" denoting pre text */
1160 if (attr.attribute != null
1161 && attr.value != null
1162 && attr.attribute.equals("class")
1163 && attr.value.equals("Code")) {
1164 prev = attr;
1165 } else if (
1166 attr.attribute != null
1167 && (attr.attribute.equals("class")
1168 || attr.attribute.equals("style")
1169 || attr.attribute.equals("lang")
1170 || attr.attribute.startsWith("x:")
1171 || ((attr.attribute.equals("height") || attr.attribute.equals("width"))
1172 && (node.tag == tt.tagTd
1173 || node.tag == tt.tagTr
1174 || node.tag == tt.tagTh)))) {
1175 if (prev != null)
1176 prev.next = next;
1177 else
1178 node.attributes = next;
1179 } else
1180 prev = attr;
1181 attr = next;
1182 }
1183 }
1184 /* Word2000 uses span excessively, so we strip span out */
1185 public Node stripSpan(Lexer lexer, Node span) {
1186 Node node;
1187 Node prev = null;
1188 Node content;
1189 /*
1190 deal with span elements that have content
1191 by splicing the content in place of the span
1192 after having processed it
1193 */
1194 cleanWord2000(lexer, span.content);
1195 content = span.content;
1196 if (span.prev != null)
1197 prev = span.prev;
1198 else if (content != null) {
1199 node = content;
1200 content = content.next;
1201 Node.removeNode(node);
1202 Node.insertNodeBeforeElement(span, node);
1203 prev = node;
1204 }
1205 while (content != null) {
1206 node = content;
1207 content = content.next;
1208 Node.removeNode(node);
1209 Node.insertNodeAfterElement(prev, node);
1210 prev = node;
1211 }
1212 if (span.next == null)
1213 span.parent.last = prev;
1214 node = span.next;
1215 span.content = null;
1216 Node.discardElement(span);
1217 return node;
1218 }
1219 /* map non-breaking spaces to regular spaces */
1220 private void normalizeSpaces(Lexer lexer, Node node) {
1221 while (node != null) {
1222 if (node.content != null)
1223 normalizeSpaces(lexer, node.content);
1224 if (node.type == Node.TextNode) {
1225 char c;
1226 for (int i = node.start; i < node.end; ++i) {
1227 c = node.textarray.charAt(i);
1228 if (c == 160) {
1229 node.textarray.setCharAt(i, ' ');
1230 }
1231 }
1232 }
1233 node = node.next;
1234 }
1235 }
1236 /*
1237 This is a major clean up to strip out all the extra stuff you get
1238 when you save as web page from Word 2000. It doesn't yet know what
1239 to do with VML tags, but these will appear as errors unless you
1240 declare them as new tags, such as o:p which needs to be declared
1241 as inline.
1242 */
1243 public void cleanWord2000(Lexer lexer, Node node) {
1244 /* used to a list from a sequence of bulletted p's */
1245 Node list = null;
1246 while (node != null) {
1247 /* discard Word's style verbiage */
1248 if (node.tag == tt.tagStyle || node.tag == tt.tagMeta || node.type == Node.CommentTag) {
1249 node = Node.discardElement(node);
1250 continue;
1251 }
1252 /* strip out all span tags Word scatters so liberally! */
1253 if (node.tag == tt.tagSpan) {
1254 node = stripSpan(lexer, node);
1255 continue;
1256 }
1257 /* get rid of Word's xmlns attributes */
1258 if (node.tag == tt.tagHtml) {
1259 /* check that it's a Word 2000 document */
1260 if (node.getAttrByName("xmlns:o") == null)
1261 return;
1262 }
1263 if (node.tag == tt.tagLink) {
1264 AttVal attr = node.getAttrByName("rel");
1265 if (attr != null && attr.value != null && attr.value.equals("File-List")) {
1266 node = Node.discardElement(node);
1267 continue;
1268 }
1269 }
1270 /* discard empty paragraphs */
1271 if (node.content == null && node.tag == tt.tagP) {
1272 node = Node.discardElement(node);
1273 continue;
1274 }
1275 if (node.tag == tt.tagP) {
1276 AttVal attr = node.getAttrByName("class");
1277 /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
1278 if (attr != null && attr.value != null && attr.value.equals("MsoListBullet")) {
1279 Node.coerceNode(lexer, node, tt.tagLi);
1280 if (list == null || list.tag != tt.tagUl) {
1281 list = lexer.inferredTag("ul");
1282 Node.insertNodeBeforeElement(node, list);
1283 }
1284 purgeAttributes(node);
1285 if (node.content != null)
1286 cleanWord2000(lexer, node.content);
1287 /* remove node and append to contents of list */
1288 Node.removeNode(node);
1289 Node.insertNodeAtEnd(list, node);
1290 node = list.next;
1291 }
1292 /* map sequence of <p class="Code"> to <pre>...</pre> */
1293 else if (attr != null && attr.value != null && attr.value.equals("Code")) {
1294 Node br = lexer.newLineNode();
1295 normalizeSpaces(lexer, node);
1296 if (list == null || list.tag != tt.tagPre) {
1297 list = lexer.inferredTag("pre");
1298 Node.insertNodeBeforeElement(node, list);
1299 }
1300 /* remove node and append to contents of list */
1301 Node.removeNode(node);
1302 Node.insertNodeAtEnd(list, node);
1303 stripSpan(lexer, node);
1304 Node.insertNodeAtEnd(list, br);
1305 node = list.next;
1306 } else
1307 list = null;
1308 } else
1309 list = null;
1310 /* strip out style and class attributes */
1311 if (node.type == Node.StartTag || node.type == Node.StartEndTag)
1312 purgeAttributes(node);
1313 if (node.content != null)
1314 cleanWord2000(lexer, node.content);
1315 node = node.next;
1316 }
1317 }
1318 public boolean isWord2000(Node root, TagTable tt) {
1319 Node html = root.findHTML(tt);
1320 return (html != null && html.getAttrByName("xmlns:o") != null);
1321 }
1322}