Source code: diffxml/fmes/Match.java
1 /*
2 Class to generate Matchings for FMES algorithm
3
4 Copyright (C) 2002 Adrian Mouat
5
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License
8 as published by the Free Software Foundation; either version 2
9 of the License, or (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19
20 Author: Adrian Mouat
21 email: amouat@postmaster.co.uk
22 */
23
24 package diffxml.fmes;
25
26 //Code to solve "Good Matchings Problem"
27 //Algorithm Fast Match
28 //21/02/02
29
30 import diffxml.*;
31 import java.io.File;
32 import java.io.FileWriter;
33 import java.io.IOException;
34 import java.io.OutputStreamWriter;
35 //import java.io.Writer;
36 import java.util.TreeSet;
37 import java.util.Comparator;
38 import java.util.Iterator;
39 //import java.io.*;
40 import java.util.StringTokenizer;
41
42 import org.apache.xerces.dom.NodeImpl;
43 import org.apache.xerces.parsers.DOMParser;
44 import org.apache.xerces.dom3.Node3;
45
46 import org.w3c.dom.Attr;
47 import org.w3c.dom.Document;
48 import org.w3c.dom.NamedNodeMap;
49 import org.w3c.dom.Node;
50 import org.w3c.dom.NodeList;
51 import org.w3c.dom.NamedNodeMap;
52
53 import org.w3c.dom.traversal.*;
54 import org.xml.sax.InputSource;
55 import org.xml.sax.Locator;
56 import org.xml.sax.SAXException;
57 import org.xml.sax.helpers.*;
58 import org.xml.sax.SAXParseException;
59 import java.io.UnsupportedEncodingException;
60
61 public class Match
62 {
63
64 public boolean equal(Node a, Node b)
65 {
66 //Currently returns false but should throw exception
67
68 if (a.getNodeType()!=b.getNodeType())
69 {
70 System.out.println("Types not equal");
71 return false;
72 }
73
74
75 //if Node is an element
76 if (a.getNodeType()==Node.ELEMENT_NODE)
77 {
78 //Check NodeNames equal
79 if (a.getNodeName()!=b.getNodeName())
80 return false;
81
82 //Check attributes equal
83 NamedNodeMap attrs_a=a.getAttributes();
84 NamedNodeMap attrs_b=b.getAttributes();
85
86 int a_num = (attrs_a != null) ? attrs_a.getLength() : 0;
87 for (int i=0; i<a_num; i++)
88 {
89 //Check if attr exists in other tag
90 if (attrs_b.getNamedItem(attrs_a.item(i).getNodeName())==null)
91 return false;
92
93 if (! attrs_b.getNamedItem(attrs_a.item(i).getNodeName()).getNodeValue().equals
94 (attrs_a.item(i).getNodeValue()) )
95 {
96 //System.out.println("Equal: attributes not equal");
97 return false;
98 }
99 }
100 //Should probably compare positions of elements, or if kids matched or sumink
101 return true;
102 }
103
104 //If node is a text node
105 if (a.getNodeType()==Node.TEXT_NODE)
106 {
107 //Need to check whitespace and case options
108 String a_str=a.getNodeValue();
109 String b_str=b.getNodeValue();
110
111 if (Table.ign_all_ws)
112 {
113 //Ignore all whitespace
114 //Remove whitespace from nodes before comparison
115 StringTokenizer st = new StringTokenizer(a_str);
116 a_str="";
117 while (st.hasMoreTokens())
118 a_str=a_str+st.nextToken();
119
120 st = new StringTokenizer(b_str);
121 b_str="";
122 while (st.hasMoreTokens())
123 b_str=b_str+st.nextToken();
124 }
125 else if (Table.ign_ldn_ws)
126 {
127 //Ignore leading ws
128 //just call trim
129 a_str=a_str.trim();
130 b_str=b_str.trim();
131 }
132
133 //Check case optn
134
135 if (Table.ign_case)
136 {
137 //Just make it all lower
138 a_str.toLowerCase();
139 b_str.toLowerCase();
140 }
141
142 return (a_str.equals(b_str));
143 }
144
145 //Node is not a text node or element, so just compare value and return.
146
147 //System.out.println("Equal: elements equal");
148 return (a.getNodeValue().equals(b.getNodeValue()));
149 }
150
151 class tdComp implements Comparator
152 {
153 //Remember we want things stored in reverse order of depth!
154 //We don't really care about order of strings but we need to
155 //differentiate for a set
156
157 public int compare(Object o1, Object o2)
158 {
159 tagDepth td1 = (tagDepth)(o1);
160 tagDepth td2 = (tagDepth)(o2);
161
162 if (td1.depth==td2.depth)
163 {
164 return (td1.tag.compareTo(td2.tag));
165 }
166 else
167 {
168 return (td2.depth-td1.depth);
169 }
170 }
171
172 public boolean equals(Object o)
173 {
174 return o.equals(this);
175 }
176
177 }
178
179 public NodeSet test(Document doc1, Document doc2)
180 throws Exception {
181
182 NodeSet match_set = new NodeSet();
183
184 //Normalise documents
185 doc1.getDocumentElement().normalize();
186 doc2.getDocumentElement().normalize();
187
188 TreeSet td1 = markElements(doc1);
189 TreeSet td2 = markElements(doc2);
190
191
192 String wanted="";
193 tagDepth tg= new tagDepth();
194
195 //Iterate for nodes in Tree1
196 Iterator it=td1.iterator();
197 while (it.hasNext())
198 {
199 tg=(tagDepth) it.next();
200
201 wanted=tg.tag;
202
203 db.p("Wanted Node: " + wanted);
204 //Get all nodes in both trees with this tag
205 if (wanted.equals("#text"))
206 {
207 //Use node iterator
208 //Should really be bottom up, but shouldn't make big diff
209 db.p("Matching text nodes");
210 NodeIterator ni1 = ((DocumentTraversal)doc1).createNodeIterator(doc1.getDocumentElement(), NodeFilter.SHOW_TEXT, null, false);
211 NodeIterator ni2 = ((DocumentTraversal)doc2).createNodeIterator(doc2.getDocumentElement(), NodeFilter.SHOW_TEXT,null, false);
212 NodeImpl na= (NodeImpl) ni1.nextNode();
213 NodeImpl nb= (NodeImpl) ni2.nextNode();
214 while(na!=null)
215 {
216 //Should always be false but leave check in for mo
217 if (na.getUserData("matched").equals("false"))
218 {
219 while(nb!=null)
220 {
221 if (nb.getUserData("matched").equals("false") && equal(na,nb) )
222 {
223 //Add nodes to matching set
224 //Node3 unq1 = na;
225 //Node3 unq2 = nb;
226 match_set.add(na,nb);
227 //Mark nodes matched
228 na.setUserData("matched","true",null);
229 nb.setUserData("matched","true",null);
230
231 break;
232 }
233 nb=(NodeImpl) ni2.nextNode();
234 }
235 }
236 na=(NodeImpl) ni1.nextNode();
237 ni2.detach();
238 ni2 = ((DocumentTraversal)doc2).createNodeIterator(doc2.getDocumentElement(), NodeFilter.SHOW_TEXT,null, false);
239 nb=(NodeImpl) ni2.nextNode();
240 }
241
242
243 }
244 else if (wanted.equals("#comment"))
245 {
246 //Use node iterator
247 //Should really be bottom up, but shouldn't make big diff
248 db.p("Matching comments");
249 NodeIterator ni1 = ((DocumentTraversal)doc1).createNodeIterator(doc1.getDocumentElement(), NodeFilter.SHOW_COMMENT, null, false);
250 NodeIterator ni2 = ((DocumentTraversal)doc2).createNodeIterator(doc2.getDocumentElement(), NodeFilter.SHOW_COMMENT,null, false);
251 NodeImpl na= (NodeImpl) ni1.nextNode();
252 NodeImpl nb= (NodeImpl) ni2.nextNode();
253 while(na!=null)
254 {
255 //Should always be false but leave check in for mo
256 if (na.getUserData("matched").equals("false"))
257 {
258 while(nb!=null)
259 {
260 if (nb.getUserData("matched").equals("false") && equal(na,nb) )
261 {
262 match_set.add(na,nb);
263 na.setUserData("matched","true",null);
264 nb.setUserData("matched","true",null);
265
266 break;
267 }
268 nb=(NodeImpl) ni2.nextNode();
269 }
270 }
271 na=(NodeImpl) ni1.nextNode();
272 ni2.detach();
273 ni2 = ((DocumentTraversal)doc2).createNodeIterator(doc2.getDocumentElement(), NodeFilter.SHOW_TEXT,null, false);
274 nb=(NodeImpl) ni2.nextNode();
275 }
276 }
277 else {
278 //System.out.println("ENTERED!!!");
279 NodeList tg1=doc1.getElementsByTagName(wanted);
280 NodeList tg2=doc2.getElementsByTagName(wanted);
281
282 //Cycle through tg1 looking for matches in tg2
283
284 for(int a=0; a<tg1.getLength(); a++)
285 {
286 NodeImpl a_node= (NodeImpl) tg1.item(a);
287 if (a_node.getUserData("matched").equals("false"))
288 {
289 //Cycle through tg2 looking for match
290 //tg_tag:
291 for (int b=0; b<tg2.getLength(); b++)
292 {
293 NodeImpl b_node= (NodeImpl) tg2.item(b);
294 if (b_node.getUserData("matched").equals("false") && equal(tg1.item(a), tg2.item(b)))
295 {
296 //Add nodes to matching set
297 match_set.add(tg1.item(a), tg2.item(b));
298
299 //mark nodes matched
300 a_node.setUserData("matched","true",null);
301 b_node.setUserData("matched","true",null);
302
303 //Don't think this statement did nowt
304 //continue tg_tag;
305 break;
306 }
307 }
308 }
309 }
310 }
311 }
312
313
314 //Output matched nodes
315 if (db.on==true)
316 match_set.print_set();
317
318 return match_set;
319
320 }
321
322 public TreeSet markElements(Document doc)
323 {
324 //Maybe want to set to SHOW_ELEMENT and do other elements separately
325 TreeWalker t = ((DocumentTraversal)doc)
326 .createTreeWalker(doc.getDocumentElement(),NodeFilter.SHOW_ALL,null,false);
327
328 NodeImpl n;
329 int depth;
330 String tmptag;
331 tagDepth tmp2 = new tagDepth();
332
333 TreeSet td = new TreeSet(new tdComp());
334
335 //Add root node (should change loop to include)
336 n= (NodeImpl) t.getCurrentNode();
337 n.setUserData("matched","false",null);
338 n.setUserData("inorder","false",null);
339 tmp2.tag=n.getNodeName();
340 tmp2.depth=0;
341 td.add(tmp2);
342 db.p("Added " + tmp2.tag + " Depth " + 0);
343
344 while ( (n=(NodeImpl) t.nextNode()) != null)
345 {
346 n.setUserData("matched","false",null);
347 //Let children default to be "out of order"
348 //Test with "inorder"
349 n.setUserData("inorder","true",null);
350
351 //Get iterator for TreeSet
352 Iterator it = td.iterator();
353
354 //Add to set
355 tagDepth tmp = new tagDepth();
356 tmp.tag=n.getNodeName();
357
358 //Bit more trouble to get depth
359 depth=1;
360 n=(NodeImpl) n.getParentNode();
361 while(n!=doc.getDocumentElement() && n!=null)
362 {
363 depth++;
364 n=(NodeImpl) n.getParentNode();
365 }
366
367 tmp.depth=depth;
368 td.add(tmp);
369 db.p("Added " + tmp.tag + " Depth " + depth);
370 //}
371 }
372 return td;
373 }
374
375 public int findLeaves(Document doc, String[] leaves)
376 {
377 TreeWalker t = ((DocumentTraversal)doc)
378 .createTreeWalker(doc.getDocumentElement(),NodeFilter.SHOW_ELEMENT,new LeafFilter(),false);
379 //Node[] leaves = new Node[100];
380 Node n;
381 int i=0;
382 while ( (n=t.nextNode() ) != null)
383 {//Note that only one occurence of any element in NamedNodeMap
384 leaves[i]=n.getNodeName();
385 for (int x=0;x<i;x++)
386 {
387 if (leaves[x].equals(leaves[i]))
388 {
389 i--;
390 break;
391 }
392 }
393 i++;
394 }
395
396 return i;
397
398 }
399
400
401 class LeafFilter implements NodeFilter
402 {
403 //Consider leaf nodes to be elements
404 //with no children other than text nodes
405 //Then the text nodes represent the value
406 //of the element
407
408 public short acceptNode(Node n)
409 {
410 //System.out.println("name " + n.getNodeName());
411 if (n.hasChildNodes()==false)
412 return FILTER_ACCEPT;
413
414 NodeList nodes = n.getChildNodes();
415 for (int i=0; i<nodes.getLength(); i++)
416 {
417 if (nodes.item(i).getNodeType()!=Node.TEXT_NODE)
418 {
419 return FILTER_SKIP;
420 }
421
422 }
423 return FILTER_ACCEPT;
424 }
425 }
426
427 /*
428 public static void main(String[] args)
429 {
430
431 if (args.length !=2)
432 System.exit(0);
433
434
435 try
436 {
437 //db.on=true;
438 DOMParser parser1 = new DOMParser();
439 DOMParser parser2 = new DOMParser();
440
441 //Turn off entity resolving
442 try {
443 parser1.setFeature("http://xml.org/sax/features/external-general-entities",
444 false);
445 parser2.setFeature("http://xml.org/sax/features/external-general-entities",
446 false);
447 parser1.setFeature("http://xml.org/sax/features/external-parameter-entities",
448 false);
449 parser2.setFeature("http://xml.org/sax/features/external-parameter-entities",
450 false);
451 parser1.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
452 false);
453 parser2.setFeature("http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
454 false);
455 parser1.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd",
456 false);
457 parser2.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd",
458 false);
459 parser1.setFeature("http://apache.org/xml/features/dom/create-entity-ref-nodes",
460 false);
461 parser2.setFeature("http://apache.org/xml/features/dom/create-entity-ref-nodes",
462 false);
463 }
464 catch (SAXException e) {
465 System.err.println("could not set parser feature");}
466
467 //Ignore whitespace nodes
468 Table.ign_ws_nodes=true;
469
470 parser1.parse(args[0]);
471 Document doc1=parser1.getDocument();
472 parser2.parse(args[1]);
473 Document doc2=parser2.getDocument();
474 Match match=new Match();
475 NodeSet matchings = match.test(doc1, doc2);
476 //db.on=false;
477 db.p("Creating Edit Script");
478 Document delta= EditScript.create(doc1, doc2, matchings);
479 db.p("Created Edit Script");
480 Writer writer = new Writer();
481 try {
482 writer.setOutput(System.out, "UTF8");
483 }
484 catch (UnsupportedEncodingException e) {
485 System.err.println("error: Unable to set output. Exiting.");
486 System.exit(1);
487 }
488 writer.setCanonical(false);
489 writer.write(delta);
490
491 } catch (Exception e)
492 {
493 e.printStackTrace();
494 }
495
496 }
497 */
498 }
499