Source code: com/memoire/xml/XmlParser.java
1 /**
2 * @modification 2001-10-25
3 * @statut unstable
4 * @file XmlParser.java
5 * @version 0.04
6 * @author Guillaume Desnoix
7 * @email guillaume@desnoix.com
8 * @license GNU General Public License 2 (GPL2)
9 * @copyright 1998-2001 Guillaume Desnoix
10 */
11
12 package com.memoire.xml;
13 import com.memoire.xml.*;
14
15 import java.io.*;
16 import java.util.*;
17
18 public class XmlParser
19 implements XmlListener
20 {
21 public static final int XML =0;
22 public static final int HTML =1;
23 public static final int TXT =2;
24
25 private Reader in_;
26 private char[] buf_;
27 private int nbc_;
28 private int lineno_,charno_;
29 private String origin_;
30 private int mode_;
31 private XmlListener listener_;
32
33 public XmlParser(String _origin) throws IOException
34 {
35 this(new FileReader(_origin),_origin,XML);
36 }
37
38 public XmlParser(String _origin,int _mode) throws IOException
39 {
40 this(new FileReader(_origin),_origin,_mode);
41 }
42
43 public XmlParser(File _origin) throws IOException
44 {
45 this(new FileReader(_origin),_origin.getName(),XML);
46 }
47
48 public XmlParser(File _origin, int _mode) throws IOException
49 {
50 this(new FileReader(_origin),_origin.getName(),_mode);
51 }
52
53 public XmlParser(Reader _in, String _origin)
54 {
55 this(_in,_origin,XML);
56 }
57
58 public XmlParser(Reader _in, String _origin, int _mode)
59 {
60 in_ =_in;
61 buf_ =new char[1024];
62 nbc_ =0;
63 origin_ =_origin;
64 lineno_ =1;
65 charno_ =1;
66 mode_ =_mode;
67 listener_=this;
68 }
69
70 protected static final String replace(String _s, String _a, String _b)
71 {
72 String r=_s;
73 int i=0;
74 while((i=r.indexOf(_a,i))>=0)
75 {
76 r=r.substring(0,i)+_b+r.substring(i+_a.length());
77 i=i+_b.length();
78 }
79 return r;
80 }
81
82 private boolean ready() throws IOException
83 {
84 while(in_.ready()&&isSpace(peekC())) readC();
85 return in_.ready();
86 }
87
88 public int getMode()
89 {
90 return mode_;
91 }
92
93 public void setMode(int _mode)
94 {
95 mode_=_mode;
96 }
97
98 public String origin() { return origin_; }
99 public int lineno() { return lineno_; }
100 public int charno() { return charno_; }
101
102 private char readC() throws IOException
103 {
104 char r;
105
106 if(nbc_>0)
107 {
108 nbc_--;
109 r=buf_[nbc_];
110 }
111 else
112 {
113 int c=in_.read();
114 if(c==-1) throw new EOFException();
115 r=(char)c;
116 if(r=='\n') { charno_=1; lineno_++; }
117 else charno_++;
118 }
119
120 return r;
121 }
122
123 private char peekC()
124 {
125 char r;
126
127 try { r=readC(); putC(r); }
128 catch(IOException ex) { r=(char)-1; }
129
130 return r;
131 }
132
133 private void putC(char _c)
134 {
135 buf_[nbc_]=_c;
136 nbc_++;
137 }
138
139 private static final boolean isSpace(char c)
140 {
141 return Character.isWhitespace(c);
142 }
143
144 private static final boolean isLetterOrDigit(char c)
145 {
146 return Character.isLetterOrDigit(c);
147 }
148
149 private int tag=0;
150
151 private String readToken0() throws IOException
152 {
153 char c;
154 boolean espaces=false;
155 while(isSpace(c=readC())) espaces=true;
156
157 StringBuffer sb=new StringBuffer(10);
158
159 if(getMode()==TXT)
160 {
161 sb.append(c);
162 if(isLetterOrDigit(c))
163 {
164 while(isLetterOrDigit(c=peekC())||(c=='-')||(c=='_'))
165 sb.append(readC());
166 if(c=='@')
167 {
168 sb.append(readC());
169 while(isLetterOrDigit(c=peekC())||(c=='.')||(c=='-')||(c=='_'))
170 sb.append(readC());
171 }
172 else
173 if((c==':')&&
174 ( "http".equals(sb.toString())
175 ||"mailto".equals(sb.toString())
176 ||"ftp".equals(sb.toString())))
177 while(!isSpace(c=peekC()))
178 sb.append(readC());
179 }
180 }
181 else
182 if((tag==0)&&(c=='<'))
183 {
184 sb.append(c);
185 if(peekC()=='!')
186 {
187 sb.append(readC());
188 char t=readC();
189 if((t=='-')&&(peekC()=='-'))
190 {
191 putC(t);
192 do { sb.append(readC()); }
193 while(!sb.toString().endsWith("-->"));
194 tag--;
195 }
196 else putC(t);
197 }
198 else
199 if(peekC()=='?')
200 sb.append(readC());
201 else
202 if(peekC()=='/')
203 sb.append(readC());
204 tag++;
205 }
206 else
207 if((tag!=0)&&(c=='>'))
208 {
209 sb.append(c);
210 tag--;
211 }
212 else
213 if((tag!=0)&&((c=='?')||(c=='/'))&&(peekC()=='>'))
214 {
215 sb.append(c);
216 sb.append(readC());
217 tag--;
218 }
219 else
220 if(c=='&')
221 {
222 sb.append(c);
223 do { c=readC(); sb.append(c); } while(c!=';');
224 }
225 else
226 if((tag!=0)&&(c=='"'))
227 {
228 sb.append(c);
229 do { c=readC(); sb.append(c); } while(c!='"');
230 }
231 else
232 if((tag!=0)&&(c=='\''))
233 {
234 sb.append(c);
235 do { c=readC(); sb.append(c); } while(c!='\'');
236 }
237 else
238 if((tag!=0)&&(c=='['))
239 {
240 sb.append(c);
241 do { c=readC(); sb.append(c); } while(c!=']');
242 }
243 else
244 if((tag!=0)&&isLetterOrDigit(c))
245 {
246 sb.append(c);
247 while(isLetterOrDigit(c=peekC())
248 ||(c=='.')||(c=='-')||(c=='_')||(c==':'))
249 { sb.append(readC()); }
250
251 if((getMode()==HTML)&&(peekC()=='%'))
252 { sb.append(readC()); }
253 }
254 else
255 if(tag!=0)
256 {
257 sb.append(c);
258
259 if((getMode()==HTML)
260 &&((c=='#')||(c=='+')||(c=='-')))
261 {
262 while(isLetterOrDigit(c=peekC())
263 ||(c=='.')||(c=='-')||(c=='_')||(c==':'))
264 { sb.append(readC()); }
265 }
266 }
267 else
268 {
269 if((getMode()==HTML)&&espaces) sb.append(' ');
270 sb.append(c);
271 while(((c=peekC())!='<')&&(c!=-1)) sb.append(readC());
272 }
273
274 //System.err.println("::"+tag+" "+sb);
275 return sb.toString();
276 }
277
278 private Stack pile_=new Stack();
279
280 private String readToken() throws IOException
281 {
282 String r;
283
284 do
285 {
286 if(!pile_.empty()) r=(String)pile_.pop();
287 else r=readToken0();
288 }
289 while(r.startsWith("<!--"));
290
291 return r;
292 }
293
294 private void pushToken(String _token)
295 {
296 pile_.push(_token);
297 }
298
299 public void parse() throws IOException
300 {
301 try
302 {
303 pile_.removeAllElements();
304
305 while(true)
306 {
307 String e=readToken();
308 listener_.location(origin_,lineno_,charno_);
309
310 if(getMode()==TXT)
311 {
312 listener_.text(e);
313 }
314 else
315 if("<".equals(e))
316 {
317 e=readToken();
318 if(getMode()==HTML) e=e.toUpperCase();
319
320 String t=e;
321
322 while(true)
323 {
324 e=readToken();
325 if(">".equals(e)||"/>".equals(e)) break;
326
327 String a=e;
328 e=readToken(); // must be '='
329
330 if(getMode()==HTML)
331 {
332 if(!e.equals("="))
333 {
334 pushToken(e);
335 e="TRUE";
336 }
337 else
338 e=readToken();
339 //System.err.println(a+"="+e);
340 }
341 else
342 {
343 if(!e.equals("="))
344 listener_.error("Missing value for Attribute '"+a+"'");
345 else
346 e=readToken();
347 }
348
349 if((getMode()==HTML)&&("#".equals(e)))
350 e+=readToken();
351
352 if(e.length()>=2)
353 {
354 char c=e.charAt(0);
355 if((c=='"'||c=='\'')&&(c==e.charAt(e.length()-1)))
356 e=e.substring(1,e.length()-1);
357 else
358 if(getMode()!=HTML)
359 listener_.error("Invalid Attribute Value '"+a+"': "+e);
360 }
361 else
362 if(getMode()!=HTML)
363 listener_.error("Invalid Attribute Value '"+a+"': "+e);
364
365 if(getMode()==HTML) a=a.toUpperCase();
366 //System.err.println("ATT: "+a+" ~ "+e);
367 listener_.attribute(a,e);
368 }
369
370 //if(getMode()==HTML) t=t.toUpperCase();
371 //System.err.println("OPEN TAG: "+t);
372 listener_.startElement(t);
373
374 if(">".equals(e))
375 {
376 if((getMode()==HTML)
377 &&( "BR" .equals(t)
378 ||"HR" .equals(t)
379 ||"IMG" .equals(t)
380 ||"LINK" .equals(t)
381 ||"INPUT".equals(t)
382 ||"META" .equals(t)))
383 listener_.endElement(t);
384 }
385 else
386 if("/>".equals(e))
387 {
388 //System.err.println("CLOSE TAG: "+t);
389 listener_.endElement(t);
390 }
391 }
392 else
393 if("</".equals(e))
394 {
395 e=readToken();
396 if(getMode()==HTML) e=e.toUpperCase();
397 //System.err.println("CLOSE TAG: "+e);
398 if( (getMode()==HTML)
399 &&( "BR" .equals(e)
400 ||"HR" .equals(e)
401 ||"IMG" .equals(e)
402 ||"LINK" .equals(e)
403 ||"INPUT".equals(e)
404 ||"META" .equals(e)))
405 ;
406 else
407 listener_.endElement(e);
408 e=readToken();
409 }
410 else
411 if("<?".equals(e))
412 {
413 //System.err.println("PROCESSING");
414 do { e=readToken(); } while(!"?>".equals(e));
415 }
416 else
417 if("<!".equals(e))
418 {
419 //System.err.println("DECLARATION");
420 do { e=readToken(); } while(!">".equals(e));
421 }
422 else
423 {
424 //System.err.println("TEXT: "+e);
425 listener_.text(e);
426 }
427 }
428 }
429 catch(EOFException ex) { }
430 }
431
432 public void setXmlListener(XmlListener _l)
433 {
434 if(_l==null) _l=this;
435 listener_=_l;
436 }
437
438 public void location(String _origin, int _lineno, int _charno)
439 { }
440
441 public void startElement(String _tag)
442 { System.out.println("OPEN TAG : "+_tag); }
443
444 public void endElement(String _tag)
445 { System.out.println("CLOSE TAG : "+_tag); }
446
447 public void attribute(String _name,String _value)
448 { System.out.println("ATTRIBUTE : "+_name+" WITH VALUE : "+_value); }
449
450 public void text(String _data)
451 { System.out.println("TEXT DATA : "+_data); }
452
453 public void error(String _message)
454 { System.out.println("ERROR : "+_message+" IN "+origin()); }
455
456 public static void main(String[] _args)
457 {
458 for(int i=0;i<_args.length;i++)
459 {
460 try
461 {
462 String f=_args[i];
463 int m=XML;
464 if(f.endsWith(".html")||f.endsWith(".htm")) m=HTML;
465 if(f.endsWith(".txt")) m=TXT;
466 System.err.println("FILE: "+f+" MODE: "+m);
467 XmlParser p=new XmlParser(f,m);
468 p.parse();
469 }
470 catch(Exception ex)
471 {
472 ex.printStackTrace();
473 //System.err.println(ex);
474 }
475 }
476 }
477 }
478