public static void parse(SimpleXMLDocHandler doc,
SimpleXMLDocHandlerComment comment,
Reader r,
boolean html) throws IOException {
BufferedReader reader;
if (r instanceof BufferedReader)
reader = (BufferedReader)r;
else
reader = new BufferedReader(r);
Stack st = new Stack();
int depth = 0;
int mode = PRE;
int c = 0;
int quotec = '"";
depth = 0;
StringBuffer sb = new StringBuffer();
StringBuffer etag = new StringBuffer();
String tagName = null;
String lvalue = null;
String rvalue = null;
HashMap attrs = null;
st = new Stack();
doc.startDocument();
int line=1, col=0;
boolean eol = false;
if (html)
mode = TEXT;
int pushBack = -1;
while(true) {
if (pushBack != -1) {
c = pushBack;
pushBack = -1;
}
else
c = reader.read();
if (c == -1)
break;
// We need to map \r, \r\n, and \n to \n
// See XML spec section 2.11
if(c == '\n" && eol) {
eol = false;
continue;
} else if(eol) {
eol = false;
} else if(c == '\n") {
line++;
col=0;
} else if(c == '\r") {
eol = true;
c = '\n";
line++;
col=0;
} else {
col++;
}
if(mode == DONE) {
doc.endDocument();
return;
// We are between tags collecting text.
} else if(mode == TEXT) {
if(c == '< ") {
st.push(new Integer(mode));
mode = START_TAG;
if(sb.length() > 0) {
doc.text(sb.toString());
sb.setLength(0);
}
} else if(c == '&") {
st.push(new Integer(mode));
mode = ENTITY;
etag.setLength(0);
} else
sb.append((char)c);
// we are processing a closing tag: e.g. < /foo >
} else if(mode == CLOSE_TAG) {
if(c == ' >") {
mode = popMode(st);
tagName = sb.toString();
if (html)
tagName = tagName.toLowerCase();
sb.setLength(0);
depth--;
if(!html && depth==0)
mode = DONE;
doc.endElement(tagName);
} else {
if (!Character.isWhitespace((char)c))
sb.append((char)c);
}
// we are processing CDATA
} else if(mode == CDATA) {
if(c == ' >"
&& sb.toString().endsWith("]]")) {
sb.setLength(sb.length()-2);
doc.text(sb.toString());
sb.setLength(0);
mode = popMode(st);
} else
sb.append((char)c);
// we are processing a comment. We are inside
// the < !-- .... -- > looking for the -- >.
} else if(mode == COMMENT) {
if(c == ' >"
&& sb.toString().endsWith("--")) {
if (comment != null) {
sb.setLength(sb.length() - 2);
comment.comment(sb.toString());
}
sb.setLength(0);
mode = popMode(st);
} else
sb.append((char)c);
// We are outside the root tag element
} else if(mode == PRE) {
if(c == '< ") {
mode = TEXT;
st.push(new Integer(mode));
mode = START_TAG;
}
// We are inside one of these < ? ... ? >
// or one of these < !DOCTYPE ... >
} else if(mode == DOCTYPE) {
if(c == ' >") {
mode = popMode(st);
if(mode == TEXT) mode = PRE;
}
// we have just seen a < and
// are wondering what we are looking at
// < foo >, < /foo >, < !-- ... --- >, etc.
} else if(mode == START_TAG) {
mode = popMode(st);
if(c == '/") {
st.push(new Integer(mode));
mode = CLOSE_TAG;
} else if (c == '?") {
mode = DOCTYPE;
} else {
st.push(new Integer(mode));
mode = OPEN_TAG;
tagName = null;
attrs = new HashMap();
sb.append((char)c);
}
// we are processing an entity, e.g. <, », etc.
} else if(mode == ENTITY) {
if(c == ';") {
mode = popMode(st);
String cent = etag.toString();
etag.setLength(0);
if(cent.startsWith("#x")) {
try {
char ci = (char)Integer.parseInt(cent.substring(2),16);
sb.append(ci);
}
catch (Exception es) {
sb.append('&").append(cent).append(';");
}
}
else if(cent.startsWith("#")) {
try {
char ci = (char)Integer.parseInt(cent.substring(1));
sb.append(ci);
}
catch (Exception es) {
sb.append('&").append(cent).append(';");
}
}
else {
char ce = decodeEntity(cent);
if (ce == '\0")
sb.append('&").append(cent).append(';");
else
sb.append(ce);
}
} else if ((c != '#" && (c < '0" || c > '9") && (c < 'a" || c > 'z")
&& (c < 'A" || c > 'Z")) || etag.length() >= 7) {
mode = popMode(st);
pushBack = c;
sb.append('&").append(etag.toString());
etag.setLength(0);
}
else {
etag.append((char)c);
}
// we have just seen something like this:
// < foo a="b"/
// and are looking for the final >.
} else if(mode == SINGLE_TAG) {
if(tagName == null)
tagName = sb.toString();
if (html)
tagName = tagName.toLowerCase();
if(c != ' >")
exc("Expected > for tag: < "+tagName+"/ >",line,col);
doc.startElement(tagName,attrs);
doc.endElement(tagName);
if(!html && depth==0) {
doc.endDocument();
return;
}
sb.setLength(0);
attrs = new HashMap();
tagName = null;
mode = popMode(st);
// we are processing something
// like this < foo ... >. It could
// still be a < !-- ... -- > or something.
} else if(mode == OPEN_TAG) {
if(c == ' >") {
if(tagName == null)
tagName = sb.toString();
if (html)
tagName = tagName.toLowerCase();
sb.setLength(0);
depth++;
doc.startElement(tagName,attrs);
tagName = null;
attrs = new HashMap();
mode = popMode(st);
} else if(c == '/") {
mode = SINGLE_TAG;
} else if(c == '-" && sb.toString().equals("!-")) {
mode = COMMENT;
sb.setLength(0);
} else if(c == '[" && sb.toString().equals("![CDATA")) {
mode = CDATA;
sb.setLength(0);
} else if(c == 'E" && sb.toString().equals("!DOCTYP")) {
sb.setLength(0);
mode = DOCTYPE;
} else if(Character.isWhitespace((char)c)) {
tagName = sb.toString();
if (html)
tagName = tagName.toLowerCase();
sb.setLength(0);
mode = IN_TAG;
} else {
sb.append((char)c);
}
// We are processing the quoted right-hand side
// of an element's attribute.
} else if(mode == QUOTE) {
if (html && quotec == ' " && c == ' >") {
rvalue = sb.toString();
sb.setLength(0);
attrs.put(lvalue,rvalue);
mode = popMode(st);
doc.startElement(tagName,attrs);
depth++;
tagName = null;
attrs = new HashMap();
}
else if (html && quotec == ' " && Character.isWhitespace((char)c)) {
rvalue = sb.toString();
sb.setLength(0);
attrs.put(lvalue,rvalue);
mode = IN_TAG;
}
else if (html && quotec == ' ") {
sb.append((char)c);
}
else if(c == quotec) {
rvalue = sb.toString();
sb.setLength(0);
attrs.put(lvalue,rvalue);
mode = IN_TAG;
// See section the XML spec, section 3.3.3
// on normalization processing.
} else if(" \r\n\u0009".indexOf(c) >=0) {
sb.append(' ");
} else if(c == '&") {
st.push(new Integer(mode));
mode = ENTITY;
etag.setLength(0);
} else {
sb.append((char)c);
}
} else if(mode == ATTRIBUTE_RVALUE) {
if(c == '"" || c == '\'") {
quotec = c;
mode = QUOTE;
} else if(Character.isWhitespace((char)c)) {
;
} else if (html && c == ' >") {
attrs.put(lvalue,sb.toString());
sb.setLength(0);
mode = popMode(st);
doc.startElement(tagName,attrs);
depth++;
tagName = null;
attrs = new HashMap();
} else if (html) {
sb.append((char)c);
quotec = ' ";
mode = QUOTE;
} else {
exc("Error in attribute processing",line,col);
}
} else if(mode == ATTRIBUTE_LVALUE) {
if(Character.isWhitespace((char)c)) {
lvalue = sb.toString();
if (html)
lvalue = lvalue.toLowerCase();
sb.setLength(0);
mode = ATTRIBUTE_EQUAL;
} else if(c == '=") {
lvalue = sb.toString();
if (html)
lvalue = lvalue.toLowerCase();
sb.setLength(0);
mode = ATTRIBUTE_RVALUE;
} else if (html && c == ' >") {
sb.setLength(0);
mode = popMode(st);
doc.startElement(tagName,attrs);
depth++;
tagName = null;
attrs = new HashMap();
} else {
sb.append((char)c);
}
} else if(mode == ATTRIBUTE_EQUAL) {
if(c == '=") {
mode = ATTRIBUTE_RVALUE;
} else if(Character.isWhitespace((char)c)) {
;
} else if (html && c == ' >") {
sb.setLength(0);
mode = popMode(st);
doc.startElement(tagName,attrs);
depth++;
tagName = null;
attrs = new HashMap();
} else if (html && c == '/") {
sb.setLength(0);
mode = SINGLE_TAG;
} else if (html) {
sb.setLength(0);
sb.append((char)c);
mode = ATTRIBUTE_LVALUE;
} else {
exc("Error in attribute processing.",line,col);
}
} else if(mode == IN_TAG) {
if(c == ' >") {
mode = popMode(st);
doc.startElement(tagName,attrs);
depth++;
tagName = null;
attrs = new HashMap();
} else if(c == '/") {
mode = SINGLE_TAG;
} else if(Character.isWhitespace((char)c)) {
;
} else {
mode = ATTRIBUTE_LVALUE;
sb.append((char)c);
}
}
}
if(html || mode == DONE) {
if (html && mode == TEXT)
doc.text(sb.toString());
doc.endDocument();
}
else
exc("missing end tag",line,col);
}
// Parses the XML document, firing the events to the handler.