|
|||||||||
| Home >> All >> com >> arthurdo >> [ parser overview ] | PREV CLASS NEXT CLASS | ||||||||
SUMMARY: JAVADOC | SOURCE | DOWNLOAD | NESTED | FIELD | CONSTR | METHOD |
DETAIL: FIELD | CONSTR | METHOD | ||||||||
com.arthurdo.parser
Class HtmlStreamTokenizer

java.lang.Object
  com.arthurdo.parser.HtmlStreamTokenizer
- public class HtmlStreamTokenizer
- extends java.lang.Object
HtmlStreamTokenizer is an HTML parser that is similar to the StreamTokenizer class but is specialized for HTML streams. This class is useful when you need to parse the structure of an HTML document.
import com.arthurdo.parser.*;

HtmlStreamTokenizer tok = new HtmlStreamTokenizer(inputstream);
HtmlTag tag = new HtmlTag();
while (tok.nextToken() != HtmlStreamTokenizer.TT_EOF) {
    int ttype = tok.getTokenType();
    if (ttype == HtmlStreamTokenizer.TT_TAG) {
        tok.parseTag(tok.getStringValue(), tag);
        System.out.println("tag: " + tag.toString());
    } else if (ttype == HtmlStreamTokenizer.TT_TEXT) {
        System.out.println("text: " + tok.getStringValue());
    } else if (ttype == HtmlStreamTokenizer.TT_COMMENT) {
        System.out.println("comment: <!--" + tok.getStringValue() + "-->");
    }
}
One of the motivations for designing parseTag() to take an HtmlTag argument rather than having parseTag() return a newly created HtmlTag is so you can create your own tag class derived from HtmlTag.
- 02/09/98 Thomas Horster-Möller, fixed bug with counting newlines twice on character pushback.
- 06/14/99 text is now returned as 'runs' instead of being broken up into words as in previous versions. You can use a StringTokenizer to break your text into words.
- Version:
- 2.01 09/12/97
| Field Summary | |
private static char |
C_DOUBLEQUOTE
|
private static char |
C_EMPTY
|
(package private) static char |
C_ENDTAG
|
private static char |
C_SINGLEQUOTE
|
private static byte |
CT_ALPHA
|
private static byte |
CT_COMMENT
|
private static byte |
CT_DIGIT
|
private static byte |
CT_QUOTE
|
private static byte |
CT_WHITESPACE
|
private static int |
CTYPE_LEN
|
private java.lang.StringBuffer |
m_buf
|
private int |
m_cdata
|
private char[] |
m_cdata_end
|
private boolean |
m_cdata_pushback
|
private int |
m_comment
|
private static byte[] |
m_ctype
|
private static java.util.Hashtable |
m_escapes
|
private boolean |
m_getEntities
|
private java.io.Reader |
m_in
|
private boolean |
m_isCDTATA
|
private int |
m_lineno
|
private int |
m_pushback
|
private int |
m_state
|
private int |
m_tagquote
|
private int |
m_ttype
|
private boolean |
m_unescape
|
private java.lang.StringBuffer |
m_whitespace
|
private static char[] |
m_xmlcdata_end
|
private static int |
STATE_BANGTAG
|
private static int |
STATE_COMMENT
|
private static int |
STATE_ENTITYREF
|
private static int |
STATE_EOF
|
private static int |
STATE_TAG
|
private static int |
STATE_TAG_QUOTE
|
private static int |
STATE_TEXT
|
private static int |
STATE_WS
|
static int |
TT_BANGTAG
inside a <! ... > tag. |
static int |
TT_COMMENT
comment token. |
static int |
TT_ENTITYREFERENCE
entity reference token (&*;) |
static int |
TT_EOF
end of stream. |
static int |
TT_TAG
tag token. |
static int |
TT_TEXT
text token. |
| Constructor Summary | |
HtmlStreamTokenizer(java.io.InputStream in)
Deprecated. use HtmlStreamTokenizer(Reader) instead. This version of the constructor can lead to 10x slower code because of the InputStreamReader wrapper. |
|
HtmlStreamTokenizer(java.io.Reader in)
|
|
| Method Summary | |
void |
enterCDATAMode(char[] exitString,
boolean pushbackExitString)
|
int |
getLineNumber()
|
java.lang.String |
getRawString()
|
java.lang.StringBuffer |
getStringValue()
|
int |
getTokenType()
|
java.lang.StringBuffer |
getWhiteSpace()
Deprecated. white space is now returned as TT_TEXT. This buffer is always empty. |
boolean |
isCDATA()
|
private static boolean |
isPunct(char c)
|
private static boolean |
isSpace(int c)
|
boolean |
isUnescaped()
|
int |
nextToken()
|
private static java.lang.Character |
parseEscape(java.lang.String s)
|
private void |
parseParams(HtmlTag tag,
java.lang.String buf,
int idx)
|
void |
parseTag(java.lang.StringBuffer sbuf,
HtmlTag tag)
The reason this function takes an HtmlTag argument rather than returning a newly created HtmlTag object is so that you can create your own tag class derived from HtmlTag if desired. |
void |
setUnescaped(boolean unescape)
|
static java.lang.String |
unescape(java.lang.String buf)
Replaces HTML escape sequences with their character equivalents, e.g. &copy; becomes ©. |
static void |
unescape(java.lang.StringBuffer buf)
Replaces HTML escape sequences with their character equivalents, e.g. &copy; becomes ©. |
| Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Field Detail |
TT_EOF
public static final int TT_EOF
- end of stream.
- See Also:
- Constant Field Values
TT_TEXT
public static final int TT_TEXT
- text token.
- See Also:
- Constant Field Values
TT_TAG
public static final int TT_TAG
- tag token.
- See Also:
- Constant Field Values
TT_COMMENT
public static final int TT_COMMENT
- comment token.
- See Also:
- Constant Field Values
TT_BANGTAG
public static final int TT_BANGTAG
- inside a <! ... > tag.
- See Also:
- Constant Field Values
TT_ENTITYREFERENCE
public static final int TT_ENTITYREFERENCE
- entity reference token (&*;)
- See Also:
- Constant Field Values
m_ttype
private int m_ttype
m_buf
private java.lang.StringBuffer m_buf
m_whitespace
private java.lang.StringBuffer m_whitespace
m_pushback
private int m_pushback
m_lineno
private int m_lineno
m_comment
private int m_comment
m_cdata_end
private char[] m_cdata_end
m_cdata
private int m_cdata
m_cdata_pushback
private boolean m_cdata_pushback
m_isCDTATA
private boolean m_isCDTATA
m_xmlcdata_end
private static char[] m_xmlcdata_end
STATE_EOF
private static final int STATE_EOF
- See Also:
- Constant Field Values
STATE_COMMENT
private static final int STATE_COMMENT
- See Also:
- Constant Field Values
STATE_TEXT
private static final int STATE_TEXT
- See Also:
- Constant Field Values
STATE_TAG
private static final int STATE_TAG
- See Also:
- Constant Field Values
STATE_WS
private static final int STATE_WS
- See Also:
- Constant Field Values
STATE_TAG_QUOTE
private static final int STATE_TAG_QUOTE
- See Also:
- Constant Field Values
STATE_BANGTAG
private static final int STATE_BANGTAG
- See Also:
- Constant Field Values
STATE_ENTITYREF
private static final int STATE_ENTITYREF
- See Also:
- Constant Field Values
m_state
private int m_state
m_in
private java.io.Reader m_in
C_ENDTAG
static final char C_ENDTAG
- See Also:
- Constant Field Values
C_EMPTY
private static final char C_EMPTY
- See Also:
- Constant Field Values
C_SINGLEQUOTE
private static final char C_SINGLEQUOTE
- See Also:
- Constant Field Values
C_DOUBLEQUOTE
private static final char C_DOUBLEQUOTE
- See Also:
- Constant Field Values
m_tagquote
private int m_tagquote
CTYPE_LEN
private static final int CTYPE_LEN
- See Also:
- Constant Field Values
m_ctype
private static byte[] m_ctype
CT_WHITESPACE
private static final byte CT_WHITESPACE
- See Also:
- Constant Field Values
CT_DIGIT
private static final byte CT_DIGIT
- See Also:
- Constant Field Values
CT_ALPHA
private static final byte CT_ALPHA
- See Also:
- Constant Field Values
CT_QUOTE
private static final byte CT_QUOTE
- See Also:
- Constant Field Values
CT_COMMENT
private static final byte CT_COMMENT
- See Also:
- Constant Field Values
m_escapes
private static java.util.Hashtable m_escapes
m_unescape
private boolean m_unescape
m_getEntities
private boolean m_getEntities
| Constructor Detail |
HtmlStreamTokenizer
public HtmlStreamTokenizer(java.io.InputStream in)
- Deprecated. use HtmlStreamTokenizer(Reader) instead.
This version of the constructor can lead to 10x slower code
because of the InputStreamReader wrapper.
HtmlStreamTokenizer
public HtmlStreamTokenizer(java.io.Reader in)
| Method Detail |
getTokenType
public final int getTokenType()
getStringValue
public final java.lang.StringBuffer getStringValue()
getRawString
public final java.lang.String getRawString()
getWhiteSpace
public final java.lang.StringBuffer getWhiteSpace()
- Deprecated. white space is now returned as TT_TEXT. This buffer is always
empty.
getLineNumber
public int getLineNumber()
enterCDATAMode
public void enterCDATAMode(char[] exitString,
boolean pushbackExitString)
isCDATA
public boolean isCDATA()
nextToken
public int nextToken()
throws java.io.IOException
parseTag
public void parseTag(java.lang.StringBuffer sbuf, HtmlTag tag) throws HtmlException
- The reason this function takes an HtmlTag argument rather than returning
a newly created HtmlTag object is so that you can create your own
tag class derived from HtmlTag if desired.
unescape
public static java.lang.String unescape(java.lang.String buf)
- Replaces HTML escape sequences with their character equivalents, e.g.
&copy; becomes ©.
unescape
public static void unescape(java.lang.StringBuffer buf)
- Replaces HTML escape sequences with their character equivalents, e.g.
&copy; becomes ©.
isSpace
private static boolean isSpace(int c)
isPunct
private static boolean isPunct(char c)
isUnescaped
public boolean isUnescaped()
setUnescaped
public void setUnescaped(boolean unescape)
parseEscape
private static java.lang.Character parseEscape(java.lang.String s)
parseParams
private void parseParams(HtmlTag tag, java.lang.String buf, int idx) throws HtmlException
|
|||||||||
| Home >> All >> com >> arthurdo >> [ parser overview ] | PREV CLASS NEXT CLASS | ||||||||
SUMMARY: JAVADOC | SOURCE | DOWNLOAD | NESTED | FIELD | CONSTR | METHOD |
DETAIL: FIELD | CONSTR | METHOD | ||||||||
JAVADOC
com.arthurdo.parser.HtmlStreamTokenizer