Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

com.arthurdo.parser
Class HtmlStreamTokenizer  view HtmlStreamTokenizer download HtmlStreamTokenizer.java

java.lang.Object
  extended by com.arthurdo.parser.HtmlStreamTokenizer

public class HtmlStreamTokenizer
extends java.lang.Object

HtmlStreamTokenizer is an HTML parser that is similar to the StreamTokenizer class but is specialized for HTML streams. This class is useful when you need to parse the structure of an HTML document.

 import com.arthurdo.parser.*;
 

HtmlStreamTokenizer tok = new HtmlStreamTokenizer(inputstream);
HtmlTag tag = new HtmlTag();
while (tok.nextToken() != HtmlStreamTokenizer.TT_EOF) {
    int ttype = tok.getTokenType();
    if (ttype == HtmlStreamTokenizer.TT_TAG) {
        tok.parseTag(tok.getStringValue(), tag);
        System.out.println("tag: " + tag.toString());
    } else if (ttype == HtmlStreamTokenizer.TT_TEXT) {
        System.out.println("text: " + tok.getStringValue());
    } else if (ttype == HtmlStreamTokenizer.TT_COMMENT) {
        System.out.println("comment: <!--" + tok.getStringValue() + "-->");
    }
}

One of the motivations for designing parseTag() to take an HtmlTag argument rather than having parseTag() return a newly created HtmlTag is so you can create your own tag class derived from HtmlTag.

Version:
2.01 09/12/97

Field Summary
private static char C_DOUBLEQUOTE
           
private static char C_EMPTY
           
(package private) static char C_ENDTAG
           
private static char C_SINGLEQUOTE
           
private static byte CT_ALPHA
           
private static byte CT_COMMENT
           
private static byte CT_DIGIT
           
private static byte CT_QUOTE
           
private static byte CT_WHITESPACE
           
private static int CTYPE_LEN
           
private  java.lang.StringBuffer m_buf
           
private  int m_cdata
           
private  char[] m_cdata_end
           
private  boolean m_cdata_pushback
           
private  int m_comment
           
private static byte[] m_ctype
           
private static java.util.Hashtable m_escapes
           
private  boolean m_getEntities
           
private  java.io.Reader m_in
           
private  boolean m_isCDTATA
           
private  int m_lineno
           
private  int m_pushback
           
private  int m_state
           
private  int m_tagquote
           
private  int m_ttype
           
private  boolean m_unescape
           
private  java.lang.StringBuffer m_whitespace
           
private static char[] m_xmlcdata_end
           
private static int STATE_BANGTAG
           
private static int STATE_COMMENT
           
private static int STATE_ENTITYREF
           
private static int STATE_EOF
           
private static int STATE_TAG
           
private static int STATE_TAG_QUOTE
           
private static int STATE_TEXT
           
private static int STATE_WS
           
static int TT_BANGTAG
          bang tag token (content inside <!...>).
static int TT_COMMENT
          comment token.
static int TT_ENTITYREFERENCE
          entity reference token (&*;)
static int TT_EOF
          end of stream.
static int TT_TAG
          tag token.
static int TT_TEXT
          text token.
 
Constructor Summary
HtmlStreamTokenizer(java.io.InputStream in)
          Deprecated. Use HtmlStreamTokenizer(Reader) instead. This version of the constructor can lead to 10x slower code because of the InputStreamReader wrapper.
HtmlStreamTokenizer(java.io.Reader in)
           
 
Method Summary
 void enterCDATAMode(char[] exitString, boolean pushbackExitString)
           
 int getLineNumber()
           
 java.lang.String getRawString()
           
 java.lang.StringBuffer getStringValue()
           
 int getTokenType()
           
 java.lang.StringBuffer getWhiteSpace()
          Deprecated. White space is now returned as TT_TEXT. This buffer is always empty.
 boolean isCDATA()
           
private static boolean isPunct(char c)
           
private static boolean isSpace(int c)
           
 boolean isUnescaped()
           
 int nextToken()
           
private static java.lang.Character parseEscape(java.lang.String s)
           
private  void parseParams(HtmlTag tag, java.lang.String buf, int idx)
           
 void parseTag(java.lang.StringBuffer sbuf, HtmlTag tag)
          The reason this function takes an HtmlTag argument rather than returning a newly created HtmlTag object is so that you can create your own tag class derived from HtmlTag if desired.
 void setUnescaped(boolean unescape)
           
static java.lang.String unescape(java.lang.String buf)
          Replaces HTML escape sequences with their character equivalents, e.g. &copy; becomes ©.
static void unescape(java.lang.StringBuffer buf)
          Replaces HTML escape sequences with their character equivalents, e.g. &copy; becomes ©.
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

TT_EOF

public static final int TT_EOF
end of stream.

See Also:
Constant Field Values

TT_TEXT

public static final int TT_TEXT
text token.

See Also:
Constant Field Values

TT_TAG

public static final int TT_TAG
tag token.

See Also:
Constant Field Values

TT_COMMENT

public static final int TT_COMMENT
comment token.

See Also:
Constant Field Values

TT_BANGTAG

public static final int TT_BANGTAG
bang tag token (content inside <!...>).
See Also:
Constant Field Values

TT_ENTITYREFERENCE

public static final int TT_ENTITYREFERENCE
entity reference token (&*;)

See Also:
Constant Field Values

m_ttype

private int m_ttype

m_buf

private java.lang.StringBuffer m_buf

m_whitespace

private java.lang.StringBuffer m_whitespace

m_pushback

private int m_pushback

m_lineno

private int m_lineno

m_comment

private int m_comment

m_cdata_end

private char[] m_cdata_end

m_cdata

private int m_cdata

m_cdata_pushback

private boolean m_cdata_pushback

m_isCDTATA

private boolean m_isCDTATA

m_xmlcdata_end

private static char[] m_xmlcdata_end

STATE_EOF

private static final int STATE_EOF
See Also:
Constant Field Values

STATE_COMMENT

private static final int STATE_COMMENT
See Also:
Constant Field Values

STATE_TEXT

private static final int STATE_TEXT
See Also:
Constant Field Values

STATE_TAG

private static final int STATE_TAG
See Also:
Constant Field Values

STATE_WS

private static final int STATE_WS
See Also:
Constant Field Values

STATE_TAG_QUOTE

private static final int STATE_TAG_QUOTE
See Also:
Constant Field Values

STATE_BANGTAG

private static final int STATE_BANGTAG
See Also:
Constant Field Values

STATE_ENTITYREF

private static final int STATE_ENTITYREF
See Also:
Constant Field Values

m_state

private int m_state

m_in

private java.io.Reader m_in

C_ENDTAG

static final char C_ENDTAG
See Also:
Constant Field Values

C_EMPTY

private static final char C_EMPTY
See Also:
Constant Field Values

C_SINGLEQUOTE

private static final char C_SINGLEQUOTE
See Also:
Constant Field Values

C_DOUBLEQUOTE

private static final char C_DOUBLEQUOTE
See Also:
Constant Field Values

m_tagquote

private int m_tagquote

CTYPE_LEN

private static final int CTYPE_LEN
See Also:
Constant Field Values

m_ctype

private static byte[] m_ctype

CT_WHITESPACE

private static final byte CT_WHITESPACE
See Also:
Constant Field Values

CT_DIGIT

private static final byte CT_DIGIT
See Also:
Constant Field Values

CT_ALPHA

private static final byte CT_ALPHA
See Also:
Constant Field Values

CT_QUOTE

private static final byte CT_QUOTE
See Also:
Constant Field Values

CT_COMMENT

private static final byte CT_COMMENT
See Also:
Constant Field Values

m_escapes

private static java.util.Hashtable m_escapes

m_unescape

private boolean m_unescape

m_getEntities

private boolean m_getEntities
Constructor Detail

HtmlStreamTokenizer

public HtmlStreamTokenizer(java.io.InputStream in)
Deprecated. Use HtmlStreamTokenizer(Reader) instead. This version of the constructor can lead to 10x slower code because of the InputStreamReader wrapper.


HtmlStreamTokenizer

public HtmlStreamTokenizer(java.io.Reader in)
Method Detail

getTokenType

public final int getTokenType()

getStringValue

public final java.lang.StringBuffer getStringValue()

getRawString

public final java.lang.String getRawString()

getWhiteSpace

public final java.lang.StringBuffer getWhiteSpace()
Deprecated. White space is now returned as TT_TEXT. This buffer is always empty.


getLineNumber

public int getLineNumber()

enterCDATAMode

public void enterCDATAMode(char[] exitString,
                           boolean pushbackExitString)

isCDATA

public boolean isCDATA()

nextToken

public int nextToken()
              throws java.io.IOException

parseTag

public void parseTag(java.lang.StringBuffer sbuf,
                     HtmlTag tag)
              throws HtmlException
The reason this function takes an HtmlTag argument rather than returning a newly created HtmlTag object is so that you can create your own tag class derived from HtmlTag if desired.


unescape

public static java.lang.String unescape(java.lang.String buf)
Replaces HTML escape sequences with their character equivalents, e.g. &copy; becomes ©.


unescape

public static void unescape(java.lang.StringBuffer buf)
Replaces HTML escape sequences with their character equivalents, e.g. &copy; becomes ©.


isSpace

private static boolean isSpace(int c)

isPunct

private static boolean isPunct(char c)

isUnescaped

public boolean isUnescaped()

setUnescaped

public void setUnescaped(boolean unescape)

parseEscape

private static java.lang.Character parseEscape(java.lang.String s)

parseParams

private void parseParams(HtmlTag tag,
                         java.lang.String buf,
                         int idx)
                  throws HtmlException