|
|||||||||
| Home >> All >> org >> htmlparser >> [ beans overview ] | PREV CLASS NEXT CLASS | ||||||||
SUMMARY: JAVADOC | SOURCE | DOWNLOAD | NESTED | FIELD | CONSTR | METHOD |
DETAIL: FIELD | CONSTR | METHOD | ||||||||
org.htmlparser.beans
Class StringBean

java.lang.Object
org.htmlparser.visitors.NodeVisitor
org.htmlparser.beans.StringBean
- All Implemented Interfaces:
- java.io.Serializable
- public class StringBean
- extends org.htmlparser.visitors.NodeVisitor
- implements java.io.Serializable
- extends org.htmlparser.visitors.NodeVisitor
Extract strings from a URL.
Text within <SCRIPT></SCRIPT> tags is removed.
The text within <PRE></PRE> tags is not altered.
The property Strings, which is the output property, is null
until a URL is set. So a typical usage is:
StringBean sb = new StringBean();
sb.setLinks(false);
sb.setReplaceNonBreakingSpaces(true);
sb.setCollapse(true);
sb.setURL("http://www.netbeans.org"); // the HTTP is performed here
String s = sb.getStrings();
| Field Summary | |
protected java.lang.StringBuffer |
mBuffer
The buffer in which text is stored while traversing the HTML. |
protected boolean |
mCollapse
If true sequences of whitespace characters are replaced
with a single space character. |
protected boolean |
mIsPre
Set true when traversing a PRE tag. |
protected boolean |
mIsScript
Set true when traversing a SCRIPT tag. |
protected boolean |
mLinks
If true the link URLs are embedded in the text output. |
protected org.htmlparser.Parser |
mParser
The parser used to extract strings. |
protected java.beans.PropertyChangeSupport |
mPropertySupport
Bound property support. |
protected boolean |
mReplaceSpace
If true regular space characters are substituted for
non-breaking spaces in the text output. |
protected java.lang.String |
mStrings
The strings extracted from the URL. |
private static java.lang.String |
newline
A newline. |
private static int |
newline_size
The length of the newline. |
static java.lang.String |
PROP_COLLAPSE_PROPERTY
Property name in event where the 'collapse whitespace' state changes. |
static java.lang.String |
PROP_CONNECTION_PROPERTY
Property name in event where the connection changes. |
static java.lang.String |
PROP_LINKS_PROPERTY
Property name in event where the 'embed links' state changes. |
static java.lang.String |
PROP_REPLACE_SPACE_PROPERTY
Property name in event where the 'replace non-breaking spaces' state changes. |
static java.lang.String |
PROP_STRINGS_PROPERTY
Property name in event where the URL contents change. |
static java.lang.String |
PROP_URL_PROPERTY
Property name in event where the URL changes. |
| Fields inherited from class org.htmlparser.visitors.NodeVisitor |
|
| Constructor Summary | |
StringBean()
Create a StringBean object. |
|
| Method Summary | |
void |
addPropertyChangeListener(java.beans.PropertyChangeListener listener)
Add a PropertyChangeListener to the listener list. |
protected void |
carriage_return()
Appends a newline to the buffer if there isn't one there already. |
protected void |
collapse(java.lang.StringBuffer buffer,
java.lang.String string)
Add the given text collapsing whitespace. |
protected java.lang.String |
extractStrings()
Extract the text from a page. |
boolean |
getCollapse()
Get the current 'collapse whitespace' state. |
java.net.URLConnection |
getConnection()
Get the current connection. |
boolean |
getLinks()
Get the current 'include links' state. |
boolean |
getReplaceNonBreakingSpaces()
Get the current 'replace non breaking spaces' state. |
java.lang.String |
getStrings()
Return the textual contents of the URL. |
java.lang.String |
getURL()
Get the current URL. |
static void |
main(java.lang.String[] args)
Unit test. |
void |
removePropertyChangeListener(java.beans.PropertyChangeListener listener)
Remove a PropertyChangeListener from the listener list. |
private void |
resetStrings()
Refetch the URL contents. |
void |
setCollapse(boolean collapse_whitespace)
Set the current 'collapse whitespace' state. |
void |
setConnection(java.net.URLConnection connection)
Set the parser's connection. |
void |
setLinks(boolean links)
Set the 'include links' state. |
void |
setReplaceNonBreakingSpaces(boolean replace_space)
Set the 'replace non breaking spaces' state. |
protected void |
setStrings()
Fetch the URL contents. |
void |
setURL(java.lang.String url)
Set the URL to extract strings from. |
protected void |
updateStrings(java.lang.String strings)
Assign the Strings property, firing the property change. |
void |
visitEndTag(org.htmlparser.tags.EndTag end)
Possibly resets the state of the PRE and SCRIPT flags. |
void |
visitLinkTag(org.htmlparser.tags.LinkTag link)
Appends the link as text between angle brackets to the output. |
void |
visitStringNode(org.htmlparser.StringNode string)
Appends the text to the output. |
void |
visitTag(org.htmlparser.tags.Tag tag)
Appends a newline to the output if the tag breaks flow, and possibly sets the state of the PRE and SCRIPT flags. |
| Methods inherited from class org.htmlparser.visitors.NodeVisitor |
finishedParsing, shouldRecurseChildren, shouldRecurseSelf, visitImageTag, visitRemarkNode, visitTitleTag |
| Methods inherited from class java.lang.Object |
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
| Field Detail |
PROP_STRINGS_PROPERTY
public static final java.lang.String PROP_STRINGS_PROPERTY
- Property name in event where the URL contents change.
- See Also:
- Constant Field Values
PROP_LINKS_PROPERTY
public static final java.lang.String PROP_LINKS_PROPERTY
- Property name in event where the 'embed links' state changes.
- See Also:
- Constant Field Values
PROP_URL_PROPERTY
public static final java.lang.String PROP_URL_PROPERTY
- Property name in event where the URL changes.
- See Also:
- Constant Field Values
PROP_REPLACE_SPACE_PROPERTY
public static final java.lang.String PROP_REPLACE_SPACE_PROPERTY
- Property name in event where the 'replace non-breaking spaces' state
changes.
- See Also:
- Constant Field Values
PROP_COLLAPSE_PROPERTY
public static final java.lang.String PROP_COLLAPSE_PROPERTY
- Property name in event where the 'collapse whitespace' state changes.
- See Also:
- Constant Field Values
PROP_CONNECTION_PROPERTY
public static final java.lang.String PROP_CONNECTION_PROPERTY
- Property name in event where the connection changes.
- See Also:
- Constant Field Values
newline
private static final java.lang.String newline
- A newline.
newline_size
private static final int newline_size
- The length of the newline.
mPropertySupport
protected java.beans.PropertyChangeSupport mPropertySupport
- Bound property support.
mParser
protected org.htmlparser.Parser mParser
- The parser used to extract strings.
mStrings
protected java.lang.String mStrings
- The strings extracted from the URL.
mLinks
protected boolean mLinks
- If true the link URLs are embedded in the text output.
mReplaceSpace
protected boolean mReplaceSpace
- If true regular space characters are substituted for non-breaking spaces in the text output.
mCollapse
protected boolean mCollapse
- If true sequences of whitespace characters are replaced with a single space character.
mBuffer
protected java.lang.StringBuffer mBuffer
- The buffer in which text is stored while traversing the HTML.
mIsScript
protected boolean mIsScript
- Set true when traversing a SCRIPT tag.
mIsPre
protected boolean mIsPre
- Set true when traversing a PRE tag.
| Constructor Detail |
StringBean
public StringBean()
- Create a StringBean object. Default property values are set to 'do the
right thing':
  - Links is set false so text appears like a browser would display it, albeit without the colour or underline clues normally associated with a link.
  - ReplaceNonBreakingSpaces is set true, so that printing the text works, but the extra information regarding these formatting marks is available if you set it false.
  - Collapse is set true, so text appears compact like a browser would display it.
| Method Detail |
carriage_return
protected void carriage_return()
- Appends a newline to the buffer if there isn't one there already. Except
if the buffer is empty.
collapse
protected void collapse(java.lang.StringBuffer buffer, java.lang.String string)
- Add the given text collapsing whitespace. Use a little finite state
machine:
    state 0: whitespace was last emitted character
    state 1: in whitespace
    state 2: in word

A whitespace character moves us to state 1 and any other character moves us to state 2, except that state 0 stays in state 0 until a non-whitespace character is seen, and when going from whitespace to word we emit a space before the character:

       input:     whitespace   other-character
    state\next
       0               0             2
       1               1        space then 2
       2               1             2
extractStrings
protected java.lang.String extractStrings() throws org.htmlparser.util.ParserException
- Extract the text from a page.
updateStrings
protected void updateStrings(java.lang.String strings)
- Assign the Strings property, firing the property change.
setStrings
protected void setStrings()
- Fetch the URL contents. Only do work if there is a valid parser with its
URL set.
resetStrings
private void resetStrings()
- Refetch the URL contents. Only need to worry if there is already a valid
parser and it's been spent fetching the string contents.
addPropertyChangeListener
public void addPropertyChangeListener(java.beans.PropertyChangeListener listener)
- Add a PropertyChangeListener to the listener list. The listener is
registered for all properties.
removePropertyChangeListener
public void removePropertyChangeListener(java.beans.PropertyChangeListener listener)
- Remove a PropertyChangeListener from the listener list. This removes a
PropertyChangeListener that was registered for all properties.
getStrings
public java.lang.String getStrings()
- Return the textual contents of the URL. This is the primary output of the
bean.
getLinks
public boolean getLinks()
- Get the current 'include links' state.
setLinks
public void setLinks(boolean links)
- Set the 'include links' state. If the setting is changed after the URL
has been set, the text from the URL will be reacquired, which is possibly
expensive.
getURL
public java.lang.String getURL()
- Get the current URL.
setURL
public void setURL(java.lang.String url)
- Set the URL to extract strings from. The text from the URL will be
fetched, which may be expensive, so this property should be set last.
getReplaceNonBreakingSpaces
public boolean getReplaceNonBreakingSpaces()
- Get the current 'replace non breaking spaces' state.
setReplaceNonBreakingSpaces
public void setReplaceNonBreakingSpaces(boolean replace_space)
- Set the 'replace non breaking spaces' state. If the setting is changed
after the URL has been set, the text from the URL will be reacquired,
which is possibly expensive.
getCollapse
public boolean getCollapse()
- Get the current 'collapse whitespace' state. If set to true this emulates
the operation of browsers in interpreting text where "user agents should collapse input white space sequences when producing output inter-word space".
See HTML specification section 9.1 White space http://www.w3.org/TR/html4/struct/text.html#h-9.1.
setCollapse
public void setCollapse(boolean collapse_whitespace)
- Set the current 'collapse whitespace' state. If the setting is changed
after the URL has been set, the text from the URL will be reacquired,
which is possibly expensive.
getConnection
public java.net.URLConnection getConnection()
- Get the current connection.
setConnection
public void setConnection(java.net.URLConnection connection)
- Set the parser's connection. The text from the URL will be fetched, which
may be expensive, so this property should be set last.
visitLinkTag
public void visitLinkTag(org.htmlparser.tags.LinkTag link)
- Appends the link as text between angle brackets to the output.
visitStringNode
public void visitStringNode(org.htmlparser.StringNode string)
- Appends the text to the output.
visitEndTag
public void visitEndTag(org.htmlparser.tags.EndTag end)
- Possibly resets the state of the PRE and SCRIPT flags.
visitTag
public void visitTag(org.htmlparser.tags.Tag tag)
- Appends a newline to the output if the tag breaks flow, and possibly sets
the state of the PRE and SCRIPT flags.
main
public static void main(java.lang.String[] args)
- Unit test.
|
|||||||||
| Home >> All >> org >> htmlparser >> [ beans overview ] | PREV CLASS NEXT CLASS | ||||||||
SUMMARY: JAVADOC | SOURCE | DOWNLOAD | NESTED | FIELD | CONSTR | METHOD |
DETAIL: FIELD | CONSTR | METHOD | ||||||||
JAVADOC