/**
 * Parses the raw bytes of {@code content} as an RSS feed.
 *
 * <p>The resulting title is the space-separated concatenation of the titles
 * of all parsed RSS channels. The index text is the space-separated
 * concatenation of every channel description followed by the descriptions of
 * that channel's items. One outlink is recorded for each channel link and for
 * each item (the item's permalink is preferred over its plain link), using the
 * corresponding description as anchor text.
 *
 * <p>Missing (null) titles and descriptions are skipped instead of being
 * appended as the literal text "null". A malformed channel or item URL is
 * logged and skipped; it does not prevent the remaining items of the same
 * channel from being processed.
 *
 * @param content the fetched content whose bytes are handed to the feed parser
 * @return a successful parse result for {@code content.getUrl()}, or an empty
 *         parse result carrying a {@code ParseStatus.FAILED} status if the
 *         feed parser throws
 */
public ParseResult getParse(Content content) {
  List theRSSChannels = null;

  try {
    byte[] raw = content.getContent();

    // create a new FeedParser...
    FeedParser parser = FeedParserFactory.newFeedParser();

    // create a listener for handling our callbacks
    FeedParserListener listener = new FeedParserListenerImpl();

    // start parsing our feed and have the onItem methods called
    parser.parse(listener, new ByteArrayInputStream(raw), /* resource */ null);

    theRSSChannels = ((FeedParserListenerImpl) listener).getChannels();
  } catch (Exception e) { // run time exception
    if (LOG.isWarnEnabled()) {
      e.printStackTrace(LogUtil.getWarnStream(LOG));
      LOG.warn("nutch:parse-rss:RSSParser Exception: " + e.getMessage());
    }
    return new ParseStatus(ParseStatus.FAILED,
        "Can't be handled as rss document. " + e).getEmptyParseResult(content.getUrl(), getConf());
  }

  StringBuffer contentTitle = new StringBuffer();
  StringBuffer indexText = new StringBuffer();
  List theOutlinks = new Vector();

  if (theRSSChannels != null) {
    for (int i = 0; i < theRSSChannels.size(); i++) {
      RSSChannel channel = (RSSChannel) theRSSChannels.get(i);

      // Skip null values: StringBuffer.append(null) would otherwise put the
      // four characters "null" into the title / indexed text.
      String channelTitle = channel.getTitle();
      if (channelTitle != null) {
        contentTitle.append(channelTitle);
        contentTitle.append(" ");
      }

      String channelDescription = channel.getDescription();
      if (channelDescription != null) {
        indexText.append(channelDescription);
        indexText.append(" ");
      }

      // A bad channel link must not stop us from indexing the channel's
      // items, so the failure is handled inside the helper and we carry on.
      if (channel.getLink() != null) {
        addOutlinkQuietly(theOutlinks, channel.getLink(), channelDescription);
      }

      // now get the descriptions of all the underlying RSS Items and
      // then index them too
      List items = channel.getItems();
      if (items == null) {
        continue;
      }
      for (int j = 0; j < items.size(); j++) {
        RSSItem item = (RSSItem) items.get(j);

        String itemDescription = item.getDescription();
        if (itemDescription != null) {
          indexText.append(itemDescription);
          indexText.append(" ");
        }

        // prefer the permalink, fall back to the plain link
        String whichLink = item.getPermalink();
        if (whichLink == null) {
          whichLink = item.getLink();
        }
        if (whichLink != null) {
          addOutlinkQuietly(theOutlinks, whichLink, itemDescription);
        }
      }
    }

    if (LOG.isTraceEnabled()) {
      LOG.trace("nutch:parse-rss:getParse:indexText=" + indexText);
      LOG.trace("nutch:parse-rss:getParse:contentTitle=" + contentTitle);
    }
  } else if (LOG.isTraceEnabled()) {
    LOG.trace("nutch:parse-rss:Error:getParse: No RSS Channels recorded!");
  }

  // format the outlinks
  Outlink[] outlinks = (Outlink[]) theOutlinks.toArray(new Outlink[theOutlinks.size()]);

  if (LOG.isTraceEnabled()) {
    LOG.trace("nutch:parse-rss:getParse:found " + outlinks.length + " outlinks");
  }

  ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
      contentTitle.toString(), outlinks, content.getMetadata());
  return ParseResult.createParseResult(content.getUrl(),
      new ParseImpl(indexText.toString(), parseData));
}

/**
 * Adds an outlink for {@code url} to {@code outlinks}, using {@code anchor}
 * (or the empty string when it is null) as anchor text. A malformed URL is
 * logged at warn level and otherwise ignored so that the caller can keep
 * processing further links.
 */
private void addOutlinkQuietly(List outlinks, String url, String anchor) {
  try {
    outlinks.add(new Outlink(url, anchor != null ? anchor : ""));
  } catch (MalformedURLException e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("MalformedURL: " + url);
      LOG.warn("Attempting to continue processing outlinks");
      e.printStackTrace(LogUtil.getWarnStream(LOG));
    }
  }
}
Implementation method: parses the RSS content and returns a ParseResult
wrapping a ParseImpl.
|