1 /**
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.nutch.parse;
18
19 // Commons Logging imports
20 import org.apache.commons.logging.Log;
21 import org.apache.commons.logging.LogFactory;
22
23 // Nutch Imports
24 import org.apache.nutch.protocol.Content;
25
26 // Hadoop imports
27 import org.apache.hadoop.conf.Configuration;
28
29
30 /**
31 * A Utility class containing methods to simply perform parsing utilities such
32 * as iterating through a preferred list of {@link Parser}s to obtain
33 * {@link Parse} objects.
34 *
35 * @author mattmann
36 * @author Jérôme Charron
37 * @author Sébastien Le Callonnec
38 */
39 public class ParseUtil {
40
41 /* our log stream */
42 public static final Log LOG = LogFactory.getLog(ParseUtil.class);
43 private ParserFactory parserFactory;
44
45 /**
46 *
47 * @param conf
48 */
49 public ParseUtil(Configuration conf) {
50 this.parserFactory = new ParserFactory(conf);
51 }
52
53 /**
54 * Performs a parse by iterating through a List of preferred {@link Parser}s
55 * until a successful parse is performed and a {@link Parse} object is
56 * returned. If the parse is unsuccessful, a message is logged to the
57 * <code>WARNING</code> level, and an empty parse is returned.
58 *
59 * @param content The content to try and parse.
60 * @return <key, {@link Parse}> pairs.
61 * @throws ParseException If no suitable parser is found to perform the parse.
62 */
63 public ParseResult parse(Content content) throws ParseException {
64 Parser[] parsers = null;
65
66 try {
67 parsers = this.parserFactory.getParsers(content.getContentType(),
68 content.getUrl() != null ? content.getUrl():"");
69 } catch (ParserNotFound e) {
70 if (LOG.isWarnEnabled()) {
71 LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() +
72 " of type " + content.getContentType());
73 }
74 throw new ParseException(e.getMessage());
75 }
76
77 ParseResult parseResult = null;
78 for (int i=0; i<parsers.length; i++) {
79 if (LOG.isDebugEnabled()) {
80 LOG.debug("Parsing [" + content.getUrl() + "] with [" + parsers[i] + "]");
81 }
82 parseResult = parsers[i].getParse(content);
83 if (parseResult != null && !parseResult.isEmpty())
84 return parseResult;
85 }
86
87 if (LOG.isWarnEnabled()) {
88 LOG.warn("Unable to successfully parse content " + content.getUrl() +
89 " of type " + content.getContentType());
90 }
91 return null;
92 }
93
94 /**
95 * Method parses a {@link Content} object using the {@link Parser} specified
96 * by the parameter <code>extId</code>, i.e., the Parser's extension ID.
97 * If a suitable {@link Parser} is not found, then a <code>WARNING</code>
98 * level message is logged, and a ParseException is thrown. If the parse is
99 * uncessful for any other reason, then a <code>WARNING</code> level
100 * message is logged, and a <code>ParseStatus.getEmptyParse()</code> is
101 * returned.
102 *
103 * @param extId The extension implementation ID of the {@link Parser} to use
104 * to parse the specified content.
105 * @param content The content to parse.
106 *
107 * @return <key, {@link Parse}> pairs if the parse is successful, otherwise,
108 * a single <key, <code>ParseStatus.getEmptyParse()</code>> pair.
109 *
110 * @throws ParseException If there is no suitable {@link Parser} found
111 * to perform the parse.
112 */
113 public ParseResult parseByExtensionId(String extId, Content content)
114 throws ParseException {
115 Parser p = null;
116
117 try {
118 p = this.parserFactory.getParserById(extId);
119 } catch (ParserNotFound e) {
120 if (LOG.isWarnEnabled()) {
121 LOG.warn("No suitable parser found when trying to parse content " + content.getUrl() +
122 " of type " + content.getContentType());
123 }
124 throw new ParseException(e.getMessage());
125 }
126
127 ParseResult parseResult = p.getParse(content);
128 if (parseResult != null && !parseResult.isEmpty()) {
129 return parseResult;
130 } else {
131 if (LOG.isWarnEnabled()) {
132 LOG.warn("Unable to successfully parse content " + content.getUrl() +
133 " of type " + content.getContentType());
134 }
135 return null;
136 }
137 }
138
139 }