Source code: org/apache/lenya/lucene/html/HtmlDocument.java
1 /*
2 * Copyright 1999-2004 The Apache Software Foundation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 */
17
18 /* $Id: HtmlDocument.java 42598 2004-03-01 16:18:28Z gregor $ */
19
20 package org.apache.lenya.lucene.html;
21
22
23 // org.apache.lucene.document.Document and org.w3c.dom.Document are deliberately
24 // not imported: their simple names clash, so the code refers to both of them
25 // by their fully qualified class names.
26 import java.io.BufferedReader;
27 import java.io.File;
28 import java.io.FileInputStream;
29 import java.io.FileReader;
30 import java.io.IOException;
31 import java.io.InputStream;
32 import java.io.StringWriter;
33
34 import org.apache.lucene.document.Field;
35 import org.w3c.dom.Attr;
36 import org.w3c.dom.Element;
37 import org.w3c.dom.Node;
38 import org.w3c.dom.NodeList;
39 import org.w3c.dom.Text;
40 import org.w3c.tidy.Tidy;
41
42
43 /**
44 * The <code>HtmlDocument</code> class creates a Lucene {@link org.apache.lucene.document.Document}
45 * from an HTML document.
46 *
47 * <P>
48 * It does this by using JTidy package. It can take input input from {@link java.io.File} or {@link
49 * java.io.InputStream}.
50 * </p>
51 */
52 public class HtmlDocument {
53 private Element rawDoc;
54 private String luceneTagName = null;
55 private String luceneClassValue = null;
56
57 /**
58 * Constructs an <code>HtmlDocument</code> from a {@link java.io.File}.
59 *
60 * @param file the <code>File</code> containing the HTML to parse
61 * @exception IOException if an I/O exception occurs
62 */
63 public HtmlDocument(File file) throws IOException {
64 Tidy tidy = new Tidy();
65 tidy.setQuiet(true);
66 tidy.setShowWarnings(false);
67
68 org.w3c.dom.Document root = tidy.parseDOM(new FileInputStream(file), null);
69 rawDoc = root.getDocumentElement();
70 }
71
72 /**
73 * Constructs an <code>HtmlDocument</code> from an {@link java.io.InputStream}.
74 *
75 * @param is the <code>InputStream</code> containing the HTML
76 * @exception IOException if I/O exception occurs
77 */
78 public HtmlDocument(InputStream is) throws IOException {
79 Tidy tidy = new Tidy();
80 tidy.setQuiet(true);
81 tidy.setShowWarnings(false);
82
83 org.w3c.dom.Document root = tidy.parseDOM(is, null);
84 rawDoc = root.getDocumentElement();
85 }
86
87 /**
88 * Creates a Lucene <code>Document</code> from an {@link java.io.InputStream}.
89 *
90 * @param is
91 * @return org.apache.lucene.document.Document
92 * @exception IOException
93 */
94 public static org.apache.lucene.document.Document getDocument(InputStream is)
95 throws IOException {
96 HtmlDocument htmlDoc = new HtmlDocument(is);
97 org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
98
99 luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
100 luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
101
102 return luceneDoc;
103 }
104
105 /**
106 * Creates a Lucene <code>Document</code> from a {@link java.io.File}.
107 *
108 * @param file
109 * @return org.apache.lucene.document.Document
110 * @exception IOException
111 */
112 public static org.apache.lucene.document.Document Document(File file)
113 throws IOException {
114 HtmlDocument htmlDoc = new HtmlDocument(file);
115 org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
116
117 luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
118 luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
119
120 String contents = null;
121 BufferedReader br = new BufferedReader(new FileReader(file));
122 StringWriter sw = new StringWriter();
123 String line = br.readLine();
124
125 while (line != null) {
126 sw.write(line);
127 line = br.readLine();
128 }
129
130 br.close();
131 contents = sw.toString();
132 sw.close();
133
134 luceneDoc.add(Field.UnIndexed("rawcontents", contents));
135
136 return luceneDoc;
137 }
138
139 /**
140 * Gets the title attribute of the <code>HtmlDocument</code> object.
141 *
142 * @return the title value
143 */
144 public String getTitle() {
145 if (rawDoc == null) {
146 return null;
147 }
148
149 String title = "";
150
151 NodeList nl = rawDoc.getElementsByTagName("title");
152
153 if (nl.getLength() > 0) {
154 Element titleElement = ((Element) nl.item(0));
155 Text text = (Text) titleElement.getFirstChild();
156
157 if (text != null) {
158 title = text.getData();
159 }
160 }
161
162 return title;
163 }
164
165 /**
166 * Gets the body text attribute of the <code>HtmlDocument</code> object.
167 *
168 * @return the body text value
169 */
170 public String getBody() {
171 if (rawDoc == null) {
172 return null;
173 }
174
175 // NOTE: JTidy will insert a meta tag: <meta name="generator" content="HTML Tidy, see www.w3.org" />
176 // This means that getLength is always greater than 0
177 NodeList metaNL = rawDoc.getElementsByTagName("meta");
178
179 for (int i = 0; i < metaNL.getLength(); i++) {
180 Element metaElement = (Element) metaNL.item(i);
181 Attr nameAttr = metaElement.getAttributeNode("name");
182 Attr valueAttr = metaElement.getAttributeNode("value");
183
184 if ((nameAttr != null) && (valueAttr != null)) {
185 if (nameAttr.getValue().equals("lucene-tag-name")) {
186 luceneTagName = valueAttr.getValue();
187 }
188
189 if (nameAttr.getValue().equals("lucene-class-value")) {
190 luceneClassValue = valueAttr.getValue();
191 }
192 }
193 }
194
195 boolean indexByLucene = true;
196
197 if ((luceneTagName != null) && (luceneClassValue != null)) {
198 indexByLucene = false;
199 }
200
201 System.out.println("HtmlDocument.getBody(): Index By Lucene (Default): " + indexByLucene);
202
203 String body = "";
204 NodeList nl = rawDoc.getElementsByTagName("body");
205
206 if (nl.getLength() > 0) {
207 body = getBodyText(nl.item(0), indexByLucene);
208 }
209
210 return body;
211 }
212
213 /**
214 * Gets the bodyText attribute of the <code>HtmlDocument</code> object.
215 *
216 * @param node a DOM Node
217 * @param indexByLucene DOCUMENT ME!
218 * @return The bodyText value
219 */
220 private String getBodyText(Node node, boolean indexByLucene) {
221 NodeList nl = node.getChildNodes();
222 StringBuffer buffer = new StringBuffer();
223
224 for (int i = 0; i < nl.getLength(); i++) {
225 boolean index = indexByLucene;
226 Node child = nl.item(i);
227
228 switch (child.getNodeType()) {
229 case Node.ELEMENT_NODE:
230
231 if ((luceneTagName != null) && (luceneClassValue != null)) {
232 if (child.getNodeName().equals(luceneTagName)) {
233 Attr attribute = ((Element) child).getAttributeNode("class");
234
235 if (attribute != null) {
236 if (attribute.getValue().equals(luceneClassValue)) {
237 System.out.println("HtmlDocument.getBodyText(): <" + luceneTagName +
238 " class=\"" + luceneClassValue + "\"> found!");
239 index = true;
240 }
241
242 }
243 }
244 }
245
246 buffer.append(getBodyText(child, index));
247
248 if (index) {
249 buffer.append(" ");
250 }
251
252 break;
253
254 case Node.TEXT_NODE:
255
256 if (indexByLucene) {
257 buffer.append(((Text) child).getData());
258 }
259
260 break;
261 }
262 }
263
264 return buffer.toString();
265 }
266 }