Docjar: A Java Source and Document Engine    com.*    java.*    javax.*    org.*    all    new    plug-in

Quick Search    Search Deep

Source code: org/apache/lenya/lucene/html/HtmlDocument.java


1   /*
2    * Copyright  1999-2004 The Apache Software Foundation
3    *
4    *  Licensed under the Apache License, Version 2.0 (the "License");
5    *  you may not use this file except in compliance with the License.
6    *  You may obtain a copy of the License at
7    *
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    *
10   *  Unless required by applicable law or agreed to in writing, software
11   *  distributed under the License is distributed on an "AS IS" BASIS,
12   *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   *  See the License for the specific language governing permissions and
14   *  limitations under the License.
15   *
16   */
17  
18  /* $Id: HtmlDocument.java 42598 2004-03-01 16:18:28Z gregor $  */
19  
20  package org.apache.lenya.lucene.html;
21  
22  
23  // org.apache.lucene.document.Document and org.w3c.dom.Document are deliberately
24  // not imported: their simple names clash, so both are referenced by their fully
25  // qualified class names in the code.
26  import java.io.BufferedReader;
27  import java.io.File;
28  import java.io.FileInputStream;
29  import java.io.FileReader;
30  import java.io.IOException;
31  import java.io.InputStream;
32  import java.io.StringWriter;
33  
34  import org.apache.lucene.document.Field;
35  import org.w3c.dom.Attr;
36  import org.w3c.dom.Element;
37  import org.w3c.dom.Node;
38  import org.w3c.dom.NodeList;
39  import org.w3c.dom.Text;
40  import org.w3c.tidy.Tidy;
41  
42  
43  /**
44   * The <code>HtmlDocument</code> class creates a Lucene {@link org.apache.lucene.document.Document}
45   * from an HTML document.
46   *
47   * <P>
48   * It does this by using JTidy package. It can take input input from {@link java.io.File} or {@link
49   * java.io.InputStream}.
50   * </p>
51   */
52  public class HtmlDocument {
53      private Element rawDoc;
54      private String luceneTagName = null;
55      private String luceneClassValue = null;
56  
57      /**
58       * Constructs an <code>HtmlDocument</code> from a {@link java.io.File}.
59       *
60       * @param file the <code>File</code> containing the HTML to parse
61       * @exception IOException if an I/O exception occurs
62       */
63      public HtmlDocument(File file) throws IOException {
64          Tidy tidy = new Tidy();
65          tidy.setQuiet(true);
66          tidy.setShowWarnings(false);
67  
68          org.w3c.dom.Document root = tidy.parseDOM(new FileInputStream(file), null);
69          rawDoc = root.getDocumentElement();
70      }
71  
72      /**
73       * Constructs an <code>HtmlDocument</code> from an {@link java.io.InputStream}.
74       *
75       * @param is the <code>InputStream</code> containing the HTML
76       * @exception IOException if I/O exception occurs
77       */
78      public HtmlDocument(InputStream is) throws IOException {
79          Tidy tidy = new Tidy();
80          tidy.setQuiet(true);
81          tidy.setShowWarnings(false);
82  
83          org.w3c.dom.Document root = tidy.parseDOM(is, null);
84          rawDoc = root.getDocumentElement();
85      }
86  
87      /**
88       * Creates a Lucene <code>Document</code> from an {@link java.io.InputStream}.
89       *
90       * @param is
91       * @return org.apache.lucene.document.Document
92       * @exception IOException
93       */
94      public static org.apache.lucene.document.Document getDocument(InputStream is)
95          throws IOException {
96          HtmlDocument htmlDoc = new HtmlDocument(is);
97          org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
98  
99          luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
100         luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
101 
102         return luceneDoc;
103     }
104 
105     /**
106      * Creates a Lucene <code>Document</code> from a {@link java.io.File}.
107      *
108      * @param file
109      * @return org.apache.lucene.document.Document
110      * @exception IOException
111      */
112     public static org.apache.lucene.document.Document Document(File file)
113         throws IOException {
114         HtmlDocument htmlDoc = new HtmlDocument(file);
115         org.apache.lucene.document.Document luceneDoc = new org.apache.lucene.document.Document();
116 
117         luceneDoc.add(Field.Text("title", htmlDoc.getTitle()));
118         luceneDoc.add(Field.Text("contents", htmlDoc.getBody()));
119 
120         String contents = null;
121         BufferedReader br = new BufferedReader(new FileReader(file));
122         StringWriter sw = new StringWriter();
123         String line = br.readLine();
124 
125         while (line != null) {
126             sw.write(line);
127             line = br.readLine();
128         }
129 
130         br.close();
131         contents = sw.toString();
132         sw.close();
133 
134         luceneDoc.add(Field.UnIndexed("rawcontents", contents));
135 
136         return luceneDoc;
137     }
138 
139     /**
140      * Gets the title attribute of the <code>HtmlDocument</code> object.
141      *
142      * @return the title value
143      */
144     public String getTitle() {
145         if (rawDoc == null) {
146             return null;
147         }
148 
149         String title = "";
150 
151         NodeList nl = rawDoc.getElementsByTagName("title");
152 
153         if (nl.getLength() > 0) {
154             Element titleElement = ((Element) nl.item(0));
155             Text text = (Text) titleElement.getFirstChild();
156 
157             if (text != null) {
158                 title = text.getData();
159             }
160         }
161 
162         return title;
163     }
164 
165     /**
166      * Gets the body text attribute of the <code>HtmlDocument</code> object.
167      *
168      * @return the body text value
169      */
170     public String getBody() {
171         if (rawDoc == null) {
172             return null;
173         }
174 
175         // NOTE: JTidy will insert a meta tag: <meta name="generator" content="HTML Tidy, see www.w3.org" />
176         //       This means that getLength is always greater than 0
177         NodeList metaNL = rawDoc.getElementsByTagName("meta");
178 
179         for (int i = 0; i < metaNL.getLength(); i++) {
180             Element metaElement = (Element) metaNL.item(i);
181             Attr nameAttr = metaElement.getAttributeNode("name");
182             Attr valueAttr = metaElement.getAttributeNode("value");
183 
184             if ((nameAttr != null) && (valueAttr != null)) {
185                 if (nameAttr.getValue().equals("lucene-tag-name")) {
186                     luceneTagName = valueAttr.getValue();
187                 }
188 
189                 if (nameAttr.getValue().equals("lucene-class-value")) {
190                     luceneClassValue = valueAttr.getValue();
191                 }
192             }
193         }
194 
195         boolean indexByLucene = true;
196 
197         if ((luceneTagName != null) && (luceneClassValue != null)) {
198             indexByLucene = false;
199         }
200 
201         System.out.println("HtmlDocument.getBody(): Index By Lucene (Default): " + indexByLucene);
202 
203         String body = "";
204         NodeList nl = rawDoc.getElementsByTagName("body");
205 
206         if (nl.getLength() > 0) {
207             body = getBodyText(nl.item(0), indexByLucene);
208         }
209 
210         return body;
211     }
212 
213     /**
214      * Gets the bodyText attribute of the <code>HtmlDocument</code> object.
215      *
216      * @param node a DOM Node
217      * @param indexByLucene DOCUMENT ME!
218      * @return The bodyText value
219      */
220     private String getBodyText(Node node, boolean indexByLucene) {
221         NodeList nl = node.getChildNodes();
222         StringBuffer buffer = new StringBuffer();
223 
224         for (int i = 0; i < nl.getLength(); i++) {
225             boolean index = indexByLucene;
226             Node child = nl.item(i);
227 
228             switch (child.getNodeType()) {
229             case Node.ELEMENT_NODE:
230 
231                 if ((luceneTagName != null) && (luceneClassValue != null)) {
232                     if (child.getNodeName().equals(luceneTagName)) {
233                         Attr attribute = ((Element) child).getAttributeNode("class");
234 
235                         if (attribute != null) {
236                             if (attribute.getValue().equals(luceneClassValue)) {
237                                 System.out.println("HtmlDocument.getBodyText(): <" + luceneTagName +
238                                     " class=\"" + luceneClassValue + "\"> found!");
239                                 index = true;
240                             }
241 
242                         }
243                     }
244                 }
245 
246                 buffer.append(getBodyText(child, index));
247 
248                 if (index) {
249                     buffer.append(" ");
250                 }
251 
252                 break;
253 
254             case Node.TEXT_NODE:
255 
256                 if (indexByLucene) {
257                     buffer.append(((Text) child).getData());
258                 }
259 
260                 break;
261             }
262         }
263 
264         return buffer.toString();
265     }
266 }