1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.poi.hwpf.extractor;
18
19 import java.io.IOException;
20 import java.io.InputStream;
21 import java.io.FileInputStream;
22 import java.io.UnsupportedEncodingException;
23 import java.util.Iterator;
24
25 import org.apache.poi.POIOLE2TextExtractor;
26 import org.apache.poi.hwpf.HWPFDocument;
27 import org.apache.poi.hwpf.model.TextPiece;
28 import org.apache.poi.hwpf.usermodel.Paragraph;
29 import org.apache.poi.hwpf.usermodel.Range;
30 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
31
32 /**
33 * Class to extract the text from a Word Document.
34 *
35 * You should use either getParagraphText() or getText() unless
36 * you have a strong reason otherwise.
37 *
38 * @author Nick Burch (nick at torchbox dot com)
39 */
40 public class WordExtractor extends POIOLE2TextExtractor {
41 private POIFSFileSystem fs;
42 private HWPFDocument doc;
43
44 /**
45 * Create a new Word Extractor
46 * @param is InputStream containing the word file
47 */
48 public WordExtractor(InputStream is) throws IOException {
49 this( HWPFDocument.verifyAndBuildPOIFS(is) );
50 }
51
52 /**
53 * Create a new Word Extractor
54 * @param fs POIFSFileSystem containing the word file
55 */
56 public WordExtractor(POIFSFileSystem fs) throws IOException {
57 this(new HWPFDocument(fs));
58 this.fs = fs;
59 }
60
61 /**
62 * Create a new Word Extractor
63 * @param doc The HWPFDocument to extract from
64 */
65 public WordExtractor(HWPFDocument doc) throws IOException {
66 super(doc);
67 this.doc = doc;
68 }
69
70 /**
71 * Command line extractor, so people will stop moaning that
72 * they can't just run this.
73 */
74 public static void main(String[] args) throws IOException {
75 if(args.length == 0) {
76 System.err.println("Use:");
77 System.err.println(" java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
78 System.exit(1);
79 }
80
81 // Process the first argument as a file
82 FileInputStream fin = new FileInputStream(args[0]);
83 WordExtractor extractor = new WordExtractor(fin);
84 System.out.println(extractor.getText());
85 }
86
87 /**
88 * Get the text from the word file, as an array with one String
89 * per paragraph
90 */
91 public String[] getParagraphText() {
92 String[] ret;
93
94 // Extract using the model code
95 try {
96 Range r = doc.getRange();
97
98 ret = new String[r.numParagraphs()];
99 for(int i=0; i<ret.length; i++) {
100 Paragraph p = r.getParagraph(i);
101 ret[i] = p.text();
102
103 // Fix the line ending
104 if(ret[i].endsWith("\r")) {
105 ret[i] = ret[i] + "\n";
106 }
107 }
108 } catch(Exception e) {
109 // Something's up with turning the text pieces into paragraphs
110 // Fall back to ripping out the text pieces
111 ret = new String[1];
112 ret[0] = getTextFromPieces();
113 }
114
115 return ret;
116 }
117
118 /**
119 * Grab the text out of the text pieces. Might also include various
120 * bits of crud, but will work in cases where the text piece -> paragraph
121 * mapping is broken. Fast too.
122 */
123 public String getTextFromPieces() {
124 StringBuffer textBuf = new StringBuffer();
125
126 Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
127 while (textPieces.hasNext()) {
128 TextPiece piece = (TextPiece) textPieces.next();
129
130 String encoding = "Cp1252";
131 if (piece.usesUnicode()) {
132 encoding = "UTF-16LE";
133 }
134 try {
135 String text = new String(piece.getRawBytes(), encoding);
136 textBuf.append(text);
137 } catch(UnsupportedEncodingException e) {
138 throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
139 }
140 }
141
142 String text = textBuf.toString();
143
144 // Fix line endings (Note - won't get all of them
145 text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
146 text = text.replaceAll("\r\r", "\r\n\r\n");
147
148 if(text.endsWith("\r")) {
149 text += "\n";
150 }
151
152 return text;
153 }
154
155 /**
156 * Grab the text, based on the paragraphs. Shouldn't include any crud,
157 * but slightly slower than getTextFromPieces().
158 */
159 public String getText() {
160 StringBuffer ret = new StringBuffer();
161 String[] text = getParagraphText();
162 for(int i=0; i<text.length; i++) {
163 ret.append(text[i]);
164 }
165 return ret.toString();
166 }
167 }