Save This Page
Home » poi-src-3.2-FINAL-20081019 » org.apache » poi » hwpf » extractor » [javadoc | source]
    1   /*
    2   * Licensed to the Apache Software Foundation (ASF) under one or more
    3   * contributor license agreements.  See the NOTICE file distributed with
    4   * this work for additional information regarding copyright ownership.
    5   * The ASF licenses this file to You under the Apache License, Version 2.0
    6   * (the "License"); you may not use this file except in compliance with
    7   * the License.  You may obtain a copy of the License at
    8   *
    9   *     http://www.apache.org/licenses/LICENSE-2.0
   10   *
   11   * Unless required by applicable law or agreed to in writing, software
   12   * distributed under the License is distributed on an "AS IS" BASIS,
   13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14   * See the License for the specific language governing permissions and
   15   * limitations under the License.
   16   */
   17   package org.apache.poi.hwpf.extractor;
   18   
   19   import java.io.IOException;
   20   import java.io.InputStream;
   21   import java.io.FileInputStream;
   22   import java.io.UnsupportedEncodingException;
   23   import java.util.Iterator;
   24   
   25   import org.apache.poi.POIOLE2TextExtractor;
   26   import org.apache.poi.hwpf.HWPFDocument;
   27   import org.apache.poi.hwpf.model.TextPiece;
   28   import org.apache.poi.hwpf.usermodel.Paragraph;
   29   import org.apache.poi.hwpf.usermodel.Range;
   30   import org.apache.poi.poifs.filesystem.POIFSFileSystem;
   31   
   32   /**
   33    * Class to extract the text from a Word Document.
   34    * 
   35    * You should use either getParagraphText() or getText() unless
   36    *  you have a strong reason otherwise.
   37    *
   38    * @author Nick Burch (nick at torchbox dot com)
   39    */
   40   public class WordExtractor extends POIOLE2TextExtractor {
   41   	private POIFSFileSystem fs;
   42   	private HWPFDocument doc;
   43   	
   44   	/**
   45   	 * Create a new Word Extractor
   46   	 * @param is InputStream containing the word file
   47   	 */
   48   	public WordExtractor(InputStream is) throws IOException {
   49   		this( HWPFDocument.verifyAndBuildPOIFS(is) );
   50   	}
   51   
   52   	/**
   53   	 * Create a new Word Extractor
   54   	 * @param fs POIFSFileSystem containing the word file
   55   	 */
   56   	public WordExtractor(POIFSFileSystem fs) throws IOException {
   57   		this(new HWPFDocument(fs));
   58   		this.fs = fs;
   59   	}
   60   	
   61   	/**
   62   	 * Create a new Word Extractor
   63   	 * @param doc The HWPFDocument to extract from
   64   	 */
   65   	public WordExtractor(HWPFDocument doc) throws IOException {
   66   		super(doc);
   67   		this.doc = doc;
   68   	}
   69   
   70   	/**
   71   	 * Command line extractor, so people will stop moaning that
   72   	 *  they can't just run this.
   73   	 */
   74   	public static void main(String[] args) throws IOException {
   75   		if(args.length == 0) {
   76   			System.err.println("Use:");
   77   			System.err.println("   java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
   78   			System.exit(1);
   79   		}
   80   
   81   		// Process the first argument as a file
   82   		FileInputStream fin = new FileInputStream(args[0]);
   83   		WordExtractor extractor = new WordExtractor(fin);
   84   		System.out.println(extractor.getText());
   85   	}
   86   	
   87   	/**
   88   	 * Get the text from the word file, as an array with one String
   89   	 *  per paragraph
   90   	 */
   91   	public String[] getParagraphText() {
   92   		String[] ret;
   93   		
   94   		// Extract using the model code
   95   		try {
   96   	    	Range r = doc.getRange();
   97   
   98   			ret = new String[r.numParagraphs()];
   99   			for(int i=0; i<ret.length; i++) {
  100   				Paragraph p = r.getParagraph(i);
  101   				ret[i] = p.text();
  102   				
  103   				// Fix the line ending
  104   				if(ret[i].endsWith("\r")) {
  105   					ret[i] = ret[i] + "\n";
  106   				}
  107   			}
  108   		} catch(Exception e) {
  109   			// Something's up with turning the text pieces into paragraphs
  110   			// Fall back to ripping out the text pieces
  111   			ret = new String[1];
  112   			ret[0] = getTextFromPieces();
  113   		}
  114   		
  115   		return ret;
  116   	}
  117   	
  118   	/**
  119   	 * Grab the text out of the text pieces. Might also include various
  120   	 *  bits of crud, but will work in cases where the text piece -> paragraph
  121   	 *  mapping is broken. Fast too.
  122   	 */
  123   	public String getTextFromPieces() {
  124       	StringBuffer textBuf = new StringBuffer();
  125       	
  126       	Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
  127       	while (textPieces.hasNext()) {
  128       		TextPiece piece = (TextPiece) textPieces.next();
  129   
  130       		String encoding = "Cp1252";
  131       		if (piece.usesUnicode()) {
  132       			encoding = "UTF-16LE";
  133       		}
  134       		try {
  135       			String text = new String(piece.getRawBytes(), encoding);
  136       			textBuf.append(text);
  137       		} catch(UnsupportedEncodingException e) {
  138       			throw new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
  139       		}
  140       	}
  141       	
  142       	String text = textBuf.toString();
  143       	
  144       	// Fix line endings (Note - won't get all of them
  145       	text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
  146       	text = text.replaceAll("\r\r", "\r\n\r\n");
  147       	
  148       	if(text.endsWith("\r")) {
  149       		text += "\n";
  150       	}
  151       	
  152       	return text;
  153   	}
  154   	
  155   	/**
  156   	 * Grab the text, based on the paragraphs. Shouldn't include any crud,
  157   	 *  but slightly slower than getTextFromPieces().
  158   	 */
  159   	public String getText() {
  160   		StringBuffer ret = new StringBuffer();
  161   		String[] text = getParagraphText();
  162   		for(int i=0; i<text.length; i++) {
  163   			ret.append(text[i]);
  164   		}
  165   		return ret.toString();
  166   	}
  167   }

Save This Page
Home » poi-src-3.2-FINAL-20081019 » org.apache » poi » hwpf » extractor » [javadoc | source]