Save This Page
Home » poi-src-3.2-FINAL-20081019 » org.apache » poi » hslf » extractor » [javadoc | source]
    1   
    2   /* ====================================================================
    3      Licensed to the Apache Software Foundation (ASF) under one or more
    4      contributor license agreements.  See the NOTICE file distributed with
    5      this work for additional information regarding copyright ownership.
    6      The ASF licenses this file to You under the Apache License, Version 2.0
    7      (the "License"); you may not use this file except in compliance with
    8      the License.  You may obtain a copy of the License at
    9   
   10          http://www.apache.org/licenses/LICENSE-2.0
   11   
   12      Unless required by applicable law or agreed to in writing, software
   13      distributed under the License is distributed on an "AS IS" BASIS,
   14      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   15      See the License for the specific language governing permissions and
   16      limitations under the License.
   17   ==================================================================== */
   18           
   19   
   20   
   21   package org.apache.poi.hslf.extractor;
   22   
import java.io.*;
import java.util.HashSet;

import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.hslf.*;
import org.apache.poi.hslf.model.*;
import org.apache.poi.hslf.usermodel.*;
   31   
   32   /**
   33    * This class can be used to extract text from a PowerPoint file.
   34    *  Can optionally also get the notes from one.
   35    *
   36    * @author Nick Burch
   37    */
   38   
   39   public class PowerPointExtractor extends POIOLE2TextExtractor
   40   {
   41   	private HSLFSlideShow _hslfshow;
   42   	private SlideShow _show;
   43   	private Slide[] _slides;
   44   	
   45   	private boolean slidesByDefault = true;
   46   	private boolean notesByDefault = false;
   47   
   48     /**
   49      * Basic extractor. Returns all the text, and optionally all the notes
   50      */
   51     public static void main(String args[]) throws IOException
   52     {
   53   	if(args.length < 1) {
   54   		System.err.println("Useage:");
   55   		System.err.println("\tPowerPointExtractor [-notes] <file>");
   56   		System.exit(1);
   57   	}
   58   
   59   	boolean notes = false;
   60   	String file;
   61   	if(args.length > 1) {
   62   		notes = true;
   63   		file = args[1];
   64   	} else {
   65   		file = args[0];
   66   	}
   67   
   68   	PowerPointExtractor ppe = new PowerPointExtractor(file);
   69   	System.out.println(ppe.getText(true,notes));
   70   	ppe.close();
   71     }
   72   
   73   	/**
   74   	 * Creates a PowerPointExtractor, from a file
   75   	 * @param fileName The name of the file to extract from
   76   	 */
   77   	public PowerPointExtractor(String fileName) throws IOException {
   78   		this(new FileInputStream(fileName));
   79   	}
   80   	/**
   81   	 * Creates a PowerPointExtractor, from an Input Stream
   82   	 * @param iStream The input stream containing the PowerPoint document
   83   	 */
   84   	public PowerPointExtractor(InputStream iStream) throws IOException {
   85   		this(new POIFSFileSystem(iStream));
   86   	}
   87   	/**
   88   	 * Creates a PowerPointExtractor, from an open POIFSFileSystem
   89   	 * @param fs the POIFSFileSystem containing the PowerPoint document
   90   	 */
   91   	public PowerPointExtractor(POIFSFileSystem fs) throws IOException {
   92   		this(new HSLFSlideShow(fs));
   93   	}
   94   
   95   	/**
   96   	 * Creates a PowerPointExtractor, from a HSLFSlideShow
   97   	 * @param ss the HSLFSlideShow to extract text from
   98   	 */
   99   	public PowerPointExtractor(HSLFSlideShow ss) throws IOException {
  100   		super(ss);
  101   		_hslfshow = ss;
  102   		_show = new SlideShow(_hslfshow);
  103   		_slides = _show.getSlides();
  104   	}
  105   
  106   	/**
  107   	 * Shuts down the underlying streams
  108   	 */
  109   	public void close() throws IOException {
  110   		_hslfshow.close();
  111   		_hslfshow = null;
  112   		_show = null;
  113   		_slides = null;
  114   	}
  115   
  116   	/**
  117   	 * Should a call to getText() return slide text?
  118   	 * Default is yes
  119   	 */
  120   	public void setSlidesByDefault(boolean slidesByDefault) {
  121   		this.slidesByDefault = slidesByDefault;
  122   	}
  123   	/**
  124   	 * Should a call to getText() return notes text?
  125   	 * Default is no
  126   	 */
  127   	public void setNotesByDefault(boolean notesByDefault) {
  128   		this.notesByDefault = notesByDefault;
  129   	}
  130   
  131   	/**
  132   	 * Fetches all the slide text from the slideshow, 
  133   	 *  but not the notes, unless you've called
  134   	 *  setSlidesByDefault() and setNotesByDefault()
  135   	 *  to change this
  136   	 */
  137   	public String getText() {
  138   		return getText(slidesByDefault,notesByDefault);
  139   	}
  140   
  141   	/**
  142   	 * Fetches all the notes text from the slideshow, but not the slide text
  143   	 */
  144   	public String getNotes() {
  145   		return getText(false,true);
  146   	}
  147   
  148     /**
  149      * Fetches text from the slideshow, be it slide text or note text.
  150      * Because the final block of text in a TextRun normally have their
  151      *  last \n stripped, we add it back
  152      * @param getSlideText fetch slide text
  153      * @param getNoteText fetch note text
  154      */
  155     public String getText(boolean getSlideText, boolean getNoteText) {
  156   	StringBuffer ret = new StringBuffer(); 
  157   
  158   	if(getSlideText) {
  159   		for(int i=0; i<_slides.length; i++) {
  160   			Slide slide = _slides[i];
  161   			TextRun[] runs = slide.getTextRuns();
  162   			for(int j=0; j<runs.length; j++) {
  163   				TextRun run = runs[j];
  164   				if(run != null) {
  165   					String text = run.getText();
  166   					ret.append(text);
  167   					if(! text.endsWith("\n")) {
  168   						ret.append("\n");
  169   					}
  170   				}
  171   			}
  172   		}
  173   		if(getNoteText) {
  174   			ret.append("\n");
  175   		}
  176   	}
  177   
  178   	if(getNoteText) {
  179   		// Not currently using _notes, as that can have the notes of
  180   		//  master sheets in. Grab Slide list, then work from there,
  181   		//  but ensure no duplicates
  182   		HashSet seenNotes = new HashSet();
  183   		for(int i=0; i<_slides.length; i++) {
  184   			Notes notes = _slides[i].getNotesSheet();
  185   			if(notes == null) { continue; }
  186   			Integer id = new Integer(notes._getSheetNumber());
  187   			if(seenNotes.contains(id)) { continue; }
  188   			seenNotes.add(id);
  189   
  190   			TextRun[] runs = notes.getTextRuns();
  191   			if(runs != null && runs.length > 0) {
  192   				for(int j=0; j<runs.length; j++) {
  193   					TextRun run = runs[j];
  194   					String text = run.getText();
  195   					ret.append(text);
  196   					if(! text.endsWith("\n")) {
  197   						ret.append("\n");
  198   					}
  199   				}
  200   			}
  201   		}
  202   	}
  203   
  204   	return ret.toString();
  205     }
  206   }

Save This Page
Home » poi-src-3.2-FINAL-20081019 » org.apache » poi » hslf » extractor » [javadoc | source]