Home » pdfbox-1.1.0-src » org.apache.pdfbox.examples.util » [javadoc | source]

    1   /*
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *      http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   package org.apache.pdfbox.examples.util;
   18   
   19   import org.apache.pdfbox.exceptions.InvalidPasswordException;
   20   
   21   import org.apache.pdfbox.pdmodel.PDDocument;
   22   import org.apache.pdfbox.pdmodel.PDPage;
   23   import org.apache.pdfbox.util.PDFTextStripperByArea;
   24   
   25   import java.awt.Rectangle;
   26   
   27   import java.util.List;
   28   
   29   /**
   30    * This is an example on how to extract text from a specific area on the PDF document.
   31    *
   32    * Usage: java org.apache.pdfbox.examples.util.ExtractTextByArea <input-pdf>
   33    *
   34    * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
   35    * @version $Revision: 1.2 $
   36    */
   37   public class ExtractTextByArea
   38   {
   39       private ExtractTextByArea()
   40       {
   41           //utility class and should not be constructed.
   42       }
   43   
   44   
   45       /**
   46        * This will print the documents text in a certain area.
   47        *
   48        * @param args The command line arguments.
   49        *
   50        * @throws Exception If there is an error parsing the document.
   51        */
   52       public static void main( String[] args ) throws Exception
   53       {
   54           if( args.length != 1 )
   55           {
   56               usage();
   57           }
   58           else
   59           {
   60               PDDocument document = null;
   61               try
   62               {
   63                   document = PDDocument.load( args[0] );
   64                   if( document.isEncrypted() )
   65                   {
   66                       try
   67                       {
   68                           document.decrypt( "" );
   69                       }
   70                       catch( InvalidPasswordException e )
   71                       {
   72                           System.err.println( "Error: Document is encrypted with a password." );
   73                           System.exit( 1 );
   74                       }
   75                   }
   76                   PDFTextStripperByArea stripper = new PDFTextStripperByArea();
   77                   stripper.setSortByPosition( true );
   78                   Rectangle rect = new Rectangle( 10, 280, 275, 60 );
   79                   stripper.addRegion( "class1", rect );
   80                   List allPages = document.getDocumentCatalog().getAllPages();
   81                   PDPage firstPage = (PDPage)allPages.get( 0 );
   82                   stripper.extractRegions( firstPage );
   83                   System.out.println( "Text in the area:" + rect );
   84                   System.out.println( stripper.getTextForRegion( "class1" ) );
   85   
   86               }
   87               finally
   88               {
   89                   if( document != null )
   90                   {
   91                       document.close();
   92                   }
   93               }
   94           }
   95       }
   96   
   97       /**
   98        * This will print the usage for this document.
   99        */
  100       private static void usage()
  101       {
  102           System.err.println( "Usage: java org.apache.pdfbox.examples.util.ExtractTextByArea <input-pdf>" );
  103       }
  104   
  105   }

Home » pdfbox-1.1.0-src » org.apache.pdfbox.examples.util » [javadoc | source]