Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » demo » [javadoc | source]
    1   package org.apache.lucene.demo;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import org.apache.lucene.analysis.standard.StandardAnalyzer;
   21   import org.apache.lucene.document.Document;
   22   import org.apache.lucene.index.IndexReader;
   23   import org.apache.lucene.index.IndexWriter;
   24   import org.apache.lucene.index.Term;
   25   import org.apache.lucene.index.TermEnum;
   26   import java.io.File;
   27   import java.util.Date;
   28   import java.util.Arrays;
   29   
   30   /** Indexer for HTML files. */
   31   public class IndexHTML {
   32     private IndexHTML() {}
   33   
   34     private static boolean deleting = false;	  // true during deletion pass
   35     private static IndexReader reader;		  // existing index
   36     private static IndexWriter writer;		  // new index being built
   37     private static TermEnum uidIter;		  // document id iterator
   38   
   39     /** Indexer for HTML files.*/
   40     public static void main(String[] argv) {
   41       try {
   42         String index = "index";
   43         boolean create = false;
   44         File root = null;
   45   
   46         String usage = "IndexHTML [-create] [-index <index>] <root_directory>";
   47   
   48         if (argv.length == 0) {
   49           System.err.println("Usage: " + usage);
   50           return;
   51         }
   52   
   53         for (int i = 0; i < argv.length; i++) {
   54           if (argv[i].equals("-index")) {		  // parse -index option
   55             index = argv[++i];
   56           } else if (argv[i].equals("-create")) {	  // parse -create option
   57             create = true;
   58           } else if (i != argv.length-1) {
   59             System.err.println("Usage: " + usage);
   60             return;
   61           } else
   62             root = new File(argv[i]);
   63         }
   64   
   65         Date start = new Date();
   66   
   67         if (!create) {				  // delete stale docs
   68           deleting = true;
   69           indexDocs(root, index, create);
   70         }
   71         writer = new IndexWriter(index, new StandardAnalyzer(), create, 
   72                                  new IndexWriter.MaxFieldLength(1000000));
   73         indexDocs(root, index, create);		  // add new docs
   74   
   75         System.out.println("Optimizing index...");
   76         writer.optimize();
   77         writer.close();
   78   
   79         Date end = new Date();
   80   
   81         System.out.print(end.getTime() - start.getTime());
   82         System.out.println(" total milliseconds");
   83   
   84       } catch (Exception e) {
   85         System.out.println(" caught a " + e.getClass() +
   86             "\n with message: " + e.getMessage());
   87       }
   88     }
   89   
   90     /* Walk directory hierarchy in uid order, while keeping uid iterator from
   91     /* existing index in sync.  Mismatches indicate one of: (a) old documents to
   92     /* be deleted; (b) unchanged documents, to be left alone; or (c) new
   93     /* documents, to be indexed.
   94      */
   95   
   96     private static void indexDocs(File file, String index, boolean create)
   97          throws Exception {
   98       if (!create) {				  // incrementally update
   99   
  100         reader = IndexReader.open(index);		  // open existing index
  101         uidIter = reader.terms(new Term("uid", "")); // init uid iterator
  102   
  103         indexDocs(file);
  104   
  105         if (deleting) {				  // delete rest of stale docs
  106           while (uidIter.term() != null && uidIter.term().field() == "uid") {
  107             System.out.println("deleting " +
  108                 HTMLDocument.uid2url(uidIter.term().text()));
  109             reader.deleteDocuments(uidIter.term());
  110             uidIter.next();
  111           }
  112           deleting = false;
  113         }
  114   
  115         uidIter.close();				  // close uid iterator
  116         reader.close();				  // close existing index
  117   
  118       } else					  // don't have exisiting
  119         indexDocs(file);
  120     }
  121   
  122     private static void indexDocs(File file) throws Exception {
  123       if (file.isDirectory()) {			  // if a directory
  124         String[] files = file.list();		  // list its files
  125         Arrays.sort(files);			  // sort the files
  126         for (int i = 0; i < files.length; i++)	  // recursively index them
  127           indexDocs(new File(file, files[i]));
  128   
  129       } else if (file.getPath().endsWith(".html") || // index .html files
  130         file.getPath().endsWith(".htm") || // index .htm files
  131         file.getPath().endsWith(".txt")) { // index .txt files
  132   
  133         if (uidIter != null) {
  134           String uid = HTMLDocument.uid(file);	  // construct uid for doc
  135   
  136           while (uidIter.term() != null && uidIter.term().field() == "uid" &&
  137               uidIter.term().text().compareTo(uid) < 0) {
  138             if (deleting) {			  // delete stale docs
  139               System.out.println("deleting " +
  140                   HTMLDocument.uid2url(uidIter.term().text()));
  141               reader.deleteDocuments(uidIter.term());
  142             }
  143             uidIter.next();
  144           }
  145           if (uidIter.term() != null && uidIter.term().field() == "uid" &&
  146               uidIter.term().text().compareTo(uid) == 0) {
  147             uidIter.next();			  // keep matching docs
  148           } else if (!deleting) {			  // add new docs
  149             Document doc = HTMLDocument.Document(file);
  150             System.out.println("adding " + doc.get("path"));
  151             writer.addDocument(doc);
  152           }
  153         } else {					  // creating a new index
  154           Document doc = HTMLDocument.Document(file);
  155           System.out.println("adding " + doc.get("path"));
  156           writer.addDocument(doc);		  // add docs unconditionally
  157         }
  158       }
  159     }
  160   }

Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » demo » [javadoc | source]