Save This Page
Home » nutch-1.0 » org.apache.nutch » searcher » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.nutch.searcher;
   19   
   20   import java.io.File;
   21   import java.io.IOException;
   22   import java.util.List;
   23   
   24   import org.apache.hadoop.conf.Configuration;
   25   import org.apache.hadoop.fs.FileSystem;
   26   import org.apache.hadoop.fs.Path;
   27   import org.apache.hadoop.io.FloatWritable;
   28   import org.apache.hadoop.io.IntWritable;
   29   import org.apache.hadoop.io.Text;
   30   import org.apache.hadoop.io.WritableComparable;
   31   import org.apache.lucene.document.Document;
   32   import org.apache.lucene.document.Field;
   33   import org.apache.lucene.index.IndexReader;
   34   import org.apache.lucene.index.MultiReader;
   35   import org.apache.lucene.search.FieldCache;
   36   import org.apache.lucene.search.FieldDoc;
   37   import org.apache.lucene.search.ScoreDoc;
   38   import org.apache.lucene.search.TopDocs;
   39   import org.apache.lucene.store.Directory;
   40   import org.apache.lucene.store.FSDirectory;
   41   import org.apache.nutch.indexer.FsDirectory;
   42   import org.apache.nutch.indexer.NutchSimilarity;
   43   
   44   /** Implements {@link Searcher} and {@link HitDetailer} for either a single
   45    * merged index, or a set of indexes. */
   46   public class IndexSearcher implements Searcher, HitDetailer {
   47   
   48     private org.apache.lucene.search.Searcher luceneSearcher;
   49     private org.apache.lucene.index.IndexReader reader;
   50     private LuceneQueryOptimizer optimizer;
   51     private FileSystem fs;
   52     private Configuration conf;
   53     private QueryFilters queryFilters;
   54   
   55     /** Construct given a number of indexes. */
   56     public IndexSearcher(Path[] indexDirs, Configuration conf) throws IOException {
   57       IndexReader[] readers = new IndexReader[indexDirs.length];
   58       this.conf = conf;
   59       this.fs = FileSystem.get(conf);
   60       for (int i = 0; i < indexDirs.length; i++) {
   61         readers[i] = IndexReader.open(getDirectory(indexDirs[i]));
   62       }
   63       init(new MultiReader(readers), conf);
   64     }
   65   
   66     /** Construct given a single merged index. */
   67     public IndexSearcher(Path index,  Configuration conf)
   68       throws IOException {
   69       this.conf = conf;
   70       this.fs = FileSystem.get(conf);
   71       init(IndexReader.open(getDirectory(index)), conf);
   72     }
   73   
   74     private void init(IndexReader reader, Configuration conf) throws IOException {
   75       this.reader = reader;
   76       this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader);
   77       this.luceneSearcher.setSimilarity(new NutchSimilarity());
   78       this.optimizer = new LuceneQueryOptimizer(conf);
   79       this.queryFilters = new QueryFilters(conf);
   80     }
   81   
   82     private Directory getDirectory(Path file) throws IOException {
   83       if ("file".equals(this.fs.getUri().getScheme())) {
   84         Path qualified = file.makeQualified(FileSystem.getLocal(conf));
   85         File fsLocal = new File(qualified.toUri());
   86         return FSDirectory.getDirectory(fsLocal.getAbsolutePath());
   87       } else {
   88         return new FsDirectory(this.fs, file, false, this.conf);
   89       }
   90     }
   91   
   92     public Hits search(Query query, int numHits,
   93                        String dedupField, String sortField, boolean reverse)
   94   
   95       throws IOException {
   96       org.apache.lucene.search.BooleanQuery luceneQuery =
   97         this.queryFilters.filter(query);
   98       return translateHits
   99         (optimizer.optimize(luceneQuery, luceneSearcher, numHits,
  100                             sortField, reverse),
  101          dedupField, sortField);
  102     }
  103   
  104     public String getExplanation(Query query, Hit hit) throws IOException {
  105       return luceneSearcher.explain(this.queryFilters.filter(query),
  106           Integer.valueOf(hit.getUniqueKey())).toHtml();
  107     }
  108   
  109     public HitDetails getDetails(Hit hit) throws IOException {
  110   
  111       Document doc = luceneSearcher.doc(Integer.valueOf(hit.getUniqueKey()));
  112   
  113       List docFields = doc.getFields();
  114       String[] fields = new String[docFields.size()];
  115       String[] values = new String[docFields.size()];
  116       for (int i = 0; i < docFields.size(); i++) {
  117         Field field = (Field)docFields.get(i);
  118         fields[i] = field.name();
  119         values[i] = field.stringValue();
  120       }
  121   
  122       return new HitDetails(fields, values);
  123     }
  124   
  125     public HitDetails[] getDetails(Hit[] hits) throws IOException {
  126       HitDetails[] results = new HitDetails[hits.length];
  127       for (int i = 0; i < hits.length; i++)
  128         results[i] = getDetails(hits[i]);
  129       return results;
  130     }
  131   
  132     private Hits translateHits(TopDocs topDocs,
  133                                String dedupField, String sortField)
  134       throws IOException {
  135   
  136       String[] dedupValues = null;
  137       if (dedupField != null) 
  138         dedupValues = FieldCache.DEFAULT.getStrings(reader, dedupField);
  139   
  140       ScoreDoc[] scoreDocs = topDocs.scoreDocs;
  141       int length = scoreDocs.length;
  142       Hit[] hits = new Hit[length];
  143       for (int i = 0; i < length; i++) {
  144         
  145         int doc = scoreDocs[i].doc;
  146         
  147         WritableComparable sortValue;               // convert value to writable
  148         if (sortField == null) {
  149           sortValue = new FloatWritable(scoreDocs[i].score);
  150         } else {
  151           Object raw = ((FieldDoc)scoreDocs[i]).fields[0];
  152           if (raw instanceof Integer) {
  153             sortValue = new IntWritable(((Integer)raw).intValue());
  154           } else if (raw instanceof Float) {
  155             sortValue = new FloatWritable(((Float)raw).floatValue());
  156           } else if (raw instanceof String) {
  157             sortValue = new Text((String)raw);
  158           } else {
  159             throw new RuntimeException("Unknown sort value type!");
  160           }
  161         }
  162   
  163         String dedupValue = dedupValues == null ? null : dedupValues[doc];
  164   
  165         hits[i] = new Hit(Integer.toString(doc), sortValue, dedupValue);
  166       }
  167       return new Hits(topDocs.totalHits, hits);
  168     }
  169     
  170     public void close() throws IOException {
  171       if (luceneSearcher != null) { luceneSearcher.close(); }
  172       if (reader != null) { reader.close(); }
  173     }
  174   
  175   }

Save This Page
Home » nutch-1.0 » org.apache.nutch » searcher » [javadoc | source]