Save This Page
Home » apache-solr-1.4.0 » org.apache.solr » handler » [javadoc | source]
    1   /**
    2    * Licensed to the Apache Software Foundation (ASF) under one or more
    3    * contributor license agreements.  See the NOTICE file distributed with
    4    * this work for additional information regarding copyright ownership.
    5    * The ASF licenses this file to You under the Apache License, Version 2.0
    6    * (the "License"); you may not use this file except in compliance with
    7    * the License.  You may obtain a copy of the License at
    8    *
    9    *     http://www.apache.org/licenses/LICENSE-2.0
   10    *
   11    * Unless required by applicable law or agreed to in writing, software
   12    * distributed under the License is distributed on an "AS IS" BASIS,
   13    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   14    * See the License for the specific language governing permissions and
   15    * limitations under the License.
   16    */
   17   
   18   package org.apache.solr.handler;
   19   
   20   import java.io.IOException;
   21   import java.io.Reader;
   22   import java.net.MalformedURLException;
   23   import java.net.URL;
   24   import java.util.ArrayList;
   25   import java.util.Comparator;
   26   import java.util.HashSet;
   27   import java.util.Iterator;
   28   import java.util.List;
   29   import java.util.Map;
   30   import java.util.Set;
   31   import java.util.regex.Pattern;
   32   
   33   import org.apache.lucene.document.Document;
   34   import org.apache.lucene.index.IndexReader;
   35   import org.apache.lucene.index.Term;
   36   import org.apache.lucene.search.BooleanClause;
   37   import org.apache.lucene.search.BooleanQuery;
   38   import org.apache.lucene.search.Query;
   39   import org.apache.lucene.search.TermQuery;
   40   import org.apache.lucene.search.similar.MoreLikeThis;
   41   import org.apache.solr.common.SolrException;
   42   import org.apache.solr.common.params.CommonParams;
   43   import org.apache.solr.common.params.DisMaxParams;
   44   import org.apache.solr.common.params.FacetParams;
   45   import org.apache.solr.common.params.MoreLikeThisParams;
   46   import org.apache.solr.common.params.SolrParams;
   47   import org.apache.solr.common.params.MoreLikeThisParams.TermStyle;
   48   import org.apache.solr.common.util.ContentStream;
   49   import org.apache.solr.common.util.NamedList;
   50   import org.apache.solr.common.util.SimpleOrderedMap;
   51   import org.apache.solr.core.SolrCore;
   52   import org.apache.solr.request.SimpleFacets;
   53   import org.apache.solr.request.SolrQueryRequest;
   54   import org.apache.solr.request.SolrQueryResponse;
   55   import org.apache.solr.schema.IndexSchema;
   56   import org.apache.solr.schema.SchemaField;
   57   import org.apache.solr.search.DocIterator;
   58   import org.apache.solr.search.DocList;
   59   import org.apache.solr.search.DocListAndSet;
   60   import org.apache.solr.search.QueryParsing;
   61   import org.apache.solr.search.SolrIndexSearcher;
   62   import org.apache.solr.util.SolrPluginUtils;
   63   
   64   /**
   65    * Solr MoreLikeThis --
   66    * 
   67    * Return similar documents either based on a single document or based on posted text.
   68    * 
   69    * @since solr 1.3
   70    */
   71   public class MoreLikeThisHandler extends RequestHandlerBase  
   72   {
   73     // Pattern is thread safe -- TODO? share this with general 'fl' param
   74     private static final Pattern splitList = Pattern.compile(",| ");
   75     
   76     @Override
   77     public void init(NamedList args) {
   78       super.init(args);
   79     }
   80   
   81     @Override
   82     public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception 
   83     {
   84       SolrParams params = req.getParams();
   85       SolrIndexSearcher searcher = req.getSearcher();
   86       
   87       // Parse Required Params
   88       // This will either have a single Reader or valid query
   89       Reader reader = null;
   90       String q = params.get( CommonParams.Q );
   91       if( q == null || q.trim().length() <1 ) {
   92         Iterable<ContentStream> streams = req.getContentStreams();
   93         if( streams != null ) {
   94           Iterator<ContentStream> iter = streams.iterator();
   95           if( iter.hasNext() ) {
   96             reader = iter.next().getReader();
   97           }
   98           if( iter.hasNext() ) {
   99             throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, 
  100                 "MoreLikeThis does not support multiple ContentStreams" );
  101           }
  102         }
  103       }
  104   
  105       MoreLikeThisHelper mlt = new MoreLikeThisHelper( params, searcher );
  106       List<Query> filters = SolrPluginUtils.parseFilterQueries(req);
  107       
  108       // Hold on to the interesting terms if relevant
  109       TermStyle termStyle = TermStyle.get( params.get( MoreLikeThisParams.INTERESTING_TERMS ) );
  110       List<InterestingTerm> interesting = (termStyle == TermStyle.NONE )
  111         ? null : new ArrayList<InterestingTerm>( mlt.mlt.getMaxQueryTerms() );
  112       
  113       // What fields do we need to return
  114       String fl = params.get(CommonParams.FL);
  115       int flags = 0; 
  116       if (fl != null) {
  117         flags |= SolrPluginUtils.setReturnFields(fl, rsp);
  118       }
  119   
  120       int start = params.getInt( CommonParams.START, 0 );
  121       int rows  = params.getInt( CommonParams.ROWS, 10 );
  122       
  123       DocListAndSet mltDocs = null;
  124       
  125       // Find documents MoreLikeThis - either with a reader or a query
  126       //--------------------------------------------------------------------------------
  127       if( reader != null ) {
  128         mltDocs = mlt.getMoreLikeThis( reader, start, rows, filters, interesting, flags );
  129       }
  130       else if( q != null ) {
  131         // Matching options
  132         boolean includeMatch = params.getBool( MoreLikeThisParams.MATCH_INCLUDE, true );
  133         int matchOffset = params.getInt( MoreLikeThisParams.MATCH_OFFSET, 0 );
  134         // Find the base match  
  135         Query query = QueryParsing.parseQuery(q, params.get(CommonParams.DF), params, req.getSchema());
  136         DocList match = searcher.getDocList(query, null, null, matchOffset, 1, flags ); // only get the first one...
  137         if( includeMatch ) {
  138           rsp.add( "match", match );
  139         }
  140   
  141         // This is an iterator, but we only handle the first match
  142         DocIterator iterator = match.iterator();
  143         if( iterator.hasNext() ) {
  144           // do a MoreLikeThis query for each document in results
  145           int id = iterator.nextDoc();
  146           mltDocs = mlt.getMoreLikeThis( id, start, rows, filters, interesting, flags );
  147         }
  148       }
  149       else {
  150         throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, 
  151             "MoreLikeThis requires either a query (?q=) or text to find similar documents." );
  152       }
  153       if( mltDocs == null ) {
  154         mltDocs = new DocListAndSet(); // avoid NPE
  155       }
  156       rsp.add( "response", mltDocs.docList );
  157       
  158     
  159       if( interesting != null ) {
  160         if( termStyle == TermStyle.DETAILS ) {
  161           NamedList<Float> it = new NamedList<Float>();
  162           for( InterestingTerm t : interesting ) {
  163             it.add( t.term.toString(), t.boost );
  164           }
  165           rsp.add( "interestingTerms", it );
  166         }
  167         else {
  168           List<String> it = new ArrayList<String>( interesting.size() );
  169           for( InterestingTerm t : interesting ) {
  170             it.add( t.term.text());
  171           }
  172           rsp.add( "interestingTerms", it );
  173         }
  174       }
  175       
  176       // maybe facet the results
  177       if (params.getBool(FacetParams.FACET,false)) {
  178         if( mltDocs.docSet == null ) {
  179           rsp.add( "facet_counts", null );
  180         }
  181         else {
  182           SimpleFacets f = new SimpleFacets(req, mltDocs.docSet, params );
  183           rsp.add( "facet_counts", f.getFacetCounts() );
  184         }
  185       }
  186       
  187       // Copied from StandardRequestHandler... perhaps it should be added to doStandardDebug?
  188       try {
  189         NamedList<Object> dbg = SolrPluginUtils.doStandardDebug(req, q, mlt.mltquery, mltDocs.docList );
  190         if (null != dbg) {
  191           if (null != filters) {
  192             dbg.add("filter_queries",req.getParams().getParams(CommonParams.FQ));
  193             List<String> fqs = new ArrayList<String>(filters.size());
  194             for (Query fq : filters) {
  195               fqs.add(QueryParsing.toString(fq, req.getSchema()));
  196             }
  197             dbg.add("parsed_filter_queries",fqs);
  198           }
  199           rsp.add("debug", dbg);
  200         }
  201       } catch (Exception e) {
  202         SolrException.logOnce(SolrCore.log, "Exception during debug", e);
  203         rsp.add("exception_during_debug", SolrException.toStr(e));
  204       }
  205     }
  206     
  207     public static class InterestingTerm
  208     {
  209       public Term term;
  210       public float boost;
  211           
  212       public static Comparator<InterestingTerm> BOOST_ORDER = new Comparator<InterestingTerm>() {
  213         public int compare(InterestingTerm t1, InterestingTerm t2) {
  214           float d = t1.boost - t2.boost;
  215           if( d == 0 ) {
  216             return 0;
  217           }
  218           return (d>0)?1:-1;
  219         }
  220       };
  221     }
  222     
  223     /**
  224      * Helper class for MoreLikeThis that can be called from other request handlers
  225      */
  226     public static class MoreLikeThisHelper 
  227     { 
  228       final SolrIndexSearcher searcher;
  229       final MoreLikeThis mlt;
  230       final IndexReader reader;
  231       final SchemaField uniqueKeyField;
  232       final boolean needDocSet;
  233       Map<String,Float> boostFields;
  234       
  235       Query mltquery;  // expose this for debugging
  236       
  237       public MoreLikeThisHelper( SolrParams params, SolrIndexSearcher searcher )
  238       {
  239         this.searcher = searcher;
  240         this.reader = searcher.getReader();
  241         this.uniqueKeyField = searcher.getSchema().getUniqueKeyField();
  242         this.needDocSet = params.getBool(FacetParams.FACET,false);
  243         
  244         SolrParams required = params.required();
  245         String[] fields = splitList.split( required.get(MoreLikeThisParams.SIMILARITY_FIELDS) );
  246         if( fields.length < 1 ) {
  247           throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, 
  248               "MoreLikeThis requires at least one similarity field: "+MoreLikeThisParams.SIMILARITY_FIELDS );
  249         }
  250         
  251         this.mlt = new MoreLikeThis( reader ); // TODO -- after LUCENE-896, we can use , searcher.getSimilarity() );
  252         mlt.setFieldNames(fields);
  253         mlt.setAnalyzer( searcher.getSchema().getAnalyzer() );
  254         
  255         // configurable params
  256         mlt.setMinTermFreq(       params.getInt(MoreLikeThisParams.MIN_TERM_FREQ,         MoreLikeThis.DEFAULT_MIN_TERM_FREQ));
  257         mlt.setMinDocFreq(        params.getInt(MoreLikeThisParams.MIN_DOC_FREQ,          MoreLikeThis.DEFAULT_MIN_DOC_FREQ));
  258         mlt.setMinWordLen(        params.getInt(MoreLikeThisParams.MIN_WORD_LEN,          MoreLikeThis.DEFAULT_MIN_WORD_LENGTH));
  259         mlt.setMaxWordLen(        params.getInt(MoreLikeThisParams.MAX_WORD_LEN,          MoreLikeThis.DEFAULT_MAX_WORD_LENGTH));
  260         mlt.setMaxQueryTerms(     params.getInt(MoreLikeThisParams.MAX_QUERY_TERMS,       MoreLikeThis.DEFAULT_MAX_QUERY_TERMS));
  261         mlt.setMaxNumTokensParsed(params.getInt(MoreLikeThisParams.MAX_NUM_TOKENS_PARSED, MoreLikeThis.DEFAULT_MAX_NUM_TOKENS_PARSED));
  262         mlt.setBoost(            params.getBool(MoreLikeThisParams.BOOST, false ) );
  263         boostFields = SolrPluginUtils.parseFieldBoosts(params.getParams(MoreLikeThisParams.QF));
  264       }
  265       
  266       private void setBoosts(Query mltquery) {
  267         if (boostFields.size() > 0) {
  268           List clauses = ((BooleanQuery)mltquery).clauses();
  269           for( Object o : clauses ) {
  270             TermQuery q = (TermQuery)((BooleanClause)o).getQuery();
  271             Float b = this.boostFields.get(q.getTerm().field());
  272             if (b != null) {
  273               q.setBoost(b*q.getBoost());
  274             }
  275           }
  276         }
  277       }
  278       
  279       public DocListAndSet getMoreLikeThis( int id, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
  280       {
  281         Document doc = reader.document(id);
  282         mltquery = mlt.like(id);
  283         setBoosts(mltquery);
  284         if( terms != null ) {
  285           fillInterestingTermsFromMLTQuery( mltquery, terms );
  286         }
  287   
  288         // exclude current document from results
  289         BooleanQuery mltQuery = new BooleanQuery();
  290         mltQuery.add(mltquery, BooleanClause.Occur.MUST);
  291         mltQuery.add(
  292             new TermQuery(new Term(uniqueKeyField.getName(), uniqueKeyField.getType().storedToIndexed(doc.getFieldable(uniqueKeyField.getName())))), 
  293               BooleanClause.Occur.MUST_NOT);
  294         
  295         DocListAndSet results = new DocListAndSet();
  296         if (this.needDocSet) {
  297           results = searcher.getDocListAndSet(mltQuery, filters, null, start, rows, flags);
  298         } else {
  299           results.docList = searcher.getDocList(mltQuery, filters, null, start, rows, flags);
  300         }
  301         return results;
  302       }
  303   
  304       public DocListAndSet getMoreLikeThis( Reader reader, int start, int rows, List<Query> filters, List<InterestingTerm> terms, int flags ) throws IOException
  305       {
  306         mltquery = mlt.like(reader);
  307         setBoosts(mltquery);
  308         if( terms != null ) {
  309           fillInterestingTermsFromMLTQuery( mltquery, terms );
  310         }
  311         DocListAndSet results = new DocListAndSet();
  312         if (this.needDocSet) {
  313           results = searcher.getDocListAndSet(mltquery, filters, null, start, rows, flags);
  314         } else {
  315           results.docList = searcher.getDocList(mltquery, filters, null, start, rows, flags);
  316         }
  317         return results;
  318       }
  319       
  320       public NamedList<DocList> getMoreLikeThese( DocList docs, int rows, int flags ) throws IOException
  321       {
  322         IndexSchema schema = searcher.getSchema();
  323         NamedList<DocList> mlt = new SimpleOrderedMap<DocList>();
  324         DocIterator iterator = docs.iterator();
  325         while( iterator.hasNext() ) {
  326           int id = iterator.nextDoc();
  327           
  328           DocListAndSet sim = getMoreLikeThis( id, 0, rows, null, null, flags );
  329           String name = schema.printableUniqueKey( reader.document( id ) );
  330   
  331           mlt.add(name, sim.docList);
  332         }
  333         return mlt;
  334       }
  335       
  336       private void fillInterestingTermsFromMLTQuery( Query query, List<InterestingTerm> terms )
  337       { 
  338         List clauses = ((BooleanQuery)mltquery).clauses();
  339         for( Object o : clauses ) {
  340           TermQuery q = (TermQuery)((BooleanClause)o).getQuery();
  341           InterestingTerm it = new InterestingTerm();
  342           it.boost = q.getBoost();
  343           it.term = q.getTerm();
  344           terms.add( it );
  345         } 
  346         // alternatively we could use
  347         // mltquery.extractTerms( terms );
  348       }
  349       
  350       public MoreLikeThis getMoreLikeThis()
  351       {
  352         return mlt;
  353       }
  354     }
  355     
  356     
  357     //////////////////////// SolrInfoMBeans methods //////////////////////
  358   
  359     @Override
  360     public String getVersion() {
  361       return "$Revision: 801768 $";
  362     }
  363   
  364     @Override
  365     public String getDescription() {
  366       return "Solr MoreLikeThis";
  367     }
  368   
  369     @Override
  370     public String getSourceId() {
  371       return "$Id: MoreLikeThisHandler.java 801768 2009-08-06 18:47:16Z yonik $";
  372     }
  373   
  374     @Override
  375     public String getSource() {
  376       return "$URL: https://svn.apache.org/repos/asf/lucene/solr/branches/branch-1.4/src/java/org/apache/solr/handler/MoreLikeThisHandler.java $";
  377     }
  378   
  379     @Override
  380     public URL[] getDocs() {
  381       try {
  382         return new URL[] { new URL("http://wiki.apache.org/solr/MoreLikeThis") };
  383       }
  384       catch( MalformedURLException ex ) { return null; }
  385     }
  386   }

Save This Page
Home » apache-solr-1.4.0 » org.apache.solr » handler » [javadoc | source]