Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » analysis » standard » [javadoc | source]
    1   package org.apache.lucene.analysis.standard;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import org.apache.lucene.analysis;
   21   
   22   import java.io.File;
   23   import java.io.IOException;
   24   import java.io.Reader;
   25   import java.util.Set;
   26   
   27   /**
   28    * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
   29    * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words.
   30    *
   31    * @version $Id: StandardAnalyzer.java 692634 2008-09-06 10:58:33Z mikemccand $
   32    */
   33   public class StandardAnalyzer extends Analyzer {
   34     private Set stopSet;
   35   
   36     /**
   37      * Specifies whether deprecated acronyms should be replaced with HOST type.
   38      * This is false by default to support backward compatibility.
   39      * 
   40      * @deprecated this should be removed in the next release (3.0).
   41      *
   42      * See https://issues.apache.org/jira/browse/LUCENE-1068
   43      */
   44     private boolean replaceInvalidAcronym = defaultReplaceInvalidAcronym;
   45   
   46     private static boolean defaultReplaceInvalidAcronym;
   47   
   48     // Default to true (fixed the bug), unless the system prop is set
   49     static {
   50       final String v = System.getProperty("org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym");
   51       if (v == null || v.equals("true"))
   52         defaultReplaceInvalidAcronym = true;
   53       else
   54         defaultReplaceInvalidAcronym = false;
   55     }
   56   
   57     /**
   58      *
   59      * @return true if new instances of StandardTokenizer will
   60      * replace mischaracterized acronyms
   61      *
   62      * See https://issues.apache.org/jira/browse/LUCENE-1068
   63      * @deprecated This will be removed (hardwired to true) in 3.0
   64      */
   65     public static boolean getDefaultReplaceInvalidAcronym() {
   66       return defaultReplaceInvalidAcronym;
   67     }
   68   
   69     /**
   70      *
   71      * @param replaceInvalidAcronym Set to true to have new
   72      * instances of StandardTokenizer replace mischaracterized
   73      * acronyms by default.  Set to false to preseve the
   74      * previous (before 2.4) buggy behavior.  Alternatively,
   75      * set the system property
   76      * org.apache.lucene.analysis.standard.StandardAnalyzer.replaceInvalidAcronym
   77      * to false.
   78      *
   79      * See https://issues.apache.org/jira/browse/LUCENE-1068
   80      * @deprecated This will be removed (hardwired to true) in 3.0
   81      */
   82     public static void setDefaultReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
   83       defaultReplaceInvalidAcronym = replaceInvalidAcronym;
   84     }
   85   
   86   
   87     /** An array containing some common English words that are usually not
   88     useful for searching. */
   89     public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
   90   
   91     /** Builds an analyzer with the default stop words ({@link #STOP_WORDS}). */
   92     public StandardAnalyzer() {
   93       this(STOP_WORDS);
   94     }
   95   
   96     /** Builds an analyzer with the given stop words. */
   97     public StandardAnalyzer(Set stopWords) {
   98       stopSet = stopWords;
   99     }
  100   
  101     /** Builds an analyzer with the given stop words. */
  102     public StandardAnalyzer(String[] stopWords) {
  103       stopSet = StopFilter.makeStopSet(stopWords);
  104     }
  105   
  106     /** Builds an analyzer with the stop words from the given file.
  107      * @see WordlistLoader#getWordSet(File)
  108      */
  109     public StandardAnalyzer(File stopwords) throws IOException {
  110       stopSet = WordlistLoader.getWordSet(stopwords);
  111     }
  112   
  113     /** Builds an analyzer with the stop words from the given reader.
  114      * @see WordlistLoader#getWordSet(Reader)
  115      */
  116     public StandardAnalyzer(Reader stopwords) throws IOException {
  117       stopSet = WordlistLoader.getWordSet(stopwords);
  118     }
  119   
  120     /**
  121      *
  122      * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
  123      *
  124      * See https://issues.apache.org/jira/browse/LUCENE-1068
  125      *
  126      * @deprecated Remove in 3.X and make true the only valid value
  127      */
  128     public StandardAnalyzer(boolean replaceInvalidAcronym) {
  129       this(STOP_WORDS);
  130       this.replaceInvalidAcronym = replaceInvalidAcronym;
  131     }
  132   
  133     /**
  134      *  @param stopwords The stopwords to use
  135      * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
  136      *
  137      * See https://issues.apache.org/jira/browse/LUCENE-1068
  138      *
  139      * @deprecated Remove in 3.X and make true the only valid value
  140      */
  141     public StandardAnalyzer(Reader stopwords, boolean replaceInvalidAcronym) throws IOException{
  142       this(stopwords);
  143       this.replaceInvalidAcronym = replaceInvalidAcronym;
  144     }
  145   
  146     /**
  147      * @param stopwords The stopwords to use
  148      * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
  149      *
  150      * See https://issues.apache.org/jira/browse/LUCENE-1068
  151      *
  152      * @deprecated Remove in 3.X and make true the only valid value
  153      */
  154     public StandardAnalyzer(File stopwords, boolean replaceInvalidAcronym) throws IOException{
  155       this(stopwords);
  156       this.replaceInvalidAcronym = replaceInvalidAcronym;
  157     }
  158   
  159     /**
  160      *
  161      * @param stopwords The stopwords to use
  162      * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
  163      *
  164      * See https://issues.apache.org/jira/browse/LUCENE-1068
  165      *
  166      * @deprecated Remove in 3.X and make true the only valid value
  167      */
  168     public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{
  169       this(stopwords);
  170       this.replaceInvalidAcronym = replaceInvalidAcronym;
  171     }
  172   
  173     /**
  174      * @param stopwords The stopwords to use
  175      * @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
  176      *
  177      * See https://issues.apache.org/jira/browse/LUCENE-1068
  178      *
  179      * @deprecated Remove in 3.X and make true the only valid value
  180      */
  181     public StandardAnalyzer(Set stopwords, boolean replaceInvalidAcronym) throws IOException{
  182       this(stopwords);
  183       this.replaceInvalidAcronym = replaceInvalidAcronym;
  184     }
  185   
  186     /** Constructs a {@link StandardTokenizer} filtered by a {@link
  187     StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
  188     public TokenStream tokenStream(String fieldName, Reader reader) {
  189       StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
  190       tokenStream.setMaxTokenLength(maxTokenLength);
  191       TokenStream result = new StandardFilter(tokenStream);
  192       result = new LowerCaseFilter(result);
  193       result = new StopFilter(result, stopSet);
  194       return result;
  195     }
  196   
  197     private static final class SavedStreams {
  198       StandardTokenizer tokenStream;
  199       TokenStream filteredTokenStream;
  200     }
  201   
  202     /** Default maximum allowed token length */
  203     public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
  204   
  205     private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
  206   
  207     /**
  208      * Set maximum allowed token length.  If a token is seen
  209      * that exceeds this length then it is discarded.  This
  210      * setting only takes effect the next time tokenStream or
  211      * reusableTokenStream is called.
  212      */
  213     public void setMaxTokenLength(int length) {
  214       maxTokenLength = length;
  215     }
  216       
  217     /**
  218      * @see #setMaxTokenLength
  219      */
  220     public int getMaxTokenLength() {
  221       return maxTokenLength;
  222     }
  223     
  224     public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
  225       SavedStreams streams = (SavedStreams) getPreviousTokenStream();
  226       if (streams == null) {
  227         streams = new SavedStreams();
  228         setPreviousTokenStream(streams);
  229         streams.tokenStream = new StandardTokenizer(reader);
  230         streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
  231         streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
  232         streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
  233       } else {
  234         streams.tokenStream.reset(reader);
  235       }
  236       streams.tokenStream.setMaxTokenLength(maxTokenLength);
  237       
  238       streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
  239   
  240       return streams.filteredTokenStream;
  241     }
  242   
  243     /**
  244      *
  245      * @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
  246      *
  247      * See https://issues.apache.org/jira/browse/LUCENE-1068
  248      * @deprecated This will be removed (hardwired to true) in 3.0
  249      */
  250     public boolean isReplaceInvalidAcronym() {
  251       return replaceInvalidAcronym;
  252     }
  253   
  254     /**
  255      *
  256      * @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
  257      *
  258      * See https://issues.apache.org/jira/browse/LUCENE-1068
  259      * @deprecated This will be removed (hardwired to true) in 3.0
  260      */
  261     public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
  262       this.replaceInvalidAcronym = replaceInvalidAcronym;
  263     }
  264   }

Save This Page
Home » lucene-2.4.1-src » org.apache » lucene » analysis » standard » [javadoc | source]