Lucene 4.x: a custom stop-word analyzer


package com.kkrgwbj.util;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

import java.io.Reader;
import java.util.HashSet;
import java.util.Set;

/**
 * Custom stop-word analyzer.
 * Created by lvbingyang on 2015/11/25 0025.
 */
public class MyStopAnalyzer extends Analyzer {
    private Set stops;

    public MyStopAnalyzer(String[] sws) {
        // Turn the string array into a stop-word set (ignoring case)
        stops = StopFilter.makeStopSet(Version.LUCENE_45, sws, true);
        // Also add the default English stop words
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    /**
     * Default constructor: only the built-in English stop words are used.
     */
    public MyStopAnalyzer() {
        stops = new HashSet<>();
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // The tokenizer receives the reader and breaks the text into tokens
        Tokenizer tokenizer = new LetterTokenizer(Version.LUCENE_45, reader);
        // Build the CharArraySet of stop words
        CharArraySet charArraySet = CharArraySet.copy(Version.LUCENE_45, stops);
        // Chain the filters onto the tokenizer: first drop the stop words,
        // then lower-case whatever remains
        TokenStream tokenStream = new LowerCaseFilter(Version.LUCENE_45,
                new StopFilter(Version.LUCENE_45, tokenizer, charArraySet));
        return new TokenStreamComponents(tokenizer, tokenStream);
    }
}
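The analyzer can then be handed to an IndexWriter like any built-in analyzer. Below is a minimal sketch (not from the original post), assuming a Lucene 4.5 classpath; the class name, the in-memory RAMDirectory and the "content" field are only for illustration:

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import java.io.IOException;

public class IndexWithMyStopAnalyzer {
    public static void main(String[] args) throws IOException {
        // Index in memory with the custom analyzer; "I", "you" and "hate"
        // are dropped from the indexed tokens of the "content" field.
        Directory dir = new RAMDirectory();
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45,
                new MyStopAnalyzer(new String[]{"I", "you", "hate"}));
        IndexWriter writer = new IndexWriter(dir, config);

        Document doc = new Document();
        doc.add(new TextField("content", "i love you,i hate you", Field.Store.YES));
        writer.addDocument(doc);
        writer.close();
    }
}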

JUnit test:

@Test
public void test2() {
    Analyzer analyzer = new MyStopAnalyzer(new String[]{"I", "you", "hate"});
    Analyzer analyzer1 = new StopAnalyzer(Version.LUCENE_45);
    String txt = "i love you,i hate you";
    // Custom stop-word analyzer
    AnalyzerUtils.displayToken(txt, analyzer);
    // Default stop-word analyzer
    AnalyzerUtils.displayToken(txt, analyzer1);
}
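The test relies on an AnalyzerUtils.displayToken helper that the post does not show. A minimal sketch of such a helper, assuming it simply prints every token the analyzer emits, could look like this:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.io.StringReader;

public class AnalyzerUtils {
    public static void displayToken(String text, Analyzer analyzer) {
        try {
            // Run the text through the analyzer and print each surviving token
            TokenStream stream = analyzer.tokenStream("content", new StringReader(text));
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("[" + term.toString() + "] ");
            }
            stream.end();
            stream.close();
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}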



Here the words i, you and hate are stopped. Running the test should show that the custom analyzer emits only the token love, while the default StopAnalyzer keeps every token, since none of i, you and hate appear in ENGLISH_STOP_WORDS_SET.

Source: http://my.oschina.net/kkrgwbj/blog/535592