Custom stop-word analyzer in Lucene 4.x
package com.kkrgwbj.util;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

import java.io.Reader;
import java.util.Set;

/**
 * Custom stop-word analyzer.
 * Created by lvbingyang on 2015/11/25.
 */
public class MyStopAnalyzer extends Analyzer {

    private Set<Object> stops;

    public MyStopAnalyzer(String[] sws) {
        // Build a case-insensitive stop set from the given words
        stops = StopFilter.makeStopSet(Version.LUCENE_45, sws, true);
        // Add Lucene's built-in English stop words as well
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }

    /**
     * Default constructor: only the built-in English stop words.
     */
    public MyStopAnalyzer() {
        // Wrap the built-in set in a case-insensitive CharArraySet, because
        // StopFilter runs before LowerCaseFilter in this analyzer
        stops = new CharArraySet(Version.LUCENE_45, StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // The tokenizer consumes the reader and splits the text into tokens
        Tokenizer tokenizer = new LetterTokenizer(Version.LUCENE_45, reader);
        // Copy the stop words into a CharArraySet for the StopFilter
        CharArraySet charArraySet = CharArraySet.copy(Version.LUCENE_45, stops);
        // The filter chain yields the final token stream: StopFilter drops the
        // stop words (case-insensitively, since the set was built with
        // ignoreCase = true), then LowerCaseFilter lower-cases what remains
        TokenStream tokenStream = new LowerCaseFilter(Version.LUCENE_45,
                new StopFilter(Version.LUCENE_45, tokenizer, charArraySet));
        return new TokenStreamComponents(tokenizer, tokenStream);
    }
}
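The JUnit test below relies on an AnalyzerUtils.displayToken helper that is not shown in the post. Here is a minimal sketch of what it presumably does, assuming it simply walks the token stream and prints each term; the class and method names come from the test, while the field name "content" is an arbitrary placeholder:

package com.kkrgwbj.util;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.io.StringReader;

public class AnalyzerUtils {

    /**
     * Print every token the analyzer produces for the given text.
     */
    public static void displayToken(String text, Analyzer analyzer) {
        try {
            // The field name does not matter for a plain Analyzer like ours
            TokenStream stream = analyzer.tokenStream("content", new StringReader(text));
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("[" + term.toString() + "] ");
            }
            System.out.println();
            stream.end();
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}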
The JUnit test:
@Test
public void test2() {
    Analyzer analyzer = new MyStopAnalyzer(new String[]{"I", "you", "hate"});
    Analyzer analyzer1 = new StopAnalyzer(Version.LUCENE_45);
    String txt = "i love you,i hate you";
    // The custom stop-word analyzer
    AnalyzerUtils.displayToken(txt, analyzer);
    // The default stop-word analyzer
    AnalyzerUtils.displayToken(txt, analyzer1);
}
Here i, you, and hate are all treated as stop words; running the test shows the difference between the two analyzers.
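Reasoning from the two stop sets (and using the bracketed format of the helper sketched above), the output should look roughly like this: the custom analyzer keeps only love, while the default StopAnalyzer emits every token, because Lucene's built-in English stop list contains none of i, you, or hate:

[love]
[i] [love] [you] [i] [hate] [you]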