Lucene 3.6 中文分词、分页查询、高亮显示等

jopen 12年前

1、准备工作

下载lucene 3.6.1 : http://lucene.apache.org/ 

下载中文分词IK Analyzer: http://code.google.com/p/ik-analyzer/downloads/list (注意下载的是IK Analyzer 2012_u5_source.zip,其他版本有bug) 

下载solr 3.6.1:  http://lucene.apache.org/solr/(编译IK Analyzer时需引用包) 

OK,将lucene 、solr 相关包(lucene-core-3.6.1.jar、lucene-highlighter-3.6.1.jar、lucene-analyzers-3.6.1.jar、apache-solr-core-3.6.1.jar、apache-solr-solrj-3.6.1.jar)拷贝到项目lib下,IK源码置于项目src下。

2、从Oracle数据库中取数据创建索引(使用IK分词)

package lucene.util;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.Version;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.wltea.analyzer.lucene.IKAnalyzer;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;

import modules.gk.Gk_info;
import modules.gk.Gk_infoSub;
import web.sys.Globals;
import web.db.DBConnector;
import web.db.ObjectCtl;
import web.util.StringUtil;

// Wizzer.cn
public class LuceneIndex {
    IndexWriter writer = null;
    FSDirectory dir = null;
    boolean create = true;

    /**
     * Builds (or incrementally updates) the Lucene index from table TABLEA.
     * Un-indexed rows (STATE=1 AND SSTAG<>1) are read page by page, each row
     * is turned into a Document (title/describes/keywords analyzed with IK,
     * indexno/pdate stored un-analyzed) and the row is then flagged SSTAG=1.
     */
    public void init() {
        long startMillis = System.currentTimeMillis();
        System.out.println("[Lucene 开始执行:" + new Date() + "]");
        Connection con = DBConnector.getconecttion(); // obtain a DB connection
        try {
            final File docDir = new File(Globals.SYS_COM_CONFIG.get("sys.index.path").toString()); // e.g. E:\lucene
            if (!docDir.exists()) {
                docDir.mkdirs();
            }
            String cr = Globals.SYS_COM_CONFIG.get("sys.index.create").toString(); // "true" or "false"
            if ("false".equalsIgnoreCase(cr)) {
                create = false;
            }
            // BUGFIX: assign the FIELDS instead of declaring shadowing locals;
            // otherwise the finally block below sees null and never closes the
            // writer nor releases the index write lock.
            dir = FSDirectory.open(docDir);
//            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
            Analyzer analyzer = new IKAnalyzer(true); // IK Chinese analyzer, smart mode
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            if (create) {
                // Create a new index in the directory, removing any previously indexed documents:
                iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            } else {
                // Add new documents to an existing index:
                iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            }
            writer = new IndexWriter(dir, iwc);
            String sql = "SELECT indexno,title,describes,pdate,keywords FROM TABLEA WHERE STATE=1 AND SSTAG<>1 ";
            int rowCount = ObjectCtl.getRowCount(con, sql);
            int pageSize = StringUtil.StringToInt(Globals.SYS_COM_CONFIG.get("sys.index.size").toString()); // rows per batch
            int pages = (rowCount - 1) / pageSize + 1; // total number of batches
            ArrayList list = null;
            Gk_infoSub gk = null;
            for (int i = 1; i < pages + 1; i++) {
                long batchStart = System.currentTimeMillis();
                list = ObjectCtl.listPage(con, sql, i, pageSize, new Gk_infoSub());
                for (int j = 0; j < list.size(); j++) {
                    gk = (Gk_infoSub) list.get(j);
                    Document doc = new Document();
                    doc.add(new Field("indexno", StringUtil.null2String(gk.getIndexno()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); // primary key: not analyzed
                    doc.add(new Field("title", StringUtil.null2String(gk.getTitle()), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("describes", StringUtil.null2String(gk.getDescribes()), Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("pdate", StringUtil.null2String(gk.getPdate()), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); // date: not analyzed
                    doc.add(new Field("keywords", StringUtil.null2String(gk.getKeywords()), Field.Store.YES, Field.Index.ANALYZED));
                    writer.addDocument(doc);
                    // BUGFIX: mark the row as indexed with a PreparedStatement
                    // instead of concatenating indexno into the SQL string
                    // (SQL-injection / quoting hazard).
                    PreparedStatement ps = con.prepareStatement("UPDATE TABLEA SET SSTAG=1 WHERE indexno=?");
                    try {
                        ps.setString(1, StringUtil.null2String(gk.getIndexno()));
                        ps.executeUpdate();
                    } finally {
                        ps.close();
                    }
                }
                long batchEnd = System.currentTimeMillis();
                long batchCost = batchEnd - batchStart;
                System.out.println("[Lucene " + rowCount + "条," + pages + "页,第" + i + "页花费时间:" + batchCost + "毫秒]");
            }
            writer.commit();

        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            DBConnector.freecon(con); // release the DB connection
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (CorruptIndexException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                try {
                    if (dir != null && IndexWriter.isLocked(dir)) {
                        IndexWriter.unlock(dir); // make sure the write lock is released
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        long endMillis = System.currentTimeMillis();
        long totalCost = endMillis - startMillis;
        System.out.println("[Lucene 执行完毕,花费时间:" + totalCost + "毫秒,完成时间:" + new Date() + "]");
    }
}
 

3、单字段查询以及多字段分页查询高亮显示

 
package lucene.util;

import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.search.*;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.Version;
import modules.gk.Gk_infoSub;

import java.util.ArrayList;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.Constructor;

import web.util.StringUtil;
import web.sys.Globals;
import org.wltea.analyzer.lucene.IKAnalyzer;

// Wizzer.cn
public class LuceneQuery {
    // BUGFIX: was "private static" — every new LuceneQuery() silently
    // repointed the path for ALL instances. The path is per-instance state.
    private String indexPath;   // directory the index lives in
    private int rowCount;       // total hits of the last query
    private int pages;          // total pages of the last query
    private int currentPage;    // page returned by the last query
    private int pageSize;       // rows per page of the last query

    public LuceneQuery() {
        this.indexPath = Globals.SYS_COM_CONFIG.get("sys.index.path").toString();
    }

    public int getRowCount() {
        return rowCount;
    }

    public int getPages() {
        return pages;
    }

    public int getPageSize() {
        return pageSize;
    }

    public int getCurrentPage() {
        return currentPage;
    }

    /**
     * Single-field search: parses keyWord against "title" (terms ANDed
     * together), pages the hits and returns highlighted Gk_infoSub beans.
     */
    public ArrayList queryIndexTitle(String keyWord, int curpage, int pageSize) {
        ArrayList list = new ArrayList();
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            if (curpage <= 0) {
                curpage = 1;
            }
            if (pageSize <= 0) {
                pageSize = 20;
            }
            this.pageSize = pageSize;       // rows per page
            this.currentPage = curpage;     // current page
            int start = (curpage - 1) * pageSize; // index of the first hit of this page
            Directory dir = FSDirectory.open(new File(indexPath));
            reader = IndexReader.open(dir);
            searcher = new IndexSearcher(reader);
            Analyzer analyzer = new IKAnalyzer(true);
            QueryParser queryParser = new QueryParser(Version.LUCENE_36, "title", analyzer);
            queryParser.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = queryParser.parse(keyWord);
            int hm = start + pageSize; // collect only up to the end of this page
            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
            searcher.search(query, res);

            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));
            this.rowCount = res.getTotalHits();
            this.pages = (rowCount - 1) / pageSize + 1; // total pages
            TopDocs tds = res.topDocs(start, pageSize);
            ScoreDoc[] sd = tds.scoreDocs;
            for (int i = 0; i < sd.length; i++) {
                Document hitDoc = reader.document(sd[i].doc);
                list.add(createObj(hitDoc, analyzer, highlighter));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // BUGFIX: reader/searcher were never closed — leaked file handles
            // on every query.
            try {
                if (searcher != null) {
                    searcher.close();
                }
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return list;
    }

    /**
     * Multi-field search over title/describes/keywords combining three inputs:
     * allkeyword (required, KeywordAnalyzer), onekeyword (required, IK),
     * nokeyword (excluded). SHOULD flags mean "match in any of the three
     * fields"; MUST / MUST_NOT combine the three sub-queries.
     */
    public ArrayList queryIndexFields(String allkeyword, String onekeyword, String nokeyword, int curpage, int pageSize) {
        ArrayList list = new ArrayList();
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try {
            if (curpage <= 0) {
                curpage = 1;
            }
            if (pageSize <= 0) {
                pageSize = 20;
            }
            this.pageSize = pageSize;       // rows per page
            this.currentPage = curpage;     // current page
            int start = (curpage - 1) * pageSize;
            Directory dir = FSDirectory.open(new File(indexPath));
            reader = IndexReader.open(dir);
            searcher = new IndexSearcher(reader);
            BooleanQuery bQuery = new BooleanQuery(); // combined query
            if (!"".equals(allkeyword)) { // must contain all the keywords
                KeywordAnalyzer analyzer = new KeywordAnalyzer();
                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; // any of the 3 fields
                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, allkeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
                bQuery.add(query, BooleanClause.Occur.MUST); // AND
            }
            if (!"".equals(onekeyword)) { // may contain any of the keywords
                Analyzer analyzer = new IKAnalyzer(true);
                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; // any of the 3 fields
                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, onekeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
                bQuery.add(query, BooleanClause.Occur.MUST); // AND
            }
            if (!"".equals(nokeyword)) { // excluded keywords
                Analyzer analyzer = new IKAnalyzer(true);
                BooleanClause.Occur[] flags = {BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD}; // any of the 3 fields
                Query query = MultiFieldQueryParser.parse(Version.LUCENE_36, nokeyword, new String[]{"title", "describes", "keywords"}, flags, analyzer);
                bQuery.add(query, BooleanClause.Occur.MUST_NOT); // NOT
            }
            int hm = start + pageSize;
            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
            searcher.search(bQuery, res);

            SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(bQuery));
            this.rowCount = res.getTotalHits();
            this.pages = (rowCount - 1) / pageSize + 1; // total pages
            System.out.println("rowCount:" + rowCount);
            TopDocs tds = res.topDocs(start, pageSize);
            ScoreDoc[] sd = tds.scoreDocs;
            // NOTE(review): highlighting here uses new IKAnalyzer() (fine-grained
            // mode) while the queries above use IKAnalyzer(true); kept as-is to
            // preserve behavior — confirm whether this asymmetry is intended.
            Analyzer analyzer = new IKAnalyzer();
            for (int i = 0; i < sd.length; i++) {
                Document hitDoc = reader.document(sd[i].doc);
                list.add(createObj(hitDoc, analyzer, highlighter));
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // BUGFIX: close the leaked reader/searcher (see queryIndexTitle).
            try {
                if (searcher != null) {
                    searcher.close();
                }
                if (reader != null) {
                    reader.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return list;
    }

    /**
     * Builds a Gk_infoSub from a hit, replacing title/keywords/describes with
     * a highlighted fragment whenever the highlighter produces one.
     */
    private synchronized static Object createObj(Document doc, Analyzer analyzer, Highlighter highlighter) {
        Gk_infoSub gk = new Gk_infoSub();
        try {
            if (doc != null) {
                gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
                gk.setPdate(StringUtil.null2String(doc.get("pdate")));
                String title = StringUtil.null2String(doc.get("title"));
                gk.setTitle(title);
                if (!"".equals(title)) {
                    highlighter.setTextFragmenter(new SimpleFragmenter(title.length()));
                    TokenStream tk = analyzer.tokenStream("title", new StringReader(title));
                    String htext = StringUtil.null2String(highlighter.getBestFragment(tk, title));
                    if (!"".equals(htext)) {
                        gk.setTitle(htext);
                    }
                }
                String keywords = StringUtil.null2String(doc.get("keywords"));
                gk.setKeywords(keywords);
                if (!"".equals(keywords)) {
                    highlighter.setTextFragmenter(new SimpleFragmenter(keywords.length()));
                    TokenStream tk = analyzer.tokenStream("keywords", new StringReader(keywords));
                    String htext = StringUtil.null2String(highlighter.getBestFragment(tk, keywords));
                    if (!"".equals(htext)) {
                        gk.setKeywords(htext);
                    }
                }
                String describes = StringUtil.null2String(doc.get("describes"));
                gk.setDescribes(describes);
                if (!"".equals(describes)) {
                    highlighter.setTextFragmenter(new SimpleFragmenter(describes.length()));
                    // BUGFIX: the token stream was requested for field "keywords"
                    // here although the text being highlighted is "describes".
                    TokenStream tk = analyzer.tokenStream("describes", new StringReader(describes));
                    String htext = StringUtil.null2String(highlighter.getBestFragment(tk, describes));
                    if (!"".equals(htext)) {
                        gk.setDescribes(htext);
                    }
                }
            }
            return gk;
            // NOTE: the original "finally { gk = null; }" was dead code — a
            // finally block cannot change an already-returned reference.
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Builds a plain (non-highlighted) Gk_infoSub from a hit document.
     */
    private synchronized static Object createObj(Document doc) {
        Gk_infoSub gk = new Gk_infoSub();
        try {
            if (doc != null) {
                gk.setIndexno(StringUtil.null2String(doc.get("indexno")));
                gk.setPdate(StringUtil.null2String(doc.get("pdate")));
                gk.setTitle(StringUtil.null2String(doc.get("title")));
                gk.setKeywords(StringUtil.null2String(doc.get("keywords")));
                gk.setDescribes(StringUtil.null2String(doc.get("describes")));
            }
            return gk;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}
  单字段查询:
        long a = System.currentTimeMillis();          try {              int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));              int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));              String title = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("title")));              LuceneQuery lu = new LuceneQuery();              form.addResult("list", lu.queryIndexTitle(title, curpage, pagesize));              form.addResult("curPage", lu.getCurrentPage());              form.addResult("pageSize", lu.getPageSize());              form.addResult("rowCount", lu.getRowCount());              form.addResult("pageCount", lu.getPages());          } catch (Exception e) {              e.printStackTrace();          }          long b = System.currentTimeMillis();          long c = b - a;          System.out.println("[搜索信息花费时间:" + c + "毫秒]");
多字段查询:
        long a = System.currentTimeMillis();          try {              int curpage = StringUtil.StringToInt(StringUtil.null2String(form.get("curpage")));              int pagesize = StringUtil.StringToInt(StringUtil.null2String(form.get("pagesize")));              String allkeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("allkeyword")));              String onekeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("onekeyword")));              String nokeyword = StringUtil.replaceLuceneStr(StringUtil.null2String(form.get("nokeyword")));              LuceneQuery lu = new LuceneQuery();              form.addResult("list", lu.queryIndexFields(allkeyword,onekeyword,nokeyword, curpage, pagesize));              form.addResult("curPage", lu.getCurrentPage());              form.addResult("pageSize", lu.getPageSize());              form.addResult("rowCount", lu.getRowCount());              form.addResult("pageCount", lu.getPages());          } catch (Exception e) {              e.printStackTrace();          }          long b = System.currentTimeMillis();          long c = b - a;          System.out.println("[高级检索花费时间:" + c + "毫秒]");

4、Lucene通配符查询

            BooleanQuery bQuery = new BooleanQuery();  //组合查询              if (!"".equals(title)) {                  WildcardQuery w1 = new WildcardQuery(new Term("title", title+ "*"));                    bQuery.add(w1, BooleanClause.Occur.MUST);  //AND              }              int hm = start + pageSize;              TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);              searcher.search(bQuery, res);
 

5、Lucene嵌套查询

实现SQL:(unitid like 'unitid%'  and idml like 'id2%') or (tounitid like 'unitid%' and tomlid like 'id2%' and tostate=1)
                    BooleanQuery bQuery = new BooleanQuery();                      BooleanQuery b1 = new BooleanQuery();                      WildcardQuery w1 = new WildcardQuery(new Term("unitid", unitid + "*"));                      WildcardQuery w2 = new WildcardQuery(new Term("idml", id2 + "*"));                      b1.add(w1, BooleanClause.Occur.MUST);//AND                      b1.add(w2, BooleanClause.Occur.MUST);//AND                      bQuery.add(b1, BooleanClause.Occur.SHOULD);//OR                      BooleanQuery b2 = new BooleanQuery();                      WildcardQuery w3 = new WildcardQuery(new Term("tounitid", unitid + "*"));                      WildcardQuery w4 = new WildcardQuery(new Term("tomlid", id2 + "*"));                      WildcardQuery w5 = new WildcardQuery(new Term("tostate", "1"));                      b2.add(w3, BooleanClause.Occur.MUST);//AND                      b2.add(w4, BooleanClause.Occur.MUST);//AND                      b2.add(w5, BooleanClause.Occur.MUST);//AND                      bQuery.add(b2, BooleanClause.Occur.SHOULD);//OR

6、Lucene先根据时间排序后分页

            int hm = start + pageSize;              Sort sort = new Sort(new SortField("pdate", SortField.STRING, true));              TopScoreDocCollector res = TopScoreDocCollector.create(pageSize, false);              searcher.search(bQuery, res);              this.rowCount = res.getTotalHits();              this.pages = (rowCount - 1) / pageSize + 1; //计算总页数              TopDocs tds =searcher.search(bQuery,rowCount,sort);// res.topDocs(start, pageSize);              ScoreDoc[] sd = tds.scoreDocs;              System.out.println("rowCount:" + rowCount);              int i=0;              for (ScoreDoc scoreDoc : sd) {                  i++;                  if(i<start){                      continue;                  }                  if(i>hm){                      break;                  }                  Document doc = searcher.doc(scoreDoc.doc);                  list.add(createObj(doc));              }
这个效率不高,正常的做法是创建索引的时候进行排序,之后使用分页方法,不要这样进行2次查询。