lucene索引_创建_域选项和lucene索引_的删除、更新.

jopen 9年前

    package com.dhb.index;

    import java.io.File;
    import java.io.IOException;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.CorruptIndexException;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.index.IndexReader.FieldOption;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.store.LockObtainFailedException;
    import org.apache.lucene.util.Version;
    import org.junit.Before;
    import org.junit.Test;

    /**
     * JUnit-driven walkthrough of basic Lucene 3.5 index maintenance: building an index with
     * different field options, then deleting, un-deleting, merging and updating documents.
     *
     * <p>Every test method reports I/O failures with {@code printStackTrace()} instead of
     * propagating them, so a failed operation shows up on the console rather than as a test failure.
     */
    public class IndexUtil {
        // Sample data: element i of every array belongs to the same document.
        private String[] ids = {"1","2","3","4","5","6"};
        private String[] emails = {"aa@csdn.org","bb@csdn.org","cc@sina.org","dd@sina.org",
                "ee@qq.com","ff@qq.com"};
        private String[] contents = {"Welcome to my office","hello boys","hello girls",
                "I like football","I like basketball","bye-bye see you"};
        // Not referenced by any method in this class.
        private int[] attachment ={2,3,1,4,5,5};
        private String[] names = {"Victor","Nancy","Kitty","Cindy","Tom","Tony"};

        /** Index directory shared by all tests; stays {@code null} if opening it failed. */
        private Directory directory = null;

        /** Opens the on-disk index directory before each test. */
        @Before
        public void IndexUtilBefore() {
            try {
                directory = FSDirectory.open(new File("D:/luceneData/index02"));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        /**
         * Rebuilds the index from the sample arrays, one document per array position.
         */
        @Test
        public void index() {
            IndexWriter writer = null;
            try {
                writer = openWriter();
                // Clear everything that is currently in the index.
                writer.deleteAll();
                for (int i = 0; i < ids.length; i++) {
                    /*
                     * Field.Store.YES / NO (stored-field option)
                     * 1. YES stores the field content in full in the index files, so the
                     *    original text can be restored.
                     * 2. NO does not store the content; the field can still be indexed, but
                     *    the text cannot be restored (doc.get).
                     */
                    /*
                     * Field.Index.* (index option)
                     * ANALYZED:              tokenized and indexed; suited to titles and body text.
                     * NOT_ANALYZED:          indexed but not tokenized (ID-card numbers, names,
                     *                        IDs, ...); suited to exact-match search.
                     * ANALYZED_NO_NORMS:     tokenized, but no norms are stored; norms hold the
                     *                        index-time boost and length-normalization factors.
                     * NOT_ANALYZED_NO_NORMS: neither tokenized nor are norms stored.
                     * NO:                    not indexed at all.
                     */
                    /*
                     * Index option            Store  Typical use
                     * NOT_ANALYZED_NO_NORMS   YES    identifiers (primary key, file name), phone
                     *                                numbers, ID-card numbers, names, dates
                     * ANALYZED                YES    document title and summary
                     * ANALYZED                NO     document body
                     * NO                      YES    document type, database primary key (not indexed)
                     * NOT_ANALYZED            NO     hidden keywords
                     */
                    Document doc = new Document();
                    doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
                    doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
                    doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
                    doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
                    writer.addDocument(doc);
                }
            } catch (IOException e) {
                // Also covers CorruptIndexException and LockObtainFailedException (IOException subclasses).
                e.printStackTrace();
            } finally {
                closeWriter(writer);
            }
        }

        /**
         * Prints the live, maximum and deleted document counts of the index.
         */
        @Test
        public void query() {
            IndexReader reader = null;
            try {
                reader = IndexReader.open(directory);
                // The reader gives cheap access to the document counts ...
                System.out.println("numDocs:"+reader.numDocs());
                System.out.println("maxDocs:"+reader.maxDoc());
                // ... including the number of deleted documents.
                System.out.println("numDeletedDocs:"+reader.numDeletedDocs());
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Close in finally so the reader is released even if a call above throws.
                closeReader(reader);
            }
        }

        /**
         * Deletes the document whose {@code id} is "1".
         */
        @Test
        public void delete() {
            IndexWriter writer = null;
            try {
                writer = openWriter();
                // The argument may be a Query or a Term; a Term is an exact-match value.
                // The document is not physically removed at this point: it sits in a
                // "recycle bin" and can still be restored.
                writer.deleteDocuments(new Term("id", "1"));
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                closeWriter(writer);
            }
        }

        /**
         * Forces deleted documents to be purged.
         *
         * <p>Before Lucene 3.5 this was done with {@code optimize()}, which is expensive and
         * has been deprecated.
         */
        @Test
        public void forceDelete() {
            // Merge away deletions, i.e. empty the "recycle bin".
            IndexWriter writer = null;
            try {
                writer = openWriter();
                writer.forceMergeDeletes();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                closeWriter(writer);
            }
        }

        /**
         * Restores all documents that have been deleted but not yet purged.
         */
        @Test
        public void unDelete() {
            // Restoring is done through an IndexReader.
            IndexReader reader = null;
            try {
                // The reader must be opened with readOnly = false to be allowed to restore.
                reader = IndexReader.open(directory, false);
                reader.undeleteAll();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Close in finally so the reader is released even if undeleteAll() throws.
                closeReader(reader);
            }
        }

        /**
         * Optimizes the index by merging its segments.
         */
        @Test
        public void merge() {
            IndexWriter writer = null;
            try {
                writer = openWriter();
                // The argument is maxNumSegments, the maximum number of segments.
                // The index is merged down to 2 segments, and deleted data in them is purged.
                // Note: from Lucene 3.5 on this is discouraged because it is very costly;
                // Lucene merges automatically as needed.
                writer.forceMerge(2);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                closeWriter(writer);
            }
        }

        /**
         * Replaces the document whose {@code id} is "1" with a new document whose id is "11".
         */
        @Test
        public void update() {
            IndexWriter writer = null;
            try {
                writer = openWriter();
                /*
                 * Lucene has no in-place update; updateDocument is the combination of two
                 * operations: delete first, then add.
                 */
                Document doc = new Document();
                doc.add(new Field("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
                doc.add(new Field("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));
                doc.add(new Field("content",contents[0],Field.Store.NO,Field.Index.ANALYZED));
                doc.add(new Field("name",names[0],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
                writer.updateDocument(new Term("id", "1"), doc);
                /*
                 * Counts printed by query(), as recorded by the author:
                 *   before the update: numDocs:6  maxDocs:6  numDeletedDocs:0
                 *   after the update:  numDocs:6  maxDocs:7  numDeletedDocs:1
                 */
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                closeWriter(writer);
            }
        }

        /** Opens a writer on {@link #directory} with a Lucene 3.5 StandardAnalyzer configuration. */
        private IndexWriter openWriter() throws IOException {
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35,
                    new StandardAnalyzer(Version.LUCENE_35));
            return new IndexWriter(directory, config);
        }

        /** Closes the writer if it was opened, printing (not propagating) any close failure. */
        private static void closeWriter(IndexWriter writer) {
            if (writer == null) {
                return;
            }
            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        /** Closes the reader if it was opened, printing (not propagating) any close failure. */
        private static void closeReader(IndexReader reader) {
            if (reader == null) {
                return;
            }
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

  补充:lucene索引_加权操作,修改代码如下:
    private String[] contents = {"Welcome to my office ,I like surfing internet.",                                          "hello boys like haha",                                          "hello girls we like each other.",                                          "I like football,you like too.",                                          "I like basketball very much, how about you?",                                          "bye-bye see you I don't like."};        private Map<String, Float> scores = new HashMap<String, Float>();        public void IndexUtilBefore() {                try {                    scores.put("qq.com", 2.0f);                    scores.put("sina.org", 1.5f);                    directory = FSDirectory.open(new File("D:/luceneData/index02"));                } catch (IOException e) {                    e.printStackTrace();                }            }            public void index() {                IndexWriter writer = null;                IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35,                         new StandardAnalyzer(Version.LUCENE_35));                try {                    writer = new IndexWriter(directory, iwc);                    //清空所有索引                    writer.deleteAll();                    Document doc = null;                    for (int i = 0;i < ids.length; i++) {                        /**                        * Field.Store.YES或者NO(存储域选项)                        * 1、设置为YES表示把这个域中的内容完全存储到文件中,方便进行文本的还原                        * 2、设置为NO表示把这个域中的内容不存储到文件中,但是可以被索引,此时内容无法还原(doc.get)                        */                        /**                        * 使用Field.Index.*来进行操作                        * Index.ANALYZED:进行分词和索引,适用于标题和内容                        * Index.NOT_ANALYZED:进行索引,但不进行分词,如身份证号码,姓名,ID等,适用于精确搜索                        * Index.ANALYZED_NO_NORMS进行分词但是不存储norms信息,这个norms中包含了创建索引的时间和权值等信息                        * Index.NOT_ANALYZED_NO_NORMS即不进行分词也不存储norms信息                        * 
Index.NO不进行索引                        */                        /**                        * NOT_ANALYZED_NO_NORMS         YES    标示符(主键,文件名),电话号码,身份证号,姓名,日期                        * ANALYZED                      YES    文档标题和摘要                        * ANALYZED                      NO     文档正文                        * NO                            YES    文档类型,数据库主键(不进行索引)                        * NOT_ANALYZED                  NO     隐藏关键字                        */                        doc = new Document();                        doc.add(new Field("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));                        doc.add(new Field("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));                        doc.add(new Field("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));                        doc.add(new Field("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));                        /**                        * 加权操作                        */                        String et = emails[i].substring(emails[i].indexOf("@")+1);                        System.out.println(et);                        if(scores.containsKey(et)) {                            doc.setBoost(scores.get(et));                        } else {                            doc.setBoost(0.5f);                        }                                                writer.addDocument(doc);                                            }                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (LockObtainFailedException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                } finally {                    if(writer!=null)                        try {                            writer.close();                        } catch (CorruptIndexException e) {                            e.printStackTrace();                
        } catch (IOException e) {                            e.printStackTrace();                        }                }            }                @Test            public void search() {                try {                    IndexReader reader = IndexReader.open(directory);                    IndexSearcher searcher = new IndexSearcher(reader);                    TermQuery query = new TermQuery(new Term("content", "like"));                    TopDocs tds = searcher.search(query, 10);                    for(ScoreDoc sd : tds.scoreDocs) {                        Document d = searcher.doc(sd.doc);                        System.out.println("("+sd.doc+") "+d.get("name")+"["+d.get("email")+"] "+d.get("id"));                    }                } catch (CorruptIndexException e) {                    e.printStackTrace();                } catch (IOException e) {                    e.printStackTrace();                }                            }  

输出结果如下:
(5) Tony[ff@qq.com] 6
(3) Cindy[dd@sina.org] 4
(4) Tom[ee@qq.com] 5
(2) Kitty[cc@sina.org] 3
(1) Nancy[bb@csdn.org] 2
(0) Victor[aa@csdn.org] 1