使用Lucene4.8进行索引及搜索的基本操作

jopen 11年前

在Lucene对文本进行处理的过程中,可以大致分为两大部分:

1、索引文件:提取文档内容并分析,生成索引

2、搜索内容:搜索索引内容,根据搜索关键字得出搜索结果

 

一、索引文件

基本步骤如下:

1、创建索引库IndexWriter

2、根据文件创建文档Document

 3、向索引库中写入文档内容

    package com.ljh.search.index;

    import java.io.File;
    import java.io.FileReader;
    import java.io.IOException;

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.LongField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    /**
     * Command-line tool that indexes every file under a directory into a Lucene index.
     *
     * <p>Steps:
     * <ol>
     *   <li>Create the index store ({@link IndexWriter}).</li>
     *   <li>Build a {@link Document} for each file.</li>
     *   <li>Write each document into the index.</li>
     * </ol>
     */
    public class IndexFiles {

        /**
         * Parses {@code -index INDEX_PATH} and {@code -docs DOCS_PATH}, then indexes DOCS_PATH.
         *
         * <p>Prints a usage message and exits with status 1 when either path is missing, or
         * when the document directory does not exist or is unreadable. An I/O failure while
         * indexing prints the stack trace and also exits with status 1.
         *
         * @param args command-line arguments
         * @throws IOException declared for interface compatibility; indexing failures are
         *         reported and turned into a non-zero exit status
         */
        public static void main(String[] args) throws IOException {

            String usage = "java IndexFiles"
                    + " -index INDEX_PATH -docs DOCS_PATH \n\n"
                    + "This indexes the documents in DOCS_PATH, creating a Lucene index "
                    + "in INDEX_PATH that can be searched with SearchFiles";

            String indexPath = null;
            String docsPath = null;
            for (int i = 0; i < args.length; i++) {
                // Only consume a value when one actually follows the flag; a trailing
                // flag without a value is ignored and caught by the null checks below.
                if ("-index".equals(args[i]) && i + 1 < args.length) {
                    indexPath = args[++i];
                } else if ("-docs".equals(args[i]) && i + 1 < args.length) {
                    docsPath = args[++i];
                }
            }

            // Both paths are required: new File(null) would throw a NullPointerException.
            if (docsPath == null || indexPath == null) {
                System.err.println("Usage: " + usage);
                System.exit(1);
            }

            final File docDir = new File(docsPath);
            if (!docDir.exists() || !docDir.canRead()) {
                System.out.println("Document directory '"
                        + docDir.getAbsolutePath()
                        + "' does not exist or is not readable, please check the path");
                System.exit(1);
            }

            // 1. Create the index store. try-with-resources closes the writer on every
            // path and never calls close() on a writer that failed to open.
            try (IndexWriter writer = getIndexWriter(indexPath)) {
                index(writer, docDir);
            } catch (IOException e) {
                e.printStackTrace();
                System.exit(1);
            }
        }

        /**
         * Opens an {@link IndexWriter} on the given file-system directory, analyzing text
         * with {@link StandardAnalyzer}.
         *
         * @param indexPath directory that holds (or will hold) the index files
         * @return a writer the caller must close
         * @throws IOException if the directory or the writer cannot be opened
         */
        private static IndexWriter getIndexWriter(String indexPath)
                throws IOException {

            Directory indexDir = FSDirectory.open(new File(indexPath));

            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48,
                    new StandardAnalyzer(Version.LUCENE_48));

            return new IndexWriter(indexDir, iwc);
        }

        /**
         * Indexes {@code file}; directories are walked recursively.
         *
         * <p>Each regular file becomes one document with a stored {@code path} field, an
         * unstored {@code modified} timestamp and a tokenized {@code contents} field.
         *
         * @param writer open index writer that receives the documents
         * @param file file or directory to index
         * @throws IOException if a file cannot be read or the index cannot be written
         */
        private static void index(IndexWriter writer, File file) throws IOException {

            if (file.isDirectory()) {
                String[] files = file.list();
                // list() returns null when the directory cannot be listed.
                if (files != null) {
                    for (int i = 0; i < files.length; i++) {
                        index(writer, new File(file, files[i]));
                    }
                }
                return;
            }

            // 2. Build the document for this file.
            Document doc = new Document();
            doc.add(new StringField("path", file.getPath(), Field.Store.YES));
            doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

            // The reader is consumed during addDocument, so it must stay open until that
            // call returns and be closed afterwards to avoid leaking one handle per file.
            try (FileReader contents = new FileReader(file)) {
                doc.add(new TextField("contents", contents));
                System.out.println("Indexing " + file.getName());

                // 3. Write the document into the index.
                writer.addDocument(doc);
            }
        }
    }
(1)使用“java com.ljh.search.index.IndexFiles -index d:/index -docs d:/tmp”运行程序,索引d:/tmp中的文件,并将索引文件放置到d:/index。

(2)上述生成的索引文件可以使用Luke进行查看。目前Luke已迁移至github进行托管。


二、搜索文件

1、打开索引库IndexSearcher
2、根据关键词进行搜索
3、遍历结果并处理

package com.ljh.search.search;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Command-line tool that looks up a single term in the {@code contents} field of a
 * Lucene index.
 *
 * <p>Steps:
 * <ol>
 *   <li>Open the index ({@link IndexSearcher}).</li>
 *   <li>Search for the keyword.</li>
 *   <li>Iterate over the hits and print them.</li>
 * </ol>
 */
public class Searcher {

    /**
     * Parses {@code -index INDEX_PATH} and {@code -term TERM}, runs the search and prints
     * the number of hits followed by the document id and score of each hit (at most 20).
     *
     * <p>Prints a usage message and exits with status 1 when either argument is missing.
     *
     * @param args command-line arguments
     * @throws IOException if the index cannot be opened or read
     */
    public static void main(String[] args) throws IOException {

        String indexPath = null;
        String term = null;
        for (int i = 0; i < args.length; i++) {
            // Only consume a value when one actually follows the flag.
            if ("-index".equals(args[i]) && i + 1 < args.length) {
                indexPath = args[++i];
            } else if ("-term".equals(args[i]) && i + 1 < args.length) {
                term = args[++i];
            }
        }

        // Both arguments are required: a null path or null term text cannot be searched.
        if (indexPath == null || term == null) {
            System.err.println("Usage: java Searcher -index INDEX_PATH -term TERM");
            System.exit(1);
        }

        System.out.println("Searching " + term + " in " + indexPath);

        // 1. Open the index. try-with-resources releases the reader and the directory
        // even when the search throws.
        try (Directory indexDir = FSDirectory.open(new File(indexPath));
                IndexReader ir = DirectoryReader.open(indexDir)) {
            IndexSearcher searcher = new IndexSearcher(ir);

            // 2. Search for the keyword. TermQuery does not run the term through an
            // analyzer, so it must equal an indexed token exactly.
            // NOTE(review): the index in this article is built with StandardAnalyzer,
            // which lowercases tokens, so a mixed-case term will presumably match nothing.
            TopDocs docs = searcher.search(
                    new TermQuery(new Term("contents", term)), 20);

            // 3. Iterate over the hits and print them.
            ScoreDoc[] hits = docs.scoreDocs;
            System.out.println(hits.length);
            for (ScoreDoc hit : hits) {
                System.out.println("doc: " + hit.doc + " score: " + hit.score);
            }
        }
    }
}
来自:http://blog.csdn.net/jediael_lu/article/details/30035025