3.5、Lucene文件检索实战 —— 索引文档


  1. 先建立model对象
  2. 创建索引

----------

代码仓库:https://gitee.com/carloz/lucene-learn.git

https://gitee.com/carloz/lucene-learn/tree/master/lucene-filesearch

----------

工程搭建完成以后,首先构建索引;

检索的对象:文件;

为了简单起见,只索引两个字段:文档名(title)和文档内容(content);

1、先建立model对象

文件:com/learn/lucenefilesearch/model/FileModel.java
package com.learn.lucenefilesearch.model;

import lombok.*;

/**
 * Plain data holder describing one file.
 *
 * <p>Getters, setters, the no-argument constructor and the all-fields constructor
 * (argument order: {@code title}, {@code content}) are generated by Lombok.
 */
@NoArgsConstructor
@AllArgsConstructor
@Getter
@Setter
public class FileModel {

    /** File title. */
    private String title;

    /** File content. */
    private String content;
}

2、创建索引

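将 IKTokenizer8x 和 IKAnalyzer8x 从第2章的工程里复制过来,放到 com.learn.lucenefilesearch.service 包下(下面的 CreateIndex 没有 import 这两个类,因此它们必须与 CreateIndex 位于同一个包);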

文件:com/learn/lucenefilesearch/service/CreateIndex.java
package com.learn.lucenefilesearch.service;

import com.learn.lucenefilesearch.model.FileModel;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

/**
 * Builds a Lucene index for the files found in the {@code files} directory and writes
 * it to the {@code indexdir} directory. Both paths are relative, so they are resolved
 * against the process working directory.
 *
 * <p>Every regular file becomes one Lucene document with two fields:
 * {@code title} (the file name) and {@code content} (text extracted by Apache Tika).
 */
public class CreateIndex {

    /** Relative path of the directory that receives the index. */
    private static final String INDEX_DIR = "indexdir";

    /** Relative path of the directory whose files are indexed. */
    private static final String FILES_DIR = "files";

    /**
     * Rebuilds the index from scratch ({@code OpenMode.CREATE}) and prints the elapsed time.
     *
     * <p>Exits with status 1 if the index directory is missing or unreadable.
     *
     * @param args unused
     * @throws IOException if the source directory cannot be listed or the index cannot be written
     */
    public static void main(String[] args) throws IOException {
        Path indexPath = Paths.get(INDEX_DIR);
        long start = System.currentTimeMillis();
        if (!Files.isReadable(indexPath)) {
            System.out.println(indexPath.toAbsolutePath() + "不存在或不可读,请检查");
            System.exit(1);
        }

        // One field type shared by both fields: stored, tokenized, indexed with
        // positions and offsets, and with term vectors (positions + offsets) kept.
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        fieldType.setStored(true);
        fieldType.setTokenized(true);
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
        fieldType.setStoreTermVectorOffsets(true);

        // Read the source files before the writer is opened, so a missing source
        // directory fails before the existing index is touched.
        List<FileModel> fileModelList = extractFile();

        // try-with-resources closes writer, directory and analyzer (in that order)
        // even when indexing fails part-way through.
        try (Analyzer analyzer = new IKAnalyzer8x(true);
             Directory directory = FSDirectory.open(indexPath)) {
            IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
            indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            try (IndexWriter indexWriter = new IndexWriter(directory, indexWriterConfig)) {
                for (FileModel f : fileModelList) {
                    Document doc = new Document();
                    doc.add(new Field("title", f.getTitle(), fieldType));
                    doc.add(new Field("content", f.getContent(), fieldType));
                    indexWriter.addDocument(doc);
                }
                indexWriter.commit();
            }
        }

        long elapsed = System.currentTimeMillis() - start;
        System.out.println("索引文档完成,共耗时:" + elapsed + " 毫秒。");
    }

    /**
     * Reads every regular file directly inside the {@code files} directory and converts
     * it to a {@link FileModel} (title = file name, content = extracted text).
     * Sub-directories and other non-regular entries are skipped.
     *
     * @return one model per regular file; empty if the directory contains none
     * @throws IOException if the directory does not exist, is not a directory,
     *                     or cannot be listed
     */
    public static List<FileModel> extractFile() throws IOException {
        File fileDir = new File(FILES_DIR);
        File[] allFiles = fileDir.listFiles();
        if (allFiles == null) {
            // listFiles() returns null instead of throwing when the path is not a
            // listable directory; surface that as an explicit error, not an NPE.
            throw new FileNotFoundException(fileDir.getAbsolutePath() + "不存在或不是目录,请检查");
        }
        List<FileModel> list = new ArrayList<>(allFiles.length);
        for (File f : allFiles) {
            if (!f.isFile()) {
                continue;
            }
            list.add(new FileModel(f.getName(), ParserExtraction(f)));
        }
        return list;
    }

    /**
     * Extracts the text of a file with Tika's auto-detecting parser.
     *
     * <p>Extraction is best-effort: on any read or parse failure the error is printed
     * to standard error and an empty string is returned.
     *
     * @param file the file to parse
     * @return the extracted text, or {@code ""} if extraction failed
     */
    public static String ParserExtraction(File file) {
        // -1 disables the handler's write limit; the default limit makes parse()
        // throw on large documents, which would leave their content empty.
        BodyContentHandler handler = new BodyContentHandler(-1);
        Parser parser = new AutoDetectParser();
        try (FileInputStream inputStream = new FileInputStream(file)) {
            parser.parse(inputStream, handler, new Metadata(), new ParseContext());
            return handler.toString();
        } catch (IOException | SAXException | TikaException e) {
            System.err.println("Failed to extract text from " + file.getAbsolutePath());
            e.printStackTrace();
            return "";
        }
    }
}