3.7、Lucene文件检索实战 —— 文件检索


1、根据索引查找文件

2、搜索结果调试

3、整个文件如下

--------------------------------------------------

代码仓库:https://gitee.com/carloz/lucene-learn.git

https://gitee.com/carloz/lucene-learn/tree/master/lucene-filesearch

--------------------------------------------------

1、根据索引查找文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/**
 * Searches the index for the given keywords across the title and content
 * fields and returns the top N hits, with matched terms wrapped in a red
 * {@code <span>} for display.
 *
 * @param keywords     user-entered search keywords
 * @param indexPathStr path of the index directory on disk
 * @param N            maximum number of results to return
 * @return up to N matching documents with highlighted title/content fragments;
 *         empty (or partial) list if the search fails
 */
public static ArrayList<FileModel> getTopDoc(String keywords, String indexPathStr, int N) {
    ArrayList<FileModel> hitsList = new ArrayList<FileModel>();
    // Fields the multi-field query is parsed against.
    String[] fields = {"title", "content"};
    Path indexPath = Paths.get(indexPathStr);
    // try-with-resources guarantees reader/directory are closed even when
    // parsing or searching throws (the original leaked them on failure).
    try (Directory directory = FSDirectory.open(indexPath);
         IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new IKAnalyzer8x(true);
        MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
        // Parse the user's query string.
        Query query = parser.parse(keywords);
        TopDocs topDocs = searcher.search(query, N);
        // Custom highlight markup: wrap every match in a red <span>.
        SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"color:red;\">", "</span>");
        QueryScorer scoreTitle = new QueryScorer(query, fields[0]);
        Highlighter hlTitle = new Highlighter(htmlFormatter, scoreTitle);
        hlTitle.setTextFragmenter(new SimpleSpanFragmenter(scoreTitle));
        QueryScorer scoreContent = new QueryScorer(query, fields[1]);
        Highlighter hlContent = new Highlighter(htmlFormatter, scoreContent);
        // BUG FIX: the content fragmenter was created but never installed,
        // so content highlighting silently fell back to the default fragmenter.
        hlContent.setTextFragmenter(new SimpleSpanFragmenter(scoreContent));

        for (ScoreDoc sd : topDocs.scoreDocs) {
            Document doc = searcher.doc(sd.doc);
            String title = doc.get("title");
            String content = doc.get("content");

            // Best highlighted fragment of the title (null when nothing matched there).
            TokenStream tokenStream = TokenSources.getAnyTokenStream(
                    searcher.getIndexReader(), sd.doc, fields[0], new IKAnalyzer8x(true));
            String hlTitleStr = hlTitle.getBestFragment(tokenStream, title);

            // Best highlighted fragment of the content (null when nothing matched there).
            tokenStream = TokenSources.getAnyTokenStream(
                    searcher.getIndexReader(), sd.doc, fields[1], new IKAnalyzer8x(true));
            String hlContentStr = hlContent.getBestFragment(tokenStream, content);

            // Fall back to the raw field value when no fragment was produced.
            FileModel fileModel = new FileModel(
                    RegexHtml.delHtmlTag(title),
                    hlTitleStr != null ? hlTitleStr : title,
                    hlContentStr != null ? hlContentStr : content);
            hitsList.add(fileModel);
        }
    } catch (Exception e) {
        // Best-effort endpoint: log the failure (with stack trace) and return
        // whatever was collected, instead of printStackTrace().
        log.error("search failed for keywords={}", keywords, e);
    }
    return hitsList;
}

2、搜索结果调试

定义接口 search-list 用于调试数据

1
2
3
4
5
6
7
8
9
10
11
12
/**
* @param keywords
* @return 搜索数据调试
*/
@RequestMapping("/search-list")
@ResponseBody
public ArrayList<FileModel> searchFileList(String keywords) {
String indexPathStr = "indexdir";
ArrayList<FileModel> hitsList = getTopDoc(keywords, indexPathStr, 100);
log.info("共搜到:" + hitsList.size() + " 条数据!");
return hitsList;
}

http://localhost:18080/search-list?keywords=session

3、整个文件如下

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
package com.learn.lucenefilesearch.controller;

import com.learn.lucenefilesearch.model.FileModel;
import com.learn.lucenefilesearch.service.IKAnalyzer8x;
import com.learn.lucenefilesearch.service.RegexHtml;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.ResponseBody;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;

@Slf4j
@Controller
public class SearchFileController {

    /** Serves the search landing page template. */
    @RequestMapping("/")
    public String index() {
        return "index";
    }

    /**
     * Debug endpoint that returns raw search hits as JSON.
     *
     * @param keywords user-entered search keywords
     * @return up to 100 matching documents with highlighted fragments
     */
    @RequestMapping("/search-list")
    @ResponseBody
    public ArrayList<FileModel> searchFileList(String keywords) {
        String indexPathStr = "indexdir";
        ArrayList<FileModel> hitsList = getTopDoc(keywords, indexPathStr, 100);
        // Parameterized SLF4J logging instead of string concatenation.
        log.info("共搜到:{} 条数据!", hitsList.size());
        return hitsList;
    }

    /**
     * Searches the index for the given keywords across the title and content
     * fields and returns the top N hits, with matched terms wrapped in a red
     * {@code <span>} for display.
     *
     * @param keywords     user-entered search keywords
     * @param indexPathStr path of the index directory on disk
     * @param N            maximum number of results to return
     * @return up to N matching documents with highlighted title/content
     *         fragments; empty (or partial) list if the search fails
     */
    public static ArrayList<FileModel> getTopDoc(String keywords, String indexPathStr, int N) {
        ArrayList<FileModel> hitsList = new ArrayList<FileModel>();
        // Fields the multi-field query is parsed against.
        String[] fields = {"title", "content"};
        Path indexPath = Paths.get(indexPathStr);
        // try-with-resources guarantees reader/directory are closed even when
        // parsing or searching throws (the original leaked them on failure).
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Analyzer analyzer = new IKAnalyzer8x(true);
            MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
            // Parse the user's query string.
            Query query = parser.parse(keywords);
            TopDocs topDocs = searcher.search(query, N);
            // Custom highlight markup: wrap every match in a red <span>.
            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<span style=\"color:red;\">", "</span>");
            QueryScorer scoreTitle = new QueryScorer(query, fields[0]);
            Highlighter hlTitle = new Highlighter(htmlFormatter, scoreTitle);
            hlTitle.setTextFragmenter(new SimpleSpanFragmenter(scoreTitle));
            QueryScorer scoreContent = new QueryScorer(query, fields[1]);
            Highlighter hlContent = new Highlighter(htmlFormatter, scoreContent);
            // BUG FIX: the content fragmenter was created but never installed,
            // so content highlighting silently used the default fragmenter.
            hlContent.setTextFragmenter(new SimpleSpanFragmenter(scoreContent));

            for (ScoreDoc sd : topDocs.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                String title = doc.get("title");
                String content = doc.get("content");

                // Best highlighted title fragment (null when nothing matched there).
                TokenStream tokenStream = TokenSources.getAnyTokenStream(
                        searcher.getIndexReader(), sd.doc, fields[0], new IKAnalyzer8x(true));
                String hlTitleStr = hlTitle.getBestFragment(tokenStream, title);

                // Best highlighted content fragment (null when nothing matched there).
                tokenStream = TokenSources.getAnyTokenStream(
                        searcher.getIndexReader(), sd.doc, fields[1], new IKAnalyzer8x(true));
                String hlContentStr = hlContent.getBestFragment(tokenStream, content);

                // Fall back to the raw field value when no fragment was produced.
                FileModel fileModel = new FileModel(
                        RegexHtml.delHtmlTag(title),
                        hlTitleStr != null ? hlTitleStr : title,
                        hlContentStr != null ? hlContentStr : content);
                hitsList.add(fileModel);
            }
        } catch (Exception e) {
            // Best-effort endpoint: log the failure (with stack trace) and
            // return whatever was collected, instead of printStackTrace().
            log.error("search failed for keywords={}", keywords, e);
        }
        return hitsList;
    }
}