2.5、Lucene查询详解



目录

2.5.1、搜索入门

2.5.2、多域搜索(MultiFieldQueryParser)

2.5.3、词项搜索(TermQuery)

2.5.4、布尔搜索(BooleanQuery)

2.5.5、范围搜索(RangeQuery)

2.5.6、前缀搜索(PrefixQuery)

2.5.7、多关键字搜索(PhraseQuery)

2.5.8、模糊搜索(FuzzyQuery)

2.5.9、通配符搜索(WildcardQuery)

-—————————————————

文档索引完成以后就能对其进行搜索;

当用户输入一个关键字,

​ –> 首先 对这个关键字 进行 分析和处理, 转化成后台可以理解的形式

​ –> 进行检索

2.5.1、搜索入门

处理关键词 <==> 构建Query对象的过程;

搜索文档 <==> 实例化 IndexSearcher 对象,使用search()方法完成;

​ 参数:Query对象

​ 结果:保存在 TopDocs 类型的文档集合中;

删除indexdir下的索引文件后,重新使用CreateIndex.java 生成索引

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
package com.learn.lucene.chapter2.queries;

import com.learn.lucene.chapter2.ik.IKAnalyzer8x;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * 2.5.1 Basic search: builds a {@link Query} from a keyword string with {@link QueryParser}
 * and prints the top hits found in the on-disk index.
 */
public class QueryParseTest {
    /**
     * Opens the index stored in {@code indexdir}, parses the query text against the
     * {@code title} field (AND as the default operator) and prints up to 10 hits.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if the query text cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        String field = "title";
        Path indexPath = Paths.get("indexdir");
        // try-with-resources closes in reverse order (analyzer, reader, directory) and
        // also releases everything when parsing or searching throws.
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory);
             Analyzer analyzer = new IKAnalyzer8x()) {
            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(field, analyzer);
            // Every term produced by the analyzer must match.
            parser.setDefaultOperator(QueryParser.Operator.AND);
            Query query = parser.parse("农村学生");
            System.out.println("Query: " + query); // the parsed query
            // Fetch the first 10 hits.
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("DocID: " + scoreDoc.doc);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title: " + doc.get("title"));
                System.out.println("文档评分: " + scoreDoc.score);
            }
        }
    }

}

运行结果:

修改后再运行:

2.5.2、多域搜索(MultiFieldQueryParser)

根据多个字段搜索

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
package com.learn.lucene.chapter2.queries;

import com.learn.lucene.chapter2.ik.IKAnalyzer8x;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * 2.5.2 Multi-field search: parses one keyword string against several fields at once
 * with {@link MultiFieldQueryParser}.
 */
public class MultiFieldQueryParseTest {
    /**
     * Opens the index stored in {@code indexdir}, parses the query text against the
     * {@code title} and {@code content} fields (AND as the default operator) and prints
     * up to 10 hits.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if the query text cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        String[] field = {"title", "content"};
        Path indexPath = Paths.get("indexdir");
        // try-with-resources closes in reverse order (analyzer, reader, directory) and
        // also releases everything when parsing or searching throws.
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory);
             Analyzer analyzer = new IKAnalyzer8x(true)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            MultiFieldQueryParser parser = new MultiFieldQueryParser(field, analyzer);
            parser.setDefaultOperator(QueryParser.Operator.AND);
            Query query = parser.parse("美国");
            System.out.println("Query: " + query); // the parsed query
            // Fetch the first 10 hits.
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("DocID: " + scoreDoc.doc);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title: " + doc.get("title"));
                System.out.println("content: " + doc.get("content"));
                System.out.println("文档评分: " + scoreDoc.score);
            }
        }
    }
}

运行结果:

2.5.3、词项搜索(TermQuery)

TermQuery 是 最常用的 Query

TermQuery 是 Lucene中搜索的最基本单位

本质上:一个词条就是一个 key/value 对

使用TermQuery:

  • 首先构造一个 Term 对象:Term term = new Term("title", "美国");
  • 然后以该 Term 对象为参数,构造一个 TermQuery 对象:TermQuery query = new TermQuery(term);
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
package com.learn.lucene.chapter2.queries;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * 2.5.3 Term search: looks up a single exact term with {@link TermQuery}.
 */
public class TermQueryTest {
    /**
     * Opens the index stored in {@code indexdir}, searches the {@code title} field for the
     * exact term and prints up to 10 hits.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException not thrown by this method; kept so the signature is unchanged
     */
    public static void main(String[] args) throws IOException, ParseException {
        Path indexPath = Paths.get("indexdir");
        // try-with-resources closes the reader before the directory it reads from, and
        // also releases both when the search throws.
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // A TermQuery is not analyzed: the text is matched against indexed terms as-is.
            TermQuery query = new TermQuery(new Term("title", "美国"));
            System.out.println("Query: " + query); // the query being run
            // Fetch the first 10 hits.
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("DocID: " + scoreDoc.doc);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title: " + doc.get("title"));
                System.out.println("文档评分: " + scoreDoc.score);
            }
        }
    }
}

运行结果:

2.5.4、布尔搜索(BooleanQuery)

BooleanQuery 可以 组合 其他 Query,并标明他们的逻辑关系;

例如:查询 content 中包含美国,并且 title 不包含美国的文档;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
package com.learn.lucene.chapter2.queries;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * 2.5.4 Boolean search: combines other queries with {@link BooleanQuery}.
 * Finds documents whose content contains the term while their title does not.
 */
public class BooleanQueryTest {
    /**
     * Opens the index stored in {@code indexdir}, runs a boolean query
     * (content MUST match, title MUST_NOT match) and prints up to 10 hits.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException not thrown by this method; kept so the signature is unchanged
     */
    public static void main(String[] args) throws IOException, ParseException {
        Path indexPath = Paths.get("indexdir");
        // try-with-resources closes the reader before the directory it reads from, and
        // also releases both when the search throws.
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            TermQuery titleQuery = new TermQuery(new Term("title", "美国"));
            TermQuery contentQuery = new TermQuery(new Term("content", "美国"));
            // Clause order is kept as in the original so the printed query is identical.
            BooleanClause excludeTitle = new BooleanClause(titleQuery, BooleanClause.Occur.MUST_NOT);
            BooleanClause requireContent = new BooleanClause(contentQuery, BooleanClause.Occur.MUST);
            BooleanQuery query = new BooleanQuery.Builder()
                    .add(excludeTitle)
                    .add(requireContent)
                    .build();

            System.out.println("Query: " + query); // the query being run
            // Fetch the first 10 hits.
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("DocID: " + scoreDoc.doc);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title: " + doc.get("title"));
                System.out.println("content: " + doc.get("content"));
                System.out.println("文档评分: " + scoreDoc.score);
            }
        }
    }

}

运行结果:

2.5.5、范围搜索(RangeQuery)

举例:查询新闻回复条数在 500~1000 之间的文档

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
package com.learn.lucene.chapter2.queries;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * 2.5.5 Range search: matches documents whose numeric field falls within a range.
 * Example: news items whose reply count is between 500 and 1000.
 */
public class RangeQueryTest {
    /**
     * Opens the index stored in {@code indexdir}, runs an {@link IntPoint} range query on
     * the {@code reply} field with bounds 500 and 1000 and prints up to 10 hits.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException not thrown by this method; kept so the signature is unchanged
     */
    public static void main(String[] args) throws IOException, ParseException {
        Path indexPath = Paths.get("indexdir");
        // try-with-resources closes the reader before the directory it reads from, and
        // also releases both when the search throws.
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = IntPoint.newRangeQuery("reply", 500, 1000);
            System.out.println("Query: " + query); // the query being run
            // Fetch the first 10 hits.
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("DocID: " + scoreDoc.doc);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title: " + doc.get("title"));
                System.out.println("content: " + doc.get("content"));
                // The displayed value is read from the separate "reply_display" field.
                System.out.println("reply: " + doc.get("reply_display"));
                System.out.println("文档评分: " + scoreDoc.score);
            }
        }
    }
}

2.5.6、前缀搜索(PrefixQuery)

举例:搜索 包含以“学”开头的词项 的文档

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
package com.learn.lucene.chapter2.queries;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * 2.5.6 Prefix search: matches documents containing a term that starts with a given prefix.
 * Example: documents whose title has a term beginning with "学".
 */
public class PrefixQueryTest {
    /**
     * Opens the index stored in {@code indexdir}, runs a {@link PrefixQuery} on the
     * {@code title} field and prints up to 10 hits.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException not thrown by this method; kept so the signature is unchanged
     */
    public static void main(String[] args) throws IOException, ParseException {
        Path indexPath = Paths.get("indexdir");
        // try-with-resources closes the reader before the directory it reads from, and
        // also releases both when the search throws.
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new PrefixQuery(new Term("title", "学"));
            System.out.println("Query: " + query); // the query being run
            // Fetch the first 10 hits.
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("DocID: " + scoreDoc.doc);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title: " + doc.get("title"));
                System.out.println("文档评分: " + scoreDoc.score);
            }
        }
    }
}

运行结果:

2.5.7、多关键字搜索(PhraseQuery)

  • PhraseQuery 可以 通过add方法添加多个关键字
  • 还可以通过 setSlop() 设定“坡度”(slop),即匹配时关键字之间最多允许间隔的无关词数量
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
package com.learn.lucene.chapter2.queries;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * 2.5.7 Multi-keyword search: {@link PhraseQuery} matches several terms at given
 * relative positions. Terms are added through {@code PhraseQuery.Builder.add}; the
 * builder's {@code setSlop} can additionally allow unrelated terms between them.
 */
public class PhraseQueryTest {
    /**
     * Opens the index stored in {@code indexdir}, runs a two-term phrase query on the
     * {@code title} field and prints up to 10 hits.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException not thrown by this method; kept so the signature is unchanged
     */
    public static void main(String[] args) throws IOException, ParseException {
        String str = "习近平会见奥巴马,学习国外经验";

        Path indexPath = Paths.get("indexdir");
        // try-with-resources closes the reader before the directory it reads from, and
        // also releases both when the search throws.
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            PhraseQuery.Builder builder = new PhraseQuery.Builder();
            // NOTE(review): the second argument of add() is a term position, but
            // str.indexOf() yields a character offset; the second term is also unlikely
            // to exist as a single indexed token. This probably explains why the query
            // finds nothing - confirm against the analyzer used at index time before
            // changing the terms/positions.
            builder.add(new Term("title", "奥巴马"), str.indexOf("奥巴马"));
            builder.add(new Term("title", "学习国外经验"), str.indexOf("学习国外经验"));
            PhraseQuery query = builder.build();
            System.out.println("Query: " + query); // the query being run
            // Fetch the first 10 hits.
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("DocID: " + scoreDoc.doc);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title: " + doc.get("title"));
                System.out.println("content: " + doc.get("content"));
                System.out.println("reply: " + doc.get("reply_display"));
                System.out.println("文档评分: " + scoreDoc.score);
            }
        }
    }
}

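运行结果(没有命中任何文档。可能的原因:PhraseQuery.Builder.add(term, position) 中的 position 是词项在分词结果中的位置,而不是 str.indexOf() 返回的字符偏移;另外“学习国外经验”经分词后很可能不是单个词项):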

2.5.8、模糊搜索(FuzzyQuery)

它可以简单的识别两个相近的词语。

举例:“Trump”,写成“Trmp”,拼写错误,仍然可以搜索得到正确的结果

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
package com.learn.lucene.chapter2.queries;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * 2.5.8 Fuzzy search: {@link FuzzyQuery} matches terms that are close to the given one.
 * Example: the misspelling "Trmp" is used here to look for "Trump".
 */
public class FuzzyQueryTest {
    /**
     * Opens the index stored in {@code indexdir}, runs a fuzzy query on the
     * {@code title} field and prints up to 10 hits.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException not thrown by this method; kept so the signature is unchanged
     */
    public static void main(String[] args) throws IOException, ParseException {
        Path indexPath = Paths.get("indexdir");
        // try-with-resources closes the reader before the directory it reads from, and
        // also releases both when the search throws.
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new FuzzyQuery(new Term("title", "Trmp"));
            System.out.println("Query: " + query); // the query being run
            // Fetch the first 10 hits.
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("DocID: " + scoreDoc.doc);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title: " + doc.get("title"));
                System.out.println("content: " + doc.get("content"));
                System.out.println("reply: " + doc.get("reply_display"));
                System.out.println("文档评分: " + scoreDoc.score);
            }
        }
    }
}

2.5.9、通配符搜索(WildcardQuery)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
package com.learn.lucene.chapter2.queries;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * 2.5.9 Wildcard search: {@link WildcardQuery} matches terms against a pattern
 * containing wildcard characters.
 */
public class WildcardQueryTest {
    /**
     * Opens the index stored in {@code indexdir}, runs a wildcard query on the
     * {@code title} field and prints up to 10 hits.
     *
     * @param args unused
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException not thrown by this method; kept so the signature is unchanged
     */
    public static void main(String[] args) throws IOException, ParseException {
        Path indexPath = Paths.get("indexdir");
        // try-with-resources closes the reader before the directory it reads from, and
        // also releases both when the search throws.
        try (Directory directory = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query query = new WildcardQuery(new Term("title", "习?平"));
            System.out.println("Query: " + query); // the query being run
            // Fetch the first 10 hits.
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);
                System.out.println("DocID: " + scoreDoc.doc);
                System.out.println("id: " + doc.get("id"));
                System.out.println("title: " + doc.get("title"));
                System.out.println("content: " + doc.get("content"));
                System.out.println("reply: " + doc.get("reply_display"));
                System.out.println("文档评分: " + scoreDoc.score);
            }
        }
    }
}