2.3、Lucene Tokenization in Detail


Contents

2.3.1、The Lucene Tokenization System

2.3.2、Tokenization Tests

2.3.3、Configuring the IK Analyzer

2.3.4、Comparing Chinese Analyzers

2.3.5、Extending the Stop-Word Dictionary

2.3.6、Extending a Custom Dictionary

——————————————————————

2.3.1、The Lucene Tokenization System

Both indexing and querying operate on terms as their basic unit.

In Lucene, tokenization is performed mainly by the Analyzer class.

Analyzer is an abstract class; internally it relies on a TokenStream to do the actual work.
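A concrete Analyzer subclass implements createComponents(), which wires a Tokenizer (and optional TokenFilters) into the TokenStream that callers consume. The following minimal sketch (class name and package are illustrative, not part of the original code) shows the pattern using Lucene's WhitespaceTokenizer and LowerCaseFilter:

package com.learn.lucene.chapter2.analyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

/**
* Minimal custom Analyzer: split on whitespace, then lowercase every token.
* (Illustrative sketch, not from the original text.)
*/
public class SimpleCustomAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new WhitespaceTokenizer();        // produces the raw tokens
        TokenStream filtered = new LowerCaseFilter(source);  // post-processes each token
        return new TokenStreamComponents(source, filtered);
    }
}

The ready-made analyzers tested in the next section follow the same structure, just with more sophisticated tokenizers and filter chains.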

2.3.2、Tokenization Tests

Testing the StandardAnalyzer:

package com.learn.lucene.chapter2.analyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.io.StringReader;

/**
* StandardAnalyzer demo
*/
public class StdAnalyzerDemo {
private static String strCh = "中华人民共和国简称中国,是一个有13亿人口的国家";
private static String strEn = "Dogs can not achieve a place, eyes can reach;";
public static void main(String[] args) throws IOException {
System.out.println("StandardAnalyzer 对中文分词:");
stdAnalyzer(strCh);
System.out.println("StandardAnalyzer 对英文分词:");
stdAnalyzer(strEn);
}

public static void stdAnalyzer(String str) throws IOException {
Analyzer analyzer = new StandardAnalyzer();
StringReader reader = new StringReader(str);
// the first argument of tokenStream() is a field name, not the text being analyzed
TokenStream tokenStream = analyzer.tokenStream("content", reader);
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
System.out.println("分词结果:");
while (tokenStream.incrementToken()) {
System.out.print(charTermAttribute.toString() + "|");
}
System.out.println("\n");
tokenStream.end();   // finalize the stream state
tokenStream.close(); // release resources held by the stream
analyzer.close();
}
}

Run output: StandardAnalyzer splits the Chinese sentence into individual characters (keeping "13" as one numeric token), while the English sentence is tokenized on word boundaries and lowercased.

Testing several Analyzer implementations (note: make sure to run this with JDK 8):

package com.learn.lucene.chapter2.analyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.io.StringReader;
import java.util.StringJoiner;

/**
* Test several analyzers
*/
public class VariousAnalyzersDemo {

private static String strCh = "中华人民共和国简称中国,是一个有13亿人口的国家";
private static String strEn = "Dogs can not achieve a place, eyes can reach;";

public static void main(String[] args) throws IOException {
System.out.println("标准分词:" + printAnalyzer(new StandardAnalyzer(), strCh));
System.out.println("空格分词:" + printAnalyzer(new WhitespaceAnalyzer(), strCh));
System.out.println("简单分词:" + printAnalyzer(new SimpleAnalyzer(), strCh));
System.out.println("二分法分词:" + printAnalyzer(new CJKAnalyzer(), strCh));
System.out.println("关键字分词:" + printAnalyzer(new KeywordAnalyzer(), strCh));
System.out.println("停用词分词:" + printAnalyzer(new StopAnalyzer(new StringReader(strCh)), strCh));
System.out.println("中文智能分词:" + printAnalyzer(new SmartChineseAnalyzer(), strCh));
}

public static String printAnalyzer(Analyzer analyzer, String str) throws IOException {
StringReader reader = new StringReader(str);
TokenStream tokenStream = analyzer.tokenStream("content", reader); // field name, not the text
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
StringJoiner stringJoiner = new StringJoiner("|");
while (tokenStream.incrementToken()) {
stringJoiner.add(charTermAttribute.toString());
}
tokenStream.end();
tokenStream.close();
analyzer.close();
return stringJoiner.toString();
}
}

Run output: each analyzer segments the same sentence differently; for example, StandardAnalyzer emits one token per Chinese character, CJKAnalyzer emits overlapping two-character tokens, KeywordAnalyzer keeps the whole input as a single token, and SmartChineseAnalyzer produces word-level tokens.

2.3.3、Configuring the IK Analyzer

To use the IK analyzer with Lucene 8.0, the stock IKTokenizer and IKAnalyzer classes must be modified.
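The org.wltea classes used below come from the IK Analyzer library, which is not bundled with Lucene. One coordinate commonly seen on Maven Central is sketched here (treat it as an assumption and verify against your repository):

<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
    <version>2012_u6</version>
</dependency>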

Create IKTokenizer8x.java and IKAnalyzer8x.java under com.learn.lucene.chapter2.ik:

package com.learn.lucene.chapter2.ik;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

import java.io.IOException;

/**
* IKTokenizer adapted for Lucene 8.x: reimplements incrementToken()
*/
public class IKTokenizer8x extends Tokenizer {
// the underlying IK segmenter
private IKSegmenter _IKIkSegmenter;
// token text attribute
private final CharTermAttribute termAttribute;
// token offset attribute
private final OffsetAttribute offsetAttribute;
// token type attribute
// (see the type constants in org.wltea.analyzer.core.Lexeme)
private final TypeAttribute typeAttribute;
// end position of the last token, used in end()
private int endPosition;

// Constructor for the Lucene 8.x Tokenizer adapter
public IKTokenizer8x(boolean useSmart) {
super();
offsetAttribute = addAttribute(OffsetAttribute.class);
termAttribute = addAttribute(CharTermAttribute.class);
typeAttribute = addAttribute(TypeAttribute.class);
_IKIkSegmenter = new IKSegmenter(input, useSmart);
}

@Override
public boolean incrementToken() throws IOException {
clearAttributes(); // clear all token attributes
Lexeme nextLexeme = _IKIkSegmenter.next();
if (nextLexeme != null) {
// copy the Lexeme into the token attributes
termAttribute.append(nextLexeme.getLexemeText()); // token text
termAttribute.setLength(nextLexeme.getLength()); // token length
offsetAttribute.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); // start/end offsets
endPosition = nextLexeme.getEndPosition(); // remember the end position of the last token
typeAttribute.setType(nextLexeme.getLexemeTypeString()); // token type
return true; // true: a token was produced, more may follow
}
return false;
}

@Override
public void reset() throws IOException {
super.reset();
_IKIkSegmenter.reset(input);
}

@Override
public final void end() throws IOException {
super.end(); // required by the TokenStream contract before adjusting final offsets
int finalOffset = correctOffset(this.endPosition);
offsetAttribute.setOffset(finalOffset, finalOffset);
}
}

package com.learn.lucene.chapter2.ik;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

/**
* IKAnalyzer adapted for Lucene 8.x: overrides createComponents(String fieldName)
*/
public class IKAnalyzer8x extends Analyzer {

private boolean useSmart;

public IKAnalyzer8x() {
this(false);
}

public IKAnalyzer8x(boolean useSmart) {
super();
this.useSmart = useSmart;
}

@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer _IKTokenizer = new IKTokenizer8x(this.isUseSmart());
return new TokenStreamComponents(_IKTokenizer);
}

public boolean isUseSmart() {
return useSmart;
}

public void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
}

Once IKAnalyzer8x is instantiated, the IK analyzer can be used directly.

1. The default constructor uses the fine-grained segmentation algorithm:

Analyzer analyzer = new IKAnalyzer8x();

2. Passing true creates an IKAnalyzer that uses the smart segmentation algorithm:

Analyzer analyzer = new IKAnalyzer8x(true);

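For a quick side-by-side check of the two modes, here is a small sketch in the style of the earlier demos (the class name and sample sentence are illustrative assumptions):

package com.learn.lucene.chapter2.ik;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.io.StringReader;
import java.util.StringJoiner;

/**
* Quick smoke test: fine-grained vs. smart IK segmentation.
*/
public class IKModesDemo {
    public static void main(String[] args) throws IOException {
        String text = "中华人民共和国简称中国";
        System.out.println("Fine-grained: " + tokenize(new IKAnalyzer8x(), text));
        System.out.println("Smart:        " + tokenize(new IKAnalyzer8x(true), text));
    }

    private static String tokenize(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("content", new StringReader(text));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        StringJoiner joiner = new StringJoiner("|");
        while (ts.incrementToken()) {
            joiner.add(term.toString());
        }
        ts.end();
        ts.close();
        analyzer.close();
        return joiner.toString();
    }
}

Fine-grained mode emits every candidate word it can find (with overlaps), while smart mode chooses a single, non-overlapping segmentation.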
2.3.4、Comparing Chinese Analyzers

Segmentation quality directly affects the accuracy of document search.

Below we compare the output of Lucene's built-in SmartChineseAnalyzer with that of the IK Analyzer on the same sentences.

package com.learn.lucene.chapter2.analyzer;

import com.learn.lucene.chapter2.ik.IKAnalyzer8x;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.io.StringReader;
import java.util.StringJoiner;

/**
* Chinese analyzer comparison:
* 1. Lucene's built-in SmartChineseAnalyzer
* 2. IKAnalyzer8x
*/
public class IkVSSmartchDemo {

private static String str1 = "公路局正在治理解放大道路面积水问题。";
private static String str2 = "IKAnalyzer 是一个开源的,基于java语言开发的轻量级的中文分词工具包。";

public static void main(String[] args) throws IOException {
System.out.println("句子一:" + str1);
System.out.println("SmartChineseAnalyzer分词结果:" + printAnalyzer(new SmartChineseAnalyzer(), str1));
System.out.println("IKAnalyzer8x分词结果:" + printAnalyzer(new IKAnalyzer8x(true), str1));
// System.out.println("IKAnalyzer分词结果(bug):" + printAnalyzer(new IKAnalyzer(), str1));
System.out.println("----------------------------------------");
System.out.println("句子二:" + str2);
System.out.println("SmartChineseAnalyzer分词结果:" + printAnalyzer(new SmartChineseAnalyzer(), str2));
System.out.println("IKAnalyzer8x分词结果:" + printAnalyzer(new IKAnalyzer8x(true), str2));
// System.out.println("IKAnalyzer分词结果(bug):" + printAnalyzer(new IKAnalyzer(true), str2));
}

public static String printAnalyzer(Analyzer analyzer, String str) throws IOException {
StringReader reader = new StringReader(str);
TokenStream tokenStream = analyzer.tokenStream("content", reader); // field name, not the text
tokenStream.reset();
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
StringJoiner stringJoiner = new StringJoiner("|");
while (tokenStream.incrementToken()) {
stringJoiner.add(charTermAttribute.toString());
}
tokenStream.end();
tokenStream.close();
analyzer.close();
return stringJoiner.toString();
}
}

2.3.5、Extending the Stop-Word Dictionary

IK Analyzer's default stop-word dictionary is IKAnalyzer2012_u6/stopword.dic.

It contains only around 30 English stop words, which is far from complete.

A more complete extended stop-word list is recommended: https://github.com/cseryp/stopwords

Create a file named ext_stopword.dic in the project and place it in the same directory as IKAnalyzer.cfg.xml.
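The dictionary is a plain-text file saved as UTF-8, one stop word per line. A tiny illustrative sample (placeholder entries, not the recommended list):

的
了
是
吧
呢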

Then edit IKAnalyzer.cfg.xml and register the new dictionary there.
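A sketch of the registration, following the standard IKAnalyzer.cfg.xml layout; the ext_stopwords entry is the one relevant here, while ext_dict (its file name is a placeholder) is where the custom dictionary from section 2.3.6 would be registered:

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- user-defined extension dictionaries (placeholder file name) -->
    <entry key="ext_dict">ext.dic;</entry>
    <!-- user-defined extended stop-word dictionaries -->
    <entry key="ext_stopwords">ext_stopword.dic;</entry>
</properties>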