2. The structure of a lexical unit (token)
3. The design approach for synonyms
4. Comparing and testing the analyzers
package org.lucene.test;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.lucene.util.AnalyzerUtils;
import org.lucene.util.MySameAnalyzer;
import org.lucene.util.MyStopAnalyzer;
import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
public class TestAnalyzer {
/**
 * Comparing several analyzers on English text
 */
@Test
public void test01(){
//standard analyzer
Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
//stop-word analyzer
Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
//simple analyzer
Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
//whitespace analyzer
Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
String txt = "this is my house,I am come from yunnang zhaotong," +
"My email is ynkonghao@gmail.com,My QQ is 707807876";
AnalyzerUtils.displayToken(txt, a1);
//[my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail.com][my][qq][707807876]
AnalyzerUtils.displayToken(txt, a2);
//[my][house][i][am][come][from][yunnang][zhaotong][my][email][ynkonghao][gmail][com][my][qq]
AnalyzerUtils.displayToken(txt, a3);
//[this][is][my][house][i][am][come][from][yunnang][zhaotong][my][email][is][ynkonghao][gmail][com][my][qq][is]
AnalyzerUtils.displayToken(txt, a4);
//[this][is][my][house,I][am][come][from][yunnang][zhaotong,My][email][is][ynkonghao@gmail.com,My][QQ][is][707807876]
}
/**
 * Comparing several analyzers on Chinese text
 */
@Test
public void test02(){
//standard analyzer
Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
//stop-word analyzer
Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
//simple analyzer
Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
//whitespace analyzer
Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
String txt = "我来自云南昭通昭阳区师专";
AnalyzerUtils.displayToken(txt, a1);
//[我][来][自][云][南][昭][通][昭][阳][区][师][专]
AnalyzerUtils.displayToken(txt, a2);
//[我来自云南昭通昭阳区师专]
AnalyzerUtils.displayToken(txt, a3);
//[我来自云南昭通昭阳区师专]
AnalyzerUtils.displayToken(txt, a4);
//[我来自云南昭通昭阳区师专]
}
/**
 * Print detailed token information
 */
@Test
public void test03(){
//standard analyzer
Analyzer a1 = new StandardAnalyzer(Version.LUCENE_35);
//stop-word analyzer
Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
//simple analyzer
Analyzer a3 = new SimpleAnalyzer(Version.LUCENE_35);
//whitespace analyzer
Analyzer a4 = new WhitespaceAnalyzer(Version.LUCENE_35);
String txt = "how are you thank you";
AnalyzerUtils.displayAllToken(txt, a1);
AnalyzerUtils.displayAllToken(txt, a2);
AnalyzerUtils.displayAllToken(txt, a3);
AnalyzerUtils.displayAllToken(txt, a4);
}
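/*
 * For the StandardAnalyzer the detailed dump (using displayAllToken from
 * section 8) should look roughly like this -- "are" is a default stop word,
 * so the position increment of the token after it jumps to 2:
 * 1:how[0-3-<ALPHANUM>]
 * 2:you[8-11-<ALPHANUM>]
 * 1:thank[12-17-<ALPHANUM>]
 * 1:you[18-21-<ALPHANUM>]
 */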
/**
 * Stop-word test
 */
@Test
public void test04(){
Analyzer a1 = new MyStopAnalyzer(new String[]{"I","you","hate"});
Analyzer a2 = new StopAnalyzer(Version.LUCENE_35);
String txt = "how are You thAnk's you I hate you";
AnalyzerUtils.displayToken(txt, a1);
AnalyzerUtils.displayToken(txt, a2);
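//Expected output (a sketch, based on how the two stop sets are built):
//a1 also removes "i", "you" and "hate", so it should print
//[how][thank][s]
//while the stock StopAnalyzer only drops the default English stop words:
//[how][you][thank][s][you][i][hate][you]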
}
/**
 * Chinese segmentation test:
 * dictionary-based segmentation with a user-extensible dictionary
 */
@Test
public void test05(){
// Analyzer a1 = new MMSegAnalyzer();//without loading the dictionary that ships with the analyzer
//[我][来][自][云][南][昭][通][昭][阳][区][师][专]
//once the bundled dictionary is loaded, segmentation becomes dictionary-based
Analyzer a1 = new MMSegAnalyzer(new File("D:\\Workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data"));
//[我][来自][云南][昭][通][昭][阳][区][师专]
//you can add your own words to words-my.dic under the data directory; e.g. after adding 昭通 the result becomes:
//[我][来自][云南][昭通][昭][阳][区][师专]
String txt = "我来自云南昭通昭阳区师专";
AnalyzerUtils.displayToken(txt, a1);
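//(a note assumed from mmseg4j's conventions, not stated in the original post:
// the custom dictionary files are the words*.dic files under the data
// directory, plain text with one word per line, saved as UTF-8)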
}
/**
 * Synonym test
 * @throws IOException
 * @throws CorruptIndexException
 */
@Test
public void test06() throws CorruptIndexException, IOException{
Analyzer a1 = new MySameAnalyzer();
String txt = "我来自中国云南昭通昭阳区师专";
AnalyzerUtils.displayAllToken(txt, a1);
String keyword = "俺";
Directory dire = new RAMDirectory();
IndexWriter indexWriter = new IndexWriter(dire,new IndexWriterConfig(Version.LUCENE_35, a1));
Document doc = new Document();
doc.add(new Field("content",txt,Field.Store.YES,Field.Index.ANALYZED));
indexWriter.addDocument(doc);
indexWriter.close();
IndexSearcher search = new IndexSearcher(IndexReader.open(dire));
TopDocs topDoc = search.search(new TermQuery(new Term("content",keyword)),10);
ScoreDoc[] scoreDoc = topDoc.scoreDocs;
for(ScoreDoc score : scoreDoc){
Document doc1 = search.doc(score.doc);
System.out.println(doc1.get("content"));
}
}
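/*
 * A further check (a sketch, not part of the original post): because
 * MySameTokenFilter indexes each synonym at the same position as the
 * original token, every entry of the synonym map -- e.g. "天朝" for
 * "中国" -- should retrieve the document, just like "俺" does in test06.
 */
@Test
public void test07() throws CorruptIndexException, IOException{
Analyzer a1 = new MySameAnalyzer();
String txt = "我来自中国云南昭通昭阳区师专";
Directory dire = new RAMDirectory();
IndexWriter indexWriter = new IndexWriter(dire,new IndexWriterConfig(Version.LUCENE_35, a1));
Document doc = new Document();
doc.add(new Field("content",txt,Field.Store.YES,Field.Index.ANALYZED));
indexWriter.addDocument(doc);
indexWriter.close();
IndexSearcher search = new IndexSearcher(IndexReader.open(dire));
//"天朝" was emitted as a synonym of "中国", so one hit is expected here
TopDocs topDoc = search.search(new TermQuery(new Term("content","天朝")),10);
System.out.println("hits for 天朝: " + topDoc.totalHits);
}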
}
5. Extending your own stop-word analyzer
package org.lucene.util;
import java.io.Reader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LetterTokenizer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.Version;
/**
 * A custom stop-word analyzer
 * @author user
 *
 */
public class MyStopAnalyzer extends Analyzer{
private Set stops;
public MyStopAnalyzer(String[] sws){
//makeStopSet converts the String array into a stop set (here ignoring case)
stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
//add the default English stop words as well
stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
public MyStopAnalyzer(){
//use only the default English stop words
//(the original version called addAll on an uninitialized Set, which
//would throw a NullPointerException; initialize it first)
stops = new HashSet();
stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
//Note: the original version first pumped the Reader through a debug
//LetterTokenizer to print each term. That exhausts the Reader, so the
//chain returned below would then produce no tokens; build it directly.
//Set up the Tokenizer and the filter chain for this analyzer
//(a StandardTokenizer could be used instead of the LetterTokenizer)
return new StopFilter(Version.LUCENE_35,
new LowerCaseFilter(Version.LUCENE_35,
new LetterTokenizer(Version.LUCENE_35, reader)),
stops);
}
}
6. Extending the analyzer: a synonym analyzer
package org.lucene.util;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import com.chenlb.mmseg4j.Dictionary;
import com.chenlb.mmseg4j.MaxWordSeg;
import com.chenlb.mmseg4j.analysis.MMSegTokenizer;
/**
 * Analyzer extension: a synonym analyzer
 * @author user
 *
 */
public class MySameAnalyzer extends Analyzer{
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
Dictionary dic = Dictionary.getInstance("D:\\Workspaces\\03_lucene_analyzer\\mmseg4j-1.8.4\\data");
return new MySameTokenFilter(new MMSegTokenizer(new MaxWordSeg(dic), reader));
}
}
7、同义词过滤器的扩展
package org.lucene.util;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Stack;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Extending a TokenFilter: the synonym filter
 * @author user
 *
 */
public class MySameTokenFilter extends TokenFilter{
private CharTermAttribute cta = null;
private PositionIncrementAttribute pia = null;
private AttributeSource.State current = null;
private Stack<String> sames = null;
protected MySameTokenFilter(TokenStream input) {
super(input);
cta = this.addAttribute(CharTermAttribute.class);
pia = this.addAttribute(PositionIncrementAttribute.class);
sames = new Stack<String>();
}
/**
 * The idea:
 * each synonym must be emitted through the CharTermAttribute, but calling
 * cta.append("大陆") directly would concatenate the original word and the
 * synonym into one token [中国大陆], which is wrong; the desired result is
 * two tokens at the same position: [中国][大陆].
 * So when a token with synonyms is encountered, capture the current state
 * and push the synonyms onto a stack. On the next call, if the stack is
 * not empty, restore the saved state, clear the term with cta.setEmpty(),
 * append the synonym with cta.append("大陆"), and set the position
 * increment to 0 with pia.setPositionIncrement(0) to mark it as a synonym;
 * then return that token.
 */
@Override
public boolean incrementToken() throws IOException {
while(sames.size() > 0){
//pop the next synonym off the stack
String str = sames.pop();
//restore the state captured for the original token (offsets etc.)
restoreState(current);
cta.setEmpty();
cta.append(str);
//a position increment of 0 places the synonym at the same position
pia.setPositionIncrement(0);
return true;
}
if(!input.incrementToken()) return false;
if(getSameWords(cta.toString())){
//the current token has synonyms: save its state before emitting it
current = captureState();
}
return true;
}
/*
 * This approach does not work: it replaces [中国] with [大陆]
 * instead of producing two tokens [中国][大陆].
@Override
public boolean incrementToken() throws IOException {
if(!input.incrementToken()) return false;
if(cta.toString().equals("中国")){
cta.setEmpty();
cta.append("大陆");
}
return true;
}
*/
private boolean getSameWords(String name){
Map<String,String[]> maps = new HashMap<String,String[]>();
maps.put("中国", new String[]{"大陆","天朝"});
maps.put("我", new String[]{"咱","俺"});
String[] sws = maps.get(name);
if(sws != null){
for(String s : sws){
sames.push(s);
}
return true;
}
return false;
}
}
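To see the filter in action, run AnalyzerUtils.displayAllToken from section 8 over MySameAnalyzer, as test06 does. Assuming the mmseg4j dictionary segments 我 and 中国 as standalone tokens, the dump should contain entries along these lines (type column omitted; the position increment of 0 marks each synonym, and the synonyms reuse the original token's offsets):

1:我[0-1]
0:俺[0-1]
0:咱[0-1]
1:来自[1-3]
1:中国[3-5]
0:天朝[3-5]
0:大陆[3-5]

One design note: getSameWords rebuilds its HashMap on every token; in real code the synonym map would live in a static field or a separate synonym-lookup component so that it is constructed only once.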
8. Printing token information
package org.lucene.util;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
 * Utility class for printing token information
 * @author user
 *
 */
public class AnalyzerUtils {
public static void displayToken(String str,Analyzer a){
TokenStream stream = a.tokenStream("content", new StringReader(str));
/*
 * A TokenStream works like a stream, and a CharTermAttribute like a bowl
 * dropped into it: as the stream advances from token to token, the same
 * attribute instance is refilled with the next token's value.
 * This is Lucene's attribute design pattern: an attribute is registered
 * with the stream once and is updated as the TokenStream advances.
 */
CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
try {
while(stream.incrementToken()){
System.out.print("["+cta+"]");
// System.out.println(stream);
//printing the stream itself calls its toString, which yields e.g.:
//(来,startOffset=1,endOffset=2,positionIncrement=1,type=<IDEOGRAPHIC>)
}
System.out.println();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
 * Print tokens with full details
 * @param str
 * @param a
 */
public static void displayAllToken(String str,Analyzer a){
TokenStream stream = a.tokenStream("content", new StringReader(str));
//position increment
PositionIncrementAttribute pia = stream.addAttribute(PositionIncrementAttribute.class);
//offsets
OffsetAttribute oa = stream.addAttribute(OffsetAttribute.class);
//the term itself
CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
//token type
TypeAttribute ta = stream.addAttribute(TypeAttribute.class);
try {
while(stream.incrementToken()){
System.out.print(pia.getPositionIncrement()+":");
System.out.print(cta+"["+oa.startOffset()+"-"+
oa.endOffset()+"-"+ta.type()+"]");
System.out.println();
}
System.out.println();
} catch (IOException e) {
e.printStackTrace();
}
}
}
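Both helpers are exercised throughout section 4; a minimal standalone call looks like this:

AnalyzerUtils.displayToken("hello world", new StandardAnalyzer(Version.LUCENE_35));
//prints: [hello][world]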
Project download: http://download.csdn.net/detail/wxwzy738/5284705