0
点赞
收藏
分享

微信扫一扫

拆字工具类


import com.google.gson.Gson;
import lombok.extern.slf4j.Slf4j;
import net.go2global.common.core.bean.dto.StringSplitDTO;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.BufferedReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
* 拆词
* @Author zyh
* @Date 2020/10/23 17:04
*/
@Slf4j
public class StringSplitUtils {

public static void main(String[] args) {

//String input="Günaydın Patron 좋은 아침 Reduce 1 hour of the a-b a_b a:b www.163.com can't remaining building duration你好我就随便测测";
String input="I love I Beijing Tiananmen Square, the sun rises on Tiananmen Square!";
List<StringSplitDTO> list = getSplit(input);
log.info(new Gson().toJson(list));
}


public static List<StringSplitDTO> getSplit(String input){

List<StringSplitDTO> returnList = new ArrayList<>();

try {
Analyzer analyzer = new StandardAnalyzer();

BufferedReader fileReader = null;
fileReader = new BufferedReader(new StringReader(input));
List<String> result = new ArrayList<String>();
TokenStream ts = analyzer.tokenStream(null, fileReader);
OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);

ts.reset();//必须的
while( ts.incrementToken() ){
int startOffset = offsetAttribute.startOffset();
int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();

StringSplitDTO stringSplitDTO=new StringSplitDTO();
stringSplitDTO.setString(term);
stringSplitDTO.setStartIndex(startOffset);
stringSplitDTO.setEndIndex(endOffset);

returnList.add(stringSplitDTO);
//System.out.println(term + " ["+startOffset+","+endOffset + "]");
}
//System.out.println(result.size());
ts.end();
ts.close();
} catch (Exception e) {
log.error("拆词:"+e);
}

return returnList;
}

public static Map<String,StringSplitDTO> getSplitMap(String input){

Map<String,StringSplitDTO> map = new HashMap<>();

try {
Analyzer analyzer = new StandardAnalyzer();

BufferedReader fileReader = null;
fileReader = new BufferedReader(new StringReader(input));
List<String> result = new ArrayList<String>();
TokenStream ts = analyzer.tokenStream(null, fileReader);
OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);

ts.reset();//必须的
while( ts.incrementToken() ){
int startOffset = offsetAttribute.startOffset();
int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();

StringSplitDTO stringSplitDTO=new StringSplitDTO();
stringSplitDTO.setString(term);
stringSplitDTO.setStartIndex(startOffset);
stringSplitDTO.setEndIndex(endOffset);

map.put(term,stringSplitDTO);
//System.out.println(term + " ["+startOffset+","+endOffset + "]");
}
//System.out.println(result.size());
ts.end();
ts.close();
} catch (Exception e) {
log.error("拆词:"+e);
}

return map;
}








}

 

举报

相关推荐

0 条评论