1. 引入maven依赖
<!-- Dependencies required by the SimHash-based article similarity utility -->
<!-- jsoup: used by the utility class to strip HTML markup from the input (Jsoup.clean) -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- HanLP: used by the utility class to segment the text into terms (StandardTokenizer.segment) -->
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.2</version>
</dependency>
2.创建工具类
package com.datago.common.utils.similarity;
import com.hankcs.hanlp.seg.common.Term;
import com.hankcs.hanlp.tokenizer.StandardTokenizer;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * SimHash-based text similarity helper.
 *
 * <p>Each instance holds the cleaned text and its SimHash fingerprint. The
 * fingerprint is built by segmenting the text with HanLP's
 * {@code StandardTokenizer}, hashing every term, and letting each term vote
 * (weighted by its part-of-speech tag) on every bit of the fingerprint.
 * Two fingerprints are compared by Hamming distance.
 *
 * <p>Instances are immutable once constructed. External callers use
 * {@link #getRatio(String, String)}.
 */
public class SimilarityUtils {

    /** Fingerprint width used by {@link #getRatio(String, String)}. */
    private static final int DEFAULT_HASH_BITS = 64;

    /**
     * Once a word's recorded count exceeds this value, further occurrences are
     * skipped, so one word contributes at most {@code OVER_COUNT + 1} times.
     */
    private static final int OVER_COUNT = 5;

    /** Multiplier used by the per-term hash in {@link #hash(String)}. */
    private static final BigInteger HASH_MULTIPLIER = BigInteger.valueOf(1000003L);

    /**
     * Patterns removed from the text by {@link #cleanResume(String)}. They are
     * applied through {@link String#replaceAll(String, String)} and are therefore
     * regular expressions: {@code "\\r"}, {@code "\\n"} and {@code "\\t"} match
     * the control characters themselves, not a literal backslash sequence.
     */
    private static final String[] STRIPPED_PATTERNS =
            {" ", "\n", "\r", "\t", "\\r", "\\n", "\\t", " "};

    /** Part-of-speech tag to vote weight; tags that are not listed weigh 1. */
    private static final Map<String, Integer> WEIGHT_OF_NATURE = new HashMap<String, Integer>();

    /** Part-of-speech tags that do not vote; only the keys are used. */
    private static final Map<String, String> STOP_NATURES = new HashMap<String, String>();

    static {
        WEIGHT_OF_NATURE.put("n", 2);
        STOP_NATURES.put("w", "");
    }

    /** Cleaned text (HTML stripped, lower-cased, whitespace removed). */
    private final String tokens;

    /** SimHash fingerprint of {@link #tokens}. */
    private final BigInteger strSimHash;

    /** Number of bits in the fingerprint. */
    private final int hashbits;

    /** {@code 2^hashbits - 1}; keeps intermediate values within the fingerprint width. */
    private final BigInteger hashMask;

    /**
     * Cleans the text and computes its fingerprint.
     *
     * @param tokens   raw text, may contain HTML; {@code null} is treated as empty text
     * @param hashbits fingerprint width in bits, must be positive
     * @throws IllegalArgumentException if {@code hashbits} is not positive
     */
    private SimilarityUtils(String tokens, int hashbits) {
        if (hashbits <= 0) {
            // A zero width would make getSemblance divide by zero; a negative one
            // cannot size the vote array.
            throw new IllegalArgumentException("hashbits must be positive: " + hashbits);
        }
        this.hashbits = hashbits;
        this.hashMask = BigInteger.ONE.shiftLeft(hashbits).subtract(BigInteger.ONE);
        this.tokens = cleanResume(tokens);
        this.strSimHash = this.simHash();
    }

    /**
     * Strips all HTML markup, lower-cases the text and removes the characters
     * matched by {@link #STRIPPED_PATTERNS}.
     *
     * @param content raw text; {@code null} yields an empty string
     * @return the cleaned text, never {@code null}
     */
    private static String cleanResume(String content) {
        if (content == null) {
            return "";
        }
        String cleaned = Jsoup.clean(content, Whitelist.none());
        cleaned = StringUtils.lowerCase(cleaned);
        for (String pattern : STRIPPED_PATTERNS) {
            cleaned = cleaned.replaceAll(pattern, "");
        }
        return cleaned;
    }

    /**
     * Computes the SimHash fingerprint of {@link #tokens}.
     *
     * @return a non-negative value whose low {@code hashbits} bits are the fingerprint
     */
    private BigInteger simHash() {
        int[] votes = new int[this.hashbits];

        // Empty text has no terms; skip segmentation and leave every vote at zero.
        if (this.tokens.length() > 0) {
            List<Term> termList = StandardTokenizer.segment(this.tokens);
            Map<String, Integer> wordCount = new HashMap<String, Integer>();
            for (Term term : termList) {
                String word = term.word;
                String nature = term.nature.toString();

                // Cap the influence of very frequent words.
                Integer seen = wordCount.get(word);
                if (seen == null) {
                    wordCount.put(word, 1);
                } else if (seen > OVER_COUNT) {
                    continue;
                } else {
                    wordCount.put(word, seen + 1);
                }

                if (STOP_NATURES.containsKey(nature)) {
                    continue;
                }

                // The weight depends only on the term, so look it up once per term.
                Integer configured = WEIGHT_OF_NATURE.get(nature);
                int weight = configured == null ? 1 : configured;

                BigInteger termHash = this.hash(word);
                for (int i = 0; i < this.hashbits; i++) {
                    if (termHash.testBit(i)) {
                        votes[i] += weight;
                    } else {
                        votes[i] -= weight;
                    }
                }
            }
        }

        // A bit is set when its vote total is non-negative (ties count as set).
        BigInteger fingerprint = BigInteger.ZERO;
        for (int i = 0; i < this.hashbits; i++) {
            if (votes[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
            }
        }
        return fingerprint;
    }

    /**
     * Hashes a single term to a non-negative value.
     *
     * @param source the term; {@code null} or empty hashes to zero
     * @return the term hash
     */
    private BigInteger hash(String source) {
        if (source == null || source.length() == 0) {
            return BigInteger.ZERO;
        }
        // Pad short terms to three characters by repeating the first character.
        while (source.length() < 3) {
            source = source + source.charAt(0);
        }
        char[] sourceArray = source.toCharArray();
        BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);
        for (char item : sourceArray) {
            x = x.multiply(HASH_MULTIPLIER).xor(BigInteger.valueOf((long) item)).and(this.hashMask);
        }
        // x is masked (non-negative) and the length is positive, so the result can
        // never be -1; no special case for that value is needed.
        return x.xor(BigInteger.valueOf(source.length()));
    }

    /**
     * Counts the fingerprint bits (within this instance's {@code hashbits}) that
     * differ between this instance and {@code other}.
     *
     * @param other the instance to compare with
     * @return the number of differing bits
     */
    private int hammingDistance(SimilarityUtils other) {
        return this.strSimHash.xor(other.strSimHash).and(this.hashMask).bitCount();
    }

    /**
     * Returns the similarity to {@code s2} as {@code 1 - hammingDistance / hashbits}.
     *
     * @param s2 the instance to compare with
     * @return the similarity; 1.0 means the fingerprints are identical
     */
    public double getSemblance(SimilarityUtils s2) {
        double distance = (double) this.hammingDistance(s2);
        return 1 - distance / this.hashbits;
    }

    /**
     * Computes the similarity of two texts using 64-bit fingerprints.
     *
     * @param inValue  first text, may contain HTML; {@code null} is treated as empty
     * @param outValue second text, may contain HTML; {@code null} is treated as empty
     * @return the similarity; 1.0 means the fingerprints are identical
     */
    public static double getRatio(String inValue, String outValue) {
        SimilarityUtils hash1 = new SimilarityUtils(inValue, DEFAULT_HASH_BITS);
        SimilarityUtils hash2 = new SimilarityUtils(outValue, DEFAULT_HASH_BITS);
        return hash1.getSemblance(hash2);
    }
}
3.应用
/**
 * Demo: fingerprints two nearly identical sentences with 64 bits and prints
 * their Hamming distance followed by their similarity.
 *
 * <p>Uses the private constructor and the private {@code hammingDistance}, so
 * this method has to live inside {@code SimilarityUtils}.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final int bits = 64;
    SimilarityUtils shorter = new SimilarityUtils("老铁,加个关注呗!!!666", bits);
    SimilarityUtils longer = new SimilarityUtils("老铁,加个关注呗!!!6666", bits);

    int distance = shorter.hammingDistance(longer);
    double semblance = shorter.getSemblance(longer);

    System.out.println(distance);
    System.out.println(semblance);
}
4.控制台输出结果
