举个例子:/aaa/bbb/ccc/12345/kkk/45678和/aaa/bbb/ccc/512515/kkk/32143
这类url中会变化的部分是路径参数,用正则表示就是 /aaa/bbb/ccc/[^/]+/kkk/[^/]+
这边记录一下我的思路
我的一个思路是:
1、将两个url根据/拆分成两个有序字符串数组
2、统计两个数组中字符串出现的次数
3、相似度计算:

以两个数组中所有不重复的字符串作为维度,把每个字符串在各自数组中出现的次数作为分量,得到两个向量a和b,
根据余弦相似度 $\cos\theta = \frac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}$ 计算出相似度,大于0.5则表示这两个url相似,可用正则表示
4、比较两个url,把互不相同的路径片段替换为表示路径参数的正则 [^/]+
相似度计算
/**
 * Computes the cosine similarity between two URL paths on the same domain.
 *
 * <p>Each path is split on {@code "/"} and the first element of the split result is
 * dropped (for a path starting with {@code "/"} that element is the empty string).
 * The remaining segments are turned into occurrence-count vectors and compared with
 * cosine similarity, so segment order does not influence the score.
 *
 * @param domain1 domain of the first URL
 * @param path1   path of the first URL
 * @param domain2 domain of the second URL
 * @param path2   path of the second URL
 * @return the similarity rounded half-up to 2 decimal places; zero when any argument is
 *         blank, the domains differ, the paths have a different number of segments, or
 *         either path has no segments left to compare
 */
public static BigDecimal findLike(String domain1, String path1, String domain2, String path2){
    if(StringUtils.isAnyBlank(domain1, domain2, path1, path2)){
        return BigDecimal.ZERO;
    }
    if(!domain1.equals(domain2)){
        return BigDecimal.ZERO;
    }
    List<String> path1Arr = Arrays.asList(path1.split("/"));
    List<String> path2Arr = Arrays.asList(path2.split("/"));
    // split() drops trailing empty strings, so a path such as "/" yields an empty array.
    if(path1Arr.isEmpty() || path2Arr.isEmpty()){
        return BigDecimal.ZERO;
    }
    path1Arr = path1Arr.subList(1, path1Arr.size());
    path2Arr = path2Arr.subList(1, path2Arr.size());
    if(path1Arr.size() != path2Arr.size()){
        return BigDecimal.ZERO;
    }
    Map<String, AtomicInteger> frequency1 = getFrequency(path1Arr);
    Map<String, AtomicInteger> frequency2 = getFrequency(path2Arr);

    long ab = 0L; // dot product a . b
    long aa = 0L; // squared norm of a
    long bb = 0L; // squared norm of b
    for (Map.Entry<String, AtomicInteger> entry : frequency1.entrySet()) {
        long x1 = entry.getValue().get();
        aa += x1 * x1;
        AtomicInteger x2 = frequency2.get(entry.getKey());
        if (x2 != null) {
            ab += x1 * x2.get();
        }
    }
    for (AtomicInteger count : frequency2.values()) {
        long x2 = count.get();
        bb += x2 * x2;
    }
    // A path without any "/" leaves nothing after the first element is dropped; both
    // norms are then zero and the division below would throw ArithmeticException.
    if (aa == 0L || bb == 0L) {
        return BigDecimal.ZERO;
    }
    BigDecimal aabb = BigDecimal.valueOf(Math.sqrt(aa)).multiply(BigDecimal.valueOf(Math.sqrt(bb)));
    return BigDecimal.valueOf(ab).divide(aabb, 2, java.math.RoundingMode.HALF_UP);
}
/**
 * Counts how many times each string occurs in the given list.
 *
 * @param list the strings to count
 * @return a new map from each distinct string to its occurrence count
 */
public static Map<String, AtomicInteger> getFrequency(List<String> list){
    Map<String, AtomicInteger> counts = new HashMap<>();
    for (String segment : list) {
        AtomicInteger counter = counts.get(segment);
        if (counter == null) {
            // First time this string is seen: register a counter starting at zero.
            counter = new AtomicInteger();
            counts.put(segment, counter);
        }
        counter.incrementAndGet();
    }
    return counts;
}
转换正则
/**
 * Builds a single pattern covering both paths by replacing every segment that does not
 * occur anywhere in the other path with {@code any_string}.
 *
 * <p>Both paths are split on {@code "/"}; the first element of each split result (the
 * text before the first slash) is never replaced. The two rewritten paths must come out
 * identical, otherwise no common pattern is returned.
 *
 * <p>NOTE(review): {@code any_string} is declared outside this method; the sample output
 * in this file suggests it is the segment wildcard {@code [^/]+} - confirm.
 *
 * @param path1 first URL path
 * @param path2 second URL path
 * @return the common pattern, or the empty string when either path is {@code null}, has
 *         no elements after splitting (for example {@code "/"}), or the rewritten paths
 *         differ
 */
public static String getRegex(String path1, String path2){
    if (path1 == null || path2 == null) {
        return "";
    }
    List<String> path1Arr = Arrays.asList(path1.split("/"));
    List<String> path2Arr = Arrays.asList(path2.split("/"));
    // split() returns an empty array for paths like "/"; subList(1, 0) would throw.
    if (path1Arr.isEmpty() || path2Arr.isEmpty()) {
        return "";
    }
    // Snapshot the original segments so the membership checks are not affected by the
    // in-place replacements performed below.
    List<String> path1List = new ArrayList<>(path1Arr.subList(1, path1Arr.size()));
    List<String> path2List = new ArrayList<>(path2Arr.subList(1, path2Arr.size()));
    // Replace by position, starting at index 1, so the element before the first slash is
    // never touched even when it equals one of the replaced segments.
    for (int i = 1; i < path1Arr.size(); i++) {
        if (!path2List.contains(path1Arr.get(i))) {
            path1Arr.set(i, any_string);
        }
    }
    for (int i = 1; i < path2Arr.size(); i++) {
        if (!path1List.contains(path2Arr.get(i))) {
            path2Arr.set(i, any_string);
        }
    }
    String regex1 = String.join("/", path1Arr);
    String regex2 = String.join("/", path2Arr);
    return regex1.equals(regex2) ? regex1 : "";
}
测试结果
/**
 * Demo entry point: prints the similarity of two sample URLs on the same domain,
 * followed by the pattern produced for their paths.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String path1 = "/aaa/bbb/ccc/12345/kkk/45678";
    String path2 = "/aaa/bbb/ccc/45235/kkk/52545252";
    String domain1 = "www.aaa.com";
    String domain2 = "www.aaa.com";

    BigDecimal similarity = findLike(domain1, path1, domain2, path2);
    System.out.println(similarity.doubleValue());

    String regex = getRegex(path1, path2);
    System.out.println(regex);
}
0.67
/aaa/bbb/ccc/[^/]+/kkk/[^/]+
Process finished with exit code 0