0
点赞
收藏
分享

微信扫一扫

ElasticSearch--去重查询/根据字段去重--方法/实例

吃面多放酱 2022-02-17 阅读 156


简介

        本文介绍如何根据某一个字段进行去重。包括:获取去重后的结果,统计去重后的数量。

在SQL中,我们可以用DISTINCT关键字进行去重,例如:


  • 获取去重后的结果:SELECT DISTINCT name, sex FROM person;
  • 统计去重后的数量:SELECT COUNT(DISTINCT name, sex) FROM person;

Elasticsearch也可以做到获取去重后的结果,统计去重后的数量,例如:


  • 获取去重后的结果

      • 方案1:collapse折叠功能(ES 5.3及以上版本支持)
          • 推荐。原因:性能高,占内存小
      • 方案2:字段聚合+top_hits聚合
          • 不推荐。原因:性能差,占内存大

  • 统计去重后的数量
      • 使用cardinality聚合

索引结构及数据

索引结构

​​http://localhost:9200/​​

PUT blog

{
"mappings": {
"properties": {
"blogId":{
"type":"long"
},
"title": {
"type": "text"
},
"content": {
"type": "text"
},
"author":{
"type": "text",
"fields": {
"keyword": {
"type": "keyword"
}
}
},
"category":{
"type": "keyword"
},
"createTime": {
"type": "date",
"format":"yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd'T'HH:mm:ss.SSS||yyyy-MM-dd HH:mm:ss||epoch_millis"
},
"updateTime": {
"type": "date",
"format":"yyyy-MM-dd HH:mm:ss.SSS||yyyy-MM-dd'T'HH:mm:ss.SSS||yyyy-MM-dd HH:mm:ss||epoch_millis"
},
"status":{
"type":"integer"
},
"serialNum": {
"type": "keyword"
}
}
}
}

数据


  • 每条index指令和每个文档都必须独占一行,JSON内部不能换行;请求体的最后一行也必须以换行符结尾。
  • 此命令要放到Postman中去执行,如果用head插件执行会失败。

​​http://localhost:9200/​​

POST _bulk

{"index":{"_index":"blog","_id":1}}
{"blogId":1,"title":"Spring Data ElasticSearch学习教程1","content":"这是批量添加的文档1","author":"Tony","category":"ElasticSearch","status":1,"serialNum":"1","createTime":"2021-10-10 11:52:01.249","updateTime":null}
{"index":{"_index":"blog","_id":2}}
{"blogId":2,"title":"Spring Data ElasticSearch学习教程2","content":"这是批量添加的文档2","author":"Tony","category":"ElasticSearch","status":1,"serialNum":"2","createTime":"2021-10-10 11:52:02.249","updateTime":null}
{"index":{"_index":"blog","_id":3}}
{"blogId":3,"title":"Spring Data ElasticSearch学习教程3","content":"这是批量添加的文档3","author":"Tony","category":"ElasticSearch","status":1,"serialNum":"3","createTime":"2021-10-10 11:52:03.249","updateTime":null}
{"index":{"_index":"blog","_id":4}}
{"blogId":4,"title":"Spring Data ElasticSearch学习教程4","content":"这是批量添加的文档4","author":"Tony","category":"ElasticSearch","status":1,"serialNum":"4","createTime":"2021-10-10 11:52:04.249","updateTime":null}
{"index":{"_index":"blog","_id":5}}
{"blogId":5,"title":"Spring Data ElasticSearch学习教程5","content":"这是批量添加的文档5","author":"Tony","category":"ElasticSearch","status":1,"serialNum":"5","createTime":"2021-10-10 11:52:05.249","updateTime":null}
{"index":{"_index":"blog","_id":6}}
{"blogId":6,"title":"Java学习教程6","content":"这是批量添加的文档6","author":"Tony","category":"ElasticSearch","status":1,"serialNum":"6","createTime":"2021-10-10 11:52:06.249","updateTime":null}
{"index":{"_index":"blog","_id":7}}
{"blogId":7,"title":"Java学习教程7","content":"这是批量添加的文档7","author":"Pepper","category":"ElasticSearch","status":1,"serialNum":"7","createTime":"2021-10-10 11:52:07.249","updateTime":null}
{"index":{"_index":"blog","_id":8}}
{"blogId":8,"title":"Java学习教程8","content":"这是批量添加的文档8","author":"Pepper","category":"ElasticSearch","status":1,"serialNum":"8","createTime":"2021-10-10 11:52:08.249","updateTime":null}
{"index":{"_index":"blog","_id":9}}
{"blogId":9,"title":"Java学习教程9","content":"这是批量添加的文档9","author":"Pepper","category":"ElasticSearch","status":1,"serialNum":"9","createTime":"2021-10-10 11:52:09.249","updateTime":null}
{"index":{"_index":"blog","_id":10}}
{"blogId":10,"title":"Java学习教程10","content":"这是批量添加的文档10","author":"Pepper","category":"ElasticSearch","status":1,"serialNum":"10","createTime":"2021-10-10 11:52:10.249","updateTime":null}

执行之后的结果

ElasticSearch--去重查询/根据字段去重--方法/实例_elasticsearch

实例:手写DSL

        去重的字段不能是text类型。所以,author的mapping中要有keyword子字段(即索引结构里author下的fields.keyword),并通过author.keyword去重。

        如果去重字段是其他可以直接去重的类型,比如:数字类型、keyword、日期等,则直接用字段名就可以。即:如果本处author是keyword,则author.keyword处写成author就行。

collapse获取去重结果

POST /blog/_search

{
"query": {
"match": {
"title":{
"query": "java"
}
}
},
"collapse":{
"field": "author.keyword"
}
}

结果:

ElasticSearch--去重查询/根据字段去重--方法/实例_java_02

我把结果全部贴出来

{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": null,
"hits": [{
"_index": "blog",
"_type": "_doc",
"_id": "9",
"_score": 1.0596458,
"_source": {
"blogId": 9,
"title": "Java学习教程9",
"content": "这是批量添加的文档9",
"author": "Pepper",
"category": "ElasticSearch",
"status": 1,
"serialNum": "9",
"createTime": "2021-10-10 11:52:09.249",
"updateTime": null
},
"fields": {
"author.keyword": [
"Pepper"
]
}
}, {
"_index": "blog",
"_type": "_doc",
"_id": "6",
"_score": 0.7361701,
"_source": {
"blogId": 6,
"title": "Java学习教程6",
"content": "这是批量添加的文档6",
"author": "Tony",
"category": "ElasticSearch",
"status": 1,
"serialNum": "6",
"createTime": "2021-10-10 11:52:06.249",
"updateTime": null
},
"fields": {
"author.keyword": [
"Tony"
]
}
}
]
}
}

聚合获取去重结果

POST /blog/_search

{
"query": {
"match": {
"title": {
"query": "java"
}
}
},
"size": 0,
"aggs": {
"author_aggs": {
"terms": {
"field": "author.keyword",
"size": 10
},
"aggs": {
"author_top": {
"top_hits": {
"sort": [{
"author.keyword": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
}

结果

ElasticSearch--去重查询/根据字段去重--方法/实例_java_03

我把结果全部贴出来

{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": null,
"hits": []
},
"aggregations": {
"author_aggs": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{
"key": "Pepper",
"doc_count": 4,
"author_top": {
"hits": {
"total": {
"value": 4,
"relation": "eq"
},
"max_score": null,
"hits": [{
"_index": "blog",
"_type": "_doc",
"_id": "9",
"_score": null,
"_source": {
"blogId": 9,
"title": "Java学习教程9",
"content": "这是批量添加的文档9",
"author": "Pepper",
"category": "ElasticSearch",
"status": 1,
"serialNum": "9",
"createTime": "2021-10-10 11:52:09.249",
"updateTime": null
},
"sort": [
"Pepper"
]
}
]
}
}
}, {
"key": "Tony",
"doc_count": 1,
"author_top": {
"hits": {
"total": {
"value": 1,
"relation": "eq"
},
"max_score": null,
"hits": [{
"_index": "blog",
"_type": "_doc",
"_id": "6",
"_score": null,
"_source": {
"blogId": 6,
"title": "Java学习教程6",
"content": "这是批量添加的文档6",
"author": "Tony",
"category": "ElasticSearch",
"status": 1,
"serialNum": "6",
"createTime": "2021-10-10 11:52:06.249",
"updateTime": null
},
"sort": [
"Tony"
]
}
]
}
}
}
]
}
}
}

聚合获取去重数量

POST /blog/_search

{
"query": {
"match": {
"title": {
"query": "java"
}
}
},
"size": 0,
"aggs": {
"author_aggs": {
"cardinality": {
"field": "author.keyword"
}
}
}
}

结果:返回的aggregations中author_aggs的value为2,即标题匹配“java”的文档中共有2个不同的作者(Pepper、Tony)。

实例:Java代码

collapse获取去重结果

在这个网址里搜索“去重”


举报

相关推荐

0 条评论