I. Prometheus+Grafana
Installing Prometheus+Grafana is covered in the earlier article on setting up a Prometheus+Grafana monitoring system on CentOS 7.9.
II. Installing and configuring the alertmanager alerting component
For enabling and configuring the mailbox and WeChat Work (企业微信) accounts, see the earlier article on Zabbix 5.0 custom web monitoring with email and WeChat Work alerting.
1. Alertmanager installation and configuration
tar zxf alertmanager-0.23.0.linux-amd64.tar.gz
mv alertmanager-0.23.0.linux-amd64 /opt/prometheus/alertmanager
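Before wiring the binary into systemd, a quick sanity check that it runs (using the path from the mv above):
/opt/prometheus/alertmanager/alertmanager --version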
vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
Documentation=https://github.com/prometheus/alertmanager
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/opt/prometheus/alertmanager/alertmanager --config.file=/opt/prometheus/alertmanager/alertmanager.yml --storage.path=/opt/prometheus/alertmanager/data
Restart=on-failure
[Install]
WantedBy=multi-user.target
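The unit runs Alertmanager as the prometheus user, which should already exist from the earlier Prometheus setup. If it does not, one way to create it as a non-login system account is:
useradd -r -M -s /sbin/nologin prometheus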
vim /opt/prometheus/prometheus.yml
Locate the alertmanager-related alerting settings and change them to:
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.142.132:9093
rule_files:
  # - "first_rules.yml"
  - "rules.yml"
vim /opt/prometheus/rules.yml
groups:
  - name: hostStatsAlert
    rules:
      - alert: NodeDown
        expr: up == 0
        for: 1m
        labels:
          severity: "Critical"
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
      - alert: NodeCPUUsage
        expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance) > 0.85
        for: 1m
        labels:
          severity: "Warning"
        annotations:
          summary: "Instance {{ $labels.instance }} CPU usage high"
          description: "{{ $labels.instance }} CPU usage above 85% (current value: {{ $value }})"
      - alert: NodeMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.85
        for: 1m
        labels:
          severity: "Warning"
        annotations:
          summary: "Instance {{ $labels.instance }} MEM usage high"
          description: "{{ $labels.instance }} MEM usage above 85% (current value: {{ $value }})"
      - alert: filesystemUsageAlert
        expr: 100 - ((node_filesystem_avail_bytes{mountpoint="/",fstype=~"ext4|xfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype=~"ext4|xfs"}) > 85
        for: 1m
        labels:
          severity: "Warning"
        annotations:
          summary: "Instance {{ $labels.instance }} root DISK usage high"
          description: "{{ $labels.instance }} root DISK usage above 85% (current value: {{ $value }})"
vim /opt/prometheus/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:465'
  smtp_from: '******@163.com'
  smtp_auth_username: '******@163.com'
  smtp_auth_password: 'VTKQYELFHUNAPLYC'  # the SMTP authorization code obtained from the mail provider
  smtp_require_tls: false
templates:
  - '/opt/prometheus/alertmanager/template/*.tmpl'
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 10m
  receiver: 'mail'
receivers:
  - name: 'mail'
    email_configs:
      - to: '*********@163.com'  # your own mailbox
        html: '{{ template "test.html" . }}'  # render the mail body with the template defined below
    wechat_configs:  # WeChat Work alert configuration
      - send_resolved: true
        to_party: '2'  # ID of the receiving department
        agent_id: '1000002'  # WeChat Work --> custom app --> AgentId
        corp_id: '*************'  # company info (My Company --> CorpId, at the bottom of the page)
        api_secret: '***************'  # WeChat Work --> custom app --> Secret
        message: '{{ template "test_wechat.html" . }}'  # message template to render
inhibit_rules:
  - source_match:
      severity: 'Critical'  # must match the case used in the rule labels above
    target_match:
      severity: 'Warning'
    equal: ['alertname', 'dev', 'instance']
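The Alertmanager tarball also ships amtool, which can validate this file before the service is started:
/opt/prometheus/alertmanager/amtool check-config /opt/prometheus/alertmanager/alertmanager.yml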
vim /opt/prometheus/alertmanager/template/testmail.tmpl
{{ define "test.html" }}
<table border="1">
<tr>
<td>报警项</td>
<td>实例</td>
<td>报警阀值</td>
<td>开始时间</td>
</tr>
{{ range $i, $alert := .Alerts }}
<tr>
<td>{{ index $alert.Labels "alertname" }}</td>
<td>{{ index $alert.Labels "instance" }}</td>
<td>{{ index $alert.Annotations "value" }}</td>
<td>{{ $alert.StartsAt }}</td>
</tr>
{{ end }}
</table>
{{ end }}
vim /opt/prometheus/alertmanager/template/testwechat.tmpl
{{ define "cdn_live_wechat.html" }}
{{ range $i, $alert := .Alerts.Firing }}
[报警项]:{{ index $alert.Labels "alertname" }}
[实例]:{{ index $alert.Labels "instance" }}
[报警阀值]:{{ index $alert.Annotations "value" }}
[开始时间]:{{ $alert.StartsAt }}
{{ end }}
{{ end }}
2. Starting the alertmanager service
chown -R prometheus:prometheus /opt/prometheus/*
systemctl daemon-reload
systemctl enable alertmanager
systemctl start alertmanager
curl -X POST http://localhost:9090/-/reload  # reload Prometheus so it picks up the alerting and rule_files changes (requires --web.enable-lifecycle; otherwise restart Prometheus)
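After starting, the service state and Alertmanager's built-in health endpoint (it listens on port 9093 by default) can be checked with:
systemctl status alertmanager
curl http://localhost:9093/-/healthy
The Alertmanager web UI at http://192.168.142.132:9093 should now be reachable as well.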
III. Email alert test
On the monitored Linux host, run fallocate -l 20G /etc/swap to push disk usage past 90%, which crosses the 85% alert threshold.
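Instead of (or in addition to) filling the disk, an alert can be pushed straight to Alertmanager's v2 API to verify the mail and WeChat Work delivery path end to end; the labels below are made up for this test, and the swap file should be removed afterwards so the disk alert resolves:
curl -X POST http://localhost:9093/api/v2/alerts \
  -H 'Content-Type: application/json' \
  -d '[{"labels":{"alertname":"ManualTest","severity":"Warning","instance":"test-host"},"annotations":{"summary":"Manually posted test alert","description":"Fired via the v2 API to test notification delivery"}}]'
rm -f /etc/swap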