----alertmanager.yml
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:465'        # 163 mail SMTP server and port
  smtp_from: 'xxxxxxxxxx@163.com'           # sender address
  smtp_auth_username: 'xxxxxxxxxx@163.com'  # SMTP auth username (the mailbox)
  smtp_auth_password: 'xxxxxxxxxxx'         # mailbox authorization code, not the login password
  smtp_require_tls: false                   # whether TLS is required
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 1m    # how long to wait before re-sending an alert, to limit mail frequency
  receiver: 'email'      # default receiver for notifications
receivers:
  - name: 'email'        # receiver name; must match the receiver referenced in the route
    email_configs:
      - to: 'xxxxxxxxxxxxx@163.com'   # recipient address; list additional recipients on separate lines
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
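To notify more than one mailbox, each additional address can go into its own email_configs entry under the same receiver; a minimal sketch (the addresses are placeholders):

receivers:
  - name: 'email'
    email_configs:
      - to: 'first-user@163.com'    # first recipient
        send_resolved: true         # also send a mail when the alert resolves
      - to: 'second-user@163.com'   # second recipient
        send_resolved: true

After editing, running amtool check-config alertmanager.yml validates the file before reloading Alertmanager.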
----prometheus.yml
# my global config
global:
  scrape_interval: 15s     # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - xxx.xxx.xxx:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - /opt/prometheus-2.37.0.linux-amd64/rules/*.yml
  # - "first_rules.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "prometheus"
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ["xxx.xxx.xxx:9090"]
  - job_name: "node_export"
    static_configs:
      - targets: ["xxx.xxx.xxx:9100", "xxx.xxx.xxx:9100", "xxx.xxx.xxx:9100"]
----rules
groups:
  - name: node-alert
    rules:
      - alert: NodeDown   # alert names must be valid metric names, so spaces are not allowed
        expr: up{job="node_export"} == 0
        for: 5m
        labels:
          severity: emergency
          instance: "{{ $labels.instance }}"
        annotations:
          summary: "node: {{ $labels.instance }} down"
          description: "{{ $labels.instance }} has been down for more than 5 minutes"
          value: "{{ $value }}"