1 在每一台被监控的客户端上安装 OMSA(安装方法参见我的另一篇文章)
《dell服务器硬件监控omsa部署》(m0_61626354 的 CSDN 博客)
2 cd /etc/zabbix/ #进入zabbix目录下
3 vim /etc/zabbix/zabbix_agentd.d/userparameter_hardwarecheck.conf
#在该配置文件中添加 Zabbix 自定义监控项(UserParameter)。思路:通过 /opt 下的脚本执行 OMSA 的 omreport 命令获取服务器硬件状态,有故障输出 1,正常输出 0(raidcard 检查在 omreport 执行失败时输出 2)
# Custom Zabbix agent items for hardware health. On each line the item key
# is the text before the comma and the command run for it follows the comma.
# Every command calls /opt/hardware_check.sh with one check name.
# Memory health ("memory" check).
UserParameter=hardware_memory_health, sh /opt/hardware_check.sh memory
# Processor health ("cpu" check).
UserParameter=hardware_cpu, sh /opt/hardware_check.sh cpu
# Physical disk state ("pdisk" check).
UserParameter=hardware_physics_health, sh /opt/hardware_check.sh pdisk
# Virtual disk status ("vdisk" check).
UserParameter=hardware_virtual_health, sh /opt/hardware_check.sh vdisk
# RAID controller status ("raidcard" check).
UserParameter=hardware_raid_health, sh /opt/hardware_check.sh raidcard
4 vim /opt/hardware_check.sh #到opt下创建脚本
#!/bin/sh
# raid_card_exist - report the status of the RAID controller(s).
#
# Runs "omreport storage controller" once and inspects every output line
# that starts with "Status ".
# Globals:
#   OMREPORT (read, optional) - omreport executable to run; defaults to
#                               /bin/omreport.
# Outputs (stdout), exactly one of:
#   0 - every status line contains "Ok" or "Non-Critical"
#   1 - at least one status line contains neither
#   2 - omreport could not be run or exited with a non-zero status
raid_card_exist() {
  # Capture the report once and reuse it instead of invoking omreport a
  # second time. stderr is discarded so that only the numeric code is
  # printed, even when the probe itself fails.
  if ! _raid_report=$("${OMREPORT:-/bin/omreport}" storage controller 2> /dev/null); then
    echo 2
    return
  fi
  # grep -E replaces the obsolescent egrep; -q replaces "> /dev/null".
  if printf '%s\n' "$_raid_report" | grep "^Status " | grep -Eqv "Ok|Non-Critical"; then
    echo 1
  else
    echo 0
  fi
}
# check_physical_disk - report the state of the physical disks on
# controller 0.
#
# Runs "omreport storage pdisk controller=0" and inspects every output
# line that starts with "State ".
# Globals:
#   OMREPORT (read, optional) - omreport executable to run; defaults to
#                               /bin/omreport.
# Outputs (stdout), exactly one of:
#   0 - every state line contains "Online", "Ready" or "Non-Critical"
#       (also printed when there are no state lines at all)
#   1 - at least one state line contains none of those words
#   2 - the omreport executable cannot be found, so nothing was checked;
#       same code raid_card_exist uses for a failed probe
check_physical_disk() {
  # Without this guard a missing omreport produced no "State" lines and
  # the check reported 0, i.e. "healthy".
  if ! command -v "${OMREPORT:-/bin/omreport}" > /dev/null 2>&1; then
    echo 2
    return
  fi
  # stderr of the probe is dropped so only the numeric code is printed.
  # grep -E replaces the obsolescent egrep; -q replaces "> /dev/null".
  if "${OMREPORT:-/bin/omreport}" storage pdisk controller=0 2> /dev/null \
    | grep "^State " | grep -Eqv "Online|Ready|Non-Critical"; then
    echo 1
  else
    echo 0
  fi
}
# check_virtual_disk - report the status of the virtual disks on
# controller 0.
#
# Runs "omreport storage vdisk controller=0" and inspects every output
# line that starts with "Status ".
# Globals:
#   OMREPORT (read, optional) - omreport executable to run; defaults to
#                               /bin/omreport.
# Outputs (stdout), exactly one of:
#   0 - every status line contains "Ok" or "Non-Critical"
#       (also printed when there are no status lines at all)
#   1 - at least one status line contains neither
#   2 - the omreport executable cannot be found, so nothing was checked;
#       same code raid_card_exist uses for a failed probe
check_virtual_disk() {
  # Without this guard a missing omreport produced no "Status" lines and
  # the check reported 0, i.e. "healthy".
  if ! command -v "${OMREPORT:-/bin/omreport}" > /dev/null 2>&1; then
    echo 2
    return
  fi
  # stderr of the probe is dropped so only the numeric code is printed.
  # grep -E replaces the obsolescent egrep; -q replaces "> /dev/null".
  if "${OMREPORT:-/bin/omreport}" storage vdisk controller=0 2> /dev/null \
    | grep "^Status " | grep -Eqv "Ok|Non-Critical"; then
    echo 1
  else
    echo 0
  fi
}
# check_memory - report the memory health.
#
# Runs "omreport chassis memory" and inspects every output line that
# starts with "Health".
# Globals:
#   OMREPORT (read, optional) - omreport executable to run; defaults to
#                               /bin/omreport.
# Outputs (stdout), exactly one of:
#   0 - every health line contains "Ok"
#       (also printed when there are no health lines at all)
#   1 - at least one health line does not contain "Ok"
#   2 - the omreport executable cannot be found, so nothing was checked;
#       same code raid_card_exist uses for a failed probe
check_memory() {
  # Without this guard a missing omreport produced no "Health" lines and
  # the check reported 0, i.e. "healthy".
  if ! command -v "${OMREPORT:-/bin/omreport}" > /dev/null 2>&1; then
    echo 2
    return
  fi
  # stderr of the probe is dropped so only the numeric code is printed.
  if "${OMREPORT:-/bin/omreport}" chassis memory 2> /dev/null \
    | grep "^Health" | grep -qv "Ok"; then
    echo 1
  else
    echo 0
  fi
}
# check_cpu - report the processor health.
#
# Runs "omreport chassis processors" and inspects every output line that
# starts with "Health ".
# Globals:
#   OMREPORT (read, optional) - omreport executable to run; defaults to
#                               /bin/omreport.
# Outputs (stdout), exactly one of:
#   0 - every health line contains "Ok"
#       (also printed when there are no health lines at all)
#   1 - at least one health line does not contain "Ok"
#   2 - the omreport executable cannot be found, so nothing was checked;
#       same code raid_card_exist uses for a failed probe
check_cpu() {
  # Without this guard a missing omreport produced no "Health" lines and
  # the check reported 0, i.e. "healthy".
  if ! command -v "${OMREPORT:-/bin/omreport}" > /dev/null 2>&1; then
    echo 2
    return
  fi
  # stderr of the probe is dropped so only the numeric code is printed.
  # grep -E replaces the obsolescent egrep; -q replaces "> /dev/null".
  if "${OMREPORT:-/bin/omreport}" chassis processors 2> /dev/null \
    | grep "^Health " | grep -Eqv "Ok"; then
    echo 1
  else
    echo 0
  fi
}
# main - run the hardware check named by the first argument.
#
# Arguments:
#   $1 - one of: pdisk, vdisk, memory, cpu, raidcard
# Outputs:
#   Whatever the selected check prints. For a missing or unrecognised
#   check name the usage line is printed instead.
# Returns:
#   The status of the selected check, or 1 when the usage line is printed.
main() {
  # Pre-check left disabled by the original author; kept for reference:
  #   raid_card_exist
  #   if [ $? -ne 0 ]; then
  #     echo 2
  #     exit
  #   fi

  # "$1" is quoted so an argument containing spaces cannot break the test.
  # The default arm covers both the empty and the unknown name, so a
  # mistyped check no longer ends silently with no output.
  case "$1" in
    pdisk)
      check_physical_disk
      ;;
    vdisk)
      check_virtual_disk
      ;;
    memory)
      check_memory
      ;;
    cpu)
      check_cpu
      ;;
    raidcard)
      raid_card_exist
      ;;
    *)
      echo "Usage: $0 [pdisk|vdisk|memory|cpu|raidcard]"
      return 1
      ;;
  esac
}

main "$@"
保存后,用 sh 分别带上各个参数执行脚本(例如 sh /opt/hardware_check.sh cpu),核对输出的状态值是否符合预期
5 重启zabbix客户端并查看状态
systemctl restart zabbix-agent
systemctl status zabbix-agent
6 在服务端上自己写硬件监控模板
对照上面的参数创建触发器
{Template Hardware Monitor:hardware_cpu.last()}=1
{Template Hardware Monitor:hardware_raid_health.last()}=2
{Template Hardware Monitor:hardware_raid_health.last()}=1
{Template Hardware Monitor:hardware_memory_health.last()}=1
{Template Hardware Monitor:hardware_physics_health.last()}<>0
{Template Hardware Monitor:hardware_virtual_health.last()}=1