0
点赞
收藏
分享

微信扫一扫

Kubernetes Pod 异常诊断

 Kubernetes Pod 的生命周期管理之后,那有什么方式来诊断 Pod 异常呢?

这里作者提供了一个脚本,可以检查重启次数、异常事件、容器退出状态以及容器日志来辅助判断 Pod 异常状态以及异常原因。

退出码

名称

含义



0





正常退出





开发者用来表明容器是正常退出





1





应用错误





容器因应用程序错误或镜像规范中的错误引用而停止





125





容器未能运行





docker run 命令没有执行成功





126





命令调用错误





无法调用镜像中指定的命令





127





找不到文件或目录





找不到镜像中指定的文件或目录





128





退出时使用的参数无效





退出是用无效的退出码触发的(有效代码是 0-255 之间的整数)





134





异常终止 (SIGABRT)





容器使用 abort() 函数自行中止





137





立即终止 (SIGKILL)





容器被操作系统通过 SIGKILL 信号终止





139





分段错误 (SIGSEGV)





容器试图访问未分配给它的内存并被终止





143





优雅终止 (SIGTERM)





容器收到即将终止的警告,然后终止





255





退出状态超出范围





容器退出,返回可接受范围之外的退出代码,表示错误原因未知




#定义pod-check.sh

#!/bin/bash

#是否开启 debug
debug=false
#命名空间
namespace=""
#pod name
pod_name=""
#检查所有
all=false
#忽略只重启过的 Pod
ignore_restart_pod=false

#打印用法
print_usage() {
  echo "使用方法: $0 -p <pod_name> -n <namespace> -a  [-d] [-i] [-h]"
  echo ""
  echo "选项说明:"
  echo "  -n <namespace>      指定命名空间"
  echo "  -p <pod name>       检查指定的 pod,如果不指定 namespace 就是 default"
  echo "  -a                  检查所有 pod"
  echo "  -d                  开启调试模式"
  echo "  -i                  忽略重启过的目前运行正常的 Pod"
  echo "  -h                  打印帮助信息(当前显示)"
  echo ""
}

# 解析命令行参数
while getopts "ahdin:p:c:" opt; do
  case $opt in
  a)
    all=true
    ;;
  d)
    debug=true
    ;;
  i)
    ignore_restart_pod=true
    ;;
  h)
    print_usage
    exit 0
    ;;
  n)
    namespace=$OPTARG
    ;;
  p)
    pod_name=$OPTARG
    ;;
  \?)
    echo "无效的选项: -$OPTARG" >&2
    print_usage
    exit 1
    ;;
  :)
    echo "选项 -$OPTARG 需要一个参数" >&2
    print_usage
    exit 1
    ;;
  esac
done

function print_red() {
  echo -e "\x1b[1;31m$1\x1b[0m"
}

function print_green() {
  echo -e "\x1b[1;32m$1\x1b[0m"
}

function print_bold() {
  echo -e "\033[1;m$1\033[0m"
}

# 读取输入
read_input() {

  print_bold "输入任意键继续..."
  read user_input </dev/tty

}

# 检查是否开启调试模式
if $debug; then
  set -x
fi

#需要确保当前脚本可以调用 kubectl


#定义容器退出码arry
declare -A EXIT_CODES=(["0"]="Purposely stopped, Used by developers to indicate that the container was automatically stopped" \
  ["1"]="Application error, Container was stopped due to application error or incorrect reference in the image specification" \
  ["125"]="Container failed to run error, The docker run command did not execute successfully" \
  ["126"]="Command invoke error, A command specified in the image specification could not be invoked" \
  ["127"]="File or directory not found, File or directory specified in the image specification was not found" \
  ["128"]="Invalid argument used on exit, Exit was triggered with an invalid exit code (valid codes are integers between 0-255)" \
  ["134"]="Abnormal termination (SIGABRT), The container aborted itself using the abort() function" \
  ["137"]="Immediate termination (SIGKILL), Container was immediately terminated by the operating system via SIGKILL signal" \
  ["139"]="Segmentation fault (SIGSEGV), Container attempted to access memory that was not assigned to it and was terminated" \
  ["143"]="Graceful termination (SIGTERM), Container received warning that it was about to be terminated, then terminated" \
  ["255"]="Exit Status Out Of Range, Container exited, returning an exit code outside the acceptable range, meaning the cause of the error is not known")

# 检查 pod 的状态
check_abnormal_pod() {

  local pod_name=$1
  local namespace=$2

  print_bold "---------Check Pod Events---------"
  # 检查事件
  local events=$(kubectl get events --field-selector involvedObject.name=$pod_name -n $namespace --sort-by='{.metadata.creationTimestamp}')

  if [[ -n $events ]]; then
    print_bold "Pod ${pod_name} 有需要关注的事件:"
    print_red "$events"
  else
    print_bold "Pod ${pod_name} : 未找到重要事件,有可能事件已被覆盖!"
  fi

  print_bold "---------Begin to inspect all container status---------"

  # Pod 状态
  local container_statuses=$(kubectl get pods $pod_name -n $namespace -o json | jq -r '.status.containerStatuses')

  if [[ "$container_statuses" == "null" ]]; then

    print_bold "Pod 未调度,所以容器还未创建!"
    print_bold "---------Inspect all container status finished---------"
    return 0

  fi

  #检查所有 container
  echo "${container_statuses}" | jq -c '.[]' | while IFS= read -r container_status; do

    if [[ -n "$container_status" ]]; then

      local container_name=$(echo "$container_status" | jq -r '.name')
      print_bold "---------Inspect container: $container_name ---------"

      local current_state=$(echo "$container_status" | jq -r '.state')
      local restart_count=$(echo "$container_status" | jq -r '.restartCount')
      local last_state=$(echo "$container_status" | jq -r '.lastState')
      local ready=$(echo "$container_status" | jq -r '.ready')
      local reason=$(echo "$container_status" | jq -r '.lastState.terminated.reason')
      local exit_code=$(echo "$container_status" | jq -r '.lastState.terminated.exitCode')

      if [[ "$(echo "$current_state" | jq -e '.running' 2>/dev/null)" != "null" ]]; then
        print_green "1. Container 状态(state):$current_state"
      else
        print_red "1. Container 状态(state):$current_state"
      fi

      if [[ "$ready" == false ]]; then
        print_red "2. Container Ready 状态(ready):$ready"
      else
        print_green "2. Container Ready 状态(ready):$ready"
      fi
      echo "3. Container 重启次数(restartCount):$restart_count"
      echo "4. Container 前一次状态(lastState):$last_state"
      echo "4.1 Container 退出原因(reason):$reason"
      echo "4.2 Container 退出码(exit code):$exit_code"

      if [[ $exit_code =~ ^[0-9]+$ ]]; then
        echo "4.3 Pod 退出码释义:${EXIT_CODES[$exit_code]}"
      fi
    else
      print_red "未找到 Container 状态信息。"
    fi

    if [ "$ready" == false ]; then
      print_bold "---------Print log---------"
      #打印当前日志
      print_bold "容器:$container_name - 当前启动日志:"
      kubectl logs $pod_name -c $container_name -n $namespace --tail=15

      #打印前一次的日志
      print_bold "容器:$container_name - 前一次启动日志:"
      kubectl logs $pod_name -p -c $container_name -n $namespace --tail=15
    elif [ "$restart_count" ] >0 && [ "$ignore_restart_pod" == false ]; then
      print_bold "---------Print log---------"

      #打印前一次的日志
      print_bold "容器:$container_name - 前一次启动日志:"
      kubectl logs $pod_name -p -c $container_name -n $namespace --tail=15
    fi

    print_bold "---------Inspect finished---------"
  done
  print_bold "---------Inspect all container status finished---------"
}

#根据 label 来检查 pods
check_pods_by_label() {
  local label=$1
  local namespace=$2

  # 获取 Pod
  local pods=$(kubectl get pods -n $namespace -l $label --no-headers)

  # 检查 Pod 是否存在
  if [ -z "$pods" ]; then
    print_red "命名空间[${namespace}]不存在满足 label selector-${label} 的 Pod!"
    return 1
  fi

  check_pods "$pods" "$namespace"
}

#根据 namespace 来检查 pods
check_pods_by_namespace() {

  local namespace=$1

  # 获取 Pod
  local pods=$(kubectl get pods -n $namespace --no-headers)

  # 检查 Pod 是否存在
  if [ -z "$pods" ]; then
    print_red "命名空间[${namespace}]未找到任何 Pod,有可能是命名空间名称不准确或者该命名空间没有创建任何 Pod!"
    return 1
  fi

  check_pods "$pods" "$namespace"
}

#检查单个 Pod
check_single_pod() {
  local pod_name=$1
  local namespace=$2

  # 获取 Pod
  local pods=$(kubectl get pods $pod_name -n $namespace --no-headers)

  # 检查 Pod 是否存在
  if [ -z "$pods" ]; then
    print_red "命名空间[${namespace}]未找到 pod [$pod_name]!"
    return 1
  fi

  check_pods "$pods" "$namespace"

}

#查找所有的 pods
check_pods_by_all_namespace() {

  #获取所有 namespace
  local namespace_names=$(kubectl get ns -o jsonpath='{.items[*].metadata.name}')

  IFS=" "
  for namespace_name in $namespace_names; do
    print_bold "--------------- Check namespace: $namespace_name---------------"
    check_pods_by_namespace "$namespace_name"
    print_bold "--------------- Check namespace: $namespace_name end---------------"
    echo ""
  done
}

# 检查 Pods
check_pods() {

  local pods=$1
  local namespace=$2
  local isNormal=true
  local msg=""

  while IFS=' ' read -r pod_name pod_ready pod_status pod_restart pod_up_time; do
    isNormal=true
    msg=""

    # 如果 Pod 处于 Completed 状态,则跳过
    if [[ "$pod_status" == "Completed" ]]; then
      break
    fi

    if [[ ! $(echo "$pod_ready" | cut -d'/' -f 1) == $(echo "$pod_ready" | cut -d'/' -f 2) ]]; then
      msg+="Pod 中的容器还未就绪:${pod_ready}\n"
      isNormal=false
    fi

    if [[ ! "$pod_status" == "Running" ]]; then
      msg+="Pod 运行状态不正常:${pod_status}\n"
      isNormal=false
    fi

    if [[ "$pod_restart" != "0" ]]; then
      msg+="Pod 发生了多次重启:${pod_restart}\n"
      if [[ ! $ignore_restart_pod == true ]]; then
        isNormal=false
      fi
    fi

    if [[ $isNormal == false ]]; then
      print_bold "------------Check Pod - ${pod_name}------------"
      # 去掉最后两个字符
      print_red "${msg%??}"
      check_abnormal_pod "$pod_name" "$namespace"
      print_bold "------------Check Pod - ${pod_name} end------------"
      read_input
    else
      print_green "$pod_name 运行正常。"
    fi

  done <<<"$pods"
}

# 执行(如果指定了 pod name 以及 namespace
if [[ ! -z $pod_name ]]; then
  if [[ -z $namespace ]]; then
    namespace="default"
  fi
  check_single_pod "$pod_name" "$namespace"
else
  if [[ ! -z $namespace ]]; then
    print_bold "---------------Check namespace: $namespace---------------"
    check_pods_by_namespace $namespace
    print_bold "---------------Check namespace: $namespace end---------------"
  elif [[ $all == true ]]; then
    check_pods_by_all_namespace
  fi
fi

#在kube master 节点执行 

[root@region-k8s-master1 ~]# sh  /tmp/pod-check.sh  --help
/tmp/pod-check.sh: illegal option -- -
无效的选项: -
使用方法: /tmp/pod-check.sh -p <pod_name> -n <namespace> -a  [-d] [-i] [-h]

选项说明:
  -n <namespace>      指定命名空间
  -p <pod name>       检查指定的 pod,如果不指定 namespace 就是 default
  -a                  检查所有 pod
  -d                  开启调试模式
  -i                  忽略重启过的目前运行正常的 Pod
  -h                  打印帮助信息(当前显示)

检查结果:

Kubernetes Pod 异常诊断_命名空间

举报

相关推荐

0 条评论