问题
- 服务的健康检查(probe)配置为执行脚本(exec)的方式
- 使用docker pause命令的方式,将容器里面的所有进程挂住
- readiness探针此时实际已无法执行成功,但endpoint不会从service里面摘除,客户端请求仍会被转发到该容器,导致调用偶现挂住
原因
- 在这种场景下,exec探针执行返回的是错误(err != nil),而不是探测失败(Failure);k8s不会认为健康检查是失败的,会直接丢弃本次结果,不会记录失败次数,只是不断重试,然后上报event
- 代码的实现
func (pb *prober) probe(probeType probeType, pod *v1.Pod, status v1.PodStatus, container v1.Container, containerID kubecontainer.ContainerID) (results.Result, error) { ... result, output, err := pb.runProbeWithRetries(probeType, probeSpec, pod, status, container, containerID, maxProbeRetries) if err != nil || (result != probe.Success && result != probe.Warning) { // Probe failed in one way or another. if err != nil { klog.V(1).ErrorS(err, "Probe errored", "probeType", probeType, "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name) pb.recordContainerEvent(pod, &container, v1.EventTypeWarning, events.ContainerUnhealthy, "%s probe errored: %v", probeType, err) } else { // result != probe.Success klog.V(1).InfoS("Probe failed", "probeType", probeType, "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name, "probeResult", result, "output", output) pb.recordContainerEvent(pod, &container, v1.EventTypeWarning, events.ContainerUnhealthy, "%s probe failed: %s", probeType, output) } return results.Failure, err } if result == probe.Warning { pb.recordContainerEvent(pod, &container, v1.EventTypeWarning, events.ContainerProbeWarning, "%s probe warning: %s", probeType, output) klog.V(3).InfoS("Probe succeeded with a warning", "probeType", probeType, "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name, "output", output) } else { klog.V(3).InfoS("Probe succeeded", "probeType", probeType, "pod", klog.KObj(pod), "podUID", pod.UID, "containerName", container.Name) } ... } func (w *worker) doProbe() (keepGoing bool) { ... // exec 执行失败,会返回true,继续重试 result, err := w.probeManager.prober.probe(w.probeType, w.pod, status, w.container, w.containerID) if err != nil { // Prober error, throw away the result. 
return true } switch result { case results.Success: ProberResults.With(w.proberResultsSuccessfulMetricLabels).Inc() case results.Failure: ProberResults.With(w.proberResultsFailedMetricLabels).Inc() default: ProberResults.With(w.proberResultsUnknownMetricLabels).Inc() } ... }
解决方式
- 如果是7层的,可以通过接口的方式进行验证,可以使用httpGet的方式进行替代:

  ```yaml
  httpGet:
    path: xxx      # 对应的uri
    scheme: HTTPS  # HTTP、HTTPS
    port: xxx      # 监听的port
    host: xxx      # 可选,默认为pod ip
  ```
- 如果是4层的,可以使用tcpSocket的方式进行替代:

  ```yaml
  tcpSocket:
    port: xxx  # 监听的port
    host: xxx  # 可选参数,默认是pod ip
  ```