Preface:
Use a Kubernetes CronJob to back up the etcd cluster automatically and transfer the snapshots over SFTP to a server outside the K8s cluster for storage.
Procedure:
Environment:
| Server role | IP | OS | etcd version |
| --- | --- | --- | --- |
| K8s cluster operation server | 192.168.1.136 | CentOS 7.9 | 3.4.9 |
| Storage server | 192.168.1.105 | CentOS 7.9 | - |
Build the backup image from a Dockerfile:
[root@k8s-master1 ~]# mkdir /software/k8s-yaml/etcd-backup/
[root@k8s-master1 ~]# cd /software/k8s-yaml/etcd-backup/
[root@k8s-master1 etcd-backup]# vim Dockerfile
FROM python:3-alpine
RUN mkdir /root/.ssh  \
    && touch /root/.ssh/config \
    && echo -e "Host *\n\tStrictHostKeyChecking no\n\tUserKnownHostsFile /dev/null\n\tKexAlgorithms +diffie-hellman-group1-sha1\n\tPubkeyAcceptedKeyTypes +ssh-rsa\n\tHostkeyAlgorithms +ssh-rsa" > /root/.ssh/config
RUN apk add -U --no-cache curl lftp ca-certificates openssh \
    && curl -L https://yunwei-software.oss-cn-zhangjiakou.aliyuncs.com/etcdctl -o /usr/local/bin/etcdctl \
    && chmod +x /usr/local/bin/etcdctl

PS: The etcdctl downloaded here matches etcd 3.4.9. If your etcd version is not 3.4.9, you can use ADD to bake your own cluster's etcdctl into the image (a sketch follows below), or adjust the next Dockerfile, which pulls the matching release from GitHub.
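A minimal sketch of the ADD approach, assuming the cluster's etcdctl binary has first been copied into the build context (the source location is up to you):

FROM python:3-alpine
RUN apk add -U --no-cache curl lftp ca-certificates openssh
# ssh client config from the Dockerfile above omitted for brevity;
# "etcdctl" was copied into the build context beforehand, e.g. from /opt/etcd/bin/
ADD etcdctl /usr/local/bin/etcdctl
RUN chmod +x /usr/local/bin/etcdctl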
Dockerfile that pulls etcdctl from GitHub:
FROM python:3-alpine
RUN sed -i 's/dl-cdn.alpinelinux.org/mirrors.aliyun.com/g' /etc/apk/repositories
# set this to the etcd version running in your cluster
ARG ETCD_VERSION=v3.4.9
RUN apk add -U --no-cache curl lftp ca-certificates openssh unzip
RUN mkdir /root/.ssh  \
    && touch /root/.ssh/config \
    && echo -e "Host *\n\tStrictHostKeyChecking no\n\tUserKnownHostsFile /dev/null\n\tKexAlgorithms +diffie-hellman-group1-sha1\n\tPubkeyAcceptedKeyTypes +ssh-rsa\n\tHostkeyAlgorithms +ssh-rsa" > /root/.ssh/config
ADD s3cmd-master.zip /s3cmd-master.zip
RUN unzip /s3cmd-master.zip -d /opt \
    && cd /opt/s3cmd-master \
    && python setup.py install \
    && rm -rf /s3cmd-master.zip
RUN curl -L https://github.com/etcd-io/etcd/releases/download/${ETCD_VERSION}/etcd-${ETCD_VERSION}-linux-amd64.tar.gz -o /opt/etcd-${ETCD_VERSION}-linux-amd64.tar.gz \
    && cd /opt && tar xzf etcd-${ETCD_VERSION}-linux-amd64.tar.gz \
    && mv etcd-${ETCD_VERSION}-linux-amd64/etcdctl /usr/local/bin/etcdctl \
    && rm -rf etcd-${ETCD_VERSION}-linux-amd64*
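To build against a different etcd release, override the build argument (the version shown is only an example):

[root@k8s-master1 etcd-backup]# docker build --build-arg ETCD_VERSION=v3.4.13 -t lws_etcd_backups:v1 .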
Build the image and push it to an image registry (local or cloud, so that other nodes can pull it):
[root@k8s-master1 etcd-backup]# docker build -t lws_etcd_backups:v1 .
[root@k8s-master1 etcd-backup]# docker tag lws_etcd_backups:v1 registry.cn-zhangjiakou.aliyuncs.com/newtime-test/etcd_backups:lws_v1
[root@k8s-master1 etcd-backup]# docker push registry.cn-zhangjiakou.aliyuncs.com/newtime-test/etcd_backups:lws_v1
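A quick smoke test that the etcdctl baked into the image runs (and matches your cluster's version):

[root@k8s-master1 etcd-backup]# docker run --rm lws_etcd_backups:v1 etcdctl version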
Create the ConfigMap:
[root@k8s-master1 etcd-backup]# vim etcd-backup-cm.yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: cron-sftp
  namespace: backups
data:
  entrypoint.sh: |
    #!/bin/sh
    #variables
    sftp_user="ftp01"
    sftp_passwd="Nisec123456"
    sftp_url="sftp://192.168.1.105:22"
    backup_dir=/home/ftp/etcd-backup/$CLUSTER_NAME
    # backup etcd data
    mkdir -p /snapshot
    chmod +x /usr/local/bin/etcdctl
    file=etcd-snapshot-$(date +%Y%m%d-%H%M%S).db
    etcdctl --endpoints $ENDPOINTS \
    --cert=/opt/etcd/ssl/server.pem \
    --key=/opt/etcd/ssl/server-key.pem \
    --cacert=/opt/etcd/ssl/ca.pem \
    snapshot save /snapshot/$file
    # upload etcd snapshot file
    lftp -u $sftp_user,$sftp_passwd $sftp_url<<EOF
    mkdir -p $backup_dir
    cd $backup_dir
    lcd /snapshot
    put $file
    by
    EOF
    # remove the expired snapshot file
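    # note: lftp's "ls" also lists "." and "..": they inflate the count by two but
    # also fill the first two "head" slots below, so $BACKUP_COUNTS real snapshot
    # files are still kept; the rm attempts on "." and ".." fail harmlessly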
    total_num=$(lftp -u $sftp_user,$sftp_passwd $sftp_url -e "ls $backup_dir | wc -l;by")
    if [ $total_num -gt $BACKUP_COUNTS ]; then
      expired_num=$(expr $total_num - $BACKUP_COUNTS)
      expired_files=$(lftp -u $sftp_user,$sftp_passwd $sftp_url -e "ls $backup_dir | head -n $expired_num;by" | awk '{print $NF}')
      for f in $expired_files; do
        to_remove=${backup_dir}/${f}
        echo "start to remove $to_remove"
        lftp -u $sftp_user,$sftp_passwd $sftp_url -e "rm -f $to_remove;by"
      done
    fi
    # remove local etcd snapshot file
    rm -f /snapshot/$file

PS: Adjust the SFTP settings at the top of the script (user, password, URL, backup directory) to match your environment.
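The script can be smoke-tested outside Kubernetes first; a sketch, assuming the script body above has been saved locally as entrypoint.sh and the etcd certificates sit under /opt/etcd/ssl:

docker run --rm --network host \
  -e ENDPOINTS=192.168.1.136:2379 -e ETCDCTL_API=3 \
  -e BACKUP_COUNTS=5 -e CLUSTER_NAME=cluster1 \
  -v /opt/etcd/ssl:/opt/etcd/ssl:ro \
  -v $PWD/entrypoint.sh:/entrypoint.sh:ro \
  lws_etcd_backups:v1 sh /entrypoint.sh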
# Create the cron-sftp ConfigMap
[root@k8s-master1 etcd-backup]# kubectl create ns backups
[root@k8s-master1 etcd-backup]# kubectl apply -f etcd-backup-cm.yaml
[root@k8s-master1 etcd-backup]# kubectl get cm -n backups
NAME               DATA   AGE
cron-sftp          1      6s
kube-root-ca.crt   1      11s
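To confirm the script landed in the ConfigMap intact, print it back:

[root@k8s-master1 etcd-backup]# kubectl describe cm cron-sftp -n backups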
Create the CronJob:
[root@k8s-master1 etcd-backup]# vim etcd-backup-cronjob.yaml
apiVersion: batch/v1beta1  # use batch/v1 on Kubernetes v1.21+; batch/v1beta1 was removed in v1.25
kind: CronJob
metadata:
  name: etcd-backup-sftp
  namespace: backups
spec:
  schedule: "*/5 * * * *"
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: etcd-backup
        spec:
          containers:
          - name: etcd-backup
            image: registry.cn-zhangjiakou.aliyuncs.com/newtime-test/etcd_backups:lws_v1
            imagePullPolicy: IfNotPresent
            workingDir: /
            command: ["sh", "./entrypoint.sh"]
            env:
            - name: ENDPOINTS
              value: "192.168.1.136:2379"
            - name: ETCDCTL_API
              value: "3"
            - name: BACKUP_COUNTS
              value: "5"
            - name: CLUSTER_NAME
              value: "cluster1"
            volumeMounts:
            - mountPath: /entrypoint.sh
              name: configmap-volume
              readOnly: true
              subPath: entrypoint.sh
            - mountPath: /opt/etcd/ssl
              name: etcd-certs
              readOnly: true
            - mountPath: /etc/localtime
              name: lt-config
            - mountPath: /etc/timezone
              name: tz-config
          volumes:
          - name: configmap-volume
            configMap:
              defaultMode: 0777
              name: cron-sftp
          - name: etcd-certs
            hostPath:
              path: /opt/etcd/ssl
          - name: lt-config
            hostPath:
              path: /etc/localtime
          - name: tz-config
            hostPath:
              path: /etc/timezone
          hostNetwork: true
          restartPolicy: OnFailure

PS: With nodeAffinity you can schedule the backup CronJob onto any of the etcd nodes, for example:
affinity:
  nodeAffinity:
    requiredDuringSchedulingIgnoredDuringExecution:
      nodeSelectorTerms:
      - matchExpressions:
        - key: node-role.kubernetes.io/etcd
          operator: Exists
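For the Exists match above to select anything, the etcd nodes must carry that label key; it can be added like so (the node name is an example):

kubectl label node k8s-node1 node-role.kubernetes.io/etcd=true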
I have four nodes in total and have placed the etcd SSL certificates on every one of them, so I did not set nodeAffinity.
# Copy the SSL certificates to all nodes:
[root@k8s-master1 etcd-backup]# scp -r /opt/etcd/ssl/ 192.168.1.139:/opt/etcd/ssl
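With several nodes, a small loop avoids repeating the command (the IP list here is an example; substitute your node addresses):

for node in 192.168.1.137 192.168.1.138 192.168.1.139; do
  scp -r /opt/etcd/ssl/ $node:/opt/etcd/ssl
done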
Apply etcd-backup-cronjob.yaml:
[root@k8s-master1 etcd-backup]# kubectl apply -f etcd-backup-cronjob.yaml
[root@k8s-master1 etcd-backup]# kubectl get cj -n backups
NAME               SCHEDULE      SUSPEND   ACTIVE   LAST SCHEDULE   AGE
etcd-backup-sftp   */5 * * * *   False     0        <none>           7s
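Rather than waiting for the next scheduled run, a job can also be triggered immediately from the CronJob (the job name is arbitrary):

[root@k8s-master1 etcd-backup]# kubectl create job --from=cronjob/etcd-backup-sftp etcd-backup-manual -n backups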
# Five minutes later, check the pods:
[root@k8s-master1 etcd-backup]# kubectl get pods -n backups
NAME                                READY   STATUS      RESTARTS   AGE
etcd-backup-sftp-1677308100-cw4b8   0/1     Completed   0          1m51s
[root@k8s-master1 etcd-backup]# kubectl logs etcd-backup-sftp-1677308100-cw4b8 -n backups
{"level":"info","ts":1677308105.1600003,"caller":"snapshot/v3_snapshot.go:119","msg":"created temporary db file","path":"/snapshot/etcd-snapshot-20230225-145505.db.part"}
{"level":"info","ts":"2023-02-25T14:55:05.191+0800","caller":"clientv3/maintenance.go:200","msg":"opened snapshot stream; downloading"}
{"level":"info","ts":1677308105.1914499,"caller":"snapshot/v3_snapshot.go:127","msg":"fetching snapshot","endpoint":"192.168.1.136:2379"}
{"level":"info","ts":"2023-02-25T14:55:05.872+0800","caller":"clientv3/maintenance.go:208","msg":"completed snapshot read; closing"}
{"level":"info","ts":1677308106.153034,"caller":"snapshot/v3_snapshot.go:142","msg":"fetched snapshot","endpoint":"192.168.1.136:2379","size":"18 MB","took":0.992465311}
{"level":"info","ts":1677308106.1532946,"caller":"snapshot/v3_snapshot.go:152","msg":"saved","path":"/snapshot/etcd-snapshot-20230225-145505.db"}
Snapshot saved at /snapshot/etcd-snapshot-20230225-145505.db
mkdir: Access failed: Failure (/home/ftp/etcd-backup/cluster1)
start to remove /home/ftp/etcd-backup/cluster1/.
start to remove /home/ftp/etcd-backup/cluster1/..
start to remove /home/ftp/etcd-backup/cluster1/etcd-snapshot-20230225-143011.db

PS: In the log above, "mkdir: Access failed" only means the remote directory already exists, and the removals of "." and ".." fail harmlessly; they show up because lftp's ls lists those two entries (see the note in the script), while the oldest real snapshot is still deleted as intended.

Check the backups on the storage server:
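On the storage server, the uploaded snapshots should now be visible under the script's backup_dir (the prompt below is illustrative):

[root@storage-server ~]# ls -lh /home/ftp/etcd-backup/cluster1/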

Since the K8s cluster in our server room has not had any problems so far, and I have not yet had time to test restoring from a snapshot.db file, I will leave that experiment for when time allows.
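For reference, a restore with etcdctl 3.4 generally follows the pattern below. This is only a sketch, untested in this environment; the member name, peer URL, and data directory are placeholders that must match your own cluster:

ETCDCTL_API=3 etcdctl snapshot restore etcd-snapshot-20230225-145505.db \
  --name etcd-1 \
  --initial-cluster etcd-1=https://192.168.1.136:2380 \
  --initial-advertise-peer-urls https://192.168.1.136:2380 \
  --data-dir /var/lib/etcd-restored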
The restore procedure and further references are in the link below:
Reference link