79 - Cloud-Native Operating System: kubectl Commands and Snapshot-Based Etcd Data Backup and Restore

Common kubectl Commands

[root@K8s-ansible ~]#kubectl api-resources 
NAME                              SHORTNAMES   APIVERSION                             NAMESPACED   KIND
bindings                                       v1                                     true         Binding
componentstatuses                 cs           v1                                     false        ComponentStatus
configmaps                        cm           v1                                     true         ConfigMap
endpoints                         ep           v1                                     true         Endpoints
events                            ev           v1                                     true         Event
limitranges                       limits       v1                                     true         LimitRange
namespaces                        ns           v1                                     false        Namespace
nodes                             no           v1                                     false        Node
persistentvolumeclaims            pvc          v1                                     true         PersistentVolumeClaim
persistentvolumes                 pv           v1                                     false        PersistentVolume
pods                              po           v1                                     true         Pod
podtemplates                                   v1                                     true         PodTemplate
replicationcontrollers            rc           v1                                     true         ReplicationController
resourcequotas                    quota        v1                                     true         ResourceQuota
secrets                                        v1                                     true         Secret
serviceaccounts                   sa           v1                                     true         ServiceAccount
services                          svc          v1                                     true         Service
mutatingwebhookconfigurations                  admissionregistration.k8s.io/v1        false        MutatingWebhookConfiguration
validatingwebhookconfigurations                admissionregistration.k8s.io/v1        false        ValidatingWebhookConfiguration
customresourcedefinitions         crd,crds     apiextensions.k8s.io/v1                false        CustomResourceDefinition
apiservices                                    apiregistration.k8s.io/v1              false        APIService
controllerrevisions                            apps/v1                                true         ControllerRevision
daemonsets                        ds           apps/v1                                true         DaemonSet
deployments                       deploy       apps/v1                                true         Deployment
replicasets                       rs           apps/v1                                true         ReplicaSet
statefulsets                      sts          apps/v1                                true         StatefulSet
tokenreviews                                   authentication.k8s.io/v1               false        TokenReview
localsubjectaccessreviews                      authorization.k8s.io/v1                true         LocalSubjectAccessReview
selfsubjectaccessreviews                       authorization.k8s.io/v1                false        SelfSubjectAccessReview
selfsubjectrulesreviews                        authorization.k8s.io/v1                false        SelfSubjectRulesReview
subjectaccessreviews                           authorization.k8s.io/v1                false        SubjectAccessReview
horizontalpodautoscalers          hpa          autoscaling/v2                         true         HorizontalPodAutoscaler
cronjobs                          cj           batch/v1                               true         CronJob
jobs                                           batch/v1                               true         Job
certificatesigningrequests        csr          certificates.k8s.io/v1                 false        CertificateSigningRequest
leases                                         coordination.k8s.io/v1                 true         Lease
endpointslices                                 discovery.k8s.io/v1                    true         EndpointSlice
events                            ev           events.k8s.io/v1                       true         Event
flowschemas                                    flowcontrol.apiserver.k8s.io/v1beta3   false        FlowSchema
prioritylevelconfigurations                    flowcontrol.apiserver.k8s.io/v1beta3   false        PriorityLevelConfiguration
ingressclasses                                 networking.k8s.io/v1                   false        IngressClass
ingresses                         ing          networking.k8s.io/v1                   true         Ingress
networkpolicies                   netpol       networking.k8s.io/v1                   true         NetworkPolicy
runtimeclasses                                 node.k8s.io/v1                         false        RuntimeClass
poddisruptionbudgets              pdb          policy/v1                              true         PodDisruptionBudget
clusterrolebindings                            rbac.authorization.k8s.io/v1           false        ClusterRoleBinding
clusterroles                                   rbac.authorization.k8s.io/v1           false        ClusterRole
rolebindings                                   rbac.authorization.k8s.io/v1           true         RoleBinding
roles                                          rbac.authorization.k8s.io/v1           true         Role
priorityclasses                   pc           scheduling.k8s.io/v1                   false        PriorityClass
csidrivers                                     storage.k8s.io/v1                      false        CSIDriver
csinodes                                       storage.k8s.io/v1                      false        CSINode
csistoragecapacities                           storage.k8s.io/v1                      true         CSIStorageCapacity
storageclasses                    sc           storage.k8s.io/v1                      false        StorageClass
volumeattachments                              storage.k8s.io/v1                      false        VolumeAttachment
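
The SHORTNAMES column lists abbreviations that can be used in place of the full resource names, and NAMESPACED shows whether a resource is queried per namespace or cluster-wide. A minimal sketch (the myserver namespace is taken from the examples later in this post):

#Short names and full names are interchangeable
kubectl get po -n myserver          #same as: kubectl get pods -n myserver
kubectl get deploy,svc -n myserver
#Cluster-scoped resources (NAMESPACED=false) take no -n flag
kubectl get no -o wide              #same as: kubectl get nodes -o wide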

  • Common commands

#Official kubectl command reference
https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands

# kubectl get service --all-namespaces -o wide
# kubectl get pods --all-namespaces -o wide
# kubectl get nodes --all-namespaces -o wide
# kubectl get deployment --all-namespaces
# kubectl get deployment -n magedu -o wide #change the output format
# kubectl describe pods mooreyxia-tomcat-app1-deployment -n myserver #show detailed information for a resource
# kubectl create -f tomcat-app1.yaml
# kubectl apply -f tomcat-app1.yaml
# kubectl delete -f tomcat-app1.yaml
# kubectl create -f tomcat-app1.yaml --save-config --record
# kubectl apply -f tomcat-app1.yaml --record #recommended (note: --record is deprecated in newer kubectl releases)
# kubectl exec -it mooreyxia-tomcat-app1-deployment-6bccd8f9c7-g76s5 bash -n myserver
# kubectl logs mooreyxia-tomcat-app1-deployment-6bccd8f9c7-g76s5 -n magedu
# kubectl delete pods mooreyxia-tomcat-app1-deployment-6bccd8f9c7-g76s5 -n myserver

  • Set up kubectl command completion

[root@K8s-ansible script]#kubectl --help|grep completion
  completion      Output shell completion code for the specified shell (bash, zsh, fish, or powershell)
  
#Generate the command-line completion script
[root@K8s-ansible script]#kubectl completion bash
#Load the completion script into the current shell
[root@K8s-ansible script]#source <(kubectl completion bash)

#If tab completion throws an error like the one below, the bash-completion package needs to be installed
[root@K8s-ansible ~]#kubectl g<Tab>
-bash: _get_comp_words_by_ref: command not found
-bash: _get_comp_words_by_ref: command not found
#Install bash-completion
[root@K8s-ansible ~]#apt install bash-completion -y
[root@K8s-ansible ~]#source /etc/bash_completion
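
To make completion persistent across logins, the source line can be added to the shell profile; a minimal sketch (the short alias k is only an assumption for convenience):

[root@K8s-ansible ~]#echo 'source <(kubectl completion bash)' >> ~/.bashrc
#Optional: keep completion working for a short alias
[root@K8s-ansible ~]#echo 'alias k=kubectl' >> ~/.bashrc
[root@K8s-ansible ~]#echo 'complete -o default -F __start_kubectl k' >> ~/.bashrc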

Etcd Client Commands and Snapshot-Based Backup and Restore


Etcd Service File Configuration Explained

Note: the inline # comments in the unit file below are explanatory annotations only; they must not be kept in a real unit file, since systemd does not support trailing comments and they would break the ExecStart line continuations.

[root@K8s-etcd01 ~]#cat /etc/systemd/system/etcd.service
[Unit]
Description=Etcd Server
After=network.target
After=network-online.target
Wants=network-online.target
Documentation=https://github.com/coreos

[Service]
Type=notify
WorkingDirectory=/var/lib/etcd #data directory
ExecStart=/usr/local/bin/etcd \ #path to the etcd binary
  --name=etcd-192.168.11.217 \ #name of this node
  --cert-file=/etc/kubernetes/ssl/etcd.pem \
  --key-file=/etc/kubernetes/ssl/etcd-key.pem \
  --peer-cert-file=/etc/kubernetes/ssl/etcd.pem \
  --peer-key-file=/etc/kubernetes/ssl/etcd-key.pem \
  --trusted-ca-file=/etc/kubernetes/ssl/ca.pem \
  --peer-trusted-ca-file=/etc/kubernetes/ssl/ca.pem \
  --initial-advertise-peer-urls=https://192.168.11.217:2380 \ #peer URL advertised to the rest of the cluster
  --listen-peer-urls=https://192.168.11.217:2380 \ #listen address for peer (cluster) traffic
  --listen-client-urls=https://192.168.11.217:2379,http://127.0.0.1:2379 \ #listen addresses for client traffic
  --advertise-client-urls=https://192.168.11.217:2379 \ #client URL advertised to clients
  --initial-cluster-token=etcd-cluster-0 \ #token used when bootstrapping the cluster; must be identical on all members
  --initial-cluster=etcd-192.168.11.217=https://192.168.11.217:2380,etcd-192.168.11.218=https://192.168.11.218:2380,etcd-192.168.11.219=https://192.168.11.219:2380 \
  --initial-cluster-state=new \ #new when bootstrapping a cluster, existing when joining an already running cluster
  --data-dir=/var/lib/etcd \ #data directory path
  --wal-dir= \
  --snapshot-count=50000 \
  --auto-compaction-retention=1 \ #auto compaction: retain about 1 hour of history; the first compaction runs after 1 hour and then repeats periodically
  --auto-compaction-mode=periodic \ #periodic compaction mode
  --max-request-bytes=10485760 \ #request size limit (by default a request is capped at 1.5 MiB; the official recommendation is at most 10 MiB)
  --quota-backend-bytes=8589934592 #storage size limit (backend disk quota; the default is 2 GB, and values above 8 GB produce a warning at startup)
Restart=always
RestartSec=15
LimitNOFILE=65536
OOMScoreAdjust=-999

[Install]
WantedBy=multi-user.target
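
After changing the unit file, systemd must re-read it and etcd has to be restarted; a minimal sketch:

[root@K8s-etcd01 ~]#systemctl daemon-reload
[root@K8s-etcd01 ~]#systemctl restart etcd && systemctl is-active etcd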

#Defragment the storage of every member in the etcd cluster
[root@K8s-etcd01 ~]#/usr/local/bin/etcdctl defrag --cluster --endpoints=https://192.168.11.217:2379 --cacert=/etc/kubernetes/ssl/ca.pem --cert=/etc/kubernetes/ssl/etcd.pem --key=/etc/kubernetes/ssl/etcd-key.pem
Finished defragmenting etcd member[https://192.168.11.219:2379]
Finished defragmenting etcd member[https://192.168.11.218:2379]
Finished defragmenting etcd member[https://192.168.11.217:2379]

Using the etcdctl Client

[root@K8s-etcd01 ~]#etcdctl --help
NAME:
	etcdctl - A simple command line client for etcd3.

USAGE:
	etcdctl [flags]

VERSION:
	3.5.5

API VERSION:
	3.5


COMMANDS:
	alarm disarm		Disarms all alarms
	alarm list		Lists all alarms
	auth disable		Disables authentication
	auth enable		Enables authentication
	auth status		Returns authentication status
	check datascale		Check the memory usage of holding data for different workloads on a given server endpoint.
	check perf		Check the performance of the etcd cluster
	compaction		Compacts the event history in etcd
	defrag			Defragments the storage of the etcd members with given endpoints
	del			Removes the specified key or range of keys [key, range_end)
	elect			Observes and participates in leader election
	endpoint hashkv		Prints the KV history hash for each endpoint in --endpoints
	endpoint health		Checks the healthiness of endpoints specified in `--endpoints` flag
	endpoint status		Prints out the status of endpoints specified in `--endpoints` flag
	get			Gets the key or a range of keys
	help			Help about any command
	lease grant		Creates leases
	lease keep-alive	Keeps leases alive (renew)
	lease list		List all active leases
	lease revoke		Revokes leases
	lease timetolive	Get lease information
	lock			Acquires a named lock
	make-mirror		Makes a mirror at the destination etcd cluster
	member add		Adds a member into the cluster
	member list		Lists all members in the cluster
	member promote		Promotes a non-voting member in the cluster
	member remove		Removes a member from the cluster
	member update		Updates a member in the cluster
	move-leader		Transfers leadership to another etcd cluster member.
	put			Puts the given key into the store
	role add		Adds a new role
	role delete		Deletes a role
	role get		Gets detailed information of a role
	role grant-permission	Grants a key to a role
	role list		Lists all roles
	role revoke-permission	Revokes a key from a role
	snapshot restore	Restores an etcd member snapshot to an etcd directory
	snapshot save		Stores an etcd node backend snapshot to a given file
	snapshot status		[deprecated] Gets backend snapshot status of a given file
	txn			Txn processes all the requests in one transaction
	user add		Adds a new user
	user delete		Deletes a user
	user get		Gets detailed information of a user
	user grant-role		Grants a role to a user
	user list		Lists all users
	user passwd		Changes password of user
	user revoke-role	Revokes a role from a user
	version			Prints the version of etcdctl
	watch			Watches events stream on keys or prefixes

OPTIONS:
      --cacert=""				verify certificates of TLS-enabled secure servers using this CA bundle
      --cert=""					identify secure client using this TLS certificate file
      --command-timeout=5s			timeout for short running command (excluding dial timeout)
      --debug[=false]				enable client-side debug logging
      --dial-timeout=2s				dial timeout for client connections
  -d, --discovery-srv=""			domain name to query for SRV records describing cluster endpoints
      --discovery-srv-name=""			service name to query when using DNS discovery
      --endpoints=[127.0.0.1:2379]		gRPC endpoints
  -h, --help[=false]				help for etcdctl
      --hex[=false]				print byte strings as hex encoded strings
      --insecure-discovery[=true]		accept insecure SRV records describing cluster endpoints
      --insecure-skip-tls-verify[=false]	skip server certificate verification (CAUTION: this option should be enabled only for testing purposes)
      --insecure-transport[=true]		disable transport security for client connections
      --keepalive-time=2s			keepalive time for client connections
      --keepalive-timeout=6s			keepalive timeout for client connections
      --key=""					identify secure client using this TLS key file
      --password=""				password for authentication (if this option is used, --user option shouldn't include password)
      --user=""					username[:password] for authentication (prompt if password is not supplied)
  -w, --write-out="simple"			set the output format (fields, json, protobuf, simple, table)
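
Every option can also be supplied through an ETCDCTL_-prefixed environment variable, which avoids repeating the TLS flags on every invocation; a minimal sketch using the certificate paths from the service file above:

[root@K8s-etcd01 ~]#export ETCDCTL_API=3
[root@K8s-etcd01 ~]#export ETCDCTL_ENDPOINTS=https://192.168.11.217:2379
[root@K8s-etcd01 ~]#export ETCDCTL_CACERT=/etc/kubernetes/ssl/ca.pem
[root@K8s-etcd01 ~]#export ETCDCTL_CERT=/etc/kubernetes/ssl/etcd.pem
[root@K8s-etcd01 ~]#export ETCDCTL_KEY=/etc/kubernetes/ssl/etcd-key.pem
[root@K8s-etcd01 ~]#etcdctl endpoint health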


#Check the health of the etcd cluster members
[root@K8s-etcd01 ~]#cat check_etcdcluster.sh 
#!/bin/bash

IP="
192.168.11.217
192.168.11.218
192.168.11.219
"
for ip in ${IP}; do
ETCDCTL_API=3 \
/usr/local/bin/etcdctl \
--endpoints=https://${ip}:2379 \
--cacert=/etc/kubernetes/ssl/ca.pem \
--cert=/etc/kubernetes/ssl/etcd.pem \
--key=/etc/kubernetes/ssl/etcd-key.pem endpoint health;
done
[root@K8s-etcd01 ~]#bash check_etcdcluster.sh 
https://192.168.11.217:2379 is healthy: successfully committed proposal: took = 28.898693ms
https://192.168.11.218:2379 is healthy: successfully committed proposal: took = 20.830087ms
https://192.168.11.219:2379 is healthy: successfully committed proposal: took = 25.347226ms

#Show detailed member status in table format
[root@K8s-etcd01 ~]#cat check_etcdcluster.sh 
#!/bin/bash

IP="
192.168.11.217
192.168.11.218
192.168.11.219
"
for ip in ${IP}; do
ETCDCTL_API=3 \
/usr/local/bin/etcdctl \
--write-out=table \
--endpoints=https://${ip}:2379 \
--cacert=/etc/kubernetes/ssl/ca.pem \
--cert=/etc/kubernetes/ssl/etcd.pem \
--key=/etc/kubernetes/ssl/etcd-key.pem endpoint status;
done

(Screenshot: etcdctl endpoint status output rendered as a table for each member.)


#Query the data stored in etcd
[root@K8s-etcd01 ~]#ETCDCTL_API=3 etcdctl get / --prefix --keys-only | grep nginx
/calico/resources/v3/projectcalico.org/workloadendpoints/myserver/k8s--noded01.mooreyxia.com-k8s-mooreyxia--nginx--deployment--789dfdcb7b--dlvrh-eth0
/calico/resources/v3/projectcalico.org/workloadendpoints/myserver/k8s--noded02.mooreyxia.com-k8s-mooreyxia--nginx--deployment--789dfdcb7b--vg675-eth0
/calico/resources/v3/projectcalico.org/workloadendpoints/myserver/k8s--noded02.mooreyxia.com-k8s-mooreyxia--nginx--deployment--789dfdcb7b--wbpvl-eth0
/calico/resources/v3/projectcalico.org/workloadendpoints/myserver/k8s--noded03.mooreyxia.com-k8s-mooreyxia--nginx--deployment--789dfdcb7b--rntpm-eth0
/registry/deployments/myserver/mooreyxia-nginx-deployment
/registry/endpointslices/myserver/mooreyxia-nginx-service-cpn6k
/registry/events/myserver/mooreyxia-nginx-deployment-789dfdcb7b-dlvrh.1750771802f5c1b0

#Create, read, update and delete keys
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl put /name "tom"
OK
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl get /name 
/name
tom
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl put /name "moore"
OK
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl get  /name 
/name
moore
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl del  /name 
1
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl get  /name

The etcd Data Watch Mechanism

Watch keeps observing a key; whenever the key changes, the client is proactively notified. The etcd v3 watch mechanism supports watching a fixed key as well as watching a range of keys.

#Watch a key on etcd node1; the key does not need to exist yet and can be created later:
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl watch /data

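A minimal sketch of what the watcher receives: write the key from a second terminal on the same node, and the terminal running watch prints the change event; watching a range works by prefix.

#In a second terminal on etcd01, write the key
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl put /data "hello"
OK
#The terminal running watch then prints the change event:
PUT
/data
hello
#Watching a range is done by prefix, e.g. every key under /data
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl watch /data --prefix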

Etcd v3 API Data Backup and Restore
  • WAL is short for write-ahead log: before the actual write operation is executed, the change is first recorded in a log, hence "write-ahead".

#wal: holds the write-ahead logs, whose main role is to record the complete history of data changes. In etcd, every data modification must be written to the WAL before it is committed.
[root@K8s-etcd01 ~]#ll /var/lib/etcd/member/wal/
total 187512
drwx------ 2 root root     4096 Mar 29 03:16 ./
drwx------ 4 root root     4096 Mar 29 03:16 ../
-rw------- 1 root root 64000000 Mar 29 03:16 0.tmp
-rw------- 1 root root 64000048 Mar 28 10:30 0000000000000000-0000000000000000.wal
-rw------- 1 root root 64000000 Mar 29 03:18 0000000000000001-0000000000014d5d.wal
[root@K8s-etcd01 ~]#file /var/lib/etcd/member/wal/0000000000000001-0000000000014d5d.wal 
/var/lib/etcd/member/wal/0000000000000001-0000000000014d5d.wal: data

#The file that holds the actual data - snapshot backups mainly target this file
[root@K8s-etcd01 ~]#file /var/lib/etcd/member/snap/db
/var/lib/etcd/member/snap/db: data

  • Backing up data (v3 API)

#With the etcd cluster's default replication, backing up the data from a single member is sufficient
[root@K8s-etcd01 ~]#etcdctl --help|grep snapshot
	snapshot restore	Restores an etcd member snapshot to an etcd directory
	snapshot save		Stores an etcd node backend snapshot to a given file
	snapshot status		[deprecated] Gets backend snapshot status of a given file
[root@K8s-etcd01 ~]#etcdctl snapshot save /tmp/backup.db
{"level":"info","ts":"2023-03-29T03:25:58.725Z","caller":"snapshot/v3_snapshot.go:65","msg":"created temporary db file","path":"/tmp/backup.db.part"}
{"level":"info","ts":"2023-03-29T03:25:58.739Z","logger":"client","caller":"v3/maintenance.go:211","msg":"opened snapshot stream; downloading"}
{"level":"info","ts":"2023-03-29T03:25:58.739Z","caller":"snapshot/v3_snapshot.go:73","msg":"fetching snapshot","endpoint":"127.0.0.1:2379"}
{"level":"info","ts":"2023-03-29T03:25:58.861Z","logger":"client","caller":"v3/maintenance.go:219","msg":"completed snapshot read; closing"}
{"level":"info","ts":"2023-03-29T03:25:58.924Z","caller":"snapshot/v3_snapshot.go:88","msg":"fetched snapshot","endpoint":"127.0.0.1:2379","size":"4.3 MB","took":"now"}
{"level":"info","ts":"2023-03-29T03:25:58.925Z","caller":"snapshot/v3_snapshot.go:97","msg":"saved","path":"/tmp/backup.db"}
Snapshot saved at /tmp/backup.db
#It is recommended to keep a copy of the backup file on a remote host outside the cluster
[root@K8s-etcd01 ~]#ll /tmp/backup.db
-rw------- 1 root root 4263968 Mar 29 03:25 /tmp/backup.db
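
The saved snapshot can be sanity-checked and then copied off the node; a minimal sketch (the backup host 192.168.11.220 and the /backup/ path are assumptions):

#Inspect the snapshot (deprecated in etcdctl 3.5; etcdutl snapshot status is the replacement)
[root@K8s-etcd01 ~]#etcdctl snapshot status /tmp/backup.db --write-out=table
#Copy the snapshot to a host outside the cluster
[root@K8s-etcd01 ~]#scp /tmp/backup.db 192.168.11.220:/backup/backup-$(date +%F).db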

  • Restoring data (v3 API)

#The directory etcd restores into is created automatically; if it already exists it must be empty
[root@K8s-etcd01 ~]#etcdctl snapshot restore /tmp/backup.db --data-dir=/opt/etcd-testdir1
Deprecated: Use `etcdutl snapshot restore` instead.

2023-03-29T03:33:29Z	info	snapshot/v3_snapshot.go:248	restoring snapshot	{"path": "/tmp/backup.db", "wal-dir": "/opt/etcd-testdir1/member/wal", "data-dir": "/opt/etcd-testdir1", "snap-dir": "/opt/etcd-testdir1/member/snap", "stack": "go.etcd.io/etcd/etcdutl/v3/snapshot.(*v3Manager).Restore\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdutl/snapshot/v3_snapshot.go:254\ngo.etcd.io/etcd/etcdutl/v3/etcdutl.SnapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdutl/etcdutl/snapshot_command.go:147\ngo.etcd.io/etcd/etcdctl/v3/ctlv3/command.snapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/command/snapshot_command.go:129\ngithub.com/spf13/cobra.(*Command).execute\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:856\ngithub.com/spf13/cobra.(*Command).ExecuteC\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:960\ngithub.com/spf13/cobra.(*Command).Execute\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:897\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.Start\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/ctl.go:107\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.MustStart\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/ctl.go:111\nmain.main\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/main.go:59\nruntime.main\n\t/usr/local/google/home/siarkowicz/.gvm/gos/go1.16.15/src/runtime/proc.go:225"}
2023-03-29T03:33:29Z	info	membership/store.go:141	Trimming membership information from the backend...
2023-03-29T03:33:29Z	info	membership/cluster.go:421	added member	{"cluster-id": "cdf818194e3a8c32", "local-member-id": "0", "added-peer-id": "8e9e05c52164694d", "added-peer-peer-urls": ["http://localhost:2380"]}
2023-03-29T03:33:29Z	info	snapshot/v3_snapshot.go:269	restored snapshot	{"path": "/tmp/backup.db", "wal-dir": "/opt/etcd-testdir1/member/wal", "data-dir": "/opt/etcd-testdir1", "snap-dir": "/opt/etcd-testdir1/member/snap"}

[root@K8s-etcd01 ~]#tree /opt/etcd-testdir1
/opt/etcd-testdir1
└── member
    ├── snap
    │   ├── 0000000000000001-0000000000000001.snap
    │   └── db
    └── wal
        └── 0000000000000000-0000000000000000.wal

3 directories, 3 files

#Restoring into a directory that already contains files fails with an error
[root@K8s-etcd01 ~]#mkdir -p /opt/etcd-testdir2/
[root@K8s-etcd01 ~]#touch /opt/etcd-testdir2/test
[root@K8s-etcd01 ~]#ll /opt/etcd-testdir2/test
-rw-r--r-- 1 root root 0 Mar 29 03:35 /opt/etcd-testdir2/test
[root@K8s-etcd01 ~]#etcdctl snapshot restore /tmp/backup.db --data-dir=/opt/etcd-testdir2
Deprecated: Use `etcdutl snapshot restore` instead.

Error: data-dir "/opt/etcd-testdir2" not empty or could not be read
#Empty the directory, then use it for the restore
[root@K8s-etcd01 ~]#rm -f /opt/etcd-testdir2/test
[root@K8s-etcd01 ~]#etcdctl snapshot restore /tmp/backup.db --data-dir=/opt/etcd-testdir2
Deprecated: Use `etcdutl snapshot restore` instead.

2023-03-29T03:36:19Z	info	snapshot/v3_snapshot.go:248	restoring snapshot	{"path": "/tmp/backup.db", "wal-dir": "/opt/etcd-testdir2/member/wal", "data-dir": "/opt/etcd-testdir2", "snap-dir": "/opt/etcd-testdir2/member/snap", "stack": "go.etcd.io/etcd/etcdutl/v3/snapshot.(*v3Manager).Restore\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdutl/snapshot/v3_snapshot.go:254\ngo.etcd.io/etcd/etcdutl/v3/etcdutl.SnapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdutl/etcdutl/snapshot_command.go:147\ngo.etcd.io/etcd/etcdctl/v3/ctlv3/command.snapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/command/snapshot_command.go:129\ngithub.com/spf13/cobra.(*Command).execute\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:856\ngithub.com/spf13/cobra.(*Command).ExecuteC\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:960\ngithub.com/spf13/cobra.(*Command).Execute\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:897\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.Start\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/ctl.go:107\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.MustStart\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/ctl.go:111\nmain.main\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/main.go:59\nruntime.main\n\t/usr/local/google/home/siarkowicz/.gvm/gos/go1.16.15/src/runtime/proc.go:225"}
2023-03-29T03:36:19Z	info	membership/store.go:141	Trimming membership information from the backend...
2023-03-29T03:36:19Z	info	membership/cluster.go:421	added member	{"cluster-id": "cdf818194e3a8c32", "local-member-id": "0", "added-peer-id": "8e9e05c52164694d", "added-peer-peer-urls": ["http://localhost:2380"]}
2023-03-29T03:36:19Z	info	snapshot/v3_snapshot.go:269	restored snapshot	{"path": "/tmp/backup.db", "wal-dir": "/opt/etcd-testdir2/member/wal", "data-dir": "/opt/etcd-testdir2", "snap-dir": "/opt/etcd-testdir2/member/snap"}
[root@K8s-etcd01 ~]#tree /opt/etcd-testdir2/
/opt/etcd-testdir2/
└── member
    ├── snap
    │   ├── 0000000000000001-0000000000000001.snap
    │   └── db
    └── wal
        └── 0000000000000000-0000000000000000.wal

3 directories, 3 files

  • Scheduled data backups

[root@K8s-etcd01 ~]# mkdir /data/etcd-backup-dir/ -p
[root@K8s-etcd01 ~]# cat script.sh
#!/bin/bash
source /etc/profile
DATE=`date +%Y-%m-%d_%H-%M-%S`
/usr/bin/etcdctl snapshot save /data/etcd-backup-dir/etcd-snapshot-${DATE}.db
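
To run this on a schedule, the script can be registered in cron; a minimal sketch (daily at 02:00 with 7 days of retention, both values being assumptions; /root/script.sh matches where the script was created above):

#crontab -e entries on the etcd node
0 2 * * * /bin/bash /root/script.sh
0 3 * * * find /data/etcd-backup-dir/ -name 'etcd-snapshot-*.db' -mtime +7 -delete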

Backup and Restore with kubeasz
  • Inspect the kubeasz backup strategy

[root@K8s-ansible kubeasz]#pwd
/etc/kubeasz
[root@K8s-ansible kubeasz]#cat playbooks/94.backup.yml 
# cluster-backup playbook
# read the guide: 'op/cluster_restore.md'

- hosts:
  - localhost
  tasks:
  # step1: find a healthy member in the etcd cluster
  - name: set NODE_IPS of the etcd cluster
    set_fact: NODE_IPS="{% for host in groups['etcd'] %}{{ host }} {% endfor %}"

  - name: get etcd cluster status
    shell: 'for ip in {{ NODE_IPS }};do \
              ETCDCTL_API=3 {{ base_dir }}/bin/etcdctl \
              --endpoints=https://"$ip":2379 \
              --cacert={{ cluster_dir }}/ssl/ca.pem \
              --cert={{ cluster_dir }}/ssl/etcd.pem \
              --key={{ cluster_dir }}/ssl/etcd-key.pem \
              endpoint health; \
            done'
    register: ETCD_CLUSTER_STATUS
    ignore_errors: true

  - debug: var="ETCD_CLUSTER_STATUS"

  - name: get a running ectd node
    shell: 'echo -e "{{ ETCD_CLUSTER_STATUS.stdout }}" \
             "{{ ETCD_CLUSTER_STATUS.stderr }}" \
             |grep "is healthy"|sed -n "1p"|cut -d: -f2|cut -d/ -f3'
    register: RUNNING_NODE

  - debug: var="RUNNING_NODE.stdout"

  - name: get current time
    shell: "date +'%Y%m%d%H%M'"
    register: timestamp

  # step2: backup data to the ansible node 
  - name: make a backup on the etcd node
    shell: "mkdir -p {{ cluster_dir }}/backup && cd {{ cluster_dir }}/backup && \
        ETCDCTL_API=3 {{ base_dir }}/bin/etcdctl \
              --endpoints=https://{{ RUNNING_NODE.stdout }}:2379 \
              --cacert={{ cluster_dir }}/ssl/ca.pem \
              --cert={{ cluster_dir }}/ssl/etcd.pem \
              --key={{ cluster_dir }}/ssl/etcd-key.pem \
        snapshot save snapshot_{{ timestamp.stdout }}.db"
    args:
      warn: false

  - name: update the latest backup
    shell: 'cd {{ cluster_dir }}/backup/ && /bin/cp -f snapshot_{{ timestamp.stdout }}.db snapshot.db'

  • Back up the data

#Confirm the pods and cluster objects that the backup should capture
[root@K8s-ansible kubeasz]#kubectl run net-test1 --image=centos:7.9.2009 sleep 100000000 -n myserver
pod/net-test1 created
[root@K8s-ansible kubeasz]#kubectl run net-test2 --image=centos:7.9.2009 sleep 100000000 -n myserver
pod/net-test2 created
[root@K8s-ansible kubeasz]#kubectl run net-test3 --image=centos:7.9.2009 sleep 100000000 -n myserver

[root@K8s-ansible kubeasz]#kubectl get pod -A
NAMESPACE     NAME                                          READY   STATUS    RESTARTS      AGE
myserver      net-test1                                     1/1     Running   0             41s
myserver      net-test2                                     1/1     Running   0             35s
myserver      net-test3                                     1/1     Running   0             30s

#Back up the cluster
[root@K8s-ansible kubeasz]#./ezctl backup  k8s-cluster1
ansible-playbook -i clusters/k8s-cluster1/hosts -e @clusters/k8s-cluster1/config.yml playbooks/94.backup.yml
2023-03-29 03:54:20 INFO cluster:k8s-cluster1 backup begins in 5s, press any key to abort:


PLAY [localhost] *****************************************************************************************************************************************************************

TASK [Gathering Facts] ***********************************************************************************************************************************************************
ok: [localhost]

TASK [set NODE_IPS of the etcd cluster] ******************************************************************************************************************************************
ok: [localhost]

TASK [get etcd cluster status] ***************************************************************************************************************************************************
changed: [localhost]

TASK [debug] *********************************************************************************************************************************************************************
ok: [localhost] => {
    "ETCD_CLUSTER_STATUS": {
        "changed": true,
        "cmd": "for ip in 192.168.11.217 192.168.11.218 192.168.11.219 ;do ETCDCTL_API=3 /etc/kubeasz/bin/etcdctl --endpoints=https://\"$ip\":2379 --cacert=/etc/kubeasz/clusters/k8s-cluster1/ssl/ca.pem --cert=/etc/kubeasz/clusters/k8s-cluster1/ssl/etcd.pem --key=/etc/kubeasz/clusters/k8s-cluster1/ssl/etcd-key.pem endpoint health; done",
        "delta": "0:00:00.655272",
        "end": "2023-03-29 03:54:30.559406",
        "failed": false,
        "rc": 0,
        "start": "2023-03-29 03:54:29.904134",
        "stderr": "",
        "stderr_lines": [],
        "stdout": "https://192.168.11.217:2379 is healthy: successfully committed proposal: took = 36.927011ms\nhttps://192.168.11.218:2379 is healthy: successfully committed proposal: took = 26.676368ms\nhttps://192.168.11.219:2379 is healthy: successfully committed proposal: took = 27.100284ms",
        "stdout_lines": [
            "https://192.168.11.217:2379 is healthy: successfully committed proposal: took = 36.927011ms",
            "https://192.168.11.218:2379 is healthy: successfully committed proposal: took = 26.676368ms",
            "https://192.168.11.219:2379 is healthy: successfully committed proposal: took = 27.100284ms"
        ]
    }
}

TASK [get a running ectd node] ***************************************************************************************************************************************************
changed: [localhost]

TASK [debug] *********************************************************************************************************************************************************************
ok: [localhost] => {
    "RUNNING_NODE.stdout": "192.168.11.217"
}

TASK [get current time] **********************************************************************************************************************************************************
changed: [localhost]

TASK [make a backup on the etcd node] ********************************************************************************************************************************************
changed: [localhost]

TASK [update the latest backup] **************************************************************************************************************************************************
changed: [localhost]

PLAY RECAP ***********************************************************************************************************************************************************************
localhost                  : ok=9    changed=5    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   

#Check the backup files
[root@K8s-ansible kubeasz]#ll clusters/k8s-cluster1/backup/
total 8560
drwxr-xr-x 2 root root    4096 Mar 29 03:54 ./
drwxr-xr-x 5 root root    4096 Mar 27 12:03 ../
-rw------- 1 root root 4374560 Mar 29 03:54 snapshot.db
-rw------- 1 root root 4374560 Mar 29 03:54 snapshot_202303290354.db

  • Inspect the kubeasz restore strategy

#Restoring data requires stopping the cluster services
[root@K8s-ansible kubeasz]#cat playbooks/95.restore.yml 
# cluster-restore playbook
# read the guide: 'op/cluster_restore.md'
# https://kubernetes.io/docs/tasks/administer-cluster/configure-upgrade-etcd/#restoring-an-etcd-cluster

- hosts: kube_master
  tasks:
  - name: stopping kube_master services
    service: name={{ item }} state=stopped
    with_items:
    - kube-apiserver
    - kube-controller-manager
    - kube-scheduler

- hosts:
  - kube_master
  - kube_node
  tasks:
  - name: stopping kube_node services
    service: name={{ item }} state=stopped
    with_items:
    - kubelet
    - kube-proxy

- hosts: etcd
  roles:
  - cluster-restore

- hosts: kube_master
  tasks:
  - name: starting kube_master services
    service: name={{ item }} state=started enabled=yes
    with_items:
    - kube-apiserver
    - kube-controller-manager
    - kube-scheduler

- hosts:
  - kube_master
  - kube_node
  tasks:
  - name: starting kube_node services
    service: name={{ item }} state=started enabled=yes
    with_items:
    - kubelet
    - kube-proxy

#The cluster-restore role
[root@K8s-ansible kubeasz]#cat roles/cluster-restore/tasks/main.yml 
- name: 停止ectd 服务
  service: name=etcd state=stopped

- name: 清除etcd 数据目录
  file: name={{ ETCD_DATA_DIR }}/member state=absent

- name: 清除 etcd 备份目录
  file: name={{ cluster_dir }}/backup/etcd-restore state=absent
  delegate_to: 127.0.0.1 
  run_once: true

- name: etcd 数据恢复
  shell: "cd {{ cluster_dir }}/backup && \
	ETCDCTL_API=3 {{ base_dir }}/bin/etcdctl snapshot restore snapshot.db \
	 --data-dir={{ cluster_dir }}/backup/etcd-restore"
  delegate_to: 127.0.0.1
  run_once: true

- name: 分发恢复文件到 etcd 各个节点
  copy: src={{ cluster_dir }}/backup/etcd-restore/member dest={{ ETCD_DATA_DIR }}

- name: 重启etcd 服务
  service: name=etcd state=restarted

- name: 以轮询的方式等待服务同步完成
  shell: "systemctl is-active etcd.service"
  register: etcd_status
  until: '"active" in etcd_status.stdout'
  retries: 8
  delay: 8

  • Restore the data

#Delete some data
[root@K8s-ansible kubeasz]#kubectl delete pod net-test2 -n myserver
[root@K8s-ansible kubeasz]#kubectl delete pod net-test1 -n myserver
pod "net-test2" deleted
[root@K8s-ansible kubeasz]#kubectl get pod -A
NAMESPACE     NAME                                          READY   STATUS    RESTARTS      AGE
myserver      net-test3                                     1/1     Running   0             10m

#Restore the data
#1. To guard against mistakes, take a fresh backup of the current data and keep a copy of the clusters/k8s-cluster1/backup/ directory before running the restore
[root@K8s-ansible kubeasz]#./ezctl backup  k8s-cluster1
[root@K8s-ansible kubeasz]#ll clusters/k8s-cluster1/backup/
total 12836
drwxr-xr-x 2 root root    4096 Mar 29 04:04 ./
drwxr-xr-x 5 root root    4096 Mar 27 12:03 ../
-rw------- 1 root root 4374560 Mar 29 04:04 snapshot.db
-rw------- 1 root root 4374560 Mar 29 03:54 snapshot_202303290354.db #backup taken before the deletion
-rw------- 1 root root 4374560 Mar 29 04:04 snapshot_202303290404.db #most recent backup
#2. Copy the chosen backup file over snapshot.db
[root@K8s-ansible kubeasz]#cp -rf clusters/k8s-cluster1/backup/snapshot_202303290354.db clusters/k8s-cluster1/backup/snapshot.db 
#3. Restore the cluster
[root@K8s-ansible kubeasz]#./ezctl restore  k8s-cluster1
ansible-playbook -i clusters/k8s-cluster1/hosts -e @clusters/k8s-cluster1/config.yml playbooks/95.restore.yml
2023-03-29 04:22:11 INFO cluster:k8s-cluster1 restore begins in 5s, press any key to abort:


PLAY [kube_master] ***************************************************************************************************************************************************************

TASK [Gathering Facts] ***********************************************************************************************************************************************************
ok: [192.168.11.211]
ok: [192.168.11.212]
ok: [192.168.11.213]

TASK [stopping kube_master services] *********************************************************************************************************************************************
changed: [192.168.11.213] => (item=kube-apiserver)
changed: [192.168.11.212] => (item=kube-apiserver)
changed: [192.168.11.211] => (item=kube-apiserver)
changed: [192.168.11.212] => (item=kube-controller-manager)
changed: [192.168.11.213] => (item=kube-controller-manager)
changed: [192.168.11.211] => (item=kube-controller-manager)
changed: [192.168.11.213] => (item=kube-scheduler)
changed: [192.168.11.212] => (item=kube-scheduler)
changed: [192.168.11.211] => (item=kube-scheduler)

PLAY [kube_master,kube_node] *****************************************************************************************************************************************************

TASK [Gathering Facts] ***********************************************************************************************************************************************************
ok: [192.168.11.215]
ok: [192.168.11.214]
ok: [192.168.11.216]

TASK [stopping kube_node services] ***********************************************************************************************************************************************
changed: [192.168.11.213] => (item=kubelet)
changed: [192.168.11.212] => (item=kubelet)
changed: [192.168.11.216] => (item=kubelet)
changed: [192.168.11.211] => (item=kubelet)
changed: [192.168.11.214] => (item=kubelet)
changed: [192.168.11.212] => (item=kube-proxy)
changed: [192.168.11.213] => (item=kube-proxy)
changed: [192.168.11.214] => (item=kube-proxy)
changed: [192.168.11.211] => (item=kube-proxy)
changed: [192.168.11.216] => (item=kube-proxy)
changed: [192.168.11.215] => (item=kubelet)
changed: [192.168.11.215] => (item=kube-proxy)

PLAY [etcd] **********************************************************************************************************************************************************************

TASK [Gathering Facts] ***********************************************************************************************************************************************************
ok: [192.168.11.217]
ok: [192.168.11.219]
ok: [192.168.11.218]

TASK [cluster-restore : 停止ectd 服务] ***********************************************************************************************************************************************
changed: [192.168.11.218]
changed: [192.168.11.217]
changed: [192.168.11.219]

TASK [cluster-restore : 清除etcd 数据目录] *********************************************************************************************************************************************
changed: [192.168.11.217]
changed: [192.168.11.219]
changed: [192.168.11.218]

TASK [cluster-restore : 清除 etcd 备份目录] ********************************************************************************************************************************************
ok: [192.168.11.217]

TASK [cluster-restore : etcd 数据恢复] ***********************************************************************************************************************************************
changed: [192.168.11.217]

TASK [cluster-restore : 分发恢复文件到 etcd 各个节点] ***************************************************************************************************************************************
changed: [192.168.11.217]
changed: [192.168.11.219]
changed: [192.168.11.218]

TASK [cluster-restore : 重启etcd 服务] ***********************************************************************************************************************************************
changed: [192.168.11.219]
changed: [192.168.11.218]
changed: [192.168.11.217]

TASK [cluster-restore : 以轮询的方式等待服务同步完成] ******************************************************************************************************************************************
changed: [192.168.11.218]
changed: [192.168.11.219]
changed: [192.168.11.217]

PLAY [kube_master] ***************************************************************************************************************************************************************

TASK [starting kube_master services] *********************************************************************************************************************************************
changed: [192.168.11.211] => (item=kube-apiserver)
changed: [192.168.11.213] => (item=kube-apiserver)
changed: [192.168.11.212] => (item=kube-apiserver)
changed: [192.168.11.211] => (item=kube-controller-manager)
changed: [192.168.11.213] => (item=kube-controller-manager)
changed: [192.168.11.211] => (item=kube-scheduler)
changed: [192.168.11.213] => (item=kube-scheduler)
changed: [192.168.11.212] => (item=kube-controller-manager)
changed: [192.168.11.212] => (item=kube-scheduler)

PLAY [kube_master,kube_node] *****************************************************************************************************************************************************

TASK [starting kube_node services] ***********************************************************************************************************************************************
changed: [192.168.11.213] => (item=kubelet)
changed: [192.168.11.212] => (item=kubelet)
changed: [192.168.11.211] => (item=kubelet)
changed: [192.168.11.214] => (item=kubelet)
changed: [192.168.11.216] => (item=kubelet)
changed: [192.168.11.213] => (item=kube-proxy)
changed: [192.168.11.212] => (item=kube-proxy)
changed: [192.168.11.211] => (item=kube-proxy)
changed: [192.168.11.214] => (item=kube-proxy)
changed: [192.168.11.216] => (item=kube-proxy)
changed: [192.168.11.215] => (item=kubelet)
changed: [192.168.11.215] => (item=kube-proxy)

PLAY RECAP ***********************************************************************************************************************************************************************
192.168.11.211             : ok=5    changed=4    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
192.168.11.212             : ok=5    changed=4    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
192.168.11.213             : ok=5    changed=4    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
192.168.11.214             : ok=3    changed=2    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
192.168.11.215             : ok=3    changed=2    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
192.168.11.216             : ok=3    changed=2    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
192.168.11.217             : ok=8    changed=6    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
192.168.11.218             : ok=6    changed=5    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
192.168.11.219             : ok=6    changed=5    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   

#4. Confirm the restore succeeded
[root@K8s-ansible kubeasz]#kubectl get pod -A
NAMESPACE     NAME                                          READY   STATUS              RESTARTS      AGE
myserver      net-test1                                     0/1     ContainerCreating   1             32m
myserver      net-test2                                     0/1     ContainerCreating   1             32m
myserver      net-test3                                     1/1     Running             0             32m


#To bring existing data into a new cluster, besides the restore strategy there is also an etcd-add step; kubeasz's add-etcd playbook can be used as a reference
[root@K8s-ansible kubeasz]#ll playbooks/21.addetcd.yml 
-rw-rw-r-- 1 root root 1567 Feb  9 15:00 playbooks/21.addetcd.yml

Summary - Etcd Data Recovery Workflow
  • When more than half of the etcd members are down (for example two out of three nodes), the whole cluster is down and the data has to be recovered. The recovery workflow is:
  • Recover the server operating systems
  • Redeploy the etcd cluster
  • Stop kube-apiserver/controller-manager/scheduler/kubelet/kube-proxy
  • Stop the etcd cluster
  • Restore the same backup data on every etcd node (a sketch follows this list)
  • Start every node and verify the etcd cluster
  • Start kube-apiserver/controller-manager/scheduler/kubelet/kube-proxy
  • Verify the Kubernetes master state and pod data
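
A minimal sketch of the per-member restore step, assuming the node names, peer URLs and cluster token from the etcd service file above; the same snapshot is restored on every member with its own --name and peer URL, and etcd is only started again afterwards:

#On etcd-192.168.11.217 (repeat on .218 and .219 with their own name and peer URL)
[root@K8s-etcd01 ~]#systemctl stop etcd && mv /var/lib/etcd /var/lib/etcd.bak
[root@K8s-etcd01 ~]#ETCDCTL_API=3 etcdctl snapshot restore /tmp/backup.db \
  --name=etcd-192.168.11.217 \
  --initial-cluster=etcd-192.168.11.217=https://192.168.11.217:2380,etcd-192.168.11.218=https://192.168.11.218:2380,etcd-192.168.11.219=https://192.168.11.219:2380 \
  --initial-cluster-token=etcd-cluster-0 \
  --initial-advertise-peer-urls=https://192.168.11.217:2380 \
  --data-dir=/var/lib/etcd
[root@K8s-etcd01 ~]#systemctl start etcd && systemctl is-active etcd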


I'm moore, let's keep pushing forward together!!!
