Common kubectl Commands
[root@K8s-ansible ~]#kubectl api-resources
NAME SHORTNAMES APIVERSION NAMESPACED KIND
bindings v1 true Binding
componentstatuses cs v1 false ComponentStatus
configmaps cm v1 true ConfigMap
endpoints ep v1 true Endpoints
events ev v1 true Event
limitranges limits v1 true LimitRange
namespaces ns v1 false Namespace
nodes no v1 false Node
persistentvolumeclaims pvc v1 true PersistentVolumeClaim
persistentvolumes pv v1 false PersistentVolume
pods po v1 true Pod
podtemplates v1 true PodTemplate
replicationcontrollers rc v1 true ReplicationController
resourcequotas quota v1 true ResourceQuota
secrets v1 true Secret
serviceaccounts sa v1 true ServiceAccount
services svc v1 true Service
mutatingwebhookconfigurations admissionregistration.k8s.io/v1 false MutatingWebhookConfiguration
validatingwebhookconfigurations admissionregistration.k8s.io/v1 false ValidatingWebhookConfiguration
customresourcedefinitions crd,crds apiextensions.k8s.io/v1 false CustomResourceDefinition
apiservices apiregistration.k8s.io/v1 false APIService
controllerrevisions apps/v1 true ControllerRevision
daemonsets ds apps/v1 true DaemonSet
deployments deploy apps/v1 true Deployment
replicasets rs apps/v1 true ReplicaSet
statefulsets sts apps/v1 true StatefulSet
tokenreviews authentication.k8s.io/v1 false TokenReview
localsubjectaccessreviews authorization.k8s.io/v1 true LocalSubjectAccessReview
selfsubjectaccessreviews authorization.k8s.io/v1 false SelfSubjectAccessReview
selfsubjectrulesreviews authorization.k8s.io/v1 false SelfSubjectRulesReview
subjectaccessreviews authorization.k8s.io/v1 false SubjectAccessReview
horizontalpodautoscalers hpa autoscaling/v2 true HorizontalPodAutoscaler
cronjobs cj batch/v1 true CronJob
jobs batch/v1 true Job
certificatesigningrequests csr certificates.k8s.io/v1 false CertificateSigningRequest
leases coordination.k8s.io/v1 true Lease
endpointslices discovery.k8s.io/v1 true EndpointSlice
events ev events.k8s.io/v1 true Event
flowschemas flowcontrol.apiserver.k8s.io/v1beta3 false FlowSchema
prioritylevelconfigurations flowcontrol.apiserver.k8s.io/v1beta3 false PriorityLevelConfiguration
ingressclasses networking.k8s.io/v1 false IngressClass
ingresses ing networking.k8s.io/v1 true Ingress
networkpolicies netpol networking.k8s.io/v1 true NetworkPolicy
runtimeclasses node.k8s.io/v1 false RuntimeClass
poddisruptionbudgets pdb policy/v1 true PodDisruptionBudget
clusterrolebindings rbac.authorization.k8s.io/v1 false ClusterRoleBinding
clusterroles rbac.authorization.k8s.io/v1 false ClusterRole
rolebindings rbac.authorization.k8s.io/v1 true RoleBinding
roles rbac.authorization.k8s.io/v1 true Role
priorityclasses pc scheduling.k8s.io/v1 false PriorityClass
csidrivers storage.k8s.io/v1 false CSIDriver
csinodes storage.k8s.io/v1 false CSINode
csistoragecapacities storage.k8s.io/v1 true CSIStorageCapacity
storageclasses sc storage.k8s.io/v1 false StorageClass
volumeattachments storage.k8s.io/v1 false VolumeAttachment
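The NAMESPACED column above shows whether a resource must be queried with -n/--namespace. As a small illustration (standard kubectl flags, not captured from this session), the list can be filtered by API group, scope, or supported verbs:
# kubectl api-resources --api-group=apps --namespaced=true #only namespaced resources in the apps group
# kubectl api-resources --verbs=list,watch -o wide #only resources that support list and watch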
- Common commands
#official kubectl command reference
https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands
# kubectl get service --all-namespaces -o wide
# kubectl get pods --all-namespaces -o wide
# kubectl get nodes -o wide #nodes are cluster-scoped, so no namespace flag is needed
# kubectl get deployment --all-namespaces
# kubectl get deployment -n magedu -o wide #change the output format
# kubectl describe pods mooreyxia-tomcat-app1-deployment -n myserver #show detailed information for a resource
# kubectl create -f tomcat-app1.yaml
# kubectl apply -f tomcat-app1.yaml
# kubectl delete -f tomcat-app1.yaml
# kubectl create -f tomcat-app1.yaml --save-config --record
# kubectl apply -f tomcat-app1.yaml --record #recommended (note: --record is deprecated in newer kubectl releases)
# kubectl exec -it mooreyxia-tomcat-app1-deployment-6bccd8f9c7-g76s5 -n myserver -- bash
# kubectl logs mooreyxia-tomcat-app1-deployment-6bccd8f9c7-g76s5 -n magedu
# kubectl delete pods mooreyxia-tomcat-app1-deployment-6bccd8f9c7-g76s5 -n myserver
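A manifest can also be validated and previewed without touching the cluster; a small sketch with standard kubectl flags, reusing the file name above:
# kubectl apply -f tomcat-app1.yaml --dry-run=client -o yaml #render the object client-side without creating it
# kubectl diff -f tomcat-app1.yaml #show what would change on the live cluster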
- Enable kubectl command completion
[root@K8s-ansible script]#kubectl --help|grep completion
completion Output shell completion code for the specified shell (bash, zsh, fish, or powershell)
#generate the shell completion script
[root@K8s-ansible script]#kubectl completion bash
#load the completion script into the current shell
[root@K8s-ansible script]#source <(kubectl completion bash)
#if completion fails with an error like the following, the bash-completion package needs to be installed
[root@K8s-ansible ~]#kubectl g
-bash: _get_comp_words_by_ref: command not found
-bash: _get_comp_words_by_ref: command not found
#install bash-completion
[root@K8s-ansible ~]#apt install bash-completion -y
[root@K8s-ansible ~]#source /etc/bash_completion
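To make completion persist across logins, the kubectl documentation suggests loading it from ~/.bashrc (the k alias is optional):
# echo 'source <(kubectl completion bash)' >> ~/.bashrc
# echo 'alias k=kubectl' >> ~/.bashrc
# echo 'complete -o default -F __start_kubectl k' >> ~/.bashrc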
Etcd Client Commands and Snapshot-Based Backup and Restore
Notes on the etcd systemd service file
[root@K8s-etcd01 ~]#cat /etc/systemd/system/etcd.service
[Unit]
Description=Etcd Server
After=network.target
After=network-online.target
Wants=network-online.target
Documentation=https://github.com/coreos
[Service]
Type=notify
WorkingDirectory=/var/lib/etcd #data directory
ExecStart=/usr/local/bin/etcd \ #path to the etcd binary
--name=etcd-192.168.11.217 \ #name of this node
--cert-file=/etc/kubernetes/ssl/etcd.pem \
--key-file=/etc/kubernetes/ssl/etcd-key.pem \
--peer-cert-file=/etc/kubernetes/ssl/etcd.pem \
--peer-key-file=/etc/kubernetes/ssl/etcd-key.pem \
--trusted-ca-file=/etc/kubernetes/ssl/ca.pem \
--peer-trusted-ca-file=/etc/kubernetes/ssl/ca.pem \
--initial-advertise-peer-urls=https://192.168.11.217:2380 \ #peer URL advertised to the rest of the cluster
--listen-peer-urls=https://192.168.11.217:2380 \ #listen address for peer (cluster) traffic
--listen-client-urls=https://192.168.11.217:2379,http://127.0.0.1:2379 \ #listen addresses for client traffic
--advertise-client-urls=https://192.168.11.217:2379 \ #client URL advertised to clients
--initial-cluster-token=etcd-cluster-0 \ #token used to bootstrap the cluster; must be identical on every node of a cluster
--initial-cluster=etcd-192.168.11.217=https://192.168.11.217:2380,etcd-192.168.11.218=https://192.168.11.218:2380,etcd-192.168.11.219=https://192.168.11.219:2380 \
--initial-cluster-state=new \ #new when bootstrapping a new cluster, existing when joining an existing one
--data-dir=/var/lib/etcd \ #data directory path
--wal-dir= \
--snapshot-count=50000 \
--auto-compaction-retention=1 \ #auto compaction: retain about 1 hour of revision history; older revisions are compacted automatically on a periodic schedule
--auto-compaction-mode=periodic \ #periodic compaction mode
--max-request-bytes=10485760 \ #request size limit (maximum bytes per request; the default is about 1.5 MiB, and the official recommendation is at most 10 MiB)
--quota-backend-bytes=8589934592 #storage size limit (backend database quota; the default is 2 GB, and values above 8 GB produce a warning at startup)
Restart=always
RestartSec=15
LimitNOFILE=65536
OOMScoreAdjust=-999
[Install]
WantedBy=multi-user.target
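After any edit to this unit file, systemd has to reload it before the change takes effect; a standard sequence (not captured from this session):
# systemctl daemon-reload
# systemctl restart etcd
# systemctl status etcd #confirm the member came back up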
#defragment the storage of every member in the etcd cluster
[root@K8s-etcd01 ~]#/usr/local/bin/etcdctl defrag --cluster --endpoints=https://192.168.11.217:2379 --cacert=/etc/kubernetes/ssl/ca.pem --cert=/etc/kubernetes/ssl/etcd.pem --key=/etc/kubernetes/ssl/etcd-key.pem
Finished defragmenting etcd member[https://192.168.11.219:2379]
Finished defragmenting etcd member[https://192.168.11.218:2379]
Finished defragmenting etcd member[https://192.168.11.217:2379]
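If the backend ever grows past --quota-backend-bytes, etcd raises a NOSPACE alarm and rejects writes; after compaction and defragmentation the alarm must be cleared manually. A sketch reusing the TLS flags above (the alarm subcommands appear in the help output below):
# etcdctl alarm list --endpoints=https://192.168.11.217:2379 --cacert=/etc/kubernetes/ssl/ca.pem --cert=/etc/kubernetes/ssl/etcd.pem --key=/etc/kubernetes/ssl/etcd-key.pem
# etcdctl alarm disarm --endpoints=https://192.168.11.217:2379 --cacert=/etc/kubernetes/ssl/ca.pem --cert=/etc/kubernetes/ssl/etcd.pem --key=/etc/kubernetes/ssl/etcd-key.pem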
Using the etcdctl client
[root@K8s-etcd01 ~]#etcdctl --help
NAME:
etcdctl - A simple command line client for etcd3.
USAGE:
etcdctl [flags]
VERSION:
3.5.5
API VERSION:
3.5
COMMANDS:
alarm disarm Disarms all alarms
alarm list Lists all alarms
auth disable Disables authentication
auth enable Enables authentication
auth status Returns authentication status
check datascale Check the memory usage of holding data for different workloads on a given server endpoint.
check perf Check the performance of the etcd cluster
compaction Compacts the event history in etcd
defrag Defragments the storage of the etcd members with given endpoints
del Removes the specified key or range of keys [key, range_end)
elect Observes and participates in leader election
endpoint hashkv Prints the KV history hash for each endpoint in --endpoints
endpoint health Checks the healthiness of endpoints specified in `--endpoints` flag
endpoint status Prints out the status of endpoints specified in `--endpoints` flag
get Gets the key or a range of keys
help Help about any command
lease grant Creates leases
lease keep-alive Keeps leases alive (renew)
lease list List all active leases
lease revoke Revokes leases
lease timetolive Get lease information
lock Acquires a named lock
make-mirror Makes a mirror at the destination etcd cluster
member add Adds a member into the cluster
member list Lists all members in the cluster
member promote Promotes a non-voting member in the cluster
member remove Removes a member from the cluster
member update Updates a member in the cluster
move-leader Transfers leadership to another etcd cluster member.
put Puts the given key into the store
role add Adds a new role
role delete Deletes a role
role get Gets detailed information of a role
role grant-permission Grants a key to a role
role list Lists all roles
role revoke-permission Revokes a key from a role
snapshot restore Restores an etcd member snapshot to an etcd directory
snapshot save Stores an etcd node backend snapshot to a given file
snapshot status [deprecated] Gets backend snapshot status of a given file
txn Txn processes all the requests in one transaction
user add Adds a new user
user delete Deletes a user
user get Gets detailed information of a user
user grant-role Grants a role to a user
user list Lists all users
user passwd Changes password of user
user revoke-role Revokes a role from a user
version Prints the version of etcdctl
watch Watches events stream on keys or prefixes
OPTIONS:
--cacert="" verify certificates of TLS-enabled secure servers using this CA bundle
--cert="" identify secure client using this TLS certificate file
--command-timeout=5s timeout for short running command (excluding dial timeout)
--debug[=false] enable client-side debug logging
--dial-timeout=2s dial timeout for client connections
-d, --discovery-srv="" domain name to query for SRV records describing cluster endpoints
--discovery-srv-name="" service name to query when using DNS discovery
--endpoints=[127.0.0.1:2379] gRPC endpoints
-h, --help[=false] help for etcdctl
--hex[=false] print byte strings as hex encoded strings
--insecure-discovery[=true] accept insecure SRV records describing cluster endpoints
--insecure-skip-tls-verify[=false] skip server certificate verification (CAUTION: this option should be enabled only for testing purposes)
--insecure-transport[=true] disable transport security for client connections
--keepalive-time=2s keepalive time for client connections
--keepalive-timeout=6s keepalive timeout for client connections
--key="" identify secure client using this TLS key file
--password="" password for authentication (if this option is used, --user option shouldn't include password)
--user="" username[:password] for authentication (prompt if password is not supplied)
-w, --write-out="simple" set the output format (fields, json, protobuf, simple, table)
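Every option above can also be passed through an ETCDCTL_-prefixed environment variable, which keeps the long TLS flags out of each command; a sketch reusing this cluster's paths:
# export ETCDCTL_API=3
# export ETCDCTL_ENDPOINTS=https://192.168.11.217:2379
# export ETCDCTL_CACERT=/etc/kubernetes/ssl/ca.pem
# export ETCDCTL_CERT=/etc/kubernetes/ssl/etcd.pem
# export ETCDCTL_KEY=/etc/kubernetes/ssl/etcd-key.pem
# etcdctl endpoint health #no extra flags needed now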
#check the health of each etcd endpoint
[root@K8s-etcd01 ~]#cat check_etcdcluster.sh
#!/bin/bash
IP="
192.168.11.217
192.168.11.218
192.168.11.219
"
for ip in ${IP}; do
ETCDCTL_API=3 \
/usr/local/bin/etcdctl \
--endpoints=https://${ip}:2379 \
--cacert=/etc/kubernetes/ssl/ca.pem \
--cert=/etc/kubernetes/ssl/etcd.pem \
--key=/etc/kubernetes/ssl/etcd-key.pem endpoint health;
done
[root@K8s-etcd01 ~]#bash check_etcdcluster.sh
https://192.168.11.217:2379 is healthy: successfully committed proposal: took = 28.898693ms
https://192.168.11.218:2379 is healthy: successfully committed proposal: took = 20.830087ms
https://192.168.11.219:2379 is healthy: successfully committed proposal: took = 25.347226ms
#show detailed endpoint status as a table
[root@K8s-etcd01 ~]#cat check_etcdcluster.sh
#!/bin/bash
IP="
192.168.11.217
192.168.11.218
192.168.11.219
"
for ip in ${IP}; do
ETCDCTL_API=3 \
/usr/local/bin/etcdctl \
--write-out=table \
--endpoints=https://${ip}:2379 \
--cacert=/etc/kubernetes/ssl/ca.pem \
--cert=/etc/kubernetes/ssl/etcd.pem \
--key=/etc/kubernetes/ssl/etcd-key.pem endpoint status;
done
#query the data stored in etcd
[root@K8s-etcd01 ~]#ETCDCTL_API=3 etcdctl get / --prefix --keys-only | grep nginx
/calico/resources/v3/projectcalico.org/workloadendpoints/myserver/k8s--noded01.mooreyxia.com-k8s-mooreyxia--nginx--deployment--789dfdcb7b--dlvrh-eth0
/calico/resources/v3/projectcalico.org/workloadendpoints/myserver/k8s--noded02.mooreyxia.com-k8s-mooreyxia--nginx--deployment--789dfdcb7b--vg675-eth0
/calico/resources/v3/projectcalico.org/workloadendpoints/myserver/k8s--noded02.mooreyxia.com-k8s-mooreyxia--nginx--deployment--789dfdcb7b--wbpvl-eth0
/calico/resources/v3/projectcalico.org/workloadendpoints/myserver/k8s--noded03.mooreyxia.com-k8s-mooreyxia--nginx--deployment--789dfdcb7b--rntpm-eth0
/registry/deployments/myserver/mooreyxia-nginx-deployment
/registry/endpointslices/myserver/mooreyxia-nginx-service-cpn6k
/registry/events/myserver/mooreyxia-nginx-deployment-789dfdcb7b-dlvrh.1750771802f5c1b0
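Kubernetes objects are stored under the /registry prefix; the values are mostly binary protobuf, so the keys are usually the readable part. Two more hedged query examples:
# etcdctl get /registry/pods --prefix --keys-only #keys of every pod in the cluster
# etcdctl get /registry/deployments/myserver/mooreyxia-nginx-deployment #dump one object (binary protobuf value)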
#create, read, update, and delete data
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl put /name "tom"
OK
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl get /name
/name
tom
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl put /name "moore"
OK
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl get /name
/name
moore
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl del /name
1
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl get /name
The etcd watch mechanism
Watch keeps observing keys and proactively notifies the client whenever they change. The etcd v3 watch mechanism can watch a single fixed key or a whole range of keys.
#watch a key on etcd node1; the key does not need to exist yet and can be created later:
[root@K8s-etcd01 ~]#ETCDCTL_API=3 /usr/local/bin/etcdctl watch /data
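With the watch running, a put against the same key from a second terminal is pushed to the watcher immediately. An illustration (not captured from this session) of what the watching terminal prints, namely the event type, key, and value:
# second terminal: etcdctl put /data "value1" (prints OK)
# the watching terminal then shows:
# PUT
# /data
# value1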
Backup and restore with the etcd v3 API
- WAL stands for write-ahead log: as the name suggests, a log entry is written before the actual write operation is performed.
#wal: stores the write-ahead logs; its main role is to record the complete history of data changes. In etcd, every modification must be written to the WAL before it is committed.
[root@K8s-etcd01 ~]#ll /var/lib/etcd/member/wal/
total 187512
drwx------ 2 root root 4096 Mar 29 03:16 ./
drwx------ 4 root root 4096 Mar 29 03:16 ../
-rw------- 1 root root 64000000 Mar 29 03:16 0.tmp
-rw------- 1 root root 64000048 Mar 28 10:30 0000000000000000-0000000000000000.wal
-rw------- 1 root root 64000000 Mar 29 03:18 0000000000000001-0000000000014d5d.wal
[root@K8s-etcd01 ~]#file /var/lib/etcd/member/wal/0000000000000001-0000000000014d5d.wal
/var/lib/etcd/member/wal/0000000000000001-0000000000014d5d.wal: data
#the file that holds the data - backups mainly target this file
[root@K8s-etcd01 ~]#file /var/lib/etcd/member/snap/db
/var/lib/etcd/member/snap/db: data
- Backing up data (v3 API)
#with etcd's default cluster replication, backing up the data of a single member is sufficient
[root@K8s-etcd01 ~]#etcdctl --help|grep snapshot
snapshot restore Restores an etcd member snapshot to an etcd directory
snapshot save Stores an etcd node backend snapshot to a given file
snapshot status [deprecated] Gets backend snapshot status of a given file
[root@K8s-etcd01 ~]#etcdctl snapshot save /tmp/backup.db
{"level":"info","ts":"2023-03-29T03:25:58.725Z","caller":"snapshot/v3_snapshot.go:65","msg":"created temporary db file","path":"/tmp/backup.db.part"}
{"level":"info","ts":"2023-03-29T03:25:58.739Z","logger":"client","caller":"v3/maintenance.go:211","msg":"opened snapshot stream; downloading"}
{"level":"info","ts":"2023-03-29T03:25:58.739Z","caller":"snapshot/v3_snapshot.go:73","msg":"fetching snapshot","endpoint":"127.0.0.1:2379"}
{"level":"info","ts":"2023-03-29T03:25:58.861Z","logger":"client","caller":"v3/maintenance.go:219","msg":"completed snapshot read; closing"}
{"level":"info","ts":"2023-03-29T03:25:58.924Z","caller":"snapshot/v3_snapshot.go:88","msg":"fetched snapshot","endpoint":"127.0.0.1:2379","size":"4.3 MB","took":"now"}
{"level":"info","ts":"2023-03-29T03:25:58.925Z","caller":"snapshot/v3_snapshot.go:97","msg":"saved","path":"/tmp/backup.db"}
Snapshot saved at /tmp/backup.db
#backup files should also be copied to a remote host outside the cluster
[root@K8s-etcd01 ~]#ll /tmp/backup.db
-rw------- 1 root root 4263968 Mar 29 03:25 /tmp/backup.db
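A sketch of the recommended off-cluster copy plus an integrity check (the backup host name is hypothetical; snapshot status is deprecated in etcdctl in favor of etcdutl):
# scp /tmp/backup.db root@backup-host:/data/etcd-backups/backup-$(date +%F).db
# etcdutl snapshot status /tmp/backup.db --write-out=table #reports hash, revision, total keys, and size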
- Restoring data (v3 API)
#the directory etcd restores into is created automatically; if it already exists it must be empty
[root@K8s-etcd01 ~]#etcdctl snapshot restore /tmp/backup.db --data-dir=/opt/etcd-testdir1
Deprecated: Use `etcdutl snapshot restore` instead.
2023-03-29T03:33:29Z info snapshot/v3_snapshot.go:248 restoring snapshot {"path": "/tmp/backup.db", "wal-dir": "/opt/etcd-testdir1/member/wal", "data-dir": "/opt/etcd-testdir1", "snap-dir": "/opt/etcd-testdir1/member/snap", "stack": "go.etcd.io/etcd/etcdutl/v3/snapshot.(*v3Manager).Restore\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdutl/snapshot/v3_snapshot.go:254\ngo.etcd.io/etcd/etcdutl/v3/etcdutl.SnapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdutl/etcdutl/snapshot_command.go:147\ngo.etcd.io/etcd/etcdctl/v3/ctlv3/command.snapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/command/snapshot_command.go:129\ngithub.com/spf13/cobra.(*Command).execute\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:856\ngithub.com/spf13/cobra.(*Command).ExecuteC\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:960\ngithub.com/spf13/cobra.(*Command).Execute\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:897\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.Start\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/ctl.go:107\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.MustStart\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/ctl.go:111\nmain.main\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/main.go:59\nruntime.main\n\t/usr/local/google/home/siarkowicz/.gvm/gos/go1.16.15/src/runtime/proc.go:225"}
2023-03-29T03:33:29Z info membership/store.go:141 Trimming membership information from the backend...
2023-03-29T03:33:29Z info membership/cluster.go:421 added member {"cluster-id": "cdf818194e3a8c32", "local-member-id": "0", "added-peer-id": "8e9e05c52164694d", "added-peer-peer-urls": ["http://localhost:2380"]}
2023-03-29T03:33:29Z info snapshot/v3_snapshot.go:269 restored snapshot {"path": "/tmp/backup.db", "wal-dir": "/opt/etcd-testdir1/member/wal", "data-dir": "/opt/etcd-testdir1", "snap-dir": "/opt/etcd-testdir1/member/snap"}
[root@K8s-etcd01 ~]#tree /opt/etcd-testdir1
/opt/etcd-testdir1
└── member
├── snap
│ ├── 0000000000000001-0000000000000001.snap
│ └── db
└── wal
└── 0000000000000000-0000000000000000.wal
3 directories, 3 files
#restoring into a directory that already contains files fails with an error
[root@K8s-etcd01 ~]#mkdir -p /opt/etcd-testdir2/
[root@K8s-etcd01 ~]#touch /opt/etcd-testdir2/test
[root@K8s-etcd01 ~]#ll /opt/etcd-testdir2/test
-rw-r--r-- 1 root root 0 Mar 29 03:35 /opt/etcd-testdir2/test
[root@K8s-etcd01 ~]#etcdctl snapshot restore /tmp/backup.db --data-dir=/opt/etcd-testdir2
Deprecated: Use `etcdutl snapshot restore` instead.
Error: data-dir "/opt/etcd-testdir2" not empty or could not be read
#empty the directory, then reuse it for the restore
[root@K8s-etcd01 ~]#rm -f /opt/etcd-testdir2/test
[root@K8s-etcd01 ~]#etcdctl snapshot restore /tmp/backup.db --data-dir=/opt/etcd-testdir2
Deprecated: Use `etcdutl snapshot restore` instead.
2023-03-29T03:36:19Z info snapshot/v3_snapshot.go:248 restoring snapshot {"path": "/tmp/backup.db", "wal-dir": "/opt/etcd-testdir2/member/wal", "data-dir": "/opt/etcd-testdir2", "snap-dir": "/opt/etcd-testdir2/member/snap", "stack": "go.etcd.io/etcd/etcdutl/v3/snapshot.(*v3Manager).Restore\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdutl/snapshot/v3_snapshot.go:254\ngo.etcd.io/etcd/etcdutl/v3/etcdutl.SnapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdutl/etcdutl/snapshot_command.go:147\ngo.etcd.io/etcd/etcdctl/v3/ctlv3/command.snapshotRestoreCommandFunc\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/command/snapshot_command.go:129\ngithub.com/spf13/cobra.(*Command).execute\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:856\ngithub.com/spf13/cobra.(*Command).ExecuteC\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:960\ngithub.com/spf13/cobra.(*Command).Execute\n\t/usr/local/google/home/siarkowicz/.gvm/pkgsets/go1.16.15/global/pkg/mod/github.com/spf13/cobra@v1.1.3/command.go:897\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.Start\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/ctl.go:107\ngo.etcd.io/etcd/etcdctl/v3/ctlv3.MustStart\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/ctlv3/ctl.go:111\nmain.main\n\t/tmp/etcd-release-3.5.5/etcd/release/etcd/etcdctl/main.go:59\nruntime.main\n\t/usr/local/google/home/siarkowicz/.gvm/gos/go1.16.15/src/runtime/proc.go:225"}
2023-03-29T03:36:19Z info membership/store.go:141 Trimming membership information from the backend...
2023-03-29T03:36:19Z info membership/cluster.go:421 added member {"cluster-id": "cdf818194e3a8c32", "local-member-id": "0", "added-peer-id": "8e9e05c52164694d", "added-peer-peer-urls": ["http://localhost:2380"]}
2023-03-29T03:36:19Z info snapshot/v3_snapshot.go:269 restored snapshot {"path": "/tmp/backup.db", "wal-dir": "/opt/etcd-testdir2/member/wal", "data-dir": "/opt/etcd-testdir2", "snap-dir": "/opt/etcd-testdir2/member/snap"}
[root@K8s-etcd01 ~]#tree /opt/etcd-testdir2/
/opt/etcd-testdir2/
└── member
├── snap
│ ├── 0000000000000001-0000000000000001.snap
│ └── db
└── wal
└── 0000000000000000-0000000000000000.wal
3 directories, 3 files
- Scheduled backups
[root@K8s-etcd01 ~]# mkdir /data/etcd-backup-dir/ -p
[root@K8s-etcd01 ~]# cat script.sh
#!/bin/bash
source /etc/profile
DATE=`date +%Y-%m-%d_%H-%M-%S`
/usr/bin/etcdctl snapshot save /data/etcd-backup-dir/etcd-snapshot-${DATE}.db
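To actually run this on a schedule and keep the backup directory from growing without bound, the script can be registered in cron together with a retention cleanup; a sketch (interval and retention are arbitrary choices, and the script path assumes it lives in /root):
# crontab -e
0 2 * * * /bin/bash /root/script.sh #daily backup at 02:00
0 3 * * * find /data/etcd-backup-dir/ -name 'etcd-snapshot-*.db' -mtime +7 -delete #keep 7 days of snapshots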
Backup and restore with kubeasz
- Inspect the kubeasz backup playbook
[root@K8s-ansible kubeasz]#pwd
/etc/kubeasz
[root@K8s-ansible kubeasz]#cat playbooks/94.backup.yml
# cluster-backup playbook
# read the guide: 'op/cluster_restore.md'
- hosts:
- localhost
tasks:
# step1: find a healthy member in the etcd cluster
- name: set NODE_IPS of the etcd cluster
set_fact: NODE_IPS="{% for host in groups['etcd'] %}{{ host }} {% endfor %}"
- name: get etcd cluster status
shell: 'for ip in {{ NODE_IPS }};do \
ETCDCTL_API=3 {{ base_dir }}/bin/etcdctl \
--endpoints=https://"$ip":2379 \
--cacert={{ cluster_dir }}/ssl/ca.pem \
--cert={{ cluster_dir }}/ssl/etcd.pem \
--key={{ cluster_dir }}/ssl/etcd-key.pem \
endpoint health; \
done'
register: ETCD_CLUSTER_STATUS
ignore_errors: true
- debug: var="ETCD_CLUSTER_STATUS"
- name: get a running ectd node
shell: 'echo -e "{{ ETCD_CLUSTER_STATUS.stdout }}" \
"{{ ETCD_CLUSTER_STATUS.stderr }}" \
|grep "is healthy"|sed -n "1p"|cut -d: -f2|cut -d/ -f3'
register: RUNNING_NODE
- debug: var="RUNNING_NODE.stdout"
- name: get current time
shell: "date +'%Y%m%d%H%M'"
register: timestamp
# step2: backup data to the ansible node
- name: make a backup on the etcd node
shell: "mkdir -p {{ cluster_dir }}/backup && cd {{ cluster_dir }}/backup && \
ETCDCTL_API=3 {{ base_dir }}/bin/etcdctl \
--endpoints=https://{{ RUNNING_NODE.stdout }}:2379 \
--cacert={{ cluster_dir }}/ssl/ca.pem \
--cert={{ cluster_dir }}/ssl/etcd.pem \
--key={{ cluster_dir }}/ssl/etcd-key.pem \
snapshot save snapshot_{{ timestamp.stdout }}.db"
args:
warn: false
- name: update the latest backup
shell: 'cd {{ cluster_dir }}/backup/ && /bin/cp -f snapshot_{{ timestamp.stdout }}.db snapshot.db'
- Backing up the data
#confirm the pods and cluster objects that will be backed up
[root@K8s-ansible kubeasz]#kubectl run net-test1 --image=centos:7.9.2009 sleep 100000000 -n myserver
pod/net-test1 created
[root@K8s-ansible kubeasz]#kubectl run net-test2 --image=centos:7.9.2009 sleep 100000000 -n myserver
pod/net-test2 created
[root@K8s-ansible kubeasz]#kubectl run net-test3 --image=centos:7.9.2009 sleep 100000000 -n myserver
[root@K8s-ansible kubeasz]#kubectl get pod -A
NAMESPACE NAME READY STATUS RESTARTS AGE
myserver net-test1 1/1 Running 0 41s
myserver net-test2 1/1 Running 0 35s
myserver net-test3 1/1 Running 0 30s
#back up the cluster
[root@K8s-ansible kubeasz]#./ezctl backup k8s-cluster1
ansible-playbook -i clusters/k8s-cluster1/hosts -e @clusters/k8s-cluster1/config.yml playbooks/94.backup.yml
2023-03-29 03:54:20 INFO cluster:k8s-cluster1 backup begins in 5s, press any key to abort:
PLAY [localhost] *****************************************************************************************************************************************************************
TASK [Gathering Facts] ***********************************************************************************************************************************************************
ok: [localhost]
TASK [set NODE_IPS of the etcd cluster] ******************************************************************************************************************************************
ok: [localhost]
TASK [get etcd cluster status] ***************************************************************************************************************************************************
changed: [localhost]
TASK [debug] *********************************************************************************************************************************************************************
ok: [localhost] => {
"ETCD_CLUSTER_STATUS": {
"changed": true,
"cmd": "for ip in 192.168.11.217 192.168.11.218 192.168.11.219 ;do ETCDCTL_API=3 /etc/kubeasz/bin/etcdctl --endpoints=https://\"$ip\":2379 --cacert=/etc/kubeasz/clusters/k8s-cluster1/ssl/ca.pem --cert=/etc/kubeasz/clusters/k8s-cluster1/ssl/etcd.pem --key=/etc/kubeasz/clusters/k8s-cluster1/ssl/etcd-key.pem endpoint health; done",
"delta": "0:00:00.655272",
"end": "2023-03-29 03:54:30.559406",
"failed": false,
"rc": 0,
"start": "2023-03-29 03:54:29.904134",
"stderr": "",
"stderr_lines": [],
"stdout": "https://192.168.11.217:2379 is healthy: successfully committed proposal: took = 36.927011ms\nhttps://192.168.11.218:2379 is healthy: successfully committed proposal: took = 26.676368ms\nhttps://192.168.11.219:2379 is healthy: successfully committed proposal: took = 27.100284ms",
"stdout_lines": [
"https://192.168.11.217:2379 is healthy: successfully committed proposal: took = 36.927011ms",
"https://192.168.11.218:2379 is healthy: successfully committed proposal: took = 26.676368ms",
"https://192.168.11.219:2379 is healthy: successfully committed proposal: took = 27.100284ms"
]
}
}
TASK [get a running ectd node] ***************************************************************************************************************************************************
changed: [localhost]
TASK [debug] *********************************************************************************************************************************************************************
ok: [localhost] => {
"RUNNING_NODE.stdout": "192.168.11.217"
}
TASK [get current time] **********************************************************************************************************************************************************
changed: [localhost]
TASK [make a backup on the etcd node] ********************************************************************************************************************************************
changed: [localhost]
TASK [update the latest backup] **************************************************************************************************************************************************
changed: [localhost]
PLAY RECAP ***********************************************************************************************************************************************************************
localhost : ok=9 changed=5 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
#inspect the backup files
[root@K8s-ansible kubeasz]#ll clusters/k8s-cluster1/backup/
total 8560
drwxr-xr-x 2 root root 4096 Mar 29 03:54 ./
drwxr-xr-x 5 root root 4096 Mar 27 12:03 ../
-rw------- 1 root root 4374560 Mar 29 03:54 snapshot.db
-rw------- 1 root root 4374560 Mar 29 03:54 snapshot_202303290354.db
- Inspect the kubeasz restore playbook
#restoring data requires stopping the cluster services
[root@K8s-ansible kubeasz]#cat playbooks/95.restore.yml
# cluster-restore playbook
# read the guide: 'op/cluster_restore.md'
# https://kubernetes.io/docs/tasks/administer-cluster/configure-upgrade-etcd/#restoring-an-etcd-cluster
- hosts: kube_master
tasks:
- name: stopping kube_master services
service: name={{ item }} state=stopped
with_items:
- kube-apiserver
- kube-controller-manager
- kube-scheduler
- hosts:
- kube_master
- kube_node
tasks:
- name: stopping kube_node services
service: name={{ item }} state=stopped
with_items:
- kubelet
- kube-proxy
- hosts: etcd
roles:
- cluster-restore
- hosts: kube_master
tasks:
- name: starting kube_master services
service: name={{ item }} state=started enabled=yes
with_items:
- kube-apiserver
- kube-controller-manager
- kube-scheduler
- hosts:
- kube_master
- kube_node
tasks:
- name: starting kube_node services
service: name={{ item }} state=started enabled=yes
with_items:
- kubelet
- kube-proxy
#cluster-restore
[root@K8s-ansible kubeasz]#cat roles/cluster-restore/tasks/main.yml
- name: stop the etcd service
service: name=etcd state=stopped
- name: remove the etcd data directory
file: name={{ ETCD_DATA_DIR }}/member state=absent
- name: remove the etcd restore directory
file: name={{ cluster_dir }}/backup/etcd-restore state=absent
delegate_to: 127.0.0.1
run_once: true
- name: restore the etcd data
shell: "cd {{ cluster_dir }}/backup && \
ETCDCTL_API=3 {{ base_dir }}/bin/etcdctl snapshot restore snapshot.db \
--data-dir={{ cluster_dir }}/backup/etcd-restore"
delegate_to: 127.0.0.1
run_once: true
- name: distribute the restored files to every etcd node
copy: src={{ cluster_dir }}/backup/etcd-restore/member dest={{ ETCD_DATA_DIR }}
- name: restart the etcd service
service: name=etcd state=restarted
- name: poll until the etcd service is active
shell: "systemctl is-active etcd.service"
register: etcd_status
until: '"active" in etcd_status.stdout'
retries: 8
delay: 8
- Restoring the data
#delete some data first
[root@K8s-ansible kubeasz]#kubectl delete pod net-test2 -n myserver
pod "net-test2" deleted
[root@K8s-ansible kubeasz]#kubectl delete pod net-test1 -n myserver
[root@K8s-ansible kubeasz]#kubectl get pod -A
NAMESPACE NAME READY STATUS RESTARTS AGE
myserver net-test3 1/1 Running 0 10m
#restore the data
#1.To guard against mistakes, back up the current data and copy the clusters/k8s-cluster1/backup/ directory somewhere safe before running the restore
[root@K8s-ansible kubeasz]#./ezctl backup k8s-cluster1
[root@K8s-ansible kubeasz]#ll clusters/k8s-cluster1/backup/
total 12836
drwxr-xr-x 2 root root 4096 Mar 29 04:04 ./
drwxr-xr-x 5 root root 4096 Mar 27 12:03 ../
-rw------- 1 root root 4374560 Mar 29 04:04 snapshot.db
-rw------- 1 root root 4374560 Mar 29 03:54 snapshot_202303290354.db #backup taken before the deletion
-rw------- 1 root root 4374560 Mar 29 04:04 snapshot_202303290404.db #most recent backup
#2.Copy the chosen backup file over snapshot.db
[root@K8s-ansible kubeasz]#cp -rf clusters/k8s-cluster1/backup/snapshot_202303290354.db clusters/k8s-cluster1/backup/snapshot.db
#3.Restore the cluster
[root@K8s-ansible kubeasz]#./ezctl restore k8s-cluster1
ansible-playbook -i clusters/k8s-cluster1/hosts -e @clusters/k8s-cluster1/config.yml playbooks/95.restore.yml
2023-03-29 04:22:11 INFO cluster:k8s-cluster1 restore begins in 5s, press any key to abort:
PLAY [kube_master] ***************************************************************************************************************************************************************
TASK [Gathering Facts] ***********************************************************************************************************************************************************
ok: [192.168.11.211]
ok: [192.168.11.212]
ok: [192.168.11.213]
TASK [stopping kube_master services] *********************************************************************************************************************************************
changed: [192.168.11.213] => (item=kube-apiserver)
changed: [192.168.11.212] => (item=kube-apiserver)
changed: [192.168.11.211] => (item=kube-apiserver)
changed: [192.168.11.212] => (item=kube-controller-manager)
changed: [192.168.11.213] => (item=kube-controller-manager)
changed: [192.168.11.211] => (item=kube-controller-manager)
changed: [192.168.11.213] => (item=kube-scheduler)
changed: [192.168.11.212] => (item=kube-scheduler)
changed: [192.168.11.211] => (item=kube-scheduler)
PLAY [kube_master,kube_node] *****************************************************************************************************************************************************
TASK [Gathering Facts] ***********************************************************************************************************************************************************
ok: [192.168.11.215]
ok: [192.168.11.214]
ok: [192.168.11.216]
TASK [stopping kube_node services] ***********************************************************************************************************************************************
changed: [192.168.11.213] => (item=kubelet)
changed: [192.168.11.212] => (item=kubelet)
changed: [192.168.11.216] => (item=kubelet)
changed: [192.168.11.211] => (item=kubelet)
changed: [192.168.11.214] => (item=kubelet)
changed: [192.168.11.212] => (item=kube-proxy)
changed: [192.168.11.213] => (item=kube-proxy)
changed: [192.168.11.214] => (item=kube-proxy)
changed: [192.168.11.211] => (item=kube-proxy)
changed: [192.168.11.216] => (item=kube-proxy)
changed: [192.168.11.215] => (item=kubelet)
changed: [192.168.11.215] => (item=kube-proxy)
PLAY [etcd] **********************************************************************************************************************************************************************
TASK [Gathering Facts] ***********************************************************************************************************************************************************
ok: [192.168.11.217]
ok: [192.168.11.219]
ok: [192.168.11.218]
TASK [cluster-restore : stop the etcd service] ***************************************************************************************************************
changed: [192.168.11.218]
changed: [192.168.11.217]
changed: [192.168.11.219]
TASK [cluster-restore : remove the etcd data directory] ******************************************************************************************************
changed: [192.168.11.217]
changed: [192.168.11.219]
changed: [192.168.11.218]
TASK [cluster-restore : remove the etcd restore directory] ***************************************************************************************************
ok: [192.168.11.217]
TASK [cluster-restore : restore the etcd data] ***************************************************************************************************************
changed: [192.168.11.217]
TASK [cluster-restore : distribute the restored files to every etcd node] ************************************************************************************
changed: [192.168.11.217]
changed: [192.168.11.219]
changed: [192.168.11.218]
TASK [cluster-restore : restart the etcd service] ************************************************************************************************************
changed: [192.168.11.219]
changed: [192.168.11.218]
changed: [192.168.11.217]
TASK [cluster-restore : poll until the etcd service is active] ***********************************************************************************************
changed: [192.168.11.218]
changed: [192.168.11.219]
changed: [192.168.11.217]
PLAY [kube_master] ***************************************************************************************************************************************************************
TASK [starting kube_master services] *********************************************************************************************************************************************
changed: [192.168.11.211] => (item=kube-apiserver)
changed: [192.168.11.213] => (item=kube-apiserver)
changed: [192.168.11.212] => (item=kube-apiserver)
changed: [192.168.11.211] => (item=kube-controller-manager)
changed: [192.168.11.213] => (item=kube-controller-manager)
changed: [192.168.11.211] => (item=kube-scheduler)
changed: [192.168.11.213] => (item=kube-scheduler)
changed: [192.168.11.212] => (item=kube-controller-manager)
changed: [192.168.11.212] => (item=kube-scheduler)
PLAY [kube_master,kube_node] *****************************************************************************************************************************************************
TASK [starting kube_node services] ***********************************************************************************************************************************************
changed: [192.168.11.213] => (item=kubelet)
changed: [192.168.11.212] => (item=kubelet)
changed: [192.168.11.211] => (item=kubelet)
changed: [192.168.11.214] => (item=kubelet)
changed: [192.168.11.216] => (item=kubelet)
changed: [192.168.11.213] => (item=kube-proxy)
changed: [192.168.11.212] => (item=kube-proxy)
changed: [192.168.11.211] => (item=kube-proxy)
changed: [192.168.11.214] => (item=kube-proxy)
changed: [192.168.11.216] => (item=kube-proxy)
changed: [192.168.11.215] => (item=kubelet)
changed: [192.168.11.215] => (item=kube-proxy)
PLAY RECAP ***********************************************************************************************************************************************************************
192.168.11.211 : ok=5 changed=4 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
192.168.11.212 : ok=5 changed=4 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
192.168.11.213 : ok=5 changed=4 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
192.168.11.214 : ok=3 changed=2 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
192.168.11.215 : ok=3 changed=2 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
192.168.11.216 : ok=3 changed=2 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
192.168.11.217 : ok=8 changed=6 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
192.168.11.218 : ok=6 changed=5 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
192.168.11.219 : ok=6 changed=5 unreachable=0 failed=0 skipped=0 rescued=0 ignored=0
#4.Confirm the restore succeeded
[root@K8s-ansible kubeasz]#kubectl get pod -A
NAMESPACE NAME READY STATUS RESTARTS AGE
myserver net-test1 0/1 ContainerCreating 1 32m
myserver net-test2 0/1 ContainerCreating 1 32m
myserver net-test3 1/1 Running 0 32m
#To bring existing data into a brand-new cluster, an etcd member-add step is needed in addition to the restore playbook; see the kubeasz add-etcd playbook
[root@K8s-ansible kubeasz]#ll playbooks/21.addetcd.yml
-rw-rw-r-- 1 root root 1567 Feb 9 15:00 playbooks/21.addetcd.yml
Summary - ETCD Data Recovery Workflow
- When more than half of the etcd cluster's nodes are down (for example two out of three), the whole cluster is down and the data has to be recovered. The recovery workflow is as follows (a command-level sketch follows the list):
- Recover the server operating systems
- Redeploy the ETCD cluster
- Stop kube-apiserver/controller-manager/scheduler/kubelet/kube-proxy
- Stop the ETCD cluster
- Restore the same backup data on every ETCD node
- Start each node and verify the ETCD cluster
- Start kube-apiserver/controller-manager/scheduler/kubelet/kube-proxy
- Verify the kubernetes master status and pod data
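A condensed, hedged sketch of the per-node restore step from the workflow above, following the upstream etcd disaster-recovery procedure (run on each etcd node with the same backup file, substituting that node's name and IP; the flags mirror this cluster's service file, and the backup path is a placeholder):
systemctl stop etcd
rm -rf /var/lib/etcd
etcdutl snapshot restore /path/to/backup.db \
  --name etcd-192.168.11.217 \
  --initial-cluster etcd-192.168.11.217=https://192.168.11.217:2380,etcd-192.168.11.218=https://192.168.11.218:2380,etcd-192.168.11.219=https://192.168.11.219:2380 \
  --initial-cluster-token etcd-cluster-0 \
  --initial-advertise-peer-urls https://192.168.11.217:2380 \
  --data-dir /var/lib/etcd
systemctl start etcd
etcdctl endpoint health #then verify the cluster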
I'm moore, let's keep up the effort together!!!