Segmentation fault on CA certificate rotation
Fil0sOFF opened this issue · comments
RKE version: v1.4.8
Docker version: v24.0.7
Client: Docker Engine - Community
Version: 24.0.7
API version: 1.43
Go version: go1.20.10
Git commit: afdd53b
Built: Thu Oct 26 09:07:41 2023
OS/Arch: linux/amd64
Context: default
Server: Docker Engine - Community
Engine:
Version: 24.0.7
API version: 1.43 (minimum version 1.12)
Go version: go1.20.10
Git commit: 311b9ff
Built: Thu Oct 26 09:07:41 2023
OS/Arch: linux/amd64
Experimental: false
containerd:
Version: 1.6.25
GitCommit: d8f198a4ed8892c764191ef7b3b06d8a2eeb5c7f
runc:
Version: 1.1.10
GitCommit: v1.1.10-0-g18a0cb0
docker-init:
Version: 0.19.0
GitCommit: de40ad0
Type/provider of hosts: VirtualBox
cluster.yml file:
nodes:
- address: 192.168.56.101
hostname_override: rke-01
user: root
role:
- controlplane
- etcd
- worker
port: 22
labels:
app: ingress
- address: 192.168.56.102
hostname_override: rke-02
user: root
role:
- controlplane
- etcd
- worker
port: 22
labels:
app: ingress
- address: 192.168.56.103
hostname_override: rke-03
user: root
role:
- controlplane
- etcd
- worker
port: 22
labels:
app: ingress
ignore_docker_version: true
ssh_key_path: ~/.ssh/id_rsa_rke
ssh_agent_auth: false
cluster_name: rke-sb-01
kubernetes_version: v1.23.16-rancher2-3
services:
etcd:
snapshot: true
creation: 6h
retention: 24h
kube-api:
service_cluster_ip_range: 10.43.0.0/16
service_node_port_range: 30000-32767
pod_security_policy: false
kube-controller:
# CIDR pool used to assign IP addresses to pods in the cluster
cluster_cidr: 10.42.0.0/16
kubelet:
# Base domain for the cluster
cluster_domain: cluster.local
# IP address for the DNS service endpoint
cluster_dns_server: 10.43.0.10
# Fail if swap is on
fail_swap_on: true
# Set max pods to 250 instead of default 110
extra_args:
image-gc-high-threshold: 80
image-gc-low-threshold: 75
max-pods: 250
# Optionally define additional volume binds to a service
extra_binds:
- "/usr/libexec/kubernetes/kubelet-plugins:/usr/libexec/kubernetes/kubelet-plugins"
- "/lib/modules:/lib/modules"
authentication:
strategy: x509
authorization:
mode: rbac
addon_job_timeout: 60
network:
plugin: calico
dns:
provider: coredns
upstreamnameservers:
- 1.1.1.1
- 8.8.8.8
ingress:
provider: nginx
node_selector:
app: ingress
upgrade_strategy:
max_unavailable_worker: 10%
max_unavailable_controlplane: "1"
Steps to Reproduce:
./rke_linux-amd64 up
./rke_linux-amd64 cert rotate --rotate-ca --ignore-docker-version
Results:
The second command crashes with the following panic:
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x60 pc=0x13c39c9]
goroutine 387 [running]:
github.com/rancher/rke/k8s.DeletePods(0x1c23d60, 0x0)
/go/src/github.com/rancher/rke/k8s/pod.go:13 +0x29
github.com/rancher/rke/cluster.RestartClusterPods.func1()
/go/src/github.com/rancher/rke/cluster/cluster.go:1101 +0x17f
golang.org/x/sync/errgroup.(*Group).Go.func1()
/go/pkg/mod/golang.org/x/sync@v0.0.0-20220601150217-0de741cfad7f/errgroup/errgroup.go:75 +0x64
created by golang.org/x/sync/errgroup.(*Group).Go
/go/pkg/mod/golang.org/x/sync@v0.0.0-20220601150217-0de741cfad7f/errgroup/errgroup.go:72 +0xa9
Looks like the fix is as simple as:
diff --git a/cluster/cluster.go b/cluster/cluster.go
index 254af7ca..2883472e 100644
--- a/cluster/cluster.go
+++ b/cluster/cluster.go
@@ -1139,9 +1139,11 @@ func RestartClusterPods(ctx context.Context, kubeCluster *Cluster) error {
errList = append(errList, err)
}
// delete pods
- err = k8s.DeletePods(kubeClient, pods)
- if err != nil {
- errList = append(errList, err)
+ if pods != nil {
+ err = k8s.DeletePods(kubeClient, pods)
+ if err != nil {
+ errList = append(errList, err)
+ }
}
}
return util.ErrList(errList)
I could've created a PR, but I don't know the correct target branch. Should I create two PRs, one for release/v1.4 and one for release/v1.5?
@vardhaman22 Hi, I'm not sure whether you are the right person for this issue, but could you please take a look?
This repository uses an automated workflow to automatically label issues which have not had any activity (commit/comment/label) for 60 days. This helps us manage the community issues better. If the issue is still relevant, please add a comment to the issue so the workflow can remove the label and we know it is still valid. If it is no longer relevant (or possibly fixed in the latest release), the workflow will automatically close the issue in 14 days. Thank you for your contributions.