Segmentation fault on CA certificate rotation
Fil0sOFF opened this issue · comments
RKE version: v1.4.8
Docker version: v24.0.7
Client: Docker Engine - Community
Version: 24.0.7
API version: 1.43
Go version: go1.20.10
Git commit: afdd53b
Built: Thu Oct 26 09:07:41 2023
OS/Arch: linux/amd64
Context: default
Server: Docker Engine - Community
Engine:
Version: 24.0.7
API version: 1.43 (minimum version 1.12)
Go version: go1.20.10
Git commit: 311b9ff
Built: Thu Oct 26 09:07:41 2023
OS/Arch: linux/amd64
Experimental: false
containerd:
Version: 1.6.25
GitCommit: d8f198a4ed8892c764191ef7b3b06d8a2eeb5c7f
runc:
Version: 1.1.10
GitCommit: v1.1.10-0-g18a0cb0
docker-init:
Version: 0.19.0
GitCommit: de40ad0
Type/provider of hosts: VirtualBox
cluster.yml file:
nodes:
- address: 192.168.56.101
hostname_override: rke-01
user: root
role:
- controlplane
- etcd
- worker
port: 22
labels:
app: ingress
- address: 192.168.56.102
hostname_override: rke-02
user: root
role:
- controlplane
- etcd
- worker
port: 22
labels:
app: ingress
- address: 192.168.56.103
hostname_override: rke-03
user: root
role:
- controlplane
- etcd
- worker
port: 22
labels:
app: ingress
ignore_docker_version: true
ssh_key_path: ~/.ssh/id_rsa_rke
ssh_agent_auth: false
cluster_name: rke-sb-01
kubernetes_version: v1.23.16-rancher2-3
services:
etcd:
snapshot: true
creation: 6h
retention: 24h
kube-api:
service_cluster_ip_range: 10.43.0.0/16
service_node_port_range: 30000-32767
pod_security_policy: false
kube-controller:
# CIDR pool used to assign IP addresses to pods in the cluster
cluster_cidr: 10.42.0.0/16
kubelet:
# Base domain for the cluster
cluster_domain: cluster.local
# IP address for the DNS service endpoint
cluster_dns_server: 10.43.0.10
# Fail if swap is on
fail_swap_on: true
# Set max pods to 250 instead of default 110
extra_args:
image-gc-high-threshold: 80
image-gc-low-threshold: 75
max-pods: 250
# Optionally define additional volume binds to a service
extra_binds:
- "/usr/libexec/kubernetes/kubelet-plugins:/usr/libexec/kubernetes/kubelet-plugins"
- "/lib/modules:/lib/modules"
authentication:
strategy: x509
authorization:
mode: rbac
addon_job_timeout: 60
network:
plugin: calico
dns:
provider: coredns
upstreamnameservers:
- 1.1.1.1
- 8.8.8.8
ingress:
provider: nginx
node_selector:
app: ingress
upgrade_strategy:
max_unavailable_worker: 10%
max_unavailable_controlplane: "1"
Steps to Reproduce:
./rke_linux-amd64 up
./rke_linux-amd64 cert rotate --rotate-ca --ignore-docker-version
Results:
The second command crashes with the following panic:
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x60 pc=0x13c39c9]
goroutine 387 [running]:
github.com/rancher/rke/k8s.DeletePods(0x1c23d60, 0x0)
/go/src/github.com/rancher/rke/k8s/pod.go:13 +0x29
github.com/rancher/rke/cluster.RestartClusterPods.func1()
/go/src/github.com/rancher/rke/cluster/cluster.go:1101 +0x17f
golang.org/x/sync/errgroup.(*Group).Go.func1()
/go/pkg/mod/golang.org/x/sync@v0.0.0-20220601150217-0de741cfad7f/errgroup/errgroup.go:75 +0x64
created by golang.org/x/sync/errgroup.(*Group).Go
/go/pkg/mod/golang.org/x/sync@v0.0.0-20220601150217-0de741cfad7f/errgroup/errgroup.go:72 +0xa9
Looks like the fix is as simple as:
diff --git a/cluster/cluster.go b/cluster/cluster.go
index 254af7ca..2883472e 100644
--- a/cluster/cluster.go
+++ b/cluster/cluster.go
@@ -1139,9 +1139,11 @@ func RestartClusterPods(ctx context.Context, kubeCluster *Cluster) error {
errList = append(errList, err)
}
// delete pods
- err = k8s.DeletePods(kubeClient, pods)
- if err != nil {
- errList = append(errList, err)
+ if pods != nil {
+ err = k8s.DeletePods(kubeClient, pods)
+ if err != nil {
+ errList = append(errList, err)
+ }
}
}
return util.ErrList(errList)
I could've created a PR, but I don't know the correct target branch. Should I create two PRs, one for release/v1.4 and one for release/v1.5?
@vardhaman22 Hi, I'm not sure whether you are the right person for this issue, but could you please take a look?
This repository uses an automated workflow to automatically label issues which have not had any activity (commit/comment/label) for 60 days. This helps us manage the community issues better. If the issue is still relevant, please add a comment to the issue so the workflow can remove the label and we know it is still valid. If it is no longer relevant (or possibly fixed in the latest release), the workflow will automatically close the issue in 14 days. Thank you for your contributions.