From 3a6d43f78ac16df543543b3224cac89734424cbd Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Mon, 6 Oct 2025 14:06:46 +0200 Subject: [PATCH 01/27] OCPEDGE-2037: Add certificate authority controller --- pkg/components/certificateauthority.go | 167 +++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 pkg/components/certificateauthority.go diff --git a/pkg/components/certificateauthority.go b/pkg/components/certificateauthority.go new file mode 100644 index 0000000000..da8cab39f7 --- /dev/null +++ b/pkg/components/certificateauthority.go @@ -0,0 +1,167 @@ +package components + +import ( + "context" + "fmt" + "os" + "path/filepath" + + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + + "github.com/openshift/microshift/pkg/config" + "github.com/openshift/microshift/pkg/util/cryptomaterial" +) + +const ( + // Resource namespace + caResourceNamespace = "kube-system" +) + +var ( + CertificateAuthorityResources = []struct { + Name string + Dir string + }{ + {Name: "kube-control-plane-signer", Dir: cryptomaterial.KubeControlPlaneSignerCertDir(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "kube-apiserver-to-kubelet-signer", Dir: cryptomaterial.KubeAPIServerToKubeletSignerCertDir(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "admin-kubeconfig-signer", Dir: cryptomaterial.AdminKubeconfigSignerDir(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "kubelet-signer", Dir: cryptomaterial.KubeletCSRSignerSignerCertDir(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "kube-csr-signer", Dir: cryptomaterial.CSRSignerCertDir(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "aggregator-signer", Dir: cryptomaterial.AggregatorSignerDir(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "service-ca", Dir: cryptomaterial.ServiceCADir(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "ingress-ca", Dir: cryptomaterial.IngressCADir(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "kube-apiserver-external-signer", Dir: cryptomaterial.KubeAPIServerExternalSigner(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "kube-apiserver-localhost-signer", Dir: cryptomaterial.KubeAPIServerLocalhostSigner(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "kube-apiserver-service-network-signer", Dir: cryptomaterial.KubeAPIServerServiceNetworkSigner(cryptomaterial.CertsDirectory(config.DataDir))}, + {Name: "etcd-signer", Dir: cryptomaterial.EtcdSignerDir(cryptomaterial.CertsDirectory(config.DataDir))}, + } +) + +func startCertificateAuthorityController(ctx context.Context, kubeconfigPath string) error { + client, err := getKubernetesClient(kubeconfigPath) + if err != nil { + return fmt.Errorf("failed to get Kubernetes client: %w", err) + } + + resourceNames := make([]string, len(CertificateAuthorityResources)) + for i, resource := range CertificateAuthorityResources { + resourceNames[i] = resource.Name + if err := exposeCertificateAuthority(ctx, client, resource.Dir, resource.Name); err != nil { + return fmt.Errorf("failed to expose certificate authority %s: %w", resource.Name, err) + } + } + + err = createClusterRole(ctx, client, resourceNames) + if err != nil { + return fmt.Errorf("failed to create etcd CA admin Role: %w", err) + } + return nil +} + +func getKubernetesClient(kubeconfigPath string) (*kubernetes.Clientset, error) { + restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, err + } + return kubernetes.NewForConfig(restConfig) +} + +func createClusterRole(ctx context.Context, client kubernetes.Interface, resourceNames []string) error { + role := &rbacv1.Role{ + ObjectMeta: metav1.ObjectMeta{ + Name: "microshift-ca-admin", + Namespace: caResourceNamespace, + }, + Rules: []rbacv1.PolicyRule{ + { + APIGroups: []string{""}, + Resources: []string{"secrets"}, + ResourceNames: resourceNames, + Verbs: []string{"*"}, + }, + }, + } + + _, err := client.RbacV1().Roles(caResourceNamespace).Create(ctx, role, metav1.CreateOptions{}) + if err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create etcd CA admin Role: %w", err) + } + + roleBinding := &rbacv1.RoleBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "microshift-ca-admin-binding", + Namespace: caResourceNamespace, + }, + Subjects: []rbacv1.Subject{ + { + Kind: "Group", + Name: "system:masters", + APIGroup: rbacv1.GroupName, + }, + }, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.GroupName, + Kind: "Role", + Name: "microshift-ca-admin", + }, + } + + _, err = client.RbacV1().RoleBindings(caResourceNamespace).Create(ctx, roleBinding, metav1.CreateOptions{}) + if err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create etcd CA admin RoleBinding: %w", err) + } + + return nil +} + +func exposeCertificateAuthority(ctx context.Context, client kubernetes.Interface, dir, name string) error { + caCertPath := cryptomaterial.CACertPath(dir) + caKeyPath := cryptomaterial.CAKeyPath(dir) + serialPath := filepath.Join(dir, "serial.txt") + + caCert, err := os.ReadFile(caCertPath) + if err != nil { + return fmt.Errorf("failed to read CA certificate from %s: %w", caCertPath, err) + } + caKey, err := os.ReadFile(caKeyPath) + if err != nil { + return fmt.Errorf("failed to read CA key from %s: %w", caKeyPath, err) + } + serial, err := os.ReadFile(serialPath) + if err != nil { + return fmt.Errorf("failed to read CA serial: %w", err) + } + + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: caResourceNamespace, + }, + Type: corev1.SecretTypeOpaque, + Data: map[string][]byte{ + "ca.crt": caCert, + "ca.key": caKey, + "serial.txt": serial, + }, + } + + caBundlePath := cryptomaterial.CABundlePath(dir) + if _, err := os.Stat(caBundlePath); err == nil { + caBundle, err := os.ReadFile(caBundlePath) + if err != nil { + return fmt.Errorf("failed to read CA bundle from %s: %w", caBundlePath, err) + } + secret.Data["ca-bundle.crt"] = caBundle + } + + _, err = client.CoreV1().Secrets(caResourceNamespace).Create(ctx, secret, metav1.CreateOptions{}) + if err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create CA secret %s: %w", name, err) + } + return nil +} From dbf5715865bfaf4c8c8a4ddad6aef6abbf4f59fd Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Mon, 6 Oct 2025 14:40:59 +0200 Subject: [PATCH 02/27] OCPEDGE-2037: Add kubelet bootstrap functions for config handling --- pkg/config/kubeconfig.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pkg/config/kubeconfig.go b/pkg/config/kubeconfig.go index bd0ec842b2..2d18b9de08 100644 --- a/pkg/config/kubeconfig.go +++ b/pkg/config/kubeconfig.go @@ -1,6 +1,9 @@ package config -import "path/filepath" +import ( + "os" + "path/filepath" +) // KubeConfigID identifies the different kubeconfigs managed in the DataDir type KubeConfigID string @@ -27,3 +30,12 @@ func (cfg *Config) KubeConfigAdminPath(id string) string { func (cfg *Config) KubeConfigRootAdminPath() string { return filepath.Join(DataDir, "resources", string(KubeAdmin)) } + +func (cfg *Config) BootstrapKubeConfigPath() string { + return filepath.Join(DataDir, "bootstrap", "kubeconfig") +} + +func (cfg *Config) BootstrapKubeConfigExists() bool { + _, err := os.Stat(cfg.BootstrapKubeConfigPath()) + return err == nil +} From d8d676cc4efcdb73613d56848dfdda786f8e245e Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Mon, 6 Oct 2025 14:48:57 +0200 Subject: [PATCH 03/27] OCPEDGE-2037: Configure kubelet to use bootstrap kubeconfig if present --- pkg/node/kubelet.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pkg/node/kubelet.go b/pkg/node/kubelet.go index d5d67b2d38..de6f19d862 100644 --- a/pkg/node/kubelet.go +++ b/pkg/node/kubelet.go @@ -77,6 +77,9 @@ func (s *KubeletServer) configure(cfg *config.Config) { } kubeletFlags := kubeletoptions.NewKubeletFlags() kubeletFlags.BootstrapKubeconfig = cfg.KubeConfigPath(config.Kubelet) + if cfg.BootstrapKubeConfigExists() { + kubeletFlags.BootstrapKubeconfig = cfg.BootstrapKubeConfigPath() + } kubeletFlags.KubeConfig = cfg.KubeConfigPath(config.Kubelet) kubeletFlags.RuntimeCgroups = "/system.slice/crio.service" kubeletFlags.HostnameOverride = cfg.Node.HostnameOverride From 9297227d3f4bd7c23e3966da38bbede2377b77c3 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Mon, 6 Oct 2025 14:56:09 +0200 Subject: [PATCH 04/27] OCPEDGE-2037: Clean up kubelet config assets --- assets/core/kubelet.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/assets/core/kubelet.yaml b/assets/core/kubelet.yaml index 47f4102c1f..60298cd0d3 100644 --- a/assets/core/kubelet.yaml +++ b/assets/core/kubelet.yaml @@ -18,14 +18,14 @@ featureGates: APIPriorityAndFairness: true DownwardAPIHugePages: true PodSecurity: true - RotateKubeletServerCertificate: false # TODO + RotateKubeletServerCertificate: false kubeAPIBurst: 100 kubeAPIQPS: 50 maxPods: 250 nodeStatusReportFrequency: 5m -rotateCertificates: false # TODO +rotateCertificates: false serializeImagePulls: false -serverTLSBootstrap: false # TODO +serverTLSBootstrap: false tlsCertFile: "{{ .tlsCertFile }}" tlsCipherSuites: [{{ .tlsCipherSuites }}] tlsMinVersion: "{{ .tlsMinVersion }}" From b643256b1a1f235dd8aa737b311a9fb97da24883 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Mon, 6 Oct 2025 15:13:56 +0200 Subject: [PATCH 05/27] OCPEDGE-2037: Add node ip address to etcd certificates --- pkg/cmd/init.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/init.go b/pkg/cmd/init.go index 798dfdf2e6..2bb339f19b 100644 --- a/pkg/cmd/init.go +++ b/pkg/cmd/init.go @@ -324,7 +324,7 @@ func certSetup(cfg *config.Config) (*certchains.CertificateChains, error) { Validity: cryptomaterial.LongLivedCertificateValidity, }, UserInfo: &user.DefaultInfo{Name: "system:etcd-peer:etcd-client", Groups: []string{"system:etcd-peers"}}, - Hostnames: []string{"localhost", cfg.Node.HostnameOverride}, + Hostnames: []string{"localhost", cfg.Node.HostnameOverride, cfg.Node.NodeIP}, }, &certchains.PeerCertificateSigningRequestInfo{ CSRMeta: certchains.CSRMeta{ @@ -332,7 +332,7 @@ func certSetup(cfg *config.Config) (*certchains.CertificateChains, error) { Validity: cryptomaterial.LongLivedCertificateValidity, }, UserInfo: &user.DefaultInfo{Name: "system:etcd-server:etcd-client", Groups: []string{"system:etcd-servers"}}, - Hostnames: []string{"localhost", cfg.Node.HostnameOverride}, + Hostnames: []string{"localhost", cfg.Node.HostnameOverride, cfg.Node.NodeIP}, }, ), ).WithCABundle( From a22067ff0fce852ed613b8a9010f9bd3875060c9 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 09:15:00 +0200 Subject: [PATCH 06/27] OCPEDGE-2037: Start CA controller --- pkg/components/components.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/components/components.go b/pkg/components/components.go index ed1ca8cb09..665ce80424 100755 --- a/pkg/components/components.go +++ b/pkg/components/components.go @@ -44,5 +44,10 @@ func StartComponents(cfg *config.Config, ctx context.Context) error { return err } + if err := startCertificateAuthorityController(ctx, kubeAdminConfig); err != nil { + klog.Warningf("Failed to start certificate authority controller: %v", err) + return err + } + return nil } From 0b5bac1edce486caeca249b6c7efdddf1cd874ee Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 09:29:19 +0200 Subject: [PATCH 07/27] OCPEDGE-2037: Update etcd serving configuration --- etcd/cmd/microshift-etcd/run.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/etcd/cmd/microshift-etcd/run.go b/etcd/cmd/microshift-etcd/run.go index c0a1204a19..25f9f61321 100644 --- a/etcd/cmd/microshift-etcd/run.go +++ b/etcd/cmd/microshift-etcd/run.go @@ -75,20 +75,17 @@ func (s *EtcdService) configure(cfg *config.Config) { // based on https://github.com/openshift/cluster-etcd-operator/blob/master/bindata/bootkube/bootstrap-manifests/etcd-member-pod.yaml#L19 s.etcdCfg = etcd.NewConfig() s.etcdCfg.ClusterState = "new" - //s.etcdCfg.ForceNewCluster = true //TODO s.etcdCfg.Logger = "zap" s.etcdCfg.Dir = dataDir s.etcdCfg.QuotaBackendBytes = cfg.Etcd.QuotaBackendBytes - url2380 := setURL([]string{"localhost"}, "2380") - url2379 := setURL([]string{"localhost"}, "2379") - s.etcdCfg.AdvertisePeerUrls = url2380 - s.etcdCfg.ListenPeerUrls = url2380 - s.etcdCfg.AdvertiseClientUrls = url2379 - s.etcdCfg.ListenClientUrls = url2379 - s.etcdCfg.ListenMetricsUrls = setURL([]string{"localhost"}, "2381") + s.etcdCfg.AdvertisePeerUrls = setURL([]string{cfg.Node.NodeIP}, "2380") + s.etcdCfg.ListenPeerUrls = setURL([]string{"0.0.0.0"}, "2380") + s.etcdCfg.AdvertiseClientUrls = setURL([]string{cfg.Node.NodeIP}, "2379") + s.etcdCfg.ListenClientUrls = setURL([]string{"0.0.0.0"}, "2379") + s.etcdCfg.ListenMetricsUrls = setURL([]string{cfg.Node.NodeIP}, "2381") s.etcdCfg.Name = cfg.Node.HostnameOverride - s.etcdCfg.InitialCluster = fmt.Sprintf("%s=https://%s:2380", cfg.Node.HostnameOverride, "localhost") + s.etcdCfg.InitialCluster = fmt.Sprintf("%s=https://%s:2380", cfg.Node.HostnameOverride, cfg.Node.NodeIP) s.etcdCfg.TlsMinVersion = getTLSMinVersion(cfg.ApiServer.TLS.MinVersion) if cfg.ApiServer.TLS.MinVersion != string(configv1.VersionTLS13) { From 9ba4405da930f445d25495f493c8f9d4c9e7585d Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 09:55:48 +0200 Subject: [PATCH 08/27] OCPEDGE-2037: Add etcd config file --- etcd/cmd/microshift-etcd/run.go | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/etcd/cmd/microshift-etcd/run.go b/etcd/cmd/microshift-etcd/run.go index 25f9f61321..909874fe41 100644 --- a/etcd/cmd/microshift-etcd/run.go +++ b/etcd/cmd/microshift-etcd/run.go @@ -9,6 +9,7 @@ import ( "os" "os/signal" "path/filepath" + "strings" "syscall" "time" @@ -100,6 +101,8 @@ func (s *EtcdService) configure(cfg *config.Config) { s.etcdCfg.PeerTLSInfo.TrustedCAFile = etcdSignerCertPath s.etcdCfg.ExperimentalMaxLearners = MaxLearners + + updateConfigFromFile(s.etcdCfg, getConfigFilePath()) } func (s *EtcdService) Run() error { @@ -214,3 +217,35 @@ func checkFragmentationPercentage(ondisk, inuse int64) float64 { fragmentedPercentage := (diff / float64(ondisk)) * 100 return math.Round(fragmentedPercentage*100) / 100 } + +func getConfigFilePath() string { + return filepath.Join(config.DataDir, "etcd", "config") +} + +func updateConfigFromFile(etcdCfg *etcd.Config, configPath string) { + data, err := os.ReadFile(configPath) + if err != nil { + klog.Errorf("failed to read config file: %v", err) + return + } + lines := strings.Split(string(data), "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" || strings.HasPrefix(line, "#") { + continue + } + eqIdx := strings.Index(line, "=") + if eqIdx == -1 { + continue + } + parts := []string{line[:eqIdx], line[eqIdx+1:]} + key := strings.TrimSpace(parts[0]) + val := strings.TrimSpace(parts[1]) + switch key { + case "ETCD_INITIAL_CLUSTER": + etcdCfg.InitialCluster = val + case "ETCD_INITIAL_CLUSTER_STATE": + etcdCfg.ClusterState = val + } + } +} From 34ad026251261a882ab4649b8bc91930c49f3014 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 10:03:48 +0200 Subject: [PATCH 09/27] OCPEDGE-2037: Update etcd controller readiness check --- pkg/controllers/etcd.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/controllers/etcd.go b/pkg/controllers/etcd.go index 06cd7f6be8..a65d21db85 100644 --- a/pkg/controllers/etcd.go +++ b/pkg/controllers/etcd.go @@ -175,6 +175,14 @@ func checkIfEtcdIsReady(ctx context.Context) error { } defer func() { _ = client.Close() }() + s, err := client.Status(ctx, "localhost:2379") + if err != nil { + return fmt.Errorf("failed to get etcd status: %v", err) + } + if s.IsLearner { + return nil + } + for i := 0; i < HealthCheckRetries; i++ { time.Sleep(HealthCheckWait) if _, err = client.Get(ctx, "health"); err == nil { From 21d702048ba7e450ea3c4484b968ceec42bb0ca9 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 10:06:58 +0200 Subject: [PATCH 10/27] OCPEDGE-2037: Update multinode config --- pkg/config/multinode.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pkg/config/multinode.go b/pkg/config/multinode.go index ea5d8255c3..6db5e73322 100644 --- a/pkg/config/multinode.go +++ b/pkg/config/multinode.go @@ -14,12 +14,5 @@ func ConfigMultiNode(c *Config, enabled bool) *Config { } c.MultiNode.Enabled = enabled c.MultiNode.Controlplane = c.Node.NodeIP - - // Use controlplane node IP as APIServer backend (instead of next available - // IP from service network) - c.ApiServer.AdvertiseAddress = c.Node.NodeIP - // Don't configure the advertise address on the device - c.ApiServer.SkipInterface = true - return c } From fff7c2c597126bddf4aba41cea9a1e689e242694 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 10:25:24 +0200 Subject: [PATCH 11/27] OCPEDGE-2037: Update apiserver controller configuration handling --- pkg/controllers/kube-apiserver.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go index 8b58193b8d..2968e246f9 100644 --- a/pkg/controllers/kube-apiserver.go +++ b/pkg/controllers/kube-apiserver.go @@ -75,7 +75,7 @@ func init() { type KubeAPIServer struct { kasConfigBytes []byte verbosity int - configureErr error // todo: report configuration errors immediately + configuration *config.Config masterURL string servingCAPath string @@ -83,9 +83,8 @@ type KubeAPIServer struct { } func NewKubeAPIServer(cfg *config.Config) *KubeAPIServer { - s := &KubeAPIServer{} - if err := s.configure(cfg); err != nil { - s.configureErr = err + s := &KubeAPIServer{ + configuration: cfg, } return s } @@ -309,8 +308,8 @@ func (s *KubeAPIServer) configureAuditPolicy(cfg *config.Config) error { } func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped chan<- struct{}) error { - if s.configureErr != nil { - return fmt.Errorf("configuration failed: %w", s.configureErr) + if err := s.configure(s.configuration); err != nil { + return fmt.Errorf("configuration failed: %w", err) } defer close(stopped) From 67074cfb89f3763e43b1452905bf255df712ced2 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 10:38:54 +0200 Subject: [PATCH 12/27] OCPEDGE-2037: Add etcd-severs discovery to apiserver --- pkg/controllers/kube-apiserver.go | 115 ++++++++++++++++++++++++++---- 1 file changed, 103 insertions(+), 12 deletions(-) diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go index 2968e246f9..75a3973308 100644 --- a/pkg/controllers/kube-apiserver.go +++ b/pkg/controllers/kube-apiserver.go @@ -28,6 +28,8 @@ import ( "strings" "time" + "go.etcd.io/etcd/client/pkg/v3/transport" + clientv3 "go.etcd.io/etcd/client/v3" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" @@ -162,20 +164,23 @@ func (s *KubeAPIServer) configure(cfg *config.Config) error { } } + etcdServers, err := discoverEtcdServers(s.configuration.Node.HostnameOverride, s.configuration.BootstrapKubeConfigPath()) + if err != nil { + return fmt.Errorf("failed to discover etcd servers: %w", err) + } + overrides := &kubecontrolplanev1.KubeAPIServerConfig{ APIServerArguments: map[string]kubecontrolplanev1.Arguments{ - "advertise-address": {s.advertiseAddress}, - "audit-policy-file": {filepath.Join(config.DataDir, "/resources/kube-apiserver-audit-policies/default.yaml")}, - "audit-log-maxage": {strconv.Itoa(cfg.ApiServer.AuditLog.MaxFileAge)}, - "audit-log-maxbackup": {strconv.Itoa(cfg.ApiServer.AuditLog.MaxFiles)}, - "audit-log-maxsize": {strconv.Itoa(cfg.ApiServer.AuditLog.MaxFileSize)}, - "client-ca-file": {clientCABundlePath}, - "etcd-cafile": {cryptomaterial.CACertPath(cryptomaterial.EtcdSignerDir(certsDir))}, - "etcd-certfile": {cryptomaterial.ClientCertPath(etcdClientCertDir)}, - "etcd-keyfile": {cryptomaterial.ClientKeyPath(etcdClientCertDir)}, - "etcd-servers": { - "https://localhost:2379", - }, + "advertise-address": {s.advertiseAddress}, + "audit-policy-file": {filepath.Join(config.DataDir, "/resources/kube-apiserver-audit-policies/default.yaml")}, + "audit-log-maxage": {strconv.Itoa(cfg.ApiServer.AuditLog.MaxFileAge)}, + "audit-log-maxbackup": {strconv.Itoa(cfg.ApiServer.AuditLog.MaxFiles)}, + "audit-log-maxsize": {strconv.Itoa(cfg.ApiServer.AuditLog.MaxFileSize)}, + "client-ca-file": {clientCABundlePath}, + "etcd-cafile": {cryptomaterial.CACertPath(cryptomaterial.EtcdSignerDir(certsDir))}, + "etcd-certfile": {cryptomaterial.ClientCertPath(etcdClientCertDir)}, + "etcd-keyfile": {cryptomaterial.ClientKeyPath(etcdClientCertDir)}, + "etcd-servers": etcdServers, "kubelet-certificate-authority": {cryptomaterial.CABundlePath(kubeCSRSignerDir)}, "kubelet-client-certificate": {cryptomaterial.ClientCertPath(kubeletClientDir)}, "kubelet-client-key": {cryptomaterial.ClientKeyPath(kubeletClientDir)}, @@ -406,3 +411,89 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped panic(perr) } } + +func discoverEtcdServers(hostname, kubeconfigPath string) ([]string, error) { + certsDir := cryptomaterial.CertsDirectory(config.DataDir) + etcdPeerCertDir := cryptomaterial.EtcdPeerCertDir(certsDir) + + tlsInfo := transport.TLSInfo{ + CertFile: cryptomaterial.PeerCertPath(etcdPeerCertDir), + KeyFile: cryptomaterial.PeerKeyPath(etcdPeerCertDir), + TrustedCAFile: cryptomaterial.CACertPath(cryptomaterial.EtcdSignerDir(certsDir)), + } + tlsConfig, err := tlsInfo.ClientConfig() + if err != nil { + return nil, fmt.Errorf("failed to create etcd client TLS config: %v", err) + } + + client, err := clientv3.New(clientv3.Config{ + DialTimeout: 5 * time.Second, + Endpoints: []string{"https://localhost:2379"}, + TLS: tlsConfig, + }) + if err != nil { + return nil, fmt.Errorf("failed to create etcd client: %w", err) + } + defer client.Close() + + st, err := client.Status(context.Background(), "localhost:2379") + if err != nil { + return nil, fmt.Errorf("failed to get etcd status: %w", err) + } + if st.IsLearner { + //TODO if its a learner I need to take the server from the current non-learner members. Use the bootstrap for that. + kubeconfig, err := clientcmd.LoadFromFile(kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("failed to load bootstrap kubeconfig: %w", err) + } + + if kubeconfig == nil || kubeconfig.Clusters == nil || len(kubeconfig.Clusters) == 0 { + return nil, fmt.Errorf("invalid bootstrap kubeconfig: no clusters found") + } + + var etcdHost string + for _, cluster := range kubeconfig.Clusters { + etcdHost = cluster.Server + break + } + + if etcdHost == "" { + return nil, fmt.Errorf("failed to extract etcd hostname from bootstrap kubeconfig") + } + + etcdHost = strings.TrimPrefix(etcdHost, "https://") + etcdHost, _, _ = net.SplitHostPort(etcdHost) + etcdHost = fmt.Sprintf("https://%s", net.JoinHostPort(etcdHost, "2379")) + client, err = clientv3.New(clientv3.Config{ + DialTimeout: 5 * time.Second, + Endpoints: []string{etcdHost}, + TLS: tlsConfig, + }) + if err != nil { + return nil, fmt.Errorf("failed to create etcd client: %w", err) + } + } + + resp, err := client.MemberList(context.Background()) + if err != nil { + return nil, fmt.Errorf("failed to retrieve etcd member list: %w", err) + } + + //TODO I already know if I am a learner, I had to do this before. + iAmLearner := false + var members []string + for _, member := range resp.Members { + if member.Name == hostname && member.IsLearner { + iAmLearner = true + continue + } + if !member.IsLearner { + members = append(members, member.ClientURLs...) + } + } + if iAmLearner { + return members, nil + } + + return []string{"https://localhost:2379"}, nil +} From 90e08203a7393f0423f987b229ce10ac3048bfa3 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 11:11:07 +0200 Subject: [PATCH 13/27] OCPEDGE-2037: Add join-cluster command --- cmd/microshift/main.go | 1 + pkg/cmd/joincluster.go | 531 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 532 insertions(+) create mode 100644 pkg/cmd/joincluster.go diff --git a/cmd/microshift/main.go b/cmd/microshift/main.go index e303b16c8e..917047ca06 100644 --- a/cmd/microshift/main.go +++ b/cmd/microshift/main.go @@ -41,5 +41,6 @@ func newCommand() *cobra.Command { cmd.AddCommand(cmds.NewBackupCommand()) cmd.AddCommand(cmds.NewRestoreCommand()) cmd.AddCommand(cmds.NewHealthcheckCommand()) + cmd.AddCommand(cmds.NewJoinClusterCommand()) return cmd } diff --git a/pkg/cmd/joincluster.go b/pkg/cmd/joincluster.go new file mode 100644 index 0000000000..43d85b74d5 --- /dev/null +++ b/pkg/cmd/joincluster.go @@ -0,0 +1,531 @@ +package cmd + +import ( + "context" + "crypto/x509" + "fmt" + "math/big" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + librarycrypto "github.com/openshift/library-go/pkg/crypto" + "github.com/openshift/microshift/pkg/components" + "github.com/openshift/microshift/pkg/config" + "github.com/openshift/microshift/pkg/util/cryptomaterial" + "github.com/spf13/cobra" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/apiserver/pkg/authentication/user" + + "go.etcd.io/etcd/client/pkg/v3/transport" + clientv3 "go.etcd.io/etcd/client/v3" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" + "k8s.io/klog/v2" +) + +const ( + // Default timeout for operations + joinDefaultTimeout = 10 * time.Minute +) + +type JoinClusterOptions struct { + KubeconfigPath string + Timeout time.Duration + Learner bool +} + +func NewJoinClusterCommand() *cobra.Command { + opts := &JoinClusterOptions{ + KubeconfigPath: "/var/lib/microshift/resources/kubeadmin/bootstrap/kubeconfig", + Timeout: joinDefaultTimeout, + } + + cmd := &cobra.Command{ + Use: "join-cluster", + Short: "Join a node to an existing MicroShift cluster", + Long: `This command joins a node to an existing MicroShift cluster by: +1. Loading the MicroShift configuration for current node. +2. Fetching etcd CA certificate and key from the cluster using provided kubeconfig to generate etcd certificates. +4. Configuring etcd cluster to add the new member. +5. Configuring kubelet to bootstrap into the cluster. +6. Restarting the MicroShift systemd unit. +7. Verifying the node is ready in the cluster.`, + RunE: func(cmd *cobra.Command, args []string) error { + return runJoinCluster(cmd.Context(), opts) + }, + } + + cmd.Flags().StringVar(&opts.KubeconfigPath, "kubeconfig", opts.KubeconfigPath, + "Path to kubeconfig file for connecting to the cluster") + cmd.Flags().DurationVar(&opts.Timeout, "timeout", opts.Timeout, + "Timeout for cluster join operations") + cmd.Flags().BoolVar(&opts.Learner, "learner", true, + "Join the cluster as a learner node (default is to join as a member)") + + return cmd +} + +func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { + //TODO I need to cleanup certain directories first. etcd, certs and kubeconfigs, plus /var/lib/kubelet. + // I should detect if I am already in another cluster too. if not, perform the above. if i am then do nothing. + ctx, cancel := context.WithTimeout(ctx, opts.Timeout) + defer cancel() + + klog.Info("Starting cluster join process...") + if opts.Learner { + klog.Info("Will add etcd node as learner") + } + + cfg, err := config.ActiveConfig() + if err != nil { + return fmt.Errorf("failed to load MicroShift configuration: %w", err) + } + klog.Info("MicroShift configuration loaded successfully") + + client, err := createKubernetesClient(opts.KubeconfigPath) + if err != nil { + return fmt.Errorf("failed to create Kubernetes client: %w", err) + } + + for _, resource := range components.CertificateAuthorityResources { + if err := fetchCertificateAuthority(ctx, client, resource.Name, resource.Dir); err != nil { + return fmt.Errorf("failed to fetch certificate authority %s: %w", resource.Name, err) + } + } + klog.Info("Certificate authorities fetched and written successfully") + + //TODO generate it using paths. + if err := generateEtcdCertificates(cfg); err != nil { + return fmt.Errorf("failed to generate etcd certificates: %w", err) + } + klog.Info("Etcd certificates generated successfully") + + nodeCount, clusterMembers, err := getClusterInfo(ctx, client) + if err != nil { + return fmt.Errorf("failed to get cluster information: %w", err) + } + klog.Infof("Found %d nodes in cluster", nodeCount) + + if err := configureEtcdForCluster(cfg, clusterMembers, opts.Learner); err != nil { + return fmt.Errorf("failed to configure etcd for cluster: %w", err) + } + + if err := configureBootstrapKubeconfig(cfg, opts.KubeconfigPath); err != nil { + return fmt.Errorf("failed to configure bootstrap kubeconfig: %w", err) + } + + if err := restartMicroShift(); err != nil { + return fmt.Errorf("failed to restart MicroShift service: %w", err) + } + klog.Info("MicroShift service restarted") + + if err := waitForNodeReady(ctx, client, cfg.CanonicalNodeName()); err != nil { + return fmt.Errorf("node failed to become ready: %w", err) + } + + klog.Info("Node successfully joined the cluster and is ready!") + return nil +} + +func createKubernetesClient(kubeconfigPath string) (kubernetes.Interface, error) { + if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { + return nil, fmt.Errorf("kubeconfig file does not exist at %s", kubeconfigPath) + } + + _, err := clientcmd.LoadFromFile(kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("invalid kubeconfig file: %w", err) + } + + restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) + if err != nil { + return nil, err + } + + return kubernetes.NewForConfig(restConfig) +} + +func fetchCertificateAuthority(ctx context.Context, client kubernetes.Interface, name, dir string) error { + secret, err := client.CoreV1().Secrets("kube-system").Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get %s secret: %w", name, err) + } + + caCert, exists := secret.Data["ca.crt"] + if !exists { + return fmt.Errorf("ca.crt not found in secret") + } + + caKey, exists := secret.Data["ca.key"] + if !exists { + return fmt.Errorf("ca.key not found in secret") + } + + serial, exists := secret.Data["serial.txt"] + if !exists { + return fmt.Errorf("serial.txt not found in secret") + } + + caBundle, caBundleExists := secret.Data["ca-bundle.crt"] + + if err := os.MkdirAll(dir, 0755); err != nil { + return fmt.Errorf("failed to create destination directory: %w", err) + } + + if err := os.WriteFile(cryptomaterial.CACertPath(dir), caCert, 0644); err != nil { + return fmt.Errorf("failed to write ca.crt: %w", err) + } + + if err := os.WriteFile(cryptomaterial.CAKeyPath(dir), caKey, 0600); err != nil { + return fmt.Errorf("failed to write ca.key: %w", err) + } + + if err := os.WriteFile(cryptomaterial.CASerialsPath(dir), serial, 0644); err != nil { + return fmt.Errorf("failed to write serial.txt: %w", err) + } + + if caBundleExists { + if err := os.WriteFile(cryptomaterial.CABundlePath(dir), caBundle, 0644); err != nil { + return fmt.Errorf("failed to write ca-bundle.crt: %w", err) + } + } + + return nil +} + +func generateEtcdCertificates(cfg *config.Config) error { + certsDir := cryptomaterial.CertsDirectory(config.DataDir) + etcdSignerDir := cryptomaterial.EtcdSignerDir(certsDir) + + caCertPath := cryptomaterial.CACertPath(etcdSignerDir) + caKeyPath := cryptomaterial.CAKeyPath(etcdSignerDir) + + caCert, err := os.ReadFile(caCertPath) + if err != nil { + return fmt.Errorf("failed to read CA certificate: %w", err) + } + caKey, err := os.ReadFile(caKeyPath) + if err != nil { + return fmt.Errorf("failed to read CA key: %w", err) + } + + // Create CA config from the provided cert and key + caTLSConfig, err := librarycrypto.GetTLSCertificateConfigFromBytes(caCert, caKey) + if err != nil { + return fmt.Errorf("failed to load CA certificate config: %w", err) + } + + // Create CA object for signing + caConfig := &librarycrypto.CA{ + Config: caTLSConfig, + SerialGenerator: &librarycrypto.RandomSerialGenerator{}, + } + + // Create directories for etcd certificates + servingCertDir := cryptomaterial.EtcdServingCertDir(certsDir) + if err := os.MkdirAll(servingCertDir, 0755); err != nil { + return fmt.Errorf("failed to create serving cert directory: %w", err) + } + + peerCertDir := cryptomaterial.EtcdPeerCertDir(certsDir) + if err := os.MkdirAll(peerCertDir, 0755); err != nil { + return fmt.Errorf("failed to create peer cert directory: %w", err) + } + + clientCertDir := cryptomaterial.EtcdAPIServerClientCertDir(certsDir) + if err := os.MkdirAll(clientCertDir, 0755); err != nil { + return fmt.Errorf("failed to create client cert directory: %w", err) + } + + // Prepare hostnames and IPs for etcd certificates + hostnames := []string{"localhost", cfg.Node.HostnameOverride} + ips := []net.IP{net.ParseIP("127.0.0.1")} + if cfg.Node.NodeIP != "" { + if ip := net.ParseIP(cfg.Node.NodeIP); ip != nil { + ips = append(ips, ip) + } + } + + //TODO something is wrong with serial numbers. investigate. + + // Generate serving certificate + servingTLS, err := caConfig.MakeServerCertForDuration( + sets.New[string](hostnames...), + time.Duration(cryptomaterial.ShortLivedCertificateValidity), + func(certTemplate *x509.Certificate) error { + certTemplate.Subject.CommonName = "system:etcd-server:etcd-client" + certTemplate.Subject.Organization = []string{"system:etcd-servers"} + certTemplate.ExtKeyUsage = []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth} + certTemplate.IPAddresses = ips + certTemplate.SerialNumber = big.NewInt(4) + serialNumberPath := filepath.Join(servingCertDir, "serial.txt") + if err := os.WriteFile(serialNumberPath, []byte(certTemplate.SerialNumber.String()), 0644); err != nil { + return fmt.Errorf("failed to write serial number to disk: %w", err) + } + return nil + }, + ) + if err != nil { + return fmt.Errorf("failed to generate serving certificate: %w", err) + } + + servingCertPath := cryptomaterial.PeerCertPath(servingCertDir) + servingKeyPath := cryptomaterial.PeerKeyPath(servingCertDir) + if err := servingTLS.WriteCertConfigFile(servingCertPath, servingKeyPath); err != nil { + return fmt.Errorf("failed to write serving certificate: %w", err) + } + + peerTLS, err := caConfig.MakeServerCertForDuration( + sets.New[string](hostnames...), + time.Duration(cryptomaterial.ShortLivedCertificateValidity), + func(certTemplate *x509.Certificate) error { + certTemplate.Subject.CommonName = "system:etcd-peer:etcd-client" + certTemplate.Subject.Organization = []string{"system:etcd-peers"} + certTemplate.ExtKeyUsage = []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth, x509.ExtKeyUsageClientAuth} + certTemplate.IPAddresses = ips + certTemplate.SerialNumber = big.NewInt(4) + serialNumberPath := filepath.Join(peerCertDir, "serial.txt") + if err := os.WriteFile(serialNumberPath, []byte(certTemplate.SerialNumber.String()), 0644); err != nil { + return fmt.Errorf("failed to write serial number to disk: %w", err) + } + return nil + }, + ) + if err != nil { + return fmt.Errorf("failed to generate peer certificate: %w", err) + } + + peerCertPath := cryptomaterial.PeerCertPath(peerCertDir) + peerKeyPath := cryptomaterial.PeerKeyPath(peerCertDir) + if err := peerTLS.WriteCertConfigFile(peerCertPath, peerKeyPath); err != nil { + return fmt.Errorf("failed to write peer certificate: %w", err) + } + + // Generate client certificate + clientUserInfo := &user.DefaultInfo{Name: "etcd", Groups: []string{"etcd"}} + clientTLS, err := caConfig.MakeClientCertificateForDuration( + clientUserInfo, + time.Duration(cryptomaterial.ShortLivedCertificateValidity), + ) + if err != nil { + return fmt.Errorf("failed to generate client certificate: %w", err) + } + + clientCertPath := cryptomaterial.ClientCertPath(clientCertDir) + clientKeyPath := cryptomaterial.ClientKeyPath(clientCertDir) + if err := clientTLS.WriteCertConfigFile(clientCertPath, clientKeyPath); err != nil { + return fmt.Errorf("failed to write client certificate: %w", err) + } + + klog.Info("All etcd certificates generated successfully with proper signatures and SAN entries") + return nil +} + +func getClusterInfo(ctx context.Context, client kubernetes.Interface) (int, []string, error) { + nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return 0, nil, fmt.Errorf("failed to list nodes: %w", err) + } + + readyCount := 0 + var members []string + for _, node := range nodes.Items { + if isJoinNodeReady(&node) { + readyCount++ + nodeIP := "" + for _, addr := range node.Status.Addresses { + if addr.Type == corev1.NodeInternalIP { + nodeIP = addr.Address + break + } + } + if nodeIP != "" { + members = append(members, fmt.Sprintf("%s=https://%s:2380", node.Name, nodeIP)) + } + } + } + + return readyCount, members, nil +} + +func isJoinNodeReady(node *corev1.Node) bool { + for _, condition := range node.Status.Conditions { + if condition.Type == corev1.NodeReady { + return condition.Status == corev1.ConditionTrue + } + } + return false +} + +func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLearner bool) error { + // Create etcd configuration for joining cluster + //TODO need to do something with paths. constants or something. + dataDir := filepath.Join(config.DataDir, "etcd") + if err := os.MkdirAll(dataDir, 0755); err != nil { + return fmt.Errorf("failed to create etcd data directory: %w", err) + } + + // Add current node to the cluster members list + nodeIP := cfg.Node.NodeIP + if nodeIP == "" { + nodeIP = "127.0.0.1" // fallback + } + currentNodeMember := fmt.Sprintf("%s=https://%s:2380", cfg.CanonicalNodeName(), nodeIP) + cfgInitialCluster := append(clusterMembers, currentNodeMember) + + clusterConfig := fmt.Sprintf("ETCD_INITIAL_CLUSTER=%s\nETCD_INITIAL_CLUSTER_STATE=existing", strings.Join(cfgInitialCluster, ",")) + + configDir := "/var/lib/microshift/etcd" + if err := os.MkdirAll(configDir, 0755); err != nil { + return fmt.Errorf("failed to create configuration directory: %w", err) + } + + //TODO rework this. not good anymore. + configFilePath := filepath.Join(configDir, "config") + if err := os.WriteFile(configFilePath, []byte(clusterConfig), 0644); err != nil { + return fmt.Errorf("failed to write etcd cluster configuration: %w", err) + } + + //TODO not a yaml now + klog.Infof("Etcd YAML configuration written to %s", configFilePath) + + certsDir := cryptomaterial.CertsDirectory(config.DataDir) + etcdPeerClientCertDir := cryptomaterial.EtcdPeerCertDir(certsDir) + + tlsInfo := transport.TLSInfo{ + CertFile: cryptomaterial.PeerCertPath(etcdPeerClientCertDir), + KeyFile: cryptomaterial.PeerKeyPath(etcdPeerClientCertDir), + TrustedCAFile: cryptomaterial.CACertPath(cryptomaterial.EtcdSignerDir(certsDir)), + } + tlsConfig, err := tlsInfo.ClientConfig() + if err != nil { + return fmt.Errorf("failed to create etcd client TLS config: %v", err) + } + + var endpoints []string + for _, member := range clusterMembers { + parts := strings.SplitN(member, "=", 2) + if len(parts) == 2 { + endpoint := strings.Replace(parts[1], ":2380", ":2379", 1) + endpoints = append(endpoints, endpoint) + } + } + //TODO this could be even easier. just target the bootstrap server. + client, err := clientv3.New(clientv3.Config{ + Endpoints: endpoints, + DialTimeout: 5 * time.Second, + TLS: tlsConfig, + Context: context.Background(), + }) + if err != nil { + return fmt.Errorf("failed to create etcd client: %v", err) + } + + memberResponse, err := client.MemberList(context.Background()) + if err != nil { + return fmt.Errorf("failed to list etcd members: %v", err) + } + + //TODO can i skip all of this if I am already in the cluster? this is pointless. + // what i need is to get all the non-learner members and add myself to them. + var filteredEndpoints []string + initialCluster := fmt.Sprintf("%s=https://%s:2380", cfg.Node.HostnameOverride, cfg.Node.NodeIP) + for _, member := range memberResponse.Members { + if member.Name == cfg.Node.HostnameOverride { + klog.Infof("etcd member %s already exists", cfg.Node.HostnameOverride) + continue + } + if !member.IsLearner { + filteredEndpoints = append(filteredEndpoints, member.ClientURLs[0]) + } + initialCluster = fmt.Sprintf("%s,%s=%s", initialCluster, member.Name, member.PeerURLs[0]) + } + klog.Infof("initial cluster: %v", initialCluster) + klog.Infof("filtered endpoints: %v", filteredEndpoints) + + client, err = clientv3.New(clientv3.Config{ + Endpoints: filteredEndpoints, + DialTimeout: 5 * time.Second, + TLS: tlsConfig, + Context: context.Background(), + }) + if err != nil { + return fmt.Errorf("failed to create etcd client with filtered endpoints: %v", err) + } + + addFunction := client.MemberAdd + if isLearner { + addFunction = client.MemberAddAsLearner + } + response, err := addFunction(context.Background(), []string{fmt.Sprintf("https://%s:2380", cfg.Node.NodeIP)}) + if err != nil { + return fmt.Errorf("failed to add etcd node: %v", err) + } + klog.Infof("Successfully added etcd node: %v", response) + return nil +} + +func configureBootstrapKubeconfig(cfg *config.Config, kubeconfigPath string) error { + bootstrapKubeConfigPath := cfg.BootstrapKubeConfigPath() + if err := os.MkdirAll(filepath.Dir(bootstrapKubeConfigPath), 0755); err != nil { + return fmt.Errorf("failed to create kubelet directory: %w", err) + } + + if err := copyFile(kubeconfigPath, bootstrapKubeConfigPath); err != nil { + return fmt.Errorf("failed to copy kubeconfig for kubelet: %w", err) + } + return nil +} + +func copyFile(src, dst string) error { + data, err := os.ReadFile(src) + if err != nil { + return err + } + return os.WriteFile(dst, data, 0644) +} + +func restartMicroShift() error { + cmd := exec.Command("systemctl", "restart", "microshift") + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to restart microshift service: %w", err) + } + return nil +} + +func waitForNodeReady(ctx context.Context, client kubernetes.Interface, nodeName string) error { + klog.Infof("Waiting for node %s to become ready...", nodeName) + + timeout := time.After(5 * time.Minute) + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-timeout: + return fmt.Errorf("timeout waiting for node to become ready") + case <-ticker.C: + node, err := client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + klog.Warningf("Failed to get node %s: %v", nodeName, err) + continue + } + + if isJoinNodeReady(node) { + klog.Infof("Node %s is ready!", nodeName) + return nil + } + + klog.Infof("Node %s is not ready yet, waiting...", nodeName) + } + } +} From 1fb73e04658efcb484bff17cf80547c185a18c51 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 11:48:12 +0200 Subject: [PATCH 14/27] OCPEDGE-2037: Skip join cluster if already a member --- pkg/cmd/joincluster.go | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/pkg/cmd/joincluster.go b/pkg/cmd/joincluster.go index 43d85b74d5..2058eebe1b 100644 --- a/pkg/cmd/joincluster.go +++ b/pkg/cmd/joincluster.go @@ -94,6 +94,12 @@ func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { return fmt.Errorf("failed to create Kubernetes client: %w", err) } + nodeName := cfg.CanonicalNodeName() + if isNodeAlreadyInCluster(ctx, client, nodeName) { + klog.Infof("Node %s is already part of the cluster. Skipping join process.", nodeName) + return nil + } + for _, resource := range components.CertificateAuthorityResources { if err := fetchCertificateAuthority(ctx, client, resource.Name, resource.Dir); err != nil { return fmt.Errorf("failed to fetch certificate authority %s: %w", resource.Name, err) @@ -253,8 +259,6 @@ func generateEtcdCertificates(cfg *config.Config) error { } } - //TODO something is wrong with serial numbers. investigate. - // Generate serving certificate servingTLS, err := caConfig.MakeServerCertForDuration( sets.New[string](hostnames...), @@ -500,6 +504,19 @@ func restartMicroShift() error { return nil } +func isNodeAlreadyInCluster(ctx context.Context, client kubernetes.Interface, nodeName string) bool { + nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return false + } + for _, node := range nodes.Items { + if node.Name == nodeName { + return true + } + } + return false +} + func waitForNodeReady(ctx context.Context, client kubernetes.Interface, nodeName string) error { klog.Infof("Waiting for node %s to become ready...", nodeName) From 2d3d3f24aaff931d542ac21011c52e2585aa8a4c Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 13:28:10 +0200 Subject: [PATCH 15/27] OCPEDGE-2037: Improve etcd directory handling for join-cluster --- pkg/cmd/joincluster.go | 23 +++++------------------ pkg/config/etcd.go | 9 ++++++++- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/pkg/cmd/joincluster.go b/pkg/cmd/joincluster.go index 2058eebe1b..653a0c0d6b 100644 --- a/pkg/cmd/joincluster.go +++ b/pkg/cmd/joincluster.go @@ -107,7 +107,6 @@ func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { } klog.Info("Certificate authorities fetched and written successfully") - //TODO generate it using paths. if err := generateEtcdCertificates(cfg); err != nil { return fmt.Errorf("failed to generate etcd certificates: %w", err) } @@ -369,14 +368,11 @@ func isJoinNodeReady(node *corev1.Node) bool { } func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLearner bool) error { - // Create etcd configuration for joining cluster - //TODO need to do something with paths. constants or something. - dataDir := filepath.Join(config.DataDir, "etcd") + dataDir := filepath.Dir(cfg.EtcdConfigPath()) if err := os.MkdirAll(dataDir, 0755); err != nil { return fmt.Errorf("failed to create etcd data directory: %w", err) } - // Add current node to the cluster members list nodeIP := cfg.Node.NodeIP if nodeIP == "" { nodeIP = "127.0.0.1" // fallback @@ -386,19 +382,11 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear clusterConfig := fmt.Sprintf("ETCD_INITIAL_CLUSTER=%s\nETCD_INITIAL_CLUSTER_STATE=existing", strings.Join(cfgInitialCluster, ",")) - configDir := "/var/lib/microshift/etcd" - if err := os.MkdirAll(configDir, 0755); err != nil { - return fmt.Errorf("failed to create configuration directory: %w", err) - } - - //TODO rework this. not good anymore. - configFilePath := filepath.Join(configDir, "config") - if err := os.WriteFile(configFilePath, []byte(clusterConfig), 0644); err != nil { + if err := os.WriteFile(cfg.EtcdConfigPath(), []byte(clusterConfig), 0644); err != nil { return fmt.Errorf("failed to write etcd cluster configuration: %w", err) } - //TODO not a yaml now - klog.Infof("Etcd YAML configuration written to %s", configFilePath) + klog.Infof("Etcd configuration written to %s", cfg.EtcdConfigPath()) certsDir := cryptomaterial.CertsDirectory(config.DataDir) etcdPeerClientCertDir := cryptomaterial.EtcdPeerCertDir(certsDir) @@ -413,6 +401,7 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear return fmt.Errorf("failed to create etcd client TLS config: %v", err) } + //TODO replace this with the bootstrap server. var endpoints []string for _, member := range clusterMembers { parts := strings.SplitN(member, "=", 2) @@ -421,7 +410,7 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear endpoints = append(endpoints, endpoint) } } - //TODO this could be even easier. just target the bootstrap server. + client, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, DialTimeout: 5 * time.Second, @@ -437,8 +426,6 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear return fmt.Errorf("failed to list etcd members: %v", err) } - //TODO can i skip all of this if I am already in the cluster? this is pointless. - // what i need is to get all the non-learner members and add myself to them. var filteredEndpoints []string initialCluster := fmt.Sprintf("%s=https://%s:2380", cfg.Node.HostnameOverride, cfg.Node.NodeIP) for _, member := range memberResponse.Members { diff --git a/pkg/config/etcd.go b/pkg/config/etcd.go index 52dbb49cf4..427ed9b5ff 100644 --- a/pkg/config/etcd.go +++ b/pkg/config/etcd.go @@ -1,6 +1,9 @@ package config -import "time" +import ( + "path/filepath" + "time" +) const ( // Etcd performance degrades significantly if the memory available @@ -27,3 +30,7 @@ type EtcdConfig struct { // defrags, except for a single on startup). DefragCheckFreq time.Duration `json:"-"` } + +func (cfg *Config) EtcdConfigPath() string { + return filepath.Join(DataDir, "etcd", "config") +} From 42b415248687da3b07c44dd5a7994a7aa34c5bf2 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 13:50:04 +0200 Subject: [PATCH 16/27] OCPEDGE-2037: Add automatic cleanup of directories before joining --- pkg/cmd/joincluster.go | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/pkg/cmd/joincluster.go b/pkg/cmd/joincluster.go index 653a0c0d6b..49213515dc 100644 --- a/pkg/cmd/joincluster.go +++ b/pkg/cmd/joincluster.go @@ -73,8 +73,6 @@ func NewJoinClusterCommand() *cobra.Command { } func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { - //TODO I need to cleanup certain directories first. etcd, certs and kubeconfigs, plus /var/lib/kubelet. - // I should detect if I am already in another cluster too. if not, perform the above. if i am then do nothing. ctx, cancel := context.WithTimeout(ctx, opts.Timeout) defer cancel() @@ -100,6 +98,10 @@ func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { return nil } + if err := cleanupMicroShiftData(cfg); err != nil { + return fmt.Errorf("failed to cleanup MicroShift data directories: %w", err) + } + for _, resource := range components.CertificateAuthorityResources { if err := fetchCertificateAuthority(ctx, client, resource.Name, resource.Dir); err != nil { return fmt.Errorf("failed to fetch certificate authority %s: %w", resource.Name, err) @@ -401,7 +403,6 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear return fmt.Errorf("failed to create etcd client TLS config: %v", err) } - //TODO replace this with the bootstrap server. var endpoints []string for _, member := range clusterMembers { parts := strings.SplitN(member, "=", 2) @@ -533,3 +534,20 @@ func waitForNodeReady(ctx context.Context, client kubernetes.Interface, nodeName } } } + +func cleanupMicroShiftData(cfg *config.Config) error { + directoriesToClean := []string{ + filepath.Dir(cfg.EtcdConfigPath()), + cryptomaterial.CertsDirectory(config.DataDir), + filepath.Join(config.DataDir, "resources"), + filepath.Join(config.DataDir, "bootstrap"), + } + + for _, dir := range directoriesToClean { + if err := os.RemoveAll(dir); err != nil { + return fmt.Errorf("failed to remove directory %s: %w", dir, err) + } + } + klog.Info("MicroShift data directories cleaned up successfully") + return nil +} From 03e3aa7748c3f65f091cead5ab98156563314007 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 14:23:16 +0200 Subject: [PATCH 17/27] OCPEDGE-2037: Hide new join-cluster command --- pkg/cmd/joincluster.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/cmd/joincluster.go b/pkg/cmd/joincluster.go index 49213515dc..7706a0e211 100644 --- a/pkg/cmd/joincluster.go +++ b/pkg/cmd/joincluster.go @@ -16,6 +16,7 @@ import ( "github.com/openshift/microshift/pkg/components" "github.com/openshift/microshift/pkg/config" "github.com/openshift/microshift/pkg/util/cryptomaterial" + "github.com/openshift/microshift/pkg/version" "github.com/spf13/cobra" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apiserver/pkg/authentication/user" @@ -69,6 +70,10 @@ func NewJoinClusterCommand() *cobra.Command { cmd.Flags().BoolVar(&opts.Learner, "learner", true, "Join the cluster as a learner node (default is to join as a member)") + if version.Get().BuildVariant != version.BuildVariantCommunity { + cmd.Hidden = true + } + return cmd } From 89bf0e8d5918c3f855ec3b37313b33d378bdc130 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 14:55:17 +0200 Subject: [PATCH 18/27] OCPEDGE-2037: Update logs and usage --- pkg/cmd/joincluster.go | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pkg/cmd/joincluster.go b/pkg/cmd/joincluster.go index 7706a0e211..343f75adb0 100644 --- a/pkg/cmd/joincluster.go +++ b/pkg/cmd/joincluster.go @@ -53,7 +53,7 @@ func NewJoinClusterCommand() *cobra.Command { Short: "Join a node to an existing MicroShift cluster", Long: `This command joins a node to an existing MicroShift cluster by: 1. Loading the MicroShift configuration for current node. -2. Fetching etcd CA certificate and key from the cluster using provided kubeconfig to generate etcd certificates. +2. Fetch Certificate Authorities from the cluster using provided kubeconfig. 4. Configuring etcd cluster to add the new member. 5. Configuring kubelet to bootstrap into the cluster. 6. Restarting the MicroShift systemd unit. @@ -106,6 +106,7 @@ func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { if err := cleanupMicroShiftData(cfg); err != nil { return fmt.Errorf("failed to cleanup MicroShift data directories: %w", err) } + klog.Info("MicroShift data directories cleaned up successfully") for _, resource := range components.CertificateAuthorityResources { if err := fetchCertificateAuthority(ctx, client, resource.Name, resource.Dir); err != nil { @@ -119,11 +120,10 @@ func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { } klog.Info("Etcd certificates generated successfully") - nodeCount, clusterMembers, err := getClusterInfo(ctx, client) + clusterMembers, err := getClusterInfo(ctx, client) if err != nil { return fmt.Errorf("failed to get cluster information: %w", err) } - klog.Infof("Found %d nodes in cluster", nodeCount) if err := configureEtcdForCluster(cfg, clusterMembers, opts.Learner); err != nil { return fmt.Errorf("failed to configure etcd for cluster: %w", err) @@ -142,7 +142,7 @@ func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { return fmt.Errorf("node failed to become ready: %w", err) } - klog.Info("Node successfully joined the cluster and is ready!") + klog.Info("Node successfully joined the cluster") return nil } @@ -338,17 +338,15 @@ func generateEtcdCertificates(cfg *config.Config) error { return nil } -func getClusterInfo(ctx context.Context, client kubernetes.Interface) (int, []string, error) { +func getClusterInfo(ctx context.Context, client kubernetes.Interface) ([]string, error) { nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { - return 0, nil, fmt.Errorf("failed to list nodes: %w", err) + return nil, fmt.Errorf("failed to list nodes: %w", err) } - readyCount := 0 var members []string for _, node := range nodes.Items { if isJoinNodeReady(&node) { - readyCount++ nodeIP := "" for _, addr := range node.Status.Addresses { if addr.Type == corev1.NodeInternalIP { @@ -362,7 +360,7 @@ func getClusterInfo(ctx context.Context, client kubernetes.Interface) (int, []st } } - return readyCount, members, nil + return members, nil } func isJoinNodeReady(node *corev1.Node) bool { @@ -444,8 +442,8 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear } initialCluster = fmt.Sprintf("%s,%s=%s", initialCluster, member.Name, member.PeerURLs[0]) } - klog.Infof("initial cluster: %v", initialCluster) - klog.Infof("filtered endpoints: %v", filteredEndpoints) + + klog.Infof("initial cluster: %v. Member endpoints: %v", initialCluster, filteredEndpoints) client, err = clientv3.New(clientv3.Config{ Endpoints: filteredEndpoints, @@ -553,6 +551,5 @@ func cleanupMicroShiftData(cfg *config.Config) error { return fmt.Errorf("failed to remove directory %s: %w", dir, err) } } - klog.Info("MicroShift data directories cleaned up successfully") return nil } From c439b3ec6d1e340aed4810429e49dd98ef850df2 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 17:02:46 +0200 Subject: [PATCH 19/27] OCPEDGE-2037: Verify nits --- pkg/cmd/joincluster.go | 46 +++++++++++++++---------------- pkg/controllers/kube-apiserver.go | 16 ++++++----- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/pkg/cmd/joincluster.go b/pkg/cmd/joincluster.go index 343f75adb0..c6fac7e57b 100644 --- a/pkg/cmd/joincluster.go +++ b/pkg/cmd/joincluster.go @@ -125,7 +125,7 @@ func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { return fmt.Errorf("failed to get cluster information: %w", err) } - if err := configureEtcdForCluster(cfg, clusterMembers, opts.Learner); err != nil { + if err := configureEtcdForCluster(ctx, cfg, clusterMembers, opts.Learner); err != nil { return fmt.Errorf("failed to configure etcd for cluster: %w", err) } @@ -146,7 +146,7 @@ func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { return nil } -func createKubernetesClient(kubeconfigPath string) (kubernetes.Interface, error) { +func createKubernetesClient(kubeconfigPath string) (*kubernetes.Clientset, error) { if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { return nil, fmt.Errorf("kubeconfig file does not exist at %s", kubeconfigPath) } @@ -187,11 +187,11 @@ func fetchCertificateAuthority(ctx context.Context, client kubernetes.Interface, caBundle, caBundleExists := secret.Data["ca-bundle.crt"] - if err := os.MkdirAll(dir, 0755); err != nil { + if err := os.MkdirAll(dir, 0750); err != nil { return fmt.Errorf("failed to create destination directory: %w", err) } - if err := os.WriteFile(cryptomaterial.CACertPath(dir), caCert, 0644); err != nil { + if err := os.WriteFile(cryptomaterial.CACertPath(dir), caCert, 0600); err != nil { return fmt.Errorf("failed to write ca.crt: %w", err) } @@ -199,12 +199,12 @@ func fetchCertificateAuthority(ctx context.Context, client kubernetes.Interface, return fmt.Errorf("failed to write ca.key: %w", err) } - if err := os.WriteFile(cryptomaterial.CASerialsPath(dir), serial, 0644); err != nil { + if err := os.WriteFile(cryptomaterial.CASerialsPath(dir), serial, 0600); err != nil { return fmt.Errorf("failed to write serial.txt: %w", err) } if caBundleExists { - if err := os.WriteFile(cryptomaterial.CABundlePath(dir), caBundle, 0644); err != nil { + if err := os.WriteFile(cryptomaterial.CABundlePath(dir), caBundle, 0600); err != nil { return fmt.Errorf("failed to write ca-bundle.crt: %w", err) } } @@ -242,17 +242,17 @@ func generateEtcdCertificates(cfg *config.Config) error { // Create directories for etcd certificates servingCertDir := cryptomaterial.EtcdServingCertDir(certsDir) - if err := os.MkdirAll(servingCertDir, 0755); err != nil { + if err := os.MkdirAll(servingCertDir, 0750); err != nil { return fmt.Errorf("failed to create serving cert directory: %w", err) } peerCertDir := cryptomaterial.EtcdPeerCertDir(certsDir) - if err := os.MkdirAll(peerCertDir, 0755); err != nil { + if err := os.MkdirAll(peerCertDir, 0750); err != nil { return fmt.Errorf("failed to create peer cert directory: %w", err) } clientCertDir := cryptomaterial.EtcdAPIServerClientCertDir(certsDir) - if err := os.MkdirAll(clientCertDir, 0755); err != nil { + if err := os.MkdirAll(clientCertDir, 0750); err != nil { return fmt.Errorf("failed to create client cert directory: %w", err) } @@ -268,7 +268,7 @@ func generateEtcdCertificates(cfg *config.Config) error { // Generate serving certificate servingTLS, err := caConfig.MakeServerCertForDuration( sets.New[string](hostnames...), - time.Duration(cryptomaterial.ShortLivedCertificateValidity), + cryptomaterial.ShortLivedCertificateValidity, func(certTemplate *x509.Certificate) error { certTemplate.Subject.CommonName = "system:etcd-server:etcd-client" certTemplate.Subject.Organization = []string{"system:etcd-servers"} @@ -276,7 +276,7 @@ func generateEtcdCertificates(cfg *config.Config) error { certTemplate.IPAddresses = ips certTemplate.SerialNumber = big.NewInt(4) serialNumberPath := filepath.Join(servingCertDir, "serial.txt") - if err := os.WriteFile(serialNumberPath, []byte(certTemplate.SerialNumber.String()), 0644); err != nil { + if err := os.WriteFile(serialNumberPath, []byte(certTemplate.SerialNumber.String()), 0600); err != nil { return fmt.Errorf("failed to write serial number to disk: %w", err) } return nil @@ -294,7 +294,7 @@ func generateEtcdCertificates(cfg *config.Config) error { peerTLS, err := caConfig.MakeServerCertForDuration( sets.New[string](hostnames...), - time.Duration(cryptomaterial.ShortLivedCertificateValidity), + cryptomaterial.ShortLivedCertificateValidity, func(certTemplate *x509.Certificate) error { certTemplate.Subject.CommonName = "system:etcd-peer:etcd-client" certTemplate.Subject.Organization = []string{"system:etcd-peers"} @@ -302,7 +302,7 @@ func generateEtcdCertificates(cfg *config.Config) error { certTemplate.IPAddresses = ips certTemplate.SerialNumber = big.NewInt(4) serialNumberPath := filepath.Join(peerCertDir, "serial.txt") - if err := os.WriteFile(serialNumberPath, []byte(certTemplate.SerialNumber.String()), 0644); err != nil { + if err := os.WriteFile(serialNumberPath, []byte(certTemplate.SerialNumber.String()), 0600); err != nil { return fmt.Errorf("failed to write serial number to disk: %w", err) } return nil @@ -322,7 +322,7 @@ func generateEtcdCertificates(cfg *config.Config) error { clientUserInfo := &user.DefaultInfo{Name: "etcd", Groups: []string{"etcd"}} clientTLS, err := caConfig.MakeClientCertificateForDuration( clientUserInfo, - time.Duration(cryptomaterial.ShortLivedCertificateValidity), + cryptomaterial.ShortLivedCertificateValidity, ) if err != nil { return fmt.Errorf("failed to generate client certificate: %w", err) @@ -372,9 +372,9 @@ func isJoinNodeReady(node *corev1.Node) bool { return false } -func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLearner bool) error { +func configureEtcdForCluster(ctx context.Context, cfg *config.Config, clusterMembers []string, isLearner bool) error { dataDir := filepath.Dir(cfg.EtcdConfigPath()) - if err := os.MkdirAll(dataDir, 0755); err != nil { + if err := os.MkdirAll(dataDir, 0750); err != nil { return fmt.Errorf("failed to create etcd data directory: %w", err) } @@ -387,7 +387,7 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear clusterConfig := fmt.Sprintf("ETCD_INITIAL_CLUSTER=%s\nETCD_INITIAL_CLUSTER_STATE=existing", strings.Join(cfgInitialCluster, ",")) - if err := os.WriteFile(cfg.EtcdConfigPath(), []byte(clusterConfig), 0644); err != nil { + if err := os.WriteFile(cfg.EtcdConfigPath(), []byte(clusterConfig), 0600); err != nil { return fmt.Errorf("failed to write etcd cluster configuration: %w", err) } @@ -419,13 +419,13 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear Endpoints: endpoints, DialTimeout: 5 * time.Second, TLS: tlsConfig, - Context: context.Background(), + Context: ctx, }) if err != nil { return fmt.Errorf("failed to create etcd client: %v", err) } - memberResponse, err := client.MemberList(context.Background()) + memberResponse, err := client.MemberList(ctx) if err != nil { return fmt.Errorf("failed to list etcd members: %v", err) } @@ -449,7 +449,7 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear Endpoints: filteredEndpoints, DialTimeout: 5 * time.Second, TLS: tlsConfig, - Context: context.Background(), + Context: ctx, }) if err != nil { return fmt.Errorf("failed to create etcd client with filtered endpoints: %v", err) @@ -459,7 +459,7 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear if isLearner { addFunction = client.MemberAddAsLearner } - response, err := addFunction(context.Background(), []string{fmt.Sprintf("https://%s:2380", cfg.Node.NodeIP)}) + response, err := addFunction(ctx, []string{fmt.Sprintf("https://%s", net.JoinHostPort(cfg.Node.NodeIP, "2380"))}) if err != nil { return fmt.Errorf("failed to add etcd node: %v", err) } @@ -469,7 +469,7 @@ func configureEtcdForCluster(cfg *config.Config, clusterMembers []string, isLear func configureBootstrapKubeconfig(cfg *config.Config, kubeconfigPath string) error { bootstrapKubeConfigPath := cfg.BootstrapKubeConfigPath() - if err := os.MkdirAll(filepath.Dir(bootstrapKubeConfigPath), 0755); err != nil { + if err := os.MkdirAll(filepath.Dir(bootstrapKubeConfigPath), 0750); err != nil { return fmt.Errorf("failed to create kubelet directory: %w", err) } @@ -484,7 +484,7 @@ func copyFile(src, dst string) error { if err != nil { return err } - return os.WriteFile(dst, data, 0644) + return os.WriteFile(dst, data, 0600) } func restartMicroShift() error { diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go index 75a3973308..e81e37c62c 100644 --- a/pkg/controllers/kube-apiserver.go +++ b/pkg/controllers/kube-apiserver.go @@ -94,7 +94,7 @@ func NewKubeAPIServer(cfg *config.Config) *KubeAPIServer { func (s *KubeAPIServer) Name() string { return "kube-apiserver" } func (s *KubeAPIServer) Dependencies() []string { return []string{"etcd", "network-configuration"} } -func (s *KubeAPIServer) configure(cfg *config.Config) error { +func (s *KubeAPIServer) configure(ctx context.Context, cfg *config.Config) error { s.verbosity = cfg.GetVerbosity() certsDir := cryptomaterial.CertsDirectory(config.DataDir) @@ -164,7 +164,7 @@ func (s *KubeAPIServer) configure(cfg *config.Config) error { } } - etcdServers, err := discoverEtcdServers(s.configuration.Node.HostnameOverride, s.configuration.BootstrapKubeConfigPath()) + etcdServers, err := discoverEtcdServers(ctx, s.configuration.Node.HostnameOverride, s.configuration.BootstrapKubeConfigPath()) if err != nil { return fmt.Errorf("failed to discover etcd servers: %w", err) } @@ -313,7 +313,7 @@ func (s *KubeAPIServer) configureAuditPolicy(cfg *config.Config) error { } func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped chan<- struct{}) error { - if err := s.configure(s.configuration); err != nil { + if err := s.configure(ctx, s.configuration); err != nil { return fmt.Errorf("configuration failed: %w", err) } @@ -412,7 +412,7 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped } } -func discoverEtcdServers(hostname, kubeconfigPath string) ([]string, error) { +func discoverEtcdServers(ctx context.Context, hostname, kubeconfigPath string) ([]string, error) { certsDir := cryptomaterial.CertsDirectory(config.DataDir) etcdPeerCertDir := cryptomaterial.EtcdPeerCertDir(certsDir) @@ -430,13 +430,14 @@ func discoverEtcdServers(hostname, kubeconfigPath string) ([]string, error) { DialTimeout: 5 * time.Second, Endpoints: []string{"https://localhost:2379"}, TLS: tlsConfig, + Context: ctx, }) if err != nil { return nil, fmt.Errorf("failed to create etcd client: %w", err) } - defer client.Close() + defer func() { _ = client.Close() }() - st, err := client.Status(context.Background(), "localhost:2379") + st, err := client.Status(ctx, "localhost:2379") if err != nil { return nil, fmt.Errorf("failed to get etcd status: %w", err) } @@ -468,13 +469,14 @@ func discoverEtcdServers(hostname, kubeconfigPath string) ([]string, error) { DialTimeout: 5 * time.Second, Endpoints: []string{etcdHost}, TLS: tlsConfig, + Context: ctx, }) if err != nil { return nil, fmt.Errorf("failed to create etcd client: %w", err) } } - resp, err := client.MemberList(context.Background()) + resp, err := client.MemberList(ctx) if err != nil { return nil, fmt.Errorf("failed to retrieve etcd member list: %w", err) } From 04f6424d23fc61d0f9a541397cb3f091b855f90b Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 7 Oct 2025 23:47:56 +0200 Subject: [PATCH 20/27] OCPEDGE-2037: Handle ipv6 in etcd initial cluster --- etcd/cmd/microshift-etcd/run.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etcd/cmd/microshift-etcd/run.go b/etcd/cmd/microshift-etcd/run.go index 909874fe41..018b129a76 100644 --- a/etcd/cmd/microshift-etcd/run.go +++ b/etcd/cmd/microshift-etcd/run.go @@ -86,7 +86,7 @@ func (s *EtcdService) configure(cfg *config.Config) { s.etcdCfg.ListenMetricsUrls = setURL([]string{cfg.Node.NodeIP}, "2381") s.etcdCfg.Name = cfg.Node.HostnameOverride - s.etcdCfg.InitialCluster = fmt.Sprintf("%s=https://%s:2380", cfg.Node.HostnameOverride, cfg.Node.NodeIP) + s.etcdCfg.InitialCluster = fmt.Sprintf("%s=https://%s", cfg.Node.HostnameOverride, net.JoinHostPort(cfg.Node.NodeIP, "2380")) s.etcdCfg.TlsMinVersion = getTLSMinVersion(cfg.ApiServer.TLS.MinVersion) if cfg.ApiServer.TLS.MinVersion != string(configv1.VersionTLS13) { From d820dcd76df981c60e60ac197431ae476a363640 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Wed, 8 Oct 2025 22:56:59 +0200 Subject: [PATCH 21/27] OCPEDGE-2037: update prerun version check When the data dir is not empty but there is no version, assume it is the first MicroShift start. Old code was there to allow seamless upgrades from 4.13 to 4.14 (when backups were added). --- pkg/admin/prerun/version.go | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/pkg/admin/prerun/version.go b/pkg/admin/prerun/version.go index 4644948564..1682e0cd56 100644 --- a/pkg/admin/prerun/version.go +++ b/pkg/admin/prerun/version.go @@ -109,24 +109,9 @@ func getVersions() (versions, error) { // error is something else than "file does not exist", like permissions return versions{}, fmt.Errorf("failed to get version of existing MicroShift data: %w", err) } - - // Ignoring .nodename to not get false positives from mere existence of the path - dataExists, err := util.PathExistsAndIsNotEmpty(config.DataDir, ".nodename") - if err != nil { - return versions{}, err - } - - if !dataExists { - // Data directory does not exist so it's first run of MicroShift - klog.InfoS("Version file does not exist yet - assuming first run of MicroShift") - vs.data = nil // repeated for clarity - return vs, nil - } - - // Data exists but without version file, let's assume 4.13 and compare versions - klog.InfoS("MicroShift data directory exists, but doesn't contain version file" + - " - assuming 4.13.0 and proceeding with version compatibility checks") - vs.data = &versionMetadata{Major: 4, Minor: 13, Patch: 0} + // Data directory or version does not exist + klog.InfoS("Version file does not exist yet - assuming first run of MicroShift") + vs.data = nil // repeated for clarity return vs, nil } From 314da47c397e00da8e959b938f971e88d427c93c Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Wed, 8 Oct 2025 22:59:17 +0200 Subject: [PATCH 22/27] OCPEDGE-2037: Disable CA controller if not multinode --- pkg/components/components.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pkg/components/components.go b/pkg/components/components.go index 665ce80424..b94da5d034 100755 --- a/pkg/components/components.go +++ b/pkg/components/components.go @@ -44,9 +44,11 @@ func StartComponents(cfg *config.Config, ctx context.Context) error { return err } - if err := startCertificateAuthorityController(ctx, kubeAdminConfig); err != nil { - klog.Warningf("Failed to start certificate authority controller: %v", err) - return err + if cfg.MultiNode.Enabled { + if err := startCertificateAuthorityController(ctx, kubeAdminConfig); err != nil { + klog.Warningf("Failed to start certificate authority controller: %v", err) + return err + } } return nil From f702b47ae2d9bb8afb863f1b5c3d56aae9a4111f Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Wed, 8 Oct 2025 23:06:21 +0200 Subject: [PATCH 23/27] OCPEDGE-2037: Rename command to add-node --- cmd/microshift/main.go | 2 +- pkg/cmd/{joincluster.go => addnode.go} | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) rename pkg/cmd/{joincluster.go => addnode.go} (97%) diff --git a/cmd/microshift/main.go b/cmd/microshift/main.go index 917047ca06..bf275fb316 100644 --- a/cmd/microshift/main.go +++ b/cmd/microshift/main.go @@ -41,6 +41,6 @@ func newCommand() *cobra.Command { cmd.AddCommand(cmds.NewBackupCommand()) cmd.AddCommand(cmds.NewRestoreCommand()) cmd.AddCommand(cmds.NewHealthcheckCommand()) - cmd.AddCommand(cmds.NewJoinClusterCommand()) + cmd.AddCommand(cmds.NewAddNodeCommand()) return cmd } diff --git a/pkg/cmd/joincluster.go b/pkg/cmd/addnode.go similarity index 97% rename from pkg/cmd/joincluster.go rename to pkg/cmd/addnode.go index c6fac7e57b..9d697de3bc 100644 --- a/pkg/cmd/joincluster.go +++ b/pkg/cmd/addnode.go @@ -36,21 +36,21 @@ const ( joinDefaultTimeout = 10 * time.Minute ) -type JoinClusterOptions struct { +type AddNodeOptions struct { KubeconfigPath string Timeout time.Duration Learner bool } -func NewJoinClusterCommand() *cobra.Command { - opts := &JoinClusterOptions{ +func NewAddNodeCommand() *cobra.Command { + opts := &AddNodeOptions{ KubeconfigPath: "/var/lib/microshift/resources/kubeadmin/bootstrap/kubeconfig", Timeout: joinDefaultTimeout, } cmd := &cobra.Command{ - Use: "join-cluster", - Short: "Join a node to an existing MicroShift cluster", + Use: "add-node", + Short: "Adds a new node to an existing MicroShift cluster", Long: `This command joins a node to an existing MicroShift cluster by: 1. Loading the MicroShift configuration for current node. 2. Fetch Certificate Authorities from the cluster using provided kubeconfig. @@ -59,7 +59,7 @@ func NewJoinClusterCommand() *cobra.Command { 6. Restarting the MicroShift systemd unit. 7. Verifying the node is ready in the cluster.`, RunE: func(cmd *cobra.Command, args []string) error { - return runJoinCluster(cmd.Context(), opts) + return runAddNode(cmd.Context(), opts) }, } @@ -68,7 +68,7 @@ func NewJoinClusterCommand() *cobra.Command { cmd.Flags().DurationVar(&opts.Timeout, "timeout", opts.Timeout, "Timeout for cluster join operations") cmd.Flags().BoolVar(&opts.Learner, "learner", true, - "Join the cluster as a learner node (default is to join as a member)") + "Join the cluster as a learner node") if version.Get().BuildVariant != version.BuildVariantCommunity { cmd.Hidden = true @@ -77,7 +77,7 @@ func NewJoinClusterCommand() *cobra.Command { return cmd } -func runJoinCluster(ctx context.Context, opts *JoinClusterOptions) error { +func runAddNode(ctx context.Context, opts *AddNodeOptions) error { ctx, cancel := context.WithTimeout(ctx, opts.Timeout) defer cancel() From a451b02ae42290df8f2f18957efb9fbd1388fc8a Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Fri, 10 Oct 2025 19:55:17 +0200 Subject: [PATCH 24/27] OCPEDGE-2037: Add service-account keys to controller --- pkg/cmd/addnode.go | 36 +++++++++++++++++++++ pkg/components/certificateauthority.go | 43 +++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/pkg/cmd/addnode.go b/pkg/cmd/addnode.go index 9d697de3bc..f3ed2fa477 100644 --- a/pkg/cmd/addnode.go +++ b/pkg/cmd/addnode.go @@ -113,6 +113,11 @@ func runAddNode(ctx context.Context, opts *AddNodeOptions) error { return fmt.Errorf("failed to fetch certificate authority %s: %w", resource.Name, err) } } + for _, resource := range components.ServiceAccountKeyResources { + if err := fetchServiceAccountKey(ctx, client, resource.Name, resource.Dir); err != nil { + return fmt.Errorf("failed to fetch service account key %s: %w", resource.Name, err) + } + } klog.Info("Certificate authorities fetched and written successfully") if err := generateEtcdCertificates(cfg); err != nil { @@ -212,6 +217,37 @@ func fetchCertificateAuthority(ctx context.Context, client kubernetes.Interface, return nil } +func fetchServiceAccountKey(ctx context.Context, client kubernetes.Interface, name, dir string) error { + secret, err := client.CoreV1().Secrets("kube-system").Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("failed to get %s secret: %w", name, err) + } + + serviceAccountKey, exists := secret.Data["service-account.key"] + if !exists { + return fmt.Errorf("service-account.key not found in secret") + } + + serviceAccountPubKey, exists := secret.Data["service-account.pub"] + if !exists { + return fmt.Errorf("service-account.pub not found in secret") + } + + if err := os.MkdirAll(dir, 0750); err != nil { + return fmt.Errorf("failed to create destination directory: %w", err) + } + + if err := os.WriteFile(filepath.Join(dir, "service-account.key"), serviceAccountKey, 0600); err != nil { + return fmt.Errorf("failed to write service-account.key: %w", err) + } + + if err := os.WriteFile(filepath.Join(dir, "service-account.pub"), serviceAccountPubKey, 0400); err != nil { + return fmt.Errorf("failed to write service-account.pub: %w", err) + } + + return nil +} + func generateEtcdCertificates(cfg *config.Config) error { certsDir := cryptomaterial.CertsDirectory(config.DataDir) etcdSignerDir := cryptomaterial.EtcdSignerDir(certsDir) diff --git a/pkg/components/certificateauthority.go b/pkg/components/certificateauthority.go index da8cab39f7..b9a87e6383 100644 --- a/pkg/components/certificateauthority.go +++ b/pkg/components/certificateauthority.go @@ -40,6 +40,12 @@ var ( {Name: "kube-apiserver-service-network-signer", Dir: cryptomaterial.KubeAPIServerServiceNetworkSigner(cryptomaterial.CertsDirectory(config.DataDir))}, {Name: "etcd-signer", Dir: cryptomaterial.EtcdSignerDir(cryptomaterial.CertsDirectory(config.DataDir))}, } + ServiceAccountKeyResources = []struct { + Name string + Dir string + }{ + {Name: "service-account-key", Dir: filepath.Join(config.DataDir, "/resources/kube-apiserver/secrets/service-account-key")}, + } ) func startCertificateAuthorityController(ctx context.Context, kubeconfigPath string) error { @@ -48,7 +54,7 @@ func startCertificateAuthorityController(ctx context.Context, kubeconfigPath str return fmt.Errorf("failed to get Kubernetes client: %w", err) } - resourceNames := make([]string, len(CertificateAuthorityResources)) + resourceNames := make([]string, len(CertificateAuthorityResources)+len(ServiceAccountKeyResources)) for i, resource := range CertificateAuthorityResources { resourceNames[i] = resource.Name if err := exposeCertificateAuthority(ctx, client, resource.Dir, resource.Name); err != nil { @@ -56,6 +62,13 @@ func startCertificateAuthorityController(ctx context.Context, kubeconfigPath str } } + for i, resource := range ServiceAccountKeyResources { + resourceNames[i] = resource.Name + if err := exposeServiceAccountKey(ctx, client, resource.Dir, resource.Name); err != nil { + return fmt.Errorf("failed to expose service account key %s: %w", resource.Name, err) + } + } + err = createClusterRole(ctx, client, resourceNames) if err != nil { return fmt.Errorf("failed to create etcd CA admin Role: %w", err) @@ -165,3 +178,31 @@ func exposeCertificateAuthority(ctx context.Context, client kubernetes.Interface } return nil } + +func exposeServiceAccountKey(ctx context.Context, client kubernetes.Interface, dir, name string) error { + serviceAccountKeyPath := filepath.Join(dir, "service-account.key") + serviceAccountPubKeyPath := filepath.Join(dir, "service-account.pub") + serviceAccountKey, err := os.ReadFile(serviceAccountKeyPath) + if err != nil { + return fmt.Errorf("failed to read service account key from %s: %w", serviceAccountKeyPath, err) + } + serviceAccountPubKey, err := os.ReadFile(serviceAccountPubKeyPath) + if err != nil { + return fmt.Errorf("failed to read service account public key from %s: %w", serviceAccountPubKeyPath, err) + } + secret := &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: caResourceNamespace, + }, + } + secret.Data = map[string][]byte{ + "service-account.key": serviceAccountKey, + "service-account.pub": serviceAccountPubKey, + } + _, err = client.CoreV1().Secrets(caResourceNamespace).Create(ctx, secret, metav1.CreateOptions{}) + if err != nil && !apierrors.IsAlreadyExists(err) { + return fmt.Errorf("failed to create service account key secret %s: %w", name, err) + } + return nil +} From 3a9443eec6c62ce6b2c064af546f9e56397bdcc3 Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Fri, 10 Oct 2025 11:00:09 +0200 Subject: [PATCH 25/27] OCPEDGE-2037: Enable leader-elect for kcm and ks --- pkg/controllers/kube-controller-manager.go | 4 ++++ pkg/controllers/kube-scheduler.go | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/pkg/controllers/kube-controller-manager.go b/pkg/controllers/kube-controller-manager.go index 6fb7c5b6e5..409f87e694 100644 --- a/pkg/controllers/kube-controller-manager.go +++ b/pkg/controllers/kube-controller-manager.go @@ -105,6 +105,10 @@ func configure(ctx context.Context, cfg *config.Config) (args []string, applyFn }, } + if cfg.MultiNode.Enabled { + overrides.ExtendedArguments["leader-elect"] = []string{"true"} + } + args, err = mergeAndConvertToArgs(overrides) applyFn = func() error { return assets.ApplyNamespaces(ctx, []string{ diff --git a/pkg/controllers/kube-scheduler.go b/pkg/controllers/kube-scheduler.go index 9d62171f4a..ef0de89c3d 100644 --- a/pkg/controllers/kube-scheduler.go +++ b/pkg/controllers/kube-scheduler.go @@ -64,12 +64,16 @@ func (s *KubeScheduler) configure(cfg *config.Config) { } func (s *KubeScheduler) writeConfig(cfg *config.Config) error { + leaderElect := "false" + if cfg.MultiNode.Enabled { + leaderElect = "true" + } data := []byte(`apiVersion: kubescheduler.config.k8s.io/v1 kind: KubeSchedulerConfiguration clientConnection: kubeconfig: ` + cfg.KubeConfigPath(config.KubeScheduler) + ` leaderElection: - leaderElect: false`) + leaderElect: ` + leaderElect) path := filepath.Join(config.DataDir, "resources", "kube-scheduler", "config", "config.yaml") if err := os.MkdirAll(filepath.Dir(path), os.FileMode(0700)); err != nil { From b98ba18f4e1f11e2c33d3ac3ffd92c67b6197d6b Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 14 Oct 2025 18:06:39 +0200 Subject: [PATCH 26/27] OCPEDGE-2037: Nits --- pkg/cmd/addnode.go | 52 +++++++++++++++++++--------------------- pkg/config/kubeconfig.go | 10 +++++--- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/pkg/cmd/addnode.go b/pkg/cmd/addnode.go index f3ed2fa477..864876e892 100644 --- a/pkg/cmd/addnode.go +++ b/pkg/cmd/addnode.go @@ -15,6 +15,7 @@ import ( librarycrypto "github.com/openshift/library-go/pkg/crypto" "github.com/openshift/microshift/pkg/components" "github.com/openshift/microshift/pkg/config" + "github.com/openshift/microshift/pkg/util" "github.com/openshift/microshift/pkg/util/cryptomaterial" "github.com/openshift/microshift/pkg/version" "github.com/spf13/cobra" @@ -44,7 +45,7 @@ type AddNodeOptions struct { func NewAddNodeCommand() *cobra.Command { opts := &AddNodeOptions{ - KubeconfigPath: "/var/lib/microshift/resources/kubeadmin/bootstrap/kubeconfig", + KubeconfigPath: "", Timeout: joinDefaultTimeout, } @@ -125,7 +126,7 @@ func runAddNode(ctx context.Context, opts *AddNodeOptions) error { } klog.Info("Etcd certificates generated successfully") - clusterMembers, err := getClusterInfo(ctx, client) + clusterMembers, err := getClusterNodes(ctx, client) if err != nil { return fmt.Errorf("failed to get cluster information: %w", err) } @@ -152,13 +153,12 @@ func runAddNode(ctx context.Context, opts *AddNodeOptions) error { } func createKubernetesClient(kubeconfigPath string) (*kubernetes.Clientset, error) { - if _, err := os.Stat(kubeconfigPath); os.IsNotExist(err) { - return nil, fmt.Errorf("kubeconfig file does not exist at %s", kubeconfigPath) - } - - _, err := clientcmd.LoadFromFile(kubeconfigPath) + exists, err := util.PathExists(kubeconfigPath) if err != nil { - return nil, fmt.Errorf("invalid kubeconfig file: %w", err) + return nil, fmt.Errorf("failed to check if kubeconfig file exists: %w", err) + } + if !exists { + return nil, fmt.Errorf("kubeconfig file %s does not exist", kubeconfigPath) } restConfig, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) @@ -374,7 +374,7 @@ func generateEtcdCertificates(cfg *config.Config) error { return nil } -func getClusterInfo(ctx context.Context, client kubernetes.Interface) ([]string, error) { +func getClusterNodes(ctx context.Context, client kubernetes.Interface) ([]string, error) { nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return nil, fmt.Errorf("failed to list nodes: %w", err) @@ -382,24 +382,25 @@ func getClusterInfo(ctx context.Context, client kubernetes.Interface) ([]string, var members []string for _, node := range nodes.Items { - if isJoinNodeReady(&node) { - nodeIP := "" - for _, addr := range node.Status.Addresses { - if addr.Type == corev1.NodeInternalIP { - nodeIP = addr.Address - break - } - } - if nodeIP != "" { - members = append(members, fmt.Sprintf("%s=https://%s:2380", node.Name, nodeIP)) + if !isNodeReady(&node) { + continue + } + nodeIP := "" + for _, addr := range node.Status.Addresses { + if addr.Type == corev1.NodeInternalIP { + nodeIP = addr.Address + break } } + if nodeIP != "" { + members = append(members, fmt.Sprintf("%s=https://%s:2380", node.Name, nodeIP)) + } } return members, nil } -func isJoinNodeReady(node *corev1.Node) bool { +func isNodeReady(node *corev1.Node) bool { for _, condition := range node.Status.Conditions { if condition.Type == corev1.NodeReady { return condition.Status == corev1.ConditionTrue @@ -532,16 +533,11 @@ func restartMicroShift() error { } func isNodeAlreadyInCluster(ctx context.Context, client kubernetes.Interface, nodeName string) bool { - nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + _, err := client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) if err != nil { return false } - for _, node := range nodes.Items { - if node.Name == nodeName { - return true - } - } - return false + return true } func waitForNodeReady(ctx context.Context, client kubernetes.Interface, nodeName string) error { @@ -564,7 +560,7 @@ func waitForNodeReady(ctx context.Context, client kubernetes.Interface, nodeName continue } - if isJoinNodeReady(node) { + if isNodeReady(node) { klog.Infof("Node %s is ready!", nodeName) return nil } diff --git a/pkg/config/kubeconfig.go b/pkg/config/kubeconfig.go index 2d18b9de08..22504c47ab 100644 --- a/pkg/config/kubeconfig.go +++ b/pkg/config/kubeconfig.go @@ -1,8 +1,9 @@ package config import ( - "os" "path/filepath" + + "github.com/openshift/microshift/pkg/util" ) // KubeConfigID identifies the different kubeconfigs managed in the DataDir @@ -36,6 +37,9 @@ func (cfg *Config) BootstrapKubeConfigPath() string { } func (cfg *Config) BootstrapKubeConfigExists() bool { - _, err := os.Stat(cfg.BootstrapKubeConfigPath()) - return err == nil + exists, err := util.PathExists(cfg.BootstrapKubeConfigPath()) + if err != nil { + return false + } + return exists } From 6c44e28d21cf27b14f0a25e5a8665b13293996eb Mon Sep 17 00:00:00 2001 From: Pablo Acevedo Montserrat Date: Tue, 14 Oct 2025 18:06:59 +0200 Subject: [PATCH 27/27] OCPEDGE-2037: Improve etcd handling --- pkg/controllers/etcd.go | 57 +++++++++++++++------ pkg/controllers/kube-apiserver.go | 84 +++++++++++++++---------------- 2 files changed, 85 insertions(+), 56 deletions(-) diff --git a/pkg/controllers/etcd.go b/pkg/controllers/etcd.go index a65d21db85..e5631c6173 100644 --- a/pkg/controllers/etcd.go +++ b/pkg/controllers/etcd.go @@ -169,30 +169,59 @@ func stopMicroshiftEtcdScopeIfExists() error { } func checkIfEtcdIsReady(ctx context.Context) error { - client, err := getEtcdClient(ctx) - if err != nil { - return fmt.Errorf("failed to obtain etcd client: %v", err) - } - defer func() { _ = client.Close() }() + var client *clientv3.Client + defer func() { + if client != nil { + _ = client.Close() + } + }() - s, err := client.Status(ctx, "localhost:2379") - if err != nil { - return fmt.Errorf("failed to get etcd status: %v", err) - } - if s.IsLearner { - return nil - } + for attempt := 0; attempt < HealthCheckRetries; attempt++ { + if client == nil { + var err error + client, err = getEtcdClient(ctx) + if err != nil { + klog.Infof("failed to obtain etcd client: %v", err) + if attempt < HealthCheckRetries-1 { + time.Sleep(HealthCheckWait) + continue + } + return fmt.Errorf("failed to obtain etcd client after %d attempts: %v", HealthCheckRetries, err) + } + } + + s, err := client.Status(ctx, "localhost:2379") + if err != nil { + _ = client.Close() + client = nil + klog.Infof("failed to get etcd status: %v", err) + if attempt < HealthCheckRetries-1 { + time.Sleep(HealthCheckWait) + continue + } + return fmt.Errorf("failed to get etcd status after %d attempts: %v", HealthCheckRetries, err) + } + + // If my own instance is a learner I dont need to check readiness because apiserver is going + // to connect to other voting members in the cluster. + if s.IsLearner { + return nil + } - for i := 0; i < HealthCheckRetries; i++ { - time.Sleep(HealthCheckWait) if _, err = client.Get(ctx, "health"); err == nil { return nil } else { + _ = client.Close() + client = nil klog.Infof("etcd not ready yet: %v", err) if err == context.Canceled { return err } } + + if attempt < HealthCheckRetries-1 { + time.Sleep(HealthCheckWait) + } } return fmt.Errorf("etcd still not healthy after checking %d times", HealthCheckRetries) } diff --git a/pkg/controllers/kube-apiserver.go b/pkg/controllers/kube-apiserver.go index e81e37c62c..41b4ac3976 100644 --- a/pkg/controllers/kube-apiserver.go +++ b/pkg/controllers/kube-apiserver.go @@ -164,7 +164,7 @@ func (s *KubeAPIServer) configure(ctx context.Context, cfg *config.Config) error } } - etcdServers, err := discoverEtcdServers(ctx, s.configuration.Node.HostnameOverride, s.configuration.BootstrapKubeConfigPath()) + etcdServers, err := discoverEtcdServers(ctx, s.configuration.BootstrapKubeConfigPath()) if err != nil { return fmt.Errorf("failed to discover etcd servers: %w", err) } @@ -412,7 +412,7 @@ func (s *KubeAPIServer) Run(ctx context.Context, ready chan<- struct{}, stopped } } -func discoverEtcdServers(ctx context.Context, hostname, kubeconfigPath string) ([]string, error) { +func discoverEtcdServers(ctx context.Context, kubeconfigPath string) ([]string, error) { certsDir := cryptomaterial.CertsDirectory(config.DataDir) etcdPeerCertDir := cryptomaterial.EtcdPeerCertDir(certsDir) @@ -441,39 +441,49 @@ func discoverEtcdServers(ctx context.Context, hostname, kubeconfigPath string) ( if err != nil { return nil, fmt.Errorf("failed to get etcd status: %w", err) } - if st.IsLearner { - //TODO if its a learner I need to take the server from the current non-learner members. Use the bootstrap for that. - kubeconfig, err := clientcmd.LoadFromFile(kubeconfigPath) - if err != nil { - return nil, fmt.Errorf("failed to load bootstrap kubeconfig: %w", err) - } - if kubeconfig == nil || kubeconfig.Clusters == nil || len(kubeconfig.Clusters) == 0 { - return nil, fmt.Errorf("invalid bootstrap kubeconfig: no clusters found") - } + // If I am not a learner it means I am a voting member, so connecting to my own etcd instance + // is fine because everything is synced. + if !st.IsLearner { + return []string{"https://localhost:2379"}, nil + } - var etcdHost string - for _, cluster := range kubeconfig.Clusters { - etcdHost = cluster.Server - break - } + // If I am a learner I need to connect to a member, retrieve the list of voting + // members and connect to all of them. + kubeconfig, err := clientcmd.LoadFromFile(kubeconfigPath) + if err != nil { + return nil, fmt.Errorf("failed to load bootstrap kubeconfig: %w", err) + } - if etcdHost == "" { - return nil, fmt.Errorf("failed to extract etcd hostname from bootstrap kubeconfig") - } + if kubeconfig == nil || kubeconfig.Clusters == nil || len(kubeconfig.Clusters) == 0 { + return nil, fmt.Errorf("invalid bootstrap kubeconfig: no clusters found") + } - etcdHost = strings.TrimPrefix(etcdHost, "https://") - etcdHost, _, _ = net.SplitHostPort(etcdHost) - etcdHost = fmt.Sprintf("https://%s", net.JoinHostPort(etcdHost, "2379")) - client, err = clientv3.New(clientv3.Config{ - DialTimeout: 5 * time.Second, - Endpoints: []string{etcdHost}, - TLS: tlsConfig, - Context: ctx, - }) - if err != nil { - return nil, fmt.Errorf("failed to create etcd client: %w", err) - } + if len(kubeconfig.Clusters) > 1 { + return nil, fmt.Errorf("invalid bootstrap kubeconfig: multiple clusters found") + } + + var etcdHost string + for _, cluster := range kubeconfig.Clusters { + etcdHost = cluster.Server + break + } + + if etcdHost == "" { + return nil, fmt.Errorf("failed to extract etcd hostname from bootstrap kubeconfig") + } + + etcdHost = strings.TrimPrefix(etcdHost, "https://") + etcdHost, _, _ = net.SplitHostPort(etcdHost) + etcdHost = fmt.Sprintf("https://%s", net.JoinHostPort(etcdHost, "2379")) + client, err = clientv3.New(clientv3.Config{ + DialTimeout: 5 * time.Second, + Endpoints: []string{etcdHost}, + TLS: tlsConfig, + Context: ctx, + }) + if err != nil { + return nil, fmt.Errorf("failed to create etcd client: %w", err) } resp, err := client.MemberList(ctx) @@ -481,21 +491,11 @@ func discoverEtcdServers(ctx context.Context, hostname, kubeconfigPath string) ( return nil, fmt.Errorf("failed to retrieve etcd member list: %w", err) } - //TODO I already know if I am a learner, I had to do this before. - iAmLearner := false var members []string for _, member := range resp.Members { - if member.Name == hostname && member.IsLearner { - iAmLearner = true - continue - } if !member.IsLearner { members = append(members, member.ClientURLs...) } } - if iAmLearner { - return members, nil - } - - return []string{"https://localhost:2379"}, nil + return members, nil }