-
Notifications
You must be signed in to change notification settings - Fork 493
OCPBUGS-20152,OCPBUGS-22364: fix timing issues upon cluster installation or upgrade #3987
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ import ( | |
| ign3types "github.com/coreos/ignition/v2/config/v3_4/types" | ||
| "github.com/imdario/mergo" | ||
| corev1 "k8s.io/api/core/v1" | ||
| apierrors "k8s.io/apimachinery/pkg/api/errors" | ||
| macherrors "k8s.io/apimachinery/pkg/api/errors" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
| "k8s.io/apimachinery/pkg/labels" | ||
|
|
@@ -475,6 +476,18 @@ func (ctrl *Controller) syncKubeletConfig(key string) error { | |
| klog.V(4).Infof("Finished syncing kubeletconfig %q (%v)", key, time.Since(startTime)) | ||
| }() | ||
|
|
||
| if err := wait.PollUntilContextTimeout(context.TODO(), ctrlcommon.ControllerConfigRolloutInterval, ctrlcommon.ControllerConfigTimeout, false, func(_ context.Context) (bool, error) { | ||
| _, err := ctrl.client.MachineconfigurationV1().ControllerConfigs().Get(context.TODO(), ctrlcommon.ControllerConfigName, metav1.GetOptions{}) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you think reading off the ccLister here instead would work instead of a direct API fetch? If not, maybe we can tone down the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also curious, does this help with upgrades? The object should always exist after install time. Or I guess this is meant to fix installs only
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It may be worth a CI cycle just to see if we can get away with using the lister instead. Although I don't know how difficult this particular issue is to reproduce there. |
||
| if err == nil { | ||
| return true, nil | ||
| } | ||
| if apierrors.IsNotFound(err) { | ||
| return false, nil | ||
| } | ||
| return false, nil | ||
| }); err != nil { | ||
| klog.Infof("Controller Config has not been created. Continuing %v", err) | ||
| } | ||
| // Wait to apply a kubelet config if the controller config is not completed | ||
| if err := apihelpers.IsControllerConfigCompleted(ctrlcommon.ControllerConfigName, ctrl.ccLister.Get); err != nil { | ||
| return err | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -416,10 +416,10 @@ func (optr *Operator) sync(key string) error { | |
| // "RenderConfig" must always run first as it sets the renderConfig in the operator | ||
| // for the sync funcs below | ||
| {"RenderConfig", optr.syncRenderConfig}, | ||
| {"MachineConfigController", optr.syncMachineConfigController}, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, just trying to remember the inter-dependencies here. Other than renderconfig being first and requiredpools being last, were MCP/MCD/MCC interdependent on each other? How does doing MCC first help here?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, df7ff4d . If possible we should maintain sync ordering to avoid any undesired behavior. |
||
| {"MachineConfigNode", optr.syncMachineConfigNodes}, | ||
| {"MachineConfigPools", optr.syncMachineConfigPools}, | ||
| {"MachineConfigDaemon", optr.syncMachineConfigDaemon}, | ||
| {"MachineConfigController", optr.syncMachineConfigController}, | ||
| {"MachineConfigServer", optr.syncMachineConfigServer}, | ||
| {"MachineOSBuilder", optr.syncMachineOSBuilder}, | ||
| // this check must always run last since it makes sure the pools are in sync/upgrading correctly | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| package operator | ||
|
|
||
| import ( | ||
| "bytes" | ||
| "context" | ||
| "crypto/x509" | ||
| "encoding/base64" | ||
|
|
@@ -18,6 +19,7 @@ import ( | |
| appsv1 "k8s.io/api/apps/v1" | ||
| corev1 "k8s.io/api/core/v1" | ||
| apiextv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" | ||
| "k8s.io/apimachinery/pkg/api/equality" | ||
| "k8s.io/apimachinery/pkg/api/errors" | ||
| apierrors "k8s.io/apimachinery/pkg/api/errors" | ||
| metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
|
|
@@ -46,6 +48,7 @@ import ( | |
| ctrlcommon "github.com/openshift/machine-config-operator/pkg/controller/common" | ||
| templatectrl "github.com/openshift/machine-config-operator/pkg/controller/template" | ||
| daemonconsts "github.com/openshift/machine-config-operator/pkg/daemon/constants" | ||
| "github.com/openshift/machine-config-operator/pkg/helpers" | ||
| "github.com/openshift/machine-config-operator/pkg/server" | ||
| "github.com/openshift/machine-config-operator/pkg/upgrademonitor" | ||
| "github.com/openshift/machine-config-operator/pkg/version" | ||
|
|
@@ -624,6 +627,11 @@ func (optr *Operator) syncCustomResourceDefinitions() error { | |
| } | ||
|
|
||
| for _, crd := range crds { | ||
| dne := false | ||
| current, err := optr.ccLister.Get("machine-config-controller") | ||
| if apierrors.IsNotFound(err) { | ||
| dne = true | ||
| } | ||
|
|
||
| crdBytes, err := manifests.ReadFile(crd) | ||
| if err != nil { | ||
|
|
@@ -639,11 +647,105 @@ func (optr *Operator) syncCustomResourceDefinitions() error { | |
| return err | ||
| } | ||
| } | ||
|
|
||
| if strings.Contains(crd, "controllerconfig") { | ||
| currentCR, err := optr.apiExtClient.ApiextensionsV1().CustomResourceDefinitions().Get(context.TODO(), "controllerconfigs.machineconfiguration.openshift.io", metav1.GetOptions{}) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: Hoist everything in the For the section where you |
||
| if err != nil && !os.IsNotExist(err) { | ||
| klog.Errorf("Error getting controller config CRD. Cluster might be initializing: %v", err) | ||
| } | ||
| if currentCR == nil || !equality.Semantic.DeepEqual(c.Spec, currentCR.Spec) || !equality.Semantic.DeepEqual(c.Status, currentCR.Status) || dne { | ||
| // we have changes. And need to regen the CR immediately or else we face validation errors. | ||
| // right now, the only changes we know of that we support is the controller certificate migration. | ||
| newCC, err := optr.client.MachineconfigurationV1().ControllerConfigs().Get(context.TODO(), "machine-config-controller", metav1.GetOptions{}) | ||
| if err != nil { | ||
| if os.IsNotExist(err) || apierrors.IsNotFound(err) { | ||
| continue | ||
| } | ||
| return err | ||
| } | ||
| // if this is a new cconfig or data has changed | ||
| if dne || anyCertDataDiffers(current.Spec, newCC.Spec) { | ||
| names := []string{ | ||
| "KubeAPIServerServingCAData", "CloudProviderCAData", "RootCAData", "AdditionalTrustBundle", | ||
| } | ||
|
|
||
| certs := [][]byte{ | ||
| newCC.Spec.KubeAPIServerServingCAData, | ||
| newCC.Spec.CloudProviderCAData, | ||
| newCC.Spec.RootCAData, | ||
| newCC.Spec.AdditionalTrustBundle, | ||
| } | ||
|
|
||
| // update existing information ONLY, we are not here to generate new entries. | ||
| newCerts := []mcfgv1.ControllerCertificate{} | ||
| for i, certName := range names { | ||
| for _, cert := range newCC.Status.ControllerCertificates { | ||
| if cert.BundleFile == certName { | ||
| newCerts = append(newCerts, helpers.CreateNewCert(certs[i], certName)...) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| for _, entry := range newCC.Spec.ImageRegistryBundleData { | ||
| for _, cert := range newCC.Status.ControllerCertificates { | ||
| if cert.BundleFile == entry.File { | ||
| newCerts = append(newCerts, helpers.CreateNewCert(entry.Data, entry.File)...) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| for _, entry := range newCC.Spec.ImageRegistryBundleUserData { | ||
| for _, cert := range newCC.Status.ControllerCertificates { | ||
| if cert.BundleFile == entry.File { | ||
| newCerts = append(newCerts, helpers.CreateNewCert(entry.Data, entry.File)...) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| newCC.Status.ControllerCertificates = newCerts | ||
|
|
||
| _, err = optr.client.MachineconfigurationV1().ControllerConfigs().Update(context.TODO(), newCC, metav1.UpdateOptions{}) | ||
| if err != nil { | ||
| return err | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return nil | ||
| } | ||
|
|
||
| func anyCertDataDiffers(oldSpec, spec mcfgv1.ControllerConfigSpec) bool { | ||
| if !bytes.Equal(spec.KubeAPIServerServingCAData, oldSpec.KubeAPIServerServingCAData) || bytes.Equal(spec.CloudProviderCAData, oldSpec.CloudProviderCAData) || !bytes.Equal(spec.AdditionalTrustBundle, oldSpec.AdditionalTrustBundle) || !bytes.Equal(spec.RootCAData, oldSpec.RootCAData) { | ||
| return true | ||
| } | ||
|
|
||
| if len(spec.ImageRegistryBundleData) != len(oldSpec.ImageRegistryBundleData) || len(spec.ImageRegistryBundleUserData) != len(oldSpec.ImageRegistryBundleUserData) { | ||
| return true | ||
| } | ||
| // for each new cert, see if it already existed | ||
| for _, cert := range spec.ImageRegistryBundleData { | ||
| for _, oldData := range oldSpec.ImageRegistryBundleData { | ||
| // if this is the same cert, with new data then return true | ||
| if oldData.File == cert.File && !bytes.Equal(cert.Data, oldData.Data) { | ||
| return true | ||
| } | ||
| } | ||
| } | ||
| // for each new cert, see if it already existed | ||
| for _, cert := range spec.ImageRegistryBundleUserData { | ||
| for _, oldData := range oldSpec.ImageRegistryBundleUserData { | ||
| // if this is the same cert, with new data then return true | ||
| if oldData.File == cert.File && !bytes.Equal(cert.Data, oldData.Data) { | ||
| return true | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return false | ||
| } | ||
|
|
||
| func (optr *Operator) syncMachineConfigPools(config *renderConfig) error { | ||
| mcps := []string{ | ||
| "manifests/master.machineconfigpool.yaml", | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion: Hoist this wait code into a separate function for easier reuse since it looks like it is being used in multiple places.