From 384286623f35f3ca7a73aec0776d3df6db7c6779 Mon Sep 17 00:00:00 2001 From: Alexander Greene Date: Mon, 8 Aug 2022 11:14:09 -0700 Subject: [PATCH] Cleanup conversion webhooks when an operator is uninstalled (#2832) Problem: When uninstalling a CSV, OLM has always avoided deleting the associated CRD as all CRs on cluster are subsequently deleted, possibly resulting in user dataloss. OLM supports defining conversion webhooks within the CSV. On cluster, conversion webhooks are defined with a CRD and point to a service that handles conversion. If the service is unable to fulfill the request, all requests against the CRs associated with the CRD will fail. When uninstalling a CSV, OLM does not remove the conversion webhook from the CRD, meaning that all requests against the CRs associated with the CRD will fail, resulting in at least two concerns: 1. OLM is unable to subsequently reinstall the operator. When installing a CSV, if the CRD already exists and instances of CRs exist as well, OLM performs a series of checks which ensure that none of the CRs are invalidated against the new schema. The existing CRD's conversion webhooks points to a non-existant service, causing the check to fail and preventing installs. 2. Broken conversion webhooks causes kubernete's garbage collection to fail. Solution: When a CSV is deleted, if no CSV exists that is replacing it, set the CRD's conversion strategy to None. Signed-off-by: Alexander Greene Upstream-commit: 94374983d448c56d031f0493b84b6dce37b84741 Upstream-repository: operator-lifecycle-manager --- .../pkg/controller/operators/olm/operator.go | 41 +++++++++++++++++++ .../test/e2e/csv_e2e_test.go | 3 ++ .../test/e2e/webhook_e2e_test.go | 22 +++++++++- .../pkg/controller/operators/olm/operator.go | 41 +++++++++++++++++++ 4 files changed, 106 insertions(+), 1 deletion(-) diff --git a/staging/operator-lifecycle-manager/pkg/controller/operators/olm/operator.go b/staging/operator-lifecycle-manager/pkg/controller/operators/olm/operator.go index af6b45cc3a..54e54d9f3f 100644 --- a/staging/operator-lifecycle-manager/pkg/controller/operators/olm/operator.go +++ b/staging/operator-lifecycle-manager/pkg/controller/operators/olm/operator.go @@ -11,6 +11,7 @@ import ( admissionregistrationv1 "k8s.io/api/admissionregistration/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" extinf "k8s.io/apiextensions-apiserver/pkg/client/informers/externalversions" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" @@ -1101,6 +1102,46 @@ func (a *Operator) handleClusterServiceVersionDeletion(obj interface{}) { logger.WithError(err).Warnf("failed to requeue gc event: %v", webhook) } } + + // Conversion webhooks are defined within a CRD. + // In an effort to prevent customer dataloss, OLM does not delete CRDs associated with a CSV when it is deleted. + // Deleting a CSV that introduced a conversion webhook removes the deployment that serviced the conversion webhook calls. + // If a conversion webhook is defined and the service isn't available, all requests against the CR associated with the CRD will fail. + // This ultimately breaks kubernetes garbage collection and prevents OLM from reinstalling the CSV as CR validation against the new CRD's + // openapiv3 schema fails. + // As such, when a CSV is deleted OLM will check if it is being replaced. If the CSV is not being replaced, OLM will remove the conversion + // webhook from the CRD definition. + csvs, err := a.lister.OperatorsV1alpha1().ClusterServiceVersionLister().ClusterServiceVersions(clusterServiceVersion.GetNamespace()).List(labels.Everything()) + if err != nil { + logger.Errorf("error listing csvs: %v\n", err) + } + for _, csv := range csvs { + if csv.Spec.Replaces == clusterServiceVersion.GetName() { + return + } + } + + for _, desc := range clusterServiceVersion.Spec.WebhookDefinitions { + if desc.Type != v1alpha1.ConversionWebhook || len(desc.ConversionCRDs) == 0 { + continue + } + + for i, crdName := range desc.ConversionCRDs { + crd, err := a.lister.APIExtensionsV1().CustomResourceDefinitionLister().Get(crdName) + if err != nil { + logger.Errorf("error getting CRD %v which was defined in CSVs spec.WebhookDefinition[%d]: %v\n", crdName, i, err) + continue + } + + copy := crd.DeepCopy() + copy.Spec.Conversion.Strategy = apiextensionsv1.NoneConverter + copy.Spec.Conversion.Webhook = nil + + if _, err = a.opClient.ApiextensionsInterface().ApiextensionsV1().CustomResourceDefinitions().Update(context.TODO(), copy, metav1.UpdateOptions{}); err != nil { + logger.Errorf("error updating conversion strategy for CRD %v: %v\n", crdName, err) + } + } + } } func (a *Operator) removeDanglingChildCSVs(csv *v1alpha1.ClusterServiceVersion) error { diff --git a/staging/operator-lifecycle-manager/test/e2e/csv_e2e_test.go b/staging/operator-lifecycle-manager/test/e2e/csv_e2e_test.go index dd91e37ab0..b24f06649e 100644 --- a/staging/operator-lifecycle-manager/test/e2e/csv_e2e_test.go +++ b/staging/operator-lifecycle-manager/test/e2e/csv_e2e_test.go @@ -4446,6 +4446,9 @@ func findLastEvent(events *corev1.EventList) (event corev1.Event) { func buildCSVCleanupFunc(c operatorclient.ClientInterface, crc versioned.Interface, csv operatorsv1alpha1.ClusterServiceVersion, namespace string, deleteCRDs, deleteAPIServices bool) cleanupFunc { return func() { err := crc.OperatorsV1alpha1().ClusterServiceVersions(namespace).Delete(context.TODO(), csv.GetName(), metav1.DeleteOptions{}) + if err != nil && apierrors.IsNotFound(err) { + err = nil + } Expect(err).ShouldNot(HaveOccurred()) if deleteCRDs { diff --git a/staging/operator-lifecycle-manager/test/e2e/webhook_e2e_test.go b/staging/operator-lifecycle-manager/test/e2e/webhook_e2e_test.go index d746117af7..7d04d6ec56 100644 --- a/staging/operator-lifecycle-manager/test/e2e/webhook_e2e_test.go +++ b/staging/operator-lifecycle-manager/test/e2e/webhook_e2e_test.go @@ -638,6 +638,7 @@ var _ = Describe("CSVs with a Webhook", func() { }).Should(Equal(2)) }) When("Installed from a catalog Source", func() { + const csvName = "webhook-operator.v0.0.1" var cleanupCSV cleanupFunc var cleanupCatSrc cleanupFunc var cleanupSubscription cleanupFunc @@ -683,7 +684,7 @@ var _ = Describe("CSVs with a Webhook", func() { defer cleanupSubscription() // Wait for webhook-operator v2 csv to succeed - csv, err := awaitCSV(crc, testNamespace, "webhook-operator.v0.0.1", csvSucceededChecker) + csv, err := awaitCSV(crc, source.GetNamespace(), csvName, csvSucceededChecker) require.NoError(GinkgoT(), err) cleanupCSV = buildCSVCleanupFunc(c, crc, *csv, testNamespace, true, true) @@ -769,6 +770,25 @@ var _ = Describe("CSVs with a Webhook", func() { require.True(GinkgoT(), ok, "Unable to get spec.conversion.valid of v2 object") require.True(GinkgoT(), v2SpecConversionMutate) require.True(GinkgoT(), v2SpecConversionValid) + + // Check that conversion strategies are disabled after uninstalling the operator. + err = crc.OperatorsV1alpha1().ClusterServiceVersions(testNamespace).Delete(context.TODO(), csvName, metav1.DeleteOptions{}) + require.NoError(GinkgoT(), err) + + Eventually(func() error { + crd, err := c.ApiextensionsInterface().ApiextensionsV1().CustomResourceDefinitions().Get(context.TODO(), "webhooktests.webhook.operators.coreos.io", metav1.GetOptions{}) + if err != nil { + return err + } + + if crd.Spec.Conversion.Strategy != apiextensionsv1.NoneConverter { + return fmt.Errorf("conversion strategy is not NoneConverter") + } + if crd.Spec.Conversion.Webhook != nil { + return fmt.Errorf("webhook is not nil") + } + return nil + }).Should(Succeed()) }) }) When("WebhookDescription has conversionCRDs field", func() { diff --git a/vendor/github.com/operator-framework/operator-lifecycle-manager/pkg/controller/operators/olm/operator.go b/vendor/github.com/operator-framework/operator-lifecycle-manager/pkg/controller/operators/olm/operator.go index af6b45cc3a..54e54d9f3f 100644 --- a/vendor/github.com/operator-framework/operator-lifecycle-manager/pkg/controller/operators/olm/operator.go +++ b/vendor/github.com/operator-framework/operator-lifecycle-manager/pkg/controller/operators/olm/operator.go @@ -11,6 +11,7 @@ import ( admissionregistrationv1 "k8s.io/api/admissionregistration/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" extinf "k8s.io/apiextensions-apiserver/pkg/client/informers/externalversions" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" @@ -1101,6 +1102,46 @@ func (a *Operator) handleClusterServiceVersionDeletion(obj interface{}) { logger.WithError(err).Warnf("failed to requeue gc event: %v", webhook) } } + + // Conversion webhooks are defined within a CRD. + // In an effort to prevent customer dataloss, OLM does not delete CRDs associated with a CSV when it is deleted. + // Deleting a CSV that introduced a conversion webhook removes the deployment that serviced the conversion webhook calls. + // If a conversion webhook is defined and the service isn't available, all requests against the CR associated with the CRD will fail. + // This ultimately breaks kubernetes garbage collection and prevents OLM from reinstalling the CSV as CR validation against the new CRD's + // openapiv3 schema fails. + // As such, when a CSV is deleted OLM will check if it is being replaced. If the CSV is not being replaced, OLM will remove the conversion + // webhook from the CRD definition. + csvs, err := a.lister.OperatorsV1alpha1().ClusterServiceVersionLister().ClusterServiceVersions(clusterServiceVersion.GetNamespace()).List(labels.Everything()) + if err != nil { + logger.Errorf("error listing csvs: %v\n", err) + } + for _, csv := range csvs { + if csv.Spec.Replaces == clusterServiceVersion.GetName() { + return + } + } + + for _, desc := range clusterServiceVersion.Spec.WebhookDefinitions { + if desc.Type != v1alpha1.ConversionWebhook || len(desc.ConversionCRDs) == 0 { + continue + } + + for i, crdName := range desc.ConversionCRDs { + crd, err := a.lister.APIExtensionsV1().CustomResourceDefinitionLister().Get(crdName) + if err != nil { + logger.Errorf("error getting CRD %v which was defined in CSVs spec.WebhookDefinition[%d]: %v\n", crdName, i, err) + continue + } + + copy := crd.DeepCopy() + copy.Spec.Conversion.Strategy = apiextensionsv1.NoneConverter + copy.Spec.Conversion.Webhook = nil + + if _, err = a.opClient.ApiextensionsInterface().ApiextensionsV1().CustomResourceDefinitions().Update(context.TODO(), copy, metav1.UpdateOptions{}); err != nil { + logger.Errorf("error updating conversion strategy for CRD %v: %v\n", crdName, err) + } + } + } } func (a *Operator) removeDanglingChildCSVs(csv *v1alpha1.ClusterServiceVersion) error {