From d0b6bf6f5c07c3c1568ecedf1b4aa3d4597fc674 Mon Sep 17 00:00:00 2001 From: Ryan King Date: Fri, 28 Jan 2022 16:11:52 -0500 Subject: [PATCH 1/7] add auto-pruning EP Signed-off-by: Ryan King --- enhancements/automatic-resource-pruning.md | 196 +++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 enhancements/automatic-resource-pruning.md diff --git a/enhancements/automatic-resource-pruning.md b/enhancements/automatic-resource-pruning.md new file mode 100644 index 000000000..3a5bdcf91 --- /dev/null +++ b/enhancements/automatic-resource-pruning.md @@ -0,0 +1,196 @@ +--- +title: Automatic Resource Pruning +authors: + - "@ryantking" +reviewers: + - "@jmrodri" + - "@gallettilance" +approvers: + - "@jmrodri" +creation-date:2022-01-28 +last-updated: 2022-01-28 +status: implementable +--- + +## Summary + +This EP aims to provide an easy way for users to operator authors to limit the number of ephemeral resources that may +exist on the cluster at any point in time. In this context, we define an ephemeral resource as a short-lived resource +that is continuously created as the operator runs. Short-lived does not refer to a specific amount of time, rather, the +fact that the resource has a defined life span. For example, a web server running in a `Pod` is not ephemeral because it +will run until an external force such as a cluster administrator or CD pipeline acts on it while a log rotating script +running in a `Pod` is ephemeral since it will run for until it finishes its defined work. Operator authors will be able +to employ different strategies to limit the number of ephemeral resources such as adding a numerical limit, an age +limit, or something custom to the author's use case. + +## Motivation + +Often, operators will create ephemeral resources during execution. 
For example, if we imagine an operator that +implements Kubernetes' builtin `CronJob` functionality, every time the operator reconciles and finds a `CronJos` to run, +it creates a new `Job` type to represent a single execution of the `CronJob`. Users will often want to have access to +theses ephemeral resources in order to view historical data, but want to limit the number that can exist on the system. +Looking again at Kubernetes out-of-the-box functionalities, users can configure the retention policy for resources such +as `ReplicaSets` and `Jobs` to maintain a certain amount of historical data. Operator authors should have a defined path +for implementing the same functionality for their operators. + +### Goals + +- Add a library to [operator-lib](https://github.com/operator-framework/operator-lib) that houses this functionality + with a user-friendly API. +- Add a happy path for what we determine to be common use cases, such as removing `Pods` in a finished state if they are + older than a set age. +- Provide an easy way for operator authors to plug in custom logic for their custom resource types and use cases. + +### Non-Goals + +- Add auto-pruning to any of the templates or scaffolding functionality. +- Adding auto-pruning support for Helm or Ansible operators. + +## Proposal + +The proposed implementation is adding a package, `prune`, to +[operator-lib](https://github.com/operator-framework/operator-lib) that exposes this functionality. There will be a +primary entry point function that takes in configuration and prunes resources accordingly. The configuration will accept +one or many resource types, a pruning strategy, namespaces, label selectors, and other common settings such as a dry run +mode, hooks, and logging configuration. + +Another aspect of the library will be determining when it can and cannot prune a resource. For example, the prune +functionality should not remove a running `Pod` until it has completed, even if it meets the strategy criteria. 
The +library will expose a way for operator authors to specify what criteria makes a resource of a specific type safe to +prune; this can include checking annotations, a status, or any other data available on the resource. An important +distinction to draw is the difference between a strategy and checking whether it is safe to prune a resource (henceforth +called the "is-pruneable" functionality). Strategies look generically at collections of resources and decide which +resources in the collection to prune, if any. They should only take criteria common to all Kubernetes resources into +account such as the count of resources and creation timestamp. The is-pruneable functionality conversely looks at one +and only one resource at a given point in time to determine whether or not the current prune task can remove a resource +based on its specific data. + +Note that stategies will not be programmatically limited to being resource agnostic, but it will be a defined best +practice to write strategies in such a way. One exception to this recommendation will be when the operator author wants +to prune based on a cumulative value such as the summation of a field across multiple resources. The operator author +must then be sure to add safe guards to the strategy to avoid unexpected behavior if used with an incompatible resource. + +### User Stories + +#### Story 1 + +An an operator author, I want to limit the number of `Jobs` in a completed state on the cluster so that the long term +storage cost of my operator has a cap. + +#### Story 2 + +As an operator author, I want to limit the number of `Pods` in a completed state on a the cluster and preserve the +`Pods` in an error state so that the long term storage cost of my operator has a soft cap and the user can see status +information about failed `Pods` before manually removing. 
+ +#### Story 3 + +As an operator author, I want to limit the number of `Pods`with a custom heuristic based on the creation timestamp so +that the long term storage cost of my operator has a cap based on my operator's logic. + +#### Story 4 + +As an operator author, I want to prune a custom resources with specific status information when there is a certain +number so that the long term storage cost of my operator has a cap. + +#### Story 5 + +As an operator author, I want to prune both `Jobs` and `Pods` of a certain age so that the long term storage cost of my +operator has a cap and there are no orphaned resources. + +### Implementation Details + +- A strategy is a function that takes in a collection of resources and returns a collection of resources to remove. +- The identifier for a resource types will be `GroupVersionKind` value. +- An is-pruneable function takes in one instance of a resource and returns an error value that indicates if it is safe + to prune or has other issues. +- The library will provide built-in is-pruneable functions for `Pods` and `Jobs` that can be overwritten. +- A registry will hold a mapping of resource types (`GVKs`) to is-pruneable functions. + +A proposed go API is in [Appendix A](#appendix-a). + +### Risks and Mitigations + +The primary risk associated with this EP is exposing too many knobs and features in the day 1 implementation. We can +mitigate this by only exposing functionality that is absolutely needed. APIs are easy to grow, but near-impossible to +shrink. + +## Design Details + +### Test Plan + +The following components will be unit tested: + +- The builtin strategies. +- The builtin is-prunable functions. +- The main prune routine. + +The feature author will add an integration test suite that runs the prune routine in the use cases defined in the user +stories. 
+ +## Implementation History + +[operator-framework/operator-lib#75](https://github.com/operator-framework/operator-lib/pull/75): Implements a first +pass of the prune package with only support for `Jobs` and `Pods`. The API is also slightly different than the one +proposed in this EP. + +## Drawbacks + +The user will need to manually integrate this functionality into their operator since it is a library. + +## Alternatives + +An alternative approach would be adding this logic to the core SDK and scaffolding it optionally during operation +generation. The primary drawbacks with this approach are the increased complexity to the implementation and adding it to +existing operators. + +## Open Questions + +- What are the predefined use cases that we want to support? Currently we support pruning completed `Jobs` and `Pods` by + age and max count. + +### Implementation-specific + +- What type of Kubernetes object should we generically work with? E.g. `metav1.Object`or `runtime.Object`? +- How do we specify which Kubernetes objects to delete? Pass back another list of objects? We just need name, namespace, + and `GVK`. +- Which Kubernetes client should we work with? Dynamic client due to custom resource types? +- Should we register `IsPruneable` functions or a `ResourceConfig` structure that will hold that function and + potentially additional configuration. + +## Appendix A + +The following is the proposed Go API: + +```go +// StrategyFunc takes a list of resources and returns the subset to prune. +type StrategyFunc func(ctx context.Context, objs []runtime.Object) ([]runtime.Object, error) + +// ErrUnpruneable indicates that it is not allowed to prune a specific object. +type ErrUnpruneable struct { + Obj *runtime.Object + Reason string +} + +// IsPruneableFunc is a function that checks a the data of an object to see whether or not it is safe to prune it. +// It should return `nil` if it is safe to prune, `ErrUnpruneable` if it is unsafe, or another error. 
+// It should safely assert the object is the expected type, otherwise it might panic. +type IsPruneableFunc func(obj *runtime.Object) error + +// RegisterIsPruneableFunc registers a function to check whether it is safe to prune a resources of a certain type. +func RegisterIsPrunableFunc(gvk schema.GroupVersionKind, isPruneable IsPruneableFunc) { /* ... */ } + +// Pruner is an object that runs a prune job. +type Pruner struct { + // ... +} + +// PrunerOption configures the pruner. +type PrunerOption func(p *Pruner) + +// NewPruner returns a pruner that uses the given startegy to prune objects. +func NewPruner(client dynamic.Interface, opts ...PrunerOption) Pruner { return Pruner{} } + +// Prune runs the pruner. +func (p Pruner) Prune(ctx Context) error { return nil } +``` From c74b8d975359726ae446ea4b55457bcffac300c6 Mon Sep 17 00:00:00 2001 From: Ryan King Date: Mon, 31 Jan 2022 15:29:56 -0500 Subject: [PATCH 2/7] update based on PR comments - Swap word "ephemeral" out for "unbounded" - Add alternative Go API proposal. 
Signed-off-by: Ryan King --- enhancements/automatic-resource-pruning.md | 142 ++++++++++++++++----- 1 file changed, 107 insertions(+), 35 deletions(-) diff --git a/enhancements/automatic-resource-pruning.md b/enhancements/automatic-resource-pruning.md index 3a5bdcf91..e7df24748 100644 --- a/enhancements/automatic-resource-pruning.md +++ b/enhancements/automatic-resource-pruning.md @@ -1,34 +1,41 @@ --- -title: Automatic Resource Pruning +title: automatic-resource-pruning authors: - - "@ryantking" + - '@ryantking' reviewers: - - "@jmrodri" - - "@gallettilance" + - '@jmrodri' + - '@gallettilance' + - '@fgiloux' + - '@joelanford' approvers: - - "@jmrodri" -creation-date:2022-01-28 -last-updated: 2022-01-28 + - '@jmrodri' +creation-date: 2022-01-28 +last-updated: 2022-01-31 status: implementable --- +# Automatic Resource Pruning + ## Summary -This EP aims to provide an easy way for users to operator authors to limit the number of ephemeral resources that may -exist on the cluster at any point in time. In this context, we define an ephemeral resource as a short-lived resource -that is continuously created as the operator runs. Short-lived does not refer to a specific amount of time, rather, the -fact that the resource has a defined life span. For example, a web server running in a `Pod` is not ephemeral because it -will run until an external force such as a cluster administrator or CD pipeline acts on it while a log rotating script -running in a `Pod` is ephemeral since it will run for until it finishes its defined work. Operator authors will be able -to employ different strategies to limit the number of ephemeral resources such as adding a numerical limit, an age -limit, or something custom to the author's use case. +This EP will provide a way for operator authors to limit the number of unbounded resources that may exist on the +cluster. In the context of this EP, we define an "unbounded resource" as any resource that has the following two +properties: + +1. 
The resource is continuously created as the operator runs. +2. The resource is not removed as the operator runs. + +When those two properties exist for a resource, the total number of that resource will grow unbounded. The functionality +introduced by this EP will make it possible for operator authors to easily control the entire life cycle of a resource +by automatically removing resources that meet specific criteria. These criteria can include properties of individual +resources, the entire set of resources, or a combination of both. ## Motivation -Often, operators will create ephemeral resources during execution. For example, if we imagine an operator that -implements Kubernetes' builtin `CronJob` functionality, every time the operator reconciles and finds a `CronJos` to run, +Often, operators will create unbounded resources during execution. For example, if we imagine an operator that +implements Kubernetes' builtin `CronJob` functionality, every time the operator reconciles and finds a `CronJo` to run, it creates a new `Job` type to represent a single execution of the `CronJob`. Users will often want to have access to -theses ephemeral resources in order to view historical data, but want to limit the number that can exist on the system. +theses unbounded resources in order to view historical data, but want to limit the number that can exist on the system. Looking again at Kubernetes out-of-the-box functionalities, users can configure the retention policy for resources such as `ReplicaSets` and `Jobs` to maintain a certain amount of historical data. Operator authors should have a defined path for implementing the same functionality for their operators. @@ -37,8 +44,8 @@ for implementing the same functionality for their operators. - Add a library to [operator-lib](https://github.com/operator-framework/operator-lib) that houses this functionality with a user-friendly API. 
-- Add a happy path for what we determine to be common use cases, such as removing `Pods` in a finished state if they are - older than a set age. +- Add a happy path for what we determine to be common use cases, such as removing operator-owned `Pods` that are both in + a finished state and older than a certain age. - Provide an easy way for operator authors to plug in custom logic for their custom resource types and use cases. ### Non-Goals @@ -74,29 +81,34 @@ must then be sure to add safe guards to the strategy to avoid unexpected behavio #### Story 1 -An an operator author, I want to limit the number of `Jobs` in a completed state on the cluster so that the long term -storage cost of my operator has a cap. +As an operator author, I want to limit the number of operator-owned `Jobs` in a completed state on the cluster so that +the long term storage cost of my operator has a cap. #### Story 2 -As an operator author, I want to limit the number of `Pods` in a completed state on a the cluster and preserve the -`Pods` in an error state so that the long term storage cost of my operator has a soft cap and the user can see status -information about failed `Pods` before manually removing. +As an operator author, I want to limit the number of operator-owned `Pods` in a completed state on the cluster and +preserve the `Pods` in an error state so that the long term storage cost of my operator has a soft cap and the user can +see status information about failed `Pods` before manually removing. #### Story 3 -As an operator author, I want to limit the number of `Pods`with a custom heuristic based on the creation timestamp so -that the long term storage cost of my operator has a cap based on my operator's logic. +As an operator author, I want to limit the number of operator-owned `Pods` with a custom heuristic based on the creation +timestamp so that the long term storage cost of my operator has a cap based on my operator's logic.
#### Story 4 -As an operator author, I want to prune a custom resources with specific status information when there is a certain -number so that the long term storage cost of my operator has a cap. +As an operator author, I want to prune a custom resource with specific state information when there is a certain number +so that the long term storage cost of my operator has a cap. #### Story 5 -As an operator author, I want to prune both `Jobs` and `Pods` of a certain age so that the long term storage cost of my -operator has a cap and there are no orphaned resources. +As an operator author, I want to prune both operator-owned `Jobs` and `Pods` of a certain age so that the long term +storage cost of my operator has a cap and there are no orphaned resources. + +#### Story 6 + +As an operator author, I want to prune a custom resource with specific state information with a custom strategy so that +the long term storage cost of my operator has a cap. ### Implementation Details @@ -107,7 +119,7 @@ operator has a cap and there are no orphaned resources. - The library will provide built-in is-pruneable functions for `Pods` and `Jobs` that can be overwritten. - A registry will hold a mapping of resource types (`GVKs`) to is-pruneable functions. -A proposed go API is in [Appendix A](#appendix-a). +There are two proposed go APIis in [Appendix A](#appendix-a) and [Appendix B](#appendix-b). ### Risks and Mitigations @@ -172,6 +184,9 @@ type ErrUnpruneable struct { Reason string } +// Error returns a string reprenstation of an `ErrUnpruneable` error. +func (e *ErrUnpruneable) Error() string { return "" } + // IsPruneableFunc is a function that checks a the data of an object to see whether or not it is safe to prune it. // It should return `nil` if it is safe to prune, `ErrUnpruneable` if it is unsafe, or another error. // It should safely assert the object is the expected type, otherwise it might panic. 
@@ -188,9 +203,66 @@ type Pruner struct { // PrunerOption configures the pruner. type PrunerOption func(p *Pruner) -// NewPruner returns a pruner that uses the given startegy to prune objects. -func NewPruner(client dynamic.Interface, opts ...PrunerOption) Pruner { return Pruner{} } +// NewPruner returns a pruner that uses the given strategy to prune objects. +func NewPruner(client dynamic.Interface, opts ...func(p *Pruner)) Pruner { return Pruner{} } + +// Prune runs the pruner. +func (p Pruner) Prune(ctx context.Context) error { return nil } +``` + +## Appendix B + +The following is a more modular Go API that can support more custom resource functionality in the future: + +```go +// StrategyFunc takes a list of resources and returns the subset to prune. +type StrategyFunc func(ctx context.Context, objs []runtime.Object) ([]runtime.Object, error) + +// ErrUnpruneable indicates that it is not allowed to prune a specific object. +type ErrUnpruneable struct { + Obj *runtime.Object + Reason string +} + +// Error returns a string reprenstation of an `ErrUnpruneable` error. +func (e *ErrUnpruneable) Error() string { return "" } + +// Registration holds information about a resource with how it should be +type Registration struct { + // IsPruneable is a function that checks the data of an object to check whether or not it is eligible for pruning. + // It should return `nil` if it is eligible to prune, `ErrUnpruneable` if it is unsafe, or another error. + // It should safely assert the object is the expected type, otherwise it might panic. + IsPruneable func(obj *runtime.Object) error +} + +// Registry holds configuration about specific resource types. +type Registry struct { + // ... +} + +// Register adds a resource to the registry. +func (r *Registry) Register(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } + +// Get returns a resource registered with the given GVK. 
+func (r *Registry) Get(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } + +// Register adds a resource to the default registry. +func Register(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } + +// Get returns a resource registered in the default registry with the given GVK. +func Get(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } + +// WithIsPruneable adds a function to the resource registration. +func WithIsPruneable(func (obj *runtime.Object) error) func (*Registration) { return nil } + +// Pruner is an object that runs a prune job. +type Pruner struct { + // ... +} + +// NewPruner returns a pruner that uses the given strategy to prune objects. +func NewPruner(client dynamic.Interface, opts ...func(p *Pruner)) Pruner { return Pruner{} } // Prune runs the pruner. -func (p Pruner) Prune(ctx Context) error { return nil } +func (p Pruner) Prune(ctx context.Context) error { return nil } ``` From 6b88da287e0ea9325732f1a857c6cbf29a9f479f Mon Sep 17 00:00:00 2001 From: Ryan King Date: Tue, 1 Feb 2022 14:37:41 -0500 Subject: [PATCH 3/7] fix typo Signed-off-by: Ryan King --- enhancements/automatic-resource-pruning.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/enhancements/automatic-resource-pruning.md b/enhancements/automatic-resource-pruning.md index e7df24748..97b37bc88 100644 --- a/enhancements/automatic-resource-pruning.md +++ b/enhancements/automatic-resource-pruning.md @@ -10,7 +10,7 @@ reviewers: approvers: - '@jmrodri' creation-date: 2022-01-28 -last-updated: 2022-01-31 +last-updated: 2022-02-01 status: implementable --- @@ -33,12 +33,12 @@ resources, the entire set of resources, or a combination of both. ## Motivation Often, operators will create unbounded resources during execution. 
For example, if we imagine an operator that -implements Kubernetes' builtin `CronJob` functionality, every time the operator reconciles and finds a `CronJo` to run, -it creates a new `Job` type to represent a single execution of the `CronJob`. Users will often want to have access to -theses unbounded resources in order to view historical data, but want to limit the number that can exist on the system. -Looking again at Kubernetes out-of-the-box functionalities, users can configure the retention policy for resources such -as `ReplicaSets` and `Jobs` to maintain a certain amount of historical data. Operator authors should have a defined path -for implementing the same functionality for their operators. +implements Kubernetes' builtin `CronJob` functionality, every time the operator reconciles and finds a `CronJob` to +run, it creates a new `Job` type to represent a single execution of the `CronJob`. Users will often want to have access +to these unbounded resources in order to view historical data, but want to limit the number that can exist on the +system. Looking again at Kubernetes out-of-the-box functionalities, users can configure the retention policy for +resources such as `ReplicaSets` and `Jobs` to maintain a certain amount of historical data. Operator authors should have +a defined path for implementing the same functionality for their operators. ### Goals From f5db3871059bc2cfb05915361418f0732ebd7bc6 Mon Sep 17 00:00:00 2001 From: Ryan King Date: Tue, 1 Feb 2022 14:43:41 -0500 Subject: [PATCH 4/7] add open question Signed-off-by: Ryan King --- enhancements/automatic-resource-pruning.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/enhancements/automatic-resource-pruning.md b/enhancements/automatic-resource-pruning.md index 97b37bc88..4808ac1a8 100644 --- a/enhancements/automatic-resource-pruning.md +++ b/enhancements/automatic-resource-pruning.md @@ -160,6 +160,7 @@ existing operators.
- What are the predefined use cases that we want to support? Currently we support pruning completed `Jobs` and `Pods` by age and max count. +- Should we mandate that an author must register any resource type that they wish to prune? ### Implementation-specific @@ -215,9 +216,6 @@ func (p Pruner) Prune(ctx context.Context) error { return nil } The following is a more modular Go API that can support more custom resource functionality in the future: ```go -// StrategyFunc takes a list of resources and returns the subset to prune. -type StrategyFunc func(ctx context.Context, objs []runtime.Object) ([]runtime.Object, error) - // ErrUnpruneable indicates that it is not allowed to prune a specific object. type ErrUnpruneable struct { Obj *runtime.Object @@ -240,11 +238,11 @@ type Registry struct { // ... } -// Register adds a resource to the registry. -func (r *Registry) Register(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } +// RegisterResource adds a resource to the registry. +func (r *Registry) RegisterResource(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } -// Get returns a resource registered with the given GVK. -func (r *Registry) Get(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } +// GetRegistration returns a resource registered with the given GVK. +func (r *Registry) GetRegistration(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } // Register adds a resource to the default registry. func Register(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } @@ -255,13 +253,16 @@ func Get(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false // WithIsPruneable adds a function to the resource registration. func WithIsPruneable(func (obj *runtime.Object) error) func (*Registration) { return nil } +// StrategyFunc takes a list of resources and returns the subset to prune. 
+type StrategyFunc func(ctx context.Context, objs []runtime.Object) ([]runtime.Object, error) + // Pruner is an object that runs a prune job. type Pruner struct { // ... } // NewPruner returns a pruner that uses the given strategy to prune objects. -func NewPruner(client dynamic.Interface, opts ...func(p *Pruner)) Pruner { return Pruner{} } +func NewPruner(client dynamic.Interface, strategy StrategyFunc, opts ...func(p *Pruner)) Pruner { return Pruner{} } // Prune runs the pruner. func (p Pruner) Prune(ctx context.Context) error { return nil } From cf5e9aa0ac34b1da39aa660128291fd5ed969e29 Mon Sep 17 00:00:00 2001 From: Ryan King Date: Tue, 1 Feb 2022 14:53:21 -0500 Subject: [PATCH 5/7] fix function names Signed-off-by: Ryan King --- enhancements/automatic-resource-pruning.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/enhancements/automatic-resource-pruning.md b/enhancements/automatic-resource-pruning.md index 4808ac1a8..bf0b0d542 100644 --- a/enhancements/automatic-resource-pruning.md +++ b/enhancements/automatic-resource-pruning.md @@ -239,16 +239,16 @@ type Registry struct { } // RegisterResource adds a resource to the registry. -func (r *Registry) RegisterResource(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } +func (r *Registry) Register(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } // GetRegistration returns a resource registered with the given GVK. -func (r *Registry) GetRegistration(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } +func (r *Registry) Get(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } -// Register adds a resource to the default registry. -func Register(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } +// RegisterResource adds a resource to the default registry. +func RegisterResource(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... 
*/ } -// Get returns a resource registered in the default registry with the given GVK. -func Get(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } +// GetResource returns a resource registered in the default registry with the given GVK. +func GetResource(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } // WithIsPruneable adds a function to the resource registration. func WithIsPruneable(func (obj *runtime.Object) error) func (*Registration) { return nil } From 687d6185035f2bf5acca8397f55f99d9513bee9d Mon Sep 17 00:00:00 2001 From: Ryan King Date: Tue, 29 Mar 2022 10:39:13 -0400 Subject: [PATCH 6/7] Add additional alternative proposal --- enhancements/automatic-resource-pruning.md | 52 +++++++++++++++------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/enhancements/automatic-resource-pruning.md b/enhancements/automatic-resource-pruning.md index bf0b0d542..28ecfbc75 100644 --- a/enhancements/automatic-resource-pruning.md +++ b/enhancements/automatic-resource-pruning.md @@ -10,7 +10,7 @@ reviewers: approvers: - '@jmrodri' creation-date: 2022-01-28 -last-updated: 2022-02-01 +last-updated: 2022-03-29 status: implementable --- @@ -118,8 +118,12 @@ the long term storage cost of my operator has a cap. to prune or has other issues. - The library will provide built-in is-pruneable functions for `Pods` and `Jobs` that can be overwritten. - A registry will hold a mapping of resource types (`GVKs`) to is-pruneable functions. +- The library will use `client.Object` from controller-runtime to reference Kubernetes objects since it includes both + `metav1.Object` and `runtime.Object`. +- The library will perform all Kubernetes operations with a dynamic client to support custom resources. -There are two proposed go APIis in [Appendix A](#appendix-a) and [Appendix B](#appendix-b). +There are two proposed go APIis in [Appendix A](#appendix-a) and [Appendix B](#appendix-b). 
This design will go with the +first API due to its simplicity. ### Risks and Mitigations @@ -152,24 +156,19 @@ The user will need to manually integrate this functionality into their operator ## Alternatives +### Scaffolding + An alternative approach would be adding this logic to the core SDK and scaffolding it optionally during operation generation. The primary drawbacks with this approach are the increased complexity to the implementation and adding it to existing operators. -## Open Questions - -- What are the predefined use cases that we want to support? Currently we support pruning completed `Jobs` and `Pods` by - age and max count. -- Should we mandate that an author must register any resource type that they wish to prune? +### Separate Operator -### Implementation-specific - -- What type of Kubernetes object should we generically work with? E.g. `metav1.Object`or `runtime.Object`? -- How do we specify which Kubernetes objects to delete? Pass back another list of objects? We just need name, namespace, - and `GVK`. -- Which Kubernetes client should we work with? Dynamic client due to custom resource types? -- Should we register `IsPruneable` functions or a `ResourceConfig` structure that will hold that function and - potentially additional configuration. +Another alternative would be exposing a set of pruning APIs that would configure an operator that handles all resource +pruning. See [Appendix C](#appendix-c) for an example as to what the spec for a pruning CRD could look like. The +advantage to this approach is that configuring pruning logic would be simple since the user would only have to add one +resource to the cluster to enable pruning. The major disadvantage would be any operator that relies on or encourages +pruning would add a dependent operator that must me managed. ## Appendix A @@ -267,3 +266,26 @@ func NewPruner(client dynamic.Interface, strategy StrategyFunc, opts ...func(p * // Prune runs the pruner. 
func (p Pruner) Prune(ctx context.Context) error { return nil } ``` + +## Appendix C + +The following is an example of what a prune CRD could look like: + +```go +spec: + objects: + - group: example.com + version: v1 + kind: MyKind + matchers: + - selector: + matchLabels: + example.com/is-completed: true + - maxAge: 10h + default: + matchers: + - selector: + matchLabels: + example.com/is-completed: true + - maxCount: 50 +``` From 02cd266673ca01c6a462ab36ae7ee8198fc12501 Mon Sep 17 00:00:00 2001 From: Ryan King Date: Tue, 3 May 2022 09:57:27 -0400 Subject: [PATCH 7/7] Remove unused API --- enhancements/automatic-resource-pruning.md | 110 ++++++--------------- 1 file changed, 28 insertions(+), 82 deletions(-) diff --git a/enhancements/automatic-resource-pruning.md b/enhancements/automatic-resource-pruning.md index 28ecfbc75..723f83e52 100644 --- a/enhancements/automatic-resource-pruning.md +++ b/enhancements/automatic-resource-pruning.md @@ -1,16 +1,16 @@ --- title: automatic-resource-pruning authors: - - '@ryantking' + - "@ryantking" reviewers: - - '@jmrodri' - - '@gallettilance' - - '@fgiloux' - - '@joelanford' + - "@jmrodri" + - "@gallettilance" + - "@fgiloux" + - "@joelanford" approvers: - - '@jmrodri' + - "@jmrodri" creation-date: 2022-01-28 -last-updated: 2022-03-29 +last-updated: 2022-05-03 status: implementable --- @@ -58,7 +58,7 @@ a defined path for implementing the same functionality for their operators. The proposed implementation is adding a package, `prune`, to [operator-lib](https://github.com/operator-framework/operator-lib) that exposes this functionality. There will be a primary entry point function that takes in configuration and prunes resources accordingly. 
The configuration will accept -one or many resource types, a pruning strategy, namespaces, label selectors, and other common settings such as a dry run +one or many resource types, a pruning strategy, namespaces, label selectors, and other common settings such as a dry run mode, hooks, and logging configuration. Another aspect of the library will be determining when it can and cannot prune a resource. For example, the prune @@ -72,7 +72,7 @@ account such as the count of resources and creation timestamp. The is-pruneable and only one resource at a given point in time to determine whether or not the current prune task can remove a resource based on its specific data. -Note that stategies will not be programmatically limited to being resource agnostic, but it will be a defined best +Note that strategies will not be programmatically limited to being resource agnostic, but it will be a defined best practice to write strategies in such a way. One exception to this recommendation will be when the operator author wants to prune based on a cumulative value such as the summation of a field across multiple resources. The operator author must then be sure to add safe guards to the strategy to avoid unexpected behavior if used with an incompatible resource. @@ -110,7 +110,7 @@ storage cost of my operator has a cap and there are no orphaned resources. As an operator author, I want to prune a custom resource with specific state information with a custom strategy so that the long term storage cost of my operator has a cap. -### Implementation Details +### Implementation Detail - A strategy is a function that takes in a collection of resources and returns a collection of resources to remove. - The identifier for a resource types will be `GroupVersionKind` value. @@ -122,8 +122,7 @@ the long term storage cost of my operator has a cap. `metav1.Object` and `runtime.Object`. - The library will perform all Kubernetes operations with a dynamic client to support custom resources. 
-There are two proposed go APIis in [Appendix A](#appendix-a) and [Appendix B](#appendix-b). This design will go with the -first API due to its simplicity. +The proposed API is in [Appendix A](#appendix-a). ### Risks and Mitigations @@ -138,7 +137,7 @@ shrink. The following components will be unit tested: - The builtin strategies. -- The builtin is-prunable functions. +- The builtin is-pruneable functions. - The main prune routine. The feature author will add an integration test suite that runs the prune routine in the use cases defined in the user @@ -150,6 +149,8 @@ stories. pass of the prune package with only support for `Jobs` and `Pods`. The API is also slightly different than the one proposed in this EP. +[operator-framework/operator-lib#105](https://github.com/operator-framework/operator-lib/pull/105): Refactor of the first implementation that uses the API designed in this EP. + ## Drawbacks The user will need to manually integrate this functionality into their operator since it is a library. @@ -165,7 +166,7 @@ existing operators. ### Separate Operator Another alternative would be exposing a set of pruning APIs that would configure an operator that handles all resource -pruning. See [Appendix C](#appendix-c) for an example as to what the spec for a pruning CRD could look like. The +pruning. See [Appendix B](#appendix-b) for an example as to what the spec for a pruning CRD could look like. The advantage to this approach is that configuring pruning logic would be simple since the user would only have to add one resource to the cluster to enable pruning. The major disadvantage would be any operator that relies on or encourages pruning would add a dependent operator that must me managed. @@ -176,21 +177,21 @@ The following is the proposed Go API: ```go // StrategyFunc takes a list of resources and returns the subset to prune. 
-type StrategyFunc func(ctx context.Context, objs []runtime.Object) ([]runtime.Object, error) +type StrategyFunc func(ctx context.Context, objs []client.Object) ([]client.Object, error) -// ErrUnpruneable indicates that it is not allowed to prune a specific object. -type ErrUnpruneable struct { - Obj *runtime.Object +// Unpruneable is an error that indicates that the pruner should not prune an object. +type Unpruneable struct { + Object *client.Object Reason string } -// Error returns a string reprenstation of an `ErrUnpruneable` error. -func (e *ErrUnpruneable) Error() string { return "" } +// Error returns a string representation of an `Unpruneable` error. +func (e *Unpruneable) Error() string { return "" } // IsPruneableFunc is a function that checks a the data of an object to see whether or not it is safe to prune it. -// It should return `nil` if it is safe to prune, `ErrUnpruneable` if it is unsafe, or another error. +// It should return `nil` if it is safe to prune, `Unpruneable` if it is unsafe, or another error. // It should safely assert the object is the expected type, otherwise it might panic. -type IsPruneableFunc func(obj *runtime.Object) error +type IsPruneableFunc func(obj client.Object) error // RegisterIsPruneableFunc registers a function to check whether it is safe to prune a resources of a certain type. func RegisterIsPrunableFunc(gvk schema.GroupVersionKind, isPruneable IsPruneableFunc) { /* ... */ } @@ -204,70 +205,15 @@ type Pruner struct { type PrunerOption func(p *Pruner) // NewPruner returns a pruner that uses the given strategy to prune objects. -func NewPruner(client dynamic.Interface, opts ...func(p *Pruner)) Pruner { return Pruner{} } - -// Prune runs the pruner. 
-func (p Pruner) Prune(ctx context.Context) error { return nil } -``` - -## Appendix B - -The following is a more modular Go API that can support more custom resource functionality in the future: - -```go -// ErrUnpruneable indicates that it is not allowed to prune a specific object. -type ErrUnpruneable struct { - Obj *runtime.Object - Reason string -} - -// Error returns a string reprenstation of an `ErrUnpruneable` error. -func (e *ErrUnpruneable) Error() string { return "" } - -// Registration holds information about a resource with how it should be -type Registration struct { - // IsPruneable is a function that checks the data of an object to check whether or not it is eligible for pruning. - // It should return `nil` if it is eligible to prune, `ErrUnpruneable` if it is unsafe, or another error. - // It should safely assert the object is the expected type, otherwise it might panic. - IsPruneable func(obj *runtime.Object) error +func NewPruner(client client.Client, gvk schema.GroupVersionKind, opts ...PrunerOption) (*Pruner, error) { + return &Pruner{}, nil } -// Registry holds configuration about specific resource types. -type Registry struct { - // ... -} - -// RegisterResource adds a resource to the registry. -func (r *Registry) Register(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } - -// GetRegistration returns a resource registered with the given GVK. -func (r *Registry) Get(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } - -// RegisterResource adds a resource to the default registry. -func RegisterResource(gvk *schema.GroupVersionKind, ...func(r *Registration)) { /* ... */ } - -// GetResource returns a resource registered in the default registry with the given GVK. -func GetResource(gvk *schema.GroupVersionKind) (*Registration, bool) { return nil, false } - -// WithIsPruneable adds a function to the resource registration. 
-func WithIsPruneable(func (obj *runtime.Object) error) func (*Registration) { return nil } - -// StrategyFunc takes a list of resources and returns the subset to prune. -type StrategyFunc func(ctx context.Context, objs []runtime.Object) ([]runtime.Object, error) - -// Pruner is an object that runs a prune job. -type Pruner struct { - // ... -} - -// NewPruner returns a pruner that uses the given strategy to prune objects. -func NewPruner(client dynamic.Interface, strategy StrategyFunc, opts ...func(p *Pruner)) Pruner { return Pruner{} } - -// Prune runs the pruner. -func (p Pruner) Prune(ctx context.Context) error { return nil } +// Prune performs a single pruning run. +func (p Pruner) Prune(ctx context.Context) ([]client.Object, error) { return nil, nil } ``` -## Appendix C +## Appendix B The following is an example of what a prune CRD could look like: