Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pkg/controller/machine/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

configv1 "github.com/openshift/api/config/v1"
machinev1 "github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1"
"github.com/openshift/machine-api-operator/pkg/metrics"
"github.com/openshift/machine-api-operator/pkg/util"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
Expand Down Expand Up @@ -430,6 +431,7 @@ func isInvalidMachineConfigurationError(err error) bool {
func (r *ReconcileMachine) setPhase(machine *machinev1.Machine, phase string, errorMessage string) error {
if stringPointerDeref(machine.Status.Phase) != phase {
klog.V(3).Infof("%v: going into phase %q", machine.GetName(), phase)

// A call to Patch will mutate our local copy of the machine to match what is stored in the API.
// Before we make any changes to the status subresource on our local copy, we need to patch the object first,
// otherwise our local changes to the status subresource will be lost.
Expand All @@ -454,6 +456,15 @@ func (r *ReconcileMachine) setPhase(machine *machinev1.Machine, phase string, er
klog.Errorf("Failed to update machine status %q: %v", machine.GetName(), err)
return err
}

// Update the metric after everything else has succeeded to prevent duplicate
// entries when there are failures
if phase != phaseDeleting {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should protect against duplicate calls to the same phase so we don't spoil this metric.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had considered this and thought it to be covered by L#432, we only update the metric on changes to the phase right? Do you think there's some extra nuance or case that needs to be covered here?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, I missed that line. We should probably refactor this function to quit early instead of putting everything in this giant if statement.

One thing to consider is protecting against empty phase.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nope, I missed that line. We should probably refactor this function to quit early instead of putting everything in this giant if statement.

Ack, that would be sensible!

One thing to consider is protecting against empty phase.

How do you mean here, as in if it were empty then skipped ahead to eg Running? Wondering when this could happen, only when the status is blanked somehow? Maybe on creation of master machines during IPI? How would you suggest we protect here? Should we only record "" to "provisioning" and ignore "" to anything else?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, probably outside the scope of this patch set. Perhaps we need a validatePhaseTransition() function to ensure we're doing the right thing here.

I'm confident the code will work as-is today, I'm worried about protecting against bugs in the future. We can probably just make another card for this, I think we can ship as-is.

// Apart from deleting, update the transition metric
// Deleting would always end up in the infinite bucket
timeElapsed := time.Now().Sub(machine.GetCreationTimestamp().Time).Seconds()
metrics.MachinePhaseTransitionSeconds.With(map[string]string{"phase": phase}).Observe(timeElapsed)
}
}
return nil
}
Expand Down
13 changes: 13 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,21 @@ var (
)
)

// Metrics for use in the Machine controller
var (
// MachinePhaseTransitionSeconds is a metric to capute the time between a Machine being created and entering a particular phase
MachinePhaseTransitionSeconds = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "mapi_machine_phase_transition_seconds",
Help: "Number of seconds between Machine creation and Machine transition to a phase.",
Buckets: []float64{5, 10, 20, 30, 60, 90, 120, 180, 240, 300, 360, 480, 600},
}, []string{"phase"},
)
)

func init() {
prometheus.MustRegister(MachineCollectorUp)
metrics.Registry.MustRegister(MachinePhaseTransitionSeconds)
metrics.Registry.MustRegister(
failedInstanceCreateCount,
failedInstanceUpdateCount,
Expand Down