From fc6ef74eeb95a4fc30af307c562d27c7d6aee69c Mon Sep 17 00:00:00 2001 From: Aaron Lehmann Date: Tue, 27 Jun 2017 12:19:17 -0700 Subject: [PATCH] api: Use TaskStatus Err field for non-terminal errors There are some cases when a task can't advance from a particular state because preconditions are not met. For example, if no nodes meet its constraints, it will not advance to "assigned". Currently, we put a note about this in the Message field of TaskStatus, but this is not surfaced to the user. It wouldn't make sense to expose Message prominently because it usually contains uninteresting notes about how the task reached that state. Its presence does not indicate that something is wrong. Expand the scope of Err to also cover non-terminal errors that are blocking the task from progressing, and use it in those cases. Signed-off-by: Aaron Lehmann --- agent/exec/controller.go | 2 ++ api/types.pb.go | 7 ++++++- api/types.proto | 7 ++++++- manager/allocator/network.go | 8 +++++--- .../constraintenforcer/constraint_enforcer.go | 3 ++- manager/scheduler/scheduler.go | 10 ++++++---- manager/scheduler/scheduler_test.go | 16 ++++++++-------- 7 files changed, 35 insertions(+), 18 deletions(-) diff --git a/agent/exec/controller.go b/agent/exec/controller.go index 1cafb47fd1..4aef80ec97 100644 --- a/agent/exec/controller.go +++ b/agent/exec/controller.go @@ -119,6 +119,7 @@ func Resolve(ctx context.Context, task *api.Task, executor Executor) (Controller // we always want to proceed to accepted when we resolve the controller status.Message = "accepted" status.State = api.TaskStateAccepted + status.Err = "" } return ctlr, status, err @@ -158,6 +159,7 @@ func Do(ctx context.Context, task *api.Task, ctlr Controller) (*api.TaskStatus, current := status.State status.State = state status.Message = msg + status.Err = "" if current > state { panic("invalid state transition") diff --git a/api/types.pb.go b/api/types.pb.go index 2befc03577..56bb14248c 100644 --- a/api/types.pb.go +++ b/api/types.pb.go @@ -1241,12 +1241,17 @@ type TaskStatus struct { // because the task is prepared, we would put "already prepared" in this // field. Message string `protobuf:"bytes,3,opt,name=message,proto3" json:"message,omitempty"` - // Err is set if the task is in an error state. + // Err is set if the task is in an error state, or is unable to + // progress from an earlier state because a precondition is + // unsatisfied. // // The following states should report a companion error: // // FAILED, REJECTED // + // In general, messages that should be surfaced to users belong in the + // Err field, and notes on routine state transitions belong in Message. + // // TODO(stevvooe) Integrate this field with the error interface. Err string `protobuf:"bytes,4,opt,name=err,proto3" json:"err,omitempty"` // Container status contains container specific status information. diff --git a/api/types.proto b/api/types.proto index 201bc3587f..ca21bccd01 100644 --- a/api/types.proto +++ b/api/types.proto @@ -481,12 +481,17 @@ message TaskStatus { // field. string message = 3; - // Err is set if the task is in an error state. + // Err is set if the task is in an error state, or is unable to + // progress from an earlier state because a precondition is + // unsatisfied. // // The following states should report a companion error: // // FAILED, REJECTED // + // In general, messages that should be surfaced to users belong in the + // Err field, and notes on routine state transitions belong in Message. + // // TODO(stevvooe) Integrate this field with the error interface. string err = 4; diff --git a/manager/allocator/network.go b/manager/allocator/network.go index c760ad53dc..cdf3f9815a 100644 --- a/manager/allocator/network.go +++ b/manager/allocator/network.go @@ -1159,9 +1159,11 @@ func PredefinedNetworks() []networkallocator.PredefinedNetworkData { // updateTaskStatus sets TaskStatus and updates timestamp. func updateTaskStatus(t *api.Task, newStatus api.TaskState, message string) { - t.Status.State = newStatus - t.Status.Message = message - t.Status.Timestamp = ptypes.MustTimestampProto(time.Now()) + t.Status = api.TaskStatus{ + State: newStatus, + Message: message, + Timestamp: ptypes.MustTimestampProto(time.Now()), + } } // IsIngressNetwork returns whether the passed network is an ingress network. diff --git a/manager/orchestrator/constraintenforcer/constraint_enforcer.go b/manager/orchestrator/constraintenforcer/constraint_enforcer.go index 2978898ccb..7aa7651db7 100644 --- a/manager/orchestrator/constraintenforcer/constraint_enforcer.go +++ b/manager/orchestrator/constraintenforcer/constraint_enforcer.go @@ -159,7 +159,8 @@ loop: // restarting the task on another node // (if applicable). t.Status.State = api.TaskStateRejected - t.Status.Message = "assigned node no longer meets constraints" + t.Status.Message = "task rejected by constraint enforcer" + t.Status.Err = "assigned node no longer meets constraints" t.Status.Timestamp = ptypes.MustTimestampProto(time.Now()) return store.UpdateTask(tx, t) }) diff --git a/manager/scheduler/scheduler.go b/manager/scheduler/scheduler.go index 07d9b0458c..443dc00046 100644 --- a/manager/scheduler/scheduler.go +++ b/manager/scheduler/scheduler.go @@ -450,7 +450,9 @@ func (s *Scheduler) applySchedulingDecisions(ctx context.Context, schedulingDeci continue } - if t.Status.State == decision.new.Status.State && t.Status.Message == decision.new.Status.Message { + if t.Status.State == decision.new.Status.State && + t.Status.Message == decision.new.Status.Message && + t.Status.Err == decision.new.Status.Err { // No changes, ignore continue } @@ -506,7 +508,7 @@ func (s *Scheduler) taskFitNode(ctx context.Context, t *api.Task, nodeID string) if !s.pipeline.Process(&nodeInfo) { // this node cannot accommodate this task newT.Status.Timestamp = ptypes.MustTimestampProto(time.Now()) - newT.Status.Message = s.pipeline.Explain() + newT.Status.Err = s.pipeline.Explain() s.allTasks[t.ID] = &newT return &newT @@ -706,9 +708,9 @@ func (s *Scheduler) noSuitableNode(ctx context.Context, taskGroup map[string]*ap newT := *t newT.Status.Timestamp = ptypes.MustTimestampProto(time.Now()) if explanation != "" { - newT.Status.Message = "no suitable node (" + explanation + ")" + newT.Status.Err = "no suitable node (" + explanation + ")" } else { - newT.Status.Message = "no suitable node" + newT.Status.Err = "no suitable node" } s.allTasks[t.ID] = &newT schedulingDecisions[t.ID] = schedulingDecision{old: t, new: &newT} diff --git a/manager/scheduler/scheduler_test.go b/manager/scheduler/scheduler_test.go index 82a247ff18..695b0c7569 100644 --- a/manager/scheduler/scheduler_test.go +++ b/manager/scheduler/scheduler_test.go @@ -1145,7 +1145,7 @@ func TestSchedulerNoReadyNodes(t *testing.T) { defer scheduler.Stop() failure := watchAssignmentFailure(t, watch) - assert.Equal(t, "no suitable node", failure.Status.Message) + assert.Equal(t, "no suitable node", failure.Status.Err) err = s.Update(func(tx store.Tx) error { // Create a ready node. The task should get assigned to this @@ -1435,7 +1435,7 @@ func TestSchedulerResourceConstraint(t *testing.T) { defer scheduler.Stop() failure := watchAssignmentFailure(t, watch) - assert.Equal(t, "no suitable node (2 nodes not available for new tasks; insufficient resources on 1 node)", failure.Status.Message) + assert.Equal(t, "no suitable node (2 nodes not available for new tasks; insufficient resources on 1 node)", failure.Status.Err) err = s.Update(func(tx store.Tx) error { // Create a node with enough memory. The task should get @@ -1695,7 +1695,7 @@ func TestSchedulerResourceConstraintDeadTask(t *testing.T) { assert.NoError(t, err) failure := watchAssignmentFailure(t, watch) - assert.Equal(t, "no suitable node (insufficient resources on 1 node)", failure.Status.Message) + assert.Equal(t, "no suitable node (insufficient resources on 1 node)", failure.Status.Err) err = s.Update(func(tx store.Tx) error { // The task becomes dead @@ -1989,7 +1989,7 @@ func TestSchedulerCompatiblePlatform(t *testing.T) { }) assert.NoError(t, err) failure := watchAssignmentFailure(t, watch) - assert.Equal(t, "no suitable node (unsupported platform on 3 nodes)", failure.Status.Message) + assert.Equal(t, "no suitable node (unsupported platform on 3 nodes)", failure.Status.Err) // add task3 err = s.Update(func(tx store.Tx) error { @@ -2451,7 +2451,7 @@ func TestSchedulerPluginConstraint(t *testing.T) { assert.NoError(t, err) failure := watchAssignmentFailure(t, watch) - assert.Equal(t, "no suitable node (missing plugin on 1 node)", failure.Status.Message) + assert.Equal(t, "no suitable node (missing plugin on 1 node)", failure.Status.Err) // Now add the second node err = s.Update(func(tx store.Tx) error { @@ -2474,7 +2474,7 @@ func TestSchedulerPluginConstraint(t *testing.T) { assert.NoError(t, err) failure = watchAssignmentFailure(t, watch) - assert.Equal(t, "no suitable node (missing plugin on 2 nodes)", failure.Status.Message) + assert.Equal(t, "no suitable node (missing plugin on 2 nodes)", failure.Status.Err) // Now add the node3 err = s.Update(func(tx store.Tx) error { @@ -2498,7 +2498,7 @@ func TestSchedulerPluginConstraint(t *testing.T) { // check that t4 has been assigned failure2 := watchAssignmentFailure(t, watch) - assert.Equal(t, "no suitable node (missing plugin on 3 nodes)", failure2.Status.Message) + assert.Equal(t, "no suitable node (missing plugin on 3 nodes)", failure2.Status.Err) err = s.Update(func(tx store.Tx) error { assert.NoError(t, store.CreateNode(tx, n4)) @@ -2803,5 +2803,5 @@ func TestSchedulerHostPort(t *testing.T) { assert.NoError(t, err) failure := watchAssignmentFailure(t, watch) - assert.Equal(t, "no suitable node (host-mode port already in use on 2 nodes)", failure.Status.Message) + assert.Equal(t, "no suitable node (host-mode port already in use on 2 nodes)", failure.Status.Err) }