diff --git a/design/orchestrators.md b/design/orchestrators.md
index cc698e0764..5929309491 100644
--- a/design/orchestrators.md
+++ b/design/orchestrators.md
@@ -128,10 +128,10 @@ means rolling updates will go node-by-node instead of slot-by-slot.
 The restart supervisor manages the process of shutting down a task, and
 possibly starting a replacement task. Its entry point is a `Restart` method
 which is called inside a store write transaction in one of the orchestrators.
-It atomically changes the state of the old task to `Shutdown`, and, if it's
-appropriate to start a replacement task based on the service's restart policy,
-creates a new task in the same slot (replicated service) or on the same node
-(global service).
+It atomically changes the desired state of the old task to `Shutdown`, and, if
+it's appropriate to start a replacement task based on the service's restart
+policy, creates a new task in the same slot (replicated service) or on the same
+node (global service).
 
 If the service is set up with a restart delay, the restart supervisor handles
 this delay too. It initially creates the new task with the desired state
diff --git a/design/task_model.md b/design/task_model.md
index 5f76d3cefd..0453277682 100644
--- a/design/task_model.md
+++ b/design/task_model.md
@@ -127,8 +127,8 @@ that the requested node has the necessary resources, in the case of global
 services' tasks). It changes their state to `ASSIGNED`.
 
 From this point, control over the state passes to the agent. A task will
-progress through the `ACCEPTED`, `PREPARING`, and `STARTING` states on the
-way to `RUNNING`. If a task exits without an error code, it moves to the
+progress through the `ACCEPTED`, `PREPARING`, `READY`, and `STARTING` states on
+the way to `RUNNING`. If a task exits without an error code, it moves to the
 `COMPLETE` state. If it fails, it moves to the `FAILED` state instead.
 
 A task may alternatively end up in the `SHUTDOWN` state if its shutdown was
diff --git a/design/tla/.gitignore b/design/tla/.gitignore
new file mode 100644
index 0000000000..d12e6fbdab
--- /dev/null
+++ b/design/tla/.gitignore
@@ -0,0 +1,5 @@
+*.toolbox
+*.pdf
+*.tlaps
+states
+metadir
diff --git a/design/tla/EventCounter.tla b/design/tla/EventCounter.tla
new file mode 100644
index 0000000000..59daf1731d
--- /dev/null
+++ b/design/tla/EventCounter.tla
@@ -0,0 +1,25 @@
+---------------------------- MODULE EventCounter ----------------------------
+
+EXTENDS Integers
+
+\* The number of ``events'' that have occurred (always 0 if we're not keeping track).
+VARIABLE nEvents
+
+\* The maximum number of events to allow, or ``-1'' for unlimited.
+maxEvents == -1
+
+InitEvents ==
+  nEvents = 0 \* Start with the counter at zero
+
+(* If we're counting events, increment the event counter.
+   We don't increment the counter when we don't have a maximum because that
+   would make the model infinite.
+   Actions tagged with CountEvent cannot happen once nEvents = maxEvents.
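+   For example, an action defined as `.SomeFailure == CountEvent /\ ... .'
+   can happen at most maxEvents times in a behaviour (when maxEvents # -1),
+   since each occurrence increments nEvents (`SomeFailure' is just an
+   illustrative name, not an action in this model).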
*) +CountEvent == + IF maxEvents = -1 THEN + UNCHANGED nEvents + ELSE + /\ nEvents < maxEvents + /\ nEvents' = nEvents + 1 + +============================================================================= \ No newline at end of file diff --git a/design/tla/Makefile b/design/tla/Makefile new file mode 100644 index 0000000000..88e0caeb06 --- /dev/null +++ b/design/tla/Makefile @@ -0,0 +1,24 @@ +WORKERS := 4 + +TLA := docker run --rm -it --workdir /mnt -v ${PWD}:/mnt talex5/tla + +.PHONY: all check pdfs tlaps + +all: check pdfs tlaps + +# Run the TLC model checker +check: + ${TLA} tlc -workers ${WORKERS} SwarmKit.tla -config models/SwarmKit.cfg + ${TLA} tlc -workers ${WORKERS} WorkerImpl.tla -config models/WorkerImpl.cfg + +# Run the TLAPS proof checker +tlaps: + ${TLA} tlapm -I /usr/local/lib/tlaps SwarmKit.tla + ${TLA} tlapm -I /usr/local/lib/tlaps WorkerImpl.tla + +# Generate a PDF file from a .tla file +%.pdf: %.tla + [ -d metadir ] || mkdir metadir + ${TLA} java tla2tex.TLA -shade -latexCommand pdflatex -latexOutputExt pdf -metadir metadir $< + +pdfs: SwarmKit.pdf Types.pdf Tasks.pdf WorkerSpec.pdf EventCounter.pdf diff --git a/design/tla/README.md b/design/tla/README.md new file mode 100644 index 0000000000..48d7bfc7a3 --- /dev/null +++ b/design/tla/README.md @@ -0,0 +1,14 @@ +This directory contains documentation for SwarmKit using [TLA+][] notation. + +Run `make pdfs` to render these documents as PDF files. +The best one to start with is `SwarmKit.pdf`, which introduces the TLA+ notation +and describes the overall components of SwarmKit. + +The specifications can also be executed by the TLC model checker to help find +mistakes. Use `make check` to run the checks. + +If you want to edit these specifications, you will probably want to use the [TLA+ Toolbox][], +which provides a GUI. + +[TLA+]: https://en.wikipedia.org/wiki/TLA%2B +[TLA+ Toolbox]: http://lamport.azurewebsites.net/tla/toolbox.html diff --git a/design/tla/SwarmKit.tla b/design/tla/SwarmKit.tla new file mode 100644 index 0000000000..1cf23f10c9 --- /dev/null +++ b/design/tla/SwarmKit.tla @@ -0,0 +1,633 @@ +This is a TLA+ model of SwarmKit. Even if you don't know TLA+, you should be able to +get the general idea. This section gives a very brief overview of the syntax. + +Declare `x' to be something that changes as the system runs: + + VARIABLE x + +Define `Init' to be a state predicate (== means ``is defined to be''): + + Init == + x = 0 + +`Init' is true for states in which `x = 0'. This can be used to define +the possible initial states of the system. For example, the state +[ x |-> 0, y |-> 2, ... ] satisfies this. + +Define `Next' to be an action: + + Next == + /\ x' \in Nat + /\ x' > x + +An action takes a pair of states, representing an atomic step of the system. +Unprimed expressions (e.g. `x') refer to the old state, and primed ones to +the new state. This example says that a step is a `Next' step iff the new +value of `x' is a natural number and greater than the previous value. +For example, [ x |-> 3, ... ] -> [x |-> 10, ... ] is a `Next' step. + +/\ is logical ``and''. This example uses TLA's ``bulleted-list'' syntax, which makes +writing these easier. It is indentation-sensitive. TLA also has \/ lists (``or''). + +See `.http://lamport.azurewebsites.net/tla/summary.pdf.' for a more complete summary +of the syntax. + +This specification can be read as documentation, but it can also be executed by the TLC +model checker. See the model checking section below for details about that. 
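+
+Putting `Init' and `Next' together, a complete specification is typically a single
+temporal formula of the form
+
+  Spec == Init /\ [][Next]_x
+
+which says that a behaviour must start in a state satisfying `Init', and that
+every step must either be a `Next' step or leave `x' unchanged (a ``stuttering''
+step). The `Spec' formula near the end of this document has this shape, plus
+some fairness conditions.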
+ +The rest of the document is organised as follows: + +1. Parameters to the model +2. Types and definitions +3. How to run the model checker +4. Actions performed by the user +5. Actions performed by the components of SwarmKit +6. The complete system +7. Properties of the system + +-------------------------------- MODULE SwarmKit -------------------------------- + +(* Import some libraries we use. + Common SwarmKit types are defined in Types.tla. You should probably read that before continuing. *) +EXTENDS Integers, TLC, FiniteSets, \* From the TLA+ standard library + Types, \* SwarmKit types + Tasks, \* The `tasks' variable + WorkerSpec, \* High-level spec for worker nodes + EventCounter \* Event limiting, for modelling purposes + +(* The maximum number of terminated tasks to keep for each slot. *) +CONSTANT maxTerminated +ASSUME maxTerminated \in Nat + +(* In the model, we share taskIDs (see ModelTaskId), which means that + we can cover most behaviours with only enough task IDs + for one running task and maxTerminated finished ones. *) +ASSUME Cardinality(TaskId) >= 1 + maxTerminated + +------------------------------------------------------------------------------- +\* Services + +VARIABLE services \* A map of currently-allocated services, indexed by ServiceId + +(* A replicated service is one that specifies some number of replicas it wants. *) +IsReplicated(sid) == + services[sid].replicas \in Nat + +(* A global service is one that wants one task running on each node. *) +IsGlobal(sid) == + services[sid].replicas = global + +(* TasksOf(sid) is the set of tasks for service `sid'. *) +TasksOf(sid) == + { t \in tasks : t.service = sid } + +(* All tasks of service `sid' in `vslot'. *) +TasksOfVSlot(sid, vslot) == + { t \in TasksOf(sid) : VSlot(t) = vslot } + +(* All vslots of service `sid'. *) +VSlotsOf(sid) == + { VSlot(t) : t \in TasksOf(sid) } + +------------------------------------------------------------------------------- +\* Types + +(* The expected type of each variable. TLA+ is an untyped language, but the model checker + can check that TypeOK is true for every reachable state. *) +TypeOK == + \* `services' is a mapping from service IDs to ServiceSpecs: + /\ DOMAIN services \subseteq ServiceId + /\ services \in [ DOMAIN services -> ServiceSpec ] + /\ TasksTypeOK \* Defined in Types.tla + /\ WorkerTypeOK \* Defined in WorkerSpec.tla + +------------------------------------------------------------------------------- +(* +`^ \textbf{Model checking} ^' + + You can test this specification using the TLC model checker. + This section describes how to do that. If you don't want to run TLC, + you can skip this section. + + To use TLC, load this specification file in the TLA+ toolbox (``Open Spec'') + and create a new model using the menu. + + You will be prompted to enter values for the various CONSTANTS. + A suitable set of initial values is: + + `. + Node <- [ model value ] {n1} + ServiceId <- [ model value ] {s1} + TaskId <- [ model value ] {t1, t2} + maxReplicas <- 1 + maxTerminated <- 1 + .' + + For the [ model value ] ones, select `Set of model values'. + + This says that we have one node, `n1', at most one service, and at most + two tasks per vslot. TLC can explore all possible behaviours of this system + in a couple of seconds on my laptop. 
+ + You should also specify some things to check (under ``What to check?''): + + - Add `TypeOK' and `Inv' under ``Invariants'' + - Add `TransitionsOK' and `EventuallyAsDesired' under ``Properties'' + + Running the model should report ``No errors''. + + If the model fails, TLC will show you an example sequence of actions that lead to + the failure and you can inspect the state at each step. You can try this out by + commenting out any important-looking condition in the model (e.g. the requirement + in UpdateService that you can't change the mode of an existing service). + + Although the above model is very small, it should detect most errors that you might + accidentally introduce when modifying the specification. Increasing the number of nodes, + services, replicas or terminated tasks will check more behaviours of the system, + but will be MUCH slower. + + The rest of this section describes techniques to make model checking faster by reducing + the number of states that must be considered in various ways. Feel free to skip it. + +`^ \textbf{Symmetry sets} ^' + + You should configure any model sets (e.g. `TaskId') as `symmetry sets'. + For example, if you have a model with two nodes {n1, n2} then this tells TLC that + two states which are the same except that n1 and n2 are swapped are equivalent + and it only needs to continue exploring from one of them. + TLC will warn that checking temporal properties may not work correctly, + but it's much faster and I haven't had any problems with it. + +`^ \textbf{Limiting the maximum number of setbacks to consider} ^' + + Another way to speed things up is to reduce the number of failures that TLC must consider. + By default, it checks every possible combination of failures at every point, which + is very expensive. + In the `Advanced Options' panel of the model, add a ``Definition Override'' of e.g. + `maxEvents = 2'. Actions that represent unnecessary extra work (such as the user + changing the configuration or a worker node going down) are tagged with `CountEvent'. + Any run of the system cannot have more than `maxEvents' such events. + + See `EventCounter.tla' for details. + +`^ \textbf{Preventing certain failures} ^' + + If you're not interested in some actions then you can block them. For example, + adding these two constraints in the ``Action Constraint'' box of the + ``Advanced Options'' tab tells TLC not to consider workers going down or + workers rejecting tasks as possible actions: + + /\ ~WorkerDown + /\ ~RejectTask +*) + +(* +`^ \textbf{Combining task states} ^' + + A finished task can be either in the `complete' or `failed' state, depending on + its exit status. If we have 4 finished tasks, that's 16 different states. For + modelling, we might not care about exit codes and we can treat this as a single + state with another definition override: + + `.failed <- complete.' + + In a similar way, we can combine { assigned, accepted, preparing, ready } into a single + state: + + `.accepted <- assigned + preparing <- assigned + ready <- assigned.' +*) + +---------------------------- MODULE User -------------------------------------------- +\* Actions performed by users + +(* Create a new service with any ServiceSpec. + + This says that a single atomic step of the system from an old state + to a new one is a CreateService step iff `tasks', `nodes' and `nEvents' don't change + and the new value of `services' is the same as before except that some + service ID that wasn't used in the old state is now mapped to some + ServiceSpec. 
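+
+   Note: `x :> y' is the function mapping `x' to `y', and `f @@ g' merges two
+   functions, with entries from `f' taking precedence where the domains
+   overlap (both operators come from the TLC module).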
+
+   Note: A \ B means { x \in A : x \notin B } --
+   i.e. the set A with all elements in B removed.
+*)
+CreateService ==
+  /\ UNCHANGED << tasks, nodes, nEvents >>
+  /\ \E sid \in ServiceId \ DOMAIN services, \* `sid' is an unused ServiceId
+        spec \in ServiceSpec :                \* `spec' is any ServiceSpec
+     /\ spec.remove = FALSE                   \* (not flagged for removal)
+     /\ services' = services @@ sid :> spec   \* Add `sid |-> spec' to `services'
+
+(* Update an existing service's spec. *)
+UpdateService ==
+  /\ UNCHANGED << tasks, nodes >>
+  /\ CountEvent \* Flag as an event for model-checking purposes
+  /\ \E sid \in DOMAIN services,       \* `sid' is an existing ServiceId
+        newSpec \in ServiceSpec :      \* `newSpec' is any `ServiceSpec'
+     /\ services[sid].remove = FALSE   \* We weren't trying to remove sid
+     /\ newSpec.remove = FALSE         \* and we still aren't.
+     \* You can't change a service's mode:
+     /\ (services[sid].replicas = global) <=> (newSpec.replicas = global)
+     /\ services' = [ services EXCEPT ![sid] = newSpec ]
+
+(* The user removes a service.
+
+   Note: Currently, SwarmKit deletes the service from its records immediately.
+   However, this isn't right because we need to wait for service-level resources
+   such as Virtual IPs to be freed.
+   Here we model the proposed fix, in which we just flag the service for removal. *)
+RemoveService ==
+  /\ UNCHANGED << nodes >>
+  /\ CountEvent
+  /\ \E sid \in DOMAIN services : \* sid is some existing service
+     \* Flag service for removal:
+     /\ services' = [services EXCEPT ![sid].remove = TRUE]
+     \* Flag every task of the service for removal:
+     /\ UpdateTasks([ t \in TasksOf(sid) |->
+                        [t EXCEPT !.desired_state = remove] ])
+
+(* A user action is one of these. *)
+User ==
+  \/ CreateService
+  \/ UpdateService
+  \/ RemoveService
+
+=============================================================================
+
+---------------------------- MODULE Orchestrator ----------------------------
+
+\* Actions performed by the orchestrator
+
+\* Note: This is by far the most complicated component in the model.
+\* You might want to read this section last...
+
+(* The set of tasks for service `sid' that should be considered as active.
+   This is any task that is running or on its way to running. *)
+RunnableTasks(sid) ==
+  { t \in TasksOf(sid) : Runnable(t) }
+
+(* Candidates for shutting down when we have too many. We don't want to count tasks that are shutting down
+   towards the total count when deciding whether we need to kill anything. *)
+RunnableWantedTasks(sid) ==
+  { t \in RunnableTasks(sid) : t.desired_state \preceq running }
+
+(* The set of possible new vslots for `sid'. *)
+UnusedVSlot(sid) ==
+  IF IsReplicated(sid) THEN Slot \ VSlotsOf(sid)
+  ELSE Node \ VSlotsOf(sid)
+
+(* The set of possible IDs for a new task in a vslot.
+
+   The complexity here is just a side-effect of the modelling (where we need to
+   share and reuse task IDs for performance).
+   In the real system, choosing an unused ID is easy. *)
+UnusedId(sid, vslot) ==
+  LET swarmTaskIds == { t.id : t \in TasksOfVSlot(sid, vslot) }
+  IN  TaskId \ swarmTaskIds
+
+(* Create a new task/slot if the number of runnable tasks is less than the number requested. *)
+CreateSlot ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E sid \in DOMAIN services : \* `sid' is an existing service
+     /\ ~services[sid].remove     \* that we're not trying to remove
+     (* For replicated tasks, only create as many slots as we need.
+        For global tasks, we want all possible vslots (nodes).
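+        (For example, a replicated service asking for 3 replicas while only
+        2 vslots exist allows this action to create a third.)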
*) + /\ IsReplicated(sid) => + services[sid].replicas > Cardinality(VSlotsOf(sid)) \* Desired > actual + /\ \E slot \in UnusedVSlot(sid) : + \E id \in UnusedId(sid, slot) : + tasks' = tasks \union { NewTask(sid, slot, id, running) } + +(* Add a task if a slot exists, contains no runnable tasks, and we weren't trying to remove it. + Note: if we are trying to remove it, the slot will eventually disappear and CreateSlot will + then make a new one if we later need it again. + + Currently in SwarmKit, slots do not actually exist as objects in the store. + Instead, we just infer that a slot exists because there exists a task with that slot ID. + This has the odd effect that if `maxTerminated = 0' then we may create new slots rather than reusing + existing ones, depending on exactly when the reaper runs. + *) +ReplaceTask == + /\ UNCHANGED << services, nodes, nEvents >> + /\ \E sid \in DOMAIN services : + \E slot \in VSlotsOf(sid) : + /\ \A task \in TasksOfVSlot(sid, slot) : \* If all tasks in `slot' are + ~Runnable(task) \* dead (not runnable) and + /\ \E task \in TasksOfVSlot(sid, slot) : \* there is some task that + task.desired_state # remove \* we're not trying to remove, + /\ \E id \in UnusedId(sid, slot) : \* then create a replacement task: + tasks' = tasks \union { NewTask(sid, slot, id, running) } + +(* If we have more replicas than the spec asks for, remove one of them. *) +RequestRemoval == + /\ UNCHANGED << services, nodes, nEvents >> + /\ \E sid \in DOMAIN services : + LET current == RunnableWantedTasks(sid) + IN \* Note: `current' excludes tasks we're already trying to kill + /\ IsReplicated(sid) + /\ services[sid].replicas < Cardinality(current) \* We have too many replicas + /\ \E slot \in { t.slot : t \in current } : \* Choose an allocated slot + \* Mark all tasks for that slot for removal: + UpdateTasks( [ t \in TasksOfVSlot(sid, slot) |-> + [t EXCEPT !.desired_state = remove] ] ) + +(* Mark a terminated task for removal if we already have `maxTerminated' terminated tasks for this slot. *) +CleanupTerminated == + /\ UNCHANGED << services, nodes, nEvents >> + /\ \E sid \in DOMAIN services : + \E slot \in VSlotsOf(sid) : + LET termTasksInSlot == { t \in TasksOfVSlot(sid, slot) : + State(t) \in { complete, shutdown, failed, rejected } } + IN + /\ Cardinality(termTasksInSlot) > maxTerminated \* Too many tasks for slot + /\ \E t \in termTasksInSlot : \* Pick a victim to remove + UpdateTasks(t :> [t EXCEPT !.desired_state = remove]) + +(* We don't model the updater explicitly, but we allow any task to be restarted (perhaps with + a different image) at any time, which should cover the behaviours of the restart supervisor. + + TODO: SwarmKit also allows ``start-first'' mode updates where we first get the new task to + `running' and then mark the old task for shutdown. Add this to the model. *) +RestartTask == + /\ UNCHANGED << services, nodes >> + /\ CountEvent + /\ \E oldT \in tasks : + \E newId \in UnusedId(oldT.service, VSlot(oldT)) : + /\ Runnable(oldT) \* Victim must be runnable + /\ oldT.desired_state \prec shutdown \* and we're not trying to kill it + \* Create the new task in the `ready' state (see ReleaseReady below): + /\ LET replacement == NewTask(oldT.service, VSlot(oldT), newId, ready) + IN tasks' = + (tasks \ {oldT}) \union { + [oldT EXCEPT !.desired_state = shutdown], + replacement + } + +(* A task is set to wait at `ready' and the previous task for that slot has now finished. + Allow it to proceed to `running'. 
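+   This is the second half of the restart-delay flow: RestartTask (above) creates
+   the replacement in the `ready' state, and this action releases it to `running'
+   once the old task has finished. *)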
+ReleaseReady ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E t \in tasks :
+       /\ t.desired_state = ready \* (and not e.g. `remove')
+       /\ State(t) = ready
+       /\ \A other \in TasksOfVSlot(t.service, VSlot(t)) \ {t} :
+            ~Runnable(other) \* All other tasks have finished
+       /\ UpdateTasks(t :> [t EXCEPT !.desired_state = running])
+
+(* The user asked to remove a service, and now all its tasks have been cleaned up. *)
+CleanupService ==
+  /\ UNCHANGED << tasks, nodes, nEvents >>
+  /\ \E sid \in DOMAIN services :
+       /\ services[sid].remove = TRUE
+       /\ TasksOf(sid) = {}
+       /\ services' = [ i \in DOMAIN services \ {sid} |-> services[i] ]
+
+(* Tasks that the orchestrator must always do eventually if it can: *)
+OrchestratorProgress ==
+  \/ CreateSlot
+  \/ ReplaceTask
+  \/ RequestRemoval
+  \/ CleanupTerminated
+  \/ ReleaseReady
+  \/ CleanupService
+
+(* All actions that the orchestrator can perform *)
+Orchestrator ==
+  \/ OrchestratorProgress
+  \/ RestartTask
+
+=============================================================================
+
+---------------------------- MODULE Allocator -------------------------------
+\* Actions performed by the allocator
+
+(* Pick a `new' task and move it to `pending'.
+
+   The spec says the allocator will ``allocate resources such as network attachments
+   which are necessary for the tasks to run''. However, we don't model any resources here. *)
+AllocateTask ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E t \in tasks :
+       /\ State(t) = new
+       /\ UpdateTasks(t :> [t EXCEPT !.status.state = pending])
+
+AllocatorProgress ==
+  \/ AllocateTask
+
+Allocator ==
+  \/ AllocatorProgress
+
+=============================================================================
+
+---------------------------- MODULE Scheduler -------------------------------
+
+\* Actions performed by the scheduler
+
+(* The scheduler assigns a node to a `pending' task and moves it to `assigned'
+   once sufficient resources are available (we don't model resources here). *)
+Scheduler ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E t \in tasks :
+       /\ State(t) = pending
+       /\ LET candidateNodes == IF t.node = unassigned
+                                THEN Node \* (all nodes)
+                                ELSE { t.node }
+          IN
+          \E node \in candidateNodes :
+            UpdateTasks(t :> [t EXCEPT !.status.state = assigned,
+                                       !.node = node ])
+
+=============================================================================
+
+---------------------------- MODULE Reaper ----------------------------------
+
+\* Actions performed by the reaper
+
+(* Forget about tasks in remove or orphan states.
+
+   Orphaned tasks belong to nodes that we are assuming are lost forever (or have crashed
+   and will come up with nothing running, which is an equally fine outcome). *)
+Reaper ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E t \in tasks :
+       /\ \/ /\ t.desired_state = remove
+             /\ (State(t) \prec assigned \/ ~Runnable(t)) \* Not owned by agent
+          \/ State(t) = orphaned
+       /\ tasks' = tasks \ {t}
+
+=============================================================================
+
+\* The complete system
+
+\* Import definitions from the various modules
+INSTANCE User
+INSTANCE Orchestrator
+INSTANCE Allocator
+INSTANCE Scheduler
+INSTANCE Reaper
+
+\* All the variables
+vars == << tasks, services, nodes, nEvents >>
+
+\* Initially there are no tasks and no services, and all nodes are up.
+Init ==
+  /\ tasks = {}
+  /\ services = << >>
+  /\ nodes = [ n \in Node |-> nodeUp ]
+  /\ InitEvents
+
+(* WorkerSpec doesn't mention `services'. To combine it with this spec, we need
+   to say that every action of the agent leaves `services' unchanged. *)
+AgentReal ==
+  Agent /\ UNCHANGED services
+
+(* Unfortunately, `AgentReal' causes TLC to report all problems of the agent
+   as simply `AgentReal' steps, which isn't very helpful. We can get better
+   diagnostics by expanding it, like this: *)
+AgentTLC ==
+  \/ (ProgressTask     /\ UNCHANGED services)
+  \/ (ShutdownComplete /\ UNCHANGED services)
+  \/ (OrphanTasks      /\ UNCHANGED services)
+  \/ (WorkerUp         /\ UNCHANGED services)
+  \/ (RejectTask       /\ UNCHANGED services)
+  \/ (ContainerExit    /\ UNCHANGED services)
+  \/ (WorkerDown       /\ UNCHANGED services)
+
+(* To avoid the risk of `AgentTLC' getting out of sync,
+   TLAPS can check that the definitions are equivalent. *)
+THEOREM AgentTLC = AgentReal
+BY DEF AgentTLC, AgentReal, Agent, AgentProgress
+
+(* A next step is one in which any of these sub-components takes a step: *)
+Next ==
+  \/ User
+  \/ Orchestrator
+  \/ Allocator
+  \/ Scheduler
+  \/ AgentTLC
+  \/ Reaper
+  \* For model checking: don't report deadlock if we're limiting events
+  \/ (nEvents = maxEvents /\ UNCHANGED vars)
+
+(* This is a ``temporal formula''. It takes a sequence of states representing the
+   changing state of the world and evaluates to TRUE if that sequence of states is
+   a possible behaviour of SwarmKit. *)
+Spec ==
+  \* The first state in the behaviour must satisfy Init:
+  /\ Init
+  \* All consecutive pairs of states must satisfy Next or leave `vars' unchanged:
+  /\ [][Next]_vars
+  (* Some actions are required to happen eventually. For example, a behaviour in
+     which SwarmKit stops doing anything forever, even though it could advance some task
+     from the `new' state, isn't a valid behaviour of the system.
+     This property is called ``weak fairness''. *)
+  /\ WF_vars(OrchestratorProgress)
+  /\ WF_vars(AllocatorProgress)
+  /\ WF_vars(Scheduler)
+  /\ WF_vars(AgentProgress /\ UNCHANGED services)
+  /\ WF_vars(Reaper)
+  /\ WF_vars(WorkerUp /\ UNCHANGED services)
+  (* We don't require fairness of:
+     - User (we don't control them),
+     - RestartTask (services aren't required to be updated),
+     - RejectTask (tasks aren't required to be rejected),
+     - ContainerExit (we don't specify image behaviour) or
+     - WorkerDown (workers aren't required to fail). *)
+
+-------------------------------------------------------------------------------
+\* Properties to verify
+
+(* These are properties that should follow automatically if the system behaves as
+   described by `Spec' in the previous section. *)
+
+\* A state invariant (things that should be true in every state).
+Inv ==
+  \A t \in tasks :
+    (* Every task has a service:
+
+       TODO: The spec says: ``In some cases, there are tasks that exist independent of any service.
+       These do not have a value set in service_id.''. Add an example of one. *)
+    /\ t.service \in DOMAIN services
+    \* Tasks have nodes once they reach `assigned', except maybe if rejected:
+    /\ assigned \preceq State(t) => t.node \in Node \/ State(t) = rejected
+    \* `remove' is only used as a desired state, not an actual one:
+    /\ State(t) # remove
+    \* Task IDs are unique
+    /\ \A t2 \in tasks : Id(t) = Id(t2) => t = t2
+
+(* The state of task `i' in `S', or `null' if it doesn't exist *)
+Get(S, i) ==
+  LET cand == { x \in S : Id(x) = i }
+  IN  IF cand = {} THEN null
+      ELSE State(CHOOSE x \in cand : TRUE)
+
+(* An action in which all transitions were valid.
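+   For example, in a step where only the allocator acts, any task that changed
+   state must have gone from `new' to `pending'; any other change would make its
+   << old, new >> pair fall outside `permitted'. *)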
+StepTransitionsOK ==
+  LET permitted == { << x, x >> : x \in TaskState } \union \* No change is always OK
+        CASE Orchestrator -> Transitions.orchestrator
+          [] Allocator    -> Transitions.allocator
+          [] Scheduler    -> Transitions.scheduler
+          [] Agent        -> Transitions.agent
+          [] Reaper       -> Transitions.reaper
+          [] OTHER        -> {}
+      oldIds == IdSet(tasks)
+      newIds == IdSet(tasks')
+  IN
+  \A id \in newIds \union oldIds :
+    << Get(tasks, id), Get(tasks', id) >> \in permitted
+
+(* Some of the expressions below are ``temporal formulas''. Unlike state expressions and actions,
+   these look at a complete behaviour (sequence of states). Summary of notation:
+
+   [] means ``always''. e.g. []x=3 means that `x = 3' in all states.
+
+   <> means ``eventually''. e.g. <>x=3 means that `x = 3' in some state.
+
+   `x=3' on its own means that `x=3' in the initial state.
+*)
+
+\* A temporal formula that checks every step satisfies StepTransitionsOK (or `vars' is unchanged)
+TransitionsOK ==
+  [][StepTransitionsOK]_vars
+
+(* Every service has the right number of running tasks (the system is in the desired state). *)
+InDesiredState ==
+  \A sid \in DOMAIN services :
+    \* We're not trying to remove the service:
+    /\ ~services[sid].remove
+    \* The service has the correct set of running replicas:
+    /\ LET runningTasks == { t \in TasksOf(sid) : State(t) = running }
+           nRunning == Cardinality(runningTasks)
+       IN
+       CASE IsReplicated(sid) ->
+              /\ nRunning = services[sid].replicas
+         [] IsGlobal(sid) ->
+              \* We have as many tasks as nodes:
+              /\ nRunning = Cardinality(Node)
+              \* We have a task for every node:
+              /\ { t.node : t \in runningTasks } = Node
+    \* The service does not have too many terminated tasks
+    /\ \A slot \in VSlotsOf(sid) :
+         LET terminated == { t \in TasksOfVSlot(sid, slot) : ~Runnable(t) }
+         IN  Cardinality(terminated) <= maxTerminated
+
+(* The main property we want to check.
+
+   []<> means ``always eventually'' (``infinitely-often'')
+
+   <>[] means ``eventually always'' (always true after some point)
+
+   This temporal formula says that if we only experience a finite number of
+   problems then the system will eventually settle on InDesiredState.
+*)
+EventuallyAsDesired ==
+  \/ []<><UpdateService \/ RemoveService>_vars \* Either the user keeps changing the configuration,
+  \/ []<><RestartTask>_vars                    \* or restarting/updating tasks,
+  \/ []<><WorkerDown>_vars                     \* or workers keep failing,
+  \/ []<><RejectTask>_vars                     \* or workers keep rejecting tasks,
+  \/ []<><ContainerExit>_vars                  \* or the containers keep exiting,
+  \/ <>[] InDesiredState                       \* or we eventually get to the desired state and stay there.
+
+=============================================================================
diff --git a/design/tla/Tasks.tla b/design/tla/Tasks.tla
new file mode 100644
index 0000000000..f0b1569659
--- /dev/null
+++ b/design/tla/Tasks.tla
@@ -0,0 +1,112 @@
+---------------------------- MODULE Tasks ----------------------------------
+
+EXTENDS TLC, Types
+
+VARIABLE tasks \* The set of currently-allocated tasks
+
+(* The expected type of each variable. TLA+ is an untyped language, but the model checker
+   can check that TasksTypeOK is true for every reachable state. *)
+TasksTypeOK ==
+  \* `tasks' is a subset of the set of all possible tasks
+  /\ tasks \in SUBSET Task
+
+(* Update `tasks' by performing each update in `f', which is a function
+   mapping old tasks to new ones.
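+   For example, `.UpdateTasks(t :> [t EXCEPT !.status.state = pending]).'
+   replaces the single task `t' with a copy of itself in the `pending' state.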
*) +UpdateTasks(f) == + /\ Assert(\A t \in DOMAIN f : t \in tasks, "An old task does not exist!") + /\ Assert(\A t \in DOMAIN f : + LET t2 == f[t] + IN \* The updated version of `t' must have + /\ t.id = t2.id \* the same task ID, + /\ t.service = t2.service \* the same service ID, + /\ VSlot(t) = VSlot(t2), \* and the same vslot. + "An update changes a task's identity!") + \* Remove all the old tasks and add the new ones: + /\ tasks' = (tasks \ DOMAIN f) \union Range(f) + +(* A `new' task belonging to service `sid' with the given slot, id, and desired state. *) +NewTask(sid, vslot, id, desired_state) == + [ + id |-> id, + service |-> sid, + status |-> [ state |-> new ], + desired_state |-> desired_state, + node |-> IF vslot \in Node THEN vslot ELSE unassigned, + slot |-> IF vslot \in Slot THEN vslot ELSE global + ] + + +\* A special ``state'' used when a task doesn't exist. +null == "null" + +(* All the possible transitions, grouped by the component that performs them. *) +Transitions == [ + orchestrator |-> { + << null, new >> + }, + + allocator |-> { + << new, pending >> + }, + + scheduler |-> { + << pending, assigned >> + }, + + agent |-> { + << assigned, accepted >>, + << accepted, preparing >>, + << preparing, ready >>, + << ready, starting >>, + << starting, running >>, + + << assigned, rejected >>, + << accepted, rejected >>, + << preparing, rejected >>, + << ready, rejected >>, + << starting, rejected >>, + + << running, complete >>, + << running, failed >>, + + << running, shutdown >>, + + << assigned, orphaned >>, + << accepted, orphaned >>, + << preparing, orphaned >>, + << ready, orphaned >>, + << starting, orphaned >>, + << running, orphaned >> + }, + + reaper |-> { + << new, null >>, + << pending, null >>, + << rejected, null >>, + << complete, null >>, + << failed, null >>, + << shutdown, null >>, + << orphaned, null >> + } +] + +(* Check that `Transitions' itself is OK. *) +TransitionTableOK == + \* No transition moves to a lower-ranked state: + /\ \A actor \in DOMAIN Transitions : + \A trans \in Transitions[actor] : + \/ trans[1] = null + \/ trans[2] = null + \/ trans[1] \preceq trans[2] + (* Every source state has exactly one component which handles transitions out of that state. + Except for the case of the reaper removing `new' and `pending' tasks that are flagged + for removal. *) + /\ \A a1, a2 \in DOMAIN Transitions : + LET exceptions == { << new, null >>, << pending, null >> } + Source(a) == { s[1] : s \in Transitions[a] \ exceptions} + IN a1 # a2 => + Source(a1) \intersect Source(a2) = {} + +ASSUME TransitionTableOK \* Note: ASSUME means ``check'' to TLC + +============================================================================= \ No newline at end of file diff --git a/design/tla/Types.tla b/design/tla/Types.tla new file mode 100644 index 0000000000..555ad4c68f --- /dev/null +++ b/design/tla/Types.tla @@ -0,0 +1,124 @@ +------------------------------- MODULE Types ------------------------------- + +EXTENDS Naturals, FiniteSets + +(* A generic operator to get the range of a function (the set of values in a map): *) +Range(S) == { S[i] : i \in DOMAIN S } + +(* The set of worker nodes. + + Note: a CONSTANT is an input to the model. The model should work with any set of nodes you provide. + + TODO: should cope with this changing at runtime, and with draining nodes. *) +CONSTANT Node + +(* A special value indicating that a task is not yet assigned to a node. + + Note: this TLA+ CHOOSE idiom just says to pick some value that isn't a Node (e.g. `null'). 
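+   CHOOSE is deterministic: every occurrence of `unassigned' denotes the same
+   value; we just don't say which value that is. *)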
+unassigned == CHOOSE n : n \notin Node
+
+(* The type (set) of service IDs (e.g. `Int' or `String').
+   When model checking, this will be some small set (e.g. {"s1", "s2"}). *)
+CONSTANT ServiceId
+
+(* The type of task IDs. *)
+CONSTANT TaskId
+
+(* The maximum possible value for `replicas' in ServiceSpec. *)
+CONSTANT maxReplicas
+ASSUME maxReplicas \in Nat
+Slot == 1..maxReplicas \* Possible slot numbers
+
+(* A special value (e.g. `-1') indicating that we want one replica running on each node: *)
+global == CHOOSE x : x \notin Nat
+
+(* The type of a description of a service (a struct/record).
+   This is provided by, and only changed by, the user. *)
+ServiceSpec == [
+  (* The replicas field is either a count giving the desired number of replicas,
+     or the special value `global'. *)
+  replicas : 0..maxReplicas \union {global},
+  remove : BOOLEAN \* The user wants to remove this service
+]
+
+(* The possible states for a task: *)
+new == "new"
+pending == "pending"
+assigned == "assigned"
+accepted == "accepted"
+preparing == "preparing"
+ready == "ready"
+starting == "starting"
+running == "running"
+complete == "complete"
+shutdown == "shutdown"
+failed == "failed"
+rejected == "rejected"
+remove == "remove" \* Only used as a ``desired state'', not an actual state
+orphaned == "orphaned"
+
+(* Every state has a rank. It is only possible for a task to change
+   state to a state with a higher rank (later in this sequence). *)
+order == << new, pending, assigned, accepted,
+            preparing, ready, starting,
+            running,
+            complete, shutdown, failed, rejected,
+            remove, orphaned >>
+
+(* Maps a state to its position in `order' (e.g. StateRank(new) = 1): *)
+StateRank(s) == CHOOSE i \in DOMAIN order : order[i] = s
+
+(* Convenient notation for comparing states: *)
+s1 \prec s2 == StateRank(s1) < StateRank(s2)
+s1 \preceq s2 == StateRank(s1) <= StateRank(s2)
+
+(* The set of possible states ({new, pending, ...}): *)
+TaskState == Range(order) \ {remove}
+
+(* Possibly this doesn't need to be a record, but we might want to add extra fields later. *)
+TaskStatus == [
+  state : TaskState
+]
+
+(* The state that SwarmKit wants the task to be in. *)
+DesiredState == { ready, running, shutdown, remove }
+
+(* This has every field mentioned in `task_model.md' except for `spec', which
+   it doesn't seem to use for anything.
+
+   `desired_state' can be any state, although currently we only ever set it to one of
+   {ready, running, shutdown, remove}. *)
+Task == [
+  id : TaskId,                       \* To uniquely identify this task
+  service : ServiceId,               \* The service that owns the task
+  status : TaskStatus,               \* The current state
+  desired_state : DesiredState,      \* The state requested by the orchestrator
+  node : Node \union {unassigned},   \* The node on which the task should be run
+  slot : Slot \union {global}        \* A way of tracking related tasks
+]
+
+(* The current state of task `t'. *)
+State(t) == t.status.state
+
+(* A task is runnable if it is running or could become running in the future. *)
+Runnable(t) == State(t) \preceq running
+
+(* A task's ``virtual slot'' is its actual slot for replicated services,
+   but its node for global ones. *)
+VSlot(t) ==
+  IF t.slot = global THEN t.node ELSE t.slot
+
+(* In the real SwarmKit, a task's ID is just its taskId field.
+   However, this requires lots of IDs, which is expensive for model checking.
+   So instead, we will identify tasks by their << serviceId, vSlot, taskId >>
+   triple, and only require taskId to be unique within its vslot.
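+   For example, << s1, 1, t1 >> and << s1, 2, t1 >> are two different tasks,
+   even though they share the underlying TaskId `t1'.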
*) +ModelTaskId == ServiceId \X (Slot \union Node) \X TaskId + +(* A unique identifier for a task, which never changes. *) +Id(t) == + << t.service, VSlot(t), t.id >> \* A ModelTaskId + +(* The ModelTaskIds of a set of tasks. *) +IdSet(S) == { Id(t) : t \in S } + +============================================================================= diff --git a/design/tla/WorkerImpl.tla b/design/tla/WorkerImpl.tla new file mode 100644 index 0000000000..03bf39bf32 --- /dev/null +++ b/design/tla/WorkerImpl.tla @@ -0,0 +1,321 @@ +---------------------------- MODULE WorkerImpl ---------------------------------- + +EXTENDS TLC, Types, Tasks, EventCounter + +(* +`WorkerSpec' provides a high-level specification of worker nodes that only refers to +the state of the tasks recorded in SwarmKit's store. This specification (WorkerImpl) +refines WorkerSpec by also modelling the state of the containers running on a node. +It should be easier to see that this lower-level specification corresponds to what +actually happens on worker nodes. + +The reason for having this in a separate specification is that including the container +state greatly increases the number of states to be considered and so slows down model +checking. Instead of checking + + SwarmKit /\ WorkerImpl => EventuallyAsDesired + +(which is very slow), we check two separate expressions: + + SwarmKit /\ WorkerSpec => EventuallyAsDesired + WorkerImpl => WorkerSpec + +TLAPS can check that separating the specification in this way makes sense: *) +THEOREM ASSUME TEMPORAL SwarmKit, TEMPORAL WorkerSpec, + TEMPORAL WorkerImpl, TEMPORAL EventuallyAsDesired, + TEMPORAL Env, \* A simplified version of SwarmKit + SwarmKit /\ WorkerSpec => EventuallyAsDesired, + Env /\ WorkerImpl => WorkerSpec, + SwarmKit => Env + PROVE SwarmKit /\ WorkerImpl => EventuallyAsDesired +OBVIOUS + +\* This worker's node ID +CONSTANT node +ASSUME node \in Node + +VARIABLES nodes \* Defined in WorkerSpec.tla +VARIABLE containers \* The actual container state on the node, indexed by ModelTaskId + +(* The high-level specification of worker nodes. + This module should be a refinement of `WS'. *) +WS == INSTANCE WorkerSpec + +terminating == "terminating" \* A container which we're trying to stop + +(* The state of an actual container on a worker node. *) +ContainerState == { running, terminating, complete, failed } + +(* A running container finishes running on its own (or crashes). *) +ContainerExit == + /\ UNCHANGED << nodes, tasks >> + /\ CountEvent + /\ \E id \in DOMAIN containers, + s2 \in {failed, complete} : \* Either a successful or failed exit status + /\ containers[id] = running + /\ containers' = [containers EXCEPT ![id] = s2] + +(* A running container finishes because we stopped it. *) +ShutdownComplete == + /\ UNCHANGED << nodes, tasks, nEvents >> + /\ \E id \in DOMAIN containers : + /\ containers[id] = terminating + /\ containers' = [containers EXCEPT ![id] = failed] + +(* SwarmKit thinks the node is up. i.e. the agent is connected to a manager. *) +IsUp(n) == WS!IsUp(n) + +(* The new value that `containers' should take after getting an update from the + managers. If the managers asked us to run a container and then stop mentioning + that task, we shut the container down and (once stopped) remove it. *) +DesiredContainers == + LET WantShutdown(id) == + \* The managers stop mentioning the task, or ask for it to be stopped. 
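+           \* (Here `running \prec desired_state' means the desired state is
+           \*  `shutdown' or `remove'.)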
+ \/ id \notin IdSet(tasks) + \/ running \prec (CHOOSE t \in tasks : Id(t) = id).desired_state + (* Remove containers that no longer have tasks, once they've stopped. *) + rm == { id \in DOMAIN containers : + /\ containers[id] \in { complete, failed } + /\ id \notin IdSet(tasks) } + IN [ id \in DOMAIN containers \ rm |-> + IF containers[id] = running /\ WantShutdown(id) THEN terminating + ELSE containers[id] + ] + +(* The updates that SwarmKit should apply to its store to bring it up-to-date + with the real state of the containers. *) +RequiredTaskUpdates == + LET \* Tasks the manager is expecting news about: + oldTasks == { t \in tasks : t.node = node /\ State(t) = running } + \* The state to report for task `t': + ReportFor(t) == + IF Id(t) \notin DOMAIN containers THEN \* We were asked to forget about this container. + shutdown \* SwarmKit doesn't care which terminal state we finish in. + ELSE IF /\ containers[Id(t)] = failed \* It's terminated and + /\ t.desired_state = shutdown THEN \* we wanted to shut it down, + shutdown \* Report a successful shutdown + ELSE IF containers[Id(t)] = terminating THEN + running \* SwarmKit doesn't record progress of the shutdown + ELSE + containers[Id(t)] \* Report the actual state + IN [ t \in oldTasks |-> [ t EXCEPT !.status.state = ReportFor(t) ]] + +(* Our node synchronises its state with a manager. *) +DoSync == + /\ containers' = DesiredContainers + /\ UpdateTasks(RequiredTaskUpdates) + +(* Try to advance containers towards `desired_state' if we're not there yet. + + XXX: do we need a connection to the manager to do this, or can we make progress + while disconnected and just report the final state? +*) +ProgressTask == + /\ UNCHANGED << nodes, nEvents >> + /\ \E t \in tasks, + s2 \in TaskState : \* The state we want to move to + LET t2 == [t EXCEPT !.status.state = s2] + IN + /\ s2 \preceq t.desired_state \* Can't be after the desired state + /\ << State(t), State(t2) >> \in { \* Possible ``progress'' (desirable) transitions + << assigned, accepted >>, + << accepted, preparing >>, + << preparing, ready >>, + << ready, starting >>, + << starting, running >> + } + /\ IsUp(t.node) \* Node must be connected to SwarmKit + /\ IF s2 = running THEN + \* The container started running + containers' = Id(t) :> running @@ containers + ELSE + UNCHANGED containers + /\ UpdateTasks(t :> t2) + +(* The agent on the node synchronises with a manager. *) +SyncWithManager == + /\ UNCHANGED << nodes, nEvents >> + /\ IsUp(node) + /\ DoSync + +(* We can reject a task once we're responsible for it (it has reached `assigned') + until it reaches the `running' state. + Note that an ``accepted'' task can still be rejected. *) +RejectTask == + /\ UNCHANGED << nodes, containers >> + /\ CountEvent + /\ \E t \in tasks : + /\ State(t) \in { assigned, accepted, preparing, ready, starting } + /\ t.node = node + /\ IsUp(node) + /\ UpdateTasks(t :> [t EXCEPT !.status.state = rejected]) + +(* The dispatcher notices that the worker is down (the connection is lost). *) +WorkerDown == + /\ UNCHANGED << tasks, containers >> + /\ CountEvent + /\ \E n \in Node : + /\ IsUp(n) + /\ nodes' = [nodes EXCEPT ![n] = WS!nodeDown] + +(* When the node reconnects to the cluster, it gets an assignment set from the dispatcher + which does not include any tasks that have been marked orphaned and then deleted. + Any time an agent gets an assignment set that does not include some task it has running, + it shuts down those tasks. + + We model this separately with the `SyncWithManager' action. 
*) +WorkerUp == + /\ UNCHANGED << nEvents, containers, tasks >> + /\ \E n \in Node : + /\ ~IsUp(n) + /\ nodes' = [nodes EXCEPT ![n] = WS!nodeUp] + +(* Tasks assigned to a node and for which the node is responsible. *) +TasksOwnedByNode(n) == { t \in tasks : + /\ t.node = n + /\ assigned \preceq State(t) + /\ State(t) \prec remove +} + +(* If SwarmKit sees a node as down for a long time (48 hours or so) then + it marks all the node's tasks as orphaned. + Note that this sets the actual state, not the desired state. + + ``Moving a task to the Orphaned state is not desirable, + because it's the one case where we break the otherwise invariant + that the agent sets all states past ASSIGNED.'' +*) +OrphanTasks == + /\ UNCHANGED << nodes, containers, nEvents >> + /\ LET affected == { t \in TasksOwnedByNode(node) : Runnable(t) } + IN + /\ ~IsUp(node) \* Our connection to the agent is down + /\ UpdateTasks([ t \in affected |-> + [t EXCEPT !.status.state = orphaned] ]) + +(* The worker reboots. All containers are terminated. *) +WorkerReboot == + /\ UNCHANGED << nodes, tasks >> + /\ CountEvent + /\ containers' = [ id \in DOMAIN containers |-> + LET state == containers[id] + IN CASE state \in {running, terminating} -> failed + [] state \in {complete, failed} -> state + ] + +(* Actions we require to happen eventually when possible. *) +AgentProgress == + \/ ProgressTask + \/ OrphanTasks + \/ WorkerUp + \/ ShutdownComplete + \/ SyncWithManager + +(* All actions of the agent/worker. *) +Agent == + \/ AgentProgress + \/ RejectTask + \/ WorkerDown + \/ ContainerExit + \/ WorkerReboot + +------------------------------------------------------------------------------- +\* A simplified model of the rest of the system + +(* A new task is created. *) +CreateTask == + /\ UNCHANGED << containers, nEvents, nodes >> + /\ \E t \in Task : \* `t' is the new task + \* Don't reuse IDs (only really an issue for model checking) + /\ Id(t) \notin IdSet(tasks) + /\ Id(t) \notin DOMAIN containers + /\ State(t) = new + /\ t.desired_state \in { ready, running } + /\ \/ /\ t.node = unassigned \* A task for a replicated service + /\ t.slot \in Slot + \/ /\ t.node \in Node \* A task for a global service + /\ t.slot = global + /\ ~\E t2 \in tasks : \* All tasks of a service have the same mode + /\ t.service = t2.service + /\ (t.slot = global) # (t2.slot = global) + /\ tasks' = tasks \union {t} + +(* States before `assigned' aren't shared with worker nodes, so modelling them + isn't very useful. You can use this in a model to override `CreateTask' to + speed things up a bit. It creates tasks directly in the `assigned' state. *) +CreateTaskQuick == + /\ UNCHANGED << containers, nEvents, nodes >> + /\ \E t \in Task : + /\ Id(t) \notin IdSet(tasks) + /\ Id(t) \notin DOMAIN containers + /\ State(t) = assigned + /\ t.desired_state \in { ready, running } + /\ t.node \in Node + /\ t.slot \in Slot \union {global} + /\ ~\E t2 \in tasks : \* All tasks of a service have the same mode + /\ t.service = t2.service + /\ (t.slot = global) # (t2.slot = global) + /\ tasks' = tasks \union {t} + +(* The state or desired_state of a task is updated. 
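+   This models any change made by the managers or by the agents of other nodes:
+   any legal transition may happen, except those that only our own agent could
+   perform.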
*) +UpdateTask == + /\ UNCHANGED << containers, nEvents, nodes >> + /\ \E t \in tasks, t2 \in Task : \* `t' becomes `t2' + /\ Id(t) = Id(t2) \* The ID can't change + /\ State(t) # State(t2) => \* If the state changes then + \E actor \in DOMAIN Transitions : \* it is a legal transition + /\ actor = "agent" => t.node # node \* and not one our worker does + /\ << State(t), State(t2) >> \in Transitions[actor] + \* When tasks reach the `assigned' state, they must have a node + /\ IF State(t2) = assigned /\ t.node = unassigned THEN t2.node \in Node + ELSE t2.node = t.node + /\ UpdateTasks(t :> t2) + +(* The reaper removes a task. *) +RemoveTask == + /\ UNCHANGED << containers, nEvents, nodes >> + /\ \E t \in tasks : + /\ << State(t), null >> \in Transitions.reaper + /\ tasks' = tasks \ {t} + +(* Actions of our worker's environment (i.e. SwarmKit and other workers). *) +OtherComponent == + \/ CreateTask + \/ UpdateTask + \/ RemoveTask + +------------------------------------------------------------------------------- +\* A complete system + +vars == << tasks, nEvents, nodes, containers >> + +Init == + /\ tasks = {} + /\ containers = << >> + /\ nodes = [ n \in Node |-> WS!nodeUp ] + /\ InitEvents + +Next == + \/ OtherComponent + \/ Agent + +(* The specification for our worker node. *) +Impl == Init /\ [][Next]_vars /\ WF_vars(AgentProgress) + +------------------------------------------------------------------------------- + +TypeOK == + /\ TasksTypeOK + \* The node's container map maps IDs to states + /\ DOMAIN containers \in SUBSET ModelTaskId + /\ containers \in [ DOMAIN containers -> ContainerState ] + +wsVars == << tasks, nEvents, nodes >> + +(* We want to check that a worker implementing `Impl' is also implementing + `WorkerSpec'. i.e. we need to check that Impl => WSSpec. *) +WSSpec == + /\ [][WS!Agent \/ OtherComponent]_wsVars + /\ WF_wsVars(WS!AgentProgress) + +============================================================================= diff --git a/design/tla/WorkerSpec.tla b/design/tla/WorkerSpec.tla new file mode 100644 index 0000000000..e56b8009b3 --- /dev/null +++ b/design/tla/WorkerSpec.tla @@ -0,0 +1,133 @@ +----------------------------- MODULE WorkerSpec ----------------------------- + +EXTENDS Types, Tasks, EventCounter + +VARIABLE nodes \* Maps nodes to SwarmKit's view of their NodeState + +(* The possible states of a node, as recorded by SwarmKit. *) +nodeUp == "up" +nodeDown == "down" +NodeState == { nodeUp, nodeDown } + +WorkerTypeOK == + \* Nodes are up or down + /\ nodes \in [ Node -> NodeState ] + +----------------------------------------------------------------------------- + +\* Actions performed by worker nodes (actually, by the dispatcher on their behalf) + +(* SwarmKit thinks the node is up. i.e. the agent is connected to a manager. *) +IsUp(n) == nodes[n] = nodeUp + +(* Try to advance containers towards `desired_state' if we're not there yet. *) +ProgressTask == + /\ UNCHANGED << nodes, nEvents >> + /\ \E t \in tasks, + s2 \in TaskState : \* The state we want to move to + LET t2 == [t EXCEPT !.status.state = s2] + IN + /\ s2 \preceq t.desired_state \* Can't be after the desired state + /\ << State(t), State(t2) >> \in { \* Possible ``progress'' (desirable) transitions + << assigned, accepted >>, + << accepted, preparing >>, + << preparing, ready >>, + << ready, starting >>, + << starting, running >> + } + /\ IsUp(t.node) \* Node must be connected to SwarmKit + /\ UpdateTasks(t :> t2) + +(* A running container finishes because we stopped it. 
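+   At this level we don't model the containers themselves, so stopping is
+   reported in a single step as a task transition from `running' to `shutdown'.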
*) +ShutdownComplete == + /\ UNCHANGED << nodes, nEvents >> + /\ \E t \in tasks : + /\ t.desired_state \in {shutdown, remove} \* We are trying to stop it + /\ State(t) = running \* It is currently running + /\ IsUp(t.node) + /\ UpdateTasks(t :> [t EXCEPT !.status.state = shutdown]) \* It becomes shutdown + +(* A node can reject a task once it's responsible for it (it has reached `assigned') + until it reaches the `running' state. + Note that an ``accepted'' task can still be rejected. *) +RejectTask == + /\ UNCHANGED << nodes >> + /\ CountEvent + /\ \E t \in tasks : + /\ State(t) \in { assigned, accepted, preparing, ready, starting } + /\ IsUp(t.node) + /\ UpdateTasks(t :> [t EXCEPT !.status.state = rejected]) + +(* We notify the managers that some running containers have finished. + There might be several updates at once (e.g. if we're reconnecting). *) +ContainerExit == + /\ UNCHANGED << nodes >> + /\ CountEvent + /\ \E n \in Node : + /\ IsUp(n) + /\ \E ts \in SUBSET { t \in tasks : t.node = n /\ State(t) = running } : + \* Each container could have ended in either state: + \E s2 \in [ ts -> { failed, complete } ] : + UpdateTasks( [ t \in ts |-> + [t EXCEPT !.status.state = + \* Report `failed' as `shutdown' if we wanted to shut down + IF s2[t] = failed /\ t.desired_state = shutdown THEN shutdown + ELSE s2[t]] + ] ) + +(* Tasks assigned to a node and for which the node is responsible. *) +TasksOwnedByNode(n) == { t \in tasks : + /\ t.node = n + /\ assigned \preceq State(t) + /\ State(t) \prec remove +} + +(* The dispatcher notices that the worker is down (the connection is lost). *) +WorkerDown == + /\ UNCHANGED << tasks >> + /\ CountEvent + /\ \E n \in Node : + /\ IsUp(n) + /\ nodes' = [nodes EXCEPT ![n] = nodeDown] + +(* When the node reconnects to the cluster, it gets an assignment set from the dispatcher + which does not include any tasks that have been marked orphaned and then deleted. + Any time an agent gets an assignment set that does not include some task it has running, + it shuts down those tasks. *) +WorkerUp == + /\ UNCHANGED << tasks, nEvents >> + /\ \E n \in Node : + /\ ~IsUp(n) + /\ nodes' = [nodes EXCEPT ![n] = nodeUp] + +(* If SwarmKit sees a node as down for a long time (48 hours or so) then + it marks all the node's tasks as orphaned. + + ``Moving a task to the Orphaned state is not desirable, + because it's the one case where we break the otherwise invariant + that the agent sets all states past ASSIGNED.'' +*) +OrphanTasks == + /\ UNCHANGED << nodes, nEvents >> + /\ \E n \in Node : + LET affected == { t \in TasksOwnedByNode(n) : Runnable(t) } + IN + /\ ~IsUp(n) \* Node `n' is still detected as down + /\ UpdateTasks([ t \in affected |-> + [t EXCEPT !.status.state = orphaned] ]) + +(* Actions we require to happen eventually when possible. *) +AgentProgress == + \/ ProgressTask + \/ ShutdownComplete + \/ OrphanTasks + \/ WorkerUp + +(* All actions of the agent/worker. 
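+   Only the `AgentProgress' actions are required to happen eventually;
+   RejectTask, ContainerExit and WorkerDown may happen but are never required,
+   matching the fairness assumptions in SwarmKit.tla.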
*) +Agent == + \/ AgentProgress + \/ RejectTask + \/ ContainerExit + \/ WorkerDown + +============================================================================= diff --git a/design/tla/models/SwarmKit.cfg b/design/tla/models/SwarmKit.cfg new file mode 100644 index 0000000000..5c92a0153d --- /dev/null +++ b/design/tla/models/SwarmKit.cfg @@ -0,0 +1,15 @@ +SPECIFICATION Spec + +CONSTANT TaskId = {t1, t2} +CONSTANT ServiceId = {s1} +CONSTANT Node = {n1} +CONSTANT maxTerminated = 1 +CONSTANT maxReplicas = 1 +CONSTANT unassigned = unassigned +CONSTANT global = global + +INVARIANT TypeOK +INVARIANT Inv + +PROPERTY TransitionsOK +PROPERTY EventuallyAsDesired diff --git a/design/tla/models/WorkerImpl.cfg b/design/tla/models/WorkerImpl.cfg new file mode 100644 index 0000000000..6f0abf0433 --- /dev/null +++ b/design/tla/models/WorkerImpl.cfg @@ -0,0 +1,14 @@ +SPECIFICATION Impl + +CONSTANT TaskId = {t1} +CONSTANT ServiceId = {s1} +CONSTANT Node = {n1} +CONSTANT node = n1 +CONSTANT maxTerminated = 1 +CONSTANT maxReplicas = 1 +CONSTANT unassigned = unassigned +CONSTANT global = global + +INVARIANT TypeOK + +PROPERTY WSSpec