diff --git a/design/orchestrators.md b/design/orchestrators.md
index cc698e0764..5929309491 100644
--- a/design/orchestrators.md
+++ b/design/orchestrators.md
@@ -128,10 +128,10 @@ means rolling updates will go node-by-node instead of slot-by-slot.
 The restart supervisor manages the process of shutting down a task, and
 possibly starting a replacement task. Its entry point is a `Restart` method
 which is called inside a store write transaction in one of the orchestrators.
-It atomically changes the state of the old task to `Shutdown`, and, if it's
-appropriate to start a replacement task based on the service's restart policy,
-creates a new task in the same slot (replicated service) or on the same node
-(global service).
+It atomically changes the desired state of the old task to `Shutdown`, and, if
+it's appropriate to start a replacement task based on the service's restart
+policy, creates a new task in the same slot (replicated service) or on the same
+node (global service).
 
 If the service is set up with a restart delay, the restart supervisor handles
 this delay too. It initially creates the new task with the desired state
diff --git a/design/task_model.md b/design/task_model.md
index 5f76d3cefd..0453277682 100644
--- a/design/task_model.md
+++ b/design/task_model.md
@@ -127,8 +127,8 @@ that the requested node has the necessary resources, in the case of global
 services' tasks). It changes their state to `ASSIGNED`.
 
 From this point, control over the state passes to the agent. A task will
-progress through the `ACCEPTED`, `PREPARING`, and `STARTING` states on the
-way to `RUNNING`. If a task exits without an error code, it moves to the
+progress through the `ACCEPTED`, `PREPARING`, `READY`, and `STARTING` states on
+the way to `RUNNING`. If a task exits without an error code, it moves to the
 `COMPLETE` state. If it fails, it moves to the `FAILED` state instead.
 
 A task may alternatively end up in the `SHUTDOWN` state if its shutdown was
diff --git a/design/tla/.gitignore b/design/tla/.gitignore
new file mode 100644
index 0000000000..d12e6fbdab
--- /dev/null
+++ b/design/tla/.gitignore
@@ -0,0 +1,5 @@
+*.toolbox
+*.pdf
+*.tlaps
+states
+metadir
diff --git a/design/tla/EventCounter.tla b/design/tla/EventCounter.tla
new file mode 100644
index 0000000000..59daf1731d
--- /dev/null
+++ b/design/tla/EventCounter.tla
@@ -0,0 +1,25 @@
+---------------------------- MODULE EventCounter ----------------------------
+
+EXTENDS Integers
+
+\* The number of ``events'' that have occurred (always 0 if we're not keeping track).
+VARIABLE nEvents
+
+\* The maximum number of events to allow, or ``-1'' for unlimited.
+maxEvents == -1
+
+InitEvents ==
+  nEvents = 0 \* Start with the counter at zero
+
+(* If we're counting events, increment the event counter.
+   We don't increment the counter when we don't have a maximum because that
+   would make the model infinite.
+   Actions tagged with CountEvent cannot happen once nEvents = maxEvents.
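+   For example, an action defined as `.SomeFailure == CountEvent /\ ... .'
+   can happen at most maxEvents times in a behaviour (when maxEvents # -1),
+   since each occurrence increments nEvents (`SomeFailure' is just an
+   illustrative name, not an action in this model).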
*) +CountEvent == + IF maxEvents = -1 THEN + UNCHANGED nEvents + ELSE + /\ nEvents < maxEvents + /\ nEvents' = nEvents + 1 + +============================================================================= \ No newline at end of file diff --git a/design/tla/Makefile b/design/tla/Makefile new file mode 100644 index 0000000000..88e0caeb06 --- /dev/null +++ b/design/tla/Makefile @@ -0,0 +1,24 @@ +WORKERS := 4 + +TLA := docker run --rm -it --workdir /mnt -v ${PWD}:/mnt talex5/tla + +.PHONY: all check pdfs tlaps + +all: check pdfs tlaps + +# Run the TLC model checker +check: + ${TLA} tlc -workers ${WORKERS} SwarmKit.tla -config models/SwarmKit.cfg + ${TLA} tlc -workers ${WORKERS} WorkerImpl.tla -config models/WorkerImpl.cfg + +# Run the TLAPS proof checker +tlaps: + ${TLA} tlapm -I /usr/local/lib/tlaps SwarmKit.tla + ${TLA} tlapm -I /usr/local/lib/tlaps WorkerImpl.tla + +# Generate a PDF file from a .tla file +%.pdf: %.tla + [ -d metadir ] || mkdir metadir + ${TLA} java tla2tex.TLA -shade -latexCommand pdflatex -latexOutputExt pdf -metadir metadir $< + +pdfs: SwarmKit.pdf Types.pdf Tasks.pdf WorkerSpec.pdf EventCounter.pdf diff --git a/design/tla/README.md b/design/tla/README.md new file mode 100644 index 0000000000..48d7bfc7a3 --- /dev/null +++ b/design/tla/README.md @@ -0,0 +1,14 @@ +This directory contains documentation for SwarmKit using [TLA+][] notation. + +Run `make pdfs` to render these documents as PDF files. +The best one to start with is `SwarmKit.pdf`, which introduces the TLA+ notation +and describes the overall components of SwarmKit. + +The specifications can also be executed by the TLC model checker to help find +mistakes. Use `make check` to run the checks. + +If you want to edit these specifications, you will probably want to use the [TLA+ Toolbox][], +which provides a GUI. + +[TLA+]: https://en.wikipedia.org/wiki/TLA%2B +[TLA+ Toolbox]: http://lamport.azurewebsites.net/tla/toolbox.html diff --git a/design/tla/SwarmKit.tla b/design/tla/SwarmKit.tla new file mode 100644 index 0000000000..1cf23f10c9 --- /dev/null +++ b/design/tla/SwarmKit.tla @@ -0,0 +1,633 @@ +This is a TLA+ model of SwarmKit. Even if you don't know TLA+, you should be able to +get the general idea. This section gives a very brief overview of the syntax. + +Declare `x' to be something that changes as the system runs: + + VARIABLE x + +Define `Init' to be a state predicate (== means ``is defined to be''): + + Init == + x = 0 + +`Init' is true for states in which `x = 0'. This can be used to define +the possible initial states of the system. For example, the state +[ x |-> 0, y |-> 2, ... ] satisfies this. + +Define `Next' to be an action: + + Next == + /\ x' \in Nat + /\ x' > x + +An action takes a pair of states, representing an atomic step of the system. +Unprimed expressions (e.g. `x') refer to the old state, and primed ones to +the new state. This example says that a step is a `Next' step iff the new +value of `x' is a natural number and greater than the previous value. +For example, [ x |-> 3, ... ] -> [x |-> 10, ... ] is a `Next' step. + +/\ is logical ``and''. This example uses TLA's ``bulleted-list'' syntax, which makes +writing these easier. It is indentation-sensitive. TLA also has \/ lists (``or''). + +See `.http://lamport.azurewebsites.net/tla/summary.pdf.' for a more complete summary +of the syntax. + +This specification can be read as documentation, but it can also be executed by the TLC +model checker. See the model checking section below for details about that. 
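+
+Putting `Init' and `Next' together, a complete specification is typically a single
+temporal formula of the form
+
+  Spec == Init /\ [][Next]_x
+
+which says that a behaviour must start in a state satisfying `Init', and that
+every step must either be a `Next' step or leave `x' unchanged (a ``stuttering''
+step). The `Spec' formula near the end of this document has this shape, plus
+some fairness conditions.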
+ +The rest of the document is organised as follows: + +1. Parameters to the model +2. Types and definitions +3. How to run the model checker +4. Actions performed by the user +5. Actions performed by the components of SwarmKit +6. The complete system +7. Properties of the system + +-------------------------------- MODULE SwarmKit -------------------------------- + +(* Import some libraries we use. + Common SwarmKit types are defined in Types.tla. You should probably read that before continuing. *) +EXTENDS Integers, TLC, FiniteSets, \* From the TLA+ standard library + Types, \* SwarmKit types + Tasks, \* The `tasks' variable + WorkerSpec, \* High-level spec for worker nodes + EventCounter \* Event limiting, for modelling purposes + +(* The maximum number of terminated tasks to keep for each slot. *) +CONSTANT maxTerminated +ASSUME maxTerminated \in Nat + +(* In the model, we share taskIDs (see ModelTaskId), which means that + we can cover most behaviours with only enough task IDs + for one running task and maxTerminated finished ones. *) +ASSUME Cardinality(TaskId) >= 1 + maxTerminated + +------------------------------------------------------------------------------- +\* Services + +VARIABLE services \* A map of currently-allocated services, indexed by ServiceId + +(* A replicated service is one that specifies some number of replicas it wants. *) +IsReplicated(sid) == + services[sid].replicas \in Nat + +(* A global service is one that wants one task running on each node. *) +IsGlobal(sid) == + services[sid].replicas = global + +(* TasksOf(sid) is the set of tasks for service `sid'. *) +TasksOf(sid) == + { t \in tasks : t.service = sid } + +(* All tasks of service `sid' in `vslot'. *) +TasksOfVSlot(sid, vslot) == + { t \in TasksOf(sid) : VSlot(t) = vslot } + +(* All vslots of service `sid'. *) +VSlotsOf(sid) == + { VSlot(t) : t \in TasksOf(sid) } + +------------------------------------------------------------------------------- +\* Types + +(* The expected type of each variable. TLA+ is an untyped language, but the model checker + can check that TypeOK is true for every reachable state. *) +TypeOK == + \* `services' is a mapping from service IDs to ServiceSpecs: + /\ DOMAIN services \subseteq ServiceId + /\ services \in [ DOMAIN services -> ServiceSpec ] + /\ TasksTypeOK \* Defined in Types.tla + /\ WorkerTypeOK \* Defined in WorkerSpec.tla + +------------------------------------------------------------------------------- +(* +`^ \textbf{Model checking} ^' + + You can test this specification using the TLC model checker. + This section describes how to do that. If you don't want to run TLC, + you can skip this section. + + To use TLC, load this specification file in the TLA+ toolbox (``Open Spec'') + and create a new model using the menu. + + You will be prompted to enter values for the various CONSTANTS. + A suitable set of initial values is: + + `. + Node <- [ model value ] {n1} + ServiceId <- [ model value ] {s1} + TaskId <- [ model value ] {t1, t2} + maxReplicas <- 1 + maxTerminated <- 1 + .' + + For the [ model value ] ones, select `Set of model values'. + + This says that we have one node, `n1', at most one service, and at most + two tasks per vslot. TLC can explore all possible behaviours of this system + in a couple of seconds on my laptop. 
+ + You should also specify some things to check (under ``What to check?''): + + - Add `TypeOK' and `Inv' under ``Invariants'' + - Add `TransitionsOK' and `EventuallyAsDesired' under ``Properties'' + + Running the model should report ``No errors''. + + If the model fails, TLC will show you an example sequence of actions that lead to + the failure and you can inspect the state at each step. You can try this out by + commenting out any important-looking condition in the model (e.g. the requirement + in UpdateService that you can't change the mode of an existing service). + + Although the above model is very small, it should detect most errors that you might + accidentally introduce when modifying the specification. Increasing the number of nodes, + services, replicas or terminated tasks will check more behaviours of the system, + but will be MUCH slower. + + The rest of this section describes techniques to make model checking faster by reducing + the number of states that must be considered in various ways. Feel free to skip it. + +`^ \textbf{Symmetry sets} ^' + + You should configure any model sets (e.g. `TaskId') as `symmetry sets'. + For example, if you have a model with two nodes {n1, n2} then this tells TLC that + two states which are the same except that n1 and n2 are swapped are equivalent + and it only needs to continue exploring from one of them. + TLC will warn that checking temporal properties may not work correctly, + but it's much faster and I haven't had any problems with it. + +`^ \textbf{Limiting the maximum number of setbacks to consider} ^' + + Another way to speed things up is to reduce the number of failures that TLC must consider. + By default, it checks every possible combination of failures at every point, which + is very expensive. + In the `Advanced Options' panel of the model, add a ``Definition Override'' of e.g. + `maxEvents = 2'. Actions that represent unnecessary extra work (such as the user + changing the configuration or a worker node going down) are tagged with `CountEvent'. + Any run of the system cannot have more than `maxEvents' such events. + + See `EventCounter.tla' for details. + +`^ \textbf{Preventing certain failures} ^' + + If you're not interested in some actions then you can block them. For example, + adding these two constraints in the ``Action Constraint'' box of the + ``Advanced Options'' tab tells TLC not to consider workers going down or + workers rejecting tasks as possible actions: + + /\ ~WorkerDown + /\ ~RejectTask +*) + +(* +`^ \textbf{Combining task states} ^' + + A finished task can be either in the `complete' or `failed' state, depending on + its exit status. If we have 4 finished tasks, that's 16 different states. For + modelling, we might not care about exit codes and we can treat this as a single + state with another definition override: + + `.failed <- complete.' + + In a similar way, we can combine { assigned, accepted, preparing, ready } into a single + state: + + `.accepted <- assigned + preparing <- assigned + ready <- assigned.' +*) + +---------------------------- MODULE User -------------------------------------------- +\* Actions performed by users + +(* Create a new service with any ServiceSpec. + + This says that a single atomic step of the system from an old state + to a new one is a CreateService step iff `tasks', `nodes' and `nEvents' don't change + and the new value of `services' is the same as before except that some + service ID that wasn't used in the old state is now mapped to some + ServiceSpec. 
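+
+   Note: `x :> y' is the function mapping `x' to `y', and `f @@ g' merges two
+   functions, with entries from `f' taking precedence where the domains
+   overlap (both operators come from the TLC module).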
+
+   Note: A \ B means { x \in A : x \notin B } --
+   i.e. the set A with all elements in B removed.
+*)
+CreateService ==
+  /\ UNCHANGED << tasks, nodes, nEvents >>
+  /\ \E sid \in ServiceId \ DOMAIN services, \* `sid' is an unused ServiceId
+        spec \in ServiceSpec :                \* `spec' is any ServiceSpec
+     /\ spec.remove = FALSE                   \* (not flagged for removal)
+     /\ services' = services @@ sid :> spec   \* Add `sid |-> spec' to `services'
+
+(* Update an existing service's spec. *)
+UpdateService ==
+  /\ UNCHANGED << tasks, nodes >>
+  /\ CountEvent \* Flag as an event for model-checking purposes
+  /\ \E sid \in DOMAIN services,       \* `sid' is an existing ServiceId
+        newSpec \in ServiceSpec :      \* `newSpec' is any `ServiceSpec'
+     /\ services[sid].remove = FALSE   \* We weren't trying to remove sid
+     /\ newSpec.remove = FALSE         \* and we still aren't.
+     \* You can't change a service's mode:
+     /\ (services[sid].replicas = global) <=> (newSpec.replicas = global)
+     /\ services' = [ services EXCEPT ![sid] = newSpec ]
+
+(* The user removes a service.
+
+   Note: Currently, SwarmKit deletes the service from its records immediately.
+   However, this isn't right because we need to wait for service-level resources
+   such as Virtual IPs to be freed.
+   Here we model the proposed fix, in which we just flag the service for removal. *)
+RemoveService ==
+  /\ UNCHANGED << nodes >>
+  /\ CountEvent
+  /\ \E sid \in DOMAIN services : \* sid is some existing service
+     \* Flag service for removal:
+     /\ services' = [services EXCEPT ![sid].remove = TRUE]
+     \* Flag every task of the service for removal:
+     /\ UpdateTasks([ t \in TasksOf(sid) |->
+                        [t EXCEPT !.desired_state = remove] ])
+
+(* A user action is one of these. *)
+User ==
+  \/ CreateService
+  \/ UpdateService
+  \/ RemoveService
+
+=============================================================================
+
+---------------------------- MODULE Orchestrator ----------------------------
+
+\* Actions performed by the orchestrator
+
+\* Note: This is by far the most complicated component in the model.
+\* You might want to read this section last...
+
+(* The set of tasks for service `sid' that should be considered as active.
+   This is any task that is running or on its way to running. *)
+RunnableTasks(sid) ==
+  { t \in TasksOf(sid) : Runnable(t) }
+
+(* Candidates for shutting down when we have too many. We don't want to count tasks that are shutting down
+   towards the total count when deciding whether we need to kill anything. *)
+RunnableWantedTasks(sid) ==
+  { t \in RunnableTasks(sid) : t.desired_state \preceq running }
+
+(* The set of possible new vslots for `sid'. *)
+UnusedVSlot(sid) ==
+  IF IsReplicated(sid) THEN Slot \ VSlotsOf(sid)
+  ELSE Node \ VSlotsOf(sid)
+
+(* The set of possible IDs for a new task in a vslot.
+
+   The complexity here is just a side-effect of the modelling (where we need to
+   share and reuse task IDs for performance).
+   In the real system, choosing an unused ID is easy. *)
+UnusedId(sid, vslot) ==
+  LET swarmTaskIds == { t.id : t \in TasksOfVSlot(sid, vslot) }
+  IN  TaskId \ swarmTaskIds
+
+(* Create a new task/slot if the number of runnable tasks is less than the number requested. *)
+CreateSlot ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E sid \in DOMAIN services : \* `sid' is an existing service
+     /\ ~services[sid].remove     \* that we're not trying to remove
+     (* For replicated tasks, only create as many slots as we need.
+        For global tasks, we want all possible vslots (nodes).
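+        (For example, a replicated service asking for 3 replicas while only
+        2 vslots exist allows this action to create a third.)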
*) + /\ IsReplicated(sid) => + services[sid].replicas > Cardinality(VSlotsOf(sid)) \* Desired > actual + /\ \E slot \in UnusedVSlot(sid) : + \E id \in UnusedId(sid, slot) : + tasks' = tasks \union { NewTask(sid, slot, id, running) } + +(* Add a task if a slot exists, contains no runnable tasks, and we weren't trying to remove it. + Note: if we are trying to remove it, the slot will eventually disappear and CreateSlot will + then make a new one if we later need it again. + + Currently in SwarmKit, slots do not actually exist as objects in the store. + Instead, we just infer that a slot exists because there exists a task with that slot ID. + This has the odd effect that if `maxTerminated = 0' then we may create new slots rather than reusing + existing ones, depending on exactly when the reaper runs. + *) +ReplaceTask == + /\ UNCHANGED << services, nodes, nEvents >> + /\ \E sid \in DOMAIN services : + \E slot \in VSlotsOf(sid) : + /\ \A task \in TasksOfVSlot(sid, slot) : \* If all tasks in `slot' are + ~Runnable(task) \* dead (not runnable) and + /\ \E task \in TasksOfVSlot(sid, slot) : \* there is some task that + task.desired_state # remove \* we're not trying to remove, + /\ \E id \in UnusedId(sid, slot) : \* then create a replacement task: + tasks' = tasks \union { NewTask(sid, slot, id, running) } + +(* If we have more replicas than the spec asks for, remove one of them. *) +RequestRemoval == + /\ UNCHANGED << services, nodes, nEvents >> + /\ \E sid \in DOMAIN services : + LET current == RunnableWantedTasks(sid) + IN \* Note: `current' excludes tasks we're already trying to kill + /\ IsReplicated(sid) + /\ services[sid].replicas < Cardinality(current) \* We have too many replicas + /\ \E slot \in { t.slot : t \in current } : \* Choose an allocated slot + \* Mark all tasks for that slot for removal: + UpdateTasks( [ t \in TasksOfVSlot(sid, slot) |-> + [t EXCEPT !.desired_state = remove] ] ) + +(* Mark a terminated task for removal if we already have `maxTerminated' terminated tasks for this slot. *) +CleanupTerminated == + /\ UNCHANGED << services, nodes, nEvents >> + /\ \E sid \in DOMAIN services : + \E slot \in VSlotsOf(sid) : + LET termTasksInSlot == { t \in TasksOfVSlot(sid, slot) : + State(t) \in { complete, shutdown, failed, rejected } } + IN + /\ Cardinality(termTasksInSlot) > maxTerminated \* Too many tasks for slot + /\ \E t \in termTasksInSlot : \* Pick a victim to remove + UpdateTasks(t :> [t EXCEPT !.desired_state = remove]) + +(* We don't model the updater explicitly, but we allow any task to be restarted (perhaps with + a different image) at any time, which should cover the behaviours of the restart supervisor. + + TODO: SwarmKit also allows ``start-first'' mode updates where we first get the new task to + `running' and then mark the old task for shutdown. Add this to the model. *) +RestartTask == + /\ UNCHANGED << services, nodes >> + /\ CountEvent + /\ \E oldT \in tasks : + \E newId \in UnusedId(oldT.service, VSlot(oldT)) : + /\ Runnable(oldT) \* Victim must be runnable + /\ oldT.desired_state \prec shutdown \* and we're not trying to kill it + \* Create the new task in the `ready' state (see ReleaseReady below): + /\ LET replacement == NewTask(oldT.service, VSlot(oldT), newId, ready) + IN tasks' = + (tasks \ {oldT}) \union { + [oldT EXCEPT !.desired_state = shutdown], + replacement + } + +(* A task is set to wait at `ready' and the previous task for that slot has now finished. + Allow it to proceed to `running'. 
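+   This is the second half of the restart-delay flow: RestartTask (above) creates
+   the replacement in the `ready' state, and this action releases it to `running'
+   once the old task has finished. *)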
+ReleaseReady ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E t \in tasks :
+       /\ t.desired_state = ready \* (and not e.g. `remove')
+       /\ State(t) = ready
+       /\ \A other \in TasksOfVSlot(t.service, VSlot(t)) \ {t} :
+            ~Runnable(other) \* All other tasks have finished
+       /\ UpdateTasks(t :> [t EXCEPT !.desired_state = running])
+
+(* The user asked to remove a service, and now all its tasks have been cleaned up. *)
+CleanupService ==
+  /\ UNCHANGED << tasks, nodes, nEvents >>
+  /\ \E sid \in DOMAIN services :
+       /\ services[sid].remove = TRUE
+       /\ TasksOf(sid) = {}
+       /\ services' = [ i \in DOMAIN services \ {sid} |-> services[i] ]
+
+(* Tasks that the orchestrator must always do eventually if it can: *)
+OrchestratorProgress ==
+  \/ CreateSlot
+  \/ ReplaceTask
+  \/ RequestRemoval
+  \/ CleanupTerminated
+  \/ ReleaseReady
+  \/ CleanupService
+
+(* All actions that the orchestrator can perform *)
+Orchestrator ==
+  \/ OrchestratorProgress
+  \/ RestartTask
+
+=============================================================================
+
+---------------------------- MODULE Allocator -------------------------------
+\* Actions performed by the allocator
+
+(* Pick a `new' task and move it to `pending'.
+
+   The spec says the allocator will ``allocate resources such as network attachments
+   which are necessary for the tasks to run''. However, we don't model any resources here. *)
+AllocateTask ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E t \in tasks :
+       /\ State(t) = new
+       /\ UpdateTasks(t :> [t EXCEPT !.status.state = pending])
+
+AllocatorProgress ==
+  \/ AllocateTask
+
+Allocator ==
+  \/ AllocatorProgress
+
+=============================================================================
+
+---------------------------- MODULE Scheduler -------------------------------
+
+\* Actions performed by the scheduler
+
+(* The scheduler assigns a node to a `pending' task and moves it to `assigned'
+   once sufficient resources are available (we don't model resources here). *)
+Scheduler ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E t \in tasks :
+       /\ State(t) = pending
+       /\ LET candidateNodes == IF t.node = unassigned
+                                THEN Node \* (all nodes)
+                                ELSE { t.node }
+          IN
+          \E node \in candidateNodes :
+            UpdateTasks(t :> [t EXCEPT !.status.state = assigned,
+                                       !.node = node ])
+
+=============================================================================
+
+---------------------------- MODULE Reaper ----------------------------------
+
+\* Actions performed by the reaper
+
+(* Forget about tasks in remove or orphan states.
+
+   Orphaned tasks belong to nodes that we are assuming are lost forever (or have crashed
+   and will come up with nothing running, which is an equally fine outcome). *)
+Reaper ==
+  /\ UNCHANGED << services, nodes, nEvents >>
+  /\ \E t \in tasks :
+       /\ \/ /\ t.desired_state = remove
+             /\ (State(t) \prec assigned \/ ~Runnable(t)) \* Not owned by agent
+          \/ State(t) = orphaned
+       /\ tasks' = tasks \ {t}
+
+=============================================================================
+
+\* The complete system
+
+\* Import definitions from the various modules
+INSTANCE User
+INSTANCE Orchestrator
+INSTANCE Allocator
+INSTANCE Scheduler
+INSTANCE Reaper
+
+\* All the variables
+vars == << tasks, services, nodes, nEvents >>
+
+\* Initially there are no tasks and no services, and all nodes are up.
+Init ==
+  /\ tasks = {}
+  /\ services = << >>
+  /\ nodes = [ n \in Node |-> nodeUp ]
+  /\ InitEvents
+
+(* WorkerSpec doesn't mention `services'. To combine it with this spec, we need
+   to say that every action of the agent leaves `services' unchanged. *)
+AgentReal ==
+  Agent /\ UNCHANGED services
+
+(* Unfortunately, `AgentReal' causes TLC to report all problems of the agent
+   as simply `AgentReal' steps, which isn't very helpful. We can get better
+   diagnostics by expanding it, like this: *)
+AgentTLC ==
+  \/ (ProgressTask     /\ UNCHANGED services)
+  \/ (ShutdownComplete /\ UNCHANGED services)
+  \/ (OrphanTasks      /\ UNCHANGED services)
+  \/ (WorkerUp         /\ UNCHANGED services)
+  \/ (RejectTask       /\ UNCHANGED services)
+  \/ (ContainerExit    /\ UNCHANGED services)
+  \/ (WorkerDown       /\ UNCHANGED services)
+
+(* To avoid the risk of `AgentTLC' getting out of sync,
+   TLAPS can check that the definitions are equivalent. *)
+THEOREM AgentTLC = AgentReal
+BY DEF AgentTLC, AgentReal, Agent, AgentProgress
+
+(* A next step is one in which any of these sub-components takes a step: *)
+Next ==
+  \/ User
+  \/ Orchestrator
+  \/ Allocator
+  \/ Scheduler
+  \/ AgentTLC
+  \/ Reaper
+  \* For model checking: don't report deadlock if we're limiting events
+  \/ (nEvents = maxEvents /\ UNCHANGED vars)
+
+(* This is a ``temporal formula''. It takes a sequence of states representing the
+   changing state of the world and evaluates to TRUE if that sequence of states is
+   a possible behaviour of SwarmKit. *)
+Spec ==
+  \* The first state in the behaviour must satisfy Init:
+  /\ Init
+  \* All consecutive pairs of states must satisfy Next or leave `vars' unchanged:
+  /\ [][Next]_vars
+  (* Some actions are required to happen eventually. For example, a behaviour in
+     which SwarmKit stops doing anything forever, even though it could advance some task
+     from the `new' state, isn't a valid behaviour of the system.
+     This property is called ``weak fairness''. *)
+  /\ WF_vars(OrchestratorProgress)
+  /\ WF_vars(AllocatorProgress)
+  /\ WF_vars(Scheduler)
+  /\ WF_vars(AgentProgress /\ UNCHANGED services)
+  /\ WF_vars(Reaper)
+  /\ WF_vars(WorkerUp /\ UNCHANGED services)
+  (* We don't require fairness of:
+     - User (we don't control them),
+     - RestartTask (services aren't required to be updated),
+     - RejectTask (tasks aren't required to be rejected),
+     - ContainerExit (we don't specify image behaviour) or
+     - WorkerDown (workers aren't required to fail). *)
+
+-------------------------------------------------------------------------------
+\* Properties to verify
+
+(* These are properties that should follow automatically if the system behaves as
+   described by `Spec' in the previous section. *)
+
+\* A state invariant (things that should be true in every state).
+Inv ==
+  \A t \in tasks :
+    (* Every task has a service:
+
+       TODO: The spec says: ``In some cases, there are tasks that exist independent of any service.
+       These do not have a value set in service_id.''. Add an example of one. *)
+    /\ t.service \in DOMAIN services
+    \* Tasks have nodes once they reach `assigned', except maybe if rejected:
+    /\ assigned \preceq State(t) => t.node \in Node \/ State(t) = rejected
+    \* `remove' is only used as a desired state, not an actual one:
+    /\ State(t) # remove
+    \* Task IDs are unique
+    /\ \A t2 \in tasks : Id(t) = Id(t2) => t = t2
+
+(* The state of task `i' in `S', or `null' if it doesn't exist *)
+Get(S, i) ==
+  LET cand == { x \in S : Id(x) = i }
+  IN  IF cand = {} THEN null
+      ELSE State(CHOOSE x \in cand : TRUE)
+
+(* An action in which all transitions were valid.
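+   For example, in a step where only the allocator acts, any task that changed
+   state must have gone from `new' to `pending'; any other change would make its
+   << old, new >> pair fall outside `permitted'. *)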
+StepTransitionsOK ==
+  LET permitted == { << x, x >> : x \in TaskState } \union \* No change is always OK
+        CASE Orchestrator -> Transitions.orchestrator
+          [] Allocator    -> Transitions.allocator
+          [] Scheduler    -> Transitions.scheduler
+          [] Agent        -> Transitions.agent
+          [] Reaper       -> Transitions.reaper
+          [] OTHER        -> {}
+      oldIds == IdSet(tasks)
+      newIds == IdSet(tasks')
+  IN
+  \A id \in newIds \union oldIds :
+    << Get(tasks, id), Get(tasks', id) >> \in permitted
+
+(* Some of the expressions below are ``temporal formulas''. Unlike state expressions and actions,
+   these look at a complete behaviour (sequence of states). Summary of notation:
+
+   [] means ``always''. e.g. []x=3 means that `x = 3' in all states.
+
+   <> means ``eventually''. e.g. <>x=3 means that `x = 3' in some state.
+
+   `x=3' on its own means that `x=3' in the initial state.
+*)
+
+\* A temporal formula that checks every step satisfies StepTransitionsOK (or `vars' is unchanged)
+TransitionsOK ==
+  [][StepTransitionsOK]_vars
+
+(* Every service has the right number of running tasks (the system is in the desired state). *)
+InDesiredState ==
+  \A sid \in DOMAIN services :
+    \* We're not trying to remove the service:
+    /\ ~services[sid].remove
+    \* The service has the correct set of running replicas:
+    /\ LET runningTasks == { t \in TasksOf(sid) : State(t) = running }
+           nRunning == Cardinality(runningTasks)
+       IN
+       CASE IsReplicated(sid) ->
+              /\ nRunning = services[sid].replicas
+         [] IsGlobal(sid) ->
+              \* We have as many tasks as nodes:
+              /\ nRunning = Cardinality(Node)
+              \* We have a task for every node:
+              /\ { t.node : t \in runningTasks } = Node
+    \* The service does not have too many terminated tasks
+    /\ \A slot \in VSlotsOf(sid) :
+         LET terminated == { t \in TasksOfVSlot(sid, slot) : ~Runnable(t) }
+         IN  Cardinality(terminated) <= maxTerminated
+
+(* The main property we want to check.
+
+   []<> means ``always eventually'' (``infinitely-often'')
+
+   <>[] means ``eventually always'' (always true after some point)
+
+   This temporal formula says that if we only experience a finite number of
+   problems then the system will eventually settle on InDesiredState.
+*)
+EventuallyAsDesired ==
+  \/ []<><UpdateService \/ RemoveService>_vars \* Either the user keeps changing the configuration,
+  \/ []<><RestartTask>_vars                    \* or restarting/updating tasks,
+  \/ []<><WorkerDown>_vars                     \* or workers keep failing,
+  \/ []<><RejectTask>_vars                     \* or workers keep rejecting tasks,
+  \/ []<><ContainerExit>_vars                  \* or the containers keep exiting,
+  \/ <>[] InDesiredState                       \* or we eventually get to the desired state and stay there.
+
+=============================================================================
diff --git a/design/tla/Tasks.tla b/design/tla/Tasks.tla
new file mode 100644
index 0000000000..f0b1569659
--- /dev/null
+++ b/design/tla/Tasks.tla
@@ -0,0 +1,112 @@
+---------------------------- MODULE Tasks ----------------------------------
+
+EXTENDS TLC, Types
+
+VARIABLE tasks \* The set of currently-allocated tasks
+
+(* The expected type of each variable. TLA+ is an untyped language, but the model checker
+   can check that TasksTypeOK is true for every reachable state. *)
+TasksTypeOK ==
+  \* `tasks' is a subset of the set of all possible tasks
+  /\ tasks \in SUBSET Task
+
+(* Update `tasks' by performing each update in `f', which is a function
+   mapping old tasks to new ones.
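+   For example, `.UpdateTasks(t :> [t EXCEPT !.status.state = pending]).'
+   replaces the single task `t' with a copy of itself in the `pending' state.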
*) +UpdateTasks(f) == + /\ Assert(\A t \in DOMAIN f : t \in tasks, "An old task does not exist!") + /\ Assert(\A t \in DOMAIN f : + LET t2 == f[t] + IN \* The updated version of `t' must have + /\ t.id = t2.id \* the same task ID, + /\ t.service = t2.service \* the same service ID, + /\ VSlot(t) = VSlot(t2), \* and the same vslot. + "An update changes a task's identity!") + \* Remove all the old tasks and add the new ones: + /\ tasks' = (tasks \ DOMAIN f) \union Range(f) + +(* A `new' task belonging to service `sid' with the given slot, id, and desired state. *) +NewTask(sid, vslot, id, desired_state) == + [ + id |-> id, + service |-> sid, + status |-> [ state |-> new ], + desired_state |-> desired_state, + node |-> IF vslot \in Node THEN vslot ELSE unassigned, + slot |-> IF vslot \in Slot THEN vslot ELSE global + ] + + +\* A special ``state'' used when a task doesn't exist. +null == "null" + +(* All the possible transitions, grouped by the component that performs them. *) +Transitions == [ + orchestrator |-> { + << null, new >> + }, + + allocator |-> { + << new, pending >> + }, + + scheduler |-> { + << pending, assigned >> + }, + + agent |-> { + << assigned, accepted >>, + << accepted, preparing >>, + << preparing, ready >>, + << ready, starting >>, + << starting, running >>, + + << assigned, rejected >>, + << accepted, rejected >>, + << preparing, rejected >>, + << ready, rejected >>, + << starting, rejected >>, + + << running, complete >>, + << running, failed >>, + + << running, shutdown >>, + + << assigned, orphaned >>, + << accepted, orphaned >>, + << preparing, orphaned >>, + << ready, orphaned >>, + << starting, orphaned >>, + << running, orphaned >> + }, + + reaper |-> { + << new, null >>, + << pending, null >>, + << rejected, null >>, + << complete, null >>, + << failed, null >>, + << shutdown, null >>, + << orphaned, null >> + } +] + +(* Check that `Transitions' itself is OK. *) +TransitionTableOK == + \* No transition moves to a lower-ranked state: + /\ \A actor \in DOMAIN Transitions : + \A trans \in Transitions[actor] : + \/ trans[1] = null + \/ trans[2] = null + \/ trans[1] \preceq trans[2] + (* Every source state has exactly one component which handles transitions out of that state. + Except for the case of the reaper removing `new' and `pending' tasks that are flagged + for removal. *) + /\ \A a1, a2 \in DOMAIN Transitions : + LET exceptions == { << new, null >>, << pending, null >> } + Source(a) == { s[1] : s \in Transitions[a] \ exceptions} + IN a1 # a2 => + Source(a1) \intersect Source(a2) = {} + +ASSUME TransitionTableOK \* Note: ASSUME means ``check'' to TLC + +============================================================================= \ No newline at end of file diff --git a/design/tla/Types.tla b/design/tla/Types.tla new file mode 100644 index 0000000000..555ad4c68f --- /dev/null +++ b/design/tla/Types.tla @@ -0,0 +1,124 @@ +------------------------------- MODULE Types ------------------------------- + +EXTENDS Naturals, FiniteSets + +(* A generic operator to get the range of a function (the set of values in a map): *) +Range(S) == { S[i] : i \in DOMAIN S } + +(* The set of worker nodes. + + Note: a CONSTANT is an input to the model. The model should work with any set of nodes you provide. + + TODO: should cope with this changing at runtime, and with draining nodes. *) +CONSTANT Node + +(* A special value indicating that a task is not yet assigned to a node. + + Note: this TLA+ CHOOSE idiom just says to pick some value that isn't a Node (e.g. `null'). 
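+   CHOOSE is deterministic: every occurrence of `unassigned' denotes the same
+   value; we just don't say which value that is. *)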
+unassigned == CHOOSE n : n \notin Node
+
+(* The type (set) of service IDs (e.g. `Int' or `String').
+   When model checking, this will be some small set (e.g. {"s1", "s2"}). *)
+CONSTANT ServiceId
+
+(* The type of task IDs. *)
+CONSTANT TaskId
+
+(* The maximum possible value for `replicas' in ServiceSpec. *)
+CONSTANT maxReplicas
+ASSUME maxReplicas \in Nat
+Slot == 1..maxReplicas \* Possible slot numbers
+
+(* A special value (e.g. `-1') indicating that we want one replica running on each node: *)
+global == CHOOSE x : x \notin Nat
+
+(* The type of a description of a service (a struct/record).
+   This is provided by, and only changed by, the user. *)
+ServiceSpec == [
+  (* The replicas field is either a count giving the desired number of replicas,
+     or the special value `global'. *)
+  replicas : 0..maxReplicas \union {global},
+  remove : BOOLEAN \* The user wants to remove this service
+]
+
+(* The possible states for a task: *)
+new == "new"
+pending == "pending"
+assigned == "assigned"
+accepted == "accepted"
+preparing == "preparing"
+ready == "ready"
+starting == "starting"
+running == "running"
+complete == "complete"
+shutdown == "shutdown"
+failed == "failed"
+rejected == "rejected"
+remove == "remove" \* Only used as a ``desired state'', not an actual state
+orphaned == "orphaned"
+
+(* Every state has a rank. It is only possible for a task to change
+   state to a state with a higher rank (later in this sequence). *)
+order == << new, pending, assigned, accepted,
+            preparing, ready, starting,
+            running,
+            complete, shutdown, failed, rejected,
+            remove, orphaned >>
+
+(* Maps a state to its position in `order' (e.g. StateRank(new) = 1): *)
+StateRank(s) == CHOOSE i \in DOMAIN order : order[i] = s
+
+(* Convenient notation for comparing states: *)
+s1 \prec s2 == StateRank(s1) < StateRank(s2)
+s1 \preceq s2 == StateRank(s1) <= StateRank(s2)
+
+(* The set of possible states ({new, pending, ...}): *)
+TaskState == Range(order) \ {remove}
+
+(* Possibly this doesn't need to be a record, but we might want to add extra fields later. *)
+TaskStatus == [
+  state : TaskState
+]
+
+(* The state that SwarmKit wants the task to be in. *)
+DesiredState == { ready, running, shutdown, remove }
+
+(* This has every field mentioned in `task_model.md' except for `spec', which
+   it doesn't seem to use for anything.
+
+   `desired_state' can be any state, although currently we only ever set it to one of
+   {ready, running, shutdown, remove}. *)
+Task == [
+  id : TaskId,                       \* To uniquely identify this task
+  service : ServiceId,               \* The service that owns the task
+  status : TaskStatus,               \* The current state
+  desired_state : DesiredState,      \* The state requested by the orchestrator
+  node : Node \union {unassigned},   \* The node on which the task should be run
+  slot : Slot \union {global}        \* A way of tracking related tasks
+]
+
+(* The current state of task `t'. *)
+State(t) == t.status.state
+
+(* A task is runnable if it is running or could become running in the future. *)
+Runnable(t) == State(t) \preceq running
+
+(* A task's ``virtual slot'' is its actual slot for replicated services,
+   but its node for global ones. *)
+VSlot(t) ==
+  IF t.slot = global THEN t.node ELSE t.slot
+
+(* In the real SwarmKit, a task's ID is just its taskId field.
+   However, this requires lots of IDs, which is expensive for model checking.
+   So instead, we will identify tasks by their << serviceId, vSlot, taskId >>
+   triple, and only require taskId to be unique within its vslot.
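+   For example, << s1, 1, t1 >> and << s1, 2, t1 >> are two different tasks,
+   even though they share the underlying TaskId `t1'.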
*) +ModelTaskId == ServiceId \X (Slot \union Node) \X TaskId + +(* A unique identifier for a task, which never changes. *) +Id(t) == + << t.service, VSlot(t), t.id >> \* A ModelTaskId + +(* The ModelTaskIds of a set of tasks. *) +IdSet(S) == { Id(t) : t \in S } + +============================================================================= diff --git a/design/tla/WorkerImpl.tla b/design/tla/WorkerImpl.tla new file mode 100644 index 0000000000..03bf39bf32 --- /dev/null +++ b/design/tla/WorkerImpl.tla @@ -0,0 +1,321 @@ +---------------------------- MODULE WorkerImpl ---------------------------------- + +EXTENDS TLC, Types, Tasks, EventCounter + +(* +`WorkerSpec' provides a high-level specification of worker nodes that only refers to +the state of the tasks recorded in SwarmKit's store. This specification (WorkerImpl) +refines WorkerSpec by also modelling the state of the containers running on a node. +It should be easier to see that this lower-level specification corresponds to what +actually happens on worker nodes. + +The reason for having this in a separate specification is that including the container +state greatly increases the number of states to be considered and so slows down model +checking. Instead of checking + + SwarmKit /\ WorkerImpl => EventuallyAsDesired + +(which is very slow), we check two separate expressions: + + SwarmKit /\ WorkerSpec => EventuallyAsDesired + WorkerImpl => WorkerSpec + +TLAPS can check that separating the specification in this way makes sense: *) +THEOREM ASSUME TEMPORAL SwarmKit, TEMPORAL WorkerSpec, + TEMPORAL WorkerImpl, TEMPORAL EventuallyAsDesired, + TEMPORAL Env, \* A simplified version of SwarmKit + SwarmKit /\ WorkerSpec => EventuallyAsDesired, + Env /\ WorkerImpl => WorkerSpec, + SwarmKit => Env + PROVE SwarmKit /\ WorkerImpl => EventuallyAsDesired +OBVIOUS + +\* This worker's node ID +CONSTANT node +ASSUME node \in Node + +VARIABLES nodes \* Defined in WorkerSpec.tla +VARIABLE containers \* The actual container state on the node, indexed by ModelTaskId + +(* The high-level specification of worker nodes. + This module should be a refinement of `WS'. *) +WS == INSTANCE WorkerSpec + +terminating == "terminating" \* A container which we're trying to stop + +(* The state of an actual container on a worker node. *) +ContainerState == { running, terminating, complete, failed } + +(* A running container finishes running on its own (or crashes). *) +ContainerExit == + /\ UNCHANGED << nodes, tasks >> + /\ CountEvent + /\ \E id \in DOMAIN containers, + s2 \in {failed, complete} : \* Either a successful or failed exit status + /\ containers[id] = running + /\ containers' = [containers EXCEPT ![id] = s2] + +(* A running container finishes because we stopped it. *) +ShutdownComplete == + /\ UNCHANGED << nodes, tasks, nEvents >> + /\ \E id \in DOMAIN containers : + /\ containers[id] = terminating + /\ containers' = [containers EXCEPT ![id] = failed] + +(* SwarmKit thinks the node is up. i.e. the agent is connected to a manager. *) +IsUp(n) == WS!IsUp(n) + +(* The new value that `containers' should take after getting an update from the + managers. If the managers asked us to run a container and then stop mentioning + that task, we shut the container down and (once stopped) remove it. *) +DesiredContainers == + LET WantShutdown(id) == + \* The managers stop mentioning the task, or ask for it to be stopped. 
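+           \* (Here `running \prec desired_state' means the desired state is
+           \*  `shutdown' or `remove'.)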
+ \/ id \notin IdSet(tasks) + \/ running \prec (CHOOSE t \in tasks : Id(t) = id).desired_state + (* Remove containers that no longer have tasks, once they've stopped. *) + rm == { id \in DOMAIN containers : + /\ containers[id] \in { complete, failed } + /\ id \notin IdSet(tasks) } + IN [ id \in DOMAIN containers \ rm |-> + IF containers[id] = running /\ WantShutdown(id) THEN terminating + ELSE containers[id] + ] + +(* The updates that SwarmKit should apply to its store to bring it up-to-date + with the real state of the containers. *) +RequiredTaskUpdates == + LET \* Tasks the manager is expecting news about: + oldTasks == { t \in tasks : t.node = node /\ State(t) = running } + \* The state to report for task `t': + ReportFor(t) == + IF Id(t) \notin DOMAIN containers THEN \* We were asked to forget about this container. + shutdown \* SwarmKit doesn't care which terminal state we finish in. + ELSE IF /\ containers[Id(t)] = failed \* It's terminated and + /\ t.desired_state = shutdown THEN \* we wanted to shut it down, + shutdown \* Report a successful shutdown + ELSE IF containers[Id(t)] = terminating THEN + running \* SwarmKit doesn't record progress of the shutdown + ELSE + containers[Id(t)] \* Report the actual state + IN [ t \in oldTasks |-> [ t EXCEPT !.status.state = ReportFor(t) ]] + +(* Our node synchronises its state with a manager. *) +DoSync == + /\ containers' = DesiredContainers + /\ UpdateTasks(RequiredTaskUpdates) + +(* Try to advance containers towards `desired_state' if we're not there yet. + + XXX: do we need a connection to the manager to do this, or can we make progress + while disconnected and just report the final state? +*) +ProgressTask == + /\ UNCHANGED << nodes, nEvents >> + /\ \E t \in tasks, + s2 \in TaskState : \* The state we want to move to + LET t2 == [t EXCEPT !.status.state = s2] + IN + /\ s2 \preceq t.desired_state \* Can't be after the desired state + /\ << State(t), State(t2) >> \in { \* Possible ``progress'' (desirable) transitions + << assigned, accepted >>, + << accepted, preparing >>, + << preparing, ready >>, + << ready, starting >>, + << starting, running >> + } + /\ IsUp(t.node) \* Node must be connected to SwarmKit + /\ IF s2 = running THEN + \* The container started running + containers' = Id(t) :> running @@ containers + ELSE + UNCHANGED containers + /\ UpdateTasks(t :> t2) + +(* The agent on the node synchronises with a manager. *) +SyncWithManager == + /\ UNCHANGED << nodes, nEvents >> + /\ IsUp(node) + /\ DoSync + +(* We can reject a task once we're responsible for it (it has reached `assigned') + until it reaches the `running' state. + Note that an ``accepted'' task can still be rejected. *) +RejectTask == + /\ UNCHANGED << nodes, containers >> + /\ CountEvent + /\ \E t \in tasks : + /\ State(t) \in { assigned, accepted, preparing, ready, starting } + /\ t.node = node + /\ IsUp(node) + /\ UpdateTasks(t :> [t EXCEPT !.status.state = rejected]) + +(* The dispatcher notices that the worker is down (the connection is lost). *) +WorkerDown == + /\ UNCHANGED << tasks, containers >> + /\ CountEvent + /\ \E n \in Node : + /\ IsUp(n) + /\ nodes' = [nodes EXCEPT ![n] = WS!nodeDown] + +(* When the node reconnects to the cluster, it gets an assignment set from the dispatcher + which does not include any tasks that have been marked orphaned and then deleted. + Any time an agent gets an assignment set that does not include some task it has running, + it shuts down those tasks. + + We model this separately with the `SyncWithManager' action. 
*) +WorkerUp == + /\ UNCHANGED << nEvents, containers, tasks >> + /\ \E n \in Node : + /\ ~IsUp(n) + /\ nodes' = [nodes EXCEPT ![n] = WS!nodeUp] + +(* Tasks assigned to a node and for which the node is responsible. *) +TasksOwnedByNode(n) == { t \in tasks : + /\ t.node = n + /\ assigned \preceq State(t) + /\ State(t) \prec remove +} + +(* If SwarmKit sees a node as down for a long time (48 hours or so) then + it marks all the node's tasks as orphaned. + Note that this sets the actual state, not the desired state. + + ``Moving a task to the Orphaned state is not desirable, + because it's the one case where we break the otherwise invariant + that the agent sets all states past ASSIGNED.'' +*) +OrphanTasks == + /\ UNCHANGED << nodes, containers, nEvents >> + /\ LET affected == { t \in TasksOwnedByNode(node) : Runnable(t) } + IN + /\ ~IsUp(node) \* Our connection to the agent is down + /\ UpdateTasks([ t \in affected |-> + [t EXCEPT !.status.state = orphaned] ]) + +(* The worker reboots. All containers are terminated. *) +WorkerReboot == + /\ UNCHANGED << nodes, tasks >> + /\ CountEvent + /\ containers' = [ id \in DOMAIN containers |-> + LET state == containers[id] + IN CASE state \in {running, terminating} -> failed + [] state \in {complete, failed} -> state + ] + +(* Actions we require to happen eventually when possible. *) +AgentProgress == + \/ ProgressTask + \/ OrphanTasks + \/ WorkerUp + \/ ShutdownComplete + \/ SyncWithManager + +(* All actions of the agent/worker. *) +Agent == + \/ AgentProgress + \/ RejectTask + \/ WorkerDown + \/ ContainerExit + \/ WorkerReboot + +------------------------------------------------------------------------------- +\* A simplified model of the rest of the system + +(* A new task is created. *) +CreateTask == + /\ UNCHANGED << containers, nEvents, nodes >> + /\ \E t \in Task : \* `t' is the new task + \* Don't reuse IDs (only really an issue for model checking) + /\ Id(t) \notin IdSet(tasks) + /\ Id(t) \notin DOMAIN containers + /\ State(t) = new + /\ t.desired_state \in { ready, running } + /\ \/ /\ t.node = unassigned \* A task for a replicated service + /\ t.slot \in Slot + \/ /\ t.node \in Node \* A task for a global service + /\ t.slot = global + /\ ~\E t2 \in tasks : \* All tasks of a service have the same mode + /\ t.service = t2.service + /\ (t.slot = global) # (t2.slot = global) + /\ tasks' = tasks \union {t} + +(* States before `assigned' aren't shared with worker nodes, so modelling them + isn't very useful. You can use this in a model to override `CreateTask' to + speed things up a bit. It creates tasks directly in the `assigned' state. *) +CreateTaskQuick == + /\ UNCHANGED << containers, nEvents, nodes >> + /\ \E t \in Task : + /\ Id(t) \notin IdSet(tasks) + /\ Id(t) \notin DOMAIN containers + /\ State(t) = assigned + /\ t.desired_state \in { ready, running } + /\ t.node \in Node + /\ t.slot \in Slot \union {global} + /\ ~\E t2 \in tasks : \* All tasks of a service have the same mode + /\ t.service = t2.service + /\ (t.slot = global) # (t2.slot = global) + /\ tasks' = tasks \union {t} + +(* The state or desired_state of a task is updated. 
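+   This models any change made by the managers or by the agents of other nodes:
+   any legal transition may happen, except those that only our own agent could
+   perform.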
*) +UpdateTask == + /\ UNCHANGED << containers, nEvents, nodes >> + /\ \E t \in tasks, t2 \in Task : \* `t' becomes `t2' + /\ Id(t) = Id(t2) \* The ID can't change + /\ State(t) # State(t2) => \* If the state changes then + \E actor \in DOMAIN Transitions : \* it is a legal transition + /\ actor = "agent" => t.node # node \* and not one our worker does + /\ << State(t), State(t2) >> \in Transitions[actor] + \* When tasks reach the `assigned' state, they must have a node + /\ IF State(t2) = assigned /\ t.node = unassigned THEN t2.node \in Node + ELSE t2.node = t.node + /\ UpdateTasks(t :> t2) + +(* The reaper removes a task. *) +RemoveTask == + /\ UNCHANGED << containers, nEvents, nodes >> + /\ \E t \in tasks : + /\ << State(t), null >> \in Transitions.reaper + /\ tasks' = tasks \ {t} + +(* Actions of our worker's environment (i.e. SwarmKit and other workers). *) +OtherComponent == + \/ CreateTask + \/ UpdateTask + \/ RemoveTask + +------------------------------------------------------------------------------- +\* A complete system + +vars == << tasks, nEvents, nodes, containers >> + +Init == + /\ tasks = {} + /\ containers = << >> + /\ nodes = [ n \in Node |-> WS!nodeUp ] + /\ InitEvents + +Next == + \/ OtherComponent + \/ Agent + +(* The specification for our worker node. *) +Impl == Init /\ [][Next]_vars /\ WF_vars(AgentProgress) + +------------------------------------------------------------------------------- + +TypeOK == + /\ TasksTypeOK + \* The node's container map maps IDs to states + /\ DOMAIN containers \in SUBSET ModelTaskId + /\ containers \in [ DOMAIN containers -> ContainerState ] + +wsVars == << tasks, nEvents, nodes >> + +(* We want to check that a worker implementing `Impl' is also implementing + `WorkerSpec'. i.e. we need to check that Impl => WSSpec. *) +WSSpec == + /\ [][WS!Agent \/ OtherComponent]_wsVars + /\ WF_wsVars(WS!AgentProgress) + +============================================================================= diff --git a/design/tla/WorkerSpec.tla b/design/tla/WorkerSpec.tla new file mode 100644 index 0000000000..e56b8009b3 --- /dev/null +++ b/design/tla/WorkerSpec.tla @@ -0,0 +1,133 @@ +----------------------------- MODULE WorkerSpec ----------------------------- + +EXTENDS Types, Tasks, EventCounter + +VARIABLE nodes \* Maps nodes to SwarmKit's view of their NodeState + +(* The possible states of a node, as recorded by SwarmKit. *) +nodeUp == "up" +nodeDown == "down" +NodeState == { nodeUp, nodeDown } + +WorkerTypeOK == + \* Nodes are up or down + /\ nodes \in [ Node -> NodeState ] + +----------------------------------------------------------------------------- + +\* Actions performed by worker nodes (actually, by the dispatcher on their behalf) + +(* SwarmKit thinks the node is up. i.e. the agent is connected to a manager. *) +IsUp(n) == nodes[n] = nodeUp + +(* Try to advance containers towards `desired_state' if we're not there yet. *) +ProgressTask == + /\ UNCHANGED << nodes, nEvents >> + /\ \E t \in tasks, + s2 \in TaskState : \* The state we want to move to + LET t2 == [t EXCEPT !.status.state = s2] + IN + /\ s2 \preceq t.desired_state \* Can't be after the desired state + /\ << State(t), State(t2) >> \in { \* Possible ``progress'' (desirable) transitions + << assigned, accepted >>, + << accepted, preparing >>, + << preparing, ready >>, + << ready, starting >>, + << starting, running >> + } + /\ IsUp(t.node) \* Node must be connected to SwarmKit + /\ UpdateTasks(t :> t2) + +(* A running container finishes because we stopped it. 
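+   At this level we don't model the containers themselves, so stopping is
+   reported in a single step as a task transition from `running' to `shutdown'.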
*) +ShutdownComplete == + /\ UNCHANGED << nodes, nEvents >> + /\ \E t \in tasks : + /\ t.desired_state \in {shutdown, remove} \* We are trying to stop it + /\ State(t) = running \* It is currently running + /\ IsUp(t.node) + /\ UpdateTasks(t :> [t EXCEPT !.status.state = shutdown]) \* It becomes shutdown + +(* A node can reject a task once it's responsible for it (it has reached `assigned') + until it reaches the `running' state. + Note that an ``accepted'' task can still be rejected. *) +RejectTask == + /\ UNCHANGED << nodes >> + /\ CountEvent + /\ \E t \in tasks : + /\ State(t) \in { assigned, accepted, preparing, ready, starting } + /\ IsUp(t.node) + /\ UpdateTasks(t :> [t EXCEPT !.status.state = rejected]) + +(* We notify the managers that some running containers have finished. + There might be several updates at once (e.g. if we're reconnecting). *) +ContainerExit == + /\ UNCHANGED << nodes >> + /\ CountEvent + /\ \E n \in Node : + /\ IsUp(n) + /\ \E ts \in SUBSET { t \in tasks : t.node = n /\ State(t) = running } : + \* Each container could have ended in either state: + \E s2 \in [ ts -> { failed, complete } ] : + UpdateTasks( [ t \in ts |-> + [t EXCEPT !.status.state = + \* Report `failed' as `shutdown' if we wanted to shut down + IF s2[t] = failed /\ t.desired_state = shutdown THEN shutdown + ELSE s2[t]] + ] ) + +(* Tasks assigned to a node and for which the node is responsible. *) +TasksOwnedByNode(n) == { t \in tasks : + /\ t.node = n + /\ assigned \preceq State(t) + /\ State(t) \prec remove +} + +(* The dispatcher notices that the worker is down (the connection is lost). *) +WorkerDown == + /\ UNCHANGED << tasks >> + /\ CountEvent + /\ \E n \in Node : + /\ IsUp(n) + /\ nodes' = [nodes EXCEPT ![n] = nodeDown] + +(* When the node reconnects to the cluster, it gets an assignment set from the dispatcher + which does not include any tasks that have been marked orphaned and then deleted. + Any time an agent gets an assignment set that does not include some task it has running, + it shuts down those tasks. *) +WorkerUp == + /\ UNCHANGED << tasks, nEvents >> + /\ \E n \in Node : + /\ ~IsUp(n) + /\ nodes' = [nodes EXCEPT ![n] = nodeUp] + +(* If SwarmKit sees a node as down for a long time (48 hours or so) then + it marks all the node's tasks as orphaned. + + ``Moving a task to the Orphaned state is not desirable, + because it's the one case where we break the otherwise invariant + that the agent sets all states past ASSIGNED.'' +*) +OrphanTasks == + /\ UNCHANGED << nodes, nEvents >> + /\ \E n \in Node : + LET affected == { t \in TasksOwnedByNode(n) : Runnable(t) } + IN + /\ ~IsUp(n) \* Node `n' is still detected as down + /\ UpdateTasks([ t \in affected |-> + [t EXCEPT !.status.state = orphaned] ]) + +(* Actions we require to happen eventually when possible. *) +AgentProgress == + \/ ProgressTask + \/ ShutdownComplete + \/ OrphanTasks + \/ WorkerUp + +(* All actions of the agent/worker. 
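+   Only the `AgentProgress' actions are required to happen eventually;
+   RejectTask, ContainerExit and WorkerDown may happen but are never required,
+   matching the fairness assumptions in SwarmKit.tla.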
*) +Agent == + \/ AgentProgress + \/ RejectTask + \/ ContainerExit + \/ WorkerDown + +============================================================================= diff --git a/design/tla/models/SwarmKit.cfg b/design/tla/models/SwarmKit.cfg new file mode 100644 index 0000000000..5c92a0153d --- /dev/null +++ b/design/tla/models/SwarmKit.cfg @@ -0,0 +1,15 @@ +SPECIFICATION Spec + +CONSTANT TaskId = {t1, t2} +CONSTANT ServiceId = {s1} +CONSTANT Node = {n1} +CONSTANT maxTerminated = 1 +CONSTANT maxReplicas = 1 +CONSTANT unassigned = unassigned +CONSTANT global = global + +INVARIANT TypeOK +INVARIANT Inv + +PROPERTY TransitionsOK +PROPERTY EventuallyAsDesired diff --git a/design/tla/models/WorkerImpl.cfg b/design/tla/models/WorkerImpl.cfg new file mode 100644 index 0000000000..6f0abf0433 --- /dev/null +++ b/design/tla/models/WorkerImpl.cfg @@ -0,0 +1,14 @@ +SPECIFICATION Impl + +CONSTANT TaskId = {t1} +CONSTANT ServiceId = {s1} +CONSTANT Node = {n1} +CONSTANT node = n1 +CONSTANT maxTerminated = 1 +CONSTANT maxReplicas = 1 +CONSTANT unassigned = unassigned +CONSTANT global = global + +INVARIANT TypeOK + +PROPERTY WSSpec