From 04adf9ed179e1f1064a18043ad694b7c99c98f98 Mon Sep 17 00:00:00 2001 From: abhi Date: Thu, 4 Jan 2018 13:10:58 -0800 Subject: [PATCH] [Backport]#2468 Tracking down libnetwork/1790#issuecomment-308222053. The report is that if on a single node several services are started, and if this node is then rebooted, all the services appear to come back but some of them are no longer reachable. On probing, the cause turned out to be an invalid assignment of IP addresses to services when they were restored. Specifically, the same IP address was assigned to one service's VIP and also a different service's endpoint. The result was that packets got delivered to the wrong container and caused symptoms like services or ports unreachable. This is very likely to also be the cause of moby/#35675 and other duplicate-IP or overlapping IP issues. The reason for this problem seems to be that the code path followed when services are restored at no point contacts or informs IPAM about the IP addresses used as the restored service's VIP. So IPAM thinks that those IP addresses are still available and hands them out to endpoints and new services, causing the observed chaos. To work out the right fix, I compared the code path when a service is created from the CLI to the code path when a service is restored on reboot. To me this fix is the bit that should have always been on the restore path but was omitted. With this fix IPAM gets correctly informed and its state is consistent with what I see on the network. I have tested this fix on a single node running several services and when there are multiple nodes with multiple managers running many services (specifically 2 nodes and 2 managers). In both cases, without the fix a reboot would cause IP address overlaps on the ingress network. With the fix there are no overlaps. While the fix seems to work, I'm not sure if it is at exactly the right point in this function, or indeed if it is the right or complete fix. Please take a look and let me know. 
Signed-off-by: abhi --- manager/allocator/allocator_test.go | 130 ++++++++++++++++++ .../networkallocator/networkallocator.go | 5 + 2 files changed, 135 insertions(+) diff --git a/manager/allocator/allocator_test.go b/manager/allocator/allocator_test.go index adb24c598b..d0bf13d9fb 100644 --- a/manager/allocator/allocator_test.go +++ b/manager/allocator/allocator_test.go @@ -666,6 +666,136 @@ func TestNoDuplicateIPs(t *testing.T) { } } +func TestAllocatorRestoreForDuplicateIPs(t *testing.T) { + s := store.NewMemoryStore(nil) + assert.NotNil(t, s) + defer s.Close() + // Create 3 services with 1 task each + numsvcstsks := 3 + assert.NoError(t, s.Update(func(tx store.Tx) error { + // populate ingress network + in := &api.Network{ + ID: "ingress-nw-id", + Spec: api.NetworkSpec{ + Annotations: api.Annotations{ + Name: "default-ingress", + }, + Ingress: true, + }, + } + assert.NoError(t, store.CreateNetwork(tx, in)) + + for i := 0; i != numsvcstsks; i++ { + svc := &api.Service{ + ID: "testServiceID" + strconv.Itoa(i), + Spec: api.ServiceSpec{ + Annotations: api.Annotations{ + Name: "service" + strconv.Itoa(i), + }, + Endpoint: &api.EndpointSpec{ + Mode: api.ResolutionModeVirtualIP, + + Ports: []*api.PortConfig{ + { + Name: "", + Protocol: api.ProtocolTCP, + TargetPort: 8000, + PublishedPort: uint32(8001 + i), + }, + }, + }, + }, + Endpoint: &api.Endpoint{ + Ports: []*api.PortConfig{ + { + Name: "", + Protocol: api.ProtocolTCP, + TargetPort: 8000, + PublishedPort: uint32(8001 + i), + }, + }, + VirtualIPs: []*api.Endpoint_VirtualIP{ + { + NetworkID: "ingress-nw-id", + Addr: "10.0.0." 
+ strconv.Itoa(2+i) + "/24", + }, + }, + }, + } + assert.NoError(t, store.CreateService(tx, svc)) + } + return nil + })) + + for i := 0; i != numsvcstsks; i++ { + assert.NoError(t, s.Update(func(tx store.Tx) error { + tsk := &api.Task{ + ID: "testTaskID" + strconv.Itoa(i), + Status: api.TaskStatus{ + State: api.TaskStateNew, + }, + ServiceID: "testServiceID" + strconv.Itoa(i), + DesiredState: api.TaskStateRunning, + } + assert.NoError(t, store.CreateTask(tx, tsk)) + return nil + })) + } + + assignedVIPs := make(map[string]bool) + assignedIPs := make(map[string]bool) + hasNoIPOverlapServices := func(fakeT assert.TestingT, service *api.Service) bool { + assert.NotEqual(fakeT, len(service.Endpoint.VirtualIPs), 0) + assert.NotEqual(fakeT, len(service.Endpoint.VirtualIPs[0].Addr), 0) + + assignedVIP := service.Endpoint.VirtualIPs[0].Addr + if assignedVIPs[assignedVIP] { + t.Fatalf("service %s assigned duplicate IP %s", service.ID, assignedVIP) + } + assignedVIPs[assignedVIP] = true + if assignedIPs[assignedVIP] { + t.Fatalf("a task and service %s have the same IP %s", service.ID, assignedVIP) + } + return true + } + + hasNoIPOverlapTasks := func(fakeT assert.TestingT, s *store.MemoryStore, task *api.Task) bool { + assert.NotEqual(fakeT, len(task.Networks), 0) + assert.NotEqual(fakeT, len(task.Networks[0].Addresses), 0) + + assignedIP := task.Networks[0].Addresses[0] + if assignedIPs[assignedIP] { + t.Fatalf("task %s assigned duplicate IP %s", task.ID, assignedIP) + } + assignedIPs[assignedIP] = true + if assignedVIPs[assignedIP] { + t.Fatalf("a service and task %s have the same IP %s", task.ID, assignedIP) + } + return true + } + + a, err := New(s, nil) + assert.NoError(t, err) + assert.NotNil(t, a) + // Start allocator + go func() { + assert.NoError(t, a.Run(context.Background())) + }() + defer a.Stop() + + taskWatch, cancel := state.Watch(s.WatchQueue(), api.EventUpdateTask{}, api.EventDeleteTask{}) + defer cancel() + + serviceWatch, cancel := 
state.Watch(s.WatchQueue(), api.EventUpdateService{}, api.EventDeleteService{}) + defer cancel() + + // Confirm tasks have no IPs that overlap with the services VIPs on restart + for i := 0; i != numsvcstsks; i++ { + watchTask(t, s, taskWatch, false, hasNoIPOverlapTasks) + watchService(t, serviceWatch, false, hasNoIPOverlapServices) + } +} + func isValidNetwork(t assert.TestingT, n *api.Network) bool { if _, ok := n.Spec.Annotations.Labels["com.docker.swarm.predefined"]; ok { return true diff --git a/manager/allocator/networkallocator/networkallocator.go b/manager/allocator/networkallocator/networkallocator.go index 39b13d0007..40357cbdac 100644 --- a/manager/allocator/networkallocator/networkallocator.go +++ b/manager/allocator/networkallocator/networkallocator.go @@ -422,6 +422,11 @@ func (na *NetworkAllocator) ServiceNeedsAllocation(s *api.Service, flags ...func vipLoop: for _, vip := range s.Endpoint.VirtualIPs { if na.IsVIPOnIngressNetwork(vip) && IsIngressNetworkNeeded(s) { + // This checks the condition when ingress network is needed + // but allocation has not been done. + if _, ok := na.services[s.ID]; !ok { + return true + } continue vipLoop } for _, net := range specNetworks {