Commit fd6dee8
sdn: kill containers that fail to update on node restart
With the move to remote runtimes, we can no longer get the pod's network namespace from kubelet (since we cannot insert ourselves into the remote runtime's plugin list and intercept network plugin calls). As kubelet does not call network plugins in any way on startup if a container is already running, we have no way to ensure the container is using the correct NetNamespace (as it may have changed while openshift-node was down) at startup, unless we encode the required information into OVS flows.

But if OVS was restarted around the same time OpenShift was, those flows are lost, and we have no information with which to recover the pod's networking on node startup. In this case, kill the infra container underneath kubelet so that it will be restarted and we can set its network up again.

NOTE: this is somewhat hacky and will not work with other remote runtimes like CRI-O, but OpenShift 3.6 hardcodes dockershim so this isn't a problem yet. The "correct" solution is either to checkpoint our network configuration at container setup time and recover that ourselves, or to add a GET/STATUS call to CNI and make kubelet call that operation on startup when recovering running containers.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1453113
1 parent 234eaab commit fd6dee8
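For context, the "checkpoint our network configuration" alternative mentioned in the message might look roughly like the sketch below. The podNetworkCheckpoint type, its fields, and the one-file-per-pod layout are hypothetical illustrations of the idea, not anything this commit adds.

    package main

    import (
    	"encoding/json"
    	"fmt"
    	"os"
    	"path/filepath"
    )

    // podNetworkCheckpoint is a hypothetical record written at pod setup time
    // so pod networking could be recovered on node restart without relying on
    // OVS flows or the container runtime.
    type podNetworkCheckpoint struct {
    	PodNamespace string `json:"podNamespace"`
    	PodName      string `json:"podName"`
    	NetNS        string `json:"netns"` // e.g. /proc/<pid>/ns/net
    	VNID         uint32 `json:"vnid"`
    }

    func writeCheckpoint(dir string, cp podNetworkCheckpoint) error {
    	data, err := json.Marshal(cp)
    	if err != nil {
    		return err
    	}
    	// One file per pod, read back by the SDN on startup.
    	path := filepath.Join(dir, fmt.Sprintf("%s_%s.json", cp.PodNamespace, cp.PodName))
    	return os.WriteFile(path, data, 0600)
    }

    func main() {
    	cp := podNetworkCheckpoint{PodNamespace: "default", PodName: "nginx1", NetNS: "/proc/1234/ns/net", VNID: 42}
    	if err := writeCheckpoint(os.TempDir(), cp); err != nil {
    		fmt.Println("checkpoint failed:", err)
    	}
    }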

File tree

3 files changed: +219 −18 lines changed

pkg/sdn/plugin/node.go

Lines changed: 41 additions & 18 deletions
@@ -20,8 +20,6 @@ import (
 	"github.com/openshift/origin/pkg/util/netutils"
 	"github.com/openshift/origin/pkg/util/ovs"
 
-	docker "github.com/fsouza/go-dockerclient"
-
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/labels"
@@ -32,6 +30,7 @@ import (
 	"k8s.io/kubernetes/pkg/apis/componentconfig"
 	kclientset "k8s.io/kubernetes/pkg/client/clientset_generated/internalclientset"
 	kinternalinformers "k8s.io/kubernetes/pkg/client/informers/informers_generated/internalversion"
+	"k8s.io/kubernetes/pkg/kubelet/dockertools"
 	knetwork "k8s.io/kubernetes/pkg/kubelet/network"
 	kexec "k8s.io/kubernetes/pkg/util/exec"
 )
@@ -195,31 +194,41 @@ func (node *OsdnNode) dockerPreCNICleanup() error {
 		itx.EndTransaction()
 	}
 
-	// Wait until docker has restarted since kubelet will exit it docker isn't running
-	dockerClient, err := docker.NewClientFromEnv()
-	if err != nil {
-		return fmt.Errorf("failed to get docker client: %v", err)
+	// Wait until docker has restarted since kubelet will exit if docker isn't running
+	if _, err := ensureDockerClient(); err != nil {
+		return err
+	}
+
+	log.Infof("Cleaned up left-over openshift-sdn docker bridge and interfaces")
+
+	return nil
+}
+
+func ensureDockerClient() (dockertools.DockerInterface, error) {
+	endpoint := os.Getenv("DOCKER_HOST")
+	if endpoint == "" {
+		endpoint = "unix:///var/run/docker.sock"
 	}
-	err = kwait.ExponentialBackoff(
+	dockerClient := dockertools.ConnectToDockerOrDie(endpoint, time.Minute, time.Minute)
+
+	// Wait until docker has restarted since kubelet will exit if docker isn't running
+	err := kwait.ExponentialBackoff(
 		kwait.Backoff{
 			Duration: 100 * time.Millisecond,
 			Factor:   1.2,
 			Steps:    6,
 		},
 		func() (bool, error) {
-			if err := dockerClient.Ping(); err != nil {
+			if _, err := dockerClient.Version(); err != nil {
 				// wait longer
 				return false, nil
 			}
 			return true, nil
 		})
 	if err != nil {
-		return fmt.Errorf("failed to connect to docker after SDN cleanup restart: %v", err)
+		return nil, fmt.Errorf("failed to connect to docker: %v", err)
 	}
-
-	log.Infof("Cleaned up left-over openshift-sdn docker bridge and interfaces")
-
-	return nil
+	return dockerClient, nil
 }
 
 func (node *OsdnNode) Start() error {
@@ -271,21 +280,35 @@ func (node *OsdnNode) Start() error {
 	}
 
 	if networkChanged {
-		var pods []kapi.Pod
+		var pods, podsToKill []kapi.Pod
+
 		pods, err = node.GetLocalPods(metav1.NamespaceAll)
 		if err != nil {
 			return err
 		}
 		for _, p := range pods {
-			err = node.UpdatePod(p)
-			if err != nil {
-				log.Warningf("Could not update pod %q: %s", p.Name, err)
+			// Ignore HostNetwork pods since they don't go through OVS
+			if p.Spec.SecurityContext != nil && p.Spec.SecurityContext.HostNetwork {
 				continue
 			}
-			if vnid, err := node.policy.GetVNID(p.Namespace); err == nil {
+			if err := node.UpdatePod(p); err != nil {
+				log.Warningf("will restart pod '%s/%s' due to update failure on restart: %s", p.Namespace, p.Name, err)
+				podsToKill = append(podsToKill, p)
+			} else if vnid, err := node.policy.GetVNID(p.Namespace); err == nil {
 				node.policy.EnsureVNIDRules(vnid)
 			}
 		}
+
+		// Kill pods we couldn't recover; they will get restarted and then
+		// we'll be able to set them up correctly
+		if len(podsToKill) > 0 {
+			docker, err := ensureDockerClient()
+			if err != nil {
+				log.Warningf("failed to get docker client: %v", err)
+			} else if err := killUpdateFailedPods(docker, podsToKill); err != nil {
+				log.Warningf("failed to restart pods that failed to update at startup: %v", err)
+			}
+		}
 	}
 
 	go kwait.Forever(node.policy.SyncVNIDRules, time.Hour)
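The backoff in ensureDockerClient makes at most 6 attempts starting at 100 ms and growing by a factor of 1.2, so it gives up after roughly one second of waiting (100 + 120 + 144 + 173 + 207 + 249 ms). A stdlib-only sketch of the same retry shape, with a placeholder probe standing in for dockerClient.Version(); the exact sleep/step ordering of kwait.ExponentialBackoff may differ slightly:

    package main

    import (
    	"errors"
    	"fmt"
    	"time"
    )

    // probe stands in for dockerClient.Version(); swap in a real health check.
    func probe() error { return errors.New("docker not up yet") }

    func main() {
    	delay := 100 * time.Millisecond
    	const factor = 1.2
    	const steps = 6

    	for i := 0; i < steps; i++ {
    		if err := probe(); err == nil {
    			fmt.Println("docker is up")
    			return
    		}
    		time.Sleep(delay) // wait longer, then try again
    		delay = time.Duration(float64(delay) * factor)
    	}
    	fmt.Println("gave up waiting for docker")
    }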

pkg/sdn/plugin/update.go

Lines changed: 101 additions & 0 deletions
@@ -0,0 +1,101 @@
+package plugin
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/golang/glog"
+
+	dockertypes "github.com/docker/engine-api/types"
+
+	kapi "k8s.io/kubernetes/pkg/api"
+	kcontainer "k8s.io/kubernetes/pkg/kubelet/container"
+	"k8s.io/kubernetes/pkg/kubelet/dockertools"
+	"k8s.io/kubernetes/pkg/kubelet/leaky"
+)
+
+func formatPod(pod *kapi.Pod) string {
+	return fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)
+}
+
+// Copied from pkg/kubelet/dockershim/naming.go::parseSandboxName()
+func dockerSandboxNameToInfraPodNamePrefix(name string) (string, error) {
+	// Docker adds a "/" prefix to names, so trim it.
+	name = strings.TrimPrefix(name, "/")
+
+	parts := strings.Split(name, "_")
+	// Tolerate the random suffix.
+	// TODO(random-liu): Remove 7 field case when docker 1.11 is deprecated.
+	if len(parts) != 6 && len(parts) != 7 {
+		return "", fmt.Errorf("failed to parse the sandbox name: %q", name)
+	}
+	if parts[0] != "k8s" {
+		return "", fmt.Errorf("container is not managed by kubernetes: %q", name)
+	}
+
+	// Return /k8s_POD_name_namespace_uid
+	return fmt.Sprintf("/k8s_%s_%s_%s_%s", leaky.PodInfraContainerName, parts[2], parts[3], parts[4]), nil
+}
+
+func killInfraContainerForPod(docker dockertools.DockerInterface, containers []dockertypes.Container, cid kcontainer.ContainerID) error {
+	// FIXME: handle CRI-O; but unfortunately CRI-O supports multiple
+	// "runtimes" which depend on the filename of that runtime binary,
+	// so we have no idea what cid.Type will be.
+	if cid.Type != "docker" {
+		return fmt.Errorf("unhandled runtime %q", cid.Type)
+	}
+
+	var err error
+	var infraPrefix string
+	for _, c := range containers {
+		if c.ID == cid.ID {
+			infraPrefix, err = dockerSandboxNameToInfraPodNamePrefix(c.Names[0])
+			if err != nil {
+				return err
+			}
+			break
+		}
+	}
+	if infraPrefix == "" {
+		return fmt.Errorf("failed to generate infra container prefix from %q", cid.ID)
+	}
+	// Find and kill the infra container
+	for _, c := range containers {
+		if strings.HasPrefix(c.Names[0], infraPrefix) {
+			if err := docker.StopContainer(c.ID, 10); err != nil {
+				glog.Warningf("failed to stop infra container %q", c.ID)
+			}
+		}
+	}
+
+	return nil
+}
+
+// This function finds the ContainerID of a failed pod, parses it, and kills
+// any matching infra container for that pod.
+func killUpdateFailedPods(docker dockertools.DockerInterface, pods []kapi.Pod) error {
+	containers, err := docker.ListContainers(dockertypes.ContainerListOptions{All: true})
+	if err != nil {
+		return fmt.Errorf("failed to list docker containers: %v", err)
+	}
+
+	for _, pod := range pods {
+		// Find the first running container in the pod and use it to find the infra container
+		var cid kcontainer.ContainerID
+		for i := range pod.Status.ContainerStatuses {
+			if pod.Status.ContainerStatuses[i].State.Running != nil && pod.Status.ContainerStatuses[i].ContainerID != "" {
+				cid = kcontainer.ParseContainerID(pod.Status.ContainerStatuses[i].ContainerID)
+				break
+			}
+		}
+		if cid.IsEmpty() {
+			continue
+		}
+		glog.V(5).Infof("Killing pod %q sandbox on restart", formatPod(&pod))
+		if err := killInfraContainerForPod(docker, containers, cid); err != nil {
+			glog.Warningf("Failed to kill pod %q sandbox: %v", formatPod(&pod), err)
+			continue
+		}
+	}
+	return nil
+}
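To make the naming convention above concrete: dockershim names containers k8s_<container>_<pod>_<namespace>_<uid>_<attempt>, and leaky.PodInfraContainerName is the "POD" segment used for sandbox (infra) containers. A self-contained sketch of the same parsing, with "POD" inlined so it needs no kubernetes imports:

    package main

    import (
    	"fmt"
    	"strings"
    )

    // Mirrors dockerSandboxNameToInfraPodNamePrefix above; "POD" stands in
    // for leaky.PodInfraContainerName.
    func infraPrefix(name string) (string, error) {
    	name = strings.TrimPrefix(name, "/")
    	parts := strings.Split(name, "_")
    	if len(parts) != 6 && len(parts) != 7 {
    		return "", fmt.Errorf("failed to parse the sandbox name: %q", name)
    	}
    	if parts[0] != "k8s" {
    		return "", fmt.Errorf("container is not managed by kubernetes: %q", name)
    	}
    	// k8s_<container>_<pod>_<namespace>_<uid>_<attempt> -> /k8s_POD_<pod>_<namespace>_<uid>
    	return fmt.Sprintf("/k8s_POD_%s_%s_%s", parts[2], parts[3], parts[4]), nil
    }

    func main() {
    	prefix, err := infraPrefix("/k8s_nginx1_nginx1_default_379e14d9-562e-11e7-b251-0242ac110003_0")
    	if err != nil {
    		panic(err)
    	}
    	// Prints /k8s_POD_nginx1_default_379e14d9-562e-11e7-b251-0242ac110003,
    	// which matches the infra container name used in the test below.
    	fmt.Println(prefix)
    }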

pkg/sdn/plugin/update_test.go

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+package plugin
+
+import (
+	"fmt"
+	"testing"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kapi "k8s.io/kubernetes/pkg/api"
+	"k8s.io/kubernetes/pkg/kubelet/dockertools"
+)
+
+func TestPodKillOnFailedUpdate(t *testing.T) {
+	fakeDocker := dockertools.NewFakeDockerClient()
+	id := "509383712c59ee328a78ae99d0f9411aa99f0bdf1ecf304aa83afb58f16f0768"
+	name := "/k8s_nginx1_nginx1_default_379e14d9-562e-11e7-b251-0242ac110003_0"
+	infraId := "0e7ff50ca5399654fe3b93a21dae1d264560bc018d5f0b13e79601c1a7948d6e"
+	randomId := "71167588cc97636d2f269081579fb9668b4e42acdfdd1e1cea220f6de86a8b50"
+	fakeDocker.SetFakeRunningContainers([]*dockertools.FakeContainer{
+		{
+			ID:   id,
+			Name: name,
+		},
+		{
+			// Infra container for the above container
+			ID:   infraId,
+			Name: "/k8s_POD_nginx1_default_379e14d9-562e-11e7-b251-0242ac110003_1",
+		},
+		{
+			// Random container unrelated to first two
+			ID:   randomId,
+			Name: "/k8s_POD_blah_default_fef9db05-f5c2-4361-9244-2eb505bc61e7_1",
+		},
+	})
+
+	pods := []kapi.Pod{
+		{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "testpod1",
+				Namespace: "namespace1",
+			},
+			Status: kapi.PodStatus{
+				ContainerStatuses: []kapi.ContainerStatus{
+					{
+						Name:        "container1",
+						ContainerID: fmt.Sprintf("docker://%s", id),
+						State: kapi.ContainerState{
+							Running: &kapi.ContainerStateRunning{},
+						},
+					},
+				},
+			},
+		},
+	}
+
+	err := killUpdateFailedPods(fakeDocker, pods)
+	if err != nil {
+		t.Fatalf("Unexpected error killing update failed pods: %v", err)
+	}
+
+	// Infra container should be stopped
+	result, err := fakeDocker.InspectContainer(infraId)
+	if err != nil {
+		t.Fatalf("Unexpected error inspecting container: %v", err)
+	}
+	if result.State.Running != false {
+		t.Fatalf("Infra container was not stopped")
+	}
+
+	// Unrelated container should still be running
+	result, err = fakeDocker.InspectContainer(randomId)
+	if err != nil {
+		t.Fatalf("Unexpected error inspecting container: %v", err)
+	}
+	if result.State.Running != true {
+		t.Fatalf("Unrelated container was stopped")
+	}
+}
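Assuming a checkout of openshift/origin at this commit, the test above should run with the standard Go tooling from the repository root:

    go test ./pkg/sdn/plugin -run TestPodKillOnFailedUpdate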
