Skip to content

Retry pending deployments longer before failing them #13550

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions pkg/deploy/controller/deployment/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,21 @@ import (
"github.com/openshift/origin/pkg/util"
)

// maxRetryCount is the maximum number of times the controller will retry errors.
// The first requeue is after 5ms and subsequent requeues grow exponentially.
// This effectively can extend up to 5^10ms which caps to 1000s:
//
// 5ms, 25ms, 125ms, 625ms, 3s, 16s, 78s, 390s, 1000s, 1000s
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bparees @csrwng you guys need to be aware for what you are signing up with 60 retries ^

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thx @Kargakis What determines the rate of backoff for each requeue? Is it configurable?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not configurable currently. See here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess though you can construct your own ratelimiter

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thx

//
// The most common errors are:
//
// * failure to delete the deployer pods
// * failure to update the replication controller
// * pod may be missing from the cache once the deployment transitions to Pending.
//
// In most cases, we shouldn't need to retry up to maxRetryCount...
const maxRetryCount = 10

// fatalError is an error which can't be retried.
type fatalError string

Expand Down Expand Up @@ -71,10 +86,10 @@ type DeploymentController struct {
recorder record.EventRecorder
}

// Handle processes deployment and either creates a deployer pod or responds
// handle processes a deployment and either creates a deployer pod or responds
// to a terminal deployment status. Since this controller started using caches,
// the provided rc MUST be deep-copied beforehand (see work() in factory.go).
func (c *DeploymentController) Handle(deployment *kapi.ReplicationController) error {
func (c *DeploymentController) handle(deployment *kapi.ReplicationController, willBeDropped bool) error {
// Copy all the annotations from the deployment.
updatedAnnotations := make(map[string]string)
for key, value := range deployment.Annotations {
Expand Down Expand Up @@ -127,6 +142,7 @@ func (c *DeploymentController) Handle(deployment *kapi.ReplicationController) er
nextStatus = deployapi.DeploymentStatusPending
glog.V(4).Infof("Created deployer pod %s for deployment %s", deploymentPod.Name, deployutil.LabelForDeployment(deployment))

// Most likely dead code since we never get an error different from 404 back from the cache.
case deployerErr != nil:
// If the pod already exists, it's possible that a previous CreatePod
// succeeded but the deployment state update failed and now we're re-
Expand Down Expand Up @@ -163,6 +179,11 @@ func (c *DeploymentController) Handle(deployment *kapi.ReplicationController) er
// If the deployment is cancelled here then we deleted the deployer in a previous
// resync of the deployment.
if !deployutil.IsDeploymentCancelled(deployment) {
// Retry more before setting the deployment to Failed if it's Pending - the pod might not have
// appeared in the cache yet.
if !willBeDropped && currentStatus == deployapi.DeploymentStatusPending {
return deployerErr
}
updatedAnnotations[deployapi.DeploymentStatusReasonAnnotation] = deployapi.DeploymentFailedDeployerPodNoLongerExists
c.emitDeploymentEvent(deployment, kapi.EventTypeWarning, "Failed", fmt.Sprintf("Deployer pod %q has gone missing", deployerPodName))
deployerErr = fmt.Errorf("Failing deployment %q because its deployer pod %q disappeared", deployutil.LabelForDeployment(deployment), deployerPodName)
Expand Down Expand Up @@ -423,13 +444,15 @@ func (c *DeploymentController) handleErr(err error, key interface{}, deployment
return
}

if c.queue.NumRequeues(key) < 2 {
if c.queue.NumRequeues(key) < maxRetryCount {
c.queue.AddRateLimited(key)
return
}

msg := fmt.Sprintf("About to stop retrying %q: %v", deployment.Name, err)
if _, isActionableErr := err.(actionableError); isActionableErr {
c.emitDeploymentEvent(deployment, kapi.EventTypeWarning, "FailedRetry", fmt.Sprintf("About to stop retrying %s: %v", deployment.Name, err))
c.emitDeploymentEvent(deployment, kapi.EventTypeWarning, "FailedRetry", msg)
}
glog.V(2).Infof(msg)
c.queue.Forget(key)
}
78 changes: 54 additions & 24 deletions pkg/deploy/controller/deployment/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ func TestHandle_createPodOk(t *testing.T) {

controller := okDeploymentController(client, nil, nil, true, kapi.PodUnknown)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -211,7 +211,7 @@ func TestHandle_createPodFail(t *testing.T) {

controller := okDeploymentController(client, nil, nil, true, kapi.PodUnknown)

err := controller.Handle(deployment)
err := controller.handle(deployment, false)
if err == nil {
t.Fatalf("expected an error")
}
Expand Down Expand Up @@ -278,7 +278,7 @@ func TestHandle_deployerPodAlreadyExists(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, test.podPhase)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
Expand Down Expand Up @@ -317,7 +317,7 @@ func TestHandle_unrelatedPodAlreadyExists(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, false, kapi.PodRunning)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -358,7 +358,7 @@ func TestHandle_unrelatedPodAlreadyExistsTestScaled(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, false, kapi.PodRunning)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -416,7 +416,7 @@ func TestHandle_noop(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, test.podPhase)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
Expand Down Expand Up @@ -451,7 +451,7 @@ func TestHandle_failedTest(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, kapi.PodFailed)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -492,7 +492,7 @@ func TestHandle_cleanupPodOk(t *testing.T) {
controller := okDeploymentController(client, deployment, hookPods, true, kapi.PodSucceeded)
hookPods = append(hookPods, deployment.Name)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -537,7 +537,7 @@ func TestHandle_cleanupPodOkTest(t *testing.T) {
controller := okDeploymentController(client, deployment, hookPods, true, kapi.PodSucceeded)
hookPods = append(hookPods, deployment.Name)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -581,7 +581,7 @@ func TestHandle_cleanupPodNoop(t *testing.T) {
pod.Labels[deployapi.DeployerPodForDeploymentLabel] = "unrelated"
controller.podStore.Indexer.Update(pod)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}
}
Expand Down Expand Up @@ -609,7 +609,7 @@ func TestHandle_cleanupPodFail(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, kapi.PodSucceeded)

err := controller.Handle(deployment)
err := controller.handle(deployment, false)
if err == nil {
t.Fatal("expected an actionable error")
}
Expand Down Expand Up @@ -640,7 +640,7 @@ func TestHandle_cancelNew(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, kapi.PodRunning)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -676,7 +676,7 @@ func TestHandle_cleanupNewWithDeployers(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, kapi.PodRunning)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Fatalf("unexpected error: %v", err)
}

Expand Down Expand Up @@ -755,7 +755,7 @@ func TestHandle_cleanupPostNew(t *testing.T) {

controller := okDeploymentController(client, deployment, hookPods, true, test.podPhase)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
Expand All @@ -767,15 +767,27 @@ func TestHandle_cleanupPostNew(t *testing.T) {
}

// TestHandle_deployerPodDisappeared ensures that a pending/running deployment
// is failed when its deployer pod vanishes.
// is failed when its deployer pod vanishes. Ensure that pending deployments
// wont fail instantly on a missing deployer pod because it may take some time
// for it to appear in the pod cache.
func TestHandle_deployerPodDisappeared(t *testing.T) {
tests := []struct {
name string
phase deployapi.DeploymentStatus
name string
phase deployapi.DeploymentStatus
willBeDropped bool
shouldRetry bool
}{
{
name: "pending",
phase: deployapi.DeploymentStatusPending,
name: "pending - retry",
phase: deployapi.DeploymentStatusPending,
willBeDropped: false,
shouldRetry: true,
},
{
name: "pending - fail",
phase: deployapi.DeploymentStatusPending,
willBeDropped: true,
shouldRetry: false,
},
{
name: "running",
Expand All @@ -801,21 +813,39 @@ func TestHandle_deployerPodDisappeared(t *testing.T) {
continue
}
deployment.Annotations[deployapi.DeploymentStatusAnnotation] = string(test.phase)
updatedDeployment = deployment

controller := okDeploymentController(client, nil, nil, true, kapi.PodUnknown)

if err := controller.Handle(deployment); err != nil {
err = controller.handle(deployment, test.willBeDropped)
if !test.shouldRetry && err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
if test.shouldRetry && err == nil {
t.Errorf("%s: expected an error so that the deployment can be retried, got none", test.name)
continue
}

if !updateCalled {
if !test.shouldRetry && !updateCalled {
t.Errorf("%s: expected update", test.name)
continue
}

if e, a := deployapi.DeploymentStatusFailed, deployutil.DeploymentStatusFor(updatedDeployment); e != a {
t.Errorf("%s: expected deployment status %q, got %q", test.name, e, a)
if test.shouldRetry && updateCalled {
t.Errorf("%s: unexpected update", test.name)
continue
}

gotStatus := deployutil.DeploymentStatusFor(updatedDeployment)
if !test.shouldRetry && deployapi.DeploymentStatusFailed != gotStatus {
t.Errorf("%s: expected deployment status %q, got %q", test.name, deployapi.DeploymentStatusFailed, gotStatus)
continue
}

if test.shouldRetry && deployapi.DeploymentStatusPending != gotStatus {
t.Errorf("%s: expected deployment status %q, got %q", test.name, deployapi.DeploymentStatusPending, gotStatus)
continue
}
}
}
Expand Down Expand Up @@ -921,7 +951,7 @@ func TestHandle_transitionFromDeployer(t *testing.T) {

controller := okDeploymentController(client, deployment, nil, true, test.podPhase)

if err := controller.Handle(deployment); err != nil {
if err := controller.handle(deployment, false); err != nil {
t.Errorf("%s: unexpected error: %v", test.name, err)
continue
}
Expand Down
6 changes: 5 additions & 1 deletion pkg/deploy/controller/deployment/factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,11 @@ func (c *DeploymentController) work() bool {
return false
}

err = c.Handle(rc)
// Resist missing deployer pods from the cache in case of a pending deployment.
// Give some room for a possible rc update failure in case we decided to mark it
// failed.
willBeDropped := c.queue.NumRequeues(key) >= maxRetryCount-2
err = c.handle(rc, willBeDropped)
c.handleErr(err, key, rc)

return false
Expand Down