Skip to content

Commit e5a2c82

Browse files
authored
[ws-manager-mk2] do cleanup of failed workspace with unknown status (#20829)
1 parent 2bce70b commit e5a2c82

File tree

2 files changed

+38
-2
lines changed

2 files changed

+38
-2
lines changed

components/ws-manager-mk2/controllers/status.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -439,10 +439,12 @@ func (r *WorkspaceReconciler) extractFailure(ctx context.Context, ws *workspacev
439439
if !ws.IsHeadless() {
440440
return fmt.Sprintf("container %s completed; containers of a workspace pod are not supposed to do that", cs.Name), nil
441441
}
442-
} else if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerUnknownExitCode {
442+
} else if !isPodBeingDeleted(pod) && terminationState.ExitCode == containerUnknownExitCode {
443+
return fmt.Sprintf("workspace container %s terminated for an unknown reason: (%s) %s", cs.Name, terminationState.Reason, terminationState.Message), nil
444+
} else if !isPodBeingDeleted(pod) {
443445
// if a container is terminated and it wasn't because of either:
444446
// - regular shutdown
445-
// - the exit code "UNKNOWN" (which might be caused by an intermittent issue and is handled in extractStatusFromPod)
447+
// - the exit code "UNKNOWN" (which might be caused by an intermittent issue)
446448
// - another known error
447449
// then we report it as UNKNOWN
448450
phase := workspacev1.WorkspacePhaseUnknown

components/ws-manager-mk2/controllers/workspace_controller_test.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,40 @@ var _ = Describe("WorkspaceController", func() {
218218
})
219219
})
220220

221+
It("should handle workspace failure with unknown exit code", func() {
222+
ws := newWorkspace(uuid.NewString(), "default")
223+
m := collectMetricCounts(wsMetrics, ws)
224+
pod := createWorkspaceExpectPod(ws)
225+
226+
markReady(ws)
227+
228+
// Update Pod with failed exit status.
229+
updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) {
230+
pod.Status.ContainerStatuses = append(pod.Status.ContainerStatuses, corev1.ContainerStatus{
231+
LastTerminationState: corev1.ContainerState{
232+
Terminated: &corev1.ContainerStateTerminated{
233+
ExitCode: containerUnknownExitCode,
234+
},
235+
},
236+
})
237+
})
238+
239+
// Controller should detect container exit and add Failed condition.
240+
expectConditionEventually(ws, string(workspacev1.WorkspaceConditionFailed), metav1.ConditionTrue, "")
241+
242+
expectFinalizerAndMarkBackupCompleted(ws, pod)
243+
244+
expectWorkspaceCleanup(ws, pod)
245+
246+
expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{
247+
restores: 1,
248+
startFailures: 0,
249+
failures: 1,
250+
stops: map[StopReason]int{StopReasonFailed: 1},
251+
backups: 1,
252+
})
253+
})
254+
221255
It("should clean up timed out workspaces", func() {
222256
ws := newWorkspace(uuid.NewString(), "default")
223257
m := collectMetricCounts(wsMetrics, ws)

0 commit comments

Comments
 (0)