Commit 16abcd7

[FG:InPlacePodVerticalScaling] surface pod resize actuation errors in pod resize conditions (kubernetes#130902)
* surface pod resize actuation errors in pod resize conditions
* update kubelet unit tests
* fix all other kubelet unit tests
* fix linter error
* address feedback
* fix test failure
1 parent d8607b9 commit 16abcd7
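
The net effect of this change: the runtime manager now reports the outcome of an in-place pod resize as a dedicated ResizePodInPlace sync result, and the kubelet's SyncPod translates that result into the PodResizeInProgress pod condition, setting the Error reason and message on failure and clearing any stale error on success. A condensed sketch of that flow (a paraphrase of the kubelet.go hunk below, not verbatim code; kl, pod, and result are the kubelet, the pod being synced, and the runtime's PodSyncResult):

```go
// Condensed paraphrase of the new SyncPod logic.
for _, r := range result.SyncResults {
	if r.Action != kubecontainer.ResizePodInPlace {
		continue
	}
	if r.Error == nil {
		// Resize succeeded: keep the in-progress condition but clear any stale error.
		kl.statusManager.SetPodResizeInProgressCondition(pod.UID, "", "", true)
	} else {
		// Resize failed: surface the actuation error on the condition.
		kl.statusManager.SetPodResizeInProgressCondition(pod.UID, v1.PodReasonError, r.Message, false)
	}
}
```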

8 files changed: +216 -42 lines

pkg/kubelet/container/sync_result.go

Lines changed: 4 additions & 0 deletions
@@ -45,6 +45,8 @@ var (
 	ErrConfigPodSandbox = errors.New("ConfigPodSandboxError")
 	// ErrKillPodSandbox returned when runtime failed to stop pod's sandbox.
 	ErrKillPodSandbox = errors.New("KillPodSandboxError")
+	// ErrResizePodInPlace returned when runtime failed to resize a pod.
+	ErrResizePodInPlace = errors.New("ResizePodInPlaceError")
 )

 // SyncAction indicates different kind of actions in SyncPod() and KillPod(). Now there are only actions
@@ -68,6 +70,8 @@ const (
 	ConfigPodSandbox SyncAction = "ConfigPodSandbox"
 	// KillPodSandbox action
 	KillPodSandbox SyncAction = "KillPodSandbox"
+	// ResizePodInPlace action is included whenever any containers in the pod are resized without restart
+	ResizePodInPlace SyncAction = "ResizePodInPlace"
 )

 // SyncResult is the result of sync action.
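
For context, here is how this new action/error pair is exercised by the runtime manager later in this commit: doPodResizeAction builds a ResizePodInPlace SyncResult up front, fails it with ErrResizePodInPlace plus a human-readable message when actuation goes wrong, and hands it back to SyncPod. A minimal sketch (applyResize is a hypothetical stand-in for the real cgroup update logic):

```go
// Sketch only: mirrors the kuberuntime_manager.go changes below.
resizeResult := kubecontainer.NewSyncResult(kubecontainer.ResizePodInPlace, format.Pod(pod))
if err := applyResize(); err != nil { // applyResize stands in for the actual resize work
	resizeResult.Fail(kubecontainer.ErrResizePodInPlace, err.Error())
}
result.SyncResults = append(result.SyncResults, resizeResult)
```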

pkg/kubelet/container/testing/fake_runtime.go

Lines changed: 4 additions & 0 deletions
@@ -61,6 +61,7 @@ type FakeRuntime struct {
 	VersionInfo    string
 	APIVersionInfo string
 	RuntimeType    string
+	SyncResults    *kubecontainer.PodSyncResult
 	Err            error
 	InspectErr     error
 	StatusErr      error
@@ -238,6 +239,9 @@ func (f *FakeRuntime) SyncPod(_ context.Context, pod *v1.Pod, _ *kubecontainer.P
 	for _, c := range pod.Spec.Containers {
 		f.StartedContainers = append(f.StartedContainers, c.Name)
 	}
+	if f.SyncResults != nil {
+		return *f.SyncResults
+	}
 	// TODO(random-liu): Add SyncResult for starting and killing containers
 	if f.Err != nil {
 		result.Fail(f.Err)
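
The new SyncResults hook lets a test dictate exactly what SyncPod observes from the runtime. A small usage sketch, following the pattern of the kubelet unit test added below (pod and testKubelet come from the usual kubelet test fixtures):

```go
// Inject a failed in-place resize so SyncPod surfaces it as a pod condition.
testKubelet.fakeRuntime.SyncResults = &kubecontainer.PodSyncResult{
	SyncResults: []*kubecontainer.SyncResult{{
		Action:  kubecontainer.ResizePodInPlace,
		Target:  pod.UID,
		Error:   kubecontainer.ErrResizePodInPlace,
		Message: "could not resize pod",
	}},
}
```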

pkg/kubelet/kubelet.go

Lines changed: 15 additions & 11 deletions
@@ -2066,20 +2066,19 @@ func (kl *Kubelet) SyncPod(ctx context.Context, updateType kubetypes.SyncPodType
 	sctx := context.WithoutCancel(ctx)
 	result := kl.containerRuntime.SyncPod(sctx, pod, podStatus, pullSecrets, kl.crashLoopBackOff)
 	kl.reasonCache.Update(pod.UID, result)
-	if err := result.Error(); err != nil {
-		// Do not return error if the only failures were pods in backoff
-		for _, r := range result.SyncResults {
-			if r.Error != kubecontainer.ErrCrashLoopBackOff && r.Error != images.ErrImagePullBackOff {
-				// Do not record an event here, as we keep all event logging for sync pod failures
-				// local to container runtime, so we get better errors.
-				return false, err
+
+	for _, r := range result.SyncResults {
+		if r.Action == kubecontainer.ResizePodInPlace {
+			if r.Error == nil {
+				// The pod was resized successfully, clear any pod resize errors in the PodResizeInProgress condition.
+				kl.statusManager.SetPodResizeInProgressCondition(pod.UID, "", "", true)
+			} else {
+				kl.statusManager.SetPodResizeInProgressCondition(pod.UID, v1.PodReasonError, r.Message, false)
 			}
 		}
-
-		return false, nil
 	}

-	return false, nil
+	return false, result.Error()
 }

 // SyncTerminatingPod is expected to terminate all running containers in a pod. Once this method
@@ -2932,7 +2931,7 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
 	}
 	if kl.isPodResizeInProgress(allocatedPod, podStatus) {
 		// If a resize is in progress, make sure the cache has the correct state in case the Kubelet restarted.
-		kl.statusManager.SetPodResizeInProgressCondition(pod.UID, "", "")
+		kl.statusManager.SetPodResizeInProgressCondition(pod.UID, "", "", false)
 	} else {
 		// (Allocated == Actual) => clear the resize in-progress status.
 		kl.statusManager.ClearPodResizeInProgressCondition(pod.UID)
@@ -2961,6 +2960,11 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
 		return nil, err
 	}
 	kl.statusManager.ClearPodResizePendingCondition(pod.UID)
+
+	// Clear any errors that may have been surfaced from a previous resize. The condition will be
+	// added back as needed in the defer block, but this prevents old errors from being preserved.
+	kl.statusManager.ClearPodResizeInProgressCondition(pod.UID)
+
 	for i, container := range pod.Spec.Containers {
 		if !apiequality.Semantic.DeepEqual(container.Resources, podFromAllocation.Spec.Containers[i].Resources) {
 			key := kuberuntime.GetStableKey(pod, &container)
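
For reference, when actuation fails the surfaced condition is expected to look roughly like the following (taken from the new kubelet unit test below, with timestamps omitted):

```go
v1.PodCondition{
	Type:    v1.PodResizeInProgress,
	Status:  v1.ConditionTrue,
	Reason:  v1.PodReasonError,
	Message: "could not resize pod",
}
```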

pkg/kubelet/kubelet_test.go

Lines changed: 107 additions & 0 deletions
@@ -4138,5 +4138,112 @@ func TestCrashLoopBackOffConfiguration(t *testing.T) {
 			assert.Equalf(t, tc.expectedInitial, resultInitial, "wrong base calculated, want: %v, got %v", tc.expectedInitial, resultInitial)
 		})
 	}
+}
+
+func TestSyncPodWithErrorsDuringInPlacePodResize(t *testing.T) {
+	testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
+	defer testKubelet.Cleanup()
+	kubelet := testKubelet.kubelet
+
+	pod := podWithUIDNameNsSpec("12345678", "foo", "new", v1.PodSpec{
+		Containers: []v1.Container{
+			{Name: "bar"},
+		},
+	})
 
+	testCases := []struct {
+		name                     string
+		syncResults              *kubecontainer.PodSyncResult
+		expectedErr              string
+		expectedResizeConditions []*v1.PodCondition
+	}{
+		{
+			name: "pod resize error returned from the runtime",
+			syncResults: &kubecontainer.PodSyncResult{
+				SyncResults: []*kubecontainer.SyncResult{{
+					Action:  kubecontainer.ResizePodInPlace,
+					Target:  pod.UID,
+					Error:   kubecontainer.ErrResizePodInPlace,
+					Message: "could not resize pod",
+				}},
+			},
+			expectedErr: "failed to \"ResizePodInPlace\" for \"12345678\" with ResizePodInPlaceError: \"could not resize pod\"",
+			expectedResizeConditions: []*v1.PodCondition{{
+				Type:    v1.PodResizeInProgress,
+				Status:  v1.ConditionTrue,
+				Reason:  v1.PodReasonError,
+				Message: "could not resize pod",
+			}},
+		},
+		{
+			name: "pod resize error cleared upon successful run",
+			syncResults: &kubecontainer.PodSyncResult{
+				SyncResults: []*kubecontainer.SyncResult{{
+					Action: kubecontainer.ResizePodInPlace,
+					Target: pod.UID,
+				}},
+			},
+			expectedResizeConditions: []*v1.PodCondition{{
+				Type:   v1.PodResizeInProgress,
+				Status: v1.ConditionTrue,
+			}},
+		},
+		{
+			name: "sync results have a non-resize error",
+			syncResults: &kubecontainer.PodSyncResult{
+				SyncResults: []*kubecontainer.SyncResult{{
+					Action:  kubecontainer.CreatePodSandbox,
+					Target:  pod.UID,
+					Error:   kubecontainer.ErrCreatePodSandbox,
+					Message: "could not create pod sandbox",
+				}},
+			},
+			expectedErr:              "failed to \"CreatePodSandbox\" for \"12345678\" with CreatePodSandboxError: \"could not create pod sandbox\"",
+			expectedResizeConditions: nil,
+		},
+		{
+			name: "sync results have a non-resize error and a successful pod resize action",
+			syncResults: &kubecontainer.PodSyncResult{
+				SyncResults: []*kubecontainer.SyncResult{
+					{
+						Action:  kubecontainer.CreatePodSandbox,
+						Target:  pod.UID,
+						Error:   kubecontainer.ErrCreatePodSandbox,
+						Message: "could not create pod sandbox",
+					},
+					{
+						Action: kubecontainer.ResizePodInPlace,
+						Target: pod.UID,
+					},
+				},
+			},
+			expectedErr: "failed to \"CreatePodSandbox\" for \"12345678\" with CreatePodSandboxError: \"could not create pod sandbox\"",
+			expectedResizeConditions: []*v1.PodCondition{{
+				Type:   v1.PodResizeInProgress,
+				Status: v1.ConditionTrue,
+			}},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			testKubelet.fakeRuntime.SyncResults = tc.syncResults
+			kubelet.podManager.SetPods([]*v1.Pod{pod})
+			isTerminal, err := kubelet.SyncPod(context.Background(), kubetypes.SyncPodUpdate, pod, nil, &kubecontainer.PodStatus{})
+			require.False(t, isTerminal)
+			if tc.expectedErr == "" {
+				require.NoError(t, err)
+			} else {
+				require.Error(t, err)
+				require.Equal(t, tc.expectedErr, err.Error())
+			}
+			gotResizeConditions := kubelet.statusManager.GetPodResizeConditions(pod.UID)
+			for _, c := range gotResizeConditions {
+				// ignore last probe and transition times for comparison
+				c.LastProbeTime = metav1.Time{}
+				c.LastTransitionTime = metav1.Time{}
+			}
+			require.Equal(t, tc.expectedResizeConditions, gotResizeConditions)
+		})
+	}
 }

pkg/kubelet/kuberuntime/kuberuntime_manager.go

Lines changed: 20 additions & 18 deletions
@@ -693,7 +693,8 @@ func (m *kubeGenericRuntimeManager) computePodResizeAction(pod *v1.Pod, containe
 	return true
 }

-func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerChanges podActions, result *kubecontainer.PodSyncResult) {
+func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerChanges podActions) *kubecontainer.SyncResult {
+	resizeResult := kubecontainer.NewSyncResult(kubecontainer.ResizePodInPlace, format.Pod(pod))
 	pcm := m.containerManager.NewPodContainerManager()
 	//TODO(vinaykul,InPlacePodVerticalScaling): Figure out best way to get enforceMemoryQoS value (parameter #4 below) in platform-agnostic way
 	enforceCPULimits := m.cpuCFSQuota
@@ -704,20 +705,20 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerC
 	podResources := cm.ResourceConfigForPod(pod, enforceCPULimits, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false)
 	if podResources == nil {
 		klog.ErrorS(nil, "Unable to get resource configuration", "pod", klog.KObj(pod))
-		result.Fail(fmt.Errorf("unable to get resource configuration processing resize for pod %s", pod.Name))
-		return
+		resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("unable to get resource configuration processing resize for pod %s", pod.Name))
+		return resizeResult
 	}
 	currentPodMemoryConfig, err := pcm.GetPodCgroupConfig(pod, v1.ResourceMemory)
 	if err != nil {
 		klog.ErrorS(nil, "Unable to get pod cgroup memory config", "pod", klog.KObj(pod))
-		result.Fail(fmt.Errorf("unable to get pod cgroup memory config for pod %s", pod.Name))
-		return
+		resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("unable to get pod cgroup memory config for pod %s", pod.Name))
+		return resizeResult
 	}
 	currentPodCPUConfig, err := pcm.GetPodCgroupConfig(pod, v1.ResourceCPU)
 	if err != nil {
 		klog.ErrorS(nil, "Unable to get pod cgroup cpu config", "pod", klog.KObj(pod))
-		result.Fail(fmt.Errorf("unable to get pod cgroup cpu config for pod %s", pod.Name))
-		return
+		resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("unable to get pod cgroup cpu config for pod %s", pod.Name))
+		return resizeResult
 	}

 	currentPodResources := podResources
@@ -800,30 +801,30 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerC
 			currentPodMemoryUsage, err := pcm.GetPodCgroupMemoryUsage(pod)
 			if err != nil {
 				klog.ErrorS(err, "GetPodCgroupMemoryUsage failed", "pod", pod.Name)
-				result.Fail(err)
-				return
+				resizeResult.Fail(kubecontainer.ErrResizePodInPlace, err.Error())
+				return resizeResult
 			}
 			if currentPodMemoryUsage >= uint64(*podResources.Memory) {
 				klog.ErrorS(nil, "Aborting attempt to set pod memory limit less than current memory usage", "pod", pod.Name)
-				result.Fail(fmt.Errorf("aborting attempt to set pod memory limit less than current memory usage for pod %s", pod.Name))
-				return
+				resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("aborting attempt to set pod memory limit less than current memory usage for pod %s", pod.Name))
+				return resizeResult
 			}
 		} else {
 			// Default pod memory limit to the current memory limit if unset to prevent it from updating.
 			// TODO(#128675): This does not support removing limits.
 			podResources.Memory = currentPodMemoryConfig.Memory
 		}
 		if errResize := resizeContainers(v1.ResourceMemory, int64(*currentPodMemoryConfig.Memory), *podResources.Memory, 0, 0); errResize != nil {
-			result.Fail(errResize)
-			return
+			resizeResult.Fail(kubecontainer.ErrResizePodInPlace, errResize.Error())
+			return resizeResult
 		}
 	}
 	if len(podContainerChanges.ContainersToUpdate[v1.ResourceCPU]) > 0 || podContainerChanges.UpdatePodResources {
 		if podResources.CPUShares == nil {
 			// This shouldn't happen: ResourceConfigForPod always returns a non-nil value for CPUShares.
 			klog.ErrorS(nil, "podResources.CPUShares is nil", "pod", pod.Name)
-			result.Fail(fmt.Errorf("podResources.CPUShares is nil for pod %s", pod.Name))
-			return
+			resizeResult.Fail(kubecontainer.ErrResizePodInPlace, fmt.Sprintf("podResources.CPUShares is nil for pod %s", pod.Name))
+			return resizeResult
 		}

 		// Default pod CPUQuota to the current CPUQuota if no limit is set to prevent the pod limit
@@ -834,10 +835,11 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podContainerC
 		}
 		if errResize := resizeContainers(v1.ResourceCPU, *currentPodCPUConfig.CPUQuota, *podResources.CPUQuota,
 			int64(*currentPodCPUConfig.CPUShares), int64(*podResources.CPUShares)); errResize != nil {
-			result.Fail(errResize)
-			return
+			resizeResult.Fail(kubecontainer.ErrResizePodInPlace, errResize.Error())
+			return resizeResult
 		}
 	}
+	return resizeResult
 }

 func (m *kubeGenericRuntimeManager) updatePodContainerResources(pod *v1.Pod, resourceName v1.ResourceName, containersToUpdate []containerToUpdateInfo) error {
@@ -1401,7 +1403,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po
 	// Step 7: For containers in podContainerChanges.ContainersToUpdate[CPU,Memory] list, invoke UpdateContainerResources
 	if resizable, _ := IsInPlacePodVerticalScalingAllowed(pod); resizable {
 		if len(podContainerChanges.ContainersToUpdate) > 0 || podContainerChanges.UpdatePodResources {
-			m.doPodResizeAction(pod, podContainerChanges, &result)
+			result.SyncResults = append(result.SyncResults, m.doPodResizeAction(pod, podContainerChanges))
 		}
 	}
pkg/kubelet/kuberuntime/kuberuntime_manager_test.go

Lines changed: 35 additions & 8 deletions
@@ -67,7 +67,14 @@
 )

 func createTestRuntimeManager() (*apitest.FakeRuntimeService, *apitest.FakeImageService, *kubeGenericRuntimeManager, error) {
+	return createTestRuntimeManagerWithErrors(nil)
+}
+
+func createTestRuntimeManagerWithErrors(errors map[string][]error) (*apitest.FakeRuntimeService, *apitest.FakeImageService, *kubeGenericRuntimeManager, error) {
 	fakeRuntimeService := apitest.NewFakeRuntimeService()
+	if errors != nil {
+		fakeRuntimeService.Errors = errors
+	}
 	fakeImageService := apitest.NewFakeImageService()
 	// Only an empty machineInfo is needed here, because in unit test all containers are besteffort,
 	// data in machineInfo is not used. If burstable containers are used in unit test in the future,
@@ -3186,17 +3193,16 @@ func TestDoPodResizeAction(t *testing.T) {
 	}

 	featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.InPlacePodVerticalScaling, true)
-	_, _, m, err := createTestRuntimeManager()
-	require.NoError(t, err)
-	m.cpuCFSQuota = true // Enforce CPU Limits

 	for _, tc := range []struct {
 		testName                  string
 		currentResources          containerResources
 		desiredResources          containerResources
 		updatedResources          []v1.ResourceName
 		otherContainersHaveLimits bool
+		runtimeErrors             map[string][]error
 		expectedError             string
+		expectedErrorMessage      string
 		expectPodCgroupUpdates    int
 	}{
 		{
@@ -3213,6 +3219,23 @@ func TestDoPodResizeAction(t *testing.T) {
 			updatedResources:       []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory},
 			expectPodCgroupUpdates: 3, // cpu req, cpu lim, mem lim
 		},
+		{
+			testName: "Increase cpu and memory requests and limits, with computed pod limits and set a runtime error",
+			currentResources: containerResources{
+				cpuRequest: 100, cpuLimit: 100,
+				memoryRequest: 100, memoryLimit: 100,
+			},
+			desiredResources: containerResources{
+				cpuRequest: 200, cpuLimit: 200,
+				memoryRequest: 200, memoryLimit: 200,
+			},
+			otherContainersHaveLimits: true,
+			updatedResources:          []v1.ResourceName{v1.ResourceCPU, v1.ResourceMemory},
+			expectPodCgroupUpdates:    1,
+			runtimeErrors:             map[string][]error{"UpdateContainerResources": {fmt.Errorf("error updating container resources")}},
+			expectedError:             "ResizePodInPlaceError",
+			expectedErrorMessage:      "error updating container resources",
+		},
 		{
 			testName: "Increase cpu and memory requests and limits, without computed pod limits",
 			currentResources: containerResources{
@@ -3296,6 +3319,10 @@ func TestDoPodResizeAction(t *testing.T) {
 		},
 	} {
 		t.Run(tc.testName, func(t *testing.T) {
+			_, _, m, err := createTestRuntimeManagerWithErrors(tc.runtimeErrors)
+			require.NoError(t, err)
+			m.cpuCFSQuota = true // Enforce CPU Limits
+
 			mockCM := cmtesting.NewMockContainerManager(t)
 			mockCM.EXPECT().PodHasExclusiveCPUs(mock.Anything).Return(false).Maybe()
 			mockCM.EXPECT().ContainerHasExclusiveCPUs(mock.Anything, mock.Anything).Return(false).Maybe()
@@ -3358,17 +3385,17 @@ func TestDoPodResizeAction(t *testing.T) {
 				containersToUpdate[r] = []containerToUpdateInfo{updateInfo}
 			}

-			syncResult := &kubecontainer.PodSyncResult{}
 			actions := podActions{
 				ContainersToUpdate: containersToUpdate,
 			}
-			m.doPodResizeAction(pod, actions, syncResult)
+			resizeResult := m.doPodResizeAction(pod, actions)

 			if tc.expectedError != "" {
-				require.Error(t, syncResult.Error())
-				require.EqualError(t, syncResult.Error(), tc.expectedError)
+				require.Error(t, resizeResult.Error)
+				require.EqualError(t, resizeResult.Error, tc.expectedError)
+				require.Equal(t, tc.expectedErrorMessage, resizeResult.Message)
 			} else {
-				require.NoError(t, syncResult.Error())
+				require.NoError(t, resizeResult.Error)
 			}

 			mock.AssertExpectationsForObjects(t, mockPCM)
