Skip to content

[release-4.17] OCPBUGS-56976: Boot Image Controller should not degrade when golden configmap is slow to update #5101

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ import (
kubeErrs "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/jsonmergepatch"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
coreinformersv1 "k8s.io/client-go/informers/core/v1"
clientset "k8s.io/client-go/kubernetes"
corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
Expand Down Expand Up @@ -444,44 +443,31 @@ func (ctrl *Controller) syncMAPIMachineSet(machineSet *machinev1beta1.MachineSet
return fmt.Errorf("failed to fetch infra object during machineset sync: %w", err)
}

// Wait until the MCO hash version stored in the configmap matches the current MCO
// version. This is done by the operator when a master node successfully updates to a new image. This is
// Fetch the bootimage configmap & ensure it has been stamped by the operator. This is done by
// the operator when a master node successfully updates to a new image. This is
// to prevent machinesets from being updated before the operator itself has updated.
// Could return an error (and cause a degrade) immediately here, but that seems excessive. Waiting with a timeout
// is a bit more graceful.
var configMap *corev1.ConfigMap
var pollError error
klog.Infof("Waiting until coreos-bootimages config map has been stamped by the current version hash (%s) of the operator", operatorversion.Hash)
if err = wait.PollUntilContextTimeout(context.TODO(), 1*time.Minute, 15*time.Minute, true, func(_ context.Context) (bool, error) {
// Fetch the bootimage configmap
configMap, err = ctrl.mcoCmLister.ConfigMaps(ctrlcommon.MCONamespace).Get(ctrlcommon.BootImagesConfigMapName)
if configMap == nil || err != nil {
pollError = fmt.Errorf("failed to fetch coreos-bootimages config map during machineset sync: %w", err)
return false, nil
}
versionHashFromCM, versionHashFound := configMap.Data[ctrlcommon.MCOVersionHashKey]
if !versionHashFound {
pollError = fmt.Errorf("failed to find mco version hash in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName)
return false, nil
}
if versionHashFromCM != operatorversion.Hash {
pollError = fmt.Errorf("mismatch between MCO hash version stored in configmap and current MCO version; sync will exit to wait for the MCO upgrade to complete")
return false, nil
}
releaseVersionFromCM, releaseVersionFound := configMap.Data[ctrlcommon.MCOReleaseImageVersionKey]
if !releaseVersionFound {
pollError = fmt.Errorf("failed to find mco release version in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName)
return false, nil
}
if releaseVersionFromCM != operatorversion.ReleaseVersion {
pollError = fmt.Errorf("mismatch between MCO release version stored in configmap and current MCO release version; sync will exit to wait for the MCO upgrade to complete")
return false, nil
}
return true, nil

}); err != nil {
klog.Errorf("Timed out waiting for coreos-bootimages config map: %v", pollError)
return fmt.Errorf("timed out waiting for coreos-bootimages config map: %v", pollError)
// If it hasn't been updated, exit and wait for a resync.
configMap, err := ctrl.mcoCmLister.ConfigMaps(ctrlcommon.MCONamespace).Get(ctrlcommon.BootImagesConfigMapName)
if err != nil {
return fmt.Errorf("failed to fetch coreos-bootimages config map during machineset sync: %w", err)
}
versionHashFromCM, versionHashFound := configMap.Data[ctrlcommon.MCOVersionHashKey]
if !versionHashFound {
klog.Infof("failed to find mco version hash in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName)
return nil
}
if versionHashFromCM != operatorversion.Hash {
klog.Infof("mismatch between MCO hash version stored in configmap and current MCO version; sync will exit to wait for the MCO upgrade to complete")
return nil
}
releaseVersionFromCM, releaseVersionFound := configMap.Data[ctrlcommon.MCOReleaseImageVersionKey]
if !releaseVersionFound {
klog.Infof("failed to find OCP release version in %s configmap, sync will exit to wait for the MCO upgrade to complete", ctrlcommon.BootImagesConfigMapName)
return nil
}
if releaseVersionFromCM != operatorversion.ReleaseVersion {
klog.Infof("mismatch between OCP release version stored in configmap and current MCO release version; sync will exit to wait for the MCO upgrade to complete")
return nil
}

// TODO: Also check against the release version stored in the configmap under releaseVersion. This is currently broken as the version
Expand Down