Commit b03b0e9

Fix Node scaling issue
This PR removes a node's nodestatus when the node is removed from the cluster and its last status was Succeeded. It also addresses the failed nodestatus issue that occurred when adding a new node to the cluster.
1 parent 6683394 commit b03b0e9

26 files changed: +5701, -6 lines

go.mod

Lines changed: 1 addition & 1 deletion
@@ -9,6 +9,7 @@ require (
 	github.com/mitchellh/go-homedir v1.1.0
 	github.com/onsi/ginkgo v1.16.5
 	github.com/onsi/gomega v1.27.6
+	github.com/openshift/api v0.0.0-20230120195050-6ba31fa438f2
 	github.com/openshift/library-go v0.0.0-20230228181805-0899dfdba7d2
 	github.com/openshift/machine-config-operator v0.0.1-0.20200913004441-7eba765c69c9
 	github.com/pborman/uuid v1.2.1
@@ -70,7 +71,6 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/nbutton23/zxcvbn-go v0.0.0-20210217022336-fa2cb2858354 // indirect
 	github.com/nxadm/tail v1.4.8 // indirect
-	github.com/openshift/api v0.0.0-20230120195050-6ba31fa438f2 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/prometheus/common v0.41.0 // indirect
 	github.com/prometheus/procfs v0.9.0 // indirect

pkg/controller/fileintegrity/config_defaults.go

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ CONTENT_EX = sha512+ftype+p+u+g+n+acl+selinux+xattrs
 !/hostroot/etc/docker/certs.d
 !/hostroot/etc/selinux/targeted
 !/hostroot/etc/openvswitch/conf.db
+!/hostroot/etc/kubernetes/cni/net.d
 !/hostroot/etc/kubernetes/cni/net.d/*
 !/hostroot/etc/machine-config-daemon/currentconfig$
 !/hostroot/etc/pki/ca-trust/extracted/java/cacerts$
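
The new !/hostroot/etc/kubernetes/cni/net.d rule excludes the CNI config directory entry itself; the pre-existing .../net.d/* rule only covered its contents. On a freshly provisioned node this directory is populated after the initial AIDE database is taken, which appears to be the source of the failed nodestatus that this commit addresses.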

pkg/controller/status/status_controller.go

Lines changed: 21 additions & 2 deletions
@@ -2,9 +2,10 @@ package status
 
 import (
 	"context"
+	"time"
+
 	"github.com/openshift/file-integrity-operator/pkg/apis/fileintegrity/v1alpha1"
 	"github.com/openshift/file-integrity-operator/pkg/controller/metrics"
-	"time"
 
 	"github.com/go-logr/logr"
 
@@ -171,12 +172,30 @@ func (r *StatusReconciler) mapActiveStatus(integrity *v1alpha1.FileIntegrity) (v
 		return v1alpha1.PhaseError, err
 	}
 
+	nodeList := corev1.NodeList{}
+	if err := r.client.List(context.TODO(), &nodeList, &client.ListOptions{}); err != nil {
+		return v1alpha1.PhaseError, err
+	}
+	nodeNameList := make(map[string]bool)
+	for _, node := range nodeList.Items {
+		nodeNameList[node.Name] = true
+	}
+
 	for _, nodeStatus := range nodeStatusList.Items {
+		// Check if the node is still there, and remove the node status if it's not.
+		// This is to handle the case where the node is deleted, but the node status is not.
+		if _, ok := nodeNameList[nodeStatus.NodeName]; !ok {
+			// If the node is not there, and the node status is success, we can just delete it.
+			if nodeStatus.LastResult.Condition == v1alpha1.NodeConditionSucceeded {
+				if err := r.client.Delete(context.TODO(), &nodeStatus); err != nil {
+					return v1alpha1.PhaseError, err
+				}
+			}
+		}
 		if nodeStatus.LastResult.Condition == v1alpha1.NodeConditionErrored {
 			return v1alpha1.PhaseError, nil
 		}
 	}
-
 	return v1alpha1.PhaseActive, nil
 }
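
In plain terms, the reconciler now builds the set of node names that still exist and prunes any FileIntegrityNodeStatus whose node is gone, but only if its last result was Succeeded. A minimal sketch of that pruning decision in isolation (illustrative only, not part of the commit; staleSucceededStatuses is a hypothetical helper name, and the v1alpha1 types are the ones used in the diff above):

package status

import (
	"github.com/openshift/file-integrity-operator/pkg/apis/fileintegrity/v1alpha1"
)

// staleSucceededStatuses returns the node statuses that are safe to delete:
// their node is no longer in the cluster and their last scan result succeeded.
func staleSucceededStatuses(liveNodes map[string]bool, statuses []v1alpha1.FileIntegrityNodeStatus) []v1alpha1.FileIntegrityNodeStatus {
	var stale []v1alpha1.FileIntegrityNodeStatus
	for _, ns := range statuses {
		if liveNodes[ns.NodeName] {
			continue // the node still exists; keep its status
		}
		if ns.LastResult.Condition == v1alpha1.NodeConditionSucceeded {
			stale = append(stale, ns) // node gone, last result Succeeded: prune
		}
	}
	return stale
}

Note that only Succeeded statuses are pruned: a status that last Errored on a now-removed node is kept, so the failure stays visible instead of being silently discarded.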

tests/e2e/e2e_test.go

Lines changed: 40 additions & 1 deletion
@@ -9,7 +9,6 @@ import (
 
 	"github.com/openshift/file-integrity-operator/pkg/apis/fileintegrity/v1alpha1"
 	fileintegrity2 "github.com/openshift/file-integrity-operator/pkg/controller/fileintegrity"
-
 	framework "github.com/openshift/file-integrity-operator/tests/framework"
 
 	"k8s.io/apimachinery/pkg/types"
@@ -918,3 +917,43 @@ func TestFileIntegrityAcceptsExpectedChange(t *testing.T) {
 	t.Log("Asserting that the FileIntegrity check is in a SUCCESS state after expected changes")
 	assertNodesConditionIsSuccess(t, f, testName, namespace, 5*time.Second, 10*time.Minute)
 }
+
+// This test adds a new node to the cluster and removes an existing one, making sure that
+// all the nodestatuses are in a success state and that the nodestatus for the removed node is cleaned up.
+func TestFileIntegrityNodeScaling(t *testing.T) {
+	f, testctx, namespace := setupTest(t)
+	testName := testIntegrityNamePrefix + "-nodescale"
+	setupFileIntegrity(t, f, testctx, testName, namespace)
+	defer testctx.Cleanup()
+	defer func() {
+		if err := cleanNodes(f, namespace); err != nil {
+			t.Fatal(err)
+		}
+		if err := resetBundleTestMetrics(f, namespace); err != nil {
+			t.Fatal(err)
+		}
+	}()
+	defer logContainerOutput(t, f, namespace, testName)
+	// wait to go active.
+	err := waitForScanStatus(t, f, namespace, testName, v1alpha1.PhaseActive)
+	if err != nil {
+		t.Errorf("Timeout waiting for scan status")
+	}
+
+	t.Log("Asserting that the FileIntegrity check is in a SUCCESS state after deploying it")
+	assertNodesConditionIsSuccess(t, f, testName, namespace, 2*time.Second, 5*time.Minute)
+
+	t.Log("Adding a new worker node to the cluster through the machineset")
+	scaledUpMachineSetName, newNodeName := scaleUpWorkerMachineSet(t, f, 2*time.Second, 10*time.Minute)
+	if newNodeName == "" || scaledUpMachineSetName == "" {
+		t.Fatal("Failed to scale up worker machineset")
+	}
+	assertSingleNodeConditionIsSuccess(t, f, testName, namespace, newNodeName, 2*time.Second, 5*time.Minute)
+
+	t.Log("Scale down the worker machineset")
+	removedNodeName := scaleDownWorkerMachineSet(t, f, scaledUpMachineSetName, 2*time.Second, 10*time.Minute)
+	if removedNodeName == "" {
+		t.Fatal("Failed to scale down worker machineset")
+	}
+	assertNodeStatusForRemovedNode(t, f, testName, namespace, removedNodeName, 2*time.Second, 5*time.Minute)
+}
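
The machineset helpers and assertNodeStatusForRemovedNode used here are added in tests/e2e/helpers.go below. Because the test scales a worker MachineSet in the openshift-machine-api namespace, it presumably only runs on clusters whose workers are managed by the Machine API (for example, cloud IPI installs), which is also why the Machine API scheme is registered in the test framework (see tests/framework/framework.go below).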

tests/e2e/helpers.go

Lines changed: 162 additions & 0 deletions
@@ -3,6 +3,7 @@ package e2e
 import (
 	"bufio"
 	"bytes"
+	"context"
 	goctx "context"
 	"encoding/json"
 	"fmt"
@@ -16,6 +17,7 @@ import (
 	"testing"
 	"time"
 
+	machinev1 "github.com/openshift/api/machine/v1beta1"
 	"github.com/openshift/file-integrity-operator/pkg/apis/fileintegrity/v1alpha1"
 	"github.com/openshift/file-integrity-operator/pkg/controller/metrics"
 	"github.com/pborman/uuid"
@@ -66,6 +68,7 @@ const (
 	metricsTestCRBName = "fio-metrics-client"
 	metricsTestSAName = "default"
 	metricsTestTokenName = "metrics-token"
+	machineSetNamespace = "openshift-machine-api"
 	compressionFileCmd = "for i in `seq 1 10000`; do mktemp \"/hostroot/etc/addedbytest$i.XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\"; done || true"
 )
 
@@ -115,6 +118,7 @@ CONTENT_EX = sha512+ftype+p+u+g+n+acl+selinux+xattrs
 !/hostroot/etc/docker/certs.d
 !/hostroot/etc/selinux/targeted
 !/hostroot/etc/openvswitch/conf.db
+!/hostroot/etc/kubernetes/cni/net.d
 !/hostroot/etc/kubernetes/cni/net.d/*
 !/hostroot/etc/machine-config-daemon/currentconfig$
 !/hostroot/etc/pki/ca-trust/extracted/java/cacerts$
@@ -1005,6 +1009,164 @@ func assertNodeOKStatusEvents(t *testing.T, f *framework.Framework, namespace st
 	}
 }
 
+func scaleUpWorkerMachineSet(t *testing.T, f *framework.Framework, interval, timeout time.Duration) (string, string) {
+	// Add a new worker node to the cluster through the machineset
+	// Get the machineset
+	machineSets := &machinev1.MachineSetList{}
+	err := f.Client.List(context.TODO(), machineSets, &client.ListOptions{
+		Namespace: machineSetNamespace})
+	if err != nil {
+		t.Error(err)
+	}
+	if len(machineSets.Items) == 0 {
+		t.Error("No machinesets found")
+	}
+	machineSetName := ""
+	for _, ms := range machineSets.Items {
+		if ms.Spec.Replicas != nil && *ms.Spec.Replicas > 0 {
+			t.Logf("Found machineset %s with %d replicas", ms.Name, *ms.Spec.Replicas)
+			machineSetName = ms.Name
+			break
+		}
+	}
+
+	// Add one more replica to one of the machinesets
+	machineSet := &machinev1.MachineSet{}
+	err = f.Client.Get(context.TODO(), types.NamespacedName{Name: machineSetName, Namespace: machineSetNamespace}, machineSet)
+	if err != nil {
+		t.Error(err)
+	}
+	t.Logf("Scaling up machineset %s", machineSetName)
+
+	replicas := *machineSet.Spec.Replicas + 1
+	machineSet.Spec.Replicas = &replicas
+	err = f.Client.Update(context.TODO(), machineSet)
+	if err != nil {
+		t.Error(err)
+	}
+	t.Logf("Waiting for scaling up machineset %s", machineSetName)
+	provisionningMachineName := ""
+	err = wait.Poll(interval, timeout, func() (bool, error) {
+		err = f.Client.Get(context.TODO(), types.NamespacedName{Name: machineSetName, Namespace: machineSetNamespace}, machineSet)
+		if err != nil {
+			t.Error(err)
+		}
+		// get name of the new machine
+		if provisionningMachineName == "" {
+			machines := &machinev1.MachineList{}
+			err = f.Client.List(context.TODO(), machines, &client.ListOptions{
+				Namespace: machineSetNamespace})
+			if err != nil {
+				t.Error(err)
+			}
+			for _, machine := range machines.Items {
+				if *machine.Status.Phase == "Provisioning" {
+					provisionningMachineName = machine.Name
+					break
+				}
+			}
+		}
+		if machineSet.Status.Replicas == machineSet.Status.ReadyReplicas {
+			t.Logf("Machineset %s scaled up", machineSetName)
+			return true, nil
+		}
+		t.Logf("Waiting for machineset %s to scale up, current ready replicas: %d of %d", machineSetName, machineSet.Status.ReadyReplicas, machineSet.Status.Replicas)
+		return false, nil
+	})
+	if err != nil {
+		t.Error(err)
+	}
+	// get the new node name
+	newNodeName := ""
+	machine := &machinev1.Machine{}
+	err = f.Client.Get(context.TODO(), types.NamespacedName{Name: provisionningMachineName, Namespace: machineSetNamespace}, machine)
+	if err != nil {
+		t.Error(err)
+	}
+	newNodeName = machine.Status.NodeRef.Name
+	t.Logf("New node name is %s", newNodeName)

+	return machineSetName, newNodeName
+}
+
+func scaleDownWorkerMachineSet(t *testing.T, f *framework.Framework, machineSetName string, interval, timeout time.Duration) string {
+	// Remove the worker node from the cluster through the machineset
+	// Get the machineset
+	machineSet := &machinev1.MachineSet{}
+	err := f.Client.Get(context.TODO(), types.NamespacedName{Name: machineSetName, Namespace: machineSetNamespace}, machineSet)
+	if err != nil {
+		t.Error(err)
+	}
+
+	// Remove one replica from the machineset
+	t.Logf("Scaling down machineset %s", machineSetName)
+	replicas := *machineSet.Spec.Replicas - 1
+	machineSet.Spec.Replicas = &replicas
+	err = f.Client.Update(context.TODO(), machineSet)
+	if err != nil {
+		t.Error(err)
+	}
+	deletedNodeName := ""
+	t.Logf("Waiting for scaling down machineset %s", machineSetName)
+	err = wait.Poll(interval, timeout, func() (bool, error) {
+		err = f.Client.Get(context.TODO(), types.NamespacedName{Name: machineSetName, Namespace: machineSetNamespace}, machineSet)
+		if err != nil {
+			t.Error(err)
+		}
+		if machineSet.Status.Replicas == machineSet.Status.ReadyReplicas {
+			t.Logf("Machineset %s scaled down", machineSetName)
+			return true, nil
+		}
+		t.Logf("Waiting for machineset %s to scale down, current ready replicas: %d of %d", machineSet.Name, machineSet.Status.ReadyReplicas, machineSet.Status.Replicas)
+		return false, nil
+	})
+	if err != nil {
+		t.Error(err)
+	}
+	if deletedNodeName == "" {
+		// Get the node that was deleted
+		machineList := &machinev1.MachineList{}
+		err = f.Client.List(context.TODO(), machineList, &client.ListOptions{
+			Namespace: machineSetNamespace})
+		if err != nil {
+			t.Error(err)
+		}
+		if len(machineList.Items) == 0 {
+			t.Error("No machines found")
+		}
+		for _, machine := range machineList.Items {
+			if machine.DeletionTimestamp != nil {
+				deletedNodeName = machine.Status.NodeRef.Name
+				t.Logf("Found deleted node %s", deletedNodeName)
+				return deletedNodeName
+			}
+		}
+	}
+	return deletedNodeName
+}
+
+func assertNodeStatusForRemovedNode(t *testing.T, f *framework.Framework, integrityName, namespace, deletedNodeName string, interval, timeout time.Duration) {
+	timeoutErr := wait.PollImmediate(interval, timeout, func() (bool, error) {
+		nodestatus := &v1alpha1.FileIntegrityNodeStatus{}
+		err := f.Client.Get(goctx.TODO(), types.NamespacedName{Name: integrityName + "-" + deletedNodeName, Namespace: namespace}, nodestatus)
+		if err != nil {
+			if kerr.IsNotFound(err) {
+				t.Logf("Node status for node %s not found, as expected", deletedNodeName)
+				return true, nil
+			} else {
+				t.Errorf("error getting node status for node %s: %v", deletedNodeName, err)
+				return true, err
+			}
+		} else {
+			t.Logf("Node status for node %s found, waiting for it to be deleted", deletedNodeName)
+			return false, nil
+		}
+	})
+	if timeoutErr != nil {
+		t.Errorf("timed out waiting for node status for node %s to be deleted", deletedNodeName)
+	}
+}
+
 func assertNodesConditionIsSuccess(t *testing.T, f *framework.Framework, integrityName, namespace string, interval, timeout time.Duration) {
 	var lastErr error
 	type nodeStatus struct {

tests/framework/framework.go

Lines changed: 5 additions & 2 deletions
@@ -15,15 +15,15 @@ import (
 	"time"
 
 	// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
-	_ "k8s.io/client-go/plugin/pkg/client/auth"
-
+	machinev1 "github.com/openshift/api/machine/v1beta1"
 	log "github.com/sirupsen/logrus"
 	extscheme "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset/scheme"
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
 	cached "k8s.io/client-go/discovery/cached"
 	"k8s.io/client-go/kubernetes"
 	cgoscheme "k8s.io/client-go/kubernetes/scheme"
+	_ "k8s.io/client-go/plugin/pkg/client/auth"
 	"k8s.io/client-go/rest"
 	"k8s.io/client-go/restmapper"
 	"k8s.io/client-go/tools/clientcmd"
@@ -139,6 +139,9 @@ func newFramework(opts *frameworkOpts) (*Framework, error) {
 	if err := extscheme.AddToScheme(scheme); err != nil {
 		return nil, fmt.Errorf("failed to add api extensions scheme to runtime scheme: %w", err)
 	}
+	if err := machinev1.AddToScheme(scheme); err != nil {
+		return nil, fmt.Errorf("failed to add machine api scheme to runtime scheme: %w", err)
+	}
 
 	cachedDiscoveryClient := cached.NewMemCacheClient(kubeclient.Discovery())
 	restMapper := restmapper.NewDeferredDiscoveryRESTMapper(cachedDiscoveryClient)
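
Registering the Machine API types with the framework's runtime scheme is what lets the new helpers List and Get MachineSet and Machine objects through the framework client; without it those calls fail because the scheme cannot map the Go types to a GroupVersionKind. A minimal standalone sketch of the same idea, assuming a plain controller-runtime client rather than the framework's own constructor (illustrative only):

package main

import (
	"context"
	"fmt"

	machinev1 "github.com/openshift/api/machine/v1beta1"
	"k8s.io/apimachinery/pkg/runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/config"
)

func main() {
	// Build a scheme that knows about the Machine API types.
	scheme := runtime.NewScheme()
	if err := machinev1.AddToScheme(scheme); err != nil {
		panic(err)
	}

	// Create a client against the current kubeconfig using that scheme.
	cfg, err := config.GetConfig()
	if err != nil {
		panic(err)
	}
	c, err := client.New(cfg, client.Options{Scheme: scheme})
	if err != nil {
		panic(err)
	}

	// Listing MachineSets now works because the scheme can map the list type.
	machineSets := &machinev1.MachineSetList{}
	if err := c.List(context.TODO(), machineSets, client.InNamespace("openshift-machine-api")); err != nil {
		panic(err)
	}
	fmt.Printf("found %d machinesets\n", len(machineSets.Items))
}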
