Skip to content

Commit bc35368

Browse files
committed
Initial tests for Two Nodes OCP with Fencing (TNF) cluster
Add initial topology tests:
* Ensure correct number of ControlPlanes, Workers, and Arbiters
* Ensure correct number of static etcd pod containers
* Ensure correct number of podman etcd containers

Add initial behavior tests:
* Ensure the cluster can handle a graceful node shutdown

Closes: OCPEDGE-1481, OCPEDGE-1482
1 parent 6e15b2c commit bc35368

File tree

4 files changed

+329
-0
lines changed

4 files changed

+329
-0
lines changed

test/extended/include.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ import (
5757
_ "github.com/openshift/origin/test/extended/storage"
5858
_ "github.com/openshift/origin/test/extended/tbr_health"
5959
_ "github.com/openshift/origin/test/extended/templates"
60+
_ "github.com/openshift/origin/test/extended/tnf"
6061
_ "github.com/openshift/origin/test/extended/user"
6162
_ "github.com/openshift/origin/test/extended/windows"
6263
)

test/extended/tnf/common.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package tnf
2+
3+
import (
4+
"context"
5+
6+
o "github.com/onsi/gomega"
7+
v1 "github.com/openshift/api/config/v1"
8+
exutil "github.com/openshift/origin/test/extended/util"
9+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
10+
)
11+
12+
func getInfraStatus(oc *exutil.CLI) v1.InfrastructureStatus {
13+
infra, err := oc.AdminConfigClient().ConfigV1().Infrastructures().Get(context.Background(),
14+
"cluster", metav1.GetOptions{})
15+
o.Expect(err).NotTo(o.HaveOccurred())
16+
return infra.Status
17+
}
18+
19+
func runOnNodeNS(oc *exutil.CLI, nodeName, namespace, command string) (string, string, error) {
20+
return oc.AsAdmin().Run("debug").Args("-n", namespace, "node/"+nodeName, "--", "chroot", "/host", "/bin/bash", "-c", command).Outputs()
21+
}

test/extended/tnf/recovery.go

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
package tnf
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"math/rand"
7+
"slices"
8+
"time"
9+
10+
g "github.com/onsi/ginkgo/v2"
11+
o "github.com/onsi/gomega"
12+
v1 "github.com/openshift/api/config/v1"
13+
"github.com/openshift/origin/test/extended/etcd/helpers"
14+
exutil "github.com/openshift/origin/test/extended/util"
15+
"go.etcd.io/etcd/api/v3/etcdserverpb"
16+
corev1 "k8s.io/api/core/v1"
17+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
18+
)
19+
20+
// Verifies that a Two Nodes OCP with Fencing (TNF) cluster survives a graceful
// shutdown of one control-plane node: the shut-down node leaves the etcd
// member list, is re-added as a learner, rejoins after reboot, and is finally
// promoted back to a voting member — all while the surviving node stays a
// healthy voting member.
var _ = g.Describe("[sig-node][apigroup:config.openshift.io] Two Nodes OCP with fencing recovery", func() {
	defer g.GinkgoRecover()

	var (
		// Admin CLI without a test namespace; all operations are cluster-scoped.
		oc                = exutil.NewCLIWithoutNamespace("").AsAdmin()
		etcdClientFactory *helpers.EtcdClientFactoryImpl
		// nodeA survives; nodeB is gracefully shut down. Chosen randomly in BeforeEach.
		nodeA, nodeB corev1.Node
	)

	g.BeforeEach(func() {
		// Only run on clusters whose control plane uses the two-node (dual-replica) topology.
		infraStatus := getInfraStatus(oc)
		if infraStatus.ControlPlaneTopology != v1.DualReplicaTopologyMode {
			g.Skip("Cluster is not in DualReplicaTopologyMode skipping test")
		}

		nodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve nodes without error")
		o.Expect(len(nodes.Items)).To(o.BeNumerically("==", 2), "Expected to find 2 Nodes only")

		// Select the first index randomly
		randomIndex := rand.Intn(len(nodes.Items))
		nodeA = nodes.Items[randomIndex]
		// Select the remaining index
		nodeB = nodes.Items[(randomIndex+1)%len(nodes.Items)]
		g.GinkgoT().Printf("Randomly selected %s to be gracefully shut down and %s to take the lead\n", nodeB.Name, nodeA.Name)

		kubeClient := oc.KubeClient()
		etcdClientFactory = helpers.NewEtcdClientFactory(kubeClient)

		// Both etcd members must be healthy before inducing the shutdown;
		// otherwise the recovery sequence asserted below would be meaningless.
		o.Eventually(func() error {
			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeA.Name)
		}, time.Minute, 15*time.Second).ShouldNot(o.HaveOccurred(), "expect to ensure Node A healthy without error")

		o.Eventually(func() error {
			return helpers.EnsureHealthyMember(g.GinkgoT(), etcdClientFactory, nodeB.Name)
		}, time.Minute, 15*time.Second).ShouldNot(o.HaveOccurred(), "expect to ensure Node B healthy without error")
	})

	g.It("Should support a graceful node shutdown", func() {
		msg := fmt.Sprintf("Shutting down %s gracefully in 1 minute", nodeB.Name)
		g.By(msg)
		// NOTE: Using `shutdown` alone would cause the node to be permanently removed from the cluster.
		// To prevent this, we use the `--reboot` flag, which ensures a graceful shutdown and allows the
		// node to rejoin the cluster upon restart. A one-minute delay is added to give the debug node
		// sufficient time to cleanly exit before the shutdown process completes.
		_, _, err := runOnNodeNS(oc, nodeB.Name, "openshift-etcd", "shutdown --reboot +1")
		o.Expect(err).To(o.BeNil(), "Expected to gracefully shutdown the node without errors")
		// Wait out the one-minute delay given to `shutdown +1` above before polling.
		time.Sleep(time.Minute)

		// Phase 1: the shut-down node must be removed from the etcd member list.
		msg = fmt.Sprintf("Ensuring %s leaves the member list", nodeB.Name)
		g.By(msg)
		o.Eventually(func() error {
			return helpers.EnsureMemberRemoved(g.GinkgoT(), etcdClientFactory, nodeB.Name)
		}, 5*time.Minute, 30*time.Second).ShouldNot(o.HaveOccurred())

		// Phase 2: the survivor stays a voting member and re-adds the
		// shut-down node as an (unstarted) learner.
		msg = fmt.Sprintf("Ensuring that %s is a healthy voting member and adds %s back as learner", nodeA.Name, nodeB.Name)
		g.By(msg)
		o.Eventually(func() error {
			members, err := getMembers(etcdClientFactory)
			if err != nil {
				return err
			}
			if len(members) != 2 {
				return fmt.Errorf("Not enough members")
			}

			// The surviving node must be started and voting (not a learner).
			if started, learner, err := getMemberState(&nodeA, members); err != nil {
				return err
			} else if !started || learner {
				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
			}

			// Ensure GNS node is unstarted and a learner member (i.e. !learner)
			if started, learner, err := getMemberState(&nodeB, members); err != nil {
				return err
			} else if started || !learner {
				return fmt.Errorf("Expected node: %s to be a unstarted and learning member. Membership: %+v", nodeB.Name, members)
			}

			g.GinkgoT().Logf("membership: %+v", members)
			return nil
		}, 2*time.Minute, 15*time.Second).Should(o.BeNil())

		// Phase 3: after the reboot the node should have joined (started)
		// while still being a learner.
		msg = fmt.Sprintf("Ensuring %s rejoins as learner", nodeB.Name)
		g.By(msg)
		o.Eventually(func() error {
			members, err := getMembers(etcdClientFactory)
			if err != nil {
				return err
			}
			if len(members) != 2 {
				return fmt.Errorf("Not enough members")
			}

			if started, learner, err := getMemberState(&nodeA, members); err != nil {
				return err
			} else if !started || learner {
				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
			}

			if started, learner, err := getMemberState(&nodeB, members); err != nil {
				return err
			} else if !started || !learner {
				return fmt.Errorf("Expected node: %s to be a started and learner member. Membership: %+v", nodeB.Name, members)
			}

			g.GinkgoT().Logf("membership: %+v", members)
			return nil
		}, 10*time.Minute, 15*time.Second).Should(o.BeNil())

		// Phase 4: both members must end up as started voting members —
		// recovery is complete.
		msg = fmt.Sprintf("Ensuring %s node is promoted back as voting member", nodeB.Name)
		g.By(msg)
		o.Eventually(func() error {
			members, err := getMembers(etcdClientFactory)
			if err != nil {
				return err
			}
			if len(members) != 2 {
				return fmt.Errorf("Not enough members")
			}

			if started, learner, err := getMemberState(&nodeA, members); err != nil {
				return err
			} else if !started || learner {
				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeA.Name, members)
			}

			if started, learner, err := getMemberState(&nodeB, members); err != nil {
				return err
			} else if !started || learner {
				return fmt.Errorf("Expected node: %s to be a started and voting member. Membership: %+v", nodeB.Name, members)
			}

			g.GinkgoT().Logf("membership: %+v", members)
			return nil
		}, 10*time.Minute, 15*time.Second).Should(o.Succeed())
	})
})
158+
159+
func getMembers(etcdClientFactory helpers.EtcdClientCreator) ([]*etcdserverpb.Member, error) {
160+
etcdClient, closeFn, err := etcdClientFactory.NewEtcdClient()
161+
defer closeFn()
162+
if err != nil {
163+
return nil, err
164+
}
165+
ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second)
166+
defer cancel()
167+
m, err := etcdClient.MemberList(ctx)
168+
if err != nil {
169+
return nil, fmt.Errorf("failed to get the member list, err: %w", err)
170+
}
171+
return m.Members, nil
172+
}
173+
174+
func getMemberState(node *corev1.Node, members []*etcdserverpb.Member) (started, learner bool, err error) {
175+
// Etcd members that have been added to the member list but haven't
176+
// joined yet will have an empty Name field. We can match them via Peer URL.
177+
peerURL := fmt.Sprintf("https://%s:2380", node.Status.Addresses[0].Address)
178+
var found bool
179+
for _, m := range members {
180+
if m.Name == node.Name {
181+
found = true
182+
started = true
183+
learner = m.IsLearner
184+
break
185+
}
186+
if slices.Contains(m.PeerURLs, peerURL) {
187+
found = true
188+
learner = m.IsLearner
189+
break
190+
}
191+
}
192+
if !found {
193+
return false, false, fmt.Errorf("could not find node %v", node.Name)
194+
}
195+
return started, learner, nil
196+
}

test/extended/tnf/topology.go

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
package tnf
2+
3+
import (
4+
"context"
5+
"fmt"
6+
7+
g "github.com/onsi/ginkgo/v2"
8+
o "github.com/onsi/gomega"
9+
v1 "github.com/openshift/api/config/v1"
10+
exutil "github.com/openshift/origin/test/extended/util"
11+
corev1 "k8s.io/api/core/v1"
12+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
13+
)
14+
15+
// NOTE(review): useHelpers appears unused within this file — confirm it is
// referenced elsewhere before relying on it.
const useHelpers = true

// Node role labels used to select nodes by their topology role.
const (
	labelNodeRoleMaster       = "node-role.kubernetes.io/master"
	labelNodeRoleControlPlane = "node-role.kubernetes.io/control-plane"
	labelNodeRoleWorker       = "node-role.kubernetes.io/worker"
	labelNodeRoleArbiter      = "node-role.kubernetes.io/arbiter"
)

// Shell command run on a node (via oc debug, see runOnNodeNS) that succeeds
// only when a podman container named "etcd" is running on the host.
const ensurePodmanEtcdContainerIsRunning = "podman inspect --format '{{.State.Running}}' etcd | grep true"
25+
26+
// Validates the static topology of a Two Nodes OCP with Fencing (TNF)
// cluster: node role counts, the container layout of the etcd static pods in
// openshift-etcd, and the presence of a podman etcd container on each node.
var _ = g.Describe("[sig-node][apigroup:config.openshift.io] Two Nodes OCP with fencing topology", func() {
	defer g.GinkgoRecover()
	var (
		oc = exutil.NewCLIWithoutNamespace("")
		// All cluster nodes, listed fresh in BeforeEach for each spec.
		nodes *corev1.NodeList
	)

	g.BeforeEach(func() {
		// Only run on clusters whose control plane uses the two-node (dual-replica) topology.
		infraStatus := getInfraStatus(oc)
		if infraStatus.ControlPlaneTopology != v1.DualReplicaTopologyMode {
			g.Skip("Cluster is not in DualReplicaTopologyMode skipping test")
		}
		var err error
		nodes, err = oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{})
		o.Expect(err).To(o.BeNil(), "Expected to retrieve all nodes without error")
	})

	g.It("Should validate the number of control-planes, workers and arbiters as configured", func() {
		const (
			expectedControlPlanes = 2
			expectedWorkers       = 2 // CPs will also have the Workers label
			expectedArbiters      = 0
		)

		controlPlaneNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
			LabelSelector: labelNodeRoleControlPlane,
		})
		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve control-plane nodes without error")
		o.Expect(len(controlPlaneNodes.Items)).To(o.Equal(expectedControlPlanes), fmt.Sprintf("Expected %d Control-plane Nodes, found %d", expectedControlPlanes, len(controlPlaneNodes.Items)))

		workerNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
			LabelSelector: labelNodeRoleWorker,
		})
		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve worker nodes without error")
		o.Expect(len(workerNodes.Items)).To(o.Equal(expectedWorkers), fmt.Sprintf("Expected %d Worker Nodes, found %d", expectedWorkers, len(workerNodes.Items)))

		arbiterNodes, err := oc.AdminKubeClient().CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
			LabelSelector: labelNodeRoleArbiter,
		})
		o.Expect(err).ShouldNot(o.HaveOccurred(), "Expected to retrieve arbiter nodes without error")
		o.Expect(len(arbiterNodes.Items)).To(o.Equal(expectedArbiters), fmt.Sprintf("Expected %d Arbiter Nodes, found %d", expectedArbiters, len(arbiterNodes.Items)))
	})

	// The static etcd pods are expected to carry only the etcdctl container
	// (0 "etcd" containers), matching the podman-based etcd checked below.
	g.It("Should validate the number of etcd pods and containers as configured", func() {
		const (
			expectedEtcdPod           = 2
			expectedEtcdCtlContainers = 2
			expectedEtcdContainers    = 0
		)

		// NOTE(review): indexes nodes.Items[0] and [1] without a length
		// assertion — confirm the DualReplica skip above guarantees exactly
		// two nodes are listed here.
		nodeNameA := nodes.Items[0].Name
		nodeNameB := nodes.Items[1].Name

		g.By("Ensuring 0 etcd pod containers and 2 etcdctl pod containers are running in the cluster ")
		pods, err := oc.AdminKubeClient().CoreV1().Pods("openshift-etcd").List(context.Background(), metav1.ListOptions{})
		o.Expect(err).To(o.BeNil(), "Expected to retrieve etcd pods in openshift-etcd namespace without error")

		etcdPodCount := 0
		etcdContainerCount := 0
		etcdctlContainerCount := 0
		for _, pod := range pods.Items {
			// Count only the per-node static etcd pods, identified by name.
			if pod.Name == "etcd-"+nodeNameA || pod.Name == "etcd-"+nodeNameB {
				etcdPodCount += 1
				for _, container := range pod.Spec.Containers {
					if container.Name == "etcd" {
						etcdContainerCount += 1
					}
					if container.Name == "etcdctl" {
						etcdctlContainerCount += 1
					}
				}
			}
		}
		o.Expect(etcdPodCount).To(o.Equal(expectedEtcdPod))
		o.Expect(etcdctlContainerCount).To(o.Equal(expectedEtcdCtlContainers))
		o.Expect(etcdContainerCount).To(o.Equal(expectedEtcdContainers))
	})

	g.It("Should verify the number of podman-etcd containers as configured", func() {
		g.By("Ensuring one podman etcd container is running on each node")
		for _, node := range nodes.Items {
			// The command fails (non-zero exit) unless a running podman
			// container named "etcd" exists on the node's host.
			_, _, err := runOnNodeNS(oc, node.Name, "openshift-etcd", ensurePodmanEtcdContainerIsRunning)
			o.Expect(err).To(o.BeNil(), fmt.Sprintf("expected a podman etcd container running on Node %s", node.Name))
		}
	})
})

0 commit comments

Comments
 (0)