
Commit 43ed3ba

add gpu checker
1 parent 440fac2 commit 43ed3ba

4 files changed: +299 -31 lines

.github/workflows/e2e_tests.yaml

Lines changed: 87 additions & 0 deletions
@@ -70,12 +70,99 @@ jobs:
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator

+      - name: Verify GPU availability in KinD
+        run: |
+          echo "Checking for available GPUs in the KinD cluster..."
+
+          # Wait for GPU operator pods to be ready (with timeout)
+          echo "Waiting for GPU operator pods to be ready..."
+          TIMEOUT=300 # 5 minutes timeout
+          END=$((SECONDS + TIMEOUT))
+
+          while [ $SECONDS -lt $END ]; do
+            # Get total number of pods in the namespace
+            TOTAL_PODS=$(kubectl get pods -n gpu-operator --no-headers | wc -l)
+
+            # Count pods that are either running and ready or completed successfully
+            # Exclude pods that are still initializing
+            READY_PODS=$(kubectl get pods -n gpu-operator --no-headers | grep -E 'Running|Completed' | grep -v 'PodInitializing' | wc -l)
+
+            if [ "$READY_PODS" -eq "$TOTAL_PODS" ] && [ "$TOTAL_PODS" -gt 0 ]; then
+              echo "All GPU operator pods are ready or completed successfully!"
+              break
+            fi
+
+            echo "Waiting for GPU operator pods to be ready... ($READY_PODS/$TOTAL_PODS)"
+            echo "Pod status:"
+            kubectl get pods -n gpu-operator
+            sleep 10
+          done
+
+          if [ $SECONDS -ge $END ]; then
+            echo "::error::Timeout waiting for GPU operator pods to be ready"
+            echo "GPU operator pod status:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            exit 1
+          fi
+
+          echo "Node details:"
+          kubectl describe nodes | grep -E 'nvidia.com/gpu|Allocatable:|Capacity:|Name:'
+
+          # Check if GPU operator has labeled nodes
+          GPU_LABELS=$(kubectl describe nodes | grep -c "nvidia.com/gpu")
+          if [ "$GPU_LABELS" -eq 0 ]; then
+            echo "::error::No NVIDIA GPU labels found on nodes. GPU operator may not be running correctly."
+            echo "Full node descriptions for debugging:"
+            kubectl describe nodes
+            exit 1
+          fi
+
+          # Check if GPUs are actually allocatable
+          GPU_ALLOCATABLE=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' | tr ' ' '\n' | grep -v '^$' | wc -l)
+          if [ "$GPU_ALLOCATABLE" -eq 0 ]; then
+            echo "::error::GPU operator is running but no GPUs are allocatable. Check GPU operator logs."
+            echo "Checking GPU operator pods:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            echo "GPU operator pod descriptions:"
+            kubectl describe pods -n gpu-operator
+            exit 1
+          fi
+
+          echo "Successfully found $GPU_ALLOCATABLE allocatable GPU(s) in the cluster."
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
           cd codeflare-operator
           echo Setting up CodeFlare stack
           make setup-e2e
+
+          # Create ConfigMap to disable mTLS
+          echo "Creating ConfigMap to disable mTLS..."
+          cat <<EOF | kubectl apply -f -
+          apiVersion: v1
+          kind: ConfigMap
+          metadata:
+            name: codeflare-operator-config
+            namespace: ray-system
+          data:
+            config.yaml: |
+              kuberay:
+                mTLSEnabled: false
+                rayDashboardOAuthEnabled: false
+                ingressDomain: "kind"
+              appwrapper:
+                enabled: true
+          EOF
+
           echo Deploying CodeFlare operator
           make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager

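For local debugging outside the workflow, roughly the same allocatable-GPU check can be sketched with the kubernetes Python client; this is only an illustrative equivalent of the kubectl/jsonpath step above (it assumes a kubeconfig pointing at the KinD cluster and is not part of this commit):

# Sketch: count allocatable NVIDIA GPUs across nodes, mirroring the
# GPU_ALLOCATABLE check in the workflow step above.
from kubernetes import client, config


def count_allocatable_gpus() -> int:
    config.load_kube_config()  # uses the current kubeconfig context (assumed to be the KinD cluster)
    nodes = client.CoreV1Api().list_node().items
    # status.allocatable maps resource names to quantity strings, e.g. {"nvidia.com/gpu": "1"}
    return sum(int(node.status.allocatable.get("nvidia.com/gpu", "0")) for node in nodes)


if __name__ == "__main__":
    print(f"Allocatable GPUs in the cluster: {count_allocatable_gpus()}")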
codeflare-kuberay.code-workspace

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+{
+  "folders": [
+    {
+      "path": "/Users/bkeane/Code/github.com/codeflare-sdk"
+    },
+    {
+      "path": "/Users/bkeane/Code/github.com/kuberay"
+    },
+    {
+      "path": "/Users/bkeane/Code/github.com/codeflare-operator"
+    }
+  ]
+}

tests/e2e/local_interactive_sdk_kind_test.py

Lines changed: 56 additions & 5 deletions
@@ -54,6 +54,10 @@ def run_local_interactives(
     ):
         cluster_name = "test-ray-cluster-li"
         logger.info(f"Starting run_local_interactives with {number_of_gpus} GPUs")
+
+        logger.info("Cleaning up existing Ray connections...")
+        ray.shutdown()
+        logger.info("Ray connection cleanup completed")

         logger.info("Creating cluster configuration...")
         cluster = Cluster(
@@ -82,8 +86,59 @@ def run_local_interactives(

         logger.info("Waiting for cluster to be ready...")
         cluster.wait_ready()
+        cluster.status()
         logger.info("Cluster is ready")

+        # Wait for pods to be fully ready
+        logger.info("Waiting for pods to be fully ready...")
+        TIMEOUT = 300  # 5 minutes timeout
+        END = time.time() + TIMEOUT
+
+        while time.time() < END:
+            head_pod = f"{cluster_name}-head"
+            worker_pod = f"{cluster_name}-small-group-{cluster_name}-worker"
+
+            # Get pod status
+            head_status = kubectl_get_pod_status(self.namespace, head_pod)
+            worker_status = kubectl_get_pod_status(self.namespace, worker_pod)
+
+            logger.info(f"Head pod status: {head_status}")
+            logger.info(f"Worker pod status: {worker_status}")
+
+            if "Running" in head_status and "Running" in worker_status:
+                # Check if containers are ready
+                head_ready = kubectl_get_pod_ready(self.namespace, head_pod)
+                worker_ready = kubectl_get_pod_ready(self.namespace, worker_pod)
+
+                if head_ready and worker_ready:
+                    logger.info("All pods and containers are ready!")
+                    break
+                else:
+                    logger.info("Pods are running but containers are not ready yet...")
+                    if not head_ready:
+                        head_container_status = kubectl_get_pod_container_status(self.namespace, head_pod)
+                        logger.info(f"Head pod container status: {head_container_status}")
+                    if not worker_ready:
+                        worker_container_status = kubectl_get_pod_container_status(self.namespace, worker_pod)
+                        logger.info(f"Worker pod container status: {worker_container_status}")
+            elif "Error" in head_status or "Error" in worker_status:
+                logger.error("Error getting pod status, retrying...")
+                time.sleep(10)
+                continue
+            else:
+                logger.info(f"Waiting for pods to be running... Current status - Head: {head_status}, Worker: {worker_status}")
+
+            time.sleep(10)
+
+        if time.time() >= END:
+            logger.error("Timeout waiting for pods to be ready")
+            # Get final pod status for debugging
+            head_pod = f"{cluster_name}-head"
+            worker_pod = f"{cluster_name}-small-group-{cluster_name}-worker"
+            logger.error(f"Final head pod status: {kubectl_get_pod_container_status(self.namespace, head_pod)}")
+            logger.error(f"Final worker pod status: {kubectl_get_pod_container_status(self.namespace, worker_pod)}")
+            raise TimeoutError("Pods did not become ready within the timeout period")
+
         logger.info("Generating TLS certificates...")
         generate_cert.generate_tls_cert(cluster_name, self.namespace)
         logger.info("TLS certificates generated")
@@ -107,13 +162,9 @@ def run_local_interactives(
         cluster_uri = cluster.cluster_uri()
         logger.info(f"Cluster URI: {cluster_uri}")

-        logger.info("Shutting down any existing Ray connections...")
-        ray.shutdown()
-        logger.info("Ray shutdown completed")
-
         logger.info("Initializing Ray connection...")
         try:
-            ray.init(address=client_url, logging_level="DEBUG")
+            ray.init(address=client_url, logging_level="INFO")
             logger.info("Ray initialization successful")
         except Exception as e:
             logger.error(f"Ray initialization failed: {str(e)}")

tests/e2e/support.py

Lines changed: 143 additions & 26 deletions
@@ -9,7 +9,7 @@
 from codeflare_sdk.common.kubernetes_cluster.kube_api_helpers import (
     _kube_api_error_handling,
 )
-
+import time

 def get_ray_cluster(cluster_name, namespace):
     api = client.CustomObjectsApi()
@@ -299,31 +299,38 @@ def create_kueue_resources(


 def delete_kueue_resources(self):
-    # Delete if given cluster-queue exists
-    for cq in self.cluster_queues:
-        try:
-            self.custom_api.delete_cluster_custom_object(
-                group="kueue.x-k8s.io",
-                plural="clusterqueues",
-                version="v1beta1",
-                name=cq,
-            )
-            print(f"\n'{cq}' cluster-queue deleted")
-        except Exception as e:
-            print(f"\nError deleting cluster-queue '{cq}' : {e}")
-
-    # Delete if given resource-flavor exists
-    for flavor in self.resource_flavors:
-        try:
-            self.custom_api.delete_cluster_custom_object(
-                group="kueue.x-k8s.io",
-                plural="resourceflavors",
-                version="v1beta1",
-                name=flavor,
-            )
-            print(f"'{flavor}' resource-flavor deleted")
-        except Exception as e:
-            print(f"\nError deleting resource-flavor '{flavor}': {e}")
+    try:
+        # Delete if given cluster-queue exists
+        for cq in getattr(self, "cluster_queues", []):
+            try:
+                self.custom_api.delete_cluster_custom_object(
+                    group="kueue.x-k8s.io",
+                    plural="clusterqueues",
+                    version="v1beta1",
+                    name=cq,
+                )
+                print(f"\n'{cq}' cluster-queue deleted")
+            except Exception as e:
+                print(f"\nError deleting cluster-queue '{cq}' : {e}")
+
+        # Delete if given resource-flavor exists
+        for flavor in getattr(self, "resource_flavors", []):
+            try:
+                self.custom_api.delete_cluster_custom_object(
+                    group="kueue.x-k8s.io",
+                    plural="resourceflavors",
+                    version="v1beta1",
+                    name=flavor,
+                )
+                print(f"'{flavor}' resource-flavor deleted")
+            except Exception as e:
+                print(f"\nError deleting resource-flavor '{flavor}': {e}")
+
+        # Wait for resources to be cleaned up
+        time.sleep(5)
+    except Exception as e:
+        print(f"Error during Kueue resource cleanup: {e}")
+        raise


 def get_pod_node(self, namespace, name):
@@ -407,3 +414,113 @@ def assert_get_cluster_and_jobsubmit(
     assert job_list[0].submission_id == submission_id

     cluster.down()
+
+
+def kubectl_get_pod_status(namespace, pod_name):
+    """Get the status of a pod."""
+    try:
+        # First check if the pod exists
+        result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace],
+            capture_output=True,
+            text=True,
+            check=False
+        )
+        if result.returncode != 0:
+            print(f"Pod {pod_name} not found in namespace {namespace}")
+            return "NotFound"
+
+        # Get the pod phase
+        result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.phase}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        status = result.stdout.strip("'")
+
+        # Get pod conditions for more detailed status
+        conditions = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.conditions}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        print(f"Pod {pod_name} conditions: {conditions.stdout}")
+
+        return status
+    except subprocess.CalledProcessError as e:
+        print(f"Error getting pod status for {pod_name}: {e.stderr}")
+        return "Error"
+
+
+def kubectl_get_pod_ready(namespace, pod_name):
+    """Check if all containers in a pod are ready."""
+    try:
+        # Get container statuses
+        result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        print(f"Container statuses for {pod_name}: {result.stdout}")
+
+        # Get ready status
+        result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses[*].ready}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        statuses = result.stdout.strip("'").split()
+        ready = all(status == "true" for status in statuses)
+
+        if not ready:
+            # Get container names and their ready status
+            names_result = subprocess.run(
+                ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses[*].name}'"],
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            container_names = names_result.stdout.strip("'").split()
+            for name, status in zip(container_names, statuses):
+                print(f"Container {name} ready status: {status}")
+
+        return ready
+    except subprocess.CalledProcessError as e:
+        print(f"Error checking pod readiness for {pod_name}: {e.stderr}")
+        return False
+
+
+def kubectl_get_pod_container_status(namespace, pod_name):
+    """Get detailed container status for a pod."""
+    try:
+        # Get container names
+        names_result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses[*].name}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        container_names = names_result.stdout.strip("'").split()
+
+        # Get container states
+        states_result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses[*].state}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        states = states_result.stdout.strip("'").split()
+
+        # Combine names and states
+        status = {}
+        for name, state in zip(container_names, states):
+            status[name] = state
+
+        return status
+    except subprocess.CalledProcessError as e:
+        print(f"Error getting container status for {pod_name}: {e.stderr}")
+        return "Error"

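The new helpers above shell out to kubectl and parse jsonpath output. As a point of comparison, the same readiness check can be sketched with the kubernetes Python client that support.py already uses elsewhere (client.CustomObjectsApi); the function below is an illustrative alternative under that assumption, not part of the commit, and it expects the client to be configured beforehand (e.g. via config.load_kube_config()):

# Sketch: report whether every container in a pod is ready, using the
# kubernetes Python client instead of kubectl + jsonpath parsing.
from kubernetes import client
from kubernetes.client.rest import ApiException


def pod_is_ready(namespace, pod_name):
    """Return True when the pod exists and all of its containers report ready."""
    try:
        # Assumes kube config has already been loaded for this process.
        pod = client.CoreV1Api().read_namespaced_pod(name=pod_name, namespace=namespace)
    except ApiException as e:
        print(f"Error reading pod {pod_name}: {e.reason}")
        return False
    statuses = pod.status.container_statuses or []
    return bool(statuses) and all(cs.ready for cs in statuses)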