
Commit 43ed3ba

add gpu checker
1 parent 440fac2 commit 43ed3ba

4 files changed: +299 -31 lines

.github/workflows/e2e_tests.yaml

Lines changed: 87 additions & 0 deletions
@@ -70,12 +70,99 @@ jobs:
       - name: Install NVidia GPU operator for KinD
         uses: ./common/github-actions/nvidia-gpu-operator

+      - name: Verify GPU availability in KinD
+        run: |
+          echo "Checking for available GPUs in the KinD cluster..."
+
+          # Wait for GPU operator pods to be ready (with timeout)
+          echo "Waiting for GPU operator pods to be ready..."
+          TIMEOUT=300 # 5 minutes timeout
+          END=$((SECONDS + TIMEOUT))
+
+          while [ $SECONDS -lt $END ]; do
+            # Get total number of pods in the namespace
+            TOTAL_PODS=$(kubectl get pods -n gpu-operator --no-headers | wc -l)
+
+            # Count pods that are either running and ready or completed successfully
+            # Exclude pods that are still initializing
+            READY_PODS=$(kubectl get pods -n gpu-operator --no-headers | grep -E 'Running|Completed' | grep -v 'PodInitializing' | wc -l)
+
+            if [ "$READY_PODS" -eq "$TOTAL_PODS" ] && [ "$TOTAL_PODS" -gt 0 ]; then
+              echo "All GPU operator pods are ready or completed successfully!"
+              break
+            fi
+
+            echo "Waiting for GPU operator pods to be ready... ($READY_PODS/$TOTAL_PODS)"
+            echo "Pod status:"
+            kubectl get pods -n gpu-operator
+            sleep 10
+          done
+
+          if [ $SECONDS -ge $END ]; then
+            echo "::error::Timeout waiting for GPU operator pods to be ready"
+            echo "GPU operator pod status:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            exit 1
+          fi
+
+          echo "Node details:"
+          kubectl describe nodes | grep -E 'nvidia.com/gpu|Allocatable:|Capacity:|Name:'
+
+          # Check if GPU operator has labeled nodes
+          GPU_LABELS=$(kubectl describe nodes | grep -c "nvidia.com/gpu")
+          if [ "$GPU_LABELS" -eq 0 ]; then
+            echo "::error::No NVIDIA GPU labels found on nodes. GPU operator may not be running correctly."
+            echo "Full node descriptions for debugging:"
+            kubectl describe nodes
+            exit 1
+          fi
+
+          # Check if GPUs are actually allocatable
+          GPU_ALLOCATABLE=$(kubectl get nodes -o jsonpath='{.items[*].status.allocatable.nvidia\.com/gpu}' | tr ' ' '\n' | grep -v '^$' | wc -l)
+          if [ "$GPU_ALLOCATABLE" -eq 0 ]; then
+            echo "::error::GPU operator is running but no GPUs are allocatable. Check GPU operator logs."
+            echo "Checking GPU operator pods:"
+            kubectl get pods -n gpu-operator -o wide
+            echo "GPU operator pod logs:"
+            kubectl logs -n gpu-operator -l app.kubernetes.io/name=gpu-operator
+            echo "GPU operator pod events:"
+            kubectl get events -n gpu-operator
+            echo "GPU operator pod descriptions:"
+            kubectl describe pods -n gpu-operator
+            exit 1
+          fi
+
+          echo "Successfully found $GPU_ALLOCATABLE allocatable GPU(s) in the cluster."
+
       - name: Deploy CodeFlare stack
         id: deploy
         run: |
           cd codeflare-operator
           echo Setting up CodeFlare stack
           make setup-e2e
+
+          # Create ConfigMap to disable mTLS
+          echo "Creating ConfigMap to disable mTLS..."
+          cat <<EOF | kubectl apply -f -
+          apiVersion: v1
+          kind: ConfigMap
+          metadata:
+            name: codeflare-operator-config
+            namespace: ray-system
+          data:
+            config.yaml: |
+              kuberay:
+                mTLSEnabled: false
+                rayDashboardOAuthEnabled: false
+                ingressDomain: "kind"
+              appwrapper:
+                enabled: true
+          EOF
+
           echo Deploying CodeFlare operator
           make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
           kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager

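For local debugging outside the workflow, roughly the same allocatable-GPU check can be sketched with the kubernetes Python client; this is only an illustrative equivalent of the kubectl/jsonpath step above (it assumes a kubeconfig pointing at the KinD cluster and is not part of this commit):

# Sketch: count allocatable NVIDIA GPUs across nodes, mirroring the
# GPU_ALLOCATABLE check in the workflow step above.
from kubernetes import client, config


def count_allocatable_gpus() -> int:
    config.load_kube_config()  # uses the current kubeconfig context (assumed to be the KinD cluster)
    nodes = client.CoreV1Api().list_node().items
    # status.allocatable maps resource names to quantity strings, e.g. {"nvidia.com/gpu": "1"}
    return sum(int(node.status.allocatable.get("nvidia.com/gpu", "0")) for node in nodes)


if __name__ == "__main__":
    print(f"Allocatable GPUs in the cluster: {count_allocatable_gpus()}")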
codeflare-kuberay.code-workspace

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+{
+  "folders": [
+    {
+      "path": "/Users/bkeane/Code/github.com/codeflare-sdk"
+    },
+    {
+      "path": "/Users/bkeane/Code/github.com/kuberay"
+    },
+    {
+      "path": "/Users/bkeane/Code/github.com/codeflare-operator"
+    }
+  ]
+}

tests/e2e/local_interactive_sdk_kind_test.py

Lines changed: 56 additions & 5 deletions
@@ -54,6 +54,10 @@ def run_local_interactives(
     ):
         cluster_name = "test-ray-cluster-li"
         logger.info(f"Starting run_local_interactives with {number_of_gpus} GPUs")
+
+        logger.info("Cleaning up existing Ray connections...")
+        ray.shutdown()
+        logger.info("Ray connection cleanup completed")

         logger.info("Creating cluster configuration...")
         cluster = Cluster(
@@ -82,8 +86,59 @@ def run_local_interactives(

         logger.info("Waiting for cluster to be ready...")
         cluster.wait_ready()
+        cluster.status()
         logger.info("Cluster is ready")

+        # Wait for pods to be fully ready
+        logger.info("Waiting for pods to be fully ready...")
+        TIMEOUT = 300  # 5 minutes timeout
+        END = time.time() + TIMEOUT
+
+        while time.time() < END:
+            head_pod = f"{cluster_name}-head"
+            worker_pod = f"{cluster_name}-small-group-{cluster_name}-worker"
+
+            # Get pod status
+            head_status = kubectl_get_pod_status(self.namespace, head_pod)
+            worker_status = kubectl_get_pod_status(self.namespace, worker_pod)
+
+            logger.info(f"Head pod status: {head_status}")
+            logger.info(f"Worker pod status: {worker_status}")
+
+            if "Running" in head_status and "Running" in worker_status:
+                # Check if containers are ready
+                head_ready = kubectl_get_pod_ready(self.namespace, head_pod)
+                worker_ready = kubectl_get_pod_ready(self.namespace, worker_pod)
+
+                if head_ready and worker_ready:
+                    logger.info("All pods and containers are ready!")
+                    break
+                else:
+                    logger.info("Pods are running but containers are not ready yet...")
+                    if not head_ready:
+                        head_container_status = kubectl_get_pod_container_status(self.namespace, head_pod)
+                        logger.info(f"Head pod container status: {head_container_status}")
+                    if not worker_ready:
+                        worker_container_status = kubectl_get_pod_container_status(self.namespace, worker_pod)
+                        logger.info(f"Worker pod container status: {worker_container_status}")
+            elif "Error" in head_status or "Error" in worker_status:
+                logger.error("Error getting pod status, retrying...")
+                time.sleep(10)
+                continue
+            else:
+                logger.info(f"Waiting for pods to be running... Current status - Head: {head_status}, Worker: {worker_status}")
+
+            time.sleep(10)
+
+        if time.time() >= END:
+            logger.error("Timeout waiting for pods to be ready")
+            # Get final pod status for debugging
+            head_pod = f"{cluster_name}-head"
+            worker_pod = f"{cluster_name}-small-group-{cluster_name}-worker"
+            logger.error(f"Final head pod status: {kubectl_get_pod_container_status(self.namespace, head_pod)}")
+            logger.error(f"Final worker pod status: {kubectl_get_pod_container_status(self.namespace, worker_pod)}")
+            raise TimeoutError("Pods did not become ready within the timeout period")
+
         logger.info("Generating TLS certificates...")
         generate_cert.generate_tls_cert(cluster_name, self.namespace)
         logger.info("TLS certificates generated")
@@ -107,13 +162,9 @@ def run_local_interactives(
         cluster_uri = cluster.cluster_uri()
         logger.info(f"Cluster URI: {cluster_uri}")

-        logger.info("Shutting down any existing Ray connections...")
-        ray.shutdown()
-        logger.info("Ray shutdown completed")
-
         logger.info("Initializing Ray connection...")
         try:
-            ray.init(address=client_url, logging_level="DEBUG")
+            ray.init(address=client_url, logging_level="INFO")
             logger.info("Ray initialization successful")
         except Exception as e:
             logger.error(f"Ray initialization failed: {str(e)}")

tests/e2e/support.py

Lines changed: 143 additions & 26 deletions
@@ -9,7 +9,7 @@
 from codeflare_sdk.common.kubernetes_cluster.kube_api_helpers import (
     _kube_api_error_handling,
 )
-
+import time

 def get_ray_cluster(cluster_name, namespace):
     api = client.CustomObjectsApi()
@@ -299,31 +299,38 @@ def create_kueue_resources(


 def delete_kueue_resources(self):
-    # Delete if given cluster-queue exists
-    for cq in self.cluster_queues:
-        try:
-            self.custom_api.delete_cluster_custom_object(
-                group="kueue.x-k8s.io",
-                plural="clusterqueues",
-                version="v1beta1",
-                name=cq,
-            )
-            print(f"\n'{cq}' cluster-queue deleted")
-        except Exception as e:
-            print(f"\nError deleting cluster-queue '{cq}' : {e}")
-
-    # Delete if given resource-flavor exists
-    for flavor in self.resource_flavors:
-        try:
-            self.custom_api.delete_cluster_custom_object(
-                group="kueue.x-k8s.io",
-                plural="resourceflavors",
-                version="v1beta1",
-                name=flavor,
-            )
-            print(f"'{flavor}' resource-flavor deleted")
-        except Exception as e:
-            print(f"\nError deleting resource-flavor '{flavor}': {e}")
+    try:
+        # Delete if given cluster-queue exists
+        for cq in getattr(self, "cluster_queues", []):
+            try:
+                self.custom_api.delete_cluster_custom_object(
+                    group="kueue.x-k8s.io",
+                    plural="clusterqueues",
+                    version="v1beta1",
+                    name=cq,
+                )
+                print(f"\n'{cq}' cluster-queue deleted")
+            except Exception as e:
+                print(f"\nError deleting cluster-queue '{cq}' : {e}")
+
+        # Delete if given resource-flavor exists
+        for flavor in getattr(self, "resource_flavors", []):
+            try:
+                self.custom_api.delete_cluster_custom_object(
+                    group="kueue.x-k8s.io",
+                    plural="resourceflavors",
+                    version="v1beta1",
+                    name=flavor,
+                )
+                print(f"'{flavor}' resource-flavor deleted")
+            except Exception as e:
+                print(f"\nError deleting resource-flavor '{flavor}': {e}")
+
+        # Wait for resources to be cleaned up
+        time.sleep(5)
+    except Exception as e:
+        print(f"Error during Kueue resource cleanup: {e}")
+        raise


 def get_pod_node(self, namespace, name):
@@ -407,3 +414,113 @@ def assert_get_cluster_and_jobsubmit(
     assert job_list[0].submission_id == submission_id

     cluster.down()
+
+
+def kubectl_get_pod_status(namespace, pod_name):
+    """Get the status of a pod."""
+    try:
+        # First check if the pod exists
+        result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace],
+            capture_output=True,
+            text=True,
+            check=False
+        )
+        if result.returncode != 0:
+            print(f"Pod {pod_name} not found in namespace {namespace}")
+            return "NotFound"
+
+        # Get the pod phase
+        result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.phase}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        status = result.stdout.strip("'")
+
+        # Get pod conditions for more detailed status
+        conditions = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.conditions}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        print(f"Pod {pod_name} conditions: {conditions.stdout}")
+
+        return status
+    except subprocess.CalledProcessError as e:
+        print(f"Error getting pod status for {pod_name}: {e.stderr}")
+        return "Error"
+
+
+def kubectl_get_pod_ready(namespace, pod_name):
+    """Check if all containers in a pod are ready."""
+    try:
+        # Get container statuses
+        result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        print(f"Container statuses for {pod_name}: {result.stdout}")
+
+        # Get ready status
+        result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses[*].ready}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        statuses = result.stdout.strip("'").split()
+        ready = all(status == "true" for status in statuses)
+
+        if not ready:
+            # Get container names and their ready status
+            names_result = subprocess.run(
+                ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses[*].name}'"],
+                capture_output=True,
+                text=True,
+                check=True
+            )
+            container_names = names_result.stdout.strip("'").split()
+            for name, status in zip(container_names, statuses):
+                print(f"Container {name} ready status: {status}")
+
+        return ready
+    except subprocess.CalledProcessError as e:
+        print(f"Error checking pod readiness for {pod_name}: {e.stderr}")
+        return False
+
+
+def kubectl_get_pod_container_status(namespace, pod_name):
+    """Get detailed container status for a pod."""
+    try:
+        # Get container names
+        names_result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses[*].name}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        container_names = names_result.stdout.strip("'").split()
+
+        # Get container states
+        states_result = subprocess.run(
+            ["kubectl", "get", "pod", pod_name, "-n", namespace, "-o", "jsonpath='{.status.containerStatuses[*].state}'"],
+            capture_output=True,
+            text=True,
+            check=True
+        )
+        states = states_result.stdout.strip("'").split()
+
+        # Combine names and states
+        status = {}
+        for name, state in zip(container_names, states):
+            status[name] = state
+
+        return status
+    except subprocess.CalledProcessError as e:
+        print(f"Error getting container status for {pod_name}: {e.stderr}")
+        return "Error"

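The new helpers above shell out to kubectl and parse jsonpath output. As a point of comparison, the same readiness check can be sketched with the kubernetes Python client that support.py already uses elsewhere (client.CustomObjectsApi); the function below is an illustrative alternative under that assumption, not part of the commit, and it expects the client to be configured beforehand (e.g. via config.load_kube_config()):

# Sketch: report whether every container in a pod is ready, using the
# kubernetes Python client instead of kubectl + jsonpath parsing.
from kubernetes import client
from kubernetes.client.rest import ApiException


def pod_is_ready(namespace, pod_name):
    """Return True when the pod exists and all of its containers report ready."""
    try:
        # Assumes kube config has already been loaded for this process.
        pod = client.CoreV1Api().read_namespaced_pod(name=pod_name, namespace=namespace)
    except ApiException as e:
        print(f"Error reading pod {pod_name}: {e.reason}")
        return False
    statuses = pod.status.container_statuses or []
    return bool(statuses) and all(cs.ready for cs in statuses)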