Add Caching in Kubernetes orchestrator entrypoint (#3703)

schustmi · web-flow · commit e174db4c825e · 2025-05-27T11:36:13.000+02:00
* Cache step in kubernetes orchestrator pod

* Add caching in kubernetes orchestrator entrypoint

* Linting

* Add option to disable orchestrator pod caching
diff --git a/src/zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py b/src/zenml/integrations/kubernetes/flavors/kubernetes_orchestrator_flavor.py
@@ -67,6 +67,8 @@ class KubernetesOrchestratorSettings(BaseSettings):
         ttl_seconds_after_finished: The amount of seconds to keep finished jobs
             before deleting them. This only applies to jobs created when
             scheduling a pipeline.
+        prevent_orchestrator_pod_caching: If `True`, the orchestrator pod will
+            not try to compute cached steps before starting the step pods.
     """
 
     synchronous: bool = True
@@ -85,6 +87,7 @@ class KubernetesOrchestratorSettings(BaseSettings):
     successful_jobs_history_limit: Optional[NonNegativeInt] = None
     failed_jobs_history_limit: Optional[NonNegativeInt] = None
     ttl_seconds_after_finished: Optional[NonNegativeInt] = None
+    prevent_orchestrator_pod_caching: bool = False
 
 
 class KubernetesOrchestratorConfig(
diff --git a/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py b/src/zenml/integrations/kubernetes/orchestrators/kubernetes_orchestrator_entrypoint.py
@@ -74,23 +74,37 @@ def main() -> None:
     orchestrator_pod_name = socket.gethostname()
 
     client = Client()
+    active_stack = client.active_stack
+    orchestrator = active_stack.orchestrator
+    assert isinstance(orchestrator, KubernetesOrchestrator)
 
-    deployment_config = client.get_deployment(args.deployment_id)
+    deployment = client.get_deployment(args.deployment_id)
+    pipeline_settings = cast(
+        KubernetesOrchestratorSettings,
+        orchestrator.get_settings(deployment),
+    )
 
-    pipeline_dag = {
-        step_name: step.spec.upstream_steps
-        for step_name, step in deployment_config.step_configurations.items()
-    }
     step_command = StepEntrypointConfiguration.get_entrypoint_command()
 
-    active_stack = client.active_stack
+    if args.run_id and not pipeline_settings.prevent_orchestrator_pod_caching:
+        from zenml.orchestrators import cache_utils
+
+        run_required = (
+            cache_utils.create_cached_step_runs_and_prune_deployment(
+                deployment=deployment,
+                pipeline_run=client.get_pipeline_run(args.run_id),
+                stack=active_stack,
+            )
+        )
+
+        if not run_required:
+            return
+
     mount_local_stores = active_stack.orchestrator.config.is_local
 
     # Get a Kubernetes client from the active Kubernetes orchestrator, but
     # override the `incluster` setting to `True` since we are running inside
     # the Kubernetes cluster.
-    orchestrator = active_stack.orchestrator
-    assert isinstance(orchestrator, KubernetesOrchestrator)
     kube_client = orchestrator.get_kube_client(incluster=True)
     core_api = k8s_client.CoreV1Api(kube_client)
 
@@ -121,7 +135,7 @@ def run_step_on_kubernetes(step_name: str) -> None:
         Raises:
             Exception: If the pod fails to start.
         """
-        step_config = deployment_config.step_configurations[step_name].config
+        step_config = deployment.step_configurations[step_name].config
         settings = step_config.settings.get("orchestrator.kubernetes", None)
         settings = KubernetesOrchestratorSettings.model_validate(
             settings.model_dump() if settings else {}
@@ -147,10 +161,10 @@ def run_step_on_kubernetes(step_name: str) -> None:
         )
 
         image = KubernetesOrchestrator.get_image(
-            deployment=deployment_config, step_name=step_name
+            deployment=deployment, step_name=step_name
         )
         step_args = StepEntrypointConfiguration.get_entrypoint_arguments(
-            step_name=step_name, deployment_id=deployment_config.id
+            step_name=step_name, deployment_id=deployment.id
         )
 
         # We set some default minimum memory resource requests for the step pod
@@ -165,9 +179,7 @@ def run_step_on_kubernetes(step_name: str) -> None:
 
         if orchestrator.config.pass_zenml_token_as_secret:
             env.pop("ZENML_STORE_API_TOKEN", None)
-            secret_name = orchestrator.get_token_secret_name(
-                deployment_config.id
-            )
+            secret_name = orchestrator.get_token_secret_name(deployment.id)
             pod_settings.env.append(
                 {
                     "name": "ZENML_STORE_API_TOKEN",
@@ -184,7 +196,7 @@ def run_step_on_kubernetes(step_name: str) -> None:
         pod_manifest = build_pod_manifest(
             pod_name=pod_name,
             run_name=args.run_name,
-            pipeline_name=deployment_config.pipeline_configuration.name,
+            pipeline_name=deployment.pipeline_configuration.name,
             image_name=image,
             command=step_command,
             args=step_args,
@@ -251,8 +263,8 @@ def finalize_run(node_states: Dict[str, NodeStatus]) -> None:
 
             pipeline_runs = client.list_pipeline_runs(
                 hydrate=True,
-                project=deployment_config.project_id,
-                deployment_id=deployment_config.id,
+                project=deployment.project_id,
+                deployment_id=deployment.id,
                 **list_args,
             )
             if not len(pipeline_runs):
@@ -298,27 +310,26 @@ def finalize_run(node_states: Dict[str, NodeStatus]) -> None:
     parallel_node_startup_waiting_period = (
         orchestrator.config.parallel_step_startup_waiting_period or 0.0
     )
-    settings = cast(
-        KubernetesOrchestratorSettings,
-        orchestrator.get_settings(deployment_config),
-    )
+
+    pipeline_dag = {
+        step_name: step.spec.upstream_steps
+        for step_name, step in deployment.step_configurations.items()
+    }
     try:
         ThreadedDagRunner(
             dag=pipeline_dag,
             run_fn=run_step_on_kubernetes,
             finalize_fn=finalize_run,
             parallel_node_startup_waiting_period=parallel_node_startup_waiting_period,
-            max_parallelism=settings.max_parallelism,
+            max_parallelism=pipeline_settings.max_parallelism,
         ).run()
         logger.info("Orchestration pod completed.")
     finally:
         if (
             orchestrator.config.pass_zenml_token_as_secret
-            and deployment_config.schedule is None
+            and deployment.schedule is None
         ):
-            secret_name = orchestrator.get_token_secret_name(
-                deployment_config.id
-            )
+            secret_name = orchestrator.get_token_secret_name(deployment.id)
             try:
                 kube_utils.delete_secret(
                     core_api=core_api,
diff --git a/src/zenml/orchestrators/base_orchestrator.py b/src/zenml/orchestrators/base_orchestrator.py
@@ -220,29 +220,18 @@ def run(
             and not deployment.schedule
             and not prevent_client_side_caching
         ):
-            from zenml.orchestrators import step_run_utils
+            from zenml.orchestrators import cache_utils
 
-            cached_invocations = step_run_utils.create_cached_step_runs(
-                deployment=deployment,
-                pipeline_run=placeholder_run,
-                stack=stack,
+            run_required = (
+                cache_utils.create_cached_step_runs_and_prune_deployment(
+                    deployment=deployment,
+                    pipeline_run=placeholder_run,
+                    stack=stack,
+                )
             )
 
-            for invocation_id in cached_invocations:
-                # Remove the cached step invocations from the deployment so
-                # the orchestrator does not try to run them
-                deployment.step_configurations.pop(invocation_id)
-
-            for step in deployment.step_configurations.values():
-                for invocation_id in cached_invocations:
-                    if invocation_id in step.spec.upstream_steps:
-                        step.spec.upstream_steps.remove(invocation_id)
-
-            if len(deployment.step_configurations) == 0:
-                # All steps were cached, we update the pipeline run status and
-                # don't actually use the orchestrator to run the pipeline
+            if not run_required:
                 self._cleanup_run()
-                logger.info("All steps of the pipeline run were cached.")
                 return
         else:
             logger.debug("Skipping client-side caching.")
diff --git a/src/zenml/orchestrators/cache_utils.py b/src/zenml/orchestrators/cache_utils.py
@@ -19,13 +19,20 @@
 from zenml.client import Client
 from zenml.enums import ExecutionStatus, SorterOps
 from zenml.logger import get_logger
+from zenml.orchestrators import step_run_utils
 
 if TYPE_CHECKING:
     from uuid import UUID
 
     from zenml.artifact_stores import BaseArtifactStore
     from zenml.config.step_configurations import Step
-    from zenml.models import StepRunResponse
+    from zenml.models import (
+        PipelineDeploymentResponse,
+        PipelineRunResponse,
+        StepRunResponse,
+    )
+    from zenml.stack import Stack
+
 
 logger = get_logger(__name__)
 
@@ -127,3 +134,43 @@ def get_cached_step_run(cache_key: str) -> Optional["StepRunResponse"]:
     if cache_candidates:
         return cache_candidates[0]
     return None
+
+
+def create_cached_step_runs_and_prune_deployment(
+    deployment: "PipelineDeploymentResponse",
+    pipeline_run: "PipelineRunResponse",
+    stack: "Stack",
+) -> bool:
+    """Create cached step runs and prune the cached steps from the deployment.
+
+    Args:
+        deployment: The deployment of the pipeline run (mutated in place).
+        pipeline_run: The pipeline run for which to create the step runs.
+        stack: The stack on which the pipeline run is happening.
+
+    Returns:
+        `False` if no steps remain in the deployment after pruning, else `True`.
+    """
+    cached_invocations = step_run_utils.create_cached_step_runs(
+        deployment=deployment,
+        pipeline_run=pipeline_run,
+        stack=stack,
+    )
+
+    for invocation_id in cached_invocations:
+        # Remove the cached step invocations from the deployment so
+        # the orchestrator does not try to run them
+        deployment.step_configurations.pop(invocation_id)
+
+    for step in deployment.step_configurations.values():
+        for invocation_id in cached_invocations:
+            if invocation_id in step.spec.upstream_steps:
+                step.spec.upstream_steps.remove(invocation_id)
+
+    if len(deployment.step_configurations) == 0:
+        # No step is left to run; report this to the caller, which decides
+        # how to finish the run without using the orchestrator
+        logger.info("All steps of the pipeline run were cached.")
+        return False
+
+    return True