From 48d769ebee1aed5a62a928705d9b8fc73e97f77e Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 26 Jul 2023 13:49:32 -0700 Subject: [PATCH 01/15] add: create cluster from yaml function --- src/codeflare_sdk/cluster/cluster.py | 55 +++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index ff92bfcf0..3da2e213d 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -307,7 +307,8 @@ def torchx_config( def from_k8_cluster_object(rc): machine_types = ( rc["metadata"]["labels"]["orderedinstance"].split("_") - if "orderedinstance" in rc["metadata"]["labels"] + if "labels" in rc["metadata"] + and "orderedinstance" in rc["metadata"]["labels"] else [] ) local_interactive = ( @@ -347,6 +348,58 @@ def from_k8_cluster_object(rc): ) return Cluster(cluster_config) + def from_definition_yaml(yaml_path): + try: + with open(yaml_path) as yaml_file: + rc = yaml.load(yaml_file, Loader=yaml.FullLoader) + machine_types = ( + rc["metadata"]["labels"]["orderedinstance"].split("_") + if "labels" in rc["metadata"] + and "orderedinstance" in rc["metadata"]["labels"] + else [] + ) + worker_group_specs = rc["spec"]["resources"]["GenericItems"][0][ + "generictemplate" + ]["spec"]["workerGroupSpecs"][0] + local_interactive = ( + "volumeMounts" + in worker_group_specs["template"]["spec"]["containers"][0] + ) + cluster_config = ClusterConfiguration( + name=rc["metadata"]["name"], + namespace=rc["metadata"]["namespace"], + machine_types=machine_types, + min_worker=worker_group_specs["minReplicas"], + max_worker=worker_group_specs["maxReplicas"], + min_cpus=worker_group_specs["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["cpu"], + max_cpus=worker_group_specs["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["cpu"], + min_memory=int( + worker_group_specs["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["memory"][:-1] + ), + max_memory=int( + worker_group_specs["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["memory"][:-1] + ), + gpu=worker_group_specs["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["nvidia.com/gpu"], + instascale=True if machine_types else False, + image=worker_group_specs["template"]["spec"]["containers"][0][ + "image" + ], + local_interactive=local_interactive, + ) + return Cluster(cluster_config) + except IOError: + return None + def local_client_url(self): if self.config.local_interactive == True: ingress_domain = _get_ingress_domain() From ec6efabff3bae5b0d04abaca5c9b331d0eee302e Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 26 Jul 2023 15:02:57 -0700 Subject: [PATCH 02/15] add: submit and delete functions --- src/codeflare_sdk/cli/commands/delete.py | 17 +++++++++++++ src/codeflare_sdk/cli/commands/submit.py | 31 ++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 src/codeflare_sdk/cli/commands/delete.py create mode 100644 src/codeflare_sdk/cli/commands/submit.py diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py new file mode 100644 index 000000000..c3d7cde3e --- /dev/null +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -0,0 +1,17 @@ +import click + +from codeflare_sdk.cluster.cluster import get_cluster + + +@click.group() +def cli(): + pass + + +@cli.command() +@click.option("--name", type=str) +@click.option("--namespace", type=str, default="default") +def raycluster(name, namespace): + cluster = get_cluster(name, namespace) + cluster.down() + click.echo(f"Cluster deleted successfully") diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py new file mode 100644 index 000000000..57585b469 --- /dev/null +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -0,0 +1,31 @@ +import click +import yaml +import time + +from codeflare_sdk.cluster.cluster import Cluster +from codeflare_sdk.cli.cli_utils import load_auth +import codeflare_sdk.cluster.auth as sdk_auth + + +@click.group() +def cli(): + pass + + +@cli.command() +@click.argument("cluster_name") +@click.option("--wait", type=bool, default=False) +def raycluster(cluster_name, wait): + load_auth() + cluster = Cluster.from_definition_yaml(cluster_name + ".yaml") + if not cluster: + click.echo( + "Error submitting RayCluster. Make sure the RayCluster is defined before submitting it" + ) + return + if not wait: + cluster.up() + click.echo("Cluster submitted successfully") + return + cluster.up() + cluster.wait_ready() From d6bc888c46de6ee2cecb67f7a9deabbbe5ad473b Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 26 Jul 2023 15:18:36 -0700 Subject: [PATCH 03/15] change: cluster_name to name in submit raycluster --- src/codeflare_sdk/cli/commands/submit.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py index 57585b469..a012e880a 100644 --- a/src/codeflare_sdk/cli/commands/submit.py +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -13,11 +13,11 @@ def cli(): @cli.command() -@click.argument("cluster_name") +@click.option("--name", type=str) @click.option("--wait", type=bool, default=False) -def raycluster(cluster_name, wait): +def raycluster(name, wait): load_auth() - cluster = Cluster.from_definition_yaml(cluster_name + ".yaml") + cluster = Cluster.from_definition_yaml(name + ".yaml") if not cluster: click.echo( "Error submitting RayCluster. Make sure the RayCluster is defined before submitting it" From 21ede6101e1bcf3905ba59d35231d8f90b29378e Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 26 Jul 2023 15:46:41 -0700 Subject: [PATCH 04/15] add: load_auth in delete function --- src/codeflare_sdk/cli/commands/delete.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index c3d7cde3e..066881088 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -1,6 +1,7 @@ import click from codeflare_sdk.cluster.cluster import get_cluster +from codeflare_sdk.cli.cli_utils import load_auth @click.group() @@ -12,6 +13,7 @@ def cli(): @click.option("--name", type=str) @click.option("--namespace", type=str, default="default") def raycluster(name, namespace): + load_auth() cluster = get_cluster(name, namespace) cluster.down() click.echo(f"Cluster deleted successfully") From 569b7aa136224b5d9f8ba01b44f99464c564425d Mon Sep 17 00:00:00 2001 From: carsonmh Date: Wed, 26 Jul 2023 15:55:04 -0700 Subject: [PATCH 05/15] update: make get_cluster function use new config --- src/codeflare_sdk/cluster/cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 3da2e213d..b0075dfc3 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -459,8 +459,8 @@ def get_current_namespace(): # pragma: no cover def get_cluster(cluster_name: str, namespace: str = "default"): try: - config.load_kube_config() - api_instance = client.CustomObjectsApi() + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) rcs = api_instance.list_namespaced_custom_object( group="ray.io", version="v1alpha1", From 67bfd8ab3c1fe01bad185852e9ab7d9c107dab9a Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 27 Jul 2023 10:23:36 -0700 Subject: [PATCH 06/15] test: unit tests for submit and delete raycluster commands --- tests/cli-test-case.yaml | 6 +++--- tests/unit_test.py | 33 ++++++++++++++++++++++++++++++++- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/tests/cli-test-case.yaml b/tests/cli-test-case.yaml index 0788996a7..c312abfaa 100644 --- a/tests/cli-test-case.yaml +++ b/tests/cli-test-case.yaml @@ -4,7 +4,7 @@ metadata: labels: orderedinstance: cpu.small_gpu.large name: cli-test-cluster - namespace: ns + namespace: default spec: priority: 9 resources: @@ -36,7 +36,7 @@ spec: appwrapper.mcad.ibm.com: cli-test-cluster controller-tools.k8s.io: '1.0' name: cli-test-cluster - namespace: ns + namespace: default spec: autoscalerOptions: idleTimeoutSeconds: 60 @@ -184,7 +184,7 @@ spec: labels: odh-ray-cluster-service: cli-test-cluster-head-svc name: ray-dashboard-cli-test-cluster - namespace: ns + namespace: default spec: port: targetPort: dashboard diff --git a/tests/unit_test.py b/tests/unit_test.py index f8d12580a..c414e6c96 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -92,7 +92,7 @@ def test_cluster_definition_cli(): define_cluster_command = """ define raycluster --name=cli-test-cluster - --namespace=ns + --namespace=default --min_worker=1 --max_worker=2 --min_cpus=3 @@ -170,6 +170,37 @@ def test_load_auth(): assert sdk_auth.api_client is not None +def test_cli_cluster_submission(mocker): + mocker.patch.object(client, "ApiClient") + runner = CliRunner() + submit_cluster_command = """ + submit raycluster + --name=cli-test-cluster + """ + result = runner.invoke(cli, submit_cluster_command) + + assert result.exit_code == 0 + assert "Cluster submitted successfully" in result.output + + +def test_cli_cluster_deletion(mocker): + mocker.patch.object(client, "ApiClient") + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) + runner = CliRunner() + delete_cluster_command = """ + delete raycluster + --name=quicktest + """ + result = runner.invoke(cli, delete_cluster_command) + + assert result.exit_code == 0 + assert "Cluster deleted successfully" in result.output + + # For mocking openshift client results fake_res = openshift.Result("fake") From 3ff42d43103a8fba2ad4d433ba0a9dad6b6901d2 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Thu, 27 Jul 2023 13:37:49 -0700 Subject: [PATCH 07/15] change: format slightly on submit/delete commands --- src/codeflare_sdk/cli/commands/delete.py | 4 ++-- src/codeflare_sdk/cli/commands/submit.py | 4 ++-- tests/unit_test.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index 066881088..5187fd5fd 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -10,8 +10,8 @@ def cli(): @cli.command() -@click.option("--name", type=str) -@click.option("--namespace", type=str, default="default") +@click.argument("name", type=str) +@click.option("--namespace", type=str, required=True) def raycluster(name, namespace): load_auth() cluster = get_cluster(name, namespace) diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py index a012e880a..f1711a8aa 100644 --- a/src/codeflare_sdk/cli/commands/submit.py +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -13,8 +13,8 @@ def cli(): @cli.command() -@click.option("--name", type=str) -@click.option("--wait", type=bool, default=False) +@click.argument("name", type=str) +@click.option("--wait", is_flag=True) def raycluster(name, wait): load_auth() cluster = Cluster.from_definition_yaml(name + ".yaml") diff --git a/tests/unit_test.py b/tests/unit_test.py index c414e6c96..dcd30983d 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -175,7 +175,7 @@ def test_cli_cluster_submission(mocker): runner = CliRunner() submit_cluster_command = """ submit raycluster - --name=cli-test-cluster + cli-test-cluster """ result = runner.invoke(cli, submit_cluster_command) @@ -193,7 +193,7 @@ def test_cli_cluster_deletion(mocker): runner = CliRunner() delete_cluster_command = """ delete raycluster - --name=quicktest + quicktest """ result = runner.invoke(cli, delete_cluster_command) From 8ce8edfbeb471b251f0918f7913b532728dd4b95 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Fri, 28 Jul 2023 16:11:43 -0700 Subject: [PATCH 08/15] Add: context for current namespace and .codeflare path --- src/codeflare_sdk/cli/codeflare_cli.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py index f8a5cbab7..28adbc3c5 100644 --- a/src/codeflare_sdk/cli/codeflare_cli.py +++ b/src/codeflare_sdk/cli/codeflare_cli.py @@ -2,12 +2,23 @@ import sys import os +from codeflare_sdk.cluster.cluster import get_current_namespace +from codeflare_sdk.cli.cli_utils import load_auth + cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands")) class CodeflareContext: - def __init__(self, codeflare_path): - self.codeflare_path = codeflare_path + def __init__(self): + self.codeflare_path = _initialize_codeflare_folder() + self.namespace = get_current_namespace() + + +def _initialize_codeflare_folder(): + codeflare_folder = os.path.expanduser("~/.codeflare") + if not os.path.exists(codeflare_folder): + os.makedirs(codeflare_folder) + return codeflare_folder class CodeflareCLI(click.MultiCommand): @@ -31,18 +42,11 @@ def get_command(self, ctx, name): return -def initialize_cli(ctx): - # Make .codeflare folder - codeflare_folder = os.path.expanduser("~/.codeflare") - if not os.path.exists(codeflare_folder): - os.makedirs(codeflare_folder) - ctx.obj = CodeflareContext(codeflare_folder) - - @click.command(cls=CodeflareCLI) @click.pass_context def cli(ctx): - initialize_cli(ctx) # Ran on every command + load_auth() + ctx.obj = CodeflareContext() # Ran on every command pass From 7fe8461515c8df00be4dc21e050f5a2ddc87dd4f Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 11:18:10 -0700 Subject: [PATCH 09/15] fix: remove load_auth in functions so it doesn't run twice --- carson.yaml | 173 +++++++++++++++++++++++ src/codeflare_sdk/cli/commands/delete.py | 1 - src/codeflare_sdk/cli/commands/submit.py | 1 - 3 files changed, 173 insertions(+), 2 deletions(-) create mode 100644 carson.yaml diff --git a/carson.yaml b/carson.yaml new file mode 100644 index 000000000..79ff972cb --- /dev/null +++ b/carson.yaml @@ -0,0 +1,173 @@ +apiVersion: mcad.ibm.com/v1beta1 +kind: AppWrapper +metadata: + name: carson + namespace: default +spec: + priority: 9 + resources: + GenericItems: + - custompodresources: + - limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + - limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + replicas: 1 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + generictemplate: + apiVersion: ray.io/v1alpha1 + kind: RayCluster + metadata: + labels: + appwrapper.mcad.ibm.com: carson + controller-tools.k8s.io: '1.0' + name: carson + namespace: default + spec: + autoscalerOptions: + idleTimeoutSeconds: 60 + imagePullPolicy: Always + resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 500m + memory: 512Mi + upscalingMode: Default + enableInTreeAutoscaling: false + headGroupSpec: + rayStartParams: + block: 'true' + dashboard-host: 0.0.0.0 + num-gpus: '0' + serviceType: ClusterIP + template: + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + imagePullPolicy: Always + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: ray-head + ports: + - containerPort: 6379 + name: gcs + - containerPort: 8265 + name: dashboard + - containerPort: 10001 + name: client + resources: + limits: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + requests: + cpu: 2 + memory: 8G + nvidia.com/gpu: 0 + imagePullSecrets: [] + rayVersion: 2.1.0 + workerGroupSpecs: + - groupName: small-group-carson + maxReplicas: 1 + minReplicas: 1 + rayStartParams: + block: 'true' + num-gpus: '0' + replicas: 1 + template: + metadata: + annotations: + key: value + labels: + key: value + spec: + containers: + - env: + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: RAY_USE_TLS + value: '0' + - name: RAY_TLS_SERVER_CERT + value: /home/ray/workspace/tls/server.crt + - name: RAY_TLS_SERVER_KEY + value: /home/ray/workspace/tls/server.key + - name: RAY_TLS_CA_CERT + value: /home/ray/workspace/tls/ca.crt + image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - ray stop + name: machine-learning + resources: + limits: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + requests: + cpu: 1 + memory: 2G + nvidia.com/gpu: 0 + imagePullSecrets: [] + initContainers: + - command: + - sh + - -c + - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; + do echo waiting for myservice; sleep 2; done + image: busybox:1.28 + name: init-myservice + replicas: 1 + - generictemplate: + apiVersion: route.openshift.io/v1 + kind: Route + metadata: + labels: + odh-ray-cluster-service: carson-head-svc + name: ray-dashboard-carson + namespace: default + spec: + port: + targetPort: dashboard + to: + kind: Service + name: carson-head-svc + replica: 1 + Items: [] diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index 5187fd5fd..dbb7d3b8d 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -13,7 +13,6 @@ def cli(): @click.argument("name", type=str) @click.option("--namespace", type=str, required=True) def raycluster(name, namespace): - load_auth() cluster = get_cluster(name, namespace) cluster.down() click.echo(f"Cluster deleted successfully") diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py index f1711a8aa..f95cff095 100644 --- a/src/codeflare_sdk/cli/commands/submit.py +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -16,7 +16,6 @@ def cli(): @click.argument("name", type=str) @click.option("--wait", is_flag=True) def raycluster(name, wait): - load_auth() cluster = Cluster.from_definition_yaml(name + ".yaml") if not cluster: click.echo( From 1355455e39f7ef07d1a3f87f2d3916d2cf05581b Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 11:20:28 -0700 Subject: [PATCH 10/15] Add: help messages for submit and delete functions --- src/codeflare_sdk/cli/commands/delete.py | 6 ++++++ src/codeflare_sdk/cli/commands/submit.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index dbb7d3b8d..38026990c 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -6,6 +6,9 @@ @click.group() def cli(): + """ + Delete a specified resource from the Kubernetes cluster + """ pass @@ -13,6 +16,9 @@ def cli(): @click.argument("name", type=str) @click.option("--namespace", type=str, required=True) def raycluster(name, namespace): + """ + Delete a specified RayCluster from the Kubernetes cluster + """ cluster = get_cluster(name, namespace) cluster.down() click.echo(f"Cluster deleted successfully") diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py index f95cff095..7734154d7 100644 --- a/src/codeflare_sdk/cli/commands/submit.py +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -9,6 +9,9 @@ @click.group() def cli(): + """ + Submit a defined resource to the Kubernetes cluster + """ pass @@ -16,6 +19,9 @@ def cli(): @click.argument("name", type=str) @click.option("--wait", is_flag=True) def raycluster(name, wait): + """ + Submit a defined RayCluster to the Kubernetes cluster + """ cluster = Cluster.from_definition_yaml(name + ".yaml") if not cluster: click.echo( From c40601c8be571c54bd86e311757c7708c3e214b9 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 11:43:48 -0700 Subject: [PATCH 11/15] cleanup --- carson.yaml | 173 --------------------------------------------- tests/unit_test.py | 1 - 2 files changed, 174 deletions(-) delete mode 100644 carson.yaml diff --git a/carson.yaml b/carson.yaml deleted file mode 100644 index 79ff972cb..000000000 --- a/carson.yaml +++ /dev/null @@ -1,173 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: carson - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - generictemplate: - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: carson - controller-tools.k8s.io: '1.0' - name: carson - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - imagePullSecrets: [] - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-carson - maxReplicas: 1 - minReplicas: 1 - rayStartParams: - block: 'true' - num-gpus: '0' - replicas: 1 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: quay.io/project-codeflare/ray:2.5.0-py38-cu116 - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 0 - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; - do echo waiting for myservice; sleep 2; done - image: busybox:1.28 - name: init-myservice - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: carson-head-svc - name: ray-dashboard-carson - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: carson-head-svc - replica: 1 - Items: [] diff --git a/tests/unit_test.py b/tests/unit_test.py index dcd30983d..cdb3f66f3 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -2353,4 +2353,3 @@ def test_cleanup(): os.remove("tls-cluster-namespace/tls.key") os.rmdir("tls-cluster-namespace") os.remove("cli-test-cluster.yaml") - os.removedirs(os.path.expanduser("~/.codeflare")) From edfe0b94e2a9fbd505309b502f1f5b2282c5130e Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 12:09:26 -0700 Subject: [PATCH 12/15] remove: remove get_namespace every function call --- src/codeflare_sdk/cli/codeflare_cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py index 28adbc3c5..05b1be169 100644 --- a/src/codeflare_sdk/cli/codeflare_cli.py +++ b/src/codeflare_sdk/cli/codeflare_cli.py @@ -11,7 +11,6 @@ class CodeflareContext: def __init__(self): self.codeflare_path = _initialize_codeflare_folder() - self.namespace = get_current_namespace() def _initialize_codeflare_folder(): From dca9bab970dbc4b8d0c89aa36ed858f9aaff7b69 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 12:13:35 -0700 Subject: [PATCH 13/15] fix: fix tests --- tests/unit_test.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tests/unit_test.py b/tests/unit_test.py index cdb3f66f3..fd965ebea 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -87,7 +87,8 @@ def test_cli_working(): assert result.exit_code == 0 -def test_cluster_definition_cli(): +def test_cluster_definition_cli(mocker): + mocker.patch.object(client, "ApiClient") runner = CliRunner() define_cluster_command = """ define raycluster @@ -105,7 +106,10 @@ def test_cluster_definition_cli(): --image_pull_secrets='["cli-test-pull-secret"]' """ result = runner.invoke(cli, define_cluster_command) - assert result.output == "Written to: cli-test-cluster.yaml\n" + assert ( + result.output + == "No authentication found, trying default kubeconfig\nWritten to: cli-test-cluster.yaml\n" + ) assert filecmp.cmp( "cli-test-cluster.yaml", f"{parent}/tests/cli-test-case.yaml", shallow=True ) @@ -120,7 +124,10 @@ def test_login_cli(mocker): --token=testtoken """ login_result = runner.invoke(cli, k8s_login_command) - assert login_result.output == "Logged into 'testserver:6443'\n" + assert ( + login_result.output + == "No authentication found, trying default kubeconfig\nLogged into 'testserver:6443'\n" + ) try: auth_file_path = os.path.expanduser("~/.codeflare/auth") with open(auth_file_path, "rb") as file: @@ -170,7 +177,7 @@ def test_load_auth(): assert sdk_auth.api_client is not None -def test_cli_cluster_submission(mocker): +def test_cluster_submission_cli(mocker): mocker.patch.object(client, "ApiClient") runner = CliRunner() submit_cluster_command = """ @@ -183,7 +190,7 @@ def test_cli_cluster_submission(mocker): assert "Cluster submitted successfully" in result.output -def test_cli_cluster_deletion(mocker): +def test_cluster_deletion_cli(mocker): mocker.patch.object(client, "ApiClient") mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch( From 30ec230b109e585acc7b56a12d5088966853ea39 Mon Sep 17 00:00:00 2001 From: carsonmh Date: Mon, 31 Jul 2023 12:13:55 -0700 Subject: [PATCH 14/15] change: make namespace default to 'default' and change test slightly --- src/codeflare_sdk/cli/commands/delete.py | 2 +- tests/unit_test.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index 38026990c..4cd71dc0a 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -14,7 +14,7 @@ def cli(): @cli.command() @click.argument("name", type=str) -@click.option("--namespace", type=str, required=True) +@click.option("--namespace", type=str, default="default") def raycluster(name, namespace): """ Delete a specified RayCluster from the Kubernetes cluster diff --git a/tests/unit_test.py b/tests/unit_test.py index fd965ebea..783ec928f 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -2360,3 +2360,4 @@ def test_cleanup(): os.remove("tls-cluster-namespace/tls.key") os.rmdir("tls-cluster-namespace") os.remove("cli-test-cluster.yaml") + os.removedirs(os.path.expanduser("~/.codeflare")) From 2943dce24e296e4157a6c5240a8f415c28f5c8ba Mon Sep 17 00:00:00 2001 From: carsonmh Date: Tue, 1 Aug 2023 10:58:09 -0700 Subject: [PATCH 15/15] refactor: remove unused imports --- src/codeflare_sdk/cli/codeflare_cli.py | 2 -- src/codeflare_sdk/cli/commands/delete.py | 1 - src/codeflare_sdk/cli/commands/submit.py | 4 ---- 3 files changed, 7 deletions(-) diff --git a/src/codeflare_sdk/cli/codeflare_cli.py b/src/codeflare_sdk/cli/codeflare_cli.py index 05b1be169..78354695f 100644 --- a/src/codeflare_sdk/cli/codeflare_cli.py +++ b/src/codeflare_sdk/cli/codeflare_cli.py @@ -1,8 +1,6 @@ import click -import sys import os -from codeflare_sdk.cluster.cluster import get_current_namespace from codeflare_sdk.cli.cli_utils import load_auth cmd_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "commands")) diff --git a/src/codeflare_sdk/cli/commands/delete.py b/src/codeflare_sdk/cli/commands/delete.py index 4cd71dc0a..c1ec12451 100644 --- a/src/codeflare_sdk/cli/commands/delete.py +++ b/src/codeflare_sdk/cli/commands/delete.py @@ -1,7 +1,6 @@ import click from codeflare_sdk.cluster.cluster import get_cluster -from codeflare_sdk.cli.cli_utils import load_auth @click.group() diff --git a/src/codeflare_sdk/cli/commands/submit.py b/src/codeflare_sdk/cli/commands/submit.py index 7734154d7..8a476d602 100644 --- a/src/codeflare_sdk/cli/commands/submit.py +++ b/src/codeflare_sdk/cli/commands/submit.py @@ -1,10 +1,6 @@ import click -import yaml -import time from codeflare_sdk.cluster.cluster import Cluster -from codeflare_sdk.cli.cli_utils import load_auth -import codeflare_sdk.cluster.auth as sdk_auth @click.group()