Skip to content

Commit 09181c7

Browse files
Updated ray-finetune-test flow to reuse existing demo files from examples directory
1 parent 0de09b7 commit 09181c7

21 files changed

+121
-1308
lines changed

examples/ray-finetune-llm-deepspeed/create_dataset.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22
import json
33
import os
44

5-
dataset = load_dataset("gsm8k", "main", cache_dir="../../datasets")
5+
cache_dir="../../datasets"
6+
if not os.path.exists(cache_dir):
7+
cache_dir=""
8+
dataset = load_dataset("gsm8k", "main", cache_dir=cache_dir)
69

710
dataset_splits = {"train": dataset["train"], "test": dataset["test"]}
811

examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
"# On OpenShift, you can retrieve the token by running `oc whoami -t`,\n",
4343
"# and the server with `oc cluster-info`.\n",
4444
"auth = TokenAuthentication(\n",
45-
" token = \"\",\n",
46-
" server = \"\",\n",
45+
" token = '',\n",
46+
" server = '',\n",
4747
" skip_tls=False\n",
4848
")\n",
4949
"auth.login()"
@@ -69,7 +69,7 @@
6969
" head_memory=128,\n",
7070
" head_gpus=1,\n",
7171
" num_gpus=1,\n",
72-
" image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n",
72+
" image='quay.io/rhoai/ray:2.23.0-py39-cu121',\n",
7373
"))"
7474
]
7575
},
@@ -126,7 +126,7 @@
126126
"source": [
127127
"# The S3 bucket where to store checkpoint.\n",
128128
"# It can be set manually, otherwise it's retrieved from configured the data connection.\n",
129-
"s3_bucket = \"\"\n",
129+
"s3_bucket = ''\n",
130130
"if not s3_bucket:\n",
131131
" s3_bucket = os.environ.get('AWS_S3_BUCKET')\n",
132132
"assert s3_bucket, \"An S3 bucket must be provided to store checkpoints\""
@@ -153,12 +153,12 @@
153153
" \"--eval-batch-size-per-device=32 \",\n",
154154
" runtime_env={\n",
155155
" \"env_vars\": {\n",
156-
" \"AWS_ACCESS_KEY_ID\": os.environ.get('AWS_ACCESS_KEY_ID'),\n",
157-
" \"AWS_SECRET_ACCESS_KEY\": os.environ.get('AWS_SECRET_ACCESS_KEY'),\n",
158-
" \"AWS_DEFAULT_REGION\": os.environ.get('AWS_DEFAULT_REGION')\n",
156+
" 'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID'),\n",
157+
" 'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY'),\n",
158+
" 'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION')\n",
159159
" },\n",
160-
" \"pip\": \"requirements.txt\",\n",
161-
" \"working_dir\": \"./\",\n",
160+
" 'pip': 'requirements.txt',\n",
161+
" 'working_dir': './',\n",
162162
" \"excludes\": [\"/docs/\", \"*.ipynb\", \"*.md\"]\n",
163163
" },\n",
164164
")\n",

examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -592,6 +592,9 @@ def parse_args():
592592

593593
parser.add_argument("--lora", action="store_true", default=False,
594594
help="If passed, will enable parameter efficient fine-tuning with LoRA.")
595+
596+
parser.add_argument("--lora-config", type=str, default="./lora_configs/lora.json",
597+
help="Lora config json to use.")
595598

596599
parser.add_argument("--num-epochs", type=int, default=1,
597600
help="Number of epochs to train for.")
@@ -660,7 +663,7 @@ def main():
660663

661664
# Add LoRA config if needed
662665
if args.lora:
663-
with open("./lora_configs/lora.json", "r") as json_file:
666+
with open(args.lora_config, "r") as json_file:
664667
lora_config = json.load(json_file)
665668
config["lora_config"] = lora_config
666669

tests/odh/mnist_ray_test.go

Lines changed: 4 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func mnistRay(t *testing.T, numGpus int) {
8181

8282
// Test configuration
8383
jupyterNotebookConfigMapFileName := "mnist_ray_mini.ipynb"
84-
mnist := readMnistPy(test)
84+
mnist := readMnistPy(test, "resources/mnist.py")
8585
if numGpus > 0 {
8686
mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"gpu\""), 1)
8787
} else {
@@ -91,7 +91,7 @@ func mnistRay(t *testing.T, numGpus int) {
9191
// MNIST Ray Notebook
9292
jupyterNotebookConfigMapFileName: ReadFile(test, "resources/mnist_ray_mini.ipynb"),
9393
"mnist.py": mnist,
94-
"requirements.txt": readRequirementsTxt(test),
94+
"requirements.txt": ReadFile(test, "resources/requirements.txt"),
9595
})
9696

9797
// Define the regular(non-admin) user
@@ -133,27 +133,7 @@ func mnistRay(t *testing.T, numGpus int) {
133133
Should(HaveLen(0))
134134
}
135135

136-
func readRequirementsTxt(test Test) []byte {
137-
// Read the requirements.txt from resources and perform replacements for custom values using go template
138-
props := struct {
139-
PipIndexUrl string
140-
PipTrustedHost string
141-
}{
142-
PipIndexUrl: "--index " + string(GetPipIndexURL()),
143-
}
144-
145-
// Provide trusted host only if defined
146-
if len(GetPipTrustedHost()) > 0 {
147-
props.PipTrustedHost = "--trusted-host " + GetPipTrustedHost()
148-
}
149-
150-
template, err := files.ReadFile("resources/requirements.txt")
151-
test.Expect(err).NotTo(HaveOccurred())
152-
153-
return ParseTemplate(test, template, props)
154-
}
155-
156-
func readMnistPy(test Test) []byte {
136+
func readMnistPy(test Test, filePath string) []byte {
157137
// Read the Python script at filePath from resources and perform replacements for custom values using go template
158138
storage_bucket_endpoint, storage_bucket_endpoint_exists := GetStorageBucketDefaultEndpoint()
159139
storage_bucket_access_key_id, storage_bucket_access_key_id_exists := GetStorageBucketAccessKeyId()
@@ -184,7 +164,7 @@ func readMnistPy(test Test) []byte {
184164
StorageBucketMnistDir: storage_bucket_mnist_dir,
185165
StorageBucketMnistDirExists: storage_bucket_mnist_dir_exists,
186166
}
187-
template, err := files.ReadFile("resources/mnist.py")
167+
template, err := files.ReadFile(filePath)
188168
test.Expect(err).NotTo(HaveOccurred())
189169

190170
return ParseTemplate(test, template, props)

tests/odh/mnist_raytune_hpo_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
8080

8181
// Test configuration
8282
jupyterNotebookConfigMapFileName := "mnist_hpo_raytune.ipynb"
83-
mnist_hpo := ReadFile(test, "resources/mnist_hpo.py")
83+
mnist_hpo := readMnistPy(test, "resources/mnist_hpo.py")
8484

8585
if numGpus > 0 {
8686
mnist_hpo = bytes.Replace(mnist_hpo, []byte("gpu_value=\"has to be specified\""), []byte("gpu_value=\"1\""), 1)

tests/odh/notebook.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,23 +45,29 @@ type NotebookProps struct {
4545
NotebookConfigMapFileName string
4646
NotebookPVC string
4747
NumGpus int
48+
PipIndexUrl string
49+
PipTrustedHost string
4850
S3BucketName string
49-
S3BucketNameExists bool
5051
S3AccessKeyId string
51-
S3AccessKeyIdExists bool
5252
S3SecretAccessKey string
53-
S3SecretAccessKeyExists bool
5453
S3DefaultRegion string
5554
}
5655

5756
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
5857
// Create PVC for Notebook
5958
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", corev1.ReadWriteOnce)
60-
s3BucketName, _ := GetStorageBucketName()
59+
s3BucketName, s3BucketNameExists := GetStorageBucketName()
6160
s3AccessKeyId, _ := GetStorageBucketAccessKeyId()
6261
s3SecretAccessKey, _ := GetStorageBucketSecretKey()
6362
s3DefaultRegion, _ := GetStorageBucketDefaultRegion()
6463

64+
if !s3BucketNameExists {
65+
s3BucketName = "''"
66+
s3AccessKeyId = "''"
67+
s3SecretAccessKey = "''"
68+
s3DefaultRegion = "''"
69+
}
70+
6571
// Read the Notebook CR from resources and perform replacements for custom values using go template
6672
notebookProps := NotebookProps{
6773
IngressDomain: GetOpenShiftIngressDomain(test),
@@ -79,6 +85,8 @@ func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, j
7985
S3AccessKeyId: s3AccessKeyId,
8086
S3SecretAccessKey: s3SecretAccessKey,
8187
S3DefaultRegion: s3DefaultRegion,
88+
PipIndexUrl: GetPipIndexURL(),
89+
PipTrustedHost: GetPipTrustedHost(),
8290
}
8391
notebookTemplate, err := files.ReadFile("resources/custom-nb-small.yaml")
8492
test.Expect(err).NotTo(gomega.HaveOccurred())

tests/odh/ray_finetune_llm_deepspeed_test.go

Lines changed: 56 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -17,39 +17,27 @@ limitations under the License.
1717
package odh
1818

1919
import (
20+
"fmt"
21+
"os"
22+
"strings"
2023
"testing"
2124

2225
. "github.com/onsi/gomega"
2326
. "github.com/project-codeflare/codeflare-common/support"
2427
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
2528
)
2629

27-
func TestRayFinetuneDemo(t *testing.T) {
28-
mnistRayLlmFinetune(t, 1)
30+
func TestRayFinetuneLlmDeepspeedDemo(t *testing.T) {
31+
rayFinetuneLlmDeepspeed(t, 1)
2932
}
3033

31-
func mnistRayLlmFinetune(t *testing.T, numGpus int) {
34+
func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int) {
3235
test := With(t)
3336

3437
// Create a namespace
3538
namespace := test.NewTestNamespace()
36-
37-
// Test configuration
38-
jupyterNotebookConfigMapFileName := "ray_finetune_llm_deepspeed.ipynb"
39-
40-
// Test configuration
41-
configMap := map[string][]byte{
42-
// MNIST Ray Notebook
43-
jupyterNotebookConfigMapFileName: ReadFile(test, "resources/ray_finetune_demo/ray_finetune_llm_deepspeed.ipynb"),
44-
"ray_finetune_llm_deepspeed.py": ReadFile(test, "resources/ray_finetune_demo/ray_finetune_llm_deepspeed.py"),
45-
"ray_finetune_requirements.txt": ReadRayFinetuneRequirementsTxt(test),
46-
"create_dataset.py": ReadFile(test, "resources/ray_finetune_demo/create_dataset.py"),
47-
"lora.json": ReadFile(test, "resources/ray_finetune_demo/lora.json"),
48-
"zero_3_llama_2_7b.json": ReadFile(test, "resources/ray_finetune_demo/zero_3_llama_2_7b.json"),
49-
"utils.py": ReadFile(test, "resources/ray_finetune_demo/utils.py"),
50-
}
51-
52-
config := CreateConfigMap(test, namespace.Name, configMap)
39+
var workingDirectory, err = os.Getwd()
40+
test.Expect(err).ToNot(HaveOccurred())
5341

5442
// Define the regular(non-admin) user
5543
userName := GetNotebookUserName(test)
@@ -58,6 +46,53 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
5846
// Create role binding with Namespace specific admin cluster role
5947
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
6048

49+
// List the substitutions to apply to the ray_finetune_llm_deepspeed.ipynb example notebook so it can run in the test environment
50+
requiredChangesInNotebook := map[string]string{
51+
"import os": "import os,time,sys",
52+
"import sys": "!cp /opt/app-root/notebooks/* ./",
53+
"from codeflare_sdk.cluster.auth import TokenAuthentication": "from codeflare_sdk.cluster.auth import TokenAuthentication\\n\",\n\t\"from codeflare_sdk.job import RayJobClient",
54+
"token = ''": fmt.Sprintf("token = '%s'", userToken),
55+
"server = ''": fmt.Sprintf("server = '%s'", GetOpenShiftApiUrl(test)),
56+
"namespace='ray-finetune-llm-deepspeed'": fmt.Sprintf("namespace='%s'", namespace.Name),
57+
"head_cpus=16": "head_cpus=2",
58+
"head_gpus=1": "head_gpus=0",
59+
"num_workers=7": "num_workers=1",
60+
"min_cpus=16": "min_cpus=4",
61+
"max_cpus=16": "max_cpus=4",
62+
"min_memory=128": "min_memory=48",
63+
"max_memory=256": "max_memory=48",
64+
"head_memory=128": "head_memory=48",
65+
"num_gpus=1": fmt.Sprintf("worker_extended_resource_requests={'nvidia.com/gpu': %d},\\n\",\n\t\" write_to_file=True,\\n\",\n\t\" verify_tls=False", numGpus),
66+
"image='quay.io/rhoai/ray:2.23.0-py39-cu121'": fmt.Sprintf("image='%s'", GetRayImage()),
67+
"client = cluster.job_client": "ray_dashboard = cluster.cluster_dashboard_uri()\\n\",\n\t\"header = {\\\"Authorization\\\": \\\"Bearer " + userToken + "\\\"}\\n\",\n\t\"client = RayJobClient(address=ray_dashboard, headers=header, verify=False)\\n",
68+
"--num-devices=8": fmt.Sprintf("--num-devices=%d", numGpus),
69+
"--num-epochs=3": fmt.Sprintf("--num-epochs=%d", 1),
70+
"--ds-config=./deepspeed_configs/zero_3_llama_2_7b.json": "--ds-config=./zero_3_llama_2_7b.json \\\"\\n\",\n\t\" \\\"--lora-config=./lora.json \\\"\\n\",\n\t\" \\\"--as-test",
71+
"'pip': 'requirements.txt'": "'pip': '/opt/app-root/src/requirements.txt'",
72+
"'working_dir': './'": "'working_dir': '/opt/app-root/src'",
73+
"client.stop_job(submission_id)": "finished = False\\n\",\n\t\"while not finished:\\n\",\n\t\" time.sleep(1)\\n\",\n\t\" status = client.get_job_status(submission_id)\\n\",\n\t\" finished = (status == \\\"SUCCEEDED\\\")\\n\",\n\t\"if finished:\\n\",\n\t\" print(\\\"Job completed Successfully !\\\")\\n\",\n\t\"else:\\n\",\n\t\" print(\\\"Job failed !\\\")\\n\",\n\t\"time.sleep(10)\\n",
74+
}
75+
76+
updatedNotebookContent := string(ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb"))
77+
for oldValue, newValue := range requiredChangesInNotebook {
78+
updatedNotebookContent = strings.Replace(updatedNotebookContent, oldValue, newValue, -1)
79+
}
80+
updatedNotebook := []byte(updatedNotebookContent)
81+
82+
// Test configuration
83+
jupyterNotebookConfigMapFileName := "ray_finetune_llm_deepspeed.ipynb"
84+
configMap := map[string][]byte{
85+
jupyterNotebookConfigMapFileName: updatedNotebook,
86+
"ray_finetune_llm_deepspeed.py": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py"),
87+
"requirements.txt": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/requirements.txt"),
88+
"create_dataset.py": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/create_dataset.py"),
89+
"lora.json": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/lora_configs/lora.json"),
90+
"zero_3_llama_2_7b.json": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_7b.json"),
91+
"utils.py": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/utils.py"),
92+
}
93+
94+
config := CreateConfigMap(test, namespace.Name, configMap)
95+
6196
// Create Notebook CR
6297
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
6398

@@ -77,26 +112,6 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
77112
)
78113

79114
// Make sure the RayCluster finishes and is deleted
80-
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutGpuProvisioning).
115+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutMedium).
81116
Should(HaveLen(0))
82117
}
83-
84-
func ReadRayFinetuneRequirementsTxt(test Test) []byte {
85-
// Read the requirements.txt from resources and perform replacements for custom values using go template
86-
props := struct {
87-
PipIndexUrl string
88-
PipTrustedHost string
89-
}{
90-
PipIndexUrl: "--index " + string(GetPipIndexURL()),
91-
}
92-
93-
// Provide trusted host only if defined
94-
if len(GetPipTrustedHost()) > 0 {
95-
props.PipTrustedHost = "--trusted-host " + GetPipTrustedHost()
96-
}
97-
98-
template, err := files.ReadFile("resources/ray_finetune_demo/ray_finetune_requirements.txt")
99-
test.Expect(err).NotTo(HaveOccurred())
100-
101-
return ParseTemplate(test, template, props)
102-
}

tests/odh/resources/custom-nb-small.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,20 @@ spec:
5050
value: {{.NotebookImage}}
5151
- name: JUPYTER_NOTEBOOK_PORT
5252
value: "8888"
53+
- name: AWS_ACCESS_KEY_ID
54+
value: {{.S3AccessKeyId}}
55+
- name: AWS_SECRET_ACCESS_KEY
56+
value: {{.S3SecretAccessKey}}
57+
- name: AWS_DEFAULT_REGION
58+
value: {{.S3DefaultRegion}}
59+
- name: AWS_S3_BUCKET
60+
value: {{.S3BucketName}}
61+
- name: PIP_INDEX_URL
62+
value: {{.PipIndexUrl}}
63+
- name: PIP_TRUSTED_HOST
64+
value: {{.PipTrustedHost}}
5365
image: {{.NotebookImage}}
54-
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} -p s3_bucket_name {{.S3BucketName}} -p s3_access_key_id {{.S3AccessKeyId}} -p s3_secret_access_key {{.S3SecretAccessKey}} -p s3_default_region {{.S3DefaultRegion}} --log-output && sleep infinity"]
66+
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} --log-output && sleep infinity"]
5567
# args: ["pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/mcad.ipynb /opt/app-root/src/mcad-out.ipynb" ]
5668
imagePullPolicy: Always
5769
# livenessProbe:
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
torchvision==0.18.0
1+
torchvision==0.18.0

tests/odh/resources/mnist_hpo.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,4 +142,4 @@ def train_mnist(config):
142142

143143
print("Best hyperparameters config is:", results.get_best_result().config)
144144

145-
assert not results.errors
145+
assert not results.errors

tests/odh/resources/mnist_hpo_raytune.ipynb

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,5 @@
11
{
22
"cells": [
3-
{
4-
"cell_type": "code",
5-
"execution_count": null,
6-
"id": "5c8a5392",
7-
"metadata": {},
8-
"outputs": [],
9-
"source": [
10-
"%pip install codeflare-sdk -U"
11-
]
12-
},
133
{
144
"cell_type": "code",
155
"execution_count": null,
@@ -52,7 +42,6 @@
5242
"#parameters\n",
5343
"namespace = \"default\"\n",
5444
"ray_image = \"has to be specified\"\n",
55-
"local_queue = \"has to be specified\"\n",
5645
"openshift_api_url = \"has to be specified\"\n",
5746
"kubernetes_user_bearer_token = \"has to be specified\"\n",
5847
"num_gpus = \"has to be specified\""

0 commit comments

Comments
 (0)