Skip to content

Commit e38f772

Browse files
Automate LLM Finetune Deepspeed demo
1 parent 53ffaa6 commit e38f772

16 files changed

+1363
-20
lines changed

tests/kfto/core/kfto_kueue_sft_test.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,10 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
7979
}
8080
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
8181
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
82-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
82+
annotations := map[string]string{
83+
"kueue.x-k8s.io/default-queue": "true",
84+
}
85+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, annotations)
8386

8487
// Create training PyTorch job
8588
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
@@ -143,7 +146,10 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
143146
}
144147
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
145148
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
146-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
149+
annotations := map[string]string{
150+
"kueue.x-k8s.io/default-queue": "true",
151+
}
152+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, annotations)
147153

148154
// Create first training PyTorch job
149155
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)

tests/kfto/upgrade/kfto_kueue_sft_upgrade_training_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,10 @@ func TestSetupPytorchjob(t *testing.T) {
9898
}
9999
clusterQueue, err = test.Client().Kueue().KueueV1beta1().ClusterQueues().Create(test.Ctx(), clusterQueue, metav1.CreateOptions{})
100100
test.Expect(err).NotTo(HaveOccurred())
101-
102-
localQueue := CreateKueueLocalQueue(test, namespaceName, clusterQueue.Name)
101+
annotations := map[string]string{
102+
"kueue.x-k8s.io/default-queue": "true",
103+
}
104+
localQueue := CreateKueueLocalQueue(test, namespaceName, clusterQueue.Name, annotations)
103105

104106
// Create training PyTorch job
105107
tuningJob := createPyTorchJob(test, namespaceName, localQueue.Name, *config)

tests/odh/mnist_ray_test.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,10 @@ func mnistRay(t *testing.T, numGpus int) {
7777
}
7878
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
7979
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
80-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
80+
annotations := map[string]string{
81+
"kueue.x-k8s.io/default-queue": "true",
82+
}
83+
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, annotations)
8184

8285
// Test configuration
8386
jupyterNotebookConfigMapFileName := "mnist_ray_mini.ipynb"
@@ -102,7 +105,7 @@ func mnistRay(t *testing.T, numGpus int) {
102105
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
103106

104107
// Create Notebook CR
105-
createNotebook(test, namespace, userToken, localQueue.Name, config.Name, jupyterNotebookConfigMapFileName, numGpus)
108+
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
106109

107110
// Gracefully cleanup Notebook
108111
defer func() {
@@ -111,7 +114,7 @@ func mnistRay(t *testing.T, numGpus int) {
111114
}()
112115

113116
// Make sure the RayCluster is created and running
114-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
117+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
115118
Should(
116119
And(
117120
HaveLen(1),
@@ -129,7 +132,7 @@ func mnistRay(t *testing.T, numGpus int) {
129132
)
130133

131134
// Make sure the RayCluster finishes and is deleted
132-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
135+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
133136
Should(HaveLen(0))
134137
}
135138

tests/odh/mnist_raytune_hpo_test.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,10 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
7676
}
7777
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
7878
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
79-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
79+
annotations := map[string]string{
80+
"kueue.x-k8s.io/default-queue": "true",
81+
}
82+
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, annotations)
8083

8184
// Test configuration
8285
jupyterNotebookConfigMapFileName := "mnist_hpo_raytune.ipynb"
@@ -103,7 +106,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
103106
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
104107

105108
// Create Notebook CR
106-
createNotebook(test, namespace, userToken, localQueue.Name, config.Name, jupyterNotebookConfigMapFileName, numGpus)
109+
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
107110

108111
// Gracefully cleanup Notebook
109112
defer func() {

tests/odh/notebook.go

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,26 @@ type NotebookProps struct {
4141
OpenDataHubNamespace string
4242
RayImage string
4343
NotebookImage string
44-
LocalQueue string
4544
NotebookConfigMapName string
4645
NotebookConfigMapFileName string
4746
NotebookPVC string
4847
NumGpus int
48+
S3BucketName string
49+
S3BucketNameExists bool
50+
S3AccessKeyId string
51+
S3AccessKeyIdExists bool
52+
S3SecretAccessKey string
53+
S3SecretAccessKeyExists bool
54+
S3DefaultRegion string
4955
}
5056

51-
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, localQueue, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
57+
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
5258
// Create PVC for Notebook
5359
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", corev1.ReadWriteOnce)
60+
s3BucketName, _ := GetStorageBucketName()
61+
s3AccessKeyId, _ := GetStorageBucketAccessKeyId()
62+
s3SecretAccessKey, _ := GetStorageBucketSecretKey()
63+
s3DefaultRegion, _ := GetStorageBucketDefaultRegion()
5464

5565
// Read the Notebook CR from resources and perform replacements for custom values using go template
5666
notebookProps := NotebookProps{
@@ -61,11 +71,14 @@ func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, l
6171
OpenDataHubNamespace: GetOpenDataHubNamespace(test),
6272
RayImage: GetRayImage(),
6373
NotebookImage: GetNotebookImage(test),
64-
LocalQueue: localQueue,
6574
NotebookConfigMapName: jupyterNotebookConfigMapName,
6675
NotebookConfigMapFileName: jupyterNotebookConfigMapFileName,
6776
NotebookPVC: notebookPVC.Name,
6877
NumGpus: numGpus,
78+
S3BucketName: s3BucketName,
79+
S3AccessKeyId: s3AccessKeyId,
80+
S3SecretAccessKey: s3SecretAccessKey,
81+
S3DefaultRegion: s3DefaultRegion,
6982
}
7083
notebookTemplate, err := files.ReadFile("resources/custom-nb-small.yaml")
7184
test.Expect(err).NotTo(gomega.HaveOccurred())
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
/*
2+
Copyright 2023.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package odh
18+
19+
import (
20+
"testing"
21+
22+
. "github.com/onsi/gomega"
23+
. "github.com/project-codeflare/codeflare-common/support"
24+
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
25+
)
26+
27+
func TestRayFinetuneDemo(t *testing.T) {
28+
mnistRayLlmFinetune(t, 1)
29+
}
30+
31+
func mnistRayLlmFinetune(t *testing.T, numGpus int) {
32+
test := With(t)
33+
34+
// Create a namespace
35+
namespace := test.NewTestNamespace()
36+
37+
// Test configuration
38+
jupyterNotebookConfigMapFileName := "ray_finetune_llm_deepspeed.ipynb"
39+
40+
// Test configuration
41+
configMap := map[string][]byte{
42+
// MNIST Ray Notebook
43+
jupyterNotebookConfigMapFileName: ReadFile(test, "resources/ray_finetune_demo/ray_finetune_llm_deepspeed.ipynb"),
44+
"ray_finetune_llm_deepspeed.py": ReadFile(test, "resources/ray_finetune_demo/ray_finetune_llm_deepspeed.py"),
45+
"ray_finetune_requirements.txt": ReadRayFinetuneRequirementsTxt(test),
46+
"create_dataset.py": ReadFile(test, "resources/ray_finetune_demo/create_dataset.py"),
47+
"lora.json": ReadFile(test, "resources/ray_finetune_demo/lora.json"),
48+
"zero_3_llama_2_7b.json": ReadFile(test, "resources/ray_finetune_demo/zero_3_llama_2_7b.json"),
49+
"utils.py": ReadFile(test, "resources/ray_finetune_demo/utils.py"),
50+
}
51+
52+
config := CreateConfigMap(test, namespace.Name, configMap)
53+
54+
// Define the regular(non-admin) user
55+
userName := GetNotebookUserName(test)
56+
userToken := GetNotebookUserToken(test)
57+
58+
// Create role binding with Namespace specific admin cluster role
59+
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
60+
61+
// Create Notebook CR
62+
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
63+
64+
// Gracefully cleanup Notebook
65+
defer func() {
66+
deleteNotebook(test, namespace)
67+
test.Eventually(listNotebooks(test, namespace), TestTimeoutGpuProvisioning).Should(HaveLen(0))
68+
}()
69+
70+
// Make sure the RayCluster is created and running
71+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutGpuProvisioning).
72+
Should(
73+
And(
74+
HaveLen(1),
75+
ContainElement(WithTransform(RayClusterState, Equal(rayv1.Ready))),
76+
),
77+
)
78+
79+
// Make sure the RayCluster finishes and is deleted
80+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutGpuProvisioning).
81+
Should(HaveLen(0))
82+
}
83+
84+
func ReadRayFinetuneRequirementsTxt(test Test) []byte {
85+
// Read the requirements.txt from resources and perform replacements for custom values using go template
86+
props := struct {
87+
PipIndexUrl string
88+
PipTrustedHost string
89+
}{
90+
PipIndexUrl: "--index " + string(GetPipIndexURL()),
91+
}
92+
93+
// Provide trusted host only if defined
94+
if len(GetPipTrustedHost()) > 0 {
95+
props.PipTrustedHost = "--trusted-host " + GetPipTrustedHost()
96+
}
97+
98+
template, err := files.ReadFile("resources/ray_finetune_demo/ray_finetune_requirements.txt")
99+
test.Expect(err).NotTo(HaveOccurred())
100+
101+
return ParseTemplate(test, template, props)
102+
}

tests/odh/resources/custom-nb-small.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ spec:
5151
- name: JUPYTER_NOTEBOOK_PORT
5252
value: "8888"
5353
image: {{.NotebookImage}}
54-
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p local_queue {{.LocalQueue}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} --log-output && sleep infinity"]
54+
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} -p s3_bucket_name {{.S3BucketName}} -p s3_access_key_id {{.S3AccessKeyId}} -p s3_secret_access_key {{.S3SecretAccessKey}} -p s3_default_region {{.S3DefaultRegion}} --log-output && sleep infinity"]
5555
# args: ["pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/mcad.ipynb /opt/app-root/src/mcad-out.ipynb" ]
5656
imagePullPolicy: Always
5757
# livenessProbe:

tests/odh/resources/mnist_hpo_raytune.ipynb

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
{
22
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "5c8a5392",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"%pip install codeflare-sdk -U"
11+
]
12+
},
313
{
414
"cell_type": "code",
515
"execution_count": null,
@@ -87,7 +97,6 @@
8797
" max_memory=4,\n",
8898
" num_gpus=int(num_gpus),\n",
8999
" image=ray_image,\n",
90-
" local_queue=local_queue,\n",
91100
" write_to_file=True,\n",
92101
" verify_tls=False\n",
93102
" )\n",

tests/odh/resources/mnist_ray_mini.ipynb

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,15 @@
11
{
22
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "df737457",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"%pip install codeflare-sdk -U"
11+
]
12+
},
313
{
414
"cell_type": "code",
515
"execution_count": null,
@@ -41,11 +51,10 @@
4151
"source": [
4252
"#parameters\n",
4353
"namespace = \"default\"\n",
44-
"ray_image = \"has to be specified\"\n",
45-
"local_queue = \"has to be specified\"\n",
4654
"openshift_api_url = \"has to be specified\"\n",
4755
"kubernetes_user_bearer_token = \"has to be specified\"\n",
48-
"num_gpus = \"has to be specified\""
56+
"num_gpus = \"has to be specified\"\n",
57+
"print(\"*\"*8, namespace,openshift_api_url,kubernetes_user_bearer_token, num_gpus)"
4958
]
5059
},
5160
{
@@ -86,8 +95,6 @@
8695
" min_memory=1,\n",
8796
" max_memory=4,\n",
8897
" num_gpus=int(num_gpus),\n",
89-
" image=ray_image,\n",
90-
" local_queue=local_queue,\n",
9198
" write_to_file=True,\n",
9299
" verify_tls=False\n",
93100
" )\n",
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from datasets import load_dataset
2+
import json
3+
import os
4+
5+
# Load the "main" configuration of the gsm8k dataset once, at import time.
dataset = load_dataset("gsm8k", "main")

# Only the train and test splits are exported by main().
dataset_splits = {split: dataset[split] for split in ("train", "test")}
8+
9+
10+
def main():
    """Write the fine-tuning data files into the ``data/`` directory.

    Produces, relative to the current working directory:

    * ``data/tokens.json`` -- a JSON object whose ``tokens`` key lists the four
      marker tokens used to delimit questions and answers.
    * ``data/<split>.jsonl`` -- one file per entry of the module-level
      ``dataset_splits`` mapping; every line is a JSON object whose ``input``
      field holds the item's ``question`` and ``answer`` wrapped in the
      marker tokens.

    Existing files are overwritten.
    """
    # exist_ok avoids the race between a separate exists() check and mkdir(),
    # and makes repeated runs safe.
    os.makedirs("data", exist_ok=True)

    with open("data/tokens.json", "w") as f:
        tokens = {"tokens": ["<START_Q>", "<END_Q>", "<START_A>", "<END_A>"]}
        f.write(json.dumps(tokens))

    for key, ds in dataset_splits.items():
        with open(f"data/{key}.jsonl", "w") as f:
            for item in ds:
                newitem = {
                    "input": (
                        f"<START_Q>{item['question']}<END_Q>"
                        f"<START_A>{item['answer']}<END_A>"
                    )
                }
                f.write(json.dumps(newitem) + "\n")
28+
29+
30+
# Generate the data files only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"r": 8,
3+
"lora_alpha": 16,
4+
"lora_dropout": 0.05,
5+
"target_modules": ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"],
6+
"task_type": "CAUSAL_LM",
7+
"modules_to_save": [],
8+
"bias": "none",
9+
"fan_in_fan_out": false,
10+
"init_lora_weights": true
11+
}

0 commit comments

Comments
 (0)