@@ -82,10 +82,10 @@ type TrainingRuntimeList struct {
82
82
type TrainingRuntimeSpec struct {
83
83
84
84
// Configuration for the model training with ML-specific parameters.
85
- MLSpec * MLSpec `json:"mlSpec ,omitempty"`
85
+ MLPolicy * MLPolicy `json:"mlPolicy,omitempty"`
86
86
87
87
// Configuration for the PodGroup to enable gang-scheduling via supported plugins.
88
- PodGroupSpec * PodGroupSpec `json:"podGroupSpec ,omitempty"`
88
+ PodGroupPolicy * PodGroupPolicy `json:"podGroupPolicy,omitempty"`
89
89
90
90
// JobSet template which will be used by TrainJob.
91
91
Template JobSetTemplateSpec `json:"template"`
@@ -101,51 +101,57 @@ type JobSetTemplateSpec struct {
101
101
Spec jobsetv1alpha2.JobSetSpec `json:"spec,omitempty"`
102
102
}
103
103
104
- // PodGroupSpec represents a PodGroup configuration to enable gang-scheduling.
105
- type PodGroupSpec struct {
106
- // Plugin for the gang-scheduling.
107
- Plugin GangSchedulerPlugin `json:"plugin"`
104
+ // PodGroupPolicy represents a PodGroup configuration for gang-scheduling.
105
+ type PodGroupPolicy struct {
108
106
109
- // Time threshold to schedule PodGroup for gang-scheduling.
110
- ScheduleTimeoutSeconds * string `json:"scheduleTimeoutSeconds,omitempty "`
107
+ // Configuration for gang-scheduling using various plugins.
108
+ PodGroupPolicySource `json:",inline"`
111
109
}
112
110
113
- // GangSchedulerPlugin represents one of the supported gang-scheduling plugins.
114
- type GangSchedulerPlugin string
115
-
116
- const (
117
- // Volcano plugin for gang-scheduling.
118
- GangSchedulerPluginVolcano GangSchedulerPlugin = "volcano"
111
+ // PodGroupPolicySource represents supported plugins for gang-scheduling.
112
+ // Only one of its members may be specified.
113
+ type PodGroupPolicySource struct {
119
114
120
115
// Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling.
121
- GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling"
122
- )
116
+ Coscheduling * CoschedulingPodGroupPolicySource `json:"coscheduling,omitempty"`
117
+
118
+ // TODO (andreyvelich): Add support for Volcano gang-scheduler.
119
+ }
120
+
121
+ // CoschedulingPodGroupPolicySource represents configuration for coscheduling plugin.
122
+ type CoschedulingPodGroupPolicySource struct {
123
+
124
+ // Time threshold to schedule PodGroup for gang-scheduling.
125
+ // If the scheduling timeout is equal to 0, the default value is used.
126
+ // Defaults to 60 seconds.
127
+ ScheduleTimeoutSeconds * int32 `json:"scheduleTimeoutSeconds,omitempty"`
128
+ }
123
129
124
- // MLSpec represents configuration for the model trining with ML-specific parameters.
125
- type MLSpec struct {
130
+ // MLPolicy represents configuration for the model training with ML-specific parameters.
131
+ type MLPolicy struct {
126
132
127
133
// Number of training nodes.
128
134
// Defaults to 1.
129
135
NumNodes * int32 `json:"numNodes,omitempty"`
130
136
131
137
// Configuration for the runtime-specific parameters, such as Torch or MPI.
132
- // One of the following spec sources can be set .
133
- MLSpecSource `json:",inline"`
138
+ // Only one of its members may be specified.
139
+ MLPolicySource `json:",inline"`
134
140
}
135
141
136
142
// MLPolicySource represents the runtime-specific configuration for various technologies.
137
143
// Only one of the following policies may be set.
138
- type MLSpecSource struct {
144
+ type MLPolicySource struct {
139
145
140
146
// Configuration for the PyTorch runtime.
141
- Torch * TorchMLSpecSource `json:"torch,omitempty"`
147
+ Torch * TorchMLPolicySource `json:"torch,omitempty"`
142
148
143
149
// Configuration for the MPI Runtime.
144
- MPI * MPIMLSpecSource `json:"mpi,omitempty"`
150
+ MPI * MPIMLPolicySource `json:"mpi,omitempty"`
145
151
}
146
152
147
- // TorchMLSpecSource represents a PyTorch runtime configuration.
148
- type TorchMLSpecSource struct {
153
+ // TorchMLPolicySource represents a PyTorch runtime configuration.
154
+ type TorchMLPolicySource struct {
149
155
// Number of processes per node.
150
156
// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
151
157
// Supported values: `auto`, `cpu`, `gpu`, or int value.
@@ -179,8 +185,8 @@ type TorchElasticPolicy struct {
179
185
Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
180
186
}
181
187
182
- // MPIMLSpecSource represents a MPI runtime configuration.
183
- type MPIMLSpecSource struct {
188
+ // MPIMLPolicySource represents a MPI runtime configuration.
189
+ type MPIMLPolicySource struct {
184
190
// Number of processes per node.
185
191
// This value is equal to the number of slots for each node in the hostfile.
186
192
NumProcPerNode * int32 `json:"numProcPerNode,omitempty"`
0 commit comments