Skip to content

Commit 06e7653

Browse files
committed
Rename PodGroupPolicy and MLPolicy APIs
Signed-off-by: Andrey Velichkevich <[email protected]>
1 parent c28a166 commit 06e7653

File tree

2 files changed

+102
-60
lines changed

2 files changed

+102
-60
lines changed

pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go

Lines changed: 33 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -82,10 +82,10 @@ type TrainingRuntimeList struct {
8282
type TrainingRuntimeSpec struct {
8383

8484
// Configuration for the model training with ML-specific parameters.
85-
MLSpec *MLSpec `json:"mlSpec,omitempty"`
85+
MLPolicy *MLPolicy `json:"mlPolicy,omitempty"`
8686

8787
// Configuration for the PodGroup to enable gang-scheduling via supported plugins.
88-
PodGroupSpec *PodGroupSpec `json:"podGroupSpec,omitempty"`
88+
PodGroupPolicy *PodGroupPolicy `json:"podGroupPolicy,omitempty"`
8989

9090
// JobSet template which will be used by TrainJob.
9191
Template JobSetTemplateSpec `json:"template"`
@@ -101,51 +101,57 @@ type JobSetTemplateSpec struct {
101101
Spec jobsetv1alpha2.JobSetSpec `json:"spec,omitempty"`
102102
}
103103

104-
// PodGroupSpec represents a PodGroup configuration to enable gang-scheduling.
105-
type PodGroupSpec struct {
106-
// Plugin for the gang-scheduling.
107-
Plugin GangSchedulerPlugin `json:"plugin"`
104+
// PodGroupPolicy represents a PodGroup configuration for gang-scheduling.
105+
type PodGroupPolicy struct {
108106

109-
// Time threshold to schedule PodGroup for gang-scheduling.
110-
ScheduleTimeoutSeconds *string `json:"scheduleTimeoutSeconds,omitempty"`
107+
// Configuration for gang-scheduling using various plugins.
108+
PodGroupPolicySource `json:",inline"`
111109
}
112110

113-
// GangSchedulerPlugin represents one of the supported gang-scheduling plugins.
114-
type GangSchedulerPlugin string
115-
116-
const (
117-
// Volcano plugin for gang-scheduling.
118-
GangSchedulerPluginVolcano GangSchedulerPlugin = "volcano"
111+
// PodGroupPolicySource represents supported plugins for gang-scheduling.
112+
// Only one of its members may be specified.
113+
type PodGroupPolicySource struct {
119114

120115
// Coscheduling plugin from the Kubernetes scheduler-plugins for gang-scheduling.
121-
GangSchedulerPluginCoscheduling GangSchedulerPlugin = "coscheduling"
122-
)
116+
Coscheduling *CoschedulingPodGroupPolicySource `json:"coscheduling,omitempty"`
117+
118+
// TODO (andreyvelich): Add support for Volcano gang-scheduler.
119+
}
120+
121+
// CoschedulingPodGroupPolicySource represents configuration for coscheduling plugin.
122+
type CoschedulingPodGroupPolicySource struct {
123+
124+
// Time threshold to schedule PodGroup for gang-scheduling.
125+
// If the scheduling timeout is equal to 0, the default value is used.
126+
// Defaults to 60 seconds.
127+
ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
128+
}
123129

124-
// MLSpec represents configuration for the model trining with ML-specific parameters.
125-
type MLSpec struct {
130+
// MLPolicy represents configuration for the model training with ML-specific parameters.
131+
type MLPolicy struct {
126132

127133
// Number of training nodes.
128134
// Defaults to 1.
129135
NumNodes *int32 `json:"numNodes,omitempty"`
130136

131137
// Configuration for the runtime-specific parameters, such as Torch or MPI.
132-
// One of the following spec sources can be set.
133-
MLSpecSource `json:",inline"`
138+
// Only one of its members may be specified.
139+
MLPolicySource `json:",inline"`
134140
}
135141

136142
// MLPolicySource represents the runtime-specific configuration for various technologies.
137143
// One of the following specs can be set.
138-
type MLSpecSource struct {
144+
type MLPolicySource struct {
139145

140146
// Configuration for the PyTorch runtime.
141-
Torch *TorchMLSpecSource `json:"torch,omitempty"`
147+
Torch *TorchMLPolicySource `json:"torch,omitempty"`
142148

143149
// Configuration for the MPI Runtime.
144-
MPI *MPIMLSpecSource `json:"mpi,omitempty"`
150+
MPI *MPIMLPolicySource `json:"mpi,omitempty"`
145151
}
146152

147-
// TorchMLSpecSource represents a PyTorch runtime configuration.
148-
type TorchMLSpecSource struct {
153+
// TorchMLPolicySource represents a PyTorch runtime configuration.
154+
type TorchMLPolicySource struct {
149155
// Number of processes per node.
150156
// This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
151157
// Supported values: `auto`, `cpu`, `gpu`, or int value.
@@ -179,8 +185,8 @@ type TorchElasticPolicy struct {
179185
Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
180186
}
181187

182-
// MPIMLSpecSource represents a MPI runtime configuration.
183-
type MPIMLSpecSource struct {
188+
// MPIMLPolicySource represents an MPI runtime configuration.
189+
type MPIMLPolicySource struct {
184190
// Number of processes per node.
185191
// This value is equal to the number of slots for each node in the hostfile.
186192
NumProcPerNode *int32 `json:"numProcPerNode,omitempty"`

pkg/apis/kubeflow.org/v2alpha1/zz_generated.deepcopy.go

Lines changed: 69 additions & 33 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)