Commit 97c6f5c

schedulers/kubernetes_scheduler: add support for resource instance-type node selectors
1 parent 434c013 commit 97c6f5c

File tree

7 files changed: 132 additions, 23 deletions

scripts/kube_dist_trainer.py

Lines changed: 3 additions & 0 deletions
@@ -32,6 +32,9 @@ def register_gpu_resource() -> None:
         cpu=2,
         gpu=1,
         memMB=8 * GiB,
+        capabilities={
+            "node.kubernetes.io/instance-type": "p3.2xlarge",
+        },
     )
     print(f"Registering resource: {res}")
     named_resources["GPU_X1"] = res
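For context, this is the shape of the registered resource after the change: the new capabilities entry pins the named resource to p3.2xlarge nodes. A minimal standalone sketch, assuming a plain dict stands in for the script's named_resources registry:

from torchx.specs import Resource

GiB = 1024  # memMB is expressed in MiB, matching the script's constant

# Sketch of the resource this diff extends; the capabilities entry becomes a
# Kubernetes nodeSelector when the role is converted to a pod (see
# kubernetes_scheduler.py below).
res = Resource(
    cpu=2,
    gpu=1,
    memMB=8 * GiB,
    capabilities={
        "node.kubernetes.io/instance-type": "p3.2xlarge",
    },
)
named_resources = {}  # stand-in for the script's registry
named_resources["GPU_X1"] = res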

torchx/schedulers/kubernetes_scheduler.py

Lines changed: 50 additions & 7 deletions
@@ -85,6 +85,9 @@
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+RESERVED_MILLICPU = 100
+RESERVED_MEMMB = 1024
+
 RETRY_POLICIES: Mapping[str, Iterable[Mapping[str, str]]] = {
     RetryPolicy.REPLICA: [],
     RetryPolicy.APPLICATION: [
@@ -152,6 +155,8 @@
 
 ANNOTATION_ISTIO_SIDECAR = "sidecar.istio.io/inject"
 
+LABEL_INSTANCE_TYPE = "node.kubernetes.io/instance-type"
+
 
 def sanitize_for_serialization(obj: object) -> object:
     from kubernetes import client
@@ -176,21 +181,35 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
         V1EmptyDirVolumeSource,
     )
 
+    # limits puts an upper cap on the resources a pod may consume.
+    # requests is how much the scheduler allocates. We assume that the jobs will
+    # be allocated the whole machine so requests is slightly lower than the
+    # requested resources to account for the Kubernetes node reserved resources.
+    limits = {}
     requests = {}
 
     resource = role.resource
-    if resource.cpu >= 0:
-        requests["cpu"] = f"{int(resource.cpu * 1000)}m"
-    if resource.memMB >= 0:
-        requests["memory"] = f"{int(resource.memMB)}M"
-    if resource.gpu >= 0:
-        requests["nvidia.com/gpu"] = str(resource.gpu)
+    if resource.cpu > 0:
+        mcpu = int(resource.cpu * 1000)
+        limits["cpu"] = f"{mcpu}m"
+        request_mcpu = max(mcpu - RESERVED_MILLICPU, 0)
+        requests["cpu"] = f"{request_mcpu}m"
+    if resource.memMB > 0:
+        limits["memory"] = f"{int(resource.memMB)}M"
+        request_memMB = max(int(resource.memMB) - RESERVED_MEMMB, 0)
+        requests["memory"] = f"{request_memMB}M"
+    if resource.gpu > 0:
+        requests["nvidia.com/gpu"] = limits["nvidia.com/gpu"] = str(resource.gpu)
 
     resources = V1ResourceRequirements(
-        limits=requests,
+        limits=limits,
         requests=requests,
     )
 
+    node_selector: Dict[str, str] = {}
+    if LABEL_INSTANCE_TYPE in resource.capabilities:
+        node_selector[LABEL_INSTANCE_TYPE] = resource.capabilities[LABEL_INSTANCE_TYPE]
+
     # To support PyTorch dataloaders we need to set /dev/shm to larger than the
     # 64M default so we mount an unlimited sized tmpfs directory on it.
     SHM_VOL = "dshm"
@@ -264,6 +283,7 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod":
             restart_policy="Never",
             service_account_name=service_account,
             volumes=volumes,
+            node_selector=node_selector,
         ),
         metadata=V1ObjectMeta(
             annotations={
@@ -416,6 +436,29 @@ class KubernetesScheduler(Scheduler, DockerWorkspace):
 
     External docs: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
 
+    **Resources / Allocation**
+
+    To select a specific machine type you can add a capability to your resources
+    with ``node.kubernetes.io/instance-type``, which will constrain the launched
+    jobs to nodes of that instance type.
+
+    >>> from torchx import specs
+    >>> specs.Resource(
+    ...     cpu=4,
+    ...     memMB=16000,
+    ...     gpu=2,
+    ...     capabilities={
+    ...         "node.kubernetes.io/instance-type": "<cloud instance type>",
+    ...     },
+    ... )
+    Resource(...)
+
+    Kubernetes may reserve some memory for the host. TorchX assumes you're
+    scheduling on whole hosts and thus will automatically reduce the resource
+    request by a small amount to account for the node reserved CPU and memory.
+    If you run into scheduling issues you may need to reduce the requested CPU
+    and memory from the host values.
+
     **Compatibility**
 
     .. compatibility::
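The request/limit split above is the core of the allocation change: limits carry the full machine size while requests are shaved by fixed reserves so a whole-host job still fits alongside kubelet overhead. A small worked sketch of that arithmetic (the helper names here are illustrative, not part of the scheduler):

RESERVED_MILLICPU = 100  # matches the constants introduced in this diff
RESERVED_MEMMB = 1024

def cpu_request_limit(cpu: float) -> tuple:
    # limit = full CPU in millicores; request = limit minus the node reserve
    mcpu = int(cpu * 1000)
    return f"{max(mcpu - RESERVED_MILLICPU, 0)}m", f"{mcpu}m"

def mem_request_limit(memMB: int) -> tuple:
    # limit = full memory; request = limit minus the 1 GiB node reserve
    return f"{max(int(memMB) - RESERVED_MEMMB, 0)}M", f"{int(memMB)}M"

# Matches the expectations in the updated tests below:
assert cpu_request_limit(2) == ("1900m", "2000m")
assert mem_request_limit(3000) == ("1976M", "3000M")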

torchx/schedulers/test/.gitignore

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+actors.json

torchx/schedulers/test/kubernetes_scheduler_test.py

Lines changed: 35 additions & 4 deletions
@@ -24,6 +24,7 @@
     cleanup_str,
     create_scheduler,
     role_to_pod,
+    LABEL_INSTANCE_TYPE,
 )
 
 SKIP_DOCKER: bool = not has_docker()
@@ -124,13 +125,18 @@ def test_role_to_pod(self) -> None:
         app = _test_app()
         pod = role_to_pod("name", app.roles[0], service_account="srvacc")
 
-        requests = {
+        limits = {
             "cpu": "2000m",
             "memory": "3000M",
             "nvidia.com/gpu": "4",
         }
+        requests = {
+            "cpu": "1900m",
+            "memory": "1976M",
+            "nvidia.com/gpu": "4",
+        }
         resources = V1ResourceRequirements(
-            limits=requests,
+            limits=limits,
             requests=requests,
         )
         container = V1Container(
@@ -179,6 +185,7 @@ def test_role_to_pod(self) -> None:
                     ),
                 ),
             ],
+            node_selector={},
         ),
         metadata=V1ObjectMeta(
             annotations={
@@ -279,15 +286,16 @@ def test_submit_dryrun(self) -> None:
           memory: 3000M
           nvidia.com/gpu: '4'
         requests:
-          cpu: 2000m
-          memory: 3000M
+          cpu: 1900m
+          memory: 1976M
           nvidia.com/gpu: '4'
         volumeMounts:
         - mountPath: /dev/shm
           name: dshm
         - mountPath: /dst
           name: mount-0
           readOnly: true
+      nodeSelector: {{}}
       restartPolicy: Never
       volumes:
       - emptyDir:
@@ -348,6 +356,29 @@ def test_volume_mounts(self) -> None:
             ],
         )
 
+    def test_instance_type(self) -> None:
+        scheduler = create_scheduler("test")
+        role = specs.Role(
+            name="foo",
+            image="",
+            mounts=[],
+            resource=specs.Resource(
+                cpu=4,
+                memMB=4000,
+                gpu=8,
+                capabilities={
+                    LABEL_INSTANCE_TYPE: "some_instance",
+                },
+            ),
+        )
+        pod = role_to_pod("foo", role, service_account="")
+        self.assertEqual(
+            pod.spec.node_selector,
+            {
+                "node.kubernetes.io/instance-type": "some_instance",
+            },
+        )
+
     def test_rank0_env(self) -> None:
         from kubernetes.client.models import (
             V1EnvVar,
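The new test exercises the nodeSelector path end to end. A hedged usage sketch of the same API outside the test harness (the image and instance type below are placeholders, not values from this commit):

from torchx import specs
from torchx.schedulers.kubernetes_scheduler import LABEL_INSTANCE_TYPE, role_to_pod

# Placeholder image and instance type, for illustration only.
role = specs.Role(
    name="trainer",
    image="example/image:latest",
    mounts=[],
    resource=specs.Resource(
        cpu=8,
        gpu=1,
        memMB=61 * 1024,
        capabilities={LABEL_INSTANCE_TYPE: "p3.2xlarge"},
    ),
)
pod = role_to_pod("trainer", role, service_account=None)
print(pod.spec.node_selector)
# -> {'node.kubernetes.io/instance-type': 'p3.2xlarge'}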

torchx/specs/__init__.py

Lines changed: 3 additions & 8 deletions
@@ -13,7 +13,7 @@
 
 from typing import Dict, Optional
 
-import torchx.specs.named_resources_aws as aws_resources
+import torchx.specs.named_resources_aws as named_resources_aws
 from torchx.util.entrypoints import load_group
 
 from .api import ( # noqa: F401 F403
@@ -58,14 +58,9 @@
 def _load_named_resources() -> Dict[str, Resource]:
     resource_methods = load_group("torchx.named_resources", default={})
     materialized_resources = {}
-    default = {
-        "aws_t3.medium": aws_resources.aws_t3_medium(),
-        "aws_m5.2xlarge": aws_resources.aws_m5_2xlarge(),
-        "aws_p3.2xlarge": aws_resources.aws_p3_2xlarge(),
-        "aws_p3.8xlarge": aws_resources.aws_p3_8xlarge(),
-    }
+    default = named_resources_aws.NAMED_RESOURCES
     for name, resource in default.items():
-        materialized_resources[name] = resource
+        materialized_resources[name] = resource()
     for resource_name, resource_method in resource_methods.items():
         materialized_resources[resource_name] = resource_method()
     materialized_resources["NULL"] = NULL_RESOURCE
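This refactor replaces an eagerly built dict of Resource instances with the NAMED_RESOURCES mapping of zero-arg factories, so the defaults are materialized the same way as entry-point-provided resources. A minimal sketch of the pattern (the names DEFAULTS and tiny are illustrative, not the torchx module's):

from typing import Callable, Dict, Mapping

from torchx.specs import Resource

def tiny() -> Resource:
    # Hypothetical factory, mirroring aws_t3_medium and friends below.
    return Resource(cpu=2, gpu=0, memMB=4 * 1024, capabilities={})

# Defaults are stored unevaluated as zero-arg callables...
DEFAULTS: Mapping[str, Callable[[], Resource]] = {"tiny": tiny}

# ...and materialized uniformly at load time by calling them, exactly like
# the entry-point resource_methods in _load_named_resources.
materialized: Dict[str, Resource] = {name: f() for name, f in DEFAULTS.items()}
assert materialized["tiny"].cpu == 2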

torchx/specs/named_resources_aws.py

Lines changed: 34 additions & 4 deletions
@@ -29,6 +29,8 @@
 
 """
 
+from typing import Mapping, Callable
+
 from torchx.specs.api import Resource
 
 GiB: int = 1024
@@ -39,7 +41,9 @@ def aws_p3_2xlarge() -> Resource:
         cpu=8,
         gpu=1,
         memMB=61 * GiB,
-        capabilities={},
+        capabilities={
+            "node.kubernetes.io/instance-type": "p3.2xlarge",
+        },
     )
 
 
@@ -48,7 +52,9 @@ def aws_p3_8xlarge() -> Resource:
         cpu=32,
         gpu=4,
         memMB=244 * GiB,
-        capabilities={},
+        capabilities={
+            "node.kubernetes.io/instance-type": "p3.8xlarge",
+        },
     )
 
 
@@ -57,7 +63,9 @@ def aws_t3_medium() -> Resource:
         cpu=2,
         gpu=0,
         memMB=4 * GiB,
-        capabilities={},
+        capabilities={
+            "node.kubernetes.io/instance-type": "t3.medium",
+        },
     )
 
 
@@ -66,5 +74,27 @@ def aws_m5_2xlarge() -> Resource:
         cpu=8,
         gpu=0,
         memMB=32 * GiB,
-        capabilities={},
+        capabilities={
+            "node.kubernetes.io/instance-type": "m5.2xlarge",
+        },
     )
+
+
+def aws_g4dn_xlarge() -> Resource:
+    return Resource(
+        cpu=4,
+        gpu=1,
+        memMB=16 * GiB,
+        capabilities={
+            "node.kubernetes.io/instance-type": "g4dn.xlarge",
+        },
+    )
+
+
+NAMED_RESOURCES: Mapping[str, Callable[[], Resource]] = {
+    "aws_t3.medium": aws_t3_medium,
+    "aws_m5.2xlarge": aws_m5_2xlarge,
+    "aws_p3.2xlarge": aws_p3_2xlarge,
+    "aws_p3.8xlarge": aws_p3_8xlarge,
+    "aws_g4dn.xlarge": aws_g4dn_xlarge,
+}
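A quick sketch of consuming the new mapping and the added g4dn.xlarge entry directly (the values follow the definitions above):

from torchx.specs.named_resources_aws import NAMED_RESOURCES

# Entries are factories, so call them to materialize the Resource.
res = NAMED_RESOURCES["aws_g4dn.xlarge"]()
assert res.cpu == 4 and res.gpu == 1 and res.memMB == 16 * 1024
assert res.capabilities["node.kubernetes.io/instance-type"] == "g4dn.xlarge"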

torchx/specs/test/named_resources_test_aws.py

Lines changed: 6 additions & 0 deletions
@@ -13,6 +13,7 @@
     aws_m5_2xlarge,
     aws_t3_medium,
     GiB,
+    NAMED_RESOURCES,
 )
 
 
@@ -40,3 +41,8 @@ def test_aws_t3_medium(self) -> None:
         self.assertEqual(2, resource.cpu)
         self.assertEqual(0, resource.gpu)
         self.assertEqual(4 * GiB, resource.memMB)
+
+    def test_capabilities(self) -> None:
+        for name, func in NAMED_RESOURCES.items():
+            resource = func()
+            self.assertIn("node.kubernetes.io/instance-type", resource.capabilities)
