Skip to content

Commit 090fc11

Browse files
committed
schedulers: set /dev/shm size for docker based schedulers to bypass 64M default
1 parent 90b05b0 commit 090fc11

File tree

6 files changed

+66
-9
lines changed

6 files changed

+66
-9
lines changed

torchx/schedulers/aws_batch_scheduler.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,10 @@ def _role_to_node_properties(idx: int, role: Role) -> Dict[str, object]:
8585
cpu = 1
8686
reqs.append({"type": "VCPU", "value": str(cpu)})
8787

88-
mem = resource.memMB
89-
if mem <= 0:
90-
mem = 1000
91-
reqs.append({"type": "MEMORY", "value": str(mem)})
88+
memMB = resource.memMB
89+
if memMB <= 0:
90+
memMB = 1000
91+
reqs.append({"type": "MEMORY", "value": str(memMB)})
9292

9393
if resource.gpu > 0:
9494
reqs.append({"type": "GPU", "value": str(resource.gpu)})
@@ -130,6 +130,11 @@ def _role_to_node_properties(idx: int, role: Role) -> Dict[str, object]:
130130
"image": role.image,
131131
"environment": [{"name": k, "value": v} for k, v in role.env.items()],
132132
"resourceRequirements": reqs,
133+
"linuxParameters": {
134+
# To support PyTorch dataloaders we need to set /dev/shm to larger
135+
# than the 64M default.
136+
"sharedMemorySize": memMB,
137+
},
133138
"logConfiguration": {
134139
"logDriver": "awslogs",
135140
},

torchx/schedulers/docker_scheduler.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,11 @@ def _submit_dryrun(
269269
}
270270
resource = replica_role.resource
271271
if resource.memMB >= 0:
272-
c.kwargs["mem_limit"] = f"{int(resource.memMB)}m"
272+
# To support PyTorch dataloaders we need to set /dev/shm to
273+
# larger than the 64M default.
274+
c.kwargs["mem_limit"] = c.kwargs[
275+
"shm_size"
276+
] = f"{int(resource.memMB)}m"
273277
if resource.cpu >= 0:
274278
c.kwargs["nano_cpus"] = int(resource.cpu * 1e9)
275279
if resource.gpu > 0:

torchx/schedulers/kubernetes_scheduler.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
172172
V1Volume,
173173
V1HostPathVolumeSource,
174174
V1PersistentVolumeClaimVolumeSource,
175+
V1EmptyDirVolumeSource,
175176
)
176177

177178
requests = {}
@@ -189,8 +190,21 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
189190
requests=requests,
190191
)
191192

192-
volumes = []
193-
volume_mounts = []
193+
# To support PyTorch dataloaders we need to set /dev/shm to larger than the
194+
# 64M default so we mount an unlimited sized tmpfs directory on it.
195+
SHM_VOL = "dshm"
196+
volumes = [
197+
V1Volume(
198+
name=SHM_VOL,
199+
empty_dir=V1EmptyDirVolumeSource(
200+
medium="Memory",
201+
),
202+
),
203+
]
204+
volume_mounts = [
205+
V1VolumeMount(name=SHM_VOL, mount_path="/dev/shm"),
206+
]
207+
194208
for i, mount in enumerate(role.mounts):
195209
mount_name = f"mount-{i}"
196210
if isinstance(mount, BindMount):

torchx/schedulers/test/aws_batch_scheduler_test.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,9 @@ def test_submit_dryrun(self) -> None:
112112
{"type": "MEMORY", "value": "3000"},
113113
{"type": "GPU", "value": "4"},
114114
],
115+
"linuxParameters": {
116+
"sharedMemorySize": 3000,
117+
},
115118
"logConfiguration": {"logDriver": "awslogs"},
116119
"mountPoints": [
117120
{
@@ -154,6 +157,9 @@ def test_submit_dryrun(self) -> None:
154157
{"type": "MEMORY", "value": "3000"},
155158
{"type": "GPU", "value": "4"},
156159
],
160+
"linuxParameters": {
161+
"sharedMemorySize": 3000,
162+
},
157163
"logConfiguration": {"logDriver": "awslogs"},
158164
"mountPoints": [
159165
{

torchx/schedulers/test/docker_scheduler_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ def test_submit_dryrun(self) -> None:
100100
"torchx.pytorch.org/version": "0.1.2dev0",
101101
},
102102
"mem_limit": "3000m",
103+
"shm_size": "3000m",
103104
"name": "app_name_42-trainer-0",
104105
"hostname": "app_name_42-trainer-0",
105106
"nano_cpus": int(2e9),

torchx/schedulers/test/kubernetes_scheduler_test.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ def test_role_to_pod(self) -> None:
118118
V1Volume,
119119
V1VolumeMount,
120120
V1HostPathVolumeSource,
121+
V1EmptyDirVolumeSource,
121122
)
122123

123124
app = _test_app()
@@ -148,11 +149,15 @@ def test_role_to_pod(self) -> None:
148149
resources=resources,
149150
ports=[V1ContainerPort(name="foo", container_port=1234)],
150151
volume_mounts=[
152+
V1VolumeMount(
153+
name="dshm",
154+
mount_path="/dev/shm",
155+
),
151156
V1VolumeMount(
152157
name="mount-0",
153158
mount_path="/dst",
154159
read_only=True,
155-
)
160+
),
156161
],
157162
)
158163
want = V1Pod(
@@ -161,6 +166,12 @@ def test_role_to_pod(self) -> None:
161166
restart_policy="Never",
162167
service_account_name="srvacc",
163168
volumes=[
169+
V1Volume(
170+
name="dshm",
171+
empty_dir=V1EmptyDirVolumeSource(
172+
medium="Memory",
173+
),
174+
),
164175
V1Volume(
165176
name="mount-0",
166177
host_path=V1HostPathVolumeSource(
@@ -272,11 +283,16 @@ def test_submit_dryrun(self) -> None:
272283
memory: 3000M
273284
nvidia.com/gpu: '4'
274285
volumeMounts:
286+
- mountPath: /dev/shm
287+
name: dshm
275288
- mountPath: /dst
276289
name: mount-0
277290
readOnly: true
278291
restartPolicy: Never
279292
volumes:
293+
- emptyDir:
294+
medium: Memory
295+
name: dshm
280296
- hostPath:
281297
path: /src
282298
name: mount-0
@@ -289,6 +305,7 @@ def test_volume_mounts(self) -> None:
289305
V1Volume,
290306
V1VolumeMount,
291307
V1PersistentVolumeClaimVolumeSource,
308+
V1EmptyDirVolumeSource,
292309
)
293310

294311
role = specs.Role(
@@ -302,6 +319,12 @@ def test_volume_mounts(self) -> None:
302319
self.assertEqual(
303320
pod.spec.volumes,
304321
[
322+
V1Volume(
323+
name="dshm",
324+
empty_dir=V1EmptyDirVolumeSource(
325+
medium="Memory",
326+
),
327+
),
305328
V1Volume(
306329
name="mount-0",
307330
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
@@ -313,11 +336,15 @@ def test_volume_mounts(self) -> None:
313336
self.assertEqual(
314337
pod.spec.containers[0].volume_mounts,
315338
[
339+
V1VolumeMount(
340+
name="dshm",
341+
mount_path="/dev/shm",
342+
),
316343
V1VolumeMount(
317344
name="mount-0",
318345
mount_path="/dst",
319346
read_only=True,
320-
)
347+
),
321348
],
322349
)
323350

0 commit comments

Comments (0)