Skip to content

Commit e6ac408

Browse files
committed
specs,schedulers: add VolumeMount
1 parent 354d26e commit e6ac408

File tree

11 files changed

+243
-36
lines changed

11 files changed

+243
-36
lines changed

docs/source/specs.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,14 @@ Run Status
6767
Mounts
6868
--------
6969

70+
.. autofunction:: parse_mounts
71+
7072
.. autoclass:: BindMount
7173
:members:
7274

73-
.. autofunction:: parse_mounts
75+
.. autoclass:: VolumeMount
76+
:members:
77+
7478

7579
Component Linter
7680
-----------------

torchx/components/dist.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def ddp(
172172
max_retries: the number of scheduler retries allowed
173173
rdzv_backend: rendezvous backend (only matters when nnodes > 1)
174174
rdzv_endpoint: rendezvous server endpoint (only matters when nnodes > 1), defaults to rank0 host for schedulers that support it
175-
mounts: the list of mounts to bind mount into the worker environment/container (ex. type=bind,src=/host,dst=/job[,readonly])
175+
mounts: mounts to attach to the worker environment/container (ex. type=<bind/volume>,src=/host,dst=/job[,readonly]). See scheduler documentation for more info.
176176
"""
177177

178178
if (script is None) == (m is None):

torchx/schedulers/aws_batch_scheduler.py

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
macros,
5959
runopts,
6060
CfgVal,
61+
BindMount,
62+
VolumeMount,
6163
)
6264
from torchx.workspace.docker_workspace import DockerWorkspace
6365

@@ -95,14 +97,26 @@ def _role_to_node_properties(idx: int, role: Role) -> Dict[str, object]:
9597
volumes = []
9698
for i, mount in enumerate(role.mounts):
9799
name = f"mount_{i}"
98-
volumes.append(
99-
{
100-
"name": name,
101-
"host": {
102-
"sourcePath": mount.src_path,
103-
},
104-
}
105-
)
100+
if isinstance(mount, BindMount):
101+
volumes.append(
102+
{
103+
"name": name,
104+
"host": {
105+
"sourcePath": mount.src_path,
106+
},
107+
}
108+
)
109+
elif isinstance(mount, VolumeMount):
110+
volumes.append(
111+
{
112+
"name": name,
113+
"efsVolumeConfiguration": {
114+
"fileSystemId": mount.src,
115+
},
116+
}
117+
)
118+
else:
119+
raise TypeError(f"unknown mount type {mount}")
106120
mount_points.append(
107121
{
108122
"containerPath": mount.dst_path,
@@ -176,6 +190,18 @@ class AWSBatchScheduler(Scheduler, DockerWorkspace):
176190
.. runopts::
177191
class: torchx.schedulers.aws_batch_scheduler.AWSBatchScheduler
178192
193+
**Mounts**
194+
195+
This class supports bind mounting host directories and EFS volumes.
196+
197+
* bind mount: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
198+
* efs volume: ``type=volume,src=<efs id>,dst=<container path>[,readonly]``
199+
200+
See :py:func:`torchx.specs.parse_mounts` for more info.
201+
202+
For other filesystems such as FSx you can mount them onto the host and bind
203+
mount them into your job: https://aws.amazon.com/premiumsupport/knowledge-center/batch-fsx-lustre-file-system-mount/
204+
179205
**Compatibility**
180206
181207
.. compatibility::

torchx/schedulers/docker_scheduler.py

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@
3333
is_terminal,
3434
macros,
3535
runopts,
36+
BindMount,
37+
VolumeMount,
3638
)
3739
from torchx.workspace.docker_workspace import DockerWorkspace
3840

@@ -104,6 +106,19 @@ class DockerScheduler(Scheduler, DockerWorkspace):
104106
in a job fails, only that replica will be restarted.
105107
106108
109+
**Config Options**
110+
111+
.. runopts::
112+
class: torchx.schedulers.docker_scheduler.DockerScheduler
113+
114+
**Mounts**
115+
116+
This class supports bind mounting directories and named volumes.
117+
118+
* bind mount: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
119+
* named volume: ``type=volume,src=<name>,dst=<container path>[,readonly]``
120+
121+
See :py:func:`torchx.specs.parse_mounts` for more info.
107122
108123
.. compatibility::
109124
type: scheduler
@@ -192,14 +207,26 @@ def _submit_dryrun(
192207
for role in app.roles:
193208
mounts = []
194209
for mount in role.mounts:
195-
mounts.append(
196-
Mount(
197-
target=mount.dst_path,
198-
source=mount.src_path,
199-
read_only=mount.read_only,
200-
type="bind",
210+
if isinstance(mount, BindMount):
211+
mounts.append(
212+
Mount(
213+
target=mount.dst_path,
214+
source=mount.src_path,
215+
read_only=mount.read_only,
216+
type="bind",
217+
)
201218
)
202-
)
219+
elif isinstance(mount, VolumeMount):
220+
mounts.append(
221+
Mount(
222+
target=mount.dst_path,
223+
source=mount.src,
224+
read_only=mount.read_only,
225+
type="volume",
226+
)
227+
)
228+
else:
229+
raise TypeError(f"unknown mount type {mount}")
203230

204231
for replica_id in range(role.num_replicas):
205232
values = macros.Values(

torchx/schedulers/kubernetes_scheduler.py

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@
6363
SchedulerBackend,
6464
macros,
6565
runopts,
66+
BindMount,
67+
VolumeMount,
6668
)
6769
from torchx.workspace.docker_workspace import DockerWorkspace
6870

@@ -169,6 +171,7 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
169171
V1VolumeMount,
170172
V1Volume,
171173
V1HostPathVolumeSource,
174+
V1PersistentVolumeClaimVolumeSource,
172175
)
173176

174177
requests = {}
@@ -190,14 +193,26 @@ def role_to_pod(name: str, role: Role, service_account: Optional[str]) -> "V1Pod
190193
volume_mounts = []
191194
for i, mount in enumerate(role.mounts):
192195
mount_name = f"mount-{i}"
193-
volumes.append(
194-
V1Volume(
195-
name=mount_name,
196-
host_path=V1HostPathVolumeSource(
197-
path=mount.src_path,
198-
),
196+
if isinstance(mount, BindMount):
197+
volumes.append(
198+
V1Volume(
199+
name=mount_name,
200+
host_path=V1HostPathVolumeSource(
201+
path=mount.src_path,
202+
),
203+
)
199204
)
200-
)
205+
elif isinstance(mount, VolumeMount):
206+
volumes.append(
207+
V1Volume(
208+
name=mount_name,
209+
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
210+
claim_name=mount.src,
211+
),
212+
)
213+
)
214+
else:
215+
raise TypeError(f"unknown mount type {mount}")
201216
volume_mounts.append(
202217
V1VolumeMount(
203218
name=mount_name,
@@ -374,6 +389,18 @@ class KubernetesScheduler(Scheduler, DockerWorkspace):
374389
.. runopts::
375390
class: torchx.schedulers.kubernetes_scheduler.KubernetesScheduler
376391
392+
**Mounts**
393+
394+
Mounting external filesystems/volumes is done via the HostPath and
395+
PersistentVolumeClaim support.
396+
397+
* hostPath volumes: ``type=bind,src=<host path>,dst=<container path>[,readonly]``
398+
* PersistentVolumeClaim: ``type=volume,src=<claim>,dst=<container path>[,readonly]``
399+
400+
See :py:func:`torchx.specs.parse_mounts` for more info.
401+
402+
External docs: https://kubernetes.io/docs/concepts/storage/persistent-volumes/
403+
377404
**Compatibility**
378405
379406
.. compatibility::

torchx/schedulers/test/aws_batch_scheduler_test.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from torchx.schedulers.aws_batch_scheduler import (
1515
create_scheduler,
1616
AWSBatchScheduler,
17+
_role_to_node_properties,
1718
)
1819

1920

@@ -184,6 +185,38 @@ def test_submit_dryrun(self) -> None:
184185
},
185186
)
186187

188+
def test_volume_mounts(self) -> None:
189+
role = specs.Role(
190+
name="foo",
191+
image="",
192+
mounts=[
193+
specs.VolumeMount(src="efsid", dst_path="/dst", read_only=True),
194+
],
195+
)
196+
props = _role_to_node_properties(0, role)
197+
self.assertEqual(
198+
# pyre-fixme[16]: `object` has no attribute `__getitem__`.
199+
props["container"]["volumes"],
200+
[
201+
{
202+
"name": "mount_0",
203+
"efsVolumeConfiguration": {
204+
"fileSystemId": "efsid",
205+
},
206+
}
207+
],
208+
)
209+
self.assertEqual(
210+
props["container"]["mountPoints"],
211+
[
212+
{
213+
"containerPath": "/dst",
214+
"readOnly": True,
215+
"sourceVolume": "mount_0",
216+
}
217+
],
218+
)
219+
187220
def _mock_scheduler(self) -> AWSBatchScheduler:
188221
scheduler = AWSBatchScheduler(
189222
"test",

torchx/schedulers/test/docker_scheduler_test.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,14 +114,31 @@ def test_submit_dryrun(self) -> None:
114114
source="/tmp",
115115
read_only=True,
116116
type="bind",
117-
)
117+
),
118118
],
119119
},
120120
)
121121
],
122122
)
123123
self.assertEqual(str(info), str(want))
124124

125+
def test_volume_mounts(self) -> None:
126+
app = _test_app()
127+
app.roles[0].mounts = [
128+
specs.VolumeMount(src="name", dst_path="/tmp", read_only=True),
129+
]
130+
131+
info = self.scheduler._submit_dryrun(app, cfg={})
132+
want = [
133+
Mount(
134+
target="/tmp",
135+
source="name",
136+
read_only=True,
137+
type="volume",
138+
),
139+
]
140+
self.assertEqual(info.request.containers[0].kwargs["mounts"], want)
141+
125142
@patch("os.environ", {"FOO_1": "f1", "BAR_1": "b1", "FOOBAR_1": "fb1"})
126143
def test_copy_env(self) -> None:
127144
app = _test_app()

torchx/schedulers/test/kubernetes_scheduler_test.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,44 @@ def test_submit_dryrun(self) -> None:
283283
""",
284284
)
285285

286+
def test_volume_mounts(self) -> None:
287+
scheduler = create_scheduler("test")
288+
from kubernetes.client.models import (
289+
V1Volume,
290+
V1VolumeMount,
291+
V1PersistentVolumeClaimVolumeSource,
292+
)
293+
294+
role = specs.Role(
295+
name="foo",
296+
image="",
297+
mounts=[
298+
specs.VolumeMount(src="name", dst_path="/dst", read_only=True),
299+
],
300+
)
301+
pod = role_to_pod("foo", role, service_account="")
302+
self.assertEqual(
303+
pod.spec.volumes,
304+
[
305+
V1Volume(
306+
name="mount-0",
307+
persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(
308+
claim_name="name",
309+
),
310+
),
311+
],
312+
)
313+
self.assertEqual(
314+
pod.spec.containers[0].volume_mounts,
315+
[
316+
V1VolumeMount(
317+
name="mount-0",
318+
mount_path="/dst",
319+
read_only=True,
320+
)
321+
],
322+
)
323+
286324
def test_rank0_env(self) -> None:
287325
from kubernetes.client.models import (
288326
V1EnvVar,

torchx/specs/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
AppState,
2828
AppStatus,
2929
BindMount,
30+
VolumeMount,
3031
CfgVal,
3132
InvalidRunConfigException,
3233
MalformedAppHandleException,

0 commit comments

Comments
 (0)