
Commit f70db97

slurm_scheduler: inherit cwd instead of image + skip mem request via cfg
1 parent a53732d commit f70db97

File tree

2 files changed: +41 -11 lines changed

torchx/schedulers/slurm_scheduler.py

Lines changed: 23 additions & 7 deletions
@@ -51,6 +51,10 @@
     "TIMEOUT": AppState.FAILED,
 }
 
+SBATCH_OPTIONS = {
+    "partition",
+    "time",
+}
 
 def _apply_app_id_env(s: str) -> str:
     """
@@ -68,7 +72,6 @@ class SlurmReplicaRequest:
     """
 
     name: str
-    dir: str
     entrypoint: str
     args: List[str]
     srun_opts: Dict[str, str]
@@ -79,21 +82,25 @@ class SlurmReplicaRequest:
     def from_role(
         cls, name: str, role: Role, cfg: Mapping[str, CfgVal]
     ) -> "SlurmReplicaRequest":
-        sbatch_opts = {k: str(v) for k, v in cfg.items() if v is not None}
+        sbatch_opts = {}
+        for k, v in cfg.items():
+            if v is None:
+                continue
+            if k in SBATCH_OPTIONS:
+                sbatch_opts[k] = str(v)
         sbatch_opts.setdefault("ntasks-per-node", "1")
         resource = role.resource
 
         if resource != NONE:
             if resource.cpu > 0:
                 sbatch_opts.setdefault("cpus-per-task", str(resource.cpu))
-            if resource.memMB > 0:
+            if not cfg.get("nomem") and resource.memMB > 0:
                 sbatch_opts.setdefault("mem", str(resource.memMB))
             if resource.gpu > 0:
                 sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
 
         return cls(
             name=name,
-            dir=role.image,
             entrypoint=role.entrypoint,
             args=list(role.args),
             sbatch_opts=sbatch_opts,
@@ -110,8 +117,7 @@ def materialize(self) -> Tuple[List[str], List[str]]:
             f"--job-name={self.name}",
         ] + [f"--{key}={value}" for key, value in self.sbatch_opts.items()]
         srun_args = (
-            [f"--chdir={self.dir}"]
-            + [f"--{key}={value}" for key, value in self.srun_opts.items()]
+            [f"--{key}={value}" for key, value in self.srun_opts.items()]
             + [f"--export={key}={value}" for key, value in self.env.items()]
         )
 
@@ -172,10 +178,14 @@ class SlurmScheduler(Scheduler):
 
     Logs are written to the default slurm log file.
 
-    Any scheduler options passed to it are added as SBATCH arguments to each
+    Some of the config options passed to it are added as SBATCH arguments to each
     replica. See https://slurm.schedmd.com/sbatch.html#SECTION_OPTIONS for info
     on the arguments.
 
+    Slurm jobs inherit the currently active ``conda`` or ``virtualenv`` and run
+    in the current working directory. This matches the behavior of the
+    ``local_cwd`` scheduler.
+
     For more info see:
 
     * https://slurm.schedmd.com/sbatch.html
@@ -219,6 +229,12 @@ def run_opts(self) -> runopts:
             default=None,
             help="The maximum time the job is allowed to run for.",
         )
+        opts.add(
+            "nomem",
+            type_=bool,
+            default=False,
+            help="disables memory request to workaround https://github.com/aws/aws-parallelcluster/issues/2198",
+        )
         return opts
 
     def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
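
To make the new behavior concrete, here is a minimal sketch of how the reworked SlurmReplicaRequest.from_role filters config keys and honors nomem. It assumes torchx.specs.Role and torchx.specs.Resource accept the fields the diff references (entrypoint, args, resource; cpu, gpu, memMB); all names and values are illustrative, not taken from this commit.

# Hedged sketch of the post-commit behavior; names and values are illustrative.
from torchx.schedulers.slurm_scheduler import SlurmReplicaRequest
from torchx.specs import Resource, Role

role = Role(
    name="trainer",
    image="/some/path",  # no longer emitted as --chdir; jobs run in the caller's cwd
    entrypoint="echo",
    args=["hello slurm"],
    resource=Resource(cpu=2, gpu=3, memMB=1024),
)

# "partition" is in SBATCH_OPTIONS and passes through to sbatch; "nomem" is
# consumed by from_role and suppresses the --mem request derived from memMB.
sbatch_args, srun_args = SlurmReplicaRequest.from_role(
    "trainer-0", role, cfg={"partition": "compute", "nomem": True}
).materialize()

print(sbatch_args)
# roughly: ['--job-name=trainer-0', '--partition=compute', '--ntasks-per-node=1',
#           '--cpus-per-task=2', '--gpus-per-task=3']  -- note: no --mem entry
print(srun_args)  # no --chdir prefix anymore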

torchx/schedulers/test/slurm_scheduler_test.py

Lines changed: 18 additions & 4 deletions
@@ -54,7 +54,21 @@ def test_replica_request(self) -> None:
         )
         self.assertEqual(
             srun,
-            ["--chdir=/some/path", "--export=FOO=bar", "echo", "'hello slurm'", "test"],
+            ["--export=FOO=bar", "echo", "'hello slurm'", "test"],
+        )
+
+        # test nomem option
+        sbatch, srun = SlurmReplicaRequest.from_role(
+            "role-name", role, cfg={"nomem": True}
+        ).materialize()
+        self.assertEqual(
+            sbatch,
+            [
+                "--job-name=role-name",
+                "--ntasks-per-node=1",
+                "--cpus-per-task=2",
+                "--gpus-per-task=3",
+            ],
         )
 
     def test_replica_request_app_id(self) -> None:
@@ -135,9 +149,9 @@ def test_dryrun_multi_role(self) -> None:
         # exit on error
         set -e
 
-        srun --chdir=/some/path echo 0 'hello '"$SLURM_JOB_ID"'' :\\
-            --chdir=/some/path echo 1 'hello '"$SLURM_JOB_ID"'' :\\
-            --chdir=/some/path echo
+        srun echo 0 'hello '"$SLURM_JOB_ID"'' :\\
+            echo 1 'hello '"$SLURM_JOB_ID"'' :\\
+            echo
         """,
         )
 
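
A small, hedged sketch of how the new option surfaces through the scheduler's run options; the constructor taking only a session name is an assumption based on the surrounding code, and "default" is an illustrative session name.

# Hedged sketch; constructor signature and session name are assumptions.
from torchx.schedulers.slurm_scheduler import SlurmScheduler

sched = SlurmScheduler("default")  # illustrative session name
opts = sched.run_opts()
# The option listing should now include "nomem" (bool, default False) alongside
# "partition" and "time"; passing nomem=True in cfg skips the sbatch --mem request.
print(opts)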
