
Commit 1fc7974

slurm_scheduler: inherit cwd instead of image + skip mem request via cfg
1 parent: a53732d

2 files changed: +44 -14 lines changed

torchx/schedulers/slurm_scheduler.py

Lines changed: 26 additions & 10 deletions
@@ -51,6 +51,11 @@
     "TIMEOUT": AppState.FAILED,
 }
 
+SBATCH_OPTIONS = {
+    "partition",
+    "time",
+}
+
 
 def _apply_app_id_env(s: str) -> str:
     """
@@ -68,7 +73,6 @@ class SlurmReplicaRequest:
     """
 
     name: str
-    dir: str
    entrypoint: str
     args: List[str]
     srun_opts: Dict[str, str]
@@ -79,21 +83,25 @@ class SlurmReplicaRequest:
     def from_role(
         cls, name: str, role: Role, cfg: Mapping[str, CfgVal]
     ) -> "SlurmReplicaRequest":
-        sbatch_opts = {k: str(v) for k, v in cfg.items() if v is not None}
+        sbatch_opts = {}
+        for k, v in cfg.items():
+            if v is None:
+                continue
+            if k in SBATCH_OPTIONS:
+                sbatch_opts[k] = str(v)
         sbatch_opts.setdefault("ntasks-per-node", "1")
         resource = role.resource
 
         if resource != NONE:
             if resource.cpu > 0:
                 sbatch_opts.setdefault("cpus-per-task", str(resource.cpu))
-            if resource.memMB > 0:
+            if not cfg.get("nomem") and resource.memMB > 0:
                 sbatch_opts.setdefault("mem", str(resource.memMB))
             if resource.gpu > 0:
                 sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
 
         return cls(
             name=name,
-            dir=role.image,
             entrypoint=role.entrypoint,
             args=list(role.args),
             sbatch_opts=sbatch_opts,
@@ -109,11 +117,9 @@ def materialize(self) -> Tuple[List[str], List[str]]:
         sbatch_args = [
             f"--job-name={self.name}",
         ] + [f"--{key}={value}" for key, value in self.sbatch_opts.items()]
-        srun_args = (
-            [f"--chdir={self.dir}"]
-            + [f"--{key}={value}" for key, value in self.srun_opts.items()]
-            + [f"--export={key}={value}" for key, value in self.env.items()]
-        )
+        srun_args = [f"--{key}={value}" for key, value in self.srun_opts.items()] + [
+            f"--export={key}={value}" for key, value in self.env.items()
+        ]
 
         srun_group = srun_args + [self.entrypoint] + self.args
         srun_group = [_apply_app_id_env(arg) for arg in srun_group]
@@ -172,10 +178,14 @@ class SlurmScheduler(Scheduler):
 
     Logs are written to the default slurm log file.
 
-    Any scheduler options passed to it are added as SBATCH arguments to each
+    Some of the config options passed to it are added as SBATCH arguments to each
     replica. See https://slurm.schedmd.com/sbatch.html#SECTION_OPTIONS for info
     on the arguments.
 
+    Slurm jobs inherit the currently active ``conda`` or ``virtualenv`` and run
+    in the current working directory. This matches the behavior of the
+    ``local_cwd`` scheduler.
+
     For more info see:
 
     * https://slurm.schedmd.com/sbatch.html
@@ -219,6 +229,12 @@ def run_opts(self) -> runopts:
             default=None,
             help="The maximum time the job is allowed to run for.",
         )
+        opts.add(
+            "nomem",
+            type_=bool,
+            default=False,
+            help="disables memory request to workaround https://github.com/aws/aws-parallelcluster/issues/2198",
+        )
         return opts
 
     def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
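
Taken together, the slurm_scheduler.py changes mean that only the "partition" and "time" config keys are forwarded to sbatch, the role image is no longer used as --chdir (jobs run in the current working directory), and a truthy "nomem" value skips the --mem request. The sketch below illustrates the resulting behavior; it is not part of the commit, and the Role/Resource constructor fields are assumptions based on how the tests build roles.

from torchx import specs
from torchx.schedulers.slurm_scheduler import SlurmReplicaRequest

# Illustrative role; field names mirror torchx.specs as used in the tests
# and may differ slightly at this commit.
role = specs.Role(
    name="trainer",
    image="/some/path",  # no longer emitted as --chdir after this commit
    entrypoint="echo",
    args=["hello slurm"],
    env={"FOO": "bar"},
    resource=specs.Resource(cpu=2, gpu=3, memMB=1024, capabilities={}),
)

# "partition" is in SBATCH_OPTIONS and passes through; "nomem" is not an
# sbatch flag and only disables the memory request.
sbatch, srun = SlurmReplicaRequest.from_role(
    "trainer-0", role, cfg={"partition": "compute", "nomem": True}
).materialize()

# Roughly, per the updated tests:
# sbatch == ["--job-name=trainer-0", "--partition=compute",
#            "--ntasks-per-node=1", "--cpus-per-task=2", "--gpus-per-task=3"]
# srun   == ["--export=FOO=bar", "echo", "'hello slurm'"]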

torchx/schedulers/test/slurm_scheduler_test.py

Lines changed: 18 additions & 4 deletions
@@ -54,7 +54,21 @@ def test_replica_request(self) -> None:
         )
         self.assertEqual(
             srun,
-            ["--chdir=/some/path", "--export=FOO=bar", "echo", "'hello slurm'", "test"],
+            ["--export=FOO=bar", "echo", "'hello slurm'", "test"],
+        )
+
+        # test nomem option
+        sbatch, srun = SlurmReplicaRequest.from_role(
+            "role-name", role, cfg={"nomem": True}
+        ).materialize()
+        self.assertEqual(
+            sbatch,
+            [
+                "--job-name=role-name",
+                "--ntasks-per-node=1",
+                "--cpus-per-task=2",
+                "--gpus-per-task=3",
+            ],
         )
 
     def test_replica_request_app_id(self) -> None:
@@ -135,9 +149,9 @@ def test_dryrun_multi_role(self) -> None:
 # exit on error
 set -e
 
-srun --chdir=/some/path echo 0 'hello '"$SLURM_JOB_ID"'' :\\
-    --chdir=/some/path echo 1 'hello '"$SLURM_JOB_ID"'' :\\
-    --chdir=/some/path echo
+srun echo 0 'hello '"$SLURM_JOB_ID"'' :\\
+    echo 1 'hello '"$SLURM_JOB_ID"'' :\\
+    echo
 """,
         )
 
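For completeness, a hypothetical end-to-end sketch of how the new option might be passed through the scheduler itself. It assumes the module exposes the usual create_scheduler factory and that the base Scheduler API provides submit_dryrun(app, cfg); neither appears in this diff, so treat the exact signatures as assumptions rather than something the commit defines.

from torchx import specs
from torchx.schedulers.slurm_scheduler import create_scheduler

# Single-role app; no resource is set, so no cpu/gpu/mem flags are generated.
app = specs.AppDef(
    name="echo",
    roles=[
        specs.Role(
            name="echo",
            image="/some/path",  # informational only; the job runs in the cwd
            entrypoint="echo",
            args=["hello slurm"],
        )
    ],
)

scheduler = create_scheduler("demo")  # assumed factory, as in other schedulers

# "partition" and "time" become sbatch flags; "nomem" only suppresses --mem
# (see the aws-parallelcluster issue linked in the run_opts help text).
dryrun_info = scheduler.submit_dryrun(app, {"partition": "compute", "nomem": True})
print(dryrun_info)  # inspect the generated sbatch request before submitting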