diff --git a/torchx/schedulers/slurm_scheduler.py b/torchx/schedulers/slurm_scheduler.py
index 8bf58cd37..7c3dbf1f8 100644
--- a/torchx/schedulers/slurm_scheduler.py
+++ b/torchx/schedulers/slurm_scheduler.py
@@ -51,6 +51,11 @@
     "TIMEOUT": AppState.FAILED,
 }
 
+SBATCH_OPTIONS = {
+    "partition",
+    "time",
+}
+
 
 def _apply_app_id_env(s: str) -> str:
     """
@@ -68,7 +73,6 @@ class SlurmReplicaRequest:
     """
 
     name: str
-    dir: str
     entrypoint: str
     args: List[str]
     srun_opts: Dict[str, str]
@@ -79,21 +83,25 @@ class SlurmReplicaRequest:
     def from_role(
         cls, name: str, role: Role, cfg: Mapping[str, CfgVal]
     ) -> "SlurmReplicaRequest":
-        sbatch_opts = {k: str(v) for k, v in cfg.items() if v is not None}
+        sbatch_opts = {}
+        for k, v in cfg.items():
+            if v is None:
+                continue
+            if k in SBATCH_OPTIONS:
+                sbatch_opts[k] = str(v)
         sbatch_opts.setdefault("ntasks-per-node", "1")
 
         resource = role.resource
         if resource != NONE:
             if resource.cpu > 0:
                 sbatch_opts.setdefault("cpus-per-task", str(resource.cpu))
-            if resource.memMB > 0:
+            if not cfg.get("nomem") and resource.memMB > 0:
                 sbatch_opts.setdefault("mem", str(resource.memMB))
             if resource.gpu > 0:
                 sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
 
         return cls(
             name=name,
-            dir=role.image,
             entrypoint=role.entrypoint,
             args=list(role.args),
             sbatch_opts=sbatch_opts,
@@ -109,11 +117,9 @@ def materialize(self) -> Tuple[List[str], List[str]]:
         sbatch_args = [
             f"--job-name={self.name}",
         ] + [f"--{key}={value}" for key, value in self.sbatch_opts.items()]
-        srun_args = (
-            [f"--chdir={self.dir}"]
-            + [f"--{key}={value}" for key, value in self.srun_opts.items()]
-            + [f"--export={key}={value}" for key, value in self.env.items()]
-        )
+        srun_args = [f"--{key}={value}" for key, value in self.srun_opts.items()] + [
+            f"--export={key}={value}" for key, value in self.env.items()
+        ]
 
         srun_group = srun_args + [self.entrypoint] + self.args
         srun_group = [_apply_app_id_env(arg) for arg in srun_group]
@@ -172,10 +178,14 @@ class SlurmScheduler(Scheduler):
 
     Logs are written to the default slurm log file.
 
-    Any scheduler options passed to it are added as SBATCH arguments to each
+    Some of the config options passed to it are added as SBATCH arguments to each
     replica. See https://slurm.schedmd.com/sbatch.html#SECTION_OPTIONS for info
     on the arguments.
 
+    Slurm jobs inherit the currently active ``conda`` or ``virtualenv`` and run
+    in the current working directory. This matches the behavior of the
+    ``local_cwd`` scheduler.
+
     For more info see:
 
     * https://slurm.schedmd.com/sbatch.html
@@ -219,6 +229,12 @@ def run_opts(self) -> runopts:
             default=None,
             help="The maximum time the job is allowed to run for.",
         )
+        opts.add(
+            "nomem",
+            type_=bool,
+            default=False,
+            help="disables memory request to work around https://github.com/aws/aws-parallelcluster/issues/2198",
+        )
         return opts
 
     def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
diff --git a/torchx/schedulers/test/slurm_scheduler_test.py b/torchx/schedulers/test/slurm_scheduler_test.py
index 6dccbeffd..27d285d25 100644
--- a/torchx/schedulers/test/slurm_scheduler_test.py
+++ b/torchx/schedulers/test/slurm_scheduler_test.py
@@ -54,7 +54,21 @@ def test_replica_request(self) -> None:
         )
         self.assertEqual(
             srun,
-            ["--chdir=/some/path", "--export=FOO=bar", "echo", "'hello slurm'", "test"],
+            ["--export=FOO=bar", "echo", "'hello slurm'", "test"],
+        )
+
+        # test nomem option
+        sbatch, srun = SlurmReplicaRequest.from_role(
+            "role-name", role, cfg={"nomem": True}
+        ).materialize()
+        self.assertEqual(
+            sbatch,
+            [
+                "--job-name=role-name",
+                "--ntasks-per-node=1",
+                "--cpus-per-task=2",
+                "--gpus-per-task=3",
+            ],
         )
 
     def test_replica_request_app_id(self) -> None:
@@ -135,9 +149,9 @@ def test_dryrun_multi_role(self) -> None:
 
 # exit on error
 set -e
 
-srun --chdir=/some/path echo 0 'hello '"$SLURM_JOB_ID"'' :\\
-  --chdir=/some/path echo 1 'hello '"$SLURM_JOB_ID"'' :\\
-  --chdir=/some/path echo
+srun echo 0 'hello '"$SLURM_JOB_ID"'' :\\
+  echo 1 'hello '"$SLURM_JOB_ID"'' :\\
+  echo
 """,
         )
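
For anyone who wants to sanity-check the new ``nomem`` config option outside the test suite, here is a minimal sketch mirroring the updated unit test. ``SlurmReplicaRequest.from_role`` and ``materialize`` come from this patch; the ``specs.Role``/``specs.Resource`` construction and values (role name, image path, memMB) are illustrative assumptions, not part of the change.

```python
from torchx import specs
from torchx.schedulers.slurm_scheduler import SlurmReplicaRequest

# Illustrative role; the image path and resource numbers are placeholders.
role = specs.Role(
    name="trainer",
    image="/some/path",
    entrypoint="echo",
    args=["hello slurm"],
    resource=specs.Resource(cpu=2, gpu=3, memMB=1024),
)

# Default behavior: the role's memMB is forwarded to sbatch as --mem.
sbatch, srun = SlurmReplicaRequest.from_role("trainer", role, cfg={}).materialize()
assert "--mem=1024" in sbatch

# With nomem=True the --mem request is omitted, working around
# https://github.com/aws/aws-parallelcluster/issues/2198.
sbatch, srun = SlurmReplicaRequest.from_role(
    "trainer", role, cfg={"nomem": True}
).materialize()
assert not any(arg.startswith("--mem=") for arg in sbatch)
```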