     "TIMEOUT": AppState.FAILED,
 }
 
+SBATCH_OPTIONS = {
+    "partition",
+    "time",
+}
 
 def _apply_app_id_env(s: str) -> str:
     """
@@ -68,7 +72,6 @@ class SlurmReplicaRequest:
     """
 
     name: str
-    dir: str
     entrypoint: str
     args: List[str]
     srun_opts: Dict[str, str]
@@ -79,21 +82,25 @@ class SlurmReplicaRequest:
     def from_role(
         cls, name: str, role: Role, cfg: Mapping[str, CfgVal]
     ) -> "SlurmReplicaRequest":
-        sbatch_opts = {k: str(v) for k, v in cfg.items() if v is not None}
+        sbatch_opts = {}
+        for k, v in cfg.items():
+            if v is None:
+                continue
+            if k in SBATCH_OPTIONS:
+                sbatch_opts[k] = str(v)
         sbatch_opts.setdefault("ntasks-per-node", "1")
         resource = role.resource
 
         if resource != NONE:
             if resource.cpu > 0:
                 sbatch_opts.setdefault("cpus-per-task", str(resource.cpu))
-            if resource.memMB > 0:
+            if not cfg.get("nomem") and resource.memMB > 0:
                 sbatch_opts.setdefault("mem", str(resource.memMB))
             if resource.gpu > 0:
                 sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))
 
         return cls(
             name=name,
-            dir=role.image,
             entrypoint=role.entrypoint,
             args=list(role.args),
             sbatch_opts=sbatch_opts,
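Note: a standalone sketch of the option handling introduced above, using illustrative config values; the dict contents and the resource_mem_mb variable are assumptions, not part of this change.

    # Simplified re-statement of from_role()'s filtering, for illustration only.
    SBATCH_OPTIONS = {"partition", "time"}

    cfg = {"partition": "compute", "time": "1:00:00", "nomem": True, "comment": None}

    sbatch_opts = {}
    for k, v in cfg.items():
        if v is None:
            continue                     # unset options are skipped entirely
        if k in SBATCH_OPTIONS:
            sbatch_opts[k] = str(v)      # only whitelisted keys become sbatch flags
    sbatch_opts.setdefault("ntasks-per-node", "1")

    resource_mem_mb = 16000              # hypothetical Resource.memMB value
    if not cfg.get("nomem") and resource_mem_mb > 0:
        sbatch_opts.setdefault("mem", str(resource_mem_mb))

    print(sbatch_opts)
    # {'partition': 'compute', 'time': '1:00:00', 'ntasks-per-node': '1'}
    # "nomem" never reaches sbatch; it only suppresses the --mem request.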
@@ -110,8 +117,7 @@ def materialize(self) -> Tuple[List[str], List[str]]:
             f"--job-name={self.name}",
         ] + [f"--{key}={value}" for key, value in self.sbatch_opts.items()]
         srun_args = (
-            [f"--chdir={self.dir}"]
-            + [f"--{key}={value}" for key, value in self.srun_opts.items()]
+            [f"--{key}={value}" for key, value in self.srun_opts.items()]
            + [f"--export={key}={value}" for key, value in self.env.items()]
         )
 
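Note: a sketch of the srun arguments produced after this change; the option and environment values below are made up for illustration.

    srun_opts = {"cpu-bind": "cores"}        # hypothetical srun option
    env = {"TORCHX_APP_ID": "app_1234"}      # hypothetical replica environment variable

    # No --chdir flag is emitted anymore, so the replica keeps the submit-time
    # working directory instead of being switched into the removed dir field's path.
    srun_args = (
        [f"--{key}={value}" for key, value in srun_opts.items()]
        + [f"--export={key}={value}" for key, value in env.items()]
    )
    print(srun_args)   # ['--cpu-bind=cores', '--export=TORCHX_APP_ID=app_1234']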
@@ -172,10 +178,14 @@ class SlurmScheduler(Scheduler):
 
     Logs are written to the default slurm log file.
 
-    Any scheduler options passed to it are added as SBATCH arguments to each
+    Some of the config options passed to it are added as SBATCH arguments to each
     replica. See https://slurm.schedmd.com/sbatch.html#SECTION_OPTIONS for info
     on the arguments.
 
+    Slurm jobs inherit the currently active ``conda`` or ``virtualenv`` and run
+    in the current working directory. This matches the behavior of the
+    ``local_cwd`` scheduler.
+
     For more info see:
 
     * https://slurm.schedmd.com/sbatch.html
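Note: a minimal illustration of the inheritance described in the docstring above; the directory and environment names are assumptions.

    import os
    import sys

    # Slurm's defaults keep the submission directory and exported environment, and
    # this change no longer overrides them, so both submit-time values carry over
    # to the replicas, just like with the local_cwd scheduler:
    print(os.getcwd())       # e.g. /home/me/my-project  (directory sbatch was run from)
    print(sys.executable)    # e.g. ~/conda/envs/my-env/bin/python  (active environment)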
@@ -219,6 +229,12 @@ def run_opts(self) -> runopts:
             default=None,
             help="The maximum time the job is allowed to run for.",
         )
+        opts.add(
+            "nomem",
+            type_=bool,
+            default=False,
+            help="disables memory request to workaround https://github.com/aws/aws-parallelcluster/issues/2198",
+        )
         return opts
 
     def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
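Note: a hedged sketch of a run config that exercises the new option; the values are illustrative, and only the field names come from run_opts() above.

    cfg = {
        "partition": "compute",   # forwarded to sbatch via the SBATCH_OPTIONS whitelist
        "time": "2:00:00",        # likewise
        "nomem": True,            # consulted only in from_role(); never becomes an sbatch flag
    }
    # With nomem=True, from_role() skips the --mem request entirely, which is the
    # workaround referenced in the option's help text.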