    "TIMEOUT": AppState.FAILED,
}

+SBATCH_OPTIONS = {
+    "partition",
+    "time",
+}
+

def _apply_app_id_env(s: str) -> str:
    """
@@ -68,7 +73,6 @@ class SlurmReplicaRequest:
    """

    name: str
-    dir: str
    entrypoint: str
    args: List[str]
    srun_opts: Dict[str, str]
@@ -79,21 +83,25 @@ class SlurmReplicaRequest:
    def from_role(
        cls, name: str, role: Role, cfg: Mapping[str, CfgVal]
    ) -> "SlurmReplicaRequest":
-        sbatch_opts = {k: str(v) for k, v in cfg.items() if v is not None}
+        sbatch_opts = {}
+        for k, v in cfg.items():
+            if v is None:
+                continue
+            if k in SBATCH_OPTIONS:
+                sbatch_opts[k] = str(v)
        sbatch_opts.setdefault("ntasks-per-node", "1")
        resource = role.resource

        if resource != NONE:
            if resource.cpu > 0:
                sbatch_opts.setdefault("cpus-per-task", str(resource.cpu))
-            if resource.memMB > 0:
+            if not cfg.get("nomem") and resource.memMB > 0:
                sbatch_opts.setdefault("mem", str(resource.memMB))
            if resource.gpu > 0:
                sbatch_opts.setdefault("gpus-per-task", str(resource.gpu))

        return cls(
            name=name,
-            dir=role.image,
            entrypoint=role.entrypoint,
            args=list(role.args),
            sbatch_opts=sbatch_opts,
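
A rough standalone sketch of the filtering this hunk introduces; the helper name and the cfg value types are illustrative, not part of the module. Only whitelisted keys become SBATCH arguments, and "nomem" only suppresses the memory request.

# Illustrative sketch, not from the diff: how cfg keys flow into sbatch options
# under the new SBATCH_OPTIONS whitelist, and how "nomem" drops the --mem request.
from typing import Dict, Mapping, Union

CfgVal = Union[str, int, float, bool, None]  # assumed shape of config values
SBATCH_OPTIONS = {"partition", "time"}

def build_sbatch_opts(cfg: Mapping[str, CfgVal], cpu: int, mem_mb: int, gpu: int) -> Dict[str, str]:
    opts: Dict[str, str] = {}
    for k, v in cfg.items():
        if v is None:
            continue
        if k in SBATCH_OPTIONS:  # only whitelisted keys become SBATCH arguments
            opts[k] = str(v)
    opts.setdefault("ntasks-per-node", "1")
    if cpu > 0:
        opts.setdefault("cpus-per-task", str(cpu))
    if not cfg.get("nomem") and mem_mb > 0:  # "nomem" suppresses the memory request
        opts.setdefault("mem", str(mem_mb))
    if gpu > 0:
        opts.setdefault("gpus-per-task", str(gpu))
    return opts

# build_sbatch_opts({"partition": "compute", "time": "60", "nomem": True}, cpu=4, mem_mb=16000, gpu=1)
# -> {"partition": "compute", "time": "60", "ntasks-per-node": "1",
#     "cpus-per-task": "4", "gpus-per-task": "1"}   (no "mem" entry)
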
@@ -109,11 +117,9 @@ def materialize(self) -> Tuple[List[str], List[str]]:
        sbatch_args = [
            f"--job-name={self.name}",
        ] + [f"--{key}={value}" for key, value in self.sbatch_opts.items()]
-        srun_args = (
-            [f"--chdir={self.dir}"]
-            + [f"--{key}={value}" for key, value in self.srun_opts.items()]
-            + [f"--export={key}={value}" for key, value in self.env.items()]
-        )
+        srun_args = [f"--{key}={value}" for key, value in self.srun_opts.items()] + [
+            f"--export={key}={value}" for key, value in self.env.items()
+        ]

        srun_group = srun_args + [self.entrypoint] + self.args
        srun_group = [_apply_app_id_env(arg) for arg in srun_group]
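
For reference, a minimal sketch of how the argument lists from this hunk compose for one replica; the sample values are made up and _apply_app_id_env is left out.

# Illustrative only: composing the lists the way materialize() does, with made-up
# values. Note that no --chdir srun argument is emitted after this change.
name = "trainer-0"
sbatch_opts = {"partition": "compute", "ntasks-per-node": "1"}
srun_opts = {}
env = {"FOO": "bar"}
entrypoint = "python"
args = ["train.py"]

sbatch_args = [f"--job-name={name}"] + [f"--{k}={v}" for k, v in sbatch_opts.items()]
srun_args = [f"--{k}={v}" for k, v in srun_opts.items()] + [
    f"--export={k}={v}" for k, v in env.items()
]
srun_group = srun_args + [entrypoint] + args
# sbatch_args == ["--job-name=trainer-0", "--partition=compute", "--ntasks-per-node=1"]
# srun_group  == ["--export=FOO=bar", "python", "train.py"]
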
@@ -172,10 +178,14 @@ class SlurmScheduler(Scheduler):

    Logs are written to the default slurm log file.

-    Any scheduler options passed to it are added as SBATCH arguments to each
+    Some of the config options passed to it are added as SBATCH arguments to each
    replica. See https://slurm.schedmd.com/sbatch.html#SECTION_OPTIONS for info
    on the arguments.

+    Slurm jobs inherit the currently active ``conda`` or ``virtualenv`` and run
+    in the current working directory. This matches the behavior of the
+    ``local_cwd`` scheduler.
+
    For more info see:

    * https://slurm.schedmd.com/sbatch.html
@@ -219,6 +229,12 @@ def run_opts(self) -> runopts:
            default=None,
            help="The maximum time the job is allowed to run for.",
        )
+        opts.add(
+            "nomem",
+            type_=bool,
+            default=False,
+            help="disables memory request to workaround https://github.com/aws/aws-parallelcluster/issues/2198",
+        )
        return opts

    def schedule(self, dryrun_info: AppDryRunInfo[SlurmBatchRequest]) -> str:
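
A hedged end-to-end usage sketch of the new nomem option; the Role/Resource constructor arguments and the import paths are assumptions inferred from the attributes this diff references, not taken from it.

# Usage sketch under assumptions: exact Role/Resource constructor signatures and
# the import paths may differ from the real torchx API at this revision.
from torchx.specs import Resource, Role
from torchx.schedulers.slurm_scheduler import SlurmReplicaRequest

role = Role(
    name="trainer",
    image="/home/user/project",  # no longer used as --chdir; jobs run in the CWD
    entrypoint="python",
    args=["train.py"],
    resource=Resource(cpu=4, gpu=1, memMB=16000),
)

# "partition" and "time" pass the SBATCH_OPTIONS whitelist; "nomem" is not an
# SBATCH argument itself, it only suppresses the --mem request.
cfg = {"partition": "compute", "time": "60", "nomem": True}

req = SlurmReplicaRequest.from_role("trainer-0", role, cfg)
sbatch_args, srun_group = req.materialize()
print(sbatch_args)  # expect --job-name/--partition/--time/... but no --mem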