Skip to content

Commit 20597ec

Browse files
authored
Use exponentially increasing retry delays for pending runs (#2519)
1 parent 9260050 commit 20597ec

File tree

3 files changed

+75
-6
lines changed

3 files changed

+75
-6
lines changed

src/dstack/_internal/server/background/tasks/process_runs.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@
4040
from dstack._internal.utils.logging import get_logger
4141

4242
logger = get_logger(__name__)
43-
RETRY_DELAY = datetime.timedelta(seconds=15)
4443

4544

4645
async def process_runs(batch_size: int = 1):
@@ -130,11 +129,7 @@ async def _process_run(session: AsyncSession, run_model: RunModel):
130129
async def _process_pending_run(session: AsyncSession, run_model: RunModel):
131130
"""Jobs are not created yet"""
132131
run = run_model_to_run(run_model)
133-
if (
134-
run.latest_job_submission is not None
135-
and common.get_current_datetime() - run.latest_job_submission.last_processed_at
136-
< RETRY_DELAY
137-
):
132+
if not _pending_run_ready_for_resubmission(run_model, run):
138133
logger.debug("%s: pending run is not yet ready for resubmission", fmt(run_model))
139134
return
140135

@@ -183,6 +178,37 @@ async def _process_pending_run(session: AsyncSession, run_model: RunModel):
183178
logger.info("%s: run status has changed PENDING -> SUBMITTED", fmt(run_model))
184179

185180

181+
def _pending_run_ready_for_resubmission(run_model: RunModel, run: Run) -> bool:
    """
    Tell whether a pending run has waited long enough to be resubmitted.

    The time elapsed since the latest job submission was last processed is
    compared against the retry delay selected by `run_model.resubmission_attempt`.
    Returns True when the run may be resubmitted now, False when it must wait.
    """
    latest_submission = run.latest_job_submission
    if latest_submission is None:
        # Should not be possible
        return True
    elapsed = common.get_current_datetime() - latest_submission.last_processed_at
    # Ready as soon as the elapsed time reaches the delay for this attempt.
    return elapsed >= _get_retry_delay(run_model.resubmission_attempt)
191+
192+
193+
# We use exponentially increasing retry delays for pending runs.
# This prevents creation of too many job submissions for runs stuck in pending,
# e.g. when users set retry for a long period without capacity.
_PENDING_RETRY_DELAYS = [
    datetime.timedelta(seconds=15),
    datetime.timedelta(seconds=30),
    datetime.timedelta(minutes=1),
    datetime.timedelta(minutes=2),
    datetime.timedelta(minutes=5),
    datetime.timedelta(minutes=10),
]


def _get_retry_delay(resubmission_attempt: int) -> datetime.timedelta:
    """
    Return the delay to wait before resubmitting a pending run.

    Attempt 1 maps to the first entry of `_PENDING_RETRY_DELAYS`, attempt 2 to
    the second, and so on. Attempts past the end of the table keep using the
    last (longest) delay. Attempts below 1 use the first (shortest) delay.
    """
    # Clamp the index on both sides. Without the lower bound, attempt 0 would
    # produce index -1, which Python resolves to the LAST entry, i.e. the
    # longest delay instead of the shortest one.
    last_index = len(_PENDING_RETRY_DELAYS) - 1
    index = min(max(resubmission_attempt - 1, 0), last_index)
    return _PENDING_RETRY_DELAYS[index]
210+
211+
186212
async def _process_active_run(session: AsyncSession, run_model: RunModel):
187213
"""
188214
Run is submitted, provisioning, or running.
@@ -341,6 +367,11 @@ async def _process_active_run(session: AsyncSession, run_model: RunModel):
341367
)
342368
run_model.status = new_status
343369
run_model.termination_reason = termination_reason
370+
# While a run goes to pending without provisioning, resubmission_attempt increases.
371+
if new_status == RunStatus.PROVISIONING:
372+
run_model.resubmission_attempt = 0
373+
elif new_status == RunStatus.PENDING:
374+
run_model.resubmission_attempt += 1
344375

345376

346377
def _should_retry_job(run: Run, job: Job, job_model: JobModel) -> Optional[datetime.timedelta]:
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
"""Add RunModel.resubmission_attempt
2+
3+
Revision ID: 7ba3b59d7ca6
4+
Revises: 7bc2586e8b9e
5+
Create Date: 2025-04-15 18:00:35.320906
6+
7+
"""
8+
9+
import sqlalchemy as sa
10+
from alembic import op
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "7ba3b59d7ca6"
14+
down_revision = "7bc2586e8b9e"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade() -> None:
    """Add the non-nullable `runs.resubmission_attempt` column, backfilled with 0."""
    # ### commands auto generated by Alembic - please adjust! ###
    # Step 1: add the column as nullable so existing rows remain valid.
    with op.batch_alter_table("runs", schema=None) as batch_op:
        batch_op.add_column(sa.Column("resubmission_attempt", sa.Integer(), nullable=True))

    # Step 2: backfill existing rows. This is issued outside of any batch
    # context on purpose: batch operations are applied when the context exits,
    # so the column is guaranteed to exist by the time the UPDATE runs.
    op.execute("UPDATE runs SET resubmission_attempt = 0")

    # Step 3: every row now has a value, so the column can become NOT NULL.
    with op.batch_alter_table("runs", schema=None) as batch_op:
        batch_op.alter_column("resubmission_attempt", nullable=False)

    # ### end Alembic commands ###
28+
29+
30+
def downgrade() -> None:
    """Drop the `runs.resubmission_attempt` column."""
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table("runs", schema=None) as runs_batch:
        runs_batch.drop_column("resubmission_attempt")

    # ### end Alembic commands ###

src/dstack/_internal/server/models.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,9 @@ class RunModel(BaseModel):
343343
termination_reason: Mapped[Optional[RunTerminationReason]] = mapped_column(
344344
Enum(RunTerminationReason)
345345
)
346+
# resubmission_attempt counts consecutive transitions to pending without provisioning.
347+
# Can be used to choose retry delay depending on the attempt number.
348+
resubmission_attempt: Mapped[int] = mapped_column(Integer, default=0)
346349
run_spec: Mapped[str] = mapped_column(Text)
347350
service_spec: Mapped[Optional[str]] = mapped_column(Text)
348351

0 commit comments

Comments
 (0)