Skip to content

Commit c318d83

Browse files
authored
Allow configuring Nebius InfiniBand fabrics (#2607)
Add an option in Nebius backend settings to limit the list of allowed fabrics for InfiniBand clusters. This can be useful for larger customers that have capacity reservations tied to a specific fabric.
1 parent eee869d commit c318d83

File tree

4 files changed

+75
-33
lines changed

4 files changed

+75
-33
lines changed

src/dstack/_internal/core/backends/nebius/compute.py

Lines changed: 9 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import random
33
import shlex
44
import time
5-
from dataclasses import dataclass
65
from functools import cached_property
76
from typing import List, Optional
87

@@ -21,6 +20,7 @@
2120
)
2221
from dstack._internal.core.backends.base.offers import get_catalog_offers
2322
from dstack._internal.core.backends.nebius import resources
23+
from dstack._internal.core.backends.nebius.fabrics import get_suitable_infiniband_fabrics
2424
from dstack._internal.core.backends.nebius.models import NebiusConfig, NebiusServiceAccountCreds
2525
from dstack._internal.core.errors import (
2626
BackendError,
@@ -81,24 +81,6 @@
8181
]
8282

8383

84-
@dataclass(frozen=True)
85-
class InfinibandFabric:
86-
name: str
87-
platform: str
88-
region: str
89-
90-
91-
# https://docs.nebius.com/compute/clusters/gpu#fabrics
92-
INFINIBAND_FABRICS = [
93-
InfinibandFabric("fabric-2", "gpu-h100-sxm", "eu-north1"),
94-
InfinibandFabric("fabric-3", "gpu-h100-sxm", "eu-north1"),
95-
InfinibandFabric("fabric-4", "gpu-h100-sxm", "eu-north1"),
96-
InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
97-
InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
98-
InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
99-
]
100-
101-
10284
class NebiusCompute(
10385
ComputeWithCreateInstanceSupport,
10486
ComputeWithMultinodeSupport,
@@ -280,7 +262,9 @@ def create_placement_group(
280262
backend_data = NebiusPlacementGroupBackendData(cluster=None)
281263
# Only create a Nebius cluster if the instance supports it.
282264
# For other instances, return dummy PlacementGroupProvisioningData.
283-
if fabrics := _get_suitable_infiniband_fabrics(master_instance_offer):
265+
if fabrics := get_suitable_infiniband_fabrics(
266+
master_instance_offer, allowed_fabrics=self.config.fabrics
267+
):
284268
fabric = random.choice(fabrics)
285269
op = resources.create_cluster(
286270
self._sdk,
@@ -319,7 +303,11 @@ def is_suitable_placement_group(
319303
)
320304
return (
321305
backend_data.cluster is None
322-
or backend_data.cluster.fabric in _get_suitable_infiniband_fabrics(instance_offer)
306+
or backend_data.cluster.fabric
307+
in get_suitable_infiniband_fabrics(
308+
instance_offer,
309+
allowed_fabrics=None, # enforced at cluster creation time, no need to enforce here
310+
)
323311
)
324312

325313

@@ -380,15 +368,3 @@ def _wait_for_instance(sdk: SDK, op: SDKOperation[Operation]) -> None:
380368
def _supported_instances(offer: InstanceOffer) -> bool:
381369
platform, _ = offer.instance.name.split()
382370
return platform in SUPPORTED_PLATFORMS and not offer.instance.resources.spot
383-
384-
385-
def _get_suitable_infiniband_fabrics(offer: InstanceOffer) -> list[str]:
386-
if len(offer.instance.resources.gpus) < 8:
387-
# From the create VM page in the Nebius Console:
388-
# > Only virtual machines with at least 8 NVIDIA® Hopper® H100 or H200 GPUs
389-
# > can be added to the cluster
390-
return []
391-
platform, _ = offer.instance.name.split()
392-
return [
393-
f.name for f in INFINIBAND_FABRICS if f.platform == platform and f.region == offer.region
394-
]

src/dstack/_internal/core/backends/nebius/configurator.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
)
1010
from dstack._internal.core.backends.nebius import resources
1111
from dstack._internal.core.backends.nebius.backend import NebiusBackend
12+
from dstack._internal.core.backends.nebius.fabrics import get_all_infiniband_fabrics
1213
from dstack._internal.core.backends.nebius.models import (
1314
AnyNebiusBackendConfig,
1415
NebiusBackendConfig,
@@ -38,6 +39,16 @@ def validate_config(self, config: NebiusBackendConfigWithCreds, default_creds_en
3839
fields=[["creds"]],
3940
details=str(e),
4041
)
42+
valid_fabrics = get_all_infiniband_fabrics()
43+
if invalid_fabrics := set(config.fabrics or []) - valid_fabrics:
44+
raise_invalid_credentials_error(
45+
fields=[["fabrics"]],
46+
details=(
47+
"These InfiniBand fabrics do not exist or are not known to dstack:"
48+
f" {sorted(invalid_fabrics)}. Omit `fabrics` to allow all fabrics or select"
49+
f" some of the valid options: {sorted(valid_fabrics)}"
50+
),
51+
)
4152

4253
def create_backend(
4354
self, project_name: str, config: NebiusBackendConfigWithCreds

src/dstack/_internal/core/backends/nebius/fabrics.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
from collections.abc import Container
2+
from dataclasses import dataclass
3+
from typing import Optional
4+
5+
from dstack._internal.core.models.instances import InstanceOffer
6+
7+
8+
@dataclass(frozen=True)
9+
class InfinibandFabric:
10+
name: str
11+
platform: str
12+
region: str
13+
14+
15+
# https://docs.nebius.com/compute/clusters/gpu#fabrics
16+
INFINIBAND_FABRICS = [
17+
InfinibandFabric("fabric-2", "gpu-h100-sxm", "eu-north1"),
18+
InfinibandFabric("fabric-3", "gpu-h100-sxm", "eu-north1"),
19+
InfinibandFabric("fabric-4", "gpu-h100-sxm", "eu-north1"),
20+
InfinibandFabric("fabric-5", "gpu-h200-sxm", "eu-west1"),
21+
InfinibandFabric("fabric-6", "gpu-h100-sxm", "eu-north1"),
22+
InfinibandFabric("fabric-7", "gpu-h200-sxm", "eu-north1"),
23+
]
24+
25+
26+
def get_suitable_infiniband_fabrics(
27+
offer: InstanceOffer, allowed_fabrics: Optional[Container[str]]
28+
) -> list[str]:
29+
if len(offer.instance.resources.gpus) < 8:
30+
# From the create VM page in the Nebius Console:
31+
# > Only virtual machines with at least 8 NVIDIA® Hopper® H100 or H200 GPUs
32+
# > can be added to the cluster
33+
return []
34+
platform, _ = offer.instance.name.split()
35+
return [
36+
f.name
37+
for f in INFINIBAND_FABRICS
38+
if (
39+
f.platform == platform
40+
and f.region == offer.region
41+
and (allowed_fabrics is None or f.name in allowed_fabrics)
42+
)
43+
]
44+
45+
46+
def get_all_infiniband_fabrics() -> set[str]:
47+
return {f.name for f in INFINIBAND_FABRICS}

src/dstack/_internal/core/backends/nebius/models.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,14 @@ class NebiusBackendConfig(CoreModel):
8787
Optional[list[str]],
8888
Field(description="The list of allowed Nebius regions. Omit to allow all regions"),
8989
] = None
90+
fabrics: Annotated[
91+
Optional[list[str]],
92+
Field(
93+
description=(
94+
"The list of allowed fabrics for InfiniBand clusters. Omit to allow all fabrics"
95+
)
96+
),
97+
] = None
9098

9199

92100
class NebiusBackendConfigWithCreds(NebiusBackendConfig):

0 commit comments

Comments
 (0)