Skip to content

Commit 6fc2b9c

Browse files
committed
feat: Add startup, readiness, and liveness probes to K8s apps
We currently experience temporary 502s from the applications during a deployment rollout. This adds startup, readiness, and liveness checks to the pods to ensure that they are ready to serve traffic before they get incorporated into their corresponding service. This depends on upstream changes in the applications in PRs: - mitodl/mit-learn#2247 - mitodl/mitxonline#2652
1 parent a0a58a6 commit 6fc2b9c

File tree

1 file changed

+45
-0
lines changed
  • src/ol_infrastructure/components/services

1 file changed

+45
-0
lines changed

src/ol_infrastructure/components/services/k8s.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,22 +131,59 @@ class OLApplicationK8sConfig(BaseModel):
131131
init_migrations: bool = Field(default=True)
132132
init_collectstatic: bool = Field(default=True)
133133
celery_worker_configs: list[OLApplicationK8sCeleryWorkerConfig] = []
134+
probe_configs: dict[str, kubernetes.core.v1.ProbeArgs] = {
135+
# Liveness probe to check if the application is still running
136+
"liveness_probe": kubernetes.core.v1.ProbeArgs(
137+
http_get=kubernetes.core.v1.HTTPGetActionArgs(
138+
path="/health/liveness/",
139+
port=DEFAULT_UWSGI_PORT,
140+
),
141+
initial_delay_seconds=30, # Wait 30 seconds before first probe
142+
period_seconds=30,
143+
failure_threshold=3, # Consider failed after 3 attempts
144+
),
145+
# Readiness probe to check if the application is ready to serve traffic
146+
"readiness_probe": kubernetes.core.v1.ProbeArgs(
147+
http_get=kubernetes.core.v1.HTTPGetActionArgs(
148+
path="/health/readiness/",
149+
port=DEFAULT_UWSGI_PORT,
150+
),
151+
initial_delay_seconds=15, # Wait 15 seconds before first probe
152+
period_seconds=15,
153+
failure_threshold=3, # Consider failed after 3 attempts
154+
),
155+
# Startup probe to ensure the application is fully initialized before other probes start
156+
"startup_probe": kubernetes.core.v1.ProbeArgs(
157+
http_get=kubernetes.core.v1.HTTPGetActionArgs(
158+
path="/health/startup/",
159+
port=DEFAULT_UWSGI_PORT,
160+
),
161+
initial_delay_seconds=10, # Wait 10 seconds before first probe
162+
period_seconds=10, # Probe every 10 seconds
163+
failure_threshold=30, # Allow up to 5 minutes (30 * 10s) for startup
164+
success_threshold=1,
165+
timeout_seconds=5,
166+
),
167+
}
134168

135169
# See https://www.pulumi.com/docs/reference/pkg/python/pulumi/#pulumi.Output.from_input
136170
# for docs. This unwraps the value so Pydantic can store it in the config class.
137171
@field_validator("application_security_group_id")
138172
@classmethod
139173
def validate_sec_group_id(cls, application_security_group_id: Output[str]):
174+
"""Ensure that the security group ID is unwrapped from the Pulumi Output."""
140175
return Output.from_input(application_security_group_id)
141176

142177
@field_validator("application_security_group_name")
143178
@classmethod
144179
def validate_sec_group_name(cls, application_security_group_name: Output[str]):
180+
"""Ensure that the security group name is unwrapped from the Pulumi Output."""
145181
return Output.from_input(application_security_group_name)
146182

147183
@field_validator("application_config")
148184
@classmethod
149185
def validate_application_config(cls, application_config: dict[str, Any]):
186+
"""Ensure that all application config values are strings."""
150187
# Convert all values to strings because that is what k8s expects.
151188
return {key: str(value) for key, value in application_config.items()}
152189

@@ -427,6 +464,7 @@ def __init__( # noqa: C901
427464
env=application_deployment_env_vars,
428465
env_from=application_deployment_envfrom,
429466
volume_mounts=webapp_volume_mounts,
467+
**ol_app_k8s_config.probe_configs,
430468
),
431469
],
432470
),
@@ -684,6 +722,7 @@ def ensure_request_id_plugin(
684722

685723
@model_validator(mode="after")
686724
def check_backend_or_upstream(self) -> "OLApisixRouteConfig":
725+
"""Ensure that either upstream or backend service details are provided, not both."""
687726
upstream: str | None = self.upstream
688727
backend_service_name: str | None = self.backend_service_name
689728
backend_service_port: str | None = self.backend_service_port
@@ -712,6 +751,7 @@ def __init__(
712751
k8s_labels: dict[str, str],
713752
opts: pulumi.ResourceOptions | None = None,
714753
):
754+
"""Initialize the OLApisixRoute component resource."""
715755
super().__init__(
716756
"ol:infrastructure:services:k8s:OLApisixRoute", name, None, opts
717757
)
@@ -805,6 +845,7 @@ def __init__(
805845
oidc_config: OLApisixOIDCConfig,
806846
opts: pulumi.ResourceOptions | None = None,
807847
):
848+
"""Initialize the OLApisixOIDCResources component resource."""
808849
super().__init__(
809850
"ol:infrastructure:services:k8s:OLApisixOIDCResources", name, None, opts
810851
)
@@ -868,12 +909,14 @@ def __init__(
868909
)
869910

870911
def get_base_oidc_config(self, unauth_action: str) -> dict[str, Any]:
912+
"""Return the base OIDC configuration dictionary."""
871913
return {
872914
**self.base_oidc_config,
873915
"unauth_action": unauth_action,
874916
}
875917

876918
def get_full_oidc_plugin_config(self, unauth_action: str) -> dict[str, Any]:
919+
"""Return the full OIDC plugin configuration dictionary for Apisix."""
877920
return {
878921
"name": "openid-connect",
879922
"enable": True,
@@ -906,6 +949,7 @@ def __init__(
906949
plugin_config: OLApisixSharedPluginsConfig,
907950
opts: pulumi.ResourceOptions | None = None,
908951
):
952+
"""Initialize the OLApisixSharedPlugins component resource."""
909953
super().__init__(
910954
"ol:infrastructure:services:k8s:OLApisixSharedPlugin", name, None, opts
911955
)
@@ -990,6 +1034,7 @@ def __init__(
9901034
external_upstream_config: OLApisixExternalUpstreamConfig,
9911035
opts: pulumi.ResourceOptions | None = None,
9921036
):
1037+
"""Initialize the OLApisixExternalUpstream component resource."""
9931038
super().__init__(
9941039
"ol:infrastructure:services:k8s:OLApisixExternalUpstream", name, None, opts
9951040
)

0 commit comments

Comments
 (0)