Refine health checks and add polling animation

This commit is contained in:
2025-11-27 11:43:35 +01:00
parent c841e27c30
commit 25d80f5723
18 changed files with 499 additions and 219 deletions

View File

@@ -0,0 +1 @@
"""Domain operation modules."""

View File

@@ -0,0 +1 @@
"""Health feature package."""

View File

@@ -0,0 +1,74 @@
"""Health endpoints."""
from fastapi import APIRouter, Depends, HTTPException
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse, PlainTextResponse

from core.config import Settings, get_settings
from operations.health.schemas import HealthStatus
from operations.health.service import get_detailed_health, readiness_check
# Shared router for the health endpoints: every route registered on it is
# served under the "/health" prefix and tagged "health".
router = APIRouter(prefix="/health", tags=["health"])
# With the router's "/health" prefix this handler answers GET /health/live
# and replies as plain text rather than JSON.
@router.get(
    "/live",
    summary="Liveness Probe",
    response_class=PlainTextResponse,
    status_code=200,
)
def liveness() -> str:
    """
    **Liveness Probe:** Confirms the application process is running and responsive.

    This endpoint is used by automated systems (like Kubernetes) to determine if
    the instance should be kept running or restarted. It must be extremely lightweight,
    performing no deep checks on external dependencies.

    **Success Response:** HTTP 200 OK with "live" body.

    **Failure Response:** Endpoint timeout (no response).
    """
    # Constant reply: no dependency is consulted here, so an outage elsewhere
    # cannot make this probe fail.
    return "live"
@router.get(
    "/ready",
    summary="Readiness Probe",
    response_class=PlainTextResponse,
    status_code=200,
)
async def readiness() -> str:
    """
    **Readiness Probe:** Determines if the application can accept user traffic.

    This endpoint is used by load balancers to route traffic. It performs deep checks
    on all critical dependencies (e.g., database, message queue).

    **Success Response:** HTTP 200 OK with "ready" body.

    **Failure Response:** HTTP 503 Service Unavailable if any critical dependency fails.
    """
    # Success path first: every critical dependency check passed.
    if await readiness_check():
        return "ready"

    # Any failing dependency is reported to the caller as 503.
    raise HTTPException(status_code=503, detail="not ready")
@router.get(
    "",
    summary="Detailed Health Status Page",
    response_model=HealthStatus,
    status_code=200,
    # Document that the failure response carries the same payload shape.
    responses={
        503: {
            "model": HealthStatus,
            "description": "One or more health checks did not pass.",
        }
    },
)
async def detailed_health(
    settings: Settings = Depends(get_settings),
) -> HealthStatus | JSONResponse:
    """
    **Detailed Status Page:** Provides granular health information for human operators.

    This endpoint runs all readiness checks and returns a structured JSON object.
    The top-level HTTP status code reflects the overall application health (200 OK or 503 Service Unavailable).
    """
    report = await get_detailed_health(settings)

    if report.status == "pass":
        # Healthy: FastAPI serialises the model through response_model with 200.
        return report

    # Unhealthy: keep the full per-component report in the body so operators
    # can see which check failed, and signal the failure through the status
    # code. Raising HTTPException here would replace the report with a bare
    # {"detail": ...} message.
    return JSONResponse(status_code=503, content=jsonable_encoder(report))

View File

@@ -0,0 +1,23 @@
"""Pydantic schemas for health responses."""
from datetime import datetime
from pydantic import BaseModel
class ComponentCheck(BaseModel):
    """Result of checking a single dependency."""

    # Identifier of the checked component.
    name: str
    # pass | warn | fail
    status: str
    # Timestamp attached to the result; optional.
    time: datetime | None = None
    # Optional free-text detail about the outcome.
    output: str | None = None
    # Optional observed measurement and the unit it is expressed in.
    observedValue: float | int | None = None
    observedUnit: str | None = None
class HealthStatus(BaseModel):
    """Overall health report: an aggregate status, service metadata and per-component checks."""

    # pass | warn | fail
    status: str
    # Optional descriptive metadata about the running service.
    version: str | None = None
    environment: str | None = None
    serviceName: str | None = None
    description: str | None = None
    # Individual check results, keyed by a label for each component (required).
    checks: dict[str, ComponentCheck]

View File

@@ -0,0 +1,89 @@
import asyncio
from datetime import datetime, timezone
from core.config import Settings
from operations.health.schemas import ComponentCheck, HealthStatus
async def check_database_status() -> ComponentCheck:
    """Probe the primary database and report the outcome.

    The probe is bounded by a timeout. Errors are not raised to the caller:
    a timeout or any other exception is converted into a "fail" result whose
    ``output`` carries the reason.

    Returns:
        A ComponentCheck named "postgres" with status "pass" or "fail".
    """
    component = "postgres"
    try:
        # Stand-in for the real database call: a 42 ms sleep capped at 100 ms.
        # NOTE: the observed value below is the same constant, not a measurement.
        await asyncio.wait_for(asyncio.sleep(0.042), timeout=0.1)
        return ComponentCheck(
            name=component,
            status="pass",
            time=datetime.now(timezone.utc),
            observedValue=42,
            observedUnit="ms",
        )
    except asyncio.TimeoutError:
        failure_reason = "Check timed out"
    except Exception as exc:
        failure_reason = str(exc)

    # Both failure paths share the same result shape; only the reason differs.
    return ComponentCheck(
        name=component,
        status="fail",
        time=datetime.now(timezone.utc),
        output=failure_reason,
    )
async def check_media_server_status() -> ComponentCheck:
    """Probe the media server and report the outcome.

    The probe is bounded by a timeout. Errors are not raised to the caller:
    a timeout or any other exception is converted into a "fail" result whose
    ``output`` carries the reason.

    Returns:
        A ComponentCheck named "livekit" with status "pass" or "fail".
    """
    component = "livekit"
    try:
        # Stand-in for the real media-server call: a 20 ms sleep capped at 50 ms.
        # NOTE: the observed value below is the same constant, not a measurement.
        await asyncio.wait_for(asyncio.sleep(0.02), timeout=0.05)
        return ComponentCheck(
            name=component,
            status="pass",
            time=datetime.now(timezone.utc),
            observedValue=20,
            observedUnit="ms",
        )
    except asyncio.TimeoutError:
        failure_reason = "Check timed out"
    except Exception as exc:
        failure_reason = str(exc)

    # Both failure paths share the same result shape; only the reason differs.
    return ComponentCheck(
        name=component,
        status="fail",
        time=datetime.now(timezone.utc),
        output=failure_reason,
    )
async def readiness_check() -> bool:
    """Return True only when every critical dependency check passes.

    Extend this function as dependencies are added; it must keep returning
    a plain True/False.

    Returns:
        True if all checks report status "pass", otherwise False.
    """
    # The checks are independent of each other, so run them concurrently:
    # the probe then takes as long as the slowest check instead of the sum
    # of all of them. Each check converts its own errors into a "fail"
    # result, so gather() is not expected to raise on a failing dependency.
    results = await asyncio.gather(
        check_database_status(),
        check_media_server_status(),
    )
    return all(check.status == "pass" for check in results)
async def get_detailed_health(settings: Settings) -> HealthStatus:
    """Build the detailed health payload aligned with common status formats.

    Args:
        settings: Application settings; ``version``, ``environment``,
            ``service_name`` and ``title`` are copied into the report.

    Returns:
        A HealthStatus whose ``status`` is "pass" when every check passed and
        "fail" otherwise, together with the individual check results.
    """
    # The checks are independent of each other, so run them concurrently:
    # the report then takes as long as the slowest check instead of the sum
    # of all of them. gather() returns results in argument order.
    db, media_server = await asyncio.gather(
        check_database_status(),
        check_media_server_status(),
    )
    checks = {
        "Database": db,
        "Media Server": media_server,
    }

    # Any check that is not an outright "pass" makes the whole report "fail".
    all_passed = all(check.status == "pass" for check in checks.values())

    return HealthStatus(
        status="pass" if all_passed else "fail",
        version=settings.version,
        environment=settings.environment,
        serviceName=settings.service_name,
        description=settings.title,
        checks=checks,
    )