Refine health checks and add polling animation
This commit is contained in:
1
app/backend/operations/health/__init__.py
Normal file
1
app/backend/operations/health/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Health feature package."""
|
||||
74
app/backend/operations/health/router.py
Normal file
74
app/backend/operations/health/router.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""Health endpoints."""
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from fastapi.responses import PlainTextResponse
|
||||
|
||||
from core.config import Settings, get_settings
|
||||
from operations.health.schemas import HealthStatus
|
||||
from operations.health.service import get_detailed_health, readiness_check
|
||||
|
||||
router = APIRouter(prefix="/health", tags=["health"])
|
||||
|
||||
|
||||
# Liveness endpoint: answers GET /health/live with the plain-text body "live".
# NOTE(review): FastAPI normally publishes the handler docstring as the
# operation description in the OpenAPI docs, so the docstring below is kept
# as user-facing text - confirm before rewording it.
@router.get(
    "/live",
    summary="Liveness Probe",
    response_class=PlainTextResponse,
    status_code=200,
)
def liveness() -> str:
    """
    **Liveness Probe:** Confirms the application process is running and responsive.

    This endpoint is used by automated systems (like Kubernetes) to determine if
    the instance should be kept running or restarted. It must be extremely lightweight,
    performing no deep checks on external dependencies.

    **Success Response:** HTTP 200 OK with "live" body.
    **Failure Response:** Endpoint timeout (no response).
    """
    # Constant reply: no dependency is consulted, so a response only proves
    # that the process can still serve requests.
    return "live"
|
||||
|
||||
|
||||
@router.get(
    "/ready",
    summary="Readiness Probe",
    response_class=PlainTextResponse,
    status_code=200,
)
async def readiness() -> str:
    """
    **Readiness Probe:** Determines if the application can accept user traffic.

    This endpoint is used by load balancers to route traffic. It performs deep checks
    on all critical dependencies (e.g., database, message queue).

    **Success Response:** HTTP 200 OK with "ready" body.
    **Failure Response:** HTTP 503 Service Unavailable if any critical dependency fails.
    """
    # Guard clause: any failing dependency turns the probe into a 503.
    if not await readiness_check():
        raise HTTPException(status_code=503, detail="not ready")

    return "ready"
|
||||
|
||||
|
||||
@router.get(
    "",
    summary="Detailed Health Status Page",
    response_model=HealthStatus,
    status_code=200,
)
async def detailed_health(settings: Settings = Depends(get_settings)) -> HealthStatus:
    """
    **Detailed Status Page:** Provides granular health information for human operators.

    This endpoint runs all readiness checks and returns a structured JSON object.
    The top-level HTTP status code reflects the overall application health (200 OK or 503 Service Unavailable).
    """
    # Local is named `report` so it does not shadow this handler's own name.
    report = await get_detailed_health(settings)

    # Healthy path: hand the structured payload straight back (HTTP 200).
    if report.status == "pass":
        return report

    # NOTE(review): on failure the per-component details in `report` are
    # dropped and only the generic "not ready" detail is sent with the 503.
    raise HTTPException(status_code=503, detail="not ready")
|
||||
23
app/backend/operations/health/schemas.py
Normal file
23
app/backend/operations/health/schemas.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""Pydantic schemas for health responses."""
|
||||
|
||||
from datetime import datetime
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Result of probing a single dependency; instances are the values of
# HealthStatus.checks. Only `name` and `status` are required.
class ComponentCheck(BaseModel):
    # Identifier of the probed component (the service layer uses e.g.
    # "postgres" and "livekit").
    name: str
    # pass | warn | fail
    status: str
    # When the check result was produced; the service layer fills this with
    # a timezone-aware UTC timestamp.
    time: datetime | None = None
    # Free-text detail, populated by the service layer on failure
    # (timeout message or the exception text).
    output: str | None = None
    # Optional measurement attached to a passing check, together with its unit
    # (the service layer reports values in "ms").
    observedValue: float | int | None = None
    observedUnit: str | None = None
|
||||
|
||||
# Top-level payload of the detailed health endpoint: an overall verdict,
# optional service metadata, and one ComponentCheck per dependency.
class HealthStatus(BaseModel):
    # pass | warn | fail
    status: str
    # Service metadata; the service layer copies these from the application
    # settings (version, environment, service_name, title respectively).
    version: str | None = None
    environment: str | None = None
    serviceName: str | None = None
    description: str | None = None
    # Per-dependency results keyed by a display label
    # (e.g. "Database", "Media Server").
    checks: dict[str, ComponentCheck]
|
||||
89
app/backend/operations/health/service.py
Normal file
89
app/backend/operations/health/service.py
Normal file
@@ -0,0 +1,89 @@
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from core.config import Settings
|
||||
from operations.health.schemas import ComponentCheck, HealthStatus
|
||||
|
||||
|
||||
async def check_database_status() -> ComponentCheck:
    """Probe the primary database connection, bounded by a timeout.

    Returns:
        A ``ComponentCheck`` named ``"postgres"``. Its status is ``"pass"``
        with an observed value in ms when the probe completes in time, and
        ``"fail"`` with an explanatory ``output`` when the probe times out
        or raises.
    """
    # Fields that differ between outcomes; start from the success case.
    # NOTE: the 42 ms figure is a placeholder matching the simulated delay,
    # not a measured latency.
    outcome: dict[str, object] = {
        "status": "pass",
        "observedValue": 42,
        "observedUnit": "ms",
    }

    try:
        # Simulate an async DB call (replace with actual logic)
        await asyncio.wait_for(asyncio.sleep(0.042), timeout=0.1)
    except asyncio.TimeoutError:
        outcome = {"status": "fail", "output": "Check timed out"}
    except Exception as exc:
        # Any other error is reported as a failed check instead of escaping
        # the health probe.
        outcome = {"status": "fail", "output": str(exc)}

    return ComponentCheck(
        name="postgres",
        time=datetime.now(timezone.utc),
        **outcome,
    )
|
||||
|
||||
async def check_media_server_status() -> ComponentCheck:
    """Probe the media server connection, bounded by a timeout.

    Returns:
        A ``ComponentCheck`` named ``"livekit"``. Its status is ``"pass"``
        with an observed value in ms when the probe completes in time, and
        ``"fail"`` with an explanatory ``output`` when the probe times out
        or raises.
    """
    # Fields that differ between outcomes; start from the success case.
    # NOTE: the 20 ms figure is a placeholder matching the simulated delay,
    # not a measured latency.
    outcome: dict[str, object] = {
        "status": "pass",
        "observedValue": 20,
        "observedUnit": "ms",
    }

    try:
        # Simulate a successful async queue call
        await asyncio.wait_for(asyncio.sleep(0.02), timeout=0.05)
    except asyncio.TimeoutError:
        outcome = {"status": "fail", "output": "Check timed out"}
    except Exception as exc:
        # Any other error is reported as a failed check instead of escaping
        # the health probe.
        outcome = {"status": "fail", "output": str(exc)}

    return ComponentCheck(
        name="livekit",
        time=datetime.now(timezone.utc),
        **outcome,
    )
|
||||
|
||||
async def readiness_check() -> bool:
    """
    Report whether every critical dependency check currently passes.

    *Extend readiness_check() as dependencies are added; it must return True/False.*

    Returns:
        True only when all component checks report the status "pass";
        False as soon as any of them reports anything else.
    """
    # The probes are independent of each other, so run them concurrently:
    # the readiness latency is then bounded by the slowest probe rather than
    # by the sum of all probe durations.
    results = await asyncio.gather(
        check_database_status(),
        check_media_server_status(),
    )

    # Ready means every single check passed.
    return all(check.status == "pass" for check in results)
|
||||
|
||||
async def get_detailed_health(settings: Settings) -> HealthStatus:
    """
    Build detailed health payload aligned with common status formats.

    status: pass | warn | fail

    Args:
        settings: Application settings; ``version``, ``environment``,
            ``service_name`` and ``title`` are copied into the payload.

    Returns:
        A ``HealthStatus`` whose ``checks`` mapping holds one entry per
        dependency and whose ``status`` is "pass" only when every entry
        passed, otherwise "fail" ("warn" is never produced here).
    """
    # The probes are independent, so run them concurrently instead of
    # paying for each timeout window back to back.
    db, media_server = await asyncio.gather(
        check_database_status(),
        check_media_server_status(),
    )

    checks = {
        "Database": db,
        "Media Server": media_server,
    }

    # Derive the verdict from the mapping itself so a newly added check
    # cannot be left out of the overall status by accident.
    all_passed = all(check.status == "pass" for check in checks.values())
    overall = "pass" if all_passed else "fail"

    return HealthStatus(
        status=overall,
        version=settings.version,
        environment=settings.environment,
        serviceName=settings.service_name,
        description=settings.title,
        checks=checks,
    )
|
||||
Reference in New Issue
Block a user