Fix health check loop silently dying on uncaught exception
Wrap the check_all() call inside the loop in try/except so that a transient error cannot break out of the `while True` loop and freeze all health statuses. Add transition logging (HEALTHY / UNHEALTHY) so that the docker logs show when a service changes state. Also set refetchIntervalInBackground on the frontend query so that polling continues even when the browser tab is not focused.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,7 +7,7 @@ no blocking calls on the request path.
|
||||
"""
|
||||
import asyncio
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
|
||||
import httpx
|
||||
|
||||
@@ -62,6 +62,7 @@ def register_services(doc_service_url: str, ai_service_url: str) -> None:
|
||||
]
|
||||
|
||||
_health = {svc.id: None for svc in _REGISTRY}
|
||||
logger.info("Service registry initialised with %d services", len(_REGISTRY))
|
||||
|
||||
|
||||
# ── Health check logic ────────────────────────────────────────────────────────
|
||||
@@ -69,12 +70,23 @@ def register_services(doc_service_url: str, ai_service_url: str) -> None:
|
||||
|
||||
async def _check_service(svc: ServiceDefinition) -> None:
|
||||
url = f"{svc.internal_url}{svc.health_path}"
|
||||
prev = _health.get(svc.id)
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||
resp = await client.get(url)
|
||||
_health[svc.id] = resp.status_code == 200
|
||||
except Exception:
|
||||
_health[svc.id] = False
|
||||
healthy = resp.status_code == 200
|
||||
except Exception as exc:
|
||||
logger.debug("Health check failed for %s: %s", svc.id, exc)
|
||||
healthy = False
|
||||
|
||||
_health[svc.id] = healthy
|
||||
|
||||
# Log only on transitions so the logs stay quiet during normal operation
|
||||
if prev != healthy:
|
||||
if healthy:
|
||||
logger.info("Service %s is now HEALTHY", svc.id)
|
||||
else:
|
||||
logger.warning("Service %s is now UNHEALTHY", svc.id)
|
||||
|
||||
|
||||
async def check_all() -> None:
|
||||
@@ -83,9 +95,16 @@ async def check_all() -> None:
|
||||
|
||||
|
||||
async def health_check_loop() -> None:
|
||||
"""Runs forever; polls every POLL_INTERVAL seconds."""
|
||||
"""Runs forever; polls every POLL_INTERVAL seconds.
|
||||
|
||||
Exceptions inside a single polling round are caught so the loop cannot
|
||||
be killed by a transient error.
|
||||
"""
|
||||
while True:
|
||||
await check_all()
|
||||
try:
|
||||
await check_all()
|
||||
except Exception:
|
||||
logger.exception("Unexpected error during health check round; will retry")
|
||||
await asyncio.sleep(POLL_INTERVAL)
|
||||
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ export default function AppsPage() {
|
||||
queryKey: ["services"],
|
||||
queryFn: getServices,
|
||||
refetchInterval: 30_000,
|
||||
refetchIntervalInBackground: true,
|
||||
});
|
||||
|
||||
return (
|
||||
|
||||
Reference in New Issue
Block a user