Fix health check loop silently dying on uncaught exception

Wrap the check_all() call inside the loop in try/except so a transient error
cannot exit the `while True` loop and freeze all health statuses. Add transition
logging (HEALTHY / UNHEALTHY) so docker logs show when a service changes
state. Also add refetchIntervalInBackground on the frontend query so the
poll continues even when the browser tab is not focused.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-04-17 17:36:58 +02:00
parent 3248607790
commit 151773ab51
2 changed files with 26 additions and 6 deletions
+25 -6
View File
@@ -7,7 +7,7 @@ no blocking calls on the request path.
"""
import asyncio
import logging
from dataclasses import dataclass, field
from dataclasses import dataclass
import httpx
@@ -62,6 +62,7 @@ def register_services(doc_service_url: str, ai_service_url: str) -> None:
]
_health = {svc.id: None for svc in _REGISTRY}
logger.info("Service registry initialised with %d services", len(_REGISTRY))
# ── Health check logic ────────────────────────────────────────────────────────
@@ -69,12 +70,23 @@ def register_services(doc_service_url: str, ai_service_url: str) -> None:
async def _check_service(svc: ServiceDefinition) -> None:
    """Probe one service's health endpoint and record the result.

    Sends a GET to ``svc.internal_url + svc.health_path`` with a 5-second
    timeout. The service counts as healthy only when the response status is
    exactly 200; any exception raised while making the request counts as
    unhealthy. The outcome is written to ``_health[svc.id]`` and a log line
    is emitted only when it differs from the previously stored value.
    """
    url = f"{svc.internal_url}{svc.health_path}"
    # Snapshot the stored status before overwriting it so a change can be
    # detected below.
    prev = _health.get(svc.id)
    try:
        async with httpx.AsyncClient(timeout=5.0) as client:
            resp = await client.get(url)
            healthy = resp.status_code == 200
    except Exception as exc:
        # Best-effort probe: a failed request is an "unhealthy" result, not
        # an error to propagate. Details go to DEBUG to keep normal logs quiet.
        logger.debug("Health check failed for %s: %s", svc.id, exc)
        healthy = False

    _health[svc.id] = healthy

    # Log only on transitions so the logs stay quiet during normal operation.
    # A stored value of None (no result yet) also compares unequal, so the
    # first result for a service is logged as well.
    if prev != healthy:
        if healthy:
            logger.info("Service %s is now HEALTHY", svc.id)
        else:
            logger.warning("Service %s is now UNHEALTHY", svc.id)
async def check_all() -> None:
@@ -83,9 +95,16 @@ async def check_all() -> None:
async def health_check_loop() -> None:
    """Runs forever; polls every POLL_INTERVAL seconds.

    Exceptions inside a single polling round are caught so the loop cannot
    be killed by a transient error.
    """
    while True:
        try:
            await check_all()
        except Exception:
            # Catching Exception (not BaseException) leaves
            # asyncio.CancelledError uncaught on Python 3.8+, so cancelling
            # the task still stops the loop.
            logger.exception("Unexpected error during health check round; will retry")
        # Sleep outside the try so the interval is honoured after a failed
        # round too, instead of retrying in a tight loop.
        await asyncio.sleep(POLL_INTERVAL)
+1
View File
@@ -32,6 +32,7 @@ export default function AppsPage() {
queryKey: ["services"],
queryFn: getServices,
refetchInterval: 30_000,
refetchIntervalInBackground: true,
});
return (