Fix health check loop silently dying on uncaught exception
Wrap the check_all() call inside the loop in try/except so a transient error cannot exit the `while True` loop and freeze all health statuses. Add transition logging (HEALTHY / UNHEALTHY) so `docker logs` shows when a service changes state. Also set refetchIntervalInBackground on the frontend query so polling continues even when the browser tab is not focused. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,7 +7,7 @@ no blocking calls on the request path.
|
|||||||
"""
|
"""
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
@@ -62,6 +62,7 @@ def register_services(doc_service_url: str, ai_service_url: str) -> None:
|
|||||||
]
|
]
|
||||||
|
|
||||||
_health = {svc.id: None for svc in _REGISTRY}
|
_health = {svc.id: None for svc in _REGISTRY}
|
||||||
|
logger.info("Service registry initialised with %d services", len(_REGISTRY))
|
||||||
|
|
||||||
|
|
||||||
# ── Health check logic ────────────────────────────────────────────────────────
|
# ── Health check logic ────────────────────────────────────────────────────────
|
||||||
@@ -69,12 +70,23 @@ def register_services(doc_service_url: str, ai_service_url: str) -> None:
|
|||||||
|
|
||||||
async def _check_service(svc: ServiceDefinition) -> None:
|
async def _check_service(svc: ServiceDefinition) -> None:
|
||||||
url = f"{svc.internal_url}{svc.health_path}"
|
url = f"{svc.internal_url}{svc.health_path}"
|
||||||
|
prev = _health.get(svc.id)
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=5.0) as client:
|
async with httpx.AsyncClient(timeout=5.0) as client:
|
||||||
resp = await client.get(url)
|
resp = await client.get(url)
|
||||||
_health[svc.id] = resp.status_code == 200
|
healthy = resp.status_code == 200
|
||||||
except Exception:
|
except Exception as exc:
|
||||||
_health[svc.id] = False
|
logger.debug("Health check failed for %s: %s", svc.id, exc)
|
||||||
|
healthy = False
|
||||||
|
|
||||||
|
_health[svc.id] = healthy
|
||||||
|
|
||||||
|
# Log only on transitions so the logs stay quiet during normal operation
|
||||||
|
if prev != healthy:
|
||||||
|
if healthy:
|
||||||
|
logger.info("Service %s is now HEALTHY", svc.id)
|
||||||
|
else:
|
||||||
|
logger.warning("Service %s is now UNHEALTHY", svc.id)
|
||||||
|
|
||||||
|
|
||||||
async def check_all() -> None:
|
async def check_all() -> None:
|
||||||
@@ -83,9 +95,16 @@ async def check_all() -> None:
|
|||||||
|
|
||||||
|
|
||||||
async def health_check_loop() -> None:
|
async def health_check_loop() -> None:
|
||||||
"""Runs forever; polls every POLL_INTERVAL seconds."""
|
"""Runs forever; polls every POLL_INTERVAL seconds.
|
||||||
|
|
||||||
|
Exceptions inside a single polling round are caught so the loop cannot
|
||||||
|
be killed by a transient error.
|
||||||
|
"""
|
||||||
while True:
|
while True:
|
||||||
|
try:
|
||||||
await check_all()
|
await check_all()
|
||||||
|
except Exception:
|
||||||
|
logger.exception("Unexpected error during health check round; will retry")
|
||||||
await asyncio.sleep(POLL_INTERVAL)
|
await asyncio.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ export default function AppsPage() {
|
|||||||
queryKey: ["services"],
|
queryKey: ["services"],
|
||||||
queryFn: getServices,
|
queryFn: getServices,
|
||||||
refetchInterval: 30_000,
|
refetchInterval: 30_000,
|
||||||
|
refetchIntervalInBackground: true,
|
||||||
});
|
});
|
||||||
|
|
||||||
return (
|
return (
|
||||||
|
|||||||
Reference in New Issue
Block a user