Add priority queue to ai-service and STATUS.md workflow

- Introduce async priority queue service in ai-service; all /chat calls now route through it
- Refactor chat router to separate execute_chat (core logic) from the HTTP handler
- Add /queue endpoints (status, pause, resume, cancel) for queue management
- Update ai-service config to use Pydantic v2 model_config style
- Add STATUS.md files for backend, ai-service, doc-service, and frontend
- Document STATUS.md workflow in CLAUDE.md
- Update doc-service documents router and schemas; frontend DocumentsPage and API client

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 22:58:10 +02:00
parent d2495190a9
commit c4f0c7ad49
18 changed files with 1253 additions and 35 deletions
@@ -1,3 +1,10 @@
+"""
+POST /chat — synchronous chat endpoint.
+
+All requests are submitted to the priority queue at NORMAL priority and the caller
+waits for the result. This keeps the contract identical to the original endpoint
+while ensuring all AI traffic flows through one ordered queue.
+"""
 import asyncio
 import re

@@ -21,8 +28,11 @@ def _strip_fences(text: str) -> str:
    return m.group(1).strip() if m else text.strip()


-@router.post("/chat", response_model=ChatResponse)
-async def chat(request: ChatRequest) -> ChatResponse:
+async def execute_chat(request: ChatRequest) -> ChatResponse:
+    """
+    Core provider call — invoked by the queue worker.
+    Raises HTTPException on provider errors so the queue worker stores the message.
+    """
    config = await load_ai_config()

    provider_name = config.get("provider", "lmstudio")
@@ -36,7 +46,6 @@ async def chat(request: ChatRequest) -> ChatResponse:

    timeout = config.get("timeout_seconds", 60)
    max_retries = config.get("max_retries", 2)
-    last_exc: Exception | None = None

    for attempt in range(max_retries + 1):
        try:
@@ -46,11 +55,8 @@ async def chat(request: ChatRequest) -> ChatResponse:
            )
            break
        except asyncio.TimeoutError as exc:
-            last_exc = exc
-            # Don't retry on timeout — the model is busy; fail fast
            raise HTTPException(status_code=504, detail="AI provider timed out") from exc
        except (AnthropicConnError, OpenAIConnError) as exc:
-            last_exc = exc
            if attempt < max_retries:
                await asyncio.sleep(0.5 * (attempt + 1))
                continue
@@ -68,3 +74,28 @@ async def chat(request: ChatRequest) -> ChatResponse:
        input_tokens=input_tokens,
        output_tokens=output_tokens,
    )
+
+
+@router.post("/chat", response_model=ChatResponse)
+async def chat(request: ChatRequest) -> ChatResponse:
+    """
+    Submit the request at NORMAL priority and wait until the queue processes the job.
+
+    If the queue is paused or stopped, the call waits until it is resumed or until
+    the wait times out (configured ``timeout_seconds`` plus a 5 second buffer).
+
+    Raises:
+        HTTPException: 504 when the wait times out, 503 when the job's future was
+            cancelled, 502 for any other exception delivered through the future.
+            An HTTPException delivered through the future propagates unchanged.
+        asyncio.CancelledError: when this handler's own task is cancelled.
+    """
+    from app.services.queue import Priority, queue_service  # deferred — avoids circular import
+
+    # Load the config before enqueueing: if the config load fails, no job has
+    # been queued yet, so nothing is left behind without a waiter.
+    config = await load_ai_config()
+    timeout = float(config.get("timeout_seconds", 60)) + 5.0  # +5s buffer over provider timeout
+    job = await queue_service.enqueue(request, Priority.NORMAL)
+
+    try:
+        # shield() keeps the wait_for timeout from cancelling job.future directly;
+        # the job is cancelled explicitly through the queue service instead.
+        return await asyncio.wait_for(asyncio.shield(job.future), timeout=timeout)
+    except asyncio.TimeoutError as exc:
+        queue_service.cancel_job(job.id)
+        raise HTTPException(status_code=504, detail="Timed out waiting for queue to process job") from exc
+    except asyncio.CancelledError as exc:
+        # NOTE(review): assumes job.future is an asyncio.Future — confirm in queue service.
+        if job.future.cancelled():
+            # The job itself was cancelled (its future is in the cancelled state).
+            raise HTTPException(status_code=503, detail="Job was cancelled") from exc
+        # Otherwise this handler's own task is being cancelled: drop the job, as the
+        # timeout path does, and let the cancellation propagate instead of masking it.
+        queue_service.cancel_job(job.id)
+        raise
+    except HTTPException:
+        # Already carries the intended status code and detail.
+        raise
+    except Exception as exc:
+        raise HTTPException(status_code=502, detail=str(exc)) from exc