feat(01-05): wire main.py lifespan+health and rewrite documents+topics to async session

- Rewrite main.py lifespan: MinIO client created at startup, docuvault bucket auto-created if missing, stored on app.state.minio; engine.dispose() on shutdown
- Extend /health endpoint: probes PostgreSQL (SELECT 1) and MinIO (bucket_exists) returning {"status": "ok"|"degraded", "checks": {"postgres": ..., "minio": ...}}
- Rewrite api/documents.py: all routes inject session: AsyncSession = Depends(get_db); save_upload/save_metadata/list_metadata/get_metadata/delete_document all async; upload handler queues extract_and_classify.delay() instead of inline classification; /classify endpoint retains synchronous await classifier.classify_document() for backward-compatible immediate response
- Rewrite api/topics.py: all routes inject session dependency; all storage calls are async with session parameter; Pydantic models TopicCreate/TopicUpdate/SuggestRequest preserved verbatim
2026-05-22 09:47:00 +02:00
parent 32d67de1ca
commit c1931fd566
3 changed files with 120 additions and 42 deletions
@@ -1,6 +1,11 @@
 from datetime import datetime, timezone
-from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Query
-from services import storage, extractor, classifier
+
+from fastapi import APIRouter, Depends, File, Form, HTTPException, Query, UploadFile
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from deps.db import get_db
+from services import classifier, extractor, storage
+from tasks.document_tasks import extract_and_classify

 router = APIRouter(prefix="/api/documents", tags=["documents"])

@@ -22,6 +27,7 @@ ALLOWED_MIME_TYPES = {
 async def upload_document(
    file: UploadFile = File(...),
    auto_classify: bool = Form(True),
+    session: AsyncSession = Depends(get_db),
 ):
    content = await file.read()
    if len(content) == 0:
@@ -29,8 +35,10 @@ async def upload_document(

    mime = file.content_type or "application/octet-stream"

-    saved = storage.save_upload(content, file.filename or "upload", mime)
-    text = extractor.extract_text(saved["path"], mime)
+    saved = await storage.save_upload(session, content, file.filename or "upload", mime)
+
+    # Extract text from the in-memory bytes (avoid a second MinIO round-trip at upload time)
+    text = extractor.extract_text_from_bytes(content, mime)

    now = datetime.now(timezone.utc).isoformat()
    meta = {
@@ -40,20 +48,20 @@ async def upload_document(
        "mime_type": mime,
        "size_bytes": len(content),
        "extracted_text": text,
+        # Phase 1 cutover: topics are empty at upload time; the Celery worker
+        # fills them in asynchronously after extract_and_classify completes.
        "topics": [],
        "created_at": now,
        "classified_at": None,
    }
-    storage.save_metadata(meta)
+    await storage.save_metadata(session, meta)

    if auto_classify:
-        try:
-            topics = await classifier.classify_document(saved["id"])
-            meta["topics"] = topics
-            meta["classified_at"] = datetime.now(timezone.utc).isoformat()
-        except Exception as e:
-            # Classification failure is non-fatal; document is still saved
-            meta["classification_error"] = str(e)
+        # Queue the extract+classify task on the Celery documents queue (STORE-08).
+        # The task re-fetches bytes from MinIO, extracts text, and classifies.
+        # The upload response returns topics=[] — polling GET /api/documents/{id}
+        # will show the populated topics once the worker completes.
+        extract_and_classify.delay(str(saved["id"]))

    return meta

@@ -63,38 +71,46 @@ async def list_documents(
    topic: str | None = Query(None),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
+    session: AsyncSession = Depends(get_db),
 ):
-    docs = storage.list_metadata(topic=topic)
+    docs = await storage.list_metadata(session, topic=topic)
    total = len(docs)
    start = (page - 1) * per_page
    return {"items": docs[start : start + per_page], "total": total, "page": page, "per_page": per_page}


@router.get("/{doc_id}")
-async def get_document(doc_id: str):
-    meta = storage.get_metadata(doc_id)
+async def get_document(doc_id: str, session: AsyncSession = Depends(get_db)):
+    meta = await storage.get_metadata(session, doc_id)
    if meta is None:
        raise HTTPException(404, "Document not found")
    return meta


@router.delete("/{doc_id}")
-async def delete_document(doc_id: str):
-    ok = storage.delete_document(doc_id)
+async def delete_document(doc_id: str, session: AsyncSession = Depends(get_db)):
+    ok = await storage.delete_document(session, doc_id)
    if not ok:
        raise HTTPException(404, "Document not found")
    return {"success": True}


@router.post("/{doc_id}/classify")
-async def classify_document(doc_id: str, body: dict = {}):
-    meta = storage.get_metadata(doc_id)
+async def classify_document(
+    doc_id: str,
+    body: dict = {},
+    session: AsyncSession = Depends(get_db),
+):
+    meta = await storage.get_metadata(session, doc_id)
    if meta is None:
        raise HTTPException(404, "Document not found")

    topic_names = body.get("topics") if body else None
    try:
-        topics = await classifier.classify_document(doc_id, topic_names)
+        # The /classify endpoint calls classifier synchronously and returns the
+        # topic list immediately — this preserves the historical behavior.
+        # The upload-time path uses Celery .delay() instead (Phase 1 cutover).
+        topics = await classifier.classify_document(session, doc_id, topic_names)
    except Exception as e:
        raise HTTPException(500, f"Classification failed: {e}")