feat(01-05): introduce celery_app + tasks/document_tasks + session-aware classifier

- Add backend/celery_app.py: Celery("docuvault") with Redis broker, JSON serialization, and tasks.document_tasks.* routed to documents queue; reads REDIS_URL directly from os.environ (no config import — Pitfall 7)
- Add backend/tasks/__init__.py: empty package marker
- Add backend/tasks/document_tasks.py: sync extract_and_classify Celery task that calls asyncio.run(_run()) to retrieve bytes from MinIO, extract text via extractor, and classify via classifier; classification failure is non-fatal
- Update backend/services/classifier.py: classify_document and suggest_topics_for_document now accept session: AsyncSession as first arg; all storage.* calls updated to async session-injection pattern
- Add extract_text_from_bytes helper to services/extractor.py for bytes-based extraction (used by Celery worker, which retrieves bytes from MinIO)
2026-05-22 09:45:33 +02:00
parent 5d21c6f588
commit 32d67de1ca
5 changed files with 185 additions and 7 deletions
@@ -0,0 +1,94 @@
+"""
+Celery tasks for document processing in DocuVault.
+
+extract_and_classify — called via .delay(document_id) by the upload handler.
+The task is a plain sync def (Celery workers have no asyncio event loop); it
+bridges into the async service layer via asyncio.run().
+
+Flow:
+  1. Open a fresh AsyncSession (one per task invocation — never share sessions)
+  2. Look up the Document row to get the MinIO object_key
+  3. Retrieve file bytes from MinIO via the storage backend
+  4. Extract text from bytes using services.extractor
+  5. Persist extracted_text back to the Document row
+  6. Call services.classifier.classify_document to assign topics
+  7. Return a result dict (never raises — classification failures are non-fatal)
+"""
+import asyncio
+
+from celery_app import celery_app
+
+
+@celery_app.task(name="tasks.document_tasks.extract_and_classify")
+def extract_and_classify(document_id: str) -> dict:
+    """Run the extract-and-classify pipeline for one document.
+
+    Plain synchronous Celery entry point: it builds the ``_run`` coroutine
+    for ``document_id`` and drives it to completion with ``asyncio.run``,
+    returning whatever result dict the coroutine produces.
+    """
+    pipeline = _run(document_id)
+    return asyncio.run(pipeline)
+
+
+async def _run(document_id: str) -> dict:
+    """Async body of extract_and_classify.
+
+    Opens its own AsyncSession (not shared with the upload request), loads the
+    Document row, fetches the object bytes from the storage backend, stores
+    the extracted text on the row, and then asks the classifier for topics.
+
+    Args:
+        document_id: String form of the Document primary key (a UUID).
+
+    Returns:
+        A dict that always carries ``document_id`` and ``status``. ``status``
+        is one of ``invalid_id``, ``not_found``, ``missing_object``,
+        ``extract_failed``, ``classified`` or ``classification_failed``.
+        ``extract_failed`` and ``classification_failed`` results also carry
+        ``error``; a ``classified`` result carries ``topics``.
+    """
+    import logging
+    import uuid as _uuid
+
+    from db.session import AsyncSessionLocal
+    from db.models import Document
+    from services import extractor, classifier
+    from storage import get_storage_backend
+
+    # Validate the id before opening a session. uuid.UUID raises TypeError or
+    # AttributeError (not ValueError) for non-string payloads such as None or
+    # an int, so catch those too instead of letting the task crash.
+    try:
+        doc_uuid = _uuid.UUID(document_id)
+    except (ValueError, TypeError, AttributeError):
+        return {"document_id": document_id, "status": "invalid_id"}
+
+    async with AsyncSessionLocal() as session:
+        # ── Step 1: fetch Document row ─────────────────────────────────────────
+        doc = await session.get(Document, doc_uuid)
+        if doc is None:
+            return {"document_id": document_id, "status": "not_found"}
+
+        if not doc.object_key:
+            return {"document_id": document_id, "status": "missing_object"}
+
+        # ── Step 2: retrieve bytes from MinIO ──────────────────────────────────
+        try:
+            backend = get_storage_backend()
+            file_bytes = await backend.get_object(doc.object_key)
+        except Exception as e:
+            # Retrieval failures are reported under the same "extract_failed"
+            # status as extraction failures; the error text tells them apart.
+            return {
+                "document_id": document_id,
+                "status": "extract_failed",
+                "error": f"MinIO retrieval failed: {e}",
+            }
+
+        # ── Step 3: extract text from bytes ────────────────────────────────────
+        try:
+            text = extractor.extract_text_from_bytes(file_bytes, doc.content_type)
+            doc.extracted_text = text
+            await session.commit()
+        except Exception as e:
+            return {
+                "document_id": document_id,
+                "status": "extract_failed",
+                "error": f"Text extraction failed: {e}",
+            }
+
+        # ── Step 4: classify document (non-fatal) ──────────────────────────────
+        try:
+            topics = await classifier.classify_document(session, document_id)
+        except Exception as e:
+            # Non-fatal — preserve existing convention from api/documents.py
+            try:
+                # The classifier shares this session and may have failed in
+                # the middle of a transaction. Roll back first: it discards
+                # any half-applied classifier writes and makes the session
+                # usable again, so the status commit below cannot trip over
+                # the earlier failure.
+                await session.rollback()
+                doc.status = "classification_failed"
+                await session.commit()
+            except Exception:
+                # Recording the status is best-effort; the task must still
+                # return its result dict rather than raise.
+                logging.getLogger(__name__).exception(
+                    "Could not persist classification_failed status for "
+                    "document %s",
+                    document_id,
+                )
+            return {
+                "document_id": document_id,
+                "status": "classification_failed",
+                "error": str(e),
+            }
+
+        return {
+            "document_id": document_id,
+            "status": "classified",
+            "topics": topics,
+        }