feat(01-05): introduce celery_app + tasks/document_tasks + session-aware classifier

- Add backend/celery_app.py: Celery("docuvault") with Redis broker, JSON serialization, and tasks.document_tasks.* routed to the documents queue; reads REDIS_URL directly from os.environ (no config import — Pitfall 7)
- Add backend/tasks/__init__.py: empty package marker
- Add backend/tasks/document_tasks.py: sync extract_and_classify Celery task that calls asyncio.run(_run()) to retrieve bytes from MinIO, extract text via extractor, and classify via classifier; classification failure is non-fatal
- Update backend/services/classifier.py: classify_document and suggest_topics_for_document now accept session: AsyncSession as first arg; all storage.* calls updated to the async session-injection pattern
- Add extract_text_from_bytes helper to services/extractor.py for bytes-based extraction (used by the Celery worker, which retrieves bytes from MinIO)
2026-05-22 09:45:33 +02:00
parent 5d21c6f588
commit 32d67de1ca
5 changed files with 185 additions and 7 deletions
@@ -2,11 +2,49 @@
 Text extraction dispatcher.
 Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
 """
+import tempfile
 from pathlib import Path

 MAX_STORED_CHARS = 50_000


+def extract_text_from_bytes(file_bytes: bytes, mime_type: str) -> str:
+    """Extract text from raw bytes by writing to a temp file and dispatching to extract_text.
+
+    Used by the Celery worker (which retrieves bytes from MinIO) so the caller
+    does not need a filesystem path of its own: the bytes are written to a
+    temporary file, handed to extract_text by path, and the file is removed
+    (best-effort) afterwards.
+
+    Args:
+        file_bytes: Raw document content to extract text from.
+        mime_type: MIME type of the content; selects the temp-file suffix and
+            is passed through unchanged to extract_text.
+
+    Returns:
+        Whatever extract_text returns for the temporary file.
+    """
+    suffix = _mime_to_suffix(mime_type)
+    # delete=False keeps the file on disk after the `with` block closes it, so
+    # extract_text can be given its path; removal happens in the `finally` below.
+    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
+        tmp.write(file_bytes)
+        tmp_path = tmp.name
+    # NOTE(review): the write above runs before this try/finally, so if it
+    # raises, the temp file is not unlinked here — confirm that is acceptable.
+    try:
+        return extract_text(tmp_path, mime_type)
+    finally:
+        import os
+        # Best-effort cleanup: an unlink failure is ignored so it cannot replace
+        # the return value or exception coming out of extract_text.
+        try:
+            os.unlink(tmp_path)
+        except OSError:
+            pass
+
+
+def _mime_to_suffix(mime_type: str) -> str:
+    """Return a file extension for the given MIME type.
+
+    The lookup is an exact string match against the table below; any MIME type
+    that is not listed yields the generic ".bin" suffix.
+    """
+    # Both "image/jpeg" and "image/jpg" are accepted and map to the same ".jpg" suffix.
+    mapping = {
+        "application/pdf": ".pdf",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
+        "application/msword": ".doc",
+        "text/plain": ".txt",
+        "text/markdown": ".md",
+        "image/png": ".png",
+        "image/jpeg": ".jpg",
+        "image/jpg": ".jpg",
+        "image/tiff": ".tiff",
+        "image/webp": ".webp",
+    }
+    # Fallback for unmapped types.
+    return mapping.get(mime_type, ".bin")
+
+
 def extract_text(file_path: str, mime_type: str) -> str:
    path = Path(file_path)
    try: