feat(01-05): introduce celery_app + tasks/document_tasks + session-aware classifier

- Add backend/celery_app.py: Celery("docuvault") with Redis broker, JSON
  serialization, and tasks.document_tasks.* routed to documents queue;
  reads REDIS_URL directly from os.environ (no config import — Pitfall 7)
- Add backend/tasks/__init__.py: empty package marker
- Add backend/tasks/document_tasks.py: sync extract_and_classify Celery task
  that calls asyncio.run(_run()) to retrieve bytes from MinIO, extract text
  via extractor, and classify via classifier; classification failure is non-fatal
- Update backend/services/classifier.py: classify_document and
  suggest_topics_for_document now accept session: AsyncSession as first arg;
  all storage.* calls updated to async session-injection pattern
- Add extract_text_from_bytes helper to services/extractor.py for bytes-based
  extraction (used by Celery worker, which retrieves bytes from MinIO)
This commit is contained in:
curo1305
2026-05-22 09:45:33 +02:00
parent 5d21c6f588
commit 32d67de1ca
5 changed files with 185 additions and 7 deletions
+94
View File
@@ -0,0 +1,94 @@
"""
Celery tasks for document processing in DocuVault.
extract_and_classify — called via .delay(document_id) by the upload handler.
The task is a plain sync def (Celery workers have no asyncio event loop); it
bridges into the async service layer via asyncio.run().
Flow:
1. Open a fresh AsyncSession (one per task invocation — never share sessions)
2. Look up the Document row to get the MinIO object_key
3. Retrieve file bytes from MinIO via the storage backend
4. Extract text from bytes using services.extractor
5. Persist extracted_text back to the Document row
6. Call services.classifier.classify_document to assign topics
7. Return a result dict (never raises — classification failures are non-fatal)
"""
import asyncio
from celery_app import celery_app
@celery_app.task(name="tasks.document_tasks.extract_and_classify")
def extract_and_classify(document_id: str) -> dict:
    """Celery entry point: run the async pipeline for one document to completion.

    The task itself is a plain synchronous function; it builds the ``_run``
    coroutine for ``document_id`` and drives it with ``asyncio.run``, which
    creates a fresh event loop for this invocation.

    Args:
        document_id: Identifier of the document to process, as passed to
            ``.delay(...)`` by the caller.

    Returns:
        The result dict produced by ``_run``.
    """
    pipeline = _run(document_id)
    return asyncio.run(pipeline)
async def _run(document_id: str) -> dict:
    """Async body of extract_and_classify.

    Opens its own AsyncSession (not shared with the upload request), loads the
    Document row, fetches the object bytes through the storage backend,
    extracts and persists the text, then asks the classifier to assign topics.

    Args:
        document_id: String form of the Document primary key (a UUID).

    Returns:
        A dict that always carries ``document_id`` and ``status``. ``status``
        is one of ``invalid_id``, ``not_found``, ``missing_object``,
        ``extract_failed``, ``classified`` or ``classification_failed``.
        ``extract_failed`` and ``classification_failed`` results also carry
        ``error``; ``classified`` results also carry ``topics``.

    Retrieval, extraction and classification failures are reported through
    the result dict rather than raised. Errors from opening the session or
    from the initial row lookup are not caught here.
    """
    import logging
    import uuid as _uuid

    # Validate first so a malformed id never touches the DB/storage layers.
    # uuid.UUID raises ValueError for badly formed strings, TypeError for
    # None and AttributeError for other non-string input.
    try:
        doc_uuid = _uuid.UUID(document_id)
    except (ValueError, TypeError, AttributeError):
        return {"document_id": document_id, "status": "invalid_id"}

    from db.session import AsyncSessionLocal
    from db.models import Document
    from services import extractor, classifier
    from storage import get_storage_backend

    log = logging.getLogger(__name__)

    async with AsyncSessionLocal() as session:
        # ── Step 1: fetch Document row ─────────────────────────────────────
        doc = await session.get(Document, doc_uuid)
        if doc is None:
            return {"document_id": document_id, "status": "not_found"}
        if not doc.object_key:
            return {"document_id": document_id, "status": "missing_object"}

        # ── Step 2: retrieve bytes from MinIO ──────────────────────────────
        try:
            backend = get_storage_backend()
            file_bytes = await backend.get_object(doc.object_key)
        except Exception as e:
            return {
                "document_id": document_id,
                "status": "extract_failed",
                "error": f"MinIO retrieval failed: {e}",
            }

        # ── Step 3: extract text from bytes ────────────────────────────────
        try:
            text = extractor.extract_text_from_bytes(file_bytes, doc.content_type)
        except Exception as e:
            return {
                "document_id": document_id,
                "status": "extract_failed",
                "error": f"Text extraction failed: {e}",
            }

        # Persisting is a separate failure mode from extracting; report it as
        # such so a database error is not mistaken for an extractor error.
        try:
            doc.extracted_text = text
            await session.commit()
        except Exception as e:
            return {
                "document_id": document_id,
                "status": "extract_failed",
                "error": f"Persisting extracted text failed: {e}",
            }

        # ── Step 4: classify document (non-fatal) ──────────────────────────
        try:
            topics = await classifier.classify_document(session, document_id)
        except Exception as e:
            # Non-fatal — preserve existing convention from api/documents.py.
            # The failed call may have left the session's transaction in a
            # state that rejects further work, so roll back before reusing it.
            # Failing to record the status must not turn into a task crash.
            try:
                await session.rollback()
                doc.status = "classification_failed"
                await session.commit()
            except Exception:
                log.exception(
                    "Could not persist classification_failed status for document %s",
                    document_id,
                )
            return {
                "document_id": document_id,
                "status": "classification_failed",
                "error": str(e),
            }

        return {
            "document_id": document_id,
            "status": "classified",
            "topics": topics,
        }