# kite/backend/tasks/document_tasks.py

"""
Celery tasks for document processing in DocuVault.

extract_and_classify — called via .delay(document_id) by the upload handler.
The task is a plain sync def (Celery workers have no asyncio event loop); it
bridges into the async service layer via asyncio.run().

Flow:
  1. Open a fresh AsyncSession (one per task invocation — never share sessions)
  2. Look up the Document row to get the MinIO object_key
  3. Retrieve file bytes from MinIO via the storage backend
  4. Extract text from bytes using services.extractor
  5. Persist extracted_text back to the Document row
  6. Call services.classifier.classify_document to assign topics
  7. Return a result dict (never raises — classification failures are non-fatal)
"""
import asyncio

from celery_app import celery_app


@celery_app.task(name="tasks.document_tasks.extract_and_classify")
def extract_and_classify(document_id: str) -> dict:
    """Run the document pipeline for ``document_id`` and return its result dict.

    This is the synchronous function registered with Celery; the coroutine
    produced by ``_run`` is driven to completion with ``asyncio.run``, which
    creates and tears down a fresh event loop on every invocation.
    """
    pipeline = _run(document_id)
    return asyncio.run(pipeline)


async def _run(document_id: str) -> dict:
    """Async body of extract_and_classify.

    Validates the id first, then opens its own AsyncSession (not shared with
    the upload request) to avoid cross-thread session contamination, and runs
    fetch -> retrieve -> extract -> classify.

    Args:
        document_id: String form of the Document primary key (a UUID).

    Returns:
        A dict that always carries ``document_id`` and ``status``. ``status``
        is one of ``invalid_id``, ``not_found``, ``missing_object``,
        ``extract_failed``, ``classified`` or ``classification_failed``.
        Results for storage, extraction and classification failures also
        carry ``error``; a ``classified`` result carries ``topics``.
    """
    import logging
    import uuid as _uuid

    # ── Step 0: validate the id ────────────────────────────────────────────────
    # Done before importing the service layer or opening a session so that a
    # malformed id costs nothing. UUID() raises TypeError for None and
    # AttributeError for other non-string input, not just ValueError.
    try:
        doc_uuid = _uuid.UUID(document_id)
    except (ValueError, TypeError, AttributeError):
        return {"document_id": document_id, "status": "invalid_id"}

    from db.session import AsyncSessionLocal
    from db.models import Document
    from services import extractor, classifier
    from storage import get_storage_backend

    async with AsyncSessionLocal() as session:
        # ── Step 1: fetch Document row ─────────────────────────────────────────
        doc = await session.get(Document, doc_uuid)
        if doc is None:
            return {"document_id": document_id, "status": "not_found"}

        if not doc.object_key:
            return {"document_id": document_id, "status": "missing_object"}

        # ── Step 2: retrieve bytes from MinIO ──────────────────────────────────
        try:
            backend = get_storage_backend()
            file_bytes = await backend.get_object(doc.object_key)
        except Exception as e:
            return {
                "document_id": document_id,
                "status": "extract_failed",
                "error": f"MinIO retrieval failed: {e}",
            }

        # ── Step 3: extract text from bytes ────────────────────────────────────
        try:
            text = extractor.extract_text_from_bytes(file_bytes, doc.content_type)
            doc.extracted_text = text
            await session.commit()
        except Exception as e:
            return {
                "document_id": document_id,
                "status": "extract_failed",
                "error": f"Text extraction failed: {e}",
            }

        # ── Step 4: classify document (non-fatal) ──────────────────────────────
        try:
            topics = await classifier.classify_document(session, document_id)
        except Exception as e:
            # Non-fatal — preserve existing convention from api/documents.py
            try:
                # If the classifier failed inside a flush, the session refuses
                # further SQL until rolled back. The extracted text was already
                # committed in step 3, so only uncommitted classifier work is
                # discarded here.
                await session.rollback()
                doc.status = "classification_failed"
                await session.commit()
            except Exception:
                # Persisting the failure marker is best-effort: the task must
                # still return its result dict rather than raise.
                logging.getLogger(__name__).exception(
                    "Could not persist classification_failed status for %s",
                    document_id,
                )
            return {
                "document_id": document_id,
                "status": "classification_failed",
                "error": str(e),
            }

        return {
            "document_id": document_id,
            "status": "classified",
            "topics": topics,
        }