feat(01-05): introduce celery_app + tasks/document_tasks + session-aware classifier

- Add backend/celery_app.py: Celery("docuvault") with Redis broker, JSON
  serialization, and tasks.document_tasks.* routed to documents queue;
  reads REDIS_URL directly from os.environ (no config import — Pitfall 7)
- Add backend/tasks/__init__.py: empty package marker
- Add backend/tasks/document_tasks.py: sync extract_and_classify Celery task
  that calls asyncio.run(_run()) to retrieve bytes from MinIO, extract text
  via extractor, and classify via classifier; classification failure is non-fatal
- Update backend/services/classifier.py: classify_document and
  suggest_topics_for_document now accept session: AsyncSession as first arg;
  all storage.* calls updated to async session-injection pattern
- Add extract_text_from_bytes helper to services/extractor.py for bytes-based
  extraction (used by Celery worker, which retrieves bytes from MinIO)
This commit is contained in:
curo1305
2026-05-22 09:45:33 +02:00
parent 5d21c6f588
commit 32d67de1ca
5 changed files with 185 additions and 7 deletions
+17 -7
View File
@@ -1,20 +1,30 @@
"""
Classification orchestrator.
Loads settings, selects AI provider, classifies document, auto-creates suggested topics.
Updated in Plan 05: classify_document and suggest_topics_for_document now accept
an AsyncSession as their first argument so they can be called from the Celery task
wrapper and from API route handlers that already hold a session.
"""
from sqlalchemy.ext.asyncio import AsyncSession
from services import storage
from ai import get_provider
MAX_AI_CHARS = 8_000
async def classify_document(doc_id: str, topic_names: list[str] | None = None) -> list[str]:
async def classify_document(
session: AsyncSession,
doc_id: str,
topic_names: list[str] | None = None,
) -> list[str]:
"""
Classify a document by its ID. Returns the list of assigned topic names.
If topic_names is provided, restrict classification to those topics.
Auto-creates any newly suggested topics.
"""
meta = storage.get_metadata(doc_id)
meta = await storage.get_metadata(session, doc_id)
if meta is None:
raise ValueError(f"Document {doc_id} not found")
@@ -24,7 +34,7 @@ async def classify_document(doc_id: str, topic_names: list[str] | None = None) -
# Use all known topics if not specified
if topic_names is None:
all_topics = storage.load_topics()
all_topics = await storage.load_topics(session)
topic_names = [t["name"] for t in all_topics]
text = meta.get("extracted_text", "")
@@ -37,18 +47,18 @@ async def classify_document(doc_id: str, topic_names: list[str] | None = None) -
existing_names = {t.lower() for t in topic_names}
for name in all_new_names:
if name.strip() and name.lower() not in existing_names:
storage.create_topic(name.strip())
await storage.create_topic(session, name.strip())
# Final list: everything the AI assigned or suggested
final_topics = [t for t in list(set(result.topics + result.suggested_new_topics)) if t.strip()]
storage.update_document_topics(doc_id, final_topics)
await storage.update_document_topics(session, doc_id, final_topics)
return final_topics
async def suggest_topics_for_document(doc_id: str) -> list[str]:
async def suggest_topics_for_document(session: AsyncSession, doc_id: str) -> list[str]:
"""Return AI-suggested topic names without modifying the document."""
meta = storage.get_metadata(doc_id)
meta = await storage.get_metadata(session, doc_id)
if meta is None:
raise ValueError(f"Document {doc_id} not found")
+38
View File
@@ -2,11 +2,49 @@
Text extraction dispatcher.
Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
"""
import tempfile
from pathlib import Path
MAX_STORED_CHARS = 50_000
def extract_text_from_bytes(file_bytes: bytes, mime_type: str) -> str:
    """Extract text from raw bytes via a short-lived temporary file.

    The bytes are written to a named temp file whose suffix is derived from
    ``mime_type``, then handed to ``extract_text``. This lets callers that
    only hold the document in memory reuse the path-based extraction
    dispatcher without managing a file themselves.

    Args:
        file_bytes: Raw content of the document.
        mime_type: MIME type used both to pick the temp-file suffix and to
            dispatch inside ``extract_text``.

    Returns:
        Whatever ``extract_text`` returns for the temporary file.
    """
    suffix = _mime_to_suffix(mime_type)
    # delete=False: the file must outlive the handle so extract_text can
    # reopen it by path; removal is handled explicitly in the finally below.
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_path = tmp.name
    try:
        # The write sits inside the try so that a failed write (disk full,
        # wrong argument type, ...) still removes the temp file.
        with tmp:
            tmp.write(file_bytes)
        # The handle is closed here, before the path is reopened.
        return extract_text(tmp_path, mime_type)
    finally:
        # Best-effort cleanup: a failure to delete must not mask the
        # extraction result or the original exception.
        try:
            Path(tmp_path).unlink()
        except OSError:
            pass
def _mime_to_suffix(mime_type: str) -> str:
    """Map a MIME type to a file extension (leading dot included).

    MIME types that are not in the table fall back to ``.bin``.
    """
    suffix_by_mime = {
        # Documents
        "application/pdf": ".pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "application/msword": ".doc",
        # Plain-text formats
        "text/plain": ".txt",
        "text/markdown": ".md",
        # Images
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/jpg": ".jpg",
        "image/tiff": ".tiff",
        "image/webp": ".webp",
    }
    try:
        return suffix_by_mime[mime_type]
    except KeyError:
        return ".bin"
def extract_text(file_path: str, mime_type: str) -> str:
path = Path(file_path)
try: