feat(01-05): introduce celery_app + tasks/document_tasks + session-aware classifier
- Add backend/celery_app.py: Celery("docuvault") with Redis broker, JSON
serialization, and tasks.document_tasks.* routed to documents queue;
reads REDIS_URL directly from os.environ (no config import — Pitfall 7)
- Add backend/tasks/__init__.py: empty package marker
- Add backend/tasks/document_tasks.py: sync extract_and_classify Celery task
that calls asyncio.run(_run()) to retrieve bytes from MinIO, extract text
via extractor, and classify via classifier; classification failure is non-fatal
- Update backend/services/classifier.py: classify_document and
suggest_topics_for_document now accept session: AsyncSession as first arg;
all storage.* calls updated to async session-injection pattern
- Add extract_text_from_bytes helper to services/extractor.py for bytes-based
extraction (used by Celery worker, which retrieves bytes from MinIO)
This commit is contained in:
@@ -2,11 +2,49 @@
|
||||
Text extraction dispatcher.
|
||||
Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
|
||||
"""
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
MAX_STORED_CHARS = 50_000
|
||||
|
||||
|
||||
def extract_text_from_bytes(file_bytes: bytes, mime_type: str) -> str:
    """Extract text from raw bytes by spooling them to a temp file.

    The bytes are written to a named temporary file whose suffix is derived
    from ``mime_type``; the file's path is then handed to ``extract_text``.
    Used by the Celery worker (which retrieves bytes from MinIO) so callers
    do not need a filesystem path of their own.

    Args:
        file_bytes: Raw document content.
        mime_type: MIME type of the content; selects the temp-file suffix
            and is forwarded unchanged to ``extract_text``.

    Returns:
        Whatever ``extract_text`` returns for the temporary file.

    Raises:
        Any exception raised while writing the temp file or by
        ``extract_text`` propagates to the caller; the temp file is removed
        in either case.
    """
    suffix = _mime_to_suffix(mime_type)

    # delete=False: the file is closed before extract_text reopens it by
    # path, so removal is handled explicitly in the ``finally`` below.
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_path = tmp.name
    try:
        # The write lives inside the try so a failed write (e.g. disk full)
        # does not leave an orphaned temp file behind.
        with tmp:
            tmp.write(file_bytes)
        return extract_text(tmp_path, mime_type)
    finally:
        # Best-effort cleanup: a failure to remove the temp file must not
        # mask the extraction result or the original exception.
        try:
            Path(tmp_path).unlink()
        except OSError:
            pass
|
||||
|
||||
|
||||
def _mime_to_suffix(mime_type: str) -> str:
    """Return a file extension (including the leading dot) for a MIME type.

    The lookup ignores letter case, surrounding whitespace and any
    parameters after ``;`` (so ``"text/plain; charset=utf-8"`` maps to
    ``".txt"``). Unknown, empty or non-string values map to ``".bin"``.

    Args:
        mime_type: MIME type string such as ``"application/pdf"``.

    Returns:
        The matching extension, or ``".bin"`` when the type is not recognised.
    """
    mapping = {
        "application/pdf": ".pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "application/msword": ".doc",
        "text/plain": ".txt",
        "text/markdown": ".md",
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/jpg": ".jpg",
        "image/tiff": ".tiff",
        "image/webp": ".webp",
    }
    if not isinstance(mime_type, str):
        # Preserve the fallback for None / unexpected values instead of raising.
        return ".bin"
    # Drop parameters ("; charset=...") and normalise case before the lookup.
    essence = mime_type.split(";", 1)[0].strip().lower()
    return mapping.get(essence, ".bin")
|
||||
|
||||
|
||||
def extract_text(file_path: str, mime_type: str) -> str:
|
||||
path = Path(file_path)
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user