Files
curo1305 32d67de1ca feat(01-05): introduce celery_app + tasks/document_tasks + session-aware classifier
- Add backend/celery_app.py: Celery("docuvault") with Redis broker, JSON
  serialization, and tasks.document_tasks.* routed to documents queue;
  reads REDIS_URL directly from os.environ (no config import — Pitfall 7)
- Add backend/tasks/__init__.py: empty package marker
- Add backend/tasks/document_tasks.py: sync extract_and_classify Celery task
  that calls asyncio.run(_run()) to retrieve bytes from MinIO, extract text
  via extractor, and classify via classifier; classification failure is non-fatal
- Update backend/services/classifier.py: classify_document and
  suggest_topics_for_document now accept session: AsyncSession as first arg;
  all storage.* calls updated to async session-injection pattern
- Add extract_text_from_bytes helper to services/extractor.py for bytes-based
  extraction (used by Celery worker, which retrieves bytes from MinIO)
2026-05-22 09:45:33 +02:00

110 lines
3.2 KiB
Python

"""
Text extraction dispatcher.
Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
"""
import tempfile
from pathlib import Path
# Maximum number of characters of extracted text kept by _truncate();
# longer text is cut at this length and a truncation marker is appended.
MAX_STORED_CHARS = 50_000
def extract_text_from_bytes(file_bytes: bytes, mime_type: str) -> str:
    """Extract text from raw bytes by spilling them to a temp file.

    Used by the Celery worker (which retrieves bytes from MinIO) so extraction
    does not require the caller to have a filesystem path.

    Args:
        file_bytes: Raw content of the document.
        mime_type: MIME type of the content; it selects the temp-file suffix
            and is forwarded unchanged to ``extract_text``.

    Returns:
        Whatever ``extract_text`` returns for the temporary file.
    """
    import os

    suffix = _mime_to_suffix(mime_type)
    # delete=False: the file must outlive its handle so that extract_text can
    # reopen it by path; removal is therefore done by hand in ``finally``.
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_path = tmp.name
    try:
        # Writing happens inside the try so a failed write (e.g. disk full)
        # still gets the temp file removed instead of leaking it.
        with tmp:
            tmp.write(file_bytes)
        return extract_text(tmp_path, mime_type)
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            # Best-effort cleanup: a failure to delete must not mask the
            # extraction result or the original exception.
            pass
def _mime_to_suffix(mime_type: str) -> str:
"""Return a file extension for the given MIME type."""
mapping = {
"application/pdf": ".pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"application/msword": ".doc",
"text/plain": ".txt",
"text/markdown": ".md",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
"image/tiff": ".tiff",
"image/webp": ".webp",
}
return mapping.get(mime_type, ".bin")
def extract_text(file_path: str, mime_type: str) -> str:
    """Pick an extractor for the file and return the text it produces.

    Dispatch order: PDF (by MIME type or ``.pdf`` suffix), Word documents (by
    MIME type or ``.docx``/``.doc`` suffix), images (MIME type starting with
    ``image/``), and finally a plain-text read for everything else.

    Args:
        file_path: Path of the file to read.
        mime_type: MIME type reported for the file; may be falsy.

    Returns:
        The extractor's output, or ``"[Extraction error: ...]"`` when selecting
        or running the extractor raises.
    """
    path = Path(file_path)
    word_mime_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )
    try:
        ext = path.suffix.lower()
        if mime_type == "application/pdf" or ext == ".pdf":
            extractor = _extract_pdf
        elif mime_type in word_mime_types or ext in (".docx", ".doc"):
            extractor = _extract_docx
        elif mime_type and mime_type.startswith("image/"):
            extractor = _extract_image
        else:
            extractor = _extract_text_file
        return extractor(path)
    except Exception as exc:
        # Failures are reported in-band as text rather than raised.
        return f"[Extraction error: {exc}]"
def _extract_pdf(path: Path) -> str:
    """Return the text of every page of a PDF, joined by newlines and truncated.

    Args:
        path: Location of the PDF file.

    Returns:
        The page texts joined with ``"\\n"`` and passed through ``_truncate``.

    Raises:
        Any exception raised by PyMuPDF while opening or reading the file.
    """
    import fitz  # PyMuPDF

    doc = fitz.open(str(path))
    try:
        pages = [page.get_text() for page in doc]
    finally:
        # Close the document even when a page fails, so the handle is not leaked.
        doc.close()
    return _truncate("\n".join(pages))
def _extract_docx(path: Path) -> str:
    """Return the non-blank paragraph text of a Word document.

    Reads the entries of ``Document(...).paragraphs``, drops those whose text
    is empty or whitespace-only, joins the rest with newlines and truncates.

    Args:
        path: Location of the document.

    Returns:
        The joined paragraph text after ``_truncate``.
    """
    from docx import Document

    document = Document(str(path))
    kept = []
    for para in document.paragraphs:
        content = para.text
        if content.strip():
            kept.append(content)
    return _truncate("\n".join(kept))
def _extract_image(path: Path) -> str:
try:
from PIL import Image
import pytesseract
img = Image.open(str(path))
text = pytesseract.image_to_string(img)
return _truncate(text)
except ImportError:
return "[OCR unavailable: pytesseract or Pillow not installed]"
except Exception as e:
return f"[OCR error: {e}]"
def _extract_text_file(path: Path) -> str:
    """Read a text file, trying several encodings, and return it truncated.

    Encodings are tried in order: UTF-8, then cp1252, then latin-1.

    Args:
        path: Location of the text file.

    Returns:
        The decoded content after ``_truncate``, or
        ``"[Could not decode text file]"`` if every encoding fails.
    """
    # latin-1 assigns a character to every byte value and so never fails;
    # it has to come last, otherwise the cp1252 attempt could never run.
    for enc in ("utf-8", "cp1252", "latin-1"):
        try:
            return _truncate(path.read_text(encoding=enc))
        except UnicodeDecodeError:
            continue
    # Defensive fallback; kept so the function always returns a string.
    return "[Could not decode text file]"
def _truncate(text: str) -> str:
    """Strip surrounding whitespace and cap the text at MAX_STORED_CHARS.

    Args:
        text: Extracted text.

    Returns:
        The stripped text; when it is longer than ``MAX_STORED_CHARS`` only the
        first ``MAX_STORED_CHARS`` characters are kept, followed by
        ``"\\n[... truncated ...]"``.
    """
    stripped = text.strip()
    if len(stripped) <= MAX_STORED_CHARS:
        return stripped
    return stripped[:MAX_STORED_CHARS] + "\n[... truncated ...]"