""" Text extraction dispatcher. Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract). """ import tempfile from pathlib import Path MAX_STORED_CHARS = 50_000 def extract_text_from_bytes(file_bytes: bytes, mime_type: str) -> str: """Extract text from raw bytes by writing to a temp file and dispatching to extract_text. Used by the Celery worker (which retrieves bytes from MinIO) so extraction does not require a filesystem path. """ suffix = _mime_to_suffix(mime_type) with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: tmp.write(file_bytes) tmp_path = tmp.name try: return extract_text(tmp_path, mime_type) finally: import os try: os.unlink(tmp_path) except OSError: pass def _mime_to_suffix(mime_type: str) -> str: """Return a file extension for the given MIME type.""" mapping = { "application/pdf": ".pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", "application/msword": ".doc", "text/plain": ".txt", "text/markdown": ".md", "image/png": ".png", "image/jpeg": ".jpg", "image/jpg": ".jpg", "image/tiff": ".tiff", "image/webp": ".webp", } return mapping.get(mime_type, ".bin") def extract_text(file_path: str, mime_type: str) -> str: path = Path(file_path) try: if mime_type == "application/pdf" or path.suffix.lower() == ".pdf": return _extract_pdf(path) elif mime_type in ( "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/msword", ) or path.suffix.lower() in (".docx", ".doc"): return _extract_docx(path) elif mime_type and mime_type.startswith("image/"): return _extract_image(path) else: return _extract_text_file(path) except Exception as e: return f"[Extraction error: {e}]" def _extract_pdf(path: Path) -> str: import fitz # PyMuPDF doc = fitz.open(str(path)) pages = [] for page in doc: pages.append(page.get_text()) doc.close() return _truncate("\n".join(pages)) def _extract_docx(path: Path) -> str: from docx import Document doc = Document(str(path)) paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] return _truncate("\n".join(paragraphs)) def _extract_image(path: Path) -> str: try: from PIL import Image import pytesseract img = Image.open(str(path)) text = pytesseract.image_to_string(img) return _truncate(text) except ImportError: return "[OCR unavailable: pytesseract or Pillow not installed]" except Exception as e: return f"[OCR error: {e}]" def _extract_text_file(path: Path) -> str: for enc in ("utf-8", "latin-1", "cp1252"): try: return _truncate(path.read_text(encoding=enc)) except UnicodeDecodeError: continue return "[Could not decode text file]" def _truncate(text: str) -> str: text = text.strip() if len(text) > MAX_STORED_CHARS: text = text[:MAX_STORED_CHARS] + "\n[... truncated ...]" return text