""" Text extraction dispatcher. Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract). """ from pathlib import Path MAX_STORED_CHARS = 50_000 def extract_text(file_path: str, mime_type: str) -> str: path = Path(file_path) try: if mime_type == "application/pdf" or path.suffix.lower() == ".pdf": return _extract_pdf(path) elif mime_type in ( "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/msword", ) or path.suffix.lower() in (".docx", ".doc"): return _extract_docx(path) elif mime_type and mime_type.startswith("image/"): return _extract_image(path) else: return _extract_text_file(path) except Exception as e: return f"[Extraction error: {e}]" def _extract_pdf(path: Path) -> str: import fitz # PyMuPDF doc = fitz.open(str(path)) pages = [] for page in doc: pages.append(page.get_text()) doc.close() return _truncate("\n".join(pages)) def _extract_docx(path: Path) -> str: from docx import Document doc = Document(str(path)) paragraphs = [p.text for p in doc.paragraphs if p.text.strip()] return _truncate("\n".join(paragraphs)) def _extract_image(path: Path) -> str: try: from PIL import Image import pytesseract img = Image.open(str(path)) text = pytesseract.image_to_string(img) return _truncate(text) except ImportError: return "[OCR unavailable: pytesseract or Pillow not installed]" except Exception as e: return f"[OCR error: {e}]" def _extract_text_file(path: Path) -> str: for enc in ("utf-8", "latin-1", "cp1252"): try: return _truncate(path.read_text(encoding=enc)) except UnicodeDecodeError: continue return "[Could not decode text file]" def _truncate(text: str) -> str: text = text.strip() if len(text) > MAX_STORED_CHARS: text = text[:MAX_STORED_CHARS] + "\n[... truncated ...]" return text