# kite/backend/services/extractor.py

"""
Text extraction dispatcher.
Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
"""
import tempfile
from pathlib import Path

# Upper bound on the number of characters of extracted text that _truncate
# keeps; longer text is cut at this length and a truncation marker is appended.
MAX_STORED_CHARS = 50_000


def extract_text_from_bytes(file_bytes: bytes, mime_type: str) -> str:
    """Extract text from raw bytes by writing to a temp file and dispatching to extract_text.

    Used by the Celery worker (which retrieves bytes from MinIO) so extraction
    does not require a filesystem path.

    Args:
        file_bytes: Raw content of the document.
        mime_type: MIME type of the content; it selects both the temp-file
            suffix and the extractor.

    Returns:
        Whatever ``extract_text`` returns for the temporary file.

    Raises:
        OSError: If the temporary file cannot be created or written.
    """
    import os

    suffix = _mime_to_suffix(mime_type)
    # delete=False so the file can be closed and then reopened by path by the
    # extractor; removal is done explicitly in the ``finally`` below.
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    tmp_path = tmp.name
    try:
        # The write sits inside the try so a failed write (e.g. disk full)
        # still removes the temp file instead of leaking it.
        with tmp:
            tmp.write(file_bytes)
        return extract_text(tmp_path, mime_type)
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            # Best-effort cleanup: a failed delete must not mask the result
            # or the original exception.
            pass


def _mime_to_suffix(mime_type: str) -> str:
    """Return a file extension for the given MIME type."""
    mapping = {
        "application/pdf": ".pdf",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
        "application/msword": ".doc",
        "text/plain": ".txt",
        "text/markdown": ".md",
        "image/png": ".png",
        "image/jpeg": ".jpg",
        "image/jpg": ".jpg",
        "image/tiff": ".tiff",
        "image/webp": ".webp",
    }
    return mapping.get(mime_type, ".bin")


def extract_text(file_path: str, mime_type: str) -> str:
    """Extract text from the file at ``file_path`` using a type-specific extractor.

    The extractor is chosen from the MIME type and/or the file suffix, checked
    in this order: PDF, Word (``.docx``/``.doc``), image (MIME type only),
    and finally plain text as the fallback.

    Args:
        file_path: Path of the file to read.
        mime_type: MIME type of the file; a falsy value means only the suffix
            is used for dispatch.

    Returns:
        The extractor's output. Any exception raised while dispatching or
        extracting is caught and reported as ``"[Extraction error: ...]"``.
    """
    path = Path(file_path)
    word_mime_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )
    try:
        ext = path.suffix.lower()
        if mime_type == "application/pdf" or ext == ".pdf":
            return _extract_pdf(path)
        if mime_type in word_mime_types or ext in (".docx", ".doc"):
            return _extract_docx(path)
        if mime_type and mime_type.startswith("image/"):
            return _extract_image(path)
        # Anything unrecognised is treated as a plain-text file.
        return _extract_text_file(path)
    except Exception as e:
        return f"[Extraction error: {e}]"


def _extract_pdf(path: Path) -> str:
    """Return the text of every page of a PDF, joined by newlines and truncated.

    Args:
        path: Location of the PDF file.

    Returns:
        The page texts joined with ``"\\n"`` and passed through ``_truncate``.

    Raises:
        Exception: Whatever PyMuPDF raises when the file cannot be opened or
            a page cannot be read; the document is closed before it propagates.
    """
    import fitz  # PyMuPDF

    doc = fitz.open(str(path))
    try:
        page_texts = [page.get_text() for page in doc]
    finally:
        # Close even when a page fails to parse so the handle is not leaked.
        doc.close()
    return _truncate("\n".join(page_texts))


def _extract_docx(path: Path) -> str:
    """Return the non-blank paragraphs of a Word document, one per line.

    Only ``Document.paragraphs`` is read. Paragraphs whose text is empty or
    whitespace-only are skipped; the rest are joined with newlines and passed
    through ``_truncate``.
    """
    from docx import Document

    document = Document(str(path))
    kept = []
    for para in document.paragraphs:
        content = para.text
        if content.strip():
            kept.append(content)
    return _truncate("\n".join(kept))


def _extract_image(path: Path) -> str:
    try:
        from PIL import Image
        import pytesseract
        img = Image.open(str(path))
        text = pytesseract.image_to_string(img)
        return _truncate(text)
    except ImportError:
        return "[OCR unavailable: pytesseract or Pillow not installed]"
    except Exception as e:
        return f"[OCR error: {e}]"


def _extract_text_file(path: Path) -> str:
    """Read a text file, trying several encodings, and return it truncated.

    Encodings are tried from strictest to most permissive:

    * ``utf-8-sig`` - UTF-8, dropping a leading byte-order mark if present.
    * ``cp1252`` - rejects the few byte values it leaves undefined.
    * ``latin-1`` - maps every byte value, so it must come last; placed
      earlier it would make any later encoding unreachable.

    Args:
        path: Location of the text file.

    Returns:
        The decoded content passed through ``_truncate``.

    Raises:
        OSError: If the file cannot be read.
    """
    for enc in ("utf-8-sig", "cp1252", "latin-1"):
        try:
            return _truncate(path.read_text(encoding=enc))
        except UnicodeDecodeError:
            continue
    # Defensive only: latin-1 accepts any byte sequence, so the loop above is
    # expected to have returned already.
    return "[Could not decode text file]"


def _truncate(text: str) -> str:
    text = text.strip()
    if len(text) > MAX_STORED_CHARS:
        text = text[:MAX_STORED_CHARS] + "\n[... truncated ...]"
    return text