# kite/backend/services/extractor.py

"""
Text extraction dispatcher.
Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
"""
from pathlib import Path

# Character cap applied by _truncate(): text longer than this is cut to this
# many characters and a "[... truncated ...]" marker is appended.
MAX_STORED_CHARS = 50_000


def extract_text(file_path: str, mime_type: str) -> str:
    """Extract text from a file, choosing the extractor by MIME type or suffix.

    Formats are tried in this order: PDF, Word document, image (OCR), and
    finally plain text as the fallback. Every format is recognised either by
    ``mime_type`` or by the file's suffix (compared case-insensitively), so a
    missing or generic MIME type still routes the file to the right extractor.

    Args:
        file_path: Path of the file to read.
        mime_type: MIME type reported for the file. A falsy value is
            tolerated; detection then relies on the suffix alone.

    Returns:
        The extracted text, or ``"[Extraction error: <message>]"`` when the
        selected extractor raises.
    """
    path = Path(file_path)
    word_mime_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )
    # Suffix fallback for images, mirroring the PDF/Word branches: without it
    # an image uploaded with no (or a generic) MIME type would be decoded as
    # text and produce garbage.
    image_suffixes = (
        ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tif", ".tiff", ".webp",
    )
    try:
        suffix = path.suffix.lower()
        if mime_type == "application/pdf" or suffix == ".pdf":
            return _extract_pdf(path)
        if mime_type in word_mime_types or suffix in (".docx", ".doc"):
            return _extract_docx(path)
        is_image_mime = bool(mime_type) and mime_type.startswith("image/")
        if is_image_mime or suffix in image_suffixes:
            return _extract_image(path)
        return _extract_text_file(path)
    except Exception as e:
        # Deliberate best-effort boundary: callers get a readable marker
        # string instead of an exception.
        return f"[Extraction error: {e}]"


def _extract_pdf(path: Path) -> str:
    """Return the text of every page of a PDF, newline-joined and truncated.

    Args:
        path: Location of the PDF file.

    Returns:
        The concatenated page text after passing through ``_truncate``.

    Raises:
        Whatever PyMuPDF raises when the file cannot be opened or a page
        cannot be read; the document is closed before the error propagates.
    """
    import fitz  # PyMuPDF

    doc = fitz.open(str(path))
    try:
        pages = [page.get_text() for page in doc]
    finally:
        # Release the document even when reading a page fails, so the
        # underlying file handle is not leaked.
        doc.close()
    return _truncate("\n".join(pages))


def _extract_docx(path: Path) -> str:
    """Return the non-blank paragraphs of a Word document, one per line.

    Paragraphs whose text is empty or whitespace-only are skipped; the rest
    are joined with newlines and passed through ``_truncate``.
    """
    from docx import Document

    document = Document(str(path))
    kept = []
    for para in document.paragraphs:
        # Skip paragraphs that contain nothing but whitespace.
        if para.text.strip():
            kept.append(para.text)
    joined = "\n".join(kept)
    return _truncate(joined)


def _extract_image(path: Path) -> str:
    """Run OCR on an image file and return the recognised text, truncated.

    Args:
        path: Location of the image file.

    Returns:
        The OCR text after ``_truncate``. When Pillow or pytesseract cannot
        be imported, returns
        ``"[OCR unavailable: pytesseract or Pillow not installed]"``; for any
        other failure, returns ``"[OCR error: <message>]"``.
    """
    try:
        from PIL import Image
        import pytesseract

        # The context manager closes the image's file handle even when OCR
        # raises, instead of leaving it to the garbage collector.
        with Image.open(str(path)) as img:
            text = pytesseract.image_to_string(img)
        return _truncate(text)
    except ImportError:
        return "[OCR unavailable: pytesseract or Pillow not installed]"
    except Exception as e:
        return f"[OCR error: {e}]"


def _extract_text_file(path: Path) -> str:
    """Read a text file, trying progressively more permissive encodings.

    Encodings are tried in the order UTF-8 (with or without a BOM),
    Windows-1252, then Latin-1. Latin-1 maps every byte to a code point, so it
    must come last: placed earlier it would make the later candidates
    unreachable and mis-decode Windows-1252 punctuation as control characters.

    Args:
        path: Location of the text file.

    Returns:
        The decoded file contents after ``_truncate``.

    Raises:
        OSError: If the file cannot be read.
    """
    # Strict decoders first. "utf-8-sig" also accepts BOM-less UTF-8 and
    # drops a leading BOM rather than leaving U+FEFF in the text.
    for enc in ("utf-8-sig", "cp1252"):
        try:
            return _truncate(path.read_text(encoding=enc))
        except UnicodeDecodeError:
            continue
    # Last resort: Latin-1 never raises UnicodeDecodeError.
    return _truncate(path.read_text(encoding="latin-1"))


def _truncate(text: str) -> str:
    """Strip surrounding whitespace and cap the text at MAX_STORED_CHARS.

    Text that fits within the cap is returned stripped and otherwise
    unchanged. Longer text is cut to the first MAX_STORED_CHARS characters
    and a ``"\\n[... truncated ...]"`` marker is appended.
    """
    cleaned = text.strip()
    if len(cleaned) <= MAX_STORED_CHARS:
        return cleaned
    return cleaned[:MAX_STORED_CHARS] + "\n[... truncated ...]"