# kite/backend/tests/test_extractor.py

import pytest
from pathlib import Path
from services.extractor import extract_text


def test_extract_txt(tmp_path):
    """Check that text written to a UTF-8 .txt file is returned by extract_text."""
    sample_file = tmp_path / "test.txt"
    sample_file.write_text(
        "Hello world this is a test document.",
        encoding="utf-8",
    )

    extracted = extract_text(str(sample_file), "text/plain")

    assert "Hello world" in extracted


def test_extract_pdf(tmp_path):
    """Check that text drawn onto a generated one-page PDF is returned by extract_text.

    The PDF is built on the fly in ``tmp_path``, so no binary fixture file
    is needed.
    """
    import fitz

    pdf_path = tmp_path / "test.pdf"

    doc = fitz.open()  # no arguments: start a new, empty document
    try:
        page = doc.new_page()
        page.insert_text((50, 50), "PDF content about legal contracts.")
        doc.save(str(pdf_path))
    finally:
        # Release the document even when building or saving it fails, so a
        # failing setup step does not leave the handle open.
        doc.close()

    text = extract_text(str(pdf_path), "application/pdf")
    assert "PDF content" in text


def test_extract_docx(tmp_path):
    """Check that a paragraph saved to a generated .docx is returned by extract_text."""
    from docx import Document

    docx_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    target = tmp_path / "test.docx"

    # Build the fixture document in the temp directory.
    word_doc = Document()
    word_doc.add_paragraph("DOCX paragraph about financial reports.")
    word_doc.save(str(target))

    extracted = extract_text(str(target), docx_mime)
    assert "DOCX paragraph" in extracted


def test_extract_unknown_falls_back_to_text(tmp_path):
    """Check that extracting with the ``text/csv`` MIME type returns the file's text."""
    # NOTE(review): the test name implies "text/csv" has no dedicated handler
    # in services.extractor and is read as plain text — confirm there.
    csv_file = tmp_path / "test.csv"
    csv_file.write_text("col1,col2\nval1,val2", encoding="utf-8")

    extracted = extract_text(str(csv_file), "text/csv")

    assert "col1" in extracted


def test_extract_truncation(tmp_path):
    """Check that a 60,000-character file comes back capped and marked as truncated."""
    oversized = tmp_path / "big.txt"
    payload = "A" * 60_000
    oversized.write_text(payload, encoding="utf-8")

    extracted = extract_text(str(oversized), "text/plain")

    # 50k of content plus some headroom for the truncation marker.
    size_ceiling = 50_100
    assert len(extracted) <= size_ceiling
    assert "truncated" in extracted