7a34807fa0
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
53 lines
1.5 KiB
Python
53 lines
1.5 KiB
Python
import pytest
|
|
from pathlib import Path
|
|
from services.extractor import extract_text
|
|
|
|
|
|
def test_extract_txt(tmp_path):
    """Check that a UTF-8 text file passed as ``text/plain`` is extracted.

    Writes a short sentence to a temporary ``.txt`` file and asserts that the
    extracted text contains its opening words.
    """
    txt_file = tmp_path / "test.txt"
    txt_file.write_text("Hello world this is a test document.", encoding="utf-8")

    extracted = extract_text(str(txt_file), "text/plain")

    assert "Hello world" in extracted
|
|
|
|
|
|
def test_extract_pdf(tmp_path):
    """Check that text drawn onto a PDF page is extracted.

    Builds a one-page document with ``fitz``, inserts a line of text, saves it
    under ``tmp_path`` and asserts that ``extract_text`` called with the
    ``application/pdf`` MIME type returns text containing that line's prefix.
    """
    # Imported locally, as in the original test, so the module can be
    # imported even when only this test needs fitz.
    import fitz

    pdf_path = tmp_path / "test.pdf"

    doc = fitz.open()
    try:
        page = doc.new_page()
        page.insert_text((50, 50), "PDF content about legal contracts.")
        doc.save(str(pdf_path))
    finally:
        # Close the document even when building or saving raises, so a failing
        # setup step does not leak the open document handle.
        doc.close()

    text = extract_text(str(pdf_path), "application/pdf")

    assert "PDF content" in text
|
|
|
|
|
|
def test_extract_docx(tmp_path):
    """Check that a paragraph written into a DOCX file is extracted.

    Creates a document with a single paragraph via ``docx.Document``, saves it
    under ``tmp_path`` and asserts that the extracted text contains the
    paragraph's opening words.
    """
    from docx import Document

    docx_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )

    document = Document()
    document.add_paragraph("DOCX paragraph about financial reports.")
    target = tmp_path / "test.docx"
    document.save(str(target))

    extracted = extract_text(str(target), docx_mime)

    assert "DOCX paragraph" in extracted
|
|
|
|
|
|
def test_extract_unknown_falls_back_to_text(tmp_path):
    """Check that a ``text/csv`` file still yields its raw content.

    Writes a two-line CSV file and asserts that the header cell ``col1``
    appears in the text returned by ``extract_text``.
    """
    csv_file = tmp_path / "test.csv"
    csv_file.write_text("col1,col2\nval1,val2", encoding="utf-8")

    extracted = extract_text(str(csv_file), "text/csv")

    assert "col1" in extracted
|
|
|
|
|
|
def test_extract_truncation(tmp_path):
    """Check that oversized text input is cut down and marked as truncated.

    Writes 60,000 characters to a text file and asserts that the extracted
    text is at most 50,100 characters long and contains the word
    ``truncated``.
    """
    oversized_file = tmp_path / "big.txt"
    payload = "A" * 60_000
    oversized_file.write_text(payload, encoding="utf-8")

    extracted = extract_text(str(oversized_file), "text/plain")

    # 50k characters of content plus headroom for the truncation marker.
    max_expected_length = 50_100
    assert len(extracted) <= max_expected_length
    assert "truncated" in extracted
|