"""Tests for ``services.extractor.extract_text`` across supported input types."""

from pathlib import Path

import pytest

from services.extractor import extract_text


def test_extract_txt(tmp_path: Path) -> None:
    """A UTF-8 plain-text file yields its contents."""
    p = tmp_path / "test.txt"
    p.write_text("Hello world this is a test document.", encoding="utf-8")

    text = extract_text(str(p), "text/plain")

    assert "Hello world" in text


def test_extract_pdf(tmp_path: Path) -> None:
    """Text inserted into a generated one-page PDF is found in the output."""
    import fitz  # PyMuPDF

    pdf_path = tmp_path / "test.pdf"
    doc = fitz.open()
    try:
        page = doc.new_page()
        page.insert_text((50, 50), "PDF content about legal contracts.")
        doc.save(str(pdf_path))
    finally:
        # Close the document even if building or saving the fixture fails.
        doc.close()

    text = extract_text(str(pdf_path), "application/pdf")

    assert "PDF content" in text


def test_extract_docx(tmp_path: Path) -> None:
    """A paragraph written to a generated DOCX file is found in the output."""
    from docx import Document

    doc = Document()
    doc.add_paragraph("DOCX paragraph about financial reports.")
    docx_path = tmp_path / "test.docx"
    doc.save(str(docx_path))

    text = extract_text(
        str(docx_path),
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    assert "DOCX paragraph" in text


def test_extract_unknown_falls_back_to_text(tmp_path: Path) -> None:
    """A ``text/csv`` file still yields its raw textual contents."""
    p = tmp_path / "test.csv"
    p.write_text("col1,col2\nval1,val2", encoding="utf-8")

    text = extract_text(str(p), "text/csv")

    assert "col1" in text


def test_extract_truncation(tmp_path: Path) -> None:
    """A 60,000-character input comes back capped and flagged as truncated."""
    p = tmp_path / "big.txt"
    p.write_text("A" * 60_000, encoding="utf-8")

    text = extract_text(str(p), "text/plain")

    # 50k + truncation marker
    assert len(text) <= 50_100
    assert "truncated" in text