chore: initial commit — existing single-user document scanner codebase
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,52 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from services.extractor import extract_text
|
||||
|
||||
|
||||
def test_extract_txt(tmp_path):
    """Check that a ``text/plain`` file's content shows up in the extraction.

    Writes a short UTF-8 sentence to a temporary ``.txt`` file, runs
    ``extract_text`` on it, and asserts the leading words are present in
    the returned text.
    """
    sample_file = tmp_path / "test.txt"
    sample_file.write_text(
        "Hello world this is a test document.",
        encoding="utf-8",
    )

    extracted = extract_text(str(sample_file), "text/plain")

    assert "Hello world" in extracted
|
||||
|
||||
|
||||
def test_extract_pdf(tmp_path):
    """Check that text drawn onto a generated PDF page is extracted.

    Builds a one-page PDF with PyMuPDF (``fitz``), saves it under the
    temporary directory, and asserts that ``extract_text`` returns text
    containing the inserted phrase.
    """
    # Imported locally, as in the original, so the dependency is only
    # needed when this particular test runs.
    import fitz

    # Assemble the fixture document in memory, then persist and release it
    # before handing the path to the extractor.
    pdf = fitz.open()
    first_page = pdf.new_page()
    first_page.insert_text((50, 50), "PDF content about legal contracts.")
    target = tmp_path / "test.pdf"
    pdf.save(str(target))
    pdf.close()

    extracted = extract_text(str(target), "application/pdf")

    assert "PDF content" in extracted
|
||||
|
||||
|
||||
def test_extract_docx(tmp_path):
    """Check that a paragraph in a generated DOCX file is extracted.

    Creates a single-paragraph document with python-docx, saves it under
    the temporary directory, and asserts that ``extract_text`` returns text
    containing the paragraph's opening words.
    """
    # Imported locally, as in the original, so the dependency is only
    # needed when this particular test runs.
    from docx import Document

    word_doc = Document()
    word_doc.add_paragraph("DOCX paragraph about financial reports.")
    target = tmp_path / "test.docx"
    word_doc.save(str(target))

    docx_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    extracted = extract_text(str(target), docx_mime)

    assert "DOCX paragraph" in extracted
|
||||
|
||||
|
||||
def test_extract_unknown_falls_back_to_text(tmp_path):
    """Check that a ``text/csv`` file still yields its textual content.

    Per the test name, this MIME type is expected to be handled by a
    plain-text fallback in ``extract_text``; the test itself only verifies
    that the first header cell appears in the returned text.
    """
    csv_file = tmp_path / "test.csv"
    csv_file.write_text("col1,col2\nval1,val2", encoding="utf-8")

    extracted = extract_text(str(csv_file), "text/csv")

    assert "col1" in extracted
|
||||
|
||||
|
||||
def test_extract_truncation(tmp_path):
    """Check that oversized input is cut down and flagged as truncated.

    Writes 60,000 characters to a temporary text file and asserts that the
    text returned by ``extract_text`` is at most 50,100 characters long and
    contains the word ``truncated``.
    """
    oversized_file = tmp_path / "big.txt"
    payload = "A" * 60_000
    oversized_file.write_text(payload, encoding="utf-8")

    extracted = extract_text(str(oversized_file), "text/plain")

    # 50k of content plus headroom for the truncation marker.
    assert len(extracted) <= 50_100
    assert "truncated" in extracted
|
||||
Reference in New Issue
Block a user