chore: initial commit — existing single-user document scanner codebase

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-05-22 08:53:28 +02:00
parent 6fed5ba531
commit 7a34807fa0
71 changed files with 16408 additions and 0 deletions
+52
View File
@@ -0,0 +1,52 @@
import pytest
from pathlib import Path
from services.extractor import extract_text
def test_extract_txt(tmp_path):
    """Check that extract_text returns the content of a UTF-8 text file."""
    content = "Hello world this is a test document."
    source_file = tmp_path / "test.txt"
    source_file.write_text(content, encoding="utf-8")

    extracted = extract_text(str(source_file), "text/plain")

    assert "Hello world" in extracted
def test_extract_pdf(tmp_path):
    """Check that extract_text returns the text of a generated one-page PDF.

    The PDF is built in the test itself with ``fitz`` (imported locally)
    and saved under ``tmp_path`` before being handed to ``extract_text``.
    """
    import fitz

    pdf_path = tmp_path / "test.pdf"
    doc = fitz.open()
    try:
        page = doc.new_page()
        page.insert_text((50, 50), "PDF content about legal contracts.")
        doc.save(str(pdf_path))
    finally:
        # Close the document even when building or saving it raises, so a
        # failure in the setup does not leave the document open.
        doc.close()

    text = extract_text(str(pdf_path), "application/pdf")

    assert "PDF content" in text
def test_extract_docx(tmp_path):
    """Check that extract_text returns the paragraph text of a DOCX file."""
    from docx import Document

    docx_mime = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )
    word_doc = Document()
    word_doc.add_paragraph("DOCX paragraph about financial reports.")
    target = tmp_path / "test.docx"
    word_doc.save(str(target))

    extracted = extract_text(str(target), docx_mime)

    assert "DOCX paragraph" in extracted
def test_extract_unknown_falls_back_to_text(tmp_path):
    """Check that a ``text/csv`` file still yields its raw textual content."""
    csv_file = tmp_path / "test.csv"
    csv_file.write_text("col1,col2\nval1,val2", encoding="utf-8")

    extracted = extract_text(str(csv_file), "text/csv")

    assert "col1" in extracted
def test_extract_truncation(tmp_path):
    """Check that a 60,000-character file comes back shortened and marked."""
    oversized = tmp_path / "big.txt"
    oversized.write_text("A" * 60_000, encoding="utf-8")

    extracted = extract_text(str(oversized), "text/plain")

    # 50k + truncation marker
    assert len(extracted) <= 50_100
    assert "truncated" in extracted