Add doc-service tests, AI category suggestions, LM Studio default

- pytest suite for doc-service: 20+ tests covering category CRUD,
  document upload/get/delete/patch, ownership isolation, category
  assignment, AI processing (mock), and live PDF tests (auto-skipped
  when tests/pdfs/ is empty)
- Minimal in-memory PDF builder in conftest so tests run without any
  fixture files; real PDFs can be dropped into tests/pdfs/ to activate
  live extraction tests
- AI prompt updated to return suggested_categories (2–5 short names)
- Frontend: SuggestionChip component in DocumentRow shows AI-suggested
  categories after processing; "Assign" links to an existing category,
  "Create & Assign" creates it first, ✕ dismisses locally
- Default AI provider changed to LM Studio at
  http://host.docker.internal:1234/v1 (host.docker.internal resolves
  to the macOS host from inside Docker Desktop)
- tests/pdfs/ directory tracked via .gitkeep; *.pdf excluded by .gitignore

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-04-14 11:27:57 +02:00
parent b8238e03ea
commit 1cdc532fff
12 changed files with 755 additions and 75 deletions
@@ -0,0 +1,244 @@
"""
Tests for the /documents endpoints.
Synthetic tests (using the minimal_pdf / invoice_pdf fixtures) always run.
Live tests that use real PDFs from tests/pdfs/ are skipped when that
directory is empty — drop any PDF there to activate them.
"""
import io
import json
from pathlib import Path
from unittest.mock import patch
import pytest
pytestmark = pytest.mark.asyncio
# ── Helpers ────────────────────────────────────────────────────────────────────
def _pdf_upload(filename: str, data: bytes):
return {"file": (filename, io.BytesIO(data), "application/pdf")}
# ── List / empty state ─────────────────────────────────────────────────────────
async def test_list_documents_empty(client):
    """GET /documents answers 200 with an empty JSON list when nothing was uploaded."""
    response = await client.get("/documents")
    assert response.status_code == 200
    listing = response.json()
    assert listing == []
# ── Upload ─────────────────────────────────────────────────────────────────────
async def test_upload_returns_202(client, minimal_pdf):
    """A PDF upload is accepted with 202 and reported as ``pending``."""
    files = _pdf_upload("test.pdf", minimal_pdf)
    # process_document is replaced by a mock for the duration of the request.
    with patch("app.routers.documents.process_document"):
        response = await client.post("/documents/upload", files=files)
    assert response.status_code == 202
    body = response.json()
    assert body["filename"] == "test.pdf"
    assert body["status"] == "pending"
    assert "id" in body
async def test_upload_non_pdf_rejected(client):
    """Uploading a ``text/plain`` file to the upload endpoint is refused with 415."""
    text_part = ("note.txt", io.BytesIO(b"hello"), "text/plain")
    response = await client.post("/documents/upload", files={"file": text_part})
    assert response.status_code == 415
async def test_upload_file_too_large(client):
    """A PDF-prefixed payload of slightly more than 21 MiB is refused with 413."""
    filler = b"x" * (21 * 1024 * 1024)
    oversized = b"%PDF-1.4\n" + filler
    # process_document is replaced by a mock for the duration of the request.
    with patch("app.routers.documents.process_document"):
        response = await client.post(
            "/documents/upload",
            files=_pdf_upload("big.pdf", oversized),
        )
    assert response.status_code == 413
# ── Get / status ───────────────────────────────────────────────────────────────
async def test_get_document(client, minimal_pdf):
    """An uploaded document can be fetched back by the id the upload returned."""
    with patch("app.routers.documents.process_document"):
        uploaded = await client.post(
            "/documents/upload",
            files=_pdf_upload("get.pdf", minimal_pdf),
        )
    document_id = uploaded.json()["id"]
    fetched = await client.get(f"/documents/{document_id}")
    assert fetched.status_code == 200
    assert fetched.json()["id"] == document_id
async def test_get_document_not_found(client):
    """Fetching an id that was never uploaded answers 404."""
    response = await client.get("/documents/nonexistent-id")
    assert response.status_code == 404
async def test_get_document_status(client, minimal_pdf):
    """The status endpoint reports ``pending`` for an upload whose processing is mocked."""
    with patch("app.routers.documents.process_document"):
        uploaded = await client.post(
            "/documents/upload",
            files=_pdf_upload("status.pdf", minimal_pdf),
        )
    document_id = uploaded.json()["id"]
    status_response = await client.get(f"/documents/{document_id}/status")
    assert status_response.status_code == 200
    assert status_response.json()["status"] == "pending"
async def test_other_user_cannot_see_document(client, other_client, minimal_pdf):
    """A document uploaded via ``client`` is reported as 404 to ``other_client``."""
    with patch("app.routers.documents.process_document"):
        uploaded = await client.post(
            "/documents/upload",
            files=_pdf_upload("private.pdf", minimal_pdf),
        )
    document_id = uploaded.json()["id"]
    foreign_view = await other_client.get(f"/documents/{document_id}")
    assert foreign_view.status_code == 404
# ── Patch document type ────────────────────────────────────────────────────────
async def test_patch_document_type(client, minimal_pdf):
    """PATCHing ``/type`` with a new ``document_type`` answers 200 and echoes it."""
    with patch("app.routers.documents.process_document"):
        uploaded = await client.post(
            "/documents/upload",
            files=_pdf_upload("patch.pdf", minimal_pdf),
        )
    document_id = uploaded.json()["id"]
    payload = {"document_type": "receipt"}
    patched = await client.patch(f"/documents/{document_id}/type", json=payload)
    assert patched.status_code == 200
    assert patched.json()["document_type"] == "receipt"
# ── Delete ─────────────────────────────────────────────────────────────────────
async def test_delete_document(client, minimal_pdf):
    """Deleting a document answers 204, after which fetching it answers 404."""
    with patch("app.routers.documents.process_document"):
        uploaded = await client.post(
            "/documents/upload",
            files=_pdf_upload("del.pdf", minimal_pdf),
        )
    document_url = f"/documents/{uploaded.json()['id']}"
    deleted = await client.delete(document_url)
    assert deleted.status_code == 204
    refetched = await client.get(document_url)
    assert refetched.status_code == 404
async def test_delete_document_not_found(client):
    """Deleting an id that was never uploaded answers 404."""
    response = await client.delete("/documents/nonexistent-id")
    assert response.status_code == 404
async def test_other_user_cannot_delete_document(client, other_client, minimal_pdf):
    """``other_client`` gets 404 when deleting a document uploaded via ``client``."""
    with patch("app.routers.documents.process_document"):
        uploaded = await client.post(
            "/documents/upload",
            files=_pdf_upload("owned.pdf", minimal_pdf),
        )
    document_url = f"/documents/{uploaded.json()['id']}"
    foreign_delete = await other_client.delete(document_url)
    assert foreign_delete.status_code == 404
    # The uploader must still be able to fetch the document afterwards.
    owner_view = await client.get(document_url)
    assert owner_view.status_code == 200
# ── Category assignment ────────────────────────────────────────────────────────
async def test_assign_and_remove_category(client, minimal_pdf):
    """A category can be attached to a document and then detached again.

    After the attach call (204) the document lists the category by name;
    after the detach call (204) the document's category list is empty.
    """
    with patch("app.routers.documents.process_document"):
        uploaded = await client.post(
            "/documents/upload",
            files=_pdf_upload("cat.pdf", minimal_pdf),
        )
    document_id = uploaded.json()["id"]
    created = await client.post("/categories", json={"name": "Ops"})
    category_id = created.json()["id"]
    link_url = f"/documents/{document_id}/categories/{category_id}"

    attached = await client.post(link_url)
    assert attached.status_code == 204
    after_attach = await client.get(f"/documents/{document_id}")
    assigned_names = {entry["name"] for entry in after_attach.json()["categories"]}
    assert "Ops" in assigned_names

    detached = await client.delete(link_url)
    assert detached.status_code == 204
    after_detach = await client.get(f"/documents/{document_id}")
    assert after_detach.json()["categories"] == []
async def test_assign_category_idempotent(client, minimal_pdf):
    """Attaching the same category a second time still answers 204."""
    with patch("app.routers.documents.process_document"):
        uploaded = await client.post(
            "/documents/upload",
            files=_pdf_upload("idem.pdf", minimal_pdf),
        )
    document_id = uploaded.json()["id"]
    created = await client.post("/categories", json={"name": "Idem"})
    category_id = created.json()["id"]
    link_url = f"/documents/{document_id}/categories/{category_id}"
    await client.post(link_url)
    # The repeated assignment must be accepted rather than rejected as a duplicate.
    repeated = await client.post(link_url)
    assert repeated.status_code == 204
async def test_cannot_assign_other_users_category(client, other_client, minimal_pdf):
    """Attaching a category created via ``other_client`` to ``client``'s document answers 404."""
    with patch("app.routers.documents.process_document"):
        uploaded = await client.post(
            "/documents/upload",
            files=_pdf_upload("x.pdf", minimal_pdf),
        )
    document_id = uploaded.json()["id"]
    # The category is created through the second client ...
    foreign = await other_client.post("/categories", json={"name": "Foreign"})
    foreign_category_id = foreign.json()["id"]
    # ... and the document's uploader then tries to attach it.
    attempt = await client.post(
        f"/documents/{document_id}/categories/{foreign_category_id}"
    )
    assert attempt.status_code == 404
# ── AI processing integration (with mock AI) ──────────────────────────────────
async def test_processing_sets_extracted_data(client, invoice_pdf, mock_ai):
    """Upload an invoice and verify that processing populates ``extracted_data``.

    ``process_document`` is not patched in this test, so the status endpoint
    is polled (at most 20 times, 0.1 s apart) until the document reports
    ``done`` or ``failed``. The finished document must be typed ``invoice``
    and carry JSON ``extracted_data`` with the expected vendor name and a
    non-empty ``suggested_categories`` list.

    NOTE(review): the expected values ("invoice", "ACME Corp") presumably
    come from the ``mock_ai`` fixture — confirm against conftest.
    """
    import asyncio

    upload = await client.post(
        "/documents/upload",
        files=_pdf_upload("invoice.pdf", invoice_pdf),
    )
    assert upload.status_code == 202
    doc_id = upload.json()["id"]

    # Per the original author's note, background tasks run once the response
    # has been sent in the test context; poll briefly until a terminal status.
    terminal_statuses = ("done", "failed")
    for _ in range(20):
        status_r = await client.get(f"/documents/{doc_id}/status")
        if status_r.json()["status"] in terminal_statuses:
            break
        await asyncio.sleep(0.1)

    doc = (await client.get(f"/documents/{doc_id}")).json()
    # Surface the reported error on failure/timeout, as the live-PDF test does,
    # instead of a bare status mismatch.
    assert doc["status"] == "done", (
        f"invoice.pdf ended with status '{doc['status']}': "
        f"{doc.get('error_message')}"
    )
    assert doc["document_type"] == "invoice"
    assert doc["extracted_data"] is not None

    # extracted_data is returned as a JSON-encoded string.
    extracted = json.loads(doc["extracted_data"])
    assert extracted["vendor_name"] == "ACME Corp"
    assert "suggested_categories" in extracted
    suggestions = extracted["suggested_categories"]
    assert isinstance(suggestions, list)
    assert len(suggestions) > 0
# ── Live tests (require real PDFs in tests/pdfs/) ─────────────────────────────
async def test_live_upload_real_pdf(client, real_pdfs, mock_ai):
    """Upload every path in ``real_pdfs`` and require each to end as ``done``.

    For each file the upload must answer 202; the status endpoint is then
    polled (at most 30 times, 0.2 s apart) until it reports ``done`` or
    ``failed``. A file that does not end as ``done`` fails the test with its
    final status and reported ``error_message``.
    """
    import asyncio

    terminal_statuses = ("done", "failed")
    for pdf_file in real_pdfs:
        payload = pdf_file.read_bytes()
        upload = await client.post(
            "/documents/upload",
            files=_pdf_upload(pdf_file.name, payload),
        )
        assert upload.status_code == 202, f"Upload failed for {pdf_file.name}: {upload.text}"
        doc_id = upload.json()["id"]

        attempts_left = 30
        while attempts_left > 0:
            attempts_left -= 1
            polled = await client.get(f"/documents/{doc_id}/status")
            if polled.json()["status"] in terminal_statuses:
                break
            await asyncio.sleep(0.2)

        outcome = (await client.get(f"/documents/{doc_id}")).json()
        assert outcome["status"] == "done", (
            f"{pdf_file.name} ended with status '{outcome['status']}': "
            f"{outcome.get('error_message')}"
        )