chore: initial commit — existing single-user document scanner codebase

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-05-22 08:53:28 +02:00
parent 6fed5ba531
commit 7a34807fa0
71 changed files with 16408 additions and 0 deletions
+17
View File
@@ -0,0 +1,17 @@
# Base image: the slim variant of the official Python 3.12 image.
FROM python:3.12-slim
WORKDIR /app
# System deps for PyMuPDF + OCR
RUN apt-get update && apt-get install -y \
tesseract-ocr \
libgl1 \
libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/*
# Install Python requirements before copying the sources so this layer is
# reused when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
# NOTE(review): no CMD/ENTRYPOINT is visible in this excerpt; presumably the
# server is started on this port by compose or a later line - confirm.
EXPOSE 8000
+36
View File
@@ -0,0 +1,36 @@
from ai.base import AIProvider, ClassificationResult
from ai.anthropic_provider import AnthropicProvider
from ai.openai_provider import OpenAIProvider
from ai.ollama_provider import OllamaProvider
from ai.lmstudio_provider import LMStudioProvider
def get_provider(settings: dict) -> AIProvider:
    """Instantiate the AI provider named by ``settings["active_provider"]``.

    The provider's own options are read from
    ``settings["providers"][<name>]``; every missing option falls back to the
    default spelled out below. When no active provider is configured,
    ``"lmstudio"`` is used.

    Raises:
        ValueError: If the active provider name is not ``anthropic``,
            ``openai``, ``ollama`` or ``lmstudio``.
    """
    name = settings.get("active_provider", "lmstudio")
    all_configs = settings.get("providers", {})
    config = all_configs.get(name, {})

    if name == "anthropic":
        return AnthropicProvider(
            api_key=config.get("api_key", ""),
            model=config.get("model", "claude-sonnet-4-6"),
        )
    if name == "openai":
        return OpenAIProvider(
            api_key=config.get("api_key", ""),
            model=config.get("model", "gpt-4o"),
            # An empty string is normalised to None.
            base_url=config.get("base_url") or None,
        )
    if name == "ollama":
        return OllamaProvider(
            base_url=config.get("base_url", "http://host.docker.internal:11434"),
            model=config.get("model", "llama3.2"),
        )
    if name == "lmstudio":
        return LMStudioProvider(
            base_url=config.get("base_url", "http://host.docker.internal:1234"),
            model=config.get("model", "gemma-4-e4b-it"),
        )
    raise ValueError(f"Unknown AI provider: {name}")
+103
View File
@@ -0,0 +1,103 @@
import json
import re
import anthropic
from ai.base import AIProvider, ClassificationResult
# Upper bound on how many characters of document text are put into a prompt.
MAX_AI_CHARS = 8_000
class AnthropicProvider(AIProvider):
    """AI provider that talks to the Anthropic Messages API."""

    def __init__(self, api_key: str, model: str = "claude-sonnet-4-6"):
        """Store the API key and the model name used for every request."""
        self._api_key = api_key
        self._model = model

    def _client(self):
        """Return a new async Anthropic client built from the stored key."""
        return anthropic.AsyncAnthropic(api_key=self._api_key)

    @staticmethod
    def _response_text(response) -> str:
        """Concatenate the text of every text block in *response*.

        Returns an empty string when the response has no content or no block
        carries a string ``text`` attribute, instead of raising
        ``IndexError``/``AttributeError`` on ``content[0].text``.
        """
        parts = []
        for block in response.content or []:
            text = getattr(block, "text", None)
            if isinstance(text, str):
                parts.append(text)
        return "".join(parts)

    async def classify(
        self,
        document_text: str,
        existing_topics: list[str],
        system_prompt: str,
    ) -> ClassificationResult:
        """Ask the model to classify *document_text*.

        The user message lists *existing_topics* followed by at most
        ``MAX_AI_CHARS`` characters of the document. The reply is parsed by
        ``_parse_classification``.
        """
        topics_str = ", ".join(existing_topics) if existing_topics else "(none yet)"
        user_msg = (
            f"Existing topics: [{topics_str}]\n\n"
            f"Document text:\n{document_text[:MAX_AI_CHARS]}"
        )
        client = self._client()
        response = await client.messages.create(
            model=self._model,
            max_tokens=1024,
            system=system_prompt,
            messages=[{"role": "user", "content": user_msg}],
        )
        return _parse_classification(self._response_text(response))

    async def suggest_topics(
        self,
        document_text: str,
        system_prompt: str,
    ) -> list[str]:
        """Ask the model for topic-name suggestions for *document_text*.

        The reply is parsed by ``_parse_suggestions``.
        """
        user_msg = (
            "Suggest 3-5 topic names for this document. "
            "Return ONLY valid JSON: {\"suggested_topics\": [\"topic1\", \"topic2\"]}\n\n"
            f"Document text:\n{document_text[:MAX_AI_CHARS]}"
        )
        client = self._client()
        response = await client.messages.create(
            model=self._model,
            max_tokens=256,
            system=system_prompt,
            messages=[{"role": "user", "content": user_msg}],
        )
        return _parse_suggestions(self._response_text(response))

    async def health_check(self) -> bool:
        """Return True when a minimal "ping" request succeeds, else False."""
        try:
            client = self._client()
            await client.messages.create(
                model=self._model,
                max_tokens=5,
                messages=[{"role": "user", "content": "ping"}],
            )
            return True
        except Exception:
            # Any failure (auth, network, unknown model, ...) means "unhealthy".
            return False
def _strip_code_fences(text: str) -> str:
    """Drop Markdown code-fence markers from *text* and trim the result.

    A fence may be followed by a ``json`` tag and whitespace; those are
    removed together with the backticks.
    """
    for fence_pattern in (r"```(?:json)?\s*", r"```"):
        text = re.sub(fence_pattern, "", text)
    return text.strip()
def _parse_classification(raw: str) -> ClassificationResult:
    """Parse a model reply into a ``ClassificationResult``.

    The first ``{`` through the last ``}`` of the fence-stripped reply is
    decoded as JSON. An empty result is returned when no JSON object is found
    or it cannot be decoded. Field values are validated so that the result
    always holds lists of strings and a string ``reasoning``.
    """
    raw = _strip_code_fences(raw)
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if match is None:
        return ClassificationResult()
    try:
        data = json.loads(match.group())
    except json.JSONDecodeError:
        return ClassificationResult()

    def _string_list(value) -> list[str]:
        # Models sometimes emit null, a bare string or mixed-type arrays; a
        # bare string must not be treated as a sequence of characters.
        if isinstance(value, str):
            return [value]
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, str)]

    reasoning = data.get("reasoning", "")
    return ClassificationResult(
        topics=_string_list(data.get("assigned_topics")),
        suggested_new_topics=_string_list(data.get("new_topic_suggestions")),
        reasoning=reasoning if isinstance(reasoning, str) else "",
    )
def _parse_suggestions(raw: str) -> list[str]:
    """Extract the ``suggested_topics`` list from a model reply.

    Returns an empty list when the fence-stripped reply contains no decodable
    JSON object or the field is missing. The result is always a list of
    strings: a bare string becomes a one-element list, and null, other types
    and non-string items are dropped.
    """
    raw = _strip_code_fences(raw)
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if match is None:
        return []
    try:
        data = json.loads(match.group())
    except json.JSONDecodeError:
        return []
    suggestions = data.get("suggested_topics")
    if isinstance(suggestions, str):
        return [suggestions]
    if not isinstance(suggestions, list):
        return []
    return [item for item in suggestions if isinstance(item, str)]
+32
View File
@@ -0,0 +1,32 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
@dataclass
class ClassificationResult:
    """Result of one classification request; every field defaults to empty."""

    # Topic names assigned to the document.
    topics: list[str] = field(default_factory=list)
    # Additional topic names proposed for creation.
    suggested_new_topics: list[str] = field(default_factory=list)
    # Free-text explanation; empty when none was supplied.
    reasoning: str = ""
class AIProvider(ABC):
    """Abstract interface that every AI backend must implement."""

    @abstractmethod
    async def classify(
        self,
        document_text: str,
        existing_topics: list[str],
        system_prompt: str,
    ) -> ClassificationResult:
        """Classify *document_text* given *existing_topics* and a system prompt."""
        ...

    @abstractmethod
    async def suggest_topics(
        self,
        document_text: str,
        system_prompt: str,
    ) -> list[str]:
        """Return suggested topic names for *document_text*."""
        ...

    @abstractmethod
    async def health_check(self) -> bool:
        """Return whether the backend is reachable and usable."""
        ...
+10
View File
@@ -0,0 +1,10 @@
from ai.openai_provider import OpenAIProvider
class LMStudioProvider(OpenAIProvider):
    """LM Studio backend, reached through the OpenAI-compatible client."""

    def __init__(self, base_url: str = "http://host.docker.internal:1234", model: str = "gemma-4-e4b-it"):
        """Configure the OpenAI client for an LM Studio server.

        *base_url* may be given with or without a trailing slash and with or
        without the ``/v1`` path; the ``/v1`` suffix is appended only when it
        is not already present, so a pasted full endpoint does not end up as
        ``.../v1/v1``.
        """
        root = base_url.rstrip("/")
        if not root.endswith("/v1"):
            root += "/v1"
        super().__init__(
            # Fixed placeholder key passed to the OpenAI client.
            api_key="lm-studio",
            model=model,
            base_url=root,
        )
+10
View File
@@ -0,0 +1,10 @@
from ai.openai_provider import OpenAIProvider
class OllamaProvider(OpenAIProvider):
    """Ollama backend, reached through the OpenAI-compatible client."""

    def __init__(self, base_url: str = "http://host.docker.internal:11434", model: str = "llama3.2"):
        """Configure the OpenAI client for an Ollama server.

        *base_url* may be given with or without a trailing slash and with or
        without the ``/v1`` path; the ``/v1`` suffix is appended only when it
        is not already present, so a pasted full endpoint does not end up as
        ``.../v1/v1``.
        """
        root = base_url.rstrip("/")
        if not root.endswith("/v1"):
            root += "/v1"
        super().__init__(
            # Fixed placeholder key passed to the OpenAI client.
            api_key="ollama",
            model=model,
            base_url=root,
        )
+104
View File
@@ -0,0 +1,104 @@
import json
import re
from openai import AsyncOpenAI
from ai.base import AIProvider, ClassificationResult
# Upper bound on how many characters of document text are put into a prompt.
MAX_AI_CHARS = 8_000
class OpenAIProvider(AIProvider):
    """AI provider for the OpenAI chat-completions API (or a compatible server)."""

    def __init__(self, api_key: str, model: str = "gpt-4o", base_url: str | None = None):
        """Remember credentials, model name and optional endpoint override."""
        self._api_key = api_key
        self._model = model
        self._base_url = base_url

    def _client(self) -> AsyncOpenAI:
        """Build a new async client; an empty key is replaced by "placeholder"."""
        return AsyncOpenAI(api_key=self._api_key or "placeholder", base_url=self._base_url)

    async def _ask(self, system_prompt: str, user_msg: str, max_tokens: int) -> str:
        """Send one system+user exchange and return the reply text ("" if None)."""
        completion = await self._client().chat.completions.create(
            model=self._model,
            max_tokens=max_tokens,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
            ],
        )
        return completion.choices[0].message.content or ""

    async def classify(
        self,
        document_text: str,
        existing_topics: list[str],
        system_prompt: str,
    ) -> ClassificationResult:
        """Classify *document_text*; the reply is parsed by ``_parse_classification``."""
        if existing_topics:
            topics_str = ", ".join(existing_topics)
        else:
            topics_str = "(none yet)"
        user_msg = (
            f"Existing topics: [{topics_str}]\n\n"
            f"Document text:\n{document_text[:MAX_AI_CHARS]}"
        )
        reply = await self._ask(system_prompt, user_msg, 1024)
        return _parse_classification(reply)

    async def suggest_topics(
        self,
        document_text: str,
        system_prompt: str,
    ) -> list[str]:
        """Request topic suggestions; the reply is parsed by ``_parse_suggestions``."""
        user_msg = (
            "Suggest 3-5 topic names for this document. "
            "Return ONLY valid JSON: {\"suggested_topics\": [\"topic1\", \"topic2\"]}\n\n"
            f"Document text:\n{document_text[:MAX_AI_CHARS]}"
        )
        reply = await self._ask(system_prompt, user_msg, 256)
        return _parse_suggestions(reply)

    async def health_check(self) -> bool:
        """Return True when a minimal "ping" completion succeeds, else False."""
        ping = [{"role": "user", "content": "ping"}]
        try:
            await self._client().chat.completions.create(
                model=self._model,
                max_tokens=5,
                messages=ping,
            )
        except Exception:
            return False
        return True
def _strip_code_fences(text: str) -> str:
    """Drop Markdown code-fence markers from *text* and trim the result.

    A fence may be followed by a ``json`` tag and whitespace; those are
    removed together with the backticks.
    """
    for fence_pattern in (r"```(?:json)?\s*", r"```"):
        text = re.sub(fence_pattern, "", text)
    return text.strip()
def _parse_classification(raw: str) -> ClassificationResult:
    """Parse a model reply into a ``ClassificationResult``.

    The first ``{`` through the last ``}`` of the fence-stripped reply is
    decoded as JSON. An empty result is returned when no JSON object is found
    or it cannot be decoded. Field values are validated so that the result
    always holds lists of strings and a string ``reasoning``.
    """
    raw = _strip_code_fences(raw)
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if match is None:
        return ClassificationResult()
    try:
        data = json.loads(match.group())
    except json.JSONDecodeError:
        return ClassificationResult()

    def _string_list(value) -> list[str]:
        # Models sometimes emit null, a bare string or mixed-type arrays; a
        # bare string must not be treated as a sequence of characters.
        if isinstance(value, str):
            return [value]
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, str)]

    reasoning = data.get("reasoning", "")
    return ClassificationResult(
        topics=_string_list(data.get("assigned_topics")),
        suggested_new_topics=_string_list(data.get("new_topic_suggestions")),
        reasoning=reasoning if isinstance(reasoning, str) else "",
    )
def _parse_suggestions(raw: str) -> list[str]:
    """Extract the ``suggested_topics`` list from a model reply.

    Returns an empty list when the fence-stripped reply contains no decodable
    JSON object or the field is missing. The result is always a list of
    strings: a bare string becomes a one-element list, and null, other types
    and non-string items are dropped.
    """
    raw = _strip_code_fences(raw)
    match = re.search(r"\{.*\}", raw, re.DOTALL)
    if match is None:
        return []
    try:
        data = json.loads(match.group())
    except json.JSONDecodeError:
        return []
    suggestions = data.get("suggested_topics")
    if isinstance(suggestions, str):
        return [suggestions]
    if not isinstance(suggestions, list):
        return []
    return [item for item in suggestions if isinstance(item, str)]
View File
+101
View File
@@ -0,0 +1,101 @@
from datetime import datetime, timezone
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Query
from services import storage, extractor, classifier
# Router for all document endpoints, mounted under /api/documents.
router = APIRouter(prefix="/api/documents", tags=["documents"])
# MIME types for PDF, Word, plain text/Markdown and common image formats.
# NOTE(review): nothing in the visible handlers consults this set - confirm
# whether upload validation against it was intended.
ALLOWED_MIME_TYPES = {
"application/pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/msword",
"text/plain",
"text/markdown",
"image/png",
"image/jpeg",
"image/jpg",
"image/tiff",
"image/webp",
}
@router.post("/upload")
async def upload_document(
    file: UploadFile = File(...),
    auto_classify: bool = Form(True),
):
    """Store an uploaded file, extract its text and optionally classify it.

    Responds with the document metadata. An empty upload is rejected with
    HTTP 400. A classification failure does not fail the request; the error
    text is returned under ``classification_error`` instead.
    """
    payload = await file.read()
    if not payload:
        raise HTTPException(400, "Empty file")

    original_name = file.filename or "upload"
    mime = file.content_type or "application/octet-stream"
    stored = storage.save_upload(payload, original_name, mime)

    meta = {
        "id": stored["id"],
        "original_name": original_name,
        "filename": stored["filename"],
        "mime_type": mime,
        "size_bytes": len(payload),
        "extracted_text": extractor.extract_text(stored["path"], mime),
        "topics": [],
        "created_at": datetime.now(timezone.utc).isoformat(),
        "classified_at": None,
    }
    storage.save_metadata(meta)

    if not auto_classify:
        return meta

    try:
        meta["topics"] = await classifier.classify_document(stored["id"])
        meta["classified_at"] = datetime.now(timezone.utc).isoformat()
    except Exception as e:
        # Classification failure is non-fatal; document is still saved
        meta["classification_error"] = str(e)
    return meta
@router.get("")
async def list_documents(
    topic: str | None = Query(None),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Return one page of document metadata, optionally filtered by topic.

    ``total`` is the number of matching documents before pagination.
    """
    matching = storage.list_metadata(topic=topic)
    offset = (page - 1) * per_page
    page_items = matching[offset : offset + per_page]
    return {
        "items": page_items,
        "total": len(matching),
        "page": page,
        "per_page": per_page,
    }
@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Return the stored metadata of one document, or HTTP 404 if unknown."""
    found = storage.get_metadata(doc_id)
    if found is not None:
        return found
    raise HTTPException(404, "Document not found")
@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete a document; responds with HTTP 404 when it does not exist."""
    if storage.delete_document(doc_id):
        return {"success": True}
    raise HTTPException(404, "Document not found")
@router.post("/{doc_id}/classify")
async def classify_document(doc_id: str, body: dict | None = None):
    """(Re-)classify an existing document.

    The optional JSON body may carry ``{"topics": [...]}`` to restrict the
    classification to those topic names; without it all known topics are
    used.

    Raises:
        HTTPException: 404 when the document is unknown, 400 when ``topics``
            is not a list of strings, 500 when classification fails.
    """
    # `None` replaces the former mutable `{}` default argument.
    meta = storage.get_metadata(doc_id)
    if meta is None:
        raise HTTPException(404, "Document not found")

    topic_names = body.get("topics") if body else None
    # The body is arbitrary client JSON; a bare string here would otherwise be
    # iterated character by character further down.
    if topic_names is not None and not (
        isinstance(topic_names, list) and all(isinstance(t, str) for t in topic_names)
    ):
        raise HTTPException(400, "'topics' must be a list of strings")

    try:
        topics = await classifier.classify_document(doc_id, topic_names)
    except Exception as e:
        raise HTTPException(500, f"Classification failed: {e}") from e
    return {"topics": topics}
+84
View File
@@ -0,0 +1,84 @@
import time
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from services import storage
from config import DEFAULT_SYSTEM_PROMPT
from ai import get_provider
# Router for all settings endpoints, mounted under /api/settings.
router = APIRouter(prefix="/api/settings", tags=["settings"])
class SettingsPatch(BaseModel):
    """Partial settings update; fields left as None are not changed."""

    system_prompt: str | None = None
    active_provider: str | None = None
    # Mapping of provider name to the config keys to merge for that provider.
    providers: dict | None = None
class TestProviderRequest(BaseModel):
    """Request body naming the provider whose connection should be tested."""

    provider: str
@router.get("")
async def get_settings():
    """Return the current settings with API keys masked."""
    return storage.settings_masked(storage.load_settings())
@router.patch("")
async def patch_settings(body: SettingsPatch):
    """Merge the supplied fields into the stored settings and persist them.

    Provider configs are merged key by key. An ``api_key`` value containing
    ``****`` is treated as the masked echo of the stored key and is skipped.
    Responds with the updated settings, API keys masked.

    Raises:
        HTTPException: 400 for an unknown ``active_provider`` or when a
            provider's config is not a JSON object.
    """
    settings = storage.load_settings()

    if body.system_prompt is not None:
        settings["system_prompt"] = body.system_prompt

    if body.active_provider is not None:
        valid = {"anthropic", "openai", "ollama", "lmstudio"}
        if body.active_provider not in valid:
            raise HTTPException(400, f"Invalid provider. Must be one of: {valid}")
        settings["active_provider"] = body.active_provider

    if body.providers is not None:
        stored_providers = settings.setdefault("providers", {})
        # Deep merge per-provider config
        for prov_name, prov_cfg in body.providers.items():
            # `providers` is an untyped dict, so a client may send a scalar or
            # list here; reject it instead of failing with AttributeError.
            # Nothing has been saved yet, so raising leaves storage untouched.
            if not isinstance(prov_cfg, dict):
                raise HTTPException(
                    400, f"Invalid config for provider '{prov_name}': expected an object"
                )
            existing = stored_providers.setdefault(prov_name, {})
            for key, val in prov_cfg.items():
                # Don't overwrite api_key if it comes in masked (contains ****)
                if key == "api_key" and val and "****" in str(val):
                    continue
                existing[key] = val

    storage.save_settings(settings)
    return storage.settings_masked(settings)
@router.post("/test-provider")
async def test_provider(body: TestProviderRequest):
    """Run a health check against the named provider without saving anything.

    Responds with ``ok``, a human-readable ``message`` and the measured
    ``latency_ms`` (0 when the health check itself raised).
    """
    # Work on a shallow copy so the stored active provider is left untouched.
    candidate_settings = {**storage.load_settings(), "active_provider": body.provider}
    try:
        provider = get_provider(candidate_settings)
    except ValueError as e:
        raise HTTPException(400, str(e))

    started = time.monotonic()
    try:
        ok = await provider.health_check()
    except Exception as e:
        return {"ok": False, "message": str(e), "latency_ms": 0}
    elapsed_ms = int((time.monotonic() - started) * 1000)

    message = "Connection successful" if ok else "Health check failed"
    return {"ok": ok, "message": message, "latency_ms": elapsed_ms}
@router.get("/default-prompt")
async def get_default_prompt():
    """Return the built-in default system prompt."""
    payload = {"system_prompt": DEFAULT_SYSTEM_PROMPT}
    return payload
+72
View File
@@ -0,0 +1,72 @@
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from services import storage, classifier
# Router for all topic endpoints, mounted under /api/topics.
router = APIRouter(prefix="/api/topics", tags=["topics"])
class TopicCreate(BaseModel):
    """Request body for creating a topic."""

    name: str
    description: str = ""
    # Hex colour string; defaults to the same value storage.create_topic uses.
    color: str = "#6366f1"
class TopicUpdate(BaseModel):
    """Partial topic update; fields left as None are not changed."""

    name: str | None = None
    description: str | None = None
    color: str | None = None
class SuggestRequest(BaseModel):
    """Request body naming the document to suggest topics for."""

    document_id: str
@router.get("")
async def list_topics():
    """Return every topic, each annotated with its ``doc_count``."""
    topics = storage.load_topics()
    per_topic = storage.topic_doc_counts()
    for entry in topics:
        entry.update(doc_count=per_topic.get(entry["name"], 0))
    return {"topics": topics}
@router.post("")
async def create_topic(body: TopicCreate):
    """Create a topic and return it with its ``doc_count``.

    The name is stripped of surrounding whitespace. ``storage.create_topic``
    returns the existing entry when a topic with the same name
    (case-insensitive) already exists, so the count is looked up rather than
    assumed to be zero.

    Raises:
        HTTPException: 400 when the name is empty or whitespace only.
    """
    name = body.name.strip()
    if not name:
        raise HTTPException(400, "Topic name must not be empty")
    topic = storage.create_topic(name, body.description, body.color)
    topic["doc_count"] = storage.topic_doc_counts().get(topic["name"], 0)
    return topic
@router.patch("/{topic_id}")
async def update_topic(topic_id: str, body: TopicUpdate):
    """Apply the supplied fields to a topic and return it with ``doc_count``.

    Responds with HTTP 404 when the topic id is unknown.
    """
    updated = storage.update_topic(
        topic_id,
        name=body.name,
        description=body.description,
        color=body.color,
    )
    if updated is None:
        raise HTTPException(404, "Topic not found")
    per_topic = storage.topic_doc_counts()
    updated["doc_count"] = per_topic.get(updated["name"], 0)
    return updated
@router.delete("/{topic_id}")
async def delete_topic(topic_id: str):
    """Delete a topic; responds with HTTP 404 when the id is unknown."""
    removed_name = storage.delete_topic(topic_id)
    if removed_name is not None:
        return {"success": True, "removed_from_documents": True}
    raise HTTPException(404, "Topic not found")
@router.post("/suggest")
async def suggest_topics(body: SuggestRequest):
    """Return AI-suggested topic names for a stored document.

    Responds with HTTP 404 for an unknown document and HTTP 500 when the
    suggestion call fails.
    """
    if storage.get_metadata(body.document_id) is None:
        raise HTTPException(404, "Document not found")
    try:
        proposed = await classifier.suggest_topics_for_document(body.document_id)
    except Exception as e:
        raise HTTPException(500, f"Suggestion failed: {e}")
    return {"suggested": proposed}
+51
View File
@@ -0,0 +1,51 @@
import json
import os
from pathlib import Path
# Root of all persisted data; overridable through the DATA_DIR environment variable.
DATA_DIR = Path(os.environ.get("DATA_DIR", "/app/data"))
# Locations of the upload directory, metadata directory and the two JSON files.
UPLOADS_DIR = DATA_DIR / "uploads"
METADATA_DIR = DATA_DIR / "metadata"
TOPICS_FILE = DATA_DIR / "topics.json"
SETTINGS_FILE = DATA_DIR / "settings.json"
# Default system prompt; it asks the model for a JSON object with the keys
# "assigned_topics" and "new_topic_suggestions".
DEFAULT_SYSTEM_PROMPT = """You are a document classification assistant. When given a document's text content and a list of existing topics, you must:
1. Assign the document to one or more relevant topics from the list.
2. If no existing topics fit well, suggest new topic names.
Return ONLY valid JSON in this exact format, with no additional text or explanation:
{"assigned_topics": ["topic1"], "new_topic_suggestions": ["new topic name"]}
If the document fits no topics and you have no suggestions, return: {"assigned_topics": [], "new_topic_suggestions": []}"""
# Settings written to SETTINGS_FILE when it does not exist yet.
DEFAULT_SETTINGS = {
"system_prompt": DEFAULT_SYSTEM_PROMPT,
"active_provider": "lmstudio",
"providers": {
"anthropic": {
"api_key": "",
"model": "claude-sonnet-4-6"
},
"openai": {
"api_key": "",
"model": "gpt-4o",
"base_url": None
},
"ollama": {
"base_url": "http://host.docker.internal:11434",
"model": "llama3.2"
},
"lmstudio": {
"base_url": "http://host.docker.internal:1234",
"model": "gemma-4-e4b-it"
}
}
}
def ensure_data_dirs():
    """Create the data directories and seed the JSON files when missing.

    Existing directories and files are left untouched.
    """
    for directory in (UPLOADS_DIR, METADATA_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    seed_files = (
        (TOPICS_FILE, {"topics": []}),
        (SETTINGS_FILE, DEFAULT_SETTINGS),
    )
    for target, initial_content in seed_files:
        if not target.exists():
            target.write_text(json.dumps(initial_content, indent=2))
@@ -0,0 +1,14 @@
{
"id": "69eb8545-2e19-4651-903e-6489dbd9f687",
"original_name": "1907-Rechnung.pdf",
"filename": "69eb8545-2e19-4651-903e-6489dbd9f687.pdf",
"mime_type": "application/pdf",
"size_bytes": 38090,
"extracted_text": "mobilcom-debitel GmbH · Geschäftsführung: Ingo Arnold, Antonius Fromme, Rickmann von Platen \nHRB 14826 KI, Amtsgericht Kiel · Vorsitzender des Aufsichtsrats: Stephan Esch · Sitz der Gesellschaft: Büdelsdorf\nBankverbindung: Commerzbank Rendsburg · IBAN DE08214400450844443200 · BIC COBADEFFXXX\nUSt-ID: DE 194 910 634 · Gläubiger-ID: DE43ZZZ00000074855\nHaben Sie Fragen zur Rechnung?\nwww.md.de/faq\nmobilcom-debitel Kundenservice\nHandykurzwahl: 22240\nDer Anruf erfolgt zu einer ortsgebundenen Rufnummer\nTelefon: 040/55 55 41 00 0\nmobilcom-debitel Kundenservice Technik\nTelefon: 0900/10 22 24 0\n€ 2,49/Anruf, nur aus dem dt. Festnetz erreichbar\nwww.md.de\nHerrn\nDominik Ritter\nLeibnizstr. 41\n10629 Berlin\nRechnungsdatum:\nRechnungsnr.:\nKundennummer:\n31.07.2019\nM19046649250\n33040574\nPost: mobilcom-debitel GmbH · 99076 Erfurt\nIhre mobilcom-debitel Rechnung\nRechnungsbetrag netto\n55,4645 €\nUSt.-Betrag (19%)\n10,54 €\nRechnungsbetrag gesamt\n66,00 €\nDie Begleichung der Rechnung erfolgt am 07.08.2019 im Lastschriftverfahren mit der Mandatsreferenz-Nummer\nMC-33040574-00000001 von dem Konto: IBAN DE38100208900615356026.\nKennen Sie schon waipu.tv? 
Das ist Fernsehen wie noch nie: auf Smartphone, Tablet oder Ihrem TV.\nJetzt kostenlos testen: md.de/tv/waipu-tv.\nMobilfunk-Vertragsabrechnungen\nMobilfunk-Rufnummer: 0170 / 4322717\nVertragsnummer:\n217582256\nTeilnehmer: Dominik Ritter\nTarif:\nreal Allnet mit Smartphone 10\nMobilfunknetz: Telekom Mobilfunk\nDie Leistungen im Überblick\nMenge Details\nZeitraum/Datum\nSumme\nBasisleistungen\n1 Grundgebühr\n01.08.2019 - 31.08.2019\n31,0840 €\n1 freenet Hotspot Flat (DLS24M0TB0G0000):\nUnbegrenztes Datenvolumen im größten WLAN-Netzwerk\n01.08.2019 - 31.08.2019\n0,0000 €\n1 T@ke-away Flat Upgrade (+2 GB) - 6M (anteilig)\n03.07.2019 - 31.07.2019\n11,7839 €\n1 T@ke-away Flat Upgrade (+2 GB) - 6M\n01.08.2019 - 31.08.2019\n12,5966 €\n1 Kaspersky Passwort Manager 1 Monat (DLS1M1TB1G0299)\n(anteilig):\nEin Passwort für mehrere Konten!\n03.07.2019 - 31.07.2019\n2,3505 €\n1 Kaspersky Passwort Manager 1 Monat (DLS1M1TB1G0299)\n(anteilig)\n01.08.2019 - 02.08.2019\n0,1621 €\n1 Gutschrift Kaspersky Passwort Manager\n(DLS1M1TB1G0299) (anteilig)\n03.07.2019 - 31.07.2019\n-2,3505 €\n1 Gutschrift Kaspersky Passwort Manager\n(DLS1M1TB1G0299) (anteilig)\n01.08.2019 - 02.08.2019\n-0,1621 €\n1 Smartphone-Option\n01.08.2019 - 31.08.2019\n8,4034 €\nVerbindungen\n3 Verbindungen ins dt. Festnetz (FN)\n01.07.2019 - 03.07.2019\n0,0000 €\n39 Netzexterne Verbindungen (NX)\n28.06.2019 - 30.07.2019\n0,0000 €\n1 Abgehende Roaming Verbindungen (RA)\n17.07.2019 - 17.07.2019\n0,0000 €\n202 Datenverbindungen (DATA)\n27.06.2019 - 30.07.2019\n0,0000 €\n120 Roaming Datenverbindungen (RD)\n14.07.2019 - 20.07.2019\n0,0000 €\nZwischensumme netto\n63,8679 €\nIhre mobilcom-debitel Vorteile\n1 24 x 10 Euro Grundgebührrabatt\n01.08.2019 - 31.08.2019\n-8,4034 €\nNettobetrag für Rufnummer 0170 / 4322717\n55,4645 €\nSofern Sie die Löschung Ihrer Verbindungsdaten sofort, 90 oder 180 Tage nach Rechnungsstellung gewünscht haben, entfällt\nmit der Löschung unsere Nachweispflicht für diese Daten. 
Erfolgt innerhalb von 8 Wochen nach Erhalt der Rechnung kein\nschriftlicher Widerspruch, gilt die Rechnung als genehmigt. Begründete Einwendungen können auch gegen einzelne in der\nRechnung dargestellte Forderungen erhoben werden. Verzug tritt spätestens 30 Tage nach Zugang der Rechnung ein. Dies\nschließt einen frühzeitigeren Verzug nicht aus. Hinweise zum Ablauf eines Anbieterwechsels finden Sie auf der Internetseite\nder Bundesnetzagentur.\nRechnungserklärung\nSeite 1 von 2\n\nmobilcom-debitel GmbH · Geschäftsführung: Ingo Arnold, Antonius Fromme, Rickmann von Platen \nHRB 14826 KI, Amtsgericht Kiel · Vorsitzender des Aufsichtsrats: Stephan Esch · Sitz der Gesellschaft: Büdelsdorf\nBankverbindung: Commerzbank Rendsburg · IBAN DE08214400450844443200 · BIC COBADEFFXXX\nUSt-ID: DE 194 910 634 · Gläubiger-ID: DE43ZZZ00000074855\nRechnungsdatum:\nRechnungsnr.:\nKundennummer:\n31.07.2019\nM19046649250\n33040574\nIhre mobilcom-debitel Rechnung\nInformationen gemäß Telekommunikations-Transparenzverordnung\nMobilfunk-Rufnummer: 0170 / 4322717\nZeitraum Datenverbrauch:\n01.06.2019 - 30.06.2019\nVertragsbeginn:\n20.12.2016 Kündigungsfrist:\n3 Monat(e) Summe vereinbartes Datenvolumen:\n8000 MB\nMindestlaufzeit bis:\n19.12.2020 Kündigungseingang bis:\n19.09.2020 Verbrauchtes Datenvolumen:\n8080 MB\nSeite 2 von 2",
"topics": [
"Telecommunications",
"Billing and Invoicing"
],
"created_at": "2026-04-16T11:08:33.558670+00:00",
"classified_at": "2026-04-16T11:08:40.831347+00:00"
}
File diff suppressed because one or more lines are too long
@@ -0,0 +1,13 @@
{
"id": "cf4dd4cf-dcfb-42f1-957d-bcdba640163b",
"original_name": "invoice.txt",
"filename": "cf4dd4cf-dcfb-42f1-957d-bcdba640163b.txt",
"mime_type": "text/plain",
"size_bytes": 108,
"extracted_text": "This is an invoice for professional consulting services rendered in April 2026. Total amount due: 5000 EUR.",
"topics": [
"Invoice"
],
"created_at": "2026-04-16T11:06:08.026326+00:00",
"classified_at": "2026-04-16T11:06:09.636422+00:00"
}
@@ -0,0 +1,11 @@
{
"id": "e71d8a85-09a1-4cd8-b602-65aa9216a724",
"original_name": "test_doc.txt",
"filename": "e71d8a85-09a1-4cd8-b602-65aa9216a724.txt",
"mime_type": "text/plain",
"size_bytes": 57,
"extracted_text": "This document is about accounting and financial reports.",
"topics": [],
"created_at": "2026-04-16T11:05:24.317425+00:00",
"classified_at": null
}
+23
View File
@@ -0,0 +1,23 @@
{
"system_prompt": "You are a document classification assistant. When given a document's text content and a list of existing topics, you must:\n1. Assign the document to one or more relevant topics from the list.\n2. If no existing topics fit well, suggest new topic names.\nReturn ONLY valid JSON in this exact format, with no additional text or explanation:\n{\"assigned_topics\": [\"topic1\"], \"new_topic_suggestions\": [\"new topic name\"]}\nIf the document fits no topics and you have no suggestions, return: {\"assigned_topics\": [], \"new_topic_suggestions\": []}",
"active_provider": "lmstudio",
"providers": {
"anthropic": {
"api_key": "",
"model": "claude-sonnet-4-6"
},
"openai": {
"api_key": "",
"model": "gpt-4o",
"base_url": null
},
"ollama": {
"base_url": "http://host.docker.internal:11434",
"model": "llama3.2"
},
"lmstudio": {
"base_url": "http://host.docker.internal:1234",
"model": "gemma-4-e4b-it"
}
}
}
+22
View File
@@ -0,0 +1,22 @@
{
"topics": [
{
"id": "39ffdadb",
"name": "Test Topic",
"description": "",
"color": "#6366f1"
},
{
"id": "d2e0fbd8",
"name": "Telecommunications",
"description": "",
"color": "#6366f1"
},
{
"id": "d3823fd0",
"name": "Billing and Invoicing",
"description": "",
"color": "#6366f1"
}
]
}
File diff suppressed because one or more lines are too long
@@ -0,0 +1 @@
This is an invoice for professional consulting services rendered in April 2026. Total amount due: 5000 EUR.
@@ -0,0 +1 @@
This document is about accounting and financial reports.
+33
View File
@@ -0,0 +1,33 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from config import ensure_data_dirs
from api.documents import router as documents_router
from api.topics import router as topics_router
from api.settings import router as settings_router
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: prepare the data directory before serving."""
    ensure_data_dirs()
    # Nothing to clean up after the yield.
    yield
# The ASGI application; `lifespan` runs ensure_data_dirs() on startup.
app = FastAPI(title="Document Scanner API", version="1.0.0", lifespan=lifespan)
# CORS is wide open: any origin, method and header is allowed.
# NOTE(review): acceptable for a local single-user tool; restrict
# allow_origins before exposing this API more widely.
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/health")
async def health():
    """Liveness endpoint; always answers with a fixed "ok" status."""
    status_payload = {"status": "ok"}
    return status_payload
# Mount the document, topic and settings routers on the application.
app.include_router(documents_router)
app.include_router(topics_router)
app.include_router(settings_router)
+3
View File
@@ -0,0 +1,3 @@
[pytest]
asyncio_mode = auto
testpaths = tests
+15
View File
@@ -0,0 +1,15 @@
fastapi>=0.111
uvicorn[standard]>=0.29
python-multipart
pydantic-settings>=2.2
anthropic>=0.26
openai>=1.30
PyMuPDF>=1.24
python-docx>=1.1
pytesseract>=0.3
Pillow>=10.3
filelock>=3.14
aiofiles>=23.2
httpx>=0.27
pytest>=8.2
pytest-asyncio>=0.23
View File
+59
View File
@@ -0,0 +1,59 @@
"""
Classification orchestrator.
Loads settings, selects AI provider, classifies document, auto-creates suggested topics.
"""
from services import storage
from ai import get_provider
# Upper bound on how many characters of extracted text are passed to the provider.
MAX_AI_CHARS = 8_000
async def classify_document(doc_id: str, topic_names: list[str] | None = None) -> list[str]:
    """Classify a stored document and persist the resulting topics.

    Args:
        doc_id: Id of the document to classify.
        topic_names: Topic names offered to the model; all registered topics
            are used when omitted.

    Returns:
        The topic names stored on the document: everything the model
        assigned or suggested, in the order returned, stripped of surrounding
        whitespace, de-duplicated case-insensitively and spelled like the
        matching registry/offered topic. Topics not yet known are created.

    Raises:
        ValueError: If the document does not exist.
    """
    meta = storage.get_metadata(doc_id)
    if meta is None:
        raise ValueError(f"Document {doc_id} not found")

    settings = storage.load_settings()
    system_prompt = settings.get("system_prompt", "")
    provider = get_provider(settings)

    # Use all known topics if not specified
    if topic_names is None:
        topic_names = [t["name"] for t in storage.load_topics()]

    text = meta.get("extracted_text", "")
    result = await provider.classify(text[:MAX_AI_CHARS], topic_names, system_prompt)

    # Lower-cased name -> canonical spelling. Document topics must match the
    # registry spelling exactly because doc counts and topic filters compare
    # names verbatim.
    canonical = {name.lower(): name for name in topic_names}
    final_topics: list[str] = []
    for raw_name in [*result.topics, *result.suggested_new_topics]:
        name = raw_name.strip()
        if not name:
            continue
        key = name.lower()
        if key not in canonical:
            # create_topic returns the existing entry on a case-insensitive
            # match, so its name is the canonical spelling either way.
            canonical[key] = storage.create_topic(name)["name"]
        resolved = canonical[key]
        if resolved not in final_topics:
            final_topics.append(resolved)

    storage.update_document_topics(doc_id, final_topics)
    return final_topics
async def suggest_topics_for_document(doc_id: str) -> list[str]:
    """Ask the active provider for topic suggestions for a stored document.

    The document itself is not modified.

    Raises:
        ValueError: If the document does not exist.
    """
    document = storage.get_metadata(doc_id)
    if document is None:
        raise ValueError(f"Document {doc_id} not found")

    settings = storage.load_settings()
    prompt = settings.get("system_prompt", "")
    provider = get_provider(settings)

    excerpt = document.get("extracted_text", "")[:MAX_AI_CHARS]
    return await provider.suggest_topics(excerpt, prompt)
+71
View File
@@ -0,0 +1,71 @@
"""
Text extraction dispatcher.
Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
"""
from pathlib import Path
# Extracted text longer than this many characters is cut by _truncate().
MAX_STORED_CHARS = 50_000
def extract_text(file_path: str, mime_type: str) -> str:
    """Extract text from a stored file, choosing the extractor by type.

    PDF and Word files are recognised by MIME type or file suffix, images by
    an ``image/`` MIME type; everything else is read as a text file. Any
    exception is turned into an ``[Extraction error: ...]`` placeholder
    string instead of being raised.
    """
    path = Path(file_path)
    word_mimes = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )
    try:
        if mime_type == "application/pdf" or path.suffix.lower() == ".pdf":
            return _extract_pdf(path)
        if mime_type in word_mimes or path.suffix.lower() in (".docx", ".doc"):
            return _extract_docx(path)
        if mime_type and mime_type.startswith("image/"):
            return _extract_image(path)
        return _extract_text_file(path)
    except Exception as e:
        return f"[Extraction error: {e}]"
def _extract_pdf(path: Path) -> str:
    """Return the text of every page of a PDF, joined by newlines and truncated."""
    import fitz  # PyMuPDF

    # The context manager closes the document even when a page fails to
    # render its text, which the former explicit close() did not.
    with fitz.open(str(path)) as doc:
        pages = [page.get_text() for page in doc]
    return _truncate("\n".join(pages))
def _extract_docx(path: Path) -> str:
    """Return the non-blank paragraphs of a Word document, newline-joined and truncated."""
    from docx import Document

    document = Document(str(path))
    kept = []
    for paragraph in document.paragraphs:
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return _truncate("\n".join(kept))
def _extract_image(path: Path) -> str:
    """OCR an image with pytesseract and return the truncated text.

    Returns a bracketed placeholder string instead of raising when the OCR
    libraries are missing or OCR fails.
    """
    try:
        from PIL import Image
        import pytesseract

        # Use the image as a context manager so its file handle is released
        # as soon as OCR is done.
        with Image.open(str(path)) as img:
            text = pytesseract.image_to_string(img)
        return _truncate(text)
    except ImportError:
        return "[OCR unavailable: pytesseract or Pillow not installed]"
    except Exception as e:
        return f"[OCR error: {e}]"
def _extract_text_file(path: Path) -> str:
    """Read a text file, trying UTF-8, then cp1252, then latin-1.

    cp1252 is tried before latin-1 because latin-1 accepts every byte
    sequence: placed first it made the cp1252 attempt unreachable and turned
    cp1252 punctuation such as the euro sign into control characters.
    """
    for enc in ("utf-8", "cp1252", "latin-1"):
        try:
            return _truncate(path.read_text(encoding=enc))
        except UnicodeDecodeError:
            continue
    # Defensive only: latin-1 cannot raise UnicodeDecodeError.
    return "[Could not decode text file]"
def _truncate(text: str) -> str:
    """Strip *text* and cap it at ``MAX_STORED_CHARS`` characters.

    A truncation marker line is appended when text was cut.
    """
    stripped = text.strip()
    if len(stripped) <= MAX_STORED_CHARS:
        return stripped
    return stripped[:MAX_STORED_CHARS] + "\n[... truncated ...]"
+187
View File
@@ -0,0 +1,187 @@
import json
import uuid
import shutil
from datetime import datetime, timezone
from pathlib import Path
from filelock import FileLock
from config import UPLOADS_DIR, METADATA_DIR, TOPICS_FILE, SETTINGS_FILE, DEFAULT_SETTINGS
# ── File locks ────────────────────────────────────────────────────────────────
# Module-wide locks guarding reads and writes of topics.json and settings.json.
_topics_lock = FileLock(str(TOPICS_FILE) + ".lock")
_settings_lock = FileLock(str(SETTINGS_FILE) + ".lock")
# ── Documents ─────────────────────────────────────────────────────────────────
def save_upload(file_bytes: bytes, original_name: str, mime_type: str) -> dict:
    """Write an uploaded file to the uploads directory under a fresh UUID.

    Only the lower-cased suffix of *original_name* is kept in the stored
    file name. Returns the new id, the stored file name and its full path.
    """
    doc_id = str(uuid.uuid4())
    stored_name = doc_id + Path(original_name).suffix.lower()
    target = UPLOADS_DIR / stored_name
    target.write_bytes(file_bytes)
    return {"id": doc_id, "filename": stored_name, "path": str(target)}
def save_metadata(meta: dict) -> None:
    """Write *meta* as JSON to ``<meta["id"]>.json`` under a per-file lock."""
    target = METADATA_DIR / f"{meta['id']}.json"
    with FileLock(str(target) + ".lock"):
        serialized = json.dumps(meta, indent=2, ensure_ascii=False)
        target.write_text(serialized)
def get_metadata(doc_id: str) -> dict | None:
    """Load the metadata of one document, or return None when it is unknown.

    Ids containing path components (for example ``../settings``) are treated
    as unknown so that a client-supplied id cannot address files outside the
    metadata directory.
    """
    # A legitimate id is a single path component.
    if Path(doc_id).name != doc_id:
        return None
    path = METADATA_DIR / f"{doc_id}.json"
    if not path.exists():
        return None
    return json.loads(path.read_text())
def list_metadata(topic: str | None = None) -> list[dict]:
    """Return all document metadata, most recently modified file first.

    Files that cannot be read or parsed are skipped. When *topic* is given,
    only documents carrying that topic are returned.
    """
    newest_first = sorted(
        METADATA_DIR.glob("*.json"),
        key=lambda meta_file: meta_file.stat().st_mtime,
        reverse=True,
    )
    results = []
    for meta_file in newest_first:
        try:
            meta = json.loads(meta_file.read_text())
        except Exception:
            continue
        if not topic or topic in meta.get("topics", []):
            results.append(meta)
    return results
def delete_document(doc_id: str) -> bool:
    """Delete a document's upload, metadata and metadata lock file.

    Returns False when no metadata exists for *doc_id*, True otherwise.
    """
    meta_path = METADATA_DIR / f"{doc_id}.json"
    if not meta_path.exists():
        return False

    meta = json.loads(meta_path.read_text())
    filename = meta.get("filename")
    # An empty or missing filename would resolve to UPLOADS_DIR itself, and
    # unlink() on a directory raises; only ever remove a regular file.
    if filename:
        upload_path = UPLOADS_DIR / filename
        if upload_path.is_file():
            upload_path.unlink()

    meta_path.unlink()
    Path(str(meta_path) + ".lock").unlink(missing_ok=True)
    return True
def update_document_topics(doc_id: str, topics: list[str]) -> dict | None:
    """Replace a document's topics and stamp ``classified_at`` with the current UTC time.

    Returns the updated metadata, or None when the document is unknown.
    """
    meta = get_metadata(doc_id)
    if meta is not None:
        meta["topics"] = topics
        meta["classified_at"] = datetime.now(timezone.utc).isoformat()
        save_metadata(meta)
    return meta
def remove_topic_from_all_documents(topic_name: str) -> int:
    """Strip *topic_name* from every document that carries it.

    Unreadable metadata files are skipped. Returns the number of documents
    that were rewritten.
    """
    updated = 0
    for meta_file in METADATA_DIR.glob("*.json"):
        try:
            meta = json.loads(meta_file.read_text())
        except Exception:
            continue
        current = meta.get("topics", [])
        if topic_name not in current:
            continue
        meta["topics"] = [name for name in current if name != topic_name]
        with FileLock(str(meta_file) + ".lock"):
            meta_file.write_text(json.dumps(meta, indent=2, ensure_ascii=False))
        updated += 1
    return updated
# ── Topics ────────────────────────────────────────────────────────────────────
def load_topics() -> list[dict]:
    """Read the topic registry file under the topics lock."""
    with _topics_lock:
        payload = json.loads(TOPICS_FILE.read_text())
    return payload.get("topics", [])
def save_topics(topics: list[dict]) -> None:
    """Write the topic registry file under the topics lock."""
    with _topics_lock:
        serialized = json.dumps({"topics": topics}, indent=2)
        TOPICS_FILE.write_text(serialized)
def get_topic(topic_id: str) -> dict | None:
    """Return the first topic whose id equals *topic_id*, or None."""
    for candidate in load_topics():
        if candidate["id"] == topic_id:
            return candidate
    return None
def create_topic(name: str, description: str = "", color: str = "#6366f1") -> dict:
    """Create a topic, or return the existing one with the same name.

    Names are compared case-insensitively. A new topic gets the first eight
    characters of a random UUID as its id.
    """
    topics = load_topics()
    wanted = name.lower()
    # Deduplicate by name (case-insensitive)
    duplicate = next((t for t in topics if t["name"].lower() == wanted), None)
    if duplicate is not None:
        return duplicate

    created = {
        "id": str(uuid.uuid4())[:8],
        "name": name,
        "description": description,
        "color": color,
    }
    topics.append(created)
    save_topics(topics)
    return created
def update_topic(topic_id: str, **kwargs) -> dict | None:
    """Update fields of the topic with the given id.

    Args:
        topic_id: Value of the topic's ``"id"`` field.
        **kwargs: Field names and new values; entries whose value is ``None``
            are ignored.

    Returns:
        The updated topic dict, or ``None`` when no topic has ``topic_id``.
    """
    changes = {k: v for k, v in kwargs.items() if v is not None}
    # Hold the lock across the whole read-modify-write so a concurrent
    # create/update/delete cannot be overwritten by our stale snapshot.
    # load_topics()/save_topics() re-acquire the same (re-entrant) lock.
    with _topics_lock:
        topics = load_topics()
        for t in topics:
            if t["id"] == topic_id:
                # NOTE(review): documents reference topics by name (see
                # remove_topic_from_all_documents); a rename here does not
                # rewrite them — confirm that is intended.
                t.update(changes)
                save_topics(topics)
                return t
    return None
def delete_topic(topic_id: str) -> str | None:
    """Delete a topic and strip its name from every document.

    Args:
        topic_id: Value of the topic's ``"id"`` field.

    Returns:
        The deleted topic's name, or ``None`` when no topic has ``topic_id``.
    """
    # Hold the lock across the read-modify-write of the topics file so a
    # concurrent create/update is not lost. load_topics()/save_topics()
    # re-acquire the same (re-entrant) lock.
    with _topics_lock:
        topics = load_topics()
        topic = next((t for t in topics if t["id"] == topic_id), None)
        if not topic:
            return None
        name = topic["name"]
        save_topics([t for t in topics if t["id"] != topic_id])
    # Document metadata uses its own per-file locks; do this outside the
    # topics lock to keep the critical section short.
    remove_topic_from_all_documents(name)
    return name
def topic_doc_counts() -> dict[str, int]:
    """Count how many times each topic name appears across document metadata.

    Every ``*.json`` file in ``METADATA_DIR`` is read; files that fail to load
    are skipped.

    Returns:
        Mapping of topic name to number of occurrences.
    """
    tally: dict[str, int] = {}
    for meta_file in METADATA_DIR.glob("*.json"):
        try:
            record = json.loads(meta_file.read_text())
        except Exception:
            continue
        for topic_name in record.get("topics", []):
            tally.setdefault(topic_name, 0)
            tally[topic_name] += 1
    return tally
# ── Settings ──────────────────────────────────────────────────────────────────
def load_settings() -> dict:
    """Read ``SETTINGS_FILE`` under ``_settings_lock`` and return the parsed JSON."""
    with _settings_lock:
        raw = SETTINGS_FILE.read_text()
    return json.loads(raw)
def save_settings(settings: dict) -> None:
    """Serialize ``settings`` as indented JSON into ``SETTINGS_FILE`` under ``_settings_lock``."""
    payload = json.dumps(settings, indent=2)
    with _settings_lock:
        SETTINGS_FILE.write_text(payload)
def mask_api_key(key: str) -> str:
    """Return a masked form of ``key`` that reveals at most its last four characters.

    Empty keys and keys of four characters or fewer become ``"****"``; longer
    keys become ``"****"`` followed by their last four characters.
    """
    prefix = "****"
    if key and len(key) > 4:
        return prefix + key[-4:]
    return prefix
def settings_masked(settings: dict) -> dict:
    """Return a deep copy of ``settings`` with provider API keys masked.

    Only the ``anthropic`` and ``openai`` entries under ``"providers"`` are
    considered, and only non-empty ``"api_key"`` values are replaced (via
    ``mask_api_key``). The input dict is not modified.
    """
    import copy

    masked = copy.deepcopy(settings)
    providers = masked.get("providers", {})
    for prov in ("anthropic", "openai"):
        entry = providers.get(prov, {})
        secret = entry.get("api_key", "")
        if secret:
            entry["api_key"] = mask_api_key(secret)
    return masked
View File
+70
View File
@@ -0,0 +1,70 @@
"""
pytest configuration: isolate each test with a temporary data directory.
"""
import os
import json
import pytest
import tempfile
import shutil
from pathlib import Path
from fastapi.testclient import TestClient
@pytest.fixture(autouse=True)
def isolated_data_dir(monkeypatch, tmp_path):
    """Give every test a fresh data directory under ``tmp_path``.

    Creates the ``uploads`` and ``metadata`` folders plus empty
    ``topics.json`` / default ``settings.json``, then points the ``DATA_DIR``
    environment variable, the ``config`` constants and the
    ``services.storage`` constants/locks at it. Yields the directory path.
    """
    data_dir = tmp_path / "data"
    uploads_dir = data_dir / "uploads"
    metadata_dir = data_dir / "metadata"
    topics_file = data_dir / "topics.json"
    settings_file = data_dir / "settings.json"

    for directory in (uploads_dir, metadata_dir):
        directory.mkdir(parents=True)
    topics_file.write_text(json.dumps({"topics": []}))

    from config import DEFAULT_SETTINGS
    settings_file.write_text(json.dumps(DEFAULT_SETTINGS))

    monkeypatch.setenv("DATA_DIR", str(data_dir))

    # Same attribute names are patched on both modules, in this order.
    path_overrides = {
        "UPLOADS_DIR": uploads_dir,
        "METADATA_DIR": metadata_dir,
        "TOPICS_FILE": topics_file,
        "SETTINGS_FILE": settings_file,
    }

    # Patch the module-level path constants so the running app sees the temp dir
    import config
    monkeypatch.setattr(config, "DATA_DIR", data_dir)
    for attr_name, new_path in path_overrides.items():
        monkeypatch.setattr(config, attr_name, new_path)

    import services.storage as st
    from filelock import FileLock
    for attr_name, new_path in path_overrides.items():
        monkeypatch.setattr(st, attr_name, new_path)
    # The locks are bound to file paths, so they must be rebuilt as well.
    monkeypatch.setattr(st, "_topics_lock", FileLock(str(topics_file) + ".lock"))
    monkeypatch.setattr(st, "_settings_lock", FileLock(str(settings_file) + ".lock"))

    yield data_dir
@pytest.fixture
def client(isolated_data_dir):
    """Yield a ``TestClient`` for the app, opened as a context manager.

    Depends on ``isolated_data_dir`` so the app is imported after the data
    paths have been redirected.
    """
    from main import app

    with TestClient(app) as test_client:
        yield test_client
@pytest.fixture
def sample_txt(tmp_path):
    """Write a one-sentence text file into ``tmp_path`` and return its path."""
    txt_path = tmp_path / "sample.txt"
    txt_path.write_text("This is a test document about invoices and finance.")
    return txt_path
@pytest.fixture
def sample_pdf(tmp_path):
    """Build a one-page PDF with a line of text in ``tmp_path`` and return its path."""
    import fitz

    target = tmp_path / "sample.pdf"
    pdf = fitz.open()
    first_page = pdf.new_page()
    first_page.insert_text((50, 50), "Test PDF document about contracts and legal matters.")
    pdf.save(str(target))
    pdf.close()
    return target
+110
View File
@@ -0,0 +1,110 @@
"""
Unit tests for AI provider JSON parsing robustness and classifier orchestration.
Uses a mock provider — no real AI calls made.
"""
import json
import pytest
from ai.openai_provider import _parse_classification, _parse_suggestions, _strip_code_fences
from ai.base import ClassificationResult
def test_parse_clean_json():
    """Plain JSON yields the assigned topics and an empty suggestion list."""
    payload = '{"assigned_topics": ["finance", "invoices"], "new_topic_suggestions": []}'
    parsed = _parse_classification(payload)
    assert parsed.topics == ["finance", "invoices"]
    assert parsed.suggested_new_topics == []
def test_parse_with_code_fence():
    """JSON wrapped in a ```json fence is still parsed."""
    payload = '```json\n{"assigned_topics": ["legal"], "new_topic_suggestions": ["contracts"]}\n```'
    parsed = _parse_classification(payload)
    assert parsed.topics == ["legal"]
    assert parsed.suggested_new_topics == ["contracts"]
def test_parse_with_preamble():
    """JSON surrounded by extra prose is still parsed."""
    payload = 'Here is the classification:\n{"assigned_topics": ["hr"], "new_topic_suggestions": []}\nDone.'
    parsed = _parse_classification(payload)
    assert parsed.topics == ["hr"]
def test_parse_malformed_returns_empty():
    """Text without any JSON produces empty topics and suggestions."""
    payload = "I cannot classify this document."
    parsed = _parse_classification(payload)
    assert parsed.topics == []
    assert parsed.suggested_new_topics == []
def test_strip_code_fences():
    """A fenced ``{}`` is reduced to the bare ``{}``."""
    fenced = "```json\n{}\n```"
    stripped = _strip_code_fences(fenced)
    assert stripped == "{}"
def test_parse_suggestions_clean():
    """Both suggested topic names are present in the parsed result."""
    payload = '{"suggested_topics": ["Human Resources", "Onboarding"]}'
    suggestions = _parse_suggestions(payload)
    for expected in ("Human Resources", "Onboarding"):
        assert expected in suggestions
def test_parse_suggestions_with_fence():
    """Suggestions inside a bare ``` fence are parsed."""
    payload = "```\n{\"suggested_topics\": [\"Finance\"]}\n```"
    suggestions = _parse_suggestions(payload)
    assert suggestions == ["Finance"]
def test_parse_suggestions_malformed():
    """Text without any JSON produces an empty suggestion list."""
    payload = "No suggestions available."
    suggestions = _parse_suggestions(payload)
    assert suggestions == []
@pytest.mark.asyncio
async def test_classifier_with_mock_provider(isolated_data_dir):
    """Test classifier orchestration with a mock provider."""
    from unittest.mock import AsyncMock, patch
    from ai.base import ClassificationResult
    import services.storage as st

    # Seed one stored document.
    doc_id = "test-doc-1"
    document = {
        "id": doc_id,
        "original_name": "test.txt",
        "filename": "test-doc-1.txt",
        "mime_type": "text/plain",
        "size_bytes": 50,
        "extracted_text": "Invoice for services rendered in March 2026.",
        "topics": [],
        "created_at": "2026-01-01T00:00:00Z",
        "classified_at": None,
    }
    st.save_metadata(document)

    # Seed the existing topics.
    for topic_name in ("Finance", "Legal"):
        st.create_topic(topic_name)

    # Canned provider answer: one existing topic plus one new suggestion.
    canned = ClassificationResult(
        topics=["Finance"],
        suggested_new_topics=["Invoices"],
        reasoning="Document is about financial invoicing.",
    )
    fake_provider = AsyncMock()
    fake_provider.classify = AsyncMock(return_value=canned)

    with patch("services.classifier.get_provider") as mock_get_provider:
        mock_get_provider.return_value = fake_provider
        from services.classifier import classify_document
        topics = await classify_document(doc_id)

    assert "Finance" in topics
    assert "Invoices" in topics

    # The suggested topic must have been created.
    stored_names = [t["name"] for t in st.load_topics()]
    assert "Invoices" in stored_names

    # The document metadata must carry the assigned topic.
    stored_doc = st.get_metadata(doc_id)
    assert "Finance" in stored_doc["topics"]
+107
View File
@@ -0,0 +1,107 @@
def test_upload_txt_no_classify(client, sample_txt):
    """Uploading a text file without classification stores it with extracted text and no topics."""
    with open(sample_txt, "rb") as f:
        resp = client.post(
            "/api/documents/upload",
            files={"file": ("sample.txt", f, "text/plain")},
            data={"auto_classify": "false"},
        )
    assert resp.status_code == 200
    data = resp.json()
    assert data["original_name"] == "sample.txt"
    assert "extracted_text" in data
    # The fixture text contains "invoices"; checking for it verifies the
    # extraction itself. (The previous `... or len(text) > 0` fallback made
    # this pass for any non-empty text.)
    assert "invoices" in data["extracted_text"].lower()
    assert data["topics"] == []
    assert "id" in data
def test_upload_pdf_no_classify(client, sample_pdf):
    """Uploading a PDF without classification reports its MIME type and some extracted text."""
    upload_form = {"auto_classify": "false"}
    with open(sample_pdf, "rb") as pdf_handle:
        response = client.post(
            "/api/documents/upload",
            files={"file": ("sample.pdf", pdf_handle, "application/pdf")},
            data=upload_form,
        )
    assert response.status_code == 200
    body = response.json()
    assert body["mime_type"] == "application/pdf"
    assert len(body["extracted_text"]) > 0
def test_list_documents(client, sample_txt):
    """After one upload the listing reports exactly one document."""
    with open(sample_txt, "rb") as txt_handle:
        client.post(
            "/api/documents/upload",
            files={"file": ("a.txt", txt_handle, "text/plain")},
            data={"auto_classify": "false"},
        )

    listing = client.get("/api/documents")
    assert listing.status_code == 200
    body = listing.json()
    assert body["total"] == 1
    assert len(body["items"]) == 1
def test_list_documents_filter_by_topic(client, sample_txt):
    """The ``topic`` query parameter only returns documents carrying that topic."""
    with open(sample_txt, "rb") as txt_handle:
        uploaded = client.post(
            "/api/documents/upload",
            files={"file": ("a.txt", txt_handle, "text/plain")},
            data={"auto_classify": "false"},
        ).json()

    # Assign the topic directly through the storage layer.
    import services.storage as st
    st.update_document_topics(uploaded["id"], ["finance"])

    matching = client.get("/api/documents?topic=finance")
    assert matching.json()["total"] == 1

    non_matching = client.get("/api/documents?topic=legal")
    assert non_matching.json()["total"] == 0
def test_get_document(client, sample_txt):
    """An uploaded document can be fetched by its id."""
    with open(sample_txt, "rb") as txt_handle:
        uploaded = client.post(
            "/api/documents/upload",
            files={"file": ("a.txt", txt_handle, "text/plain")},
            data={"auto_classify": "false"},
        ).json()

    doc_id = uploaded["id"]
    fetched = client.get(f"/api/documents/{doc_id}")
    assert fetched.status_code == 200
    assert fetched.json()["id"] == doc_id
def test_get_document_not_found(client):
    """Fetching an unknown document id answers 404."""
    response = client.get("/api/documents/nonexistent")
    assert response.status_code == 404
def test_delete_document(client, sample_txt):
    """Deleting an uploaded document succeeds and makes it unfetchable."""
    with open(sample_txt, "rb") as txt_handle:
        uploaded = client.post(
            "/api/documents/upload",
            files={"file": ("a.txt", txt_handle, "text/plain")},
            data={"auto_classify": "false"},
        ).json()

    doc_url = f"/api/documents/{uploaded['id']}"

    deletion = client.delete(doc_url)
    assert deletion.status_code == 200
    assert deletion.json()["success"] is True

    follow_up = client.get(doc_url)
    assert follow_up.status_code == 404
def test_delete_document_not_found(client):
    """Deleting an unknown document id answers 404."""
    response = client.delete("/api/documents/nonexistent")
    assert response.status_code == 404
def test_upload_empty_file(client):
    """Uploading a zero-byte file is rejected with 400."""
    empty_upload = {"file": ("empty.txt", b"", "text/plain")}
    response = client.post(
        "/api/documents/upload",
        files=empty_upload,
        data={"auto_classify": "false"},
    )
    assert response.status_code == 400
+52
View File
@@ -0,0 +1,52 @@
import pytest
from pathlib import Path
from services.extractor import extract_text
def test_extract_txt(tmp_path):
    """Plain-text files are returned with their content."""
    txt_file = tmp_path / "test.txt"
    txt_file.write_text("Hello world this is a test document.", encoding="utf-8")
    extracted = extract_text(str(txt_file), "text/plain")
    assert "Hello world" in extracted
def test_extract_pdf(tmp_path):
    """Text inserted into a generated PDF is found in the extraction."""
    import fitz

    target = tmp_path / "test.pdf"
    pdf = fitz.open()
    first_page = pdf.new_page()
    first_page.insert_text((50, 50), "PDF content about legal contracts.")
    pdf.save(str(target))
    pdf.close()

    extracted = extract_text(str(target), "application/pdf")
    assert "PDF content" in extracted
def test_extract_docx(tmp_path):
    """A paragraph written to a generated DOCX is found in the extraction."""
    from docx import Document

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    target = tmp_path / "test.docx"

    word_doc = Document()
    word_doc.add_paragraph("DOCX paragraph about financial reports.")
    word_doc.save(str(target))

    extracted = extract_text(str(target), docx_mime)
    assert "DOCX paragraph" in extracted
def test_extract_unknown_falls_back_to_text(tmp_path):
    """A ``text/csv`` file still yields its textual content."""
    csv_file = tmp_path / "test.csv"
    csv_file.write_text("col1,col2\nval1,val2", encoding="utf-8")
    extracted = extract_text(str(csv_file), "text/csv")
    assert "col1" in extracted
def test_extract_truncation(tmp_path):
    """A 60,000-character file is cut down and flagged as truncated."""
    big_file = tmp_path / "big.txt"
    big_file.write_text("A" * 60_000, encoding="utf-8")
    extracted = extract_text(str(big_file), "text/plain")
    # Upper bound leaves room for the truncation marker on top of 50k.
    assert len(extracted) <= 50_100
    assert "truncated" in extracted
+4
View File
@@ -0,0 +1,4 @@
def test_health(client):
    """``GET /health`` answers 200 with ``{"status": "ok"}``."""
    response = client.get("/health")
    assert response.status_code == 200
    body = response.json()
    assert body == {"status": "ok"}
+46
View File
@@ -0,0 +1,46 @@
"""
Integration test against a live LM Studio instance.
Skipped automatically if LM Studio is not reachable.
"""
import pytest
import httpx
def lmstudio_available() -> bool:
    """Probe the LM Studio models endpoint; any error counts as unavailable.

    Returns ``True`` only when the request completes with status 200 within
    the 3-second timeout.
    """
    try:
        probe = httpx.get("http://host.docker.internal:1234/v1/models", timeout=3)
    except Exception:
        return False
    else:
        return probe.status_code == 200
@pytest.mark.skipif(not lmstudio_available(), reason="LM Studio not reachable at host.docker.internal:1234")
@pytest.mark.asyncio
async def test_lmstudio_health_check():
    """The provider's health check succeeds against a live LM Studio."""
    from ai.lmstudio_provider import LMStudioProvider

    lmstudio = LMStudioProvider(
        base_url="http://host.docker.internal:1234",
        model="gemma-4-e4b-it",
    )
    healthy = await lmstudio.health_check()
    assert healthy, "LM Studio health check failed"
@pytest.mark.skipif(not lmstudio_available(), reason="LM Studio not reachable at host.docker.internal:1234")
@pytest.mark.asyncio
async def test_lmstudio_classify():
    """Classifying a short invoice text returns list-typed result fields."""
    from ai.lmstudio_provider import LMStudioProvider
    from config import DEFAULT_SYSTEM_PROMPT

    lmstudio = LMStudioProvider(
        base_url="http://host.docker.internal:1234",
        model="gemma-4-e4b-it",
    )
    outcome = await lmstudio.classify(
        document_text="This document is an invoice for software development services.",
        existing_topics=["Finance", "Legal", "HR"],
        system_prompt=DEFAULT_SYSTEM_PROMPT,
    )
    # Only the shape is checked; the model's actual choice is not asserted.
    for field_value in (outcome.topics, outcome.suggested_new_topics):
        assert isinstance(field_value, list)
+60
View File
@@ -0,0 +1,60 @@
def test_get_settings_defaults(client):
    """Default settings expose the expected keys and never a raw API key."""
    resp = client.get("/api/settings")
    assert resp.status_code == 200
    data = resp.json()
    assert data["active_provider"] == "lmstudio"
    assert "system_prompt" in data
    assert "providers" in data
    # API keys must be empty or in masked form: "****" followed by at most
    # four trailing characters. (The previous check,
    # `"****" not in key or len(key) <= 8`, also accepted any unmasked key.)
    for prov in ("anthropic", "openai"):
        key = data["providers"][prov].get("api_key", "")
        assert key == "" or (key.startswith("****") and len(key) <= 8)
def test_patch_system_prompt(client):
    """A patched system prompt is returned by a subsequent GET."""
    new_prompt = "Custom system prompt for testing."

    patched = client.patch("/api/settings", json={"system_prompt": new_prompt})
    assert patched.status_code == 200

    reloaded = client.get("/api/settings").json()
    assert reloaded["system_prompt"] == new_prompt
def test_patch_active_provider(client):
    """Switching the active provider to ``ollama`` is accepted and echoed back."""
    response = client.patch("/api/settings", json={"active_provider": "ollama"})
    assert response.status_code == 200
    body = response.json()
    assert body["active_provider"] == "ollama"
def test_patch_invalid_provider(client):
    """An unknown provider name is rejected with 400."""
    response = client.patch("/api/settings", json={"active_provider": "unknown"})
    assert response.status_code == 400
def test_patch_provider_config(client):
    """Patching a provider's config is accepted and the new model is echoed back."""
    ollama_cfg = {"model": "mistral", "base_url": "http://host.docker.internal:11434"}
    response = client.patch("/api/settings", json={"providers": {"ollama": ollama_cfg}})
    assert response.status_code == 200
    body = response.json()
    assert body["providers"]["ollama"]["model"] == "mistral"
def test_masked_api_key_not_overwritten(client):
    """Patching with a masked key should not overwrite the real stored key."""
    real_key = {"providers": {"anthropic": {"api_key": "sk-ant-realkey"}}}
    masked_key = {"providers": {"anthropic": {"api_key": "****key"}}}

    # Store a real key, then re-submit a masked value as a frontend would.
    for payload in (real_key, masked_key):
        client.patch("/api/settings", json=payload)

    # Read the persisted settings directly, bypassing the masking API.
    import services.storage as st
    stored = st.load_settings()
    assert stored["providers"]["anthropic"]["api_key"] == "sk-ant-realkey"
def test_get_default_prompt(client):
    """The default-prompt endpoint returns a non-empty ``system_prompt``."""
    response = client.get("/api/settings/default-prompt")
    assert response.status_code == 200
    assert "system_prompt" in response.json()
    assert len(response.json()["system_prompt"]) > 0
+72
View File
@@ -0,0 +1,72 @@
def test_list_topics_empty(client):
    """A fresh data directory lists no topics."""
    response = client.get("/api/topics")
    assert response.status_code == 200
    body = response.json()
    assert body["topics"] == []
def test_create_topic(client):
    """Creating a topic echoes its name and color and assigns an id."""
    new_topic = {"name": "Finance", "description": "Financial docs", "color": "#ff0000"}
    response = client.post("/api/topics", json=new_topic)
    assert response.status_code == 200
    created = response.json()
    assert created["name"] == "Finance"
    assert created["color"] == "#ff0000"
    assert "id" in created
def test_create_topic_deduplication(client):
    """Creating the same name in a different case does not add a second topic."""
    client.post("/api/topics", json={"name": "Finance"})
    # Same name, different case.
    second = client.post("/api/topics", json={"name": "finance"})
    assert second.status_code == 200

    stored = client.get("/api/topics").json()["topics"]
    assert len(stored) == 1
def test_update_topic(client):
    """Renaming a topic via PATCH returns the new name."""
    created = client.post("/api/topics", json={"name": "Old Name"}).json()
    topic_url = f"/api/topics/{created['id']}"
    response = client.patch(topic_url, json={"name": "New Name"})
    assert response.status_code == 200
    assert response.json()["name"] == "New Name"
def test_update_topic_not_found(client):
    """Patching an unknown topic id answers 404."""
    response = client.patch("/api/topics/nonexistent", json={"name": "X"})
    assert response.status_code == 404
def test_delete_topic(client):
    """A deleted topic no longer appears in the topic listing."""
    created = client.post("/api/topics", json={"name": "ToDelete"}).json()

    deletion = client.delete(f"/api/topics/{created['id']}")
    assert deletion.status_code == 200
    assert deletion.json()["success"] is True

    remaining_names = [t["name"] for t in client.get("/api/topics").json()["topics"]]
    assert "ToDelete" not in remaining_names
def test_delete_topic_cascades_to_documents(client, sample_txt):
    """Deleting a topic also removes its name from documents that carry it."""
    # Register the topic.
    legal_topic = client.post("/api/topics", json={"name": "Legal"}).json()

    # Upload without auto-classification so the topics stay under test control.
    with open(sample_txt, "rb") as txt_handle:
        uploaded = client.post(
            "/api/documents/upload",
            files={"file": ("sample.txt", txt_handle, "text/plain")},
            data={"auto_classify": "false"},
        ).json()
    doc_id = uploaded["id"]

    # Attach the topic directly through the storage layer.
    import services.storage as st
    st.update_document_topics(doc_id, ["Legal"])

    # Remove the topic through the API.
    client.delete(f"/api/topics/{legal_topic['id']}")

    # The document must no longer list it.
    fetched = client.get(f"/api/documents/{doc_id}").json()
    assert "Legal" not in fetched["topics"]
def test_delete_topic_not_found(client):
    """Deleting an unknown topic id answers 404."""
    response = client.delete("/api/topics/nonexistent")
    assert response.status_code == 404