chore: initial commit — existing single-user document scanner codebase
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,17 @@
|
||||
FROM python:3.12-slim

WORKDIR /app

# System deps for PyMuPDF + OCR.
# --no-install-recommends keeps optional packages out of the image; the apt
# lists are removed in the same layer so they are never stored in it.
RUN apt-get update && apt-get install -y --no-install-recommends \
        tesseract-ocr \
        libgl1 \
        libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Copy the dependency manifest on its own first so the pip layer stays cached
# until requirements.txt changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 8000
|
||||
@@ -0,0 +1,36 @@
|
||||
from ai.base import AIProvider, ClassificationResult
|
||||
from ai.anthropic_provider import AnthropicProvider
|
||||
from ai.openai_provider import OpenAIProvider
|
||||
from ai.ollama_provider import OllamaProvider
|
||||
from ai.lmstudio_provider import LMStudioProvider
|
||||
|
||||
|
||||
def get_provider(settings: dict) -> AIProvider:
|
||||
active = settings.get("active_provider", "lmstudio")
|
||||
providers = settings.get("providers", {})
|
||||
cfg = providers.get(active, {})
|
||||
|
||||
match active:
|
||||
case "anthropic":
|
||||
return AnthropicProvider(
|
||||
api_key=cfg.get("api_key", ""),
|
||||
model=cfg.get("model", "claude-sonnet-4-6"),
|
||||
)
|
||||
case "openai":
|
||||
return OpenAIProvider(
|
||||
api_key=cfg.get("api_key", ""),
|
||||
model=cfg.get("model", "gpt-4o"),
|
||||
base_url=cfg.get("base_url") or None,
|
||||
)
|
||||
case "ollama":
|
||||
return OllamaProvider(
|
||||
base_url=cfg.get("base_url", "http://host.docker.internal:11434"),
|
||||
model=cfg.get("model", "llama3.2"),
|
||||
)
|
||||
case "lmstudio":
|
||||
return LMStudioProvider(
|
||||
base_url=cfg.get("base_url", "http://host.docker.internal:1234"),
|
||||
model=cfg.get("model", "gemma-4-e4b-it"),
|
||||
)
|
||||
case _:
|
||||
raise ValueError(f"Unknown AI provider: {active}")
|
||||
@@ -0,0 +1,103 @@
|
||||
import json
|
||||
import re
|
||||
import anthropic
|
||||
from ai.base import AIProvider, ClassificationResult
|
||||
|
||||
MAX_AI_CHARS = 8_000
|
||||
|
||||
|
||||
class AnthropicProvider(AIProvider):
    """AI provider that sends requests through ``anthropic.AsyncAnthropic``."""

    def __init__(self, api_key: str, model: str = "claude-sonnet-4-6"):
        self._api_key = api_key
        self._model = model

    def _client(self):
        """Return a new async client; callers close it via ``async with``."""
        return anthropic.AsyncAnthropic(api_key=self._api_key)

    @staticmethod
    def _response_text(response) -> str:
        """Concatenate the text of every content block that carries text.

        Returns "" when the response has no content blocks, instead of
        raising IndexError/AttributeError on ``content[0].text``.
        """
        parts = []
        for block in response.content or []:
            text = getattr(block, "text", None)
            if isinstance(text, str):
                parts.append(text)
        return "".join(parts)

    async def _complete(
        self,
        user_msg: str,
        max_tokens: int,
        system_prompt: str | None = None,
    ) -> str:
        """Send one user message and return the reply text.

        The client is used as an async context manager so it is closed after
        the request rather than left open.
        """
        request = {
            "model": self._model,
            "max_tokens": max_tokens,
            "messages": [{"role": "user", "content": user_msg}],
        }
        if system_prompt is not None:
            request["system"] = system_prompt
        async with self._client() as client:
            response = await client.messages.create(**request)
        return self._response_text(response)

    async def classify(
        self,
        document_text: str,
        existing_topics: list[str],
        system_prompt: str,
    ) -> ClassificationResult:
        """Ask the model to classify the document against ``existing_topics``.

        Only the first ``MAX_AI_CHARS`` characters of the document are sent.
        """
        topics_str = ", ".join(existing_topics) if existing_topics else "(none yet)"
        user_msg = (
            f"Existing topics: [{topics_str}]\n\n"
            f"Document text:\n{document_text[:MAX_AI_CHARS]}"
        )
        raw = await self._complete(user_msg, 1024, system_prompt)
        return _parse_classification(raw)

    async def suggest_topics(
        self,
        document_text: str,
        system_prompt: str,
    ) -> list[str]:
        """Ask the model for 3-5 topic names and return the parsed list."""
        user_msg = (
            "Suggest 3-5 topic names for this document. "
            "Return ONLY valid JSON: {\"suggested_topics\": [\"topic1\", \"topic2\"]}\n\n"
            f"Document text:\n{document_text[:MAX_AI_CHARS]}"
        )
        raw = await self._complete(user_msg, 256, system_prompt)
        return _parse_suggestions(raw)

    async def health_check(self) -> bool:
        """Return True if a minimal request succeeds, False on any failure."""
        try:
            await self._complete("ping", 5)
        except Exception:
            # Deliberate best-effort: any failure means "not healthy".
            return False
        return True
|
||||
|
||||
|
||||
def _strip_code_fences(text: str) -> str:
|
||||
text = re.sub(r"```(?:json)?\s*", "", text)
|
||||
text = re.sub(r"```", "", text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def _parse_classification(raw: str) -> ClassificationResult:
|
||||
raw = _strip_code_fences(raw)
|
||||
# Try to find JSON object
|
||||
match = re.search(r"\{.*\}", raw, re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
data = json.loads(match.group())
|
||||
return ClassificationResult(
|
||||
topics=data.get("assigned_topics", []),
|
||||
suggested_new_topics=data.get("new_topic_suggestions", []),
|
||||
reasoning=data.get("reasoning", ""),
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return ClassificationResult()
|
||||
|
||||
|
||||
def _parse_suggestions(raw: str) -> list[str]:
|
||||
raw = _strip_code_fences(raw)
|
||||
match = re.search(r"\{.*\}", raw, re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
data = json.loads(match.group())
|
||||
return data.get("suggested_topics", [])
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return []
|
||||
@@ -0,0 +1,32 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
|
||||
@dataclass
class ClassificationResult:
    """Outcome of one document classification; every field defaults to empty."""

    # Topic names assigned to the document.
    topics: list[str] = field(default_factory=list)
    # Topic names proposed in addition to the assigned ones.
    suggested_new_topics: list[str] = field(default_factory=list)
    # Free-text explanation; empty when none was given.
    reasoning: str = ""
|
||||
|
||||
|
||||
class AIProvider(ABC):
    """Abstract interface that every AI backend implements."""

    @abstractmethod
    async def classify(
        self,
        document_text: str,
        existing_topics: list[str],
        system_prompt: str,
    ) -> ClassificationResult:
        """Classify ``document_text`` given the names in ``existing_topics``."""

    @abstractmethod
    async def suggest_topics(
        self,
        document_text: str,
        system_prompt: str,
    ) -> list[str]:
        """Return suggested topic names for ``document_text``."""

    @abstractmethod
    async def health_check(self) -> bool:
        """Return whether the backend is usable."""
|
||||
@@ -0,0 +1,10 @@
|
||||
from ai.openai_provider import OpenAIProvider
|
||||
|
||||
|
||||
class LMStudioProvider(OpenAIProvider):
    """OpenAIProvider preconfigured for an LM Studio server."""

    def __init__(self, base_url: str = "http://host.docker.internal:1234", model: str = "gemma-4-e4b-it"):
        # Accept the server URL with or without a trailing "/v1" (and with or
        # without trailing slashes) so "http://host:1234/v1" is not turned
        # into "http://host:1234/v1/v1".
        root = base_url.rstrip("/").removesuffix("/v1")
        super().__init__(
            api_key="lm-studio",
            model=model,
            base_url=root + "/v1",
        )
|
||||
@@ -0,0 +1,10 @@
|
||||
from ai.openai_provider import OpenAIProvider
|
||||
|
||||
|
||||
class OllamaProvider(OpenAIProvider):
    """OpenAIProvider preconfigured for an Ollama server."""

    def __init__(self, base_url: str = "http://host.docker.internal:11434", model: str = "llama3.2"):
        # Accept the server URL with or without a trailing "/v1" (and with or
        # without trailing slashes) so "http://host:11434/v1" is not turned
        # into "http://host:11434/v1/v1".
        root = base_url.rstrip("/").removesuffix("/v1")
        super().__init__(
            api_key="ollama",
            model=model,
            base_url=root + "/v1",
        )
|
||||
@@ -0,0 +1,104 @@
|
||||
import json
|
||||
import re
|
||||
from openai import AsyncOpenAI
|
||||
from ai.base import AIProvider, ClassificationResult
|
||||
|
||||
MAX_AI_CHARS = 8_000
|
||||
|
||||
|
||||
class OpenAIProvider(AIProvider):
    """AI provider that sends chat-completion requests through ``AsyncOpenAI``."""

    def __init__(self, api_key: str, model: str = "gpt-4o", base_url: str | None = None):
        self._api_key = api_key
        self._model = model
        self._base_url = base_url

    def _client(self) -> AsyncOpenAI:
        """Return a new async client; callers close it via ``async with``."""
        # An empty key is replaced with a placeholder string.
        return AsyncOpenAI(api_key=self._api_key or "placeholder", base_url=self._base_url)

    async def _complete(self, messages: list[dict], max_tokens: int) -> str:
        """Send ``messages`` and return the first choice's text ("" if none).

        The client is used as an async context manager so it is closed after
        the request rather than left open.
        """
        async with self._client() as client:
            response = await client.chat.completions.create(
                model=self._model,
                max_tokens=max_tokens,
                messages=messages,
            )
        if not response.choices:
            return ""
        return response.choices[0].message.content or ""

    async def classify(
        self,
        document_text: str,
        existing_topics: list[str],
        system_prompt: str,
    ) -> ClassificationResult:
        """Ask the model to classify the document against ``existing_topics``.

        Only the first ``MAX_AI_CHARS`` characters of the document are sent.
        """
        topics_str = ", ".join(existing_topics) if existing_topics else "(none yet)"
        user_msg = (
            f"Existing topics: [{topics_str}]\n\n"
            f"Document text:\n{document_text[:MAX_AI_CHARS]}"
        )
        raw = await self._complete(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
            ],
            1024,
        )
        return _parse_classification(raw)

    async def suggest_topics(
        self,
        document_text: str,
        system_prompt: str,
    ) -> list[str]:
        """Ask the model for 3-5 topic names and return the parsed list."""
        user_msg = (
            "Suggest 3-5 topic names for this document. "
            "Return ONLY valid JSON: {\"suggested_topics\": [\"topic1\", \"topic2\"]}\n\n"
            f"Document text:\n{document_text[:MAX_AI_CHARS]}"
        )
        raw = await self._complete(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_msg},
            ],
            256,
        )
        return _parse_suggestions(raw)

    async def health_check(self) -> bool:
        """Return True if a minimal request succeeds, False on any failure."""
        try:
            await self._complete([{"role": "user", "content": "ping"}], 5)
        except Exception:
            # Deliberate best-effort: any failure means "not healthy".
            return False
        return True
|
||||
|
||||
|
||||
def _strip_code_fences(text: str) -> str:
|
||||
text = re.sub(r"```(?:json)?\s*", "", text)
|
||||
text = re.sub(r"```", "", text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def _parse_classification(raw: str) -> ClassificationResult:
|
||||
raw = _strip_code_fences(raw)
|
||||
match = re.search(r"\{.*\}", raw, re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
data = json.loads(match.group())
|
||||
return ClassificationResult(
|
||||
topics=data.get("assigned_topics", []),
|
||||
suggested_new_topics=data.get("new_topic_suggestions", []),
|
||||
reasoning=data.get("reasoning", ""),
|
||||
)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return ClassificationResult()
|
||||
|
||||
|
||||
def _parse_suggestions(raw: str) -> list[str]:
|
||||
raw = _strip_code_fences(raw)
|
||||
match = re.search(r"\{.*\}", raw, re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
data = json.loads(match.group())
|
||||
return data.get("suggested_topics", [])
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return []
|
||||
@@ -0,0 +1,101 @@
|
||||
from datetime import datetime, timezone
|
||||
from fastapi import APIRouter, UploadFile, File, Form, HTTPException, Query
|
||||
from services import storage, extractor, classifier
|
||||
|
||||
router = APIRouter(prefix="/api/documents", tags=["documents"])
|
||||
|
||||
ALLOWED_MIME_TYPES = {
|
||||
"application/pdf",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/msword",
|
||||
"text/plain",
|
||||
"text/markdown",
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/jpg",
|
||||
"image/tiff",
|
||||
"image/webp",
|
||||
}
|
||||
|
||||
|
||||
@router.post("/upload")
async def upload_document(
    file: UploadFile = File(...),
    auto_classify: bool = Form(True),
):
    """Store an uploaded file, extract its text and optionally classify it.

    The declared content type must be in ``ALLOWED_MIME_TYPES``. When it is
    not, the type is guessed from the file name before the upload is
    rejected.

    Returns:
        The metadata dict saved for the document. If classification fails,
        the document is still saved and the dict carries a
        ``classification_error`` message.

    Raises:
        HTTPException: 400 for an empty file, 415 for an unsupported type.
    """
    content = await file.read()
    if not content:
        raise HTTPException(400, "Empty file")

    original_name = file.filename or "upload"
    mime = file.content_type or "application/octet-stream"
    if mime not in ALLOWED_MIME_TYPES:
        import mimetypes  # local import: only needed on this fallback path

        guessed, _ = mimetypes.guess_type(original_name)
        if guessed not in ALLOWED_MIME_TYPES:
            raise HTTPException(415, f"Unsupported file type: {mime}")
        mime = guessed

    saved = storage.save_upload(content, original_name, mime)
    text = extractor.extract_text(saved["path"], mime)

    now = datetime.now(timezone.utc).isoformat()
    meta = {
        "id": saved["id"],
        "original_name": original_name,
        "filename": saved["filename"],
        "mime_type": mime,
        "size_bytes": len(content),
        "extracted_text": text,
        "topics": [],
        "created_at": now,
        "classified_at": None,
    }
    storage.save_metadata(meta)

    if auto_classify:
        try:
            topics = await classifier.classify_document(saved["id"])
        except Exception as e:
            # Classification failure is non-fatal; document is still saved
            meta["classification_error"] = str(e)
        else:
            meta["topics"] = topics
            meta["classified_at"] = datetime.now(timezone.utc).isoformat()

    return meta
|
||||
|
||||
|
||||
@router.get("")
async def list_documents(
    topic: str | None = Query(None),
    page: int = Query(1, ge=1),
    per_page: int = Query(20, ge=1, le=100),
):
    """Return one page of document metadata, optionally filtered by topic.

    ``total`` is the number of matching documents before pagination.
    """
    matching = storage.list_metadata(topic=topic)
    offset = per_page * (page - 1)
    window = matching[offset : offset + per_page]
    return {
        "items": window,
        "total": len(matching),
        "page": page,
        "per_page": per_page,
    }
|
||||
|
||||
|
||||
@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Return the stored metadata for ``doc_id``; 404 when there is none."""
    if (found := storage.get_metadata(doc_id)) is not None:
        return found
    raise HTTPException(404, "Document not found")
|
||||
|
||||
|
||||
@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete ``doc_id`` via storage; 404 when storage reports nothing deleted."""
    if storage.delete_document(doc_id):
        return {"success": True}
    raise HTTPException(404, "Document not found")
|
||||
|
||||
|
||||
@router.post("/{doc_id}/classify")
async def classify_document(doc_id: str, body: dict | None = None):
    """(Re)classify a stored document.

    Args:
        doc_id: Identifier of the document to classify.
        body: Optional JSON object. Its ``topics`` key, when present, is
            passed to the classifier as the list of topic names.

    Returns:
        ``{"topics": [...]}`` with the topic names the classifier returned.

    Raises:
        HTTPException: 404 if the document does not exist, 400 if ``topics``
            is not a list of strings, 500 if classification fails.
    """
    if storage.get_metadata(doc_id) is None:
        raise HTTPException(404, "Document not found")

    # An omitted or empty body means no topic list was given.
    topic_names = body.get("topics") if body else None
    if topic_names is not None and not (
        isinstance(topic_names, list) and all(isinstance(t, str) for t in topic_names)
    ):
        raise HTTPException(400, "'topics' must be a list of strings")

    try:
        topics = await classifier.classify_document(doc_id, topic_names)
    except Exception as e:
        raise HTTPException(500, f"Classification failed: {e}") from e

    return {"topics": topics}
|
||||
@@ -0,0 +1,84 @@
|
||||
import time
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from services import storage
|
||||
from config import DEFAULT_SYSTEM_PROMPT
|
||||
from ai import get_provider
|
||||
|
||||
router = APIRouter(prefix="/api/settings", tags=["settings"])
|
||||
|
||||
|
||||
class SettingsPatch(BaseModel):
    """Request body for a partial settings update; every field is optional."""

    # Replacement system prompt.
    system_prompt: str | None = None
    # Name of the provider to make active.
    active_provider: str | None = None
    # Mapping of provider name -> option overrides.
    providers: dict | None = None
|
||||
|
||||
|
||||
class TestProviderRequest(BaseModel):
    """Request body naming the provider to test."""

    # Provider name.
    provider: str
|
||||
|
||||
|
||||
@router.get("")
async def get_settings():
    """Return the stored settings after passing them through ``settings_masked``."""
    return storage.settings_masked(storage.load_settings())
|
||||
|
||||
|
||||
@router.patch("")
async def patch_settings(body: SettingsPatch):
    """Apply a partial settings update and return the masked result.

    Only fields that are not ``None`` in ``body`` are applied. Provider
    options are merged key by key into the stored per-provider config. An
    ``api_key`` value containing ``****`` is skipped so a masked key does not
    overwrite the stored one.

    Raises:
        HTTPException: 400 for an unknown ``active_provider`` or a provider
            config that is not a JSON object. Nothing is saved in that case.
    """
    settings = storage.load_settings()

    if body.system_prompt is not None:
        settings["system_prompt"] = body.system_prompt

    if body.active_provider is not None:
        # A tuple keeps the order in the error message deterministic.
        valid = ("anthropic", "lmstudio", "ollama", "openai")
        if body.active_provider not in valid:
            raise HTTPException(400, f"Invalid provider. Must be one of: {', '.join(valid)}")
        settings["active_provider"] = body.active_provider

    if body.providers is not None:
        stored = settings.get("providers")
        if not isinstance(stored, dict):
            stored = settings["providers"] = {}
        # Deep merge per-provider config
        for prov_name, prov_cfg in body.providers.items():
            if not isinstance(prov_cfg, dict):
                raise HTTPException(400, f"Config for provider '{prov_name}' must be an object")
            existing = stored.setdefault(prov_name, {})
            for key, val in prov_cfg.items():
                # Don't overwrite api_key if it comes in masked (contains ****)
                if key == "api_key" and val and "****" in str(val):
                    continue
                existing[key] = val

    storage.save_settings(settings)
    return storage.settings_masked(settings)
|
||||
|
||||
|
||||
@router.post("/test-provider")
async def test_provider(body: TestProviderRequest):
    """Run a health check against ``body.provider`` using the stored settings.

    The stored settings are not modified. Only a top-level copy has its
    active provider switched. Responds with ``ok``, ``message`` and
    ``latency_ms``; 400 when the provider name is unknown.
    """
    # Temporarily switch active provider for the test
    candidate = {**storage.load_settings(), "active_provider": body.provider}

    try:
        provider = get_provider(candidate)
    except ValueError as e:
        raise HTTPException(400, str(e))

    started = time.monotonic()
    try:
        ok = await provider.health_check()
    except Exception as e:
        return {"ok": False, "message": str(e), "latency_ms": 0}
    elapsed_ms = int((time.monotonic() - started) * 1000)

    if ok:
        message = "Connection successful"
    else:
        message = "Health check failed"
    return {"ok": ok, "message": message, "latency_ms": elapsed_ms}
|
||||
|
||||
|
||||
@router.get("/default-prompt")
async def get_default_prompt():
    """Return ``DEFAULT_SYSTEM_PROMPT`` under the ``system_prompt`` key."""
    payload = {"system_prompt": DEFAULT_SYSTEM_PROMPT}
    return payload
|
||||
@@ -0,0 +1,72 @@
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from services import storage, classifier
|
||||
|
||||
router = APIRouter(prefix="/api/topics", tags=["topics"])
|
||||
|
||||
|
||||
class TopicCreate(BaseModel):
    """Request body for creating a topic."""

    # Topic name (required).
    name: str
    # Optional description; empty by default.
    description: str = ""
    # Colour string; defaults to "#6366f1".
    color: str = "#6366f1"
|
||||
|
||||
|
||||
class TopicUpdate(BaseModel):
    """Request body for updating a topic; every field is optional."""

    # NOTE(review): None presumably means "leave unchanged" -- confirm against
    # storage.update_topic, which receives these values as-is.
    name: str | None = None
    description: str | None = None
    color: str | None = None
|
||||
|
||||
|
||||
class SuggestRequest(BaseModel):
    """Request body naming the document to suggest topics for."""

    # Identifier of a stored document.
    document_id: str
|
||||
|
||||
|
||||
@router.get("")
async def list_topics():
    """Return all topics, each annotated with a ``doc_count``.

    A topic whose name is absent from the counts gets ``doc_count`` 0.
    """
    all_topics = storage.load_topics()
    per_name = storage.topic_doc_counts()
    for entry in all_topics:
        entry.update(doc_count=per_name.get(entry["name"], 0))
    return {"topics": all_topics}
|
||||
|
||||
|
||||
@router.post("")
async def create_topic(body: TopicCreate):
    """Create a topic from the request body and return it with ``doc_count`` 0."""
    created = storage.create_topic(body.name, body.description, body.color)
    created.update(doc_count=0)
    return created
|
||||
|
||||
|
||||
@router.patch("/{topic_id}")
async def update_topic(topic_id: str, body: TopicUpdate):
    """Update a topic and return it with its current ``doc_count``.

    Responds 404 when storage returns no topic for ``topic_id``.
    """
    changes = {
        "name": body.name,
        "description": body.description,
        "color": body.color,
    }
    updated = storage.update_topic(topic_id, **changes)
    if updated is None:
        raise HTTPException(404, "Topic not found")
    updated["doc_count"] = storage.topic_doc_counts().get(updated["name"], 0)
    return updated
|
||||
|
||||
|
||||
@router.delete("/{topic_id}")
async def delete_topic(topic_id: str):
    """Delete a topic; 404 when storage returns None for ``topic_id``."""
    if storage.delete_topic(topic_id) is None:
        raise HTTPException(404, "Topic not found")
    return {"success": True, "removed_from_documents": True}
|
||||
|
||||
|
||||
@router.post("/suggest")
async def suggest_topics(body: SuggestRequest):
    """Return topic suggestions for a stored document.

    Responds 404 when the document is unknown and 500 when the classifier
    raises.
    """
    if storage.get_metadata(body.document_id) is None:
        raise HTTPException(404, "Document not found")
    try:
        proposed = await classifier.suggest_topics_for_document(body.document_id)
    except Exception as e:
        raise HTTPException(500, f"Suggestion failed: {e}")
    return {"suggested": proposed}
|
||||
@@ -0,0 +1,51 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
DATA_DIR = Path(os.environ.get("DATA_DIR", "/app/data"))
|
||||
UPLOADS_DIR = DATA_DIR / "uploads"
|
||||
METADATA_DIR = DATA_DIR / "metadata"
|
||||
TOPICS_FILE = DATA_DIR / "topics.json"
|
||||
SETTINGS_FILE = DATA_DIR / "settings.json"
|
||||
|
||||
DEFAULT_SYSTEM_PROMPT = """You are a document classification assistant. When given a document's text content and a list of existing topics, you must:
|
||||
1. Assign the document to one or more relevant topics from the list.
|
||||
2. If no existing topics fit well, suggest new topic names.
|
||||
Return ONLY valid JSON in this exact format, with no additional text or explanation:
|
||||
{"assigned_topics": ["topic1"], "new_topic_suggestions": ["new topic name"]}
|
||||
If the document fits no topics and you have no suggestions, return: {"assigned_topics": [], "new_topic_suggestions": []}"""
|
||||
|
||||
DEFAULT_SETTINGS = {
|
||||
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
||||
"active_provider": "lmstudio",
|
||||
"providers": {
|
||||
"anthropic": {
|
||||
"api_key": "",
|
||||
"model": "claude-sonnet-4-6"
|
||||
},
|
||||
"openai": {
|
||||
"api_key": "",
|
||||
"model": "gpt-4o",
|
||||
"base_url": None
|
||||
},
|
||||
"ollama": {
|
||||
"base_url": "http://host.docker.internal:11434",
|
||||
"model": "llama3.2"
|
||||
},
|
||||
"lmstudio": {
|
||||
"base_url": "http://host.docker.internal:1234",
|
||||
"model": "gemma-4-e4b-it"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def ensure_data_dirs():
    """Create the upload/metadata directories and seed missing JSON files.

    ``TOPICS_FILE`` is seeded with an empty topic list and ``SETTINGS_FILE``
    with ``DEFAULT_SETTINGS``. Existing files are left untouched.
    """
    for directory in (UPLOADS_DIR, METADATA_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    seeds = (
        (TOPICS_FILE, {"topics": []}),
        (SETTINGS_FILE, DEFAULT_SETTINGS),
    )
    for target, initial in seeds:
        if not target.exists():
            target.write_text(json.dumps(initial, indent=2))
|
||||
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"id": "69eb8545-2e19-4651-903e-6489dbd9f687",
|
||||
"original_name": "1907-Rechnung.pdf",
|
||||
"filename": "69eb8545-2e19-4651-903e-6489dbd9f687.pdf",
|
||||
"mime_type": "application/pdf",
|
||||
"size_bytes": 38090,
|
||||
"extracted_text": "mobilcom-debitel GmbH · Geschäftsführung: Ingo Arnold, Antonius Fromme, Rickmann von Platen \nHRB 14826 KI, Amtsgericht Kiel · Vorsitzender des Aufsichtsrats: Stephan Esch · Sitz der Gesellschaft: Büdelsdorf\nBankverbindung: Commerzbank Rendsburg · IBAN DE08214400450844443200 · BIC COBADEFFXXX\nUSt-ID: DE 194 910 634 · Gläubiger-ID: DE43ZZZ00000074855\nHaben Sie Fragen zur Rechnung?\nwww.md.de/faq\nmobilcom-debitel Kundenservice\nHandykurzwahl: 22240\nDer Anruf erfolgt zu einer ortsgebundenen Rufnummer\nTelefon: 040/55 55 41 00 0\nmobilcom-debitel Kundenservice Technik\nTelefon: 0900/10 22 24 0\n€ 2,49/Anruf, nur aus dem dt. Festnetz erreichbar\nwww.md.de\nHerrn\nDominik Ritter\nLeibnizstr. 41\n10629 Berlin\nRechnungsdatum:\nRechnungsnr.:\nKundennummer:\n31.07.2019\nM19046649250\n33040574\nPost: mobilcom-debitel GmbH · 99076 Erfurt\nIhre mobilcom-debitel Rechnung\nRechnungsbetrag netto\n55,4645 €\nUSt.-Betrag (19%)\n10,54 €\nRechnungsbetrag gesamt\n66,00 €\nDie Begleichung der Rechnung erfolgt am 07.08.2019 im Lastschriftverfahren mit der Mandatsreferenz-Nummer\nMC-33040574-00000001 von dem Konto: IBAN DE38100208900615356026.\nKennen Sie schon waipu.tv? 
Das ist Fernsehen wie noch nie: auf Smartphone, Tablet oder Ihrem TV.\nJetzt kostenlos testen: md.de/tv/waipu-tv.\nMobilfunk-Vertragsabrechnungen\nMobilfunk-Rufnummer: 0170 / 4322717\nVertragsnummer:\n217582256\nTeilnehmer: Dominik Ritter\nTarif:\nreal Allnet mit Smartphone 10\nMobilfunknetz: Telekom Mobilfunk\nDie Leistungen im Überblick\nMenge Details\nZeitraum/Datum\nSumme\nBasisleistungen\n1 Grundgebühr\n01.08.2019 - 31.08.2019\n31,0840 €\n1 freenet Hotspot Flat (DLS24M0TB0G0000):\nUnbegrenztes Datenvolumen im größten WLAN-Netzwerk\n01.08.2019 - 31.08.2019\n0,0000 €\n1 T@ke-away Flat Upgrade (+2 GB) - 6M (anteilig)\n03.07.2019 - 31.07.2019\n11,7839 €\n1 T@ke-away Flat Upgrade (+2 GB) - 6M\n01.08.2019 - 31.08.2019\n12,5966 €\n1 Kaspersky Passwort Manager 1 Monat (DLS1M1TB1G0299)\n(anteilig):\nEin Passwort für mehrere Konten!\n03.07.2019 - 31.07.2019\n2,3505 €\n1 Kaspersky Passwort Manager 1 Monat (DLS1M1TB1G0299)\n(anteilig)\n01.08.2019 - 02.08.2019\n0,1621 €\n1 Gutschrift Kaspersky Passwort Manager\n(DLS1M1TB1G0299) (anteilig)\n03.07.2019 - 31.07.2019\n-2,3505 €\n1 Gutschrift Kaspersky Passwort Manager\n(DLS1M1TB1G0299) (anteilig)\n01.08.2019 - 02.08.2019\n-0,1621 €\n1 Smartphone-Option\n01.08.2019 - 31.08.2019\n8,4034 €\nVerbindungen\n3 Verbindungen ins dt. Festnetz (FN)\n01.07.2019 - 03.07.2019\n0,0000 €\n39 Netzexterne Verbindungen (NX)\n28.06.2019 - 30.07.2019\n0,0000 €\n1 Abgehende Roaming Verbindungen (RA)\n17.07.2019 - 17.07.2019\n0,0000 €\n202 Datenverbindungen (DATA)\n27.06.2019 - 30.07.2019\n0,0000 €\n120 Roaming Datenverbindungen (RD)\n14.07.2019 - 20.07.2019\n0,0000 €\nZwischensumme netto\n63,8679 €\nIhre mobilcom-debitel Vorteile\n1 24 x 10 Euro Grundgebührrabatt\n01.08.2019 - 31.08.2019\n-8,4034 €\nNettobetrag für Rufnummer 0170 / 4322717\n55,4645 €\nSofern Sie die Löschung Ihrer Verbindungsdaten sofort, 90 oder 180 Tage nach Rechnungsstellung gewünscht haben, entfällt\nmit der Löschung unsere Nachweispflicht für diese Daten. 
Erfolgt innerhalb von 8 Wochen nach Erhalt der Rechnung kein\nschriftlicher Widerspruch, gilt die Rechnung als genehmigt. Begründete Einwendungen können auch gegen einzelne in der\nRechnung dargestellte Forderungen erhoben werden. Verzug tritt spätestens 30 Tage nach Zugang der Rechnung ein. Dies\nschließt einen frühzeitigeren Verzug nicht aus. Hinweise zum Ablauf eines Anbieterwechsels finden Sie auf der Internetseite\nder Bundesnetzagentur.\nRechnungserklärung\nSeite 1 von 2\n\nmobilcom-debitel GmbH · Geschäftsführung: Ingo Arnold, Antonius Fromme, Rickmann von Platen \nHRB 14826 KI, Amtsgericht Kiel · Vorsitzender des Aufsichtsrats: Stephan Esch · Sitz der Gesellschaft: Büdelsdorf\nBankverbindung: Commerzbank Rendsburg · IBAN DE08214400450844443200 · BIC COBADEFFXXX\nUSt-ID: DE 194 910 634 · Gläubiger-ID: DE43ZZZ00000074855\nRechnungsdatum:\nRechnungsnr.:\nKundennummer:\n31.07.2019\nM19046649250\n33040574\nIhre mobilcom-debitel Rechnung\nInformationen gemäß Telekommunikations-Transparenzverordnung\nMobilfunk-Rufnummer: 0170 / 4322717\nZeitraum Datenverbrauch:\n01.06.2019 - 30.06.2019\nVertragsbeginn:\n20.12.2016 Kündigungsfrist:\n3 Monat(e) Summe vereinbartes Datenvolumen:\n8000 MB\nMindestlaufzeit bis:\n19.12.2020 Kündigungseingang bis:\n19.09.2020 Verbrauchtes Datenvolumen:\n8080 MB\nSeite 2 von 2",
|
||||
"topics": [
|
||||
"Telecommunications",
|
||||
"Billing and Invoicing"
|
||||
],
|
||||
"created_at": "2026-04-16T11:08:33.558670+00:00",
|
||||
"classified_at": "2026-04-16T11:08:40.831347+00:00"
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -0,0 +1,13 @@
|
||||
{
|
||||
"id": "cf4dd4cf-dcfb-42f1-957d-bcdba640163b",
|
||||
"original_name": "invoice.txt",
|
||||
"filename": "cf4dd4cf-dcfb-42f1-957d-bcdba640163b.txt",
|
||||
"mime_type": "text/plain",
|
||||
"size_bytes": 108,
|
||||
"extracted_text": "This is an invoice for professional consulting services rendered in April 2026. Total amount due: 5000 EUR.",
|
||||
"topics": [
|
||||
"Invoice"
|
||||
],
|
||||
"created_at": "2026-04-16T11:06:08.026326+00:00",
|
||||
"classified_at": "2026-04-16T11:06:09.636422+00:00"
|
||||
}
|
||||
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"id": "e71d8a85-09a1-4cd8-b602-65aa9216a724",
|
||||
"original_name": "test_doc.txt",
|
||||
"filename": "e71d8a85-09a1-4cd8-b602-65aa9216a724.txt",
|
||||
"mime_type": "text/plain",
|
||||
"size_bytes": 57,
|
||||
"extracted_text": "This document is about accounting and financial reports.",
|
||||
"topics": [],
|
||||
"created_at": "2026-04-16T11:05:24.317425+00:00",
|
||||
"classified_at": null
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"system_prompt": "You are a document classification assistant. When given a document's text content and a list of existing topics, you must:\n1. Assign the document to one or more relevant topics from the list.\n2. If no existing topics fit well, suggest new topic names.\nReturn ONLY valid JSON in this exact format, with no additional text or explanation:\n{\"assigned_topics\": [\"topic1\"], \"new_topic_suggestions\": [\"new topic name\"]}\nIf the document fits no topics and you have no suggestions, return: {\"assigned_topics\": [], \"new_topic_suggestions\": []}",
|
||||
"active_provider": "lmstudio",
|
||||
"providers": {
|
||||
"anthropic": {
|
||||
"api_key": "",
|
||||
"model": "claude-sonnet-4-6"
|
||||
},
|
||||
"openai": {
|
||||
"api_key": "",
|
||||
"model": "gpt-4o",
|
||||
"base_url": null
|
||||
},
|
||||
"ollama": {
|
||||
"base_url": "http://host.docker.internal:11434",
|
||||
"model": "llama3.2"
|
||||
},
|
||||
"lmstudio": {
|
||||
"base_url": "http://host.docker.internal:1234",
|
||||
"model": "gemma-4-e4b-it"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,22 @@
|
||||
{
|
||||
"topics": [
|
||||
{
|
||||
"id": "39ffdadb",
|
||||
"name": "Test Topic",
|
||||
"description": "",
|
||||
"color": "#6366f1"
|
||||
},
|
||||
{
|
||||
"id": "d2e0fbd8",
|
||||
"name": "Telecommunications",
|
||||
"description": "",
|
||||
"color": "#6366f1"
|
||||
},
|
||||
{
|
||||
"id": "d3823fd0",
|
||||
"name": "Billing and Invoicing",
|
||||
"description": "",
|
||||
"color": "#6366f1"
|
||||
}
|
||||
]
|
||||
}
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
@@ -0,0 +1 @@
|
||||
This is an invoice for professional consulting services rendered in April 2026. Total amount due: 5000 EUR.
|
||||
@@ -0,0 +1 @@
|
||||
This document is about accounting and financial reports.
|
||||
@@ -0,0 +1,33 @@
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from config import ensure_data_dirs
|
||||
from api.documents import router as documents_router
|
||||
from api.topics import router as topics_router
|
||||
from api.settings import router as settings_router
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: prepare the data directories at startup.

    Nothing runs after the ``yield``, so there is no shutdown work.
    """
    ensure_data_dirs()
    yield
|
||||
|
||||
|
||||
app = FastAPI(title="Document Scanner API", version="1.0.0", lifespan=lifespan)
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness endpoint; always returns a fixed ``{"status": "ok"}`` payload."""
    return dict(status="ok")
|
||||
|
||||
|
||||
app.include_router(documents_router)
|
||||
app.include_router(topics_router)
|
||||
app.include_router(settings_router)
|
||||
@@ -0,0 +1,3 @@
|
||||
[pytest]
|
||||
asyncio_mode = auto
|
||||
testpaths = tests
|
||||
@@ -0,0 +1,15 @@
|
||||
fastapi>=0.111
|
||||
uvicorn[standard]>=0.29
|
||||
python-multipart
|
||||
pydantic-settings>=2.2
|
||||
anthropic>=0.26
|
||||
openai>=1.30
|
||||
PyMuPDF>=1.24
|
||||
python-docx>=1.1
|
||||
pytesseract>=0.3
|
||||
Pillow>=10.3
|
||||
filelock>=3.14
|
||||
aiofiles>=23.2
|
||||
httpx>=0.27
|
||||
pytest>=8.2
|
||||
pytest-asyncio>=0.23
|
||||
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Classification orchestrator.
|
||||
Loads settings, selects AI provider, classifies document, auto-creates suggested topics.
|
||||
"""
|
||||
from services import storage
|
||||
from ai import get_provider
|
||||
|
||||
MAX_AI_CHARS = 8_000
|
||||
|
||||
|
||||
async def classify_document(doc_id: str, topic_names: list[str] | None = None) -> list[str]:
    """Classify a document by its ID and persist the resulting topics.

    Args:
        doc_id: Identifier of a stored document.
        topic_names: Topic names offered to the model. Defaults to every
            topic in the registry.

    Returns:
        The topic names stored on the document: everything the model assigned
        or suggested, stripped, de-duplicated case-insensitively, in the
        model's order, and spelled as in the registry when the topic exists.

    Raises:
        ValueError: If the document does not exist.
    """
    meta = storage.get_metadata(doc_id)
    if meta is None:
        raise ValueError(f"Document {doc_id} not found")

    settings = storage.load_settings()
    system_prompt = settings.get("system_prompt", "")
    provider = get_provider(settings)

    # Always load the registry. When the caller restricts the offered topics,
    # the existence check must still see every registered topic, otherwise an
    # existing topic outside the restriction would be created again.
    registry_names = [t["name"] for t in storage.load_topics()]
    if topic_names is None:
        topic_names = registry_names

    text = meta.get("extracted_text", "")
    result = await provider.classify(text[:MAX_AI_CHARS], topic_names, system_prompt)

    # Lower-cased name -> registry spelling (first occurrence wins).
    canonical: dict[str, str] = {}
    for name in registry_names:
        canonical.setdefault(name.lower(), name)

    final_topics: list[str] = []
    seen: set[str] = set()
    for raw in [*(result.topics or []), *(result.suggested_new_topics or [])]:
        if not isinstance(raw, str):
            continue
        name = raw.strip()
        key = name.lower()
        if not name or key in seen:
            continue
        seen.add(key)
        if key not in canonical:
            # Auto-create any topic not already in the registry
            storage.create_topic(name)
            canonical[key] = name
        final_topics.append(canonical[key])

    storage.update_document_topics(doc_id, final_topics)
    return final_topics
|
||||
|
||||
|
||||
async def suggest_topics_for_document(doc_id: str) -> list[str]:
    """Return AI-suggested topic names without modifying the document.

    Raises:
        ValueError: If the document does not exist.
    """
    meta = storage.get_metadata(doc_id)
    if meta is None:
        raise ValueError(f"Document {doc_id} not found")

    settings = storage.load_settings()
    provider = get_provider(settings)
    prompt = settings.get("system_prompt", "")
    excerpt = meta.get("extracted_text", "")[:MAX_AI_CHARS]
    return await provider.suggest_topics(excerpt, prompt)
|
||||
@@ -0,0 +1,71 @@
|
||||
"""
|
||||
Text extraction dispatcher.
|
||||
Supports: PDF (PyMuPDF), DOCX (python-docx), plain text, images (pytesseract).
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
MAX_STORED_CHARS = 50_000
|
||||
|
||||
|
||||
def extract_text(file_path: str, mime_type: str) -> str:
    """Dispatch to the extractor matching the MIME type or file suffix.

    PDFs and Word documents are recognised by MIME type or suffix, images by
    MIME type only; everything else is read as a plain-text file.

    Any exception raised by an extractor is reported in the returned string
    as ``[Extraction error: ...]`` instead of propagating.
    """
    source = Path(file_path)
    word_mimes = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )
    try:
        ext = source.suffix.lower()
        if mime_type == "application/pdf" or ext == ".pdf":
            return _extract_pdf(source)
        if mime_type in word_mimes or ext in (".docx", ".doc"):
            return _extract_docx(source)
        if mime_type and mime_type.startswith("image/"):
            return _extract_image(source)
        return _extract_text_file(source)
    except Exception as err:
        return f"[Extraction error: {err}]"
|
||||
|
||||
|
||||
def _extract_pdf(path: Path) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    The result is passed through ``_truncate``. The document handle is
    closed in a ``finally`` block so it is released even when reading a
    page raises.
    """
    import fitz  # PyMuPDF

    doc = fitz.open(str(path))
    try:
        text = "\n".join(page.get_text() for page in doc)
    finally:
        doc.close()
    return _truncate(text)
|
||||
|
||||
|
||||
def _extract_docx(path: Path) -> str:
    """Return the non-blank paragraphs of a Word document, one per line.

    The joined text is passed through ``_truncate``.
    """
    from docx import Document

    document = Document(str(path))
    kept = []
    for para in document.paragraphs:
        content = para.text
        if content.strip():
            kept.append(content)
    return _truncate("\n".join(kept))
|
||||
|
||||
|
||||
def _extract_image(path: Path) -> str:
    """Run OCR on an image file and return the recognised text.

    The text is passed through ``_truncate``. Failures are reported in the
    returned string rather than raised: a missing Pillow/pytesseract install
    yields an "OCR unavailable" marker, any other error an "OCR error" marker.
    """
    try:
        from PIL import Image
        import pytesseract

        # The context manager closes the underlying file once OCR is done.
        with Image.open(str(path)) as img:
            text = pytesseract.image_to_string(img)
        return _truncate(text)
    except ImportError:
        return "[OCR unavailable: pytesseract or Pillow not installed]"
    except Exception as e:
        return f"[OCR error: {e}]"
|
||||
|
||||
|
||||
def _extract_text_file(path: Path) -> str:
    """Read a plain-text file, trying several encodings in turn.

    UTF-8 is attempted first, then cp1252, then latin-1. latin-1 maps every
    byte value to a character and so never raises ``UnicodeDecodeError``;
    it has to come last, otherwise it would shadow the cp1252 attempt.

    Returns the decoded text passed through ``_truncate``. The final
    "[Could not decode text file]" marker is a defensive fallback that is
    not expected to be reached while latin-1 is in the list.
    """
    for enc in ("utf-8", "cp1252", "latin-1"):
        try:
            return _truncate(path.read_text(encoding=enc))
        except UnicodeDecodeError:
            continue
    return "[Could not decode text file]"
|
||||
|
||||
|
||||
def _truncate(text: str) -> str:
    """Strip surrounding whitespace and cap the text at MAX_STORED_CHARS.

    When the stripped text is longer than the cap, it is cut to the cap and
    a "[... truncated ...]" marker is appended on a new line.
    """
    stripped = text.strip()
    if len(stripped) <= MAX_STORED_CHARS:
        return stripped
    return stripped[:MAX_STORED_CHARS] + "\n[... truncated ...]"
|
||||
@@ -0,0 +1,187 @@
|
||||
import json
|
||||
import uuid
|
||||
import shutil
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from filelock import FileLock
|
||||
from config import UPLOADS_DIR, METADATA_DIR, TOPICS_FILE, SETTINGS_FILE, DEFAULT_SETTINGS
|
||||
|
||||
|
||||
# ── File locks ────────────────────────────────────────────────────────────────
|
||||
|
||||
_topics_lock = FileLock(str(TOPICS_FILE) + ".lock")
|
||||
_settings_lock = FileLock(str(SETTINGS_FILE) + ".lock")
|
||||
|
||||
|
||||
# ── Documents ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def save_upload(file_bytes: bytes, original_name: str, mime_type: str) -> dict:
    """Write uploaded bytes to UPLOADS_DIR under a freshly generated UUID.

    The stored filename is the new id plus the lower-cased suffix of
    ``original_name``. ``mime_type`` is accepted but not used here.

    Returns a dict with the new ``id``, the stored ``filename`` and the
    destination ``path`` as a string.
    """
    new_id = str(uuid.uuid4())
    stored_name = f"{new_id}{Path(original_name).suffix.lower()}"
    target = UPLOADS_DIR / stored_name
    target.write_bytes(file_bytes)
    return {"id": new_id, "filename": stored_name, "path": str(target)}
|
||||
|
||||
|
||||
def save_metadata(meta: dict) -> None:
    """Serialise ``meta`` to ``<id>.json`` in METADATA_DIR.

    The write happens while holding a per-document file lock
    (``<id>.json.lock``).
    """
    target = METADATA_DIR / f"{meta['id']}.json"
    with FileLock(str(target) + ".lock"):
        serialised = json.dumps(meta, indent=2, ensure_ascii=False)
        target.write_text(serialised)
|
||||
|
||||
|
||||
def get_metadata(doc_id: str) -> dict | None:
    """Load the metadata for ``doc_id``, or return None if the file is absent."""
    target = METADATA_DIR / f"{doc_id}.json"
    if target.exists():
        return json.loads(target.read_text())
    return None
|
||||
|
||||
|
||||
def list_metadata(topic: str | None = None) -> list[dict]:
    """Return all document metadata, most recently modified file first.

    When ``topic`` is given, only documents whose ``topics`` list contains
    it are returned. Files that cannot be read or parsed are skipped.
    """
    candidates = list(METADATA_DIR.glob("*.json"))
    candidates.sort(key=lambda f: f.stat().st_mtime, reverse=True)

    matches = []
    for meta_file in candidates:
        try:
            record = json.loads(meta_file.read_text())
        except Exception:
            continue
        if not topic or topic in record.get("topics", []):
            matches.append(record)
    return matches
|
||||
|
||||
|
||||
def delete_document(doc_id: str) -> bool:
    """Delete a document's upload, metadata file and lock file.

    Returns False when no metadata exists for ``doc_id``, True otherwise.
    """
    meta_path = METADATA_DIR / f"{doc_id}.json"
    if not meta_path.exists():
        return False

    meta = json.loads(meta_path.read_text())

    # A missing or empty filename would make the path resolve to UPLOADS_DIR
    # itself, and unlink() on a directory raises. Only remove a real file.
    filename = meta.get("filename")
    if filename:
        upload_path = UPLOADS_DIR / filename
        if upload_path.is_file():
            upload_path.unlink()

    meta_path.unlink()

    lock_path = Path(str(meta_path) + ".lock")
    if lock_path.exists():
        lock_path.unlink()
    return True
|
||||
|
||||
|
||||
def update_document_topics(doc_id: str, topics: list[str]) -> dict | None:
    """Replace a document's topics and stamp ``classified_at`` with the current UTC time.

    Returns the updated metadata, or None when the document does not exist.
    """
    record = get_metadata(doc_id)
    if record is not None:
        record.update(
            topics=topics,
            classified_at=datetime.now(timezone.utc).isoformat(),
        )
        save_metadata(record)
    return record
|
||||
|
||||
|
||||
def remove_topic_from_all_documents(topic_name: str) -> int:
    """Strip ``topic_name`` from every document that carries it.

    Unreadable or unparseable metadata files are skipped. Each rewrite is
    done under that document's file lock.

    Returns the number of documents that were rewritten.
    """
    updated = 0
    for meta_file in METADATA_DIR.glob("*.json"):
        try:
            record = json.loads(meta_file.read_text())
        except Exception:
            continue

        assigned = record.get("topics", [])
        if topic_name not in assigned:
            continue

        record["topics"] = [name for name in assigned if name != topic_name]
        with FileLock(str(meta_file) + ".lock"):
            meta_file.write_text(json.dumps(record, indent=2, ensure_ascii=False))
        updated += 1
    return updated
|
||||
|
||||
|
||||
# ── Topics ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def load_topics() -> list[dict]:
    """Read the topic registry from TOPICS_FILE while holding the topics lock.

    Returns the list under the ``"topics"`` key, or an empty list if the key
    is missing.
    """
    with _topics_lock:
        registry = json.loads(TOPICS_FILE.read_text())
        return registry.get("topics", [])
|
||||
|
||||
|
||||
def save_topics(topics: list[dict]) -> None:
    """Write ``topics`` to TOPICS_FILE under the ``"topics"`` key, holding the topics lock."""
    registry = {"topics": topics}
    with _topics_lock:
        TOPICS_FILE.write_text(json.dumps(registry, indent=2))
|
||||
|
||||
|
||||
def get_topic(topic_id: str) -> dict | None:
    """Return the first topic whose ``id`` equals ``topic_id``, or None."""
    for candidate in load_topics():
        if candidate["id"] == topic_id:
            return candidate
    return None
|
||||
|
||||
|
||||
def create_topic(name: str, description: str = "", color: str = "#6366f1") -> dict:
    """Create a topic, or return the existing one with the same name.

    Names are compared case-insensitively. When a match exists it is
    returned unchanged and nothing is written. Otherwise a new topic with
    an 8-character id taken from a UUID4 is appended and saved.
    """
    topics = load_topics()

    # Deduplicate by name (case-insensitive) in a single pass.
    wanted = name.lower()
    existing = next((t for t in topics if t["name"].lower() == wanted), None)
    if existing is not None:
        return existing

    topic = {
        "id": str(uuid.uuid4())[:8],
        "name": name,
        "description": description,
        "color": color,
    }
    topics.append(topic)
    save_topics(topics)
    return topic
|
||||
|
||||
|
||||
def update_topic(topic_id: str, **kwargs) -> dict | None:
    """Apply the non-None keyword arguments to the topic with ``topic_id``.

    Returns the updated topic after saving the registry, or None when no
    topic has that id (in which case nothing is written).
    """
    all_topics = load_topics()
    target = next((t for t in all_topics if t["id"] == topic_id), None)
    if target is None:
        return None

    changes = {field: value for field, value in kwargs.items() if value is not None}
    target.update(changes)
    save_topics(all_topics)
    return target
|
||||
|
||||
|
||||
def delete_topic(topic_id: str) -> str | None:
    """Remove a topic from the registry and from every document.

    Returns the deleted topic's name, or None when no topic has that id.
    """
    all_topics = load_topics()
    for entry in all_topics:
        if entry["id"] == topic_id:
            break
    else:
        return None

    removed_name = entry["name"]
    survivors = [t for t in all_topics if t["id"] != topic_id]
    save_topics(survivors)
    remove_topic_from_all_documents(removed_name)
    return removed_name
|
||||
|
||||
|
||||
def topic_doc_counts() -> dict[str, int]:
    """Count how many times each topic name appears across all document metadata.

    Metadata files that cannot be read or parsed are skipped.
    """
    tally: dict[str, int] = {}
    for meta_file in METADATA_DIR.glob("*.json"):
        try:
            record = json.loads(meta_file.read_text())
        except Exception:
            continue
        for topic_name in record.get("topics", []):
            tally.setdefault(topic_name, 0)
            tally[topic_name] += 1
    return tally
|
||||
|
||||
|
||||
# ── Settings ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def load_settings() -> dict:
    """Read and parse SETTINGS_FILE while holding the settings lock."""
    with _settings_lock:
        raw = SETTINGS_FILE.read_text()
        return json.loads(raw)
|
||||
|
||||
|
||||
def save_settings(settings: dict) -> None:
    """Serialise ``settings`` to SETTINGS_FILE while holding the settings lock."""
    with _settings_lock:
        serialised = json.dumps(settings, indent=2)
        SETTINGS_FILE.write_text(serialised)
|
||||
|
||||
|
||||
def mask_api_key(key: str) -> str:
    """Return a masked form of ``key`` that reveals at most its last four characters.

    Empty keys and keys of four characters or fewer are masked entirely.
    """
    visible_tail = key[-4:] if key and len(key) > 4 else ""
    return "****" + visible_tail
|
||||
|
||||
|
||||
def settings_masked(settings: dict) -> dict:
    """Return a deep copy of ``settings`` with the anthropic and openai API keys masked.

    The input dict is not modified. Empty or missing keys are left as they are.
    """
    import copy

    masked = copy.deepcopy(settings)
    providers = masked.get("providers", {})
    for provider_name in ("anthropic", "openai"):
        secret = providers.get(provider_name, {}).get("api_key", "")
        if secret:
            providers[provider_name]["api_key"] = mask_api_key(secret)
    return masked
|
||||
@@ -0,0 +1,70 @@
|
||||
"""
|
||||
pytest configuration: isolate each test with a temporary data directory.
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
import pytest
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def isolated_data_dir(monkeypatch, tmp_path):
    """Give every test a fresh data directory and point the app at it.

    Creates the uploads/metadata folders plus empty topics and default
    settings files, then patches the path constants in ``config`` and
    ``services.storage`` (including the two module-level file locks).
    """
    root = tmp_path / "data"
    uploads = root / "uploads"
    metadata = root / "metadata"
    topics_file = root / "topics.json"
    settings_file = root / "settings.json"

    for folder in (uploads, metadata):
        folder.mkdir(parents=True)
    topics_file.write_text(json.dumps({"topics": []}))

    from config import DEFAULT_SETTINGS
    settings_file.write_text(json.dumps(DEFAULT_SETTINGS))

    monkeypatch.setenv("DATA_DIR", str(root))

    path_constants = {
        "UPLOADS_DIR": uploads,
        "METADATA_DIR": metadata,
        "TOPICS_FILE": topics_file,
        "SETTINGS_FILE": settings_file,
    }

    # Patch the module-level path constants so the running app sees the temp dir
    import config
    monkeypatch.setattr(config, "DATA_DIR", root)
    for attr, value in path_constants.items():
        monkeypatch.setattr(config, attr, value)

    # services.storage copied the constants at import time, so patch its copies too
    import services.storage as st
    from filelock import FileLock
    for attr, value in path_constants.items():
        monkeypatch.setattr(st, attr, value)
    monkeypatch.setattr(st, "_topics_lock", FileLock(str(topics_file) + ".lock"))
    monkeypatch.setattr(st, "_settings_lock", FileLock(str(settings_file) + ".lock"))

    yield root
|
||||
|
||||
|
||||
@pytest.fixture
def client(isolated_data_dir):
    """Yield a FastAPI TestClient for the app, running on the isolated data dir."""
    from main import app

    with TestClient(app) as test_client:
        yield test_client
|
||||
|
||||
|
||||
@pytest.fixture
def sample_txt(tmp_path):
    """Write a small text file about invoices and return its path."""
    text_file = tmp_path / "sample.txt"
    text_file.write_text("This is a test document about invoices and finance.")
    return text_file
|
||||
|
||||
|
||||
@pytest.fixture
def sample_pdf(tmp_path):
    """Build a one-page PDF with a line of text and return its path."""
    import fitz

    target = tmp_path / "sample.pdf"
    pdf = fitz.open()
    first_page = pdf.new_page()
    first_page.insert_text((50, 50), "Test PDF document about contracts and legal matters.")
    pdf.save(str(target))
    pdf.close()
    return target
|
||||
@@ -0,0 +1,110 @@
|
||||
"""
|
||||
Unit tests for AI provider JSON parsing robustness and classifier orchestration.
|
||||
Uses a mock provider — no real AI calls made.
|
||||
"""
|
||||
import json
|
||||
import pytest
|
||||
from ai.openai_provider import _parse_classification, _parse_suggestions, _strip_code_fences
|
||||
from ai.base import ClassificationResult
|
||||
|
||||
|
||||
def test_parse_clean_json():
    """Bare JSON is parsed into assigned topics and (empty) suggestions."""
    parsed = _parse_classification(
        '{"assigned_topics": ["finance", "invoices"], "new_topic_suggestions": []}'
    )
    assert parsed.topics == ["finance", "invoices"]
    assert parsed.suggested_new_topics == []
|
||||
|
||||
|
||||
def test_parse_with_code_fence():
    """JSON wrapped in a ```json fence is still parsed."""
    fenced = '```json\n{"assigned_topics": ["legal"], "new_topic_suggestions": ["contracts"]}\n```'
    parsed = _parse_classification(fenced)
    assert parsed.topics == ["legal"]
    assert parsed.suggested_new_topics == ["contracts"]
|
||||
|
||||
|
||||
def test_parse_with_preamble():
    """JSON surrounded by extra prose is still parsed."""
    chatty = 'Here is the classification:\n{"assigned_topics": ["hr"], "new_topic_suggestions": []}\nDone.'
    parsed = _parse_classification(chatty)
    assert parsed.topics == ["hr"]
|
||||
|
||||
|
||||
def test_parse_malformed_returns_empty():
    """A reply without JSON yields empty topic and suggestion lists."""
    parsed = _parse_classification("I cannot classify this document.")
    assert parsed.topics == []
    assert parsed.suggested_new_topics == []
|
||||
|
||||
|
||||
def test_strip_code_fences():
    """A ```json fence around ``{}`` is removed."""
    fenced = "```json\n{}\n```"
    stripped = _strip_code_fences(fenced)
    assert stripped == "{}"
|
||||
|
||||
|
||||
def test_parse_suggestions_clean():
    """Both suggested topics are present in the parsed result."""
    suggestions = _parse_suggestions('{"suggested_topics": ["Human Resources", "Onboarding"]}')
    for expected in ("Human Resources", "Onboarding"):
        assert expected in suggestions
|
||||
|
||||
|
||||
def test_parse_suggestions_with_fence():
    """Suggestions inside a bare code fence are still parsed."""
    fenced = "```\n{\"suggested_topics\": [\"Finance\"]}\n```"
    suggestions = _parse_suggestions(fenced)
    assert suggestions == ["Finance"]
|
||||
|
||||
|
||||
def test_parse_suggestions_malformed():
    """A reply without JSON yields an empty suggestion list."""
    suggestions = _parse_suggestions("No suggestions available.")
    assert suggestions == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_classifier_with_mock_provider(isolated_data_dir):
    """Classifier orchestration end to end, with the AI provider mocked out."""
    from unittest.mock import AsyncMock, patch
    from ai.base import ClassificationResult
    import services.storage as st

    # Seed one document and two known topics
    doc_id = "test-doc-1"
    document = {
        "id": doc_id,
        "original_name": "test.txt",
        "filename": "test-doc-1.txt",
        "mime_type": "text/plain",
        "size_bytes": 50,
        "extracted_text": "Invoice for services rendered in March 2026.",
        "topics": [],
        "created_at": "2026-01-01T00:00:00Z",
        "classified_at": None,
    }
    st.save_metadata(document)
    for known in ("Finance", "Legal"):
        st.create_topic(known)

    canned = ClassificationResult(
        topics=["Finance"],
        suggested_new_topics=["Invoices"],
        reasoning="Document is about financial invoicing.",
    )

    with patch("services.classifier.get_provider") as provider_factory:
        fake_provider = AsyncMock()
        fake_provider.classify = AsyncMock(return_value=canned)
        provider_factory.return_value = fake_provider

        from services.classifier import classify_document
        assigned = await classify_document(doc_id)

    assert "Finance" in assigned
    assert "Invoices" in assigned

    # The suggested topic must have been added to the registry
    registry_names = [t["name"] for t in st.load_topics()]
    assert "Invoices" in registry_names

    # The stored document must carry the assigned topic
    stored = st.get_metadata(doc_id)
    assert "Finance" in stored["topics"]
|
||||
@@ -0,0 +1,107 @@
|
||||
def test_upload_txt_no_classify(client, sample_txt):
    """Uploading a text file without auto-classify stores its extracted text and no topics."""
    with open(sample_txt, "rb") as f:
        resp = client.post(
            "/api/documents/upload",
            files={"file": ("sample.txt", f, "text/plain")},
            data={"auto_classify": "false"},
        )
    assert resp.status_code == 200
    data = resp.json()
    assert data["original_name"] == "sample.txt"
    assert "extracted_text" in data
    # The old check was `"invoices" in text or len(text) > 0`, which reduces to
    # a non-empty check; assert the fixture's content was actually extracted.
    assert "invoices" in data["extracted_text"].lower()
    assert data["topics"] == []
    assert "id" in data
|
||||
|
||||
|
||||
def test_upload_pdf_no_classify(client, sample_pdf):
    """Uploading a PDF without auto-classify reports its MIME type and some extracted text."""
    with open(sample_pdf, "rb") as handle:
        response = client.post(
            "/api/documents/upload",
            data={"auto_classify": "false"},
            files={"file": ("sample.pdf", handle, "application/pdf")},
        )
    assert response.status_code == 200
    body = response.json()
    assert body["mime_type"] == "application/pdf"
    assert len(body["extracted_text"]) > 0
|
||||
|
||||
|
||||
def test_list_documents(client, sample_txt):
    """After one upload the listing reports exactly one document."""
    with open(sample_txt, "rb") as handle:
        client.post(
            "/api/documents/upload",
            data={"auto_classify": "false"},
            files={"file": ("a.txt", handle, "text/plain")},
        )
    listing = client.get("/api/documents")
    assert listing.status_code == 200
    body = listing.json()
    assert body["total"] == 1
    assert len(body["items"]) == 1
|
||||
|
||||
|
||||
def test_list_documents_filter_by_topic(client, sample_txt):
    """The ``topic`` query parameter only returns documents tagged with that topic."""
    with open(sample_txt, "rb") as handle:
        uploaded = client.post(
            "/api/documents/upload",
            data={"auto_classify": "false"},
            files={"file": ("a.txt", handle, "text/plain")},
        ).json()

    import services.storage as st
    st.update_document_topics(uploaded["id"], ["finance"])

    tagged = client.get("/api/documents?topic=finance")
    assert tagged.json()["total"] == 1

    untagged = client.get("/api/documents?topic=legal")
    assert untagged.json()["total"] == 0
|
||||
|
||||
|
||||
def test_get_document(client, sample_txt):
    """An uploaded document can be fetched back by its id."""
    with open(sample_txt, "rb") as handle:
        uploaded = client.post(
            "/api/documents/upload",
            data={"auto_classify": "false"},
            files={"file": ("a.txt", handle, "text/plain")},
        ).json()

    fetched = client.get(f"/api/documents/{uploaded['id']}")
    assert fetched.status_code == 200
    assert fetched.json()["id"] == uploaded["id"]
|
||||
|
||||
|
||||
def test_get_document_not_found(client):
    """Fetching an unknown document id returns 404."""
    response = client.get("/api/documents/nonexistent")
    assert response.status_code == 404
|
||||
|
||||
|
||||
def test_delete_document(client, sample_txt):
    """Deleting a document succeeds and a later fetch returns 404."""
    with open(sample_txt, "rb") as handle:
        uploaded = client.post(
            "/api/documents/upload",
            data={"auto_classify": "false"},
            files={"file": ("a.txt", handle, "text/plain")},
        ).json()
    doc_url = f"/api/documents/{uploaded['id']}"

    deleted = client.delete(doc_url)
    assert deleted.status_code == 200
    assert deleted.json()["success"] is True

    refetched = client.get(doc_url)
    assert refetched.status_code == 404
|
||||
|
||||
|
||||
def test_delete_document_not_found(client):
    """Deleting an unknown document id returns 404."""
    response = client.delete("/api/documents/nonexistent")
    assert response.status_code == 404
|
||||
|
||||
|
||||
def test_upload_empty_file(client):
    """Uploading a zero-byte file is rejected with 400."""
    empty_upload = {"file": ("empty.txt", b"", "text/plain")}
    response = client.post(
        "/api/documents/upload",
        files=empty_upload,
        data={"auto_classify": "false"},
    )
    assert response.status_code == 400
|
||||
@@ -0,0 +1,52 @@
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
from services.extractor import extract_text
|
||||
|
||||
|
||||
def test_extract_txt(tmp_path):
    """A UTF-8 text file is extracted as-is."""
    text_file = tmp_path / "test.txt"
    text_file.write_text("Hello world this is a test document.", encoding="utf-8")
    extracted = extract_text(str(text_file), "text/plain")
    assert "Hello world" in extracted
|
||||
|
||||
|
||||
def test_extract_pdf(tmp_path):
    """Text inserted into a generated PDF is found in the extraction."""
    import fitz

    pdf_path = tmp_path / "test.pdf"
    pdf = fitz.open()
    pdf.new_page().insert_text((50, 50), "PDF content about legal contracts.")
    pdf.save(str(pdf_path))
    pdf.close()

    extracted = extract_text(str(pdf_path), "application/pdf")
    assert "PDF content" in extracted
|
||||
|
||||
|
||||
def test_extract_docx(tmp_path):
    """A paragraph written to a generated DOCX is found in the extraction."""
    from docx import Document

    docx_path = tmp_path / "test.docx"
    word_doc = Document()
    word_doc.add_paragraph("DOCX paragraph about financial reports.")
    word_doc.save(str(docx_path))

    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    extracted = extract_text(str(docx_path), docx_mime)
    assert "DOCX paragraph" in extracted
|
||||
|
||||
|
||||
def test_extract_unknown_falls_back_to_text(tmp_path):
    """A file with an unrecognised type (CSV) is read as plain text."""
    csv_file = tmp_path / "test.csv"
    csv_file.write_text("col1,col2\nval1,val2", encoding="utf-8")
    extracted = extract_text(str(csv_file), "text/csv")
    assert "col1" in extracted
|
||||
|
||||
|
||||
def test_extract_truncation(tmp_path):
    """A 60k-character file is cut down and carries the truncation marker."""
    big_file = tmp_path / "big.txt"
    big_file.write_text("A" * 60_000, encoding="utf-8")
    extracted = extract_text(str(big_file), "text/plain")
    assert "truncated" in extracted
    assert len(extracted) <= 50_100  # 50k + truncation marker
|
||||
@@ -0,0 +1,4 @@
|
||||
def test_health(client):
    """The health endpoint answers 200 with a fixed OK payload."""
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "ok"}
|
||||
@@ -0,0 +1,46 @@
|
||||
"""
|
||||
Integration test against a live LM Studio instance.
|
||||
Skipped automatically if LM Studio is not reachable.
|
||||
"""
|
||||
import pytest
|
||||
import httpx
|
||||
|
||||
|
||||
def lmstudio_available() -> bool:
    """Return True if the LM Studio models endpoint answers 200 within 3 seconds.

    Any exception while probing is treated as "not available".
    """
    try:
        probe = httpx.get("http://host.docker.internal:1234/v1/models", timeout=3)
    except Exception:
        return False
    return probe.status_code == 200
|
||||
|
||||
|
||||
@pytest.mark.skipif(not lmstudio_available(), reason="LM Studio not reachable at host.docker.internal:1234")
@pytest.mark.asyncio
async def test_lmstudio_health_check():
    """The provider's health check passes against a live LM Studio."""
    from ai.lmstudio_provider import LMStudioProvider

    live_provider = LMStudioProvider(
        base_url="http://host.docker.internal:1234",
        model="gemma-4-e4b-it",
    )
    healthy = await live_provider.health_check()
    assert healthy, "LM Studio health check failed"
|
||||
|
||||
|
||||
@pytest.mark.skipif(not lmstudio_available(), reason="LM Studio not reachable at host.docker.internal:1234")
@pytest.mark.asyncio
async def test_lmstudio_classify():
    """Classification against a live LM Studio returns list-typed results."""
    from ai.lmstudio_provider import LMStudioProvider
    from config import DEFAULT_SYSTEM_PROMPT

    live_provider = LMStudioProvider(
        base_url="http://host.docker.internal:1234",
        model="gemma-4-e4b-it",
    )
    outcome = await live_provider.classify(
        document_text="This document is an invoice for software development services.",
        existing_topics=["Finance", "Legal", "HR"],
        system_prompt=DEFAULT_SYSTEM_PROMPT,
    )
    # Only the result types are checked; the model's actual choices are not pinned
    assert isinstance(outcome.topics, list)
    assert isinstance(outcome.suggested_new_topics, list)
|
||||
@@ -0,0 +1,60 @@
|
||||
def test_get_settings_defaults(client):
    """Default settings expose the expected keys with lmstudio as the active provider."""
    response = client.get("/api/settings")
    assert response.status_code == 200
    body = response.json()
    assert body["active_provider"] == "lmstudio"
    for required in ("system_prompt", "providers"):
        assert required in body
    # API keys should be masked or empty
    for provider_name in ("anthropic", "openai"):
        api_key = body["providers"][provider_name].get("api_key", "")
        assert not ("****" in api_key and len(api_key) > 8)  # masked or empty
|
||||
|
||||
|
||||
def test_patch_system_prompt(client):
    """A patched system prompt is returned by a later GET."""
    custom_prompt = "Custom system prompt for testing."
    patched = client.patch("/api/settings", json={"system_prompt": custom_prompt})
    assert patched.status_code == 200

    reread = client.get("/api/settings")
    assert reread.json()["system_prompt"] == custom_prompt
|
||||
|
||||
|
||||
def test_patch_active_provider(client):
    """Switching the active provider to ollama is accepted and echoed back."""
    response = client.patch("/api/settings", json={"active_provider": "ollama"})
    assert response.status_code == 200
    assert response.json()["active_provider"] == "ollama"
|
||||
|
||||
|
||||
def test_patch_invalid_provider(client):
    """An unknown provider name is rejected with 400."""
    response = client.patch("/api/settings", json={"active_provider": "unknown"})
    assert response.status_code == 400
|
||||
|
||||
|
||||
def test_patch_provider_config(client):
    """Patching a provider's config updates its model in the response."""
    ollama_config = {"model": "mistral", "base_url": "http://host.docker.internal:11434"}
    response = client.patch("/api/settings", json={"providers": {"ollama": ollama_config}})
    assert response.status_code == 200
    assert response.json()["providers"]["ollama"]["model"] == "mistral"
|
||||
|
||||
|
||||
def test_masked_api_key_not_overwritten(client):
    """Re-submitting a masked key must leave the real stored key untouched."""
    def patch_anthropic_key(value):
        client.patch("/api/settings", json={"providers": {"anthropic": {"api_key": value}}})

    # Store a real key, then send a masked one as the frontend would on re-submit
    patch_anthropic_key("sk-ant-realkey")
    patch_anthropic_key("****key")

    # The key on disk should still be the real one
    import services.storage as st
    stored = st.load_settings()
    assert stored["providers"]["anthropic"]["api_key"] == "sk-ant-realkey"
|
||||
|
||||
|
||||
def test_get_default_prompt(client):
    """The default-prompt endpoint returns a non-empty system prompt."""
    response = client.get("/api/settings/default-prompt")
    assert response.status_code == 200
    body = response.json()
    assert "system_prompt" in body
    assert len(body["system_prompt"]) > 0
|
||||
@@ -0,0 +1,72 @@
|
||||
def test_list_topics_empty(client):
    """A fresh data directory lists no topics."""
    response = client.get("/api/topics")
    assert response.status_code == 200
    assert response.json()["topics"] == []
|
||||
|
||||
|
||||
def test_create_topic(client):
    """Creating a topic echoes its name and colour and assigns an id."""
    new_topic = {"name": "Finance", "description": "Financial docs", "color": "#ff0000"}
    response = client.post("/api/topics", json=new_topic)
    assert response.status_code == 200
    body = response.json()
    assert body["name"] == "Finance"
    assert body["color"] == "#ff0000"
    assert "id" in body
|
||||
|
||||
|
||||
def test_create_topic_deduplication(client):
    """Creating the same name in a different case does not add a second topic."""
    client.post("/api/topics", json={"name": "Finance"})
    duplicate = client.post("/api/topics", json={"name": "finance"})  # case-insensitive
    assert duplicate.status_code == 200

    listed = client.get("/api/topics").json()["topics"]
    assert len(listed) == 1
|
||||
|
||||
|
||||
def test_update_topic(client):
    """Renaming an existing topic returns the new name."""
    created = client.post("/api/topics", json={"name": "Old Name"}).json()
    renamed = client.patch(f"/api/topics/{created['id']}", json={"name": "New Name"})
    assert renamed.status_code == 200
    assert renamed.json()["name"] == "New Name"
|
||||
|
||||
|
||||
def test_update_topic_not_found(client):
    """Patching an unknown topic id returns 404."""
    response = client.patch("/api/topics/nonexistent", json={"name": "X"})
    assert response.status_code == 404
|
||||
|
||||
|
||||
def test_delete_topic(client):
    """A deleted topic no longer appears in the topic listing."""
    created = client.post("/api/topics", json={"name": "ToDelete"}).json()
    deleted = client.delete(f"/api/topics/{created['id']}")
    assert deleted.status_code == 200
    assert deleted.json()["success"] is True

    remaining_names = [t["name"] for t in client.get("/api/topics").json()["topics"]]
    assert "ToDelete" not in remaining_names
|
||||
|
||||
|
||||
def test_delete_topic_cascades_to_documents(client, sample_txt):
    """Deleting a topic also removes it from documents that were tagged with it."""
    # Register the topic
    legal = client.post("/api/topics", json={"name": "Legal"}).json()

    # Upload without auto-classify so the topics are set manually below
    with open(sample_txt, "rb") as handle:
        uploaded = client.post(
            "/api/documents/upload",
            data={"auto_classify": "false"},
            files={"file": ("sample.txt", handle, "text/plain")},
        ).json()

    # Tag the document directly through the storage layer
    import services.storage as st
    st.update_document_topics(uploaded["id"], ["Legal"])

    # Remove the topic through the API
    client.delete(f"/api/topics/{legal['id']}")

    # The document must no longer carry it
    refreshed = client.get(f"/api/documents/{uploaded['id']}").json()
    assert "Legal" not in refreshed["topics"]
|
||||
|
||||
|
||||
def test_delete_topic_not_found(client):
    """Deleting an unknown topic id returns 404."""
    response = client.delete("/api/topics/nonexistent")
    assert response.status_code == 404
|
||||
Reference in New Issue
Block a user