Add PDF document service with AI extraction and per-app settings

- New `features/doc-service` FastAPI microservice: PDF upload, async text extraction (pdfplumber), AI classification via Anthropic/Ollama/LM Studio, per-user categories, file download
- Alembic migration isolated with `alembic_version_doc_service` table
- Main backend: httpx proxy routers for /api/documents/* and /api/documents/categories/*, admin settings API at /api/settings/*
- Runtime config in /config/doc_service_config.json (shared Docker volume); api_key masking on reads; atomic write with os.replace()
- Frontend: DocumentsPage, DocumentAdminSettingsPage, updated AppsPage launcher hub, simplified Nav (removed Settings link), new routes
- docker-compose: doc-service service, doc_data + app_config volumes, removed internal:true from backend-net for outbound AI API calls
- Fix pre-commit hook: probe Docker socket path so git subprocess picks up Docker Desktop on macOS
- Fix security_check.py: use sys.executable for bandit so venv python is used instead of system python

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-14 05:28:11 +02:00
parent d423bea134
commit 0d34867a69
52 changed files with 2500 additions and 28 deletions
@@ -0,0 +1,23 @@
+from app.services.ai.base import AIProvider
+
+
+def get_provider(ai_config: dict) -> AIProvider:
+    """
+    Factory: return an AIProvider instance based on the 'provider' key in the AI config section.
+    ai_config is the 'ai' section of doc_service_config.json, loaded fresh per processing job.
+    """
+    provider_name = ai_config.get("provider", "anthropic")
+    provider_cfg = ai_config.get(provider_name, {})
+
+    match provider_name:
+        case "anthropic":
+            from app.services.ai.anthropic_provider import AnthropicProvider
+            return AnthropicProvider(provider_cfg)
+        case "ollama" | "lmstudio":
+            from app.services.ai.openai_compat import OpenAICompatProvider
+            return OpenAICompatProvider(provider_cfg)
+        case _:
+            raise ValueError(f"Unknown AI provider: {provider_name!r}")
+
+
+# Names exported by a star-import of this module.
+__all__ = ["AIProvider", "get_provider"]
@@ -0,0 +1,31 @@
+import json
+
+from anthropic import AsyncAnthropic
+
+from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
+
+
+class AnthropicProvider(AIProvider):
+    def __init__(self, config: dict) -> None:
+        self._client = AsyncAnthropic(api_key=config["api_key"])
+        self._model = config.get("model", "claude-haiku-4-5-20251001")
+
+    async def classify_document(self, text: str) -> dict:
+        message = await self._client.messages.create(
+            model=self._model,
+            max_tokens=2048,
+            system=SYSTEM_PROMPT,
+            messages=[{
+                "role": "user",
+                "content": USER_PROMPT_TEMPLATE.format(text=text[:100_000]),
+            }],
+        )
+        raw = message.content[0].text.strip()
+        return _parse_json(raw)
+
+
+def _parse_json(raw: str) -> dict:
+    # Strip accidental markdown fences despite explicit instruction not to include them
+    if raw.startswith("```"):
+        raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
+    return json.loads(raw)
@@ -0,0 +1,31 @@
+from abc import ABC, abstractmethod
+
+# System message used by the providers' classify_document calls; it instructs
+# the model to answer with a bare JSON object.
+SYSTEM_PROMPT = (
+    "You are a financial document analysis assistant. "
+    "Given the text extracted from a PDF document, return ONLY a JSON object "
+    "with no markdown, no code fences, and no explanation."
+)
+
+# User message template. The providers fill it with str.format(text=...), so
+# "{text}" is the only placeholder; any literal brace added to this template
+# would have to be doubled to survive formatting.
+USER_PROMPT_TEMPLATE = """Analyze the following document text and return a JSON object with exactly these keys:
+document_type (one of: invoice, bill, receipt, order, expense, revenue, unknown),
+total_amount (string or null),
+currency (string or null),
+vendor_name (string or null),
+customer_name (string or null),
+billing_address (string or null),
+customer_address (string or null),
+invoice_number (string or null),
+invoice_date (string or null),
+due_date (string or null),
+tags (array of strings),
+line_items (array of objects, each with keys: description, amount).
+
+Document text:
+{text}"""
+
+
+class AIProvider(ABC):
+    """Abstract interface that every document-classification backend implements."""
+
+    @abstractmethod
+    async def classify_document(self, text: str) -> dict:
+        """
+        Return structured extraction dict from document text.
+
+        Implementations are coroutines and must be awaited. This interface does
+        not enforce the dict's keys; presumably they follow the keys requested
+        in USER_PROMPT_TEMPLATE -- confirm against the callers.
+        """
+        ...
@@ -0,0 +1,36 @@
+"""
+OpenAI-compatible provider for Ollama and LM Studio.
+Both expose an OpenAI-compatible /v1/chat/completions endpoint.
+"""
+import json
+
+from openai import AsyncOpenAI
+
+from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
+
+
+class OpenAICompatProvider(AIProvider):
+    def __init__(self, config: dict) -> None:
+        self._client = AsyncOpenAI(
+            base_url=config["base_url"],
+            api_key=config.get("api_key", "not-required"),
+        )
+        self._model = config["model"]
+
+    async def classify_document(self, text: str) -> dict:
+        response = await self._client.chat.completions.create(
+            model=self._model,
+            temperature=0,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": USER_PROMPT_TEMPLATE.format(text=text[:100_000])},
+            ],
+        )
+        raw = response.choices[0].message.content.strip()
+        return _parse_json(raw)
+
+
+def _parse_json(raw: str) -> dict:
+    if raw.startswith("```"):
+        raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
+    return json.loads(raw)