Add PDF document service with AI extraction and per-app settings
- New `features/doc-service` FastAPI microservice: PDF upload, async text extraction (pdfplumber), AI classification via Anthropic/Ollama/LM Studio, per-user categories, file download - Alembic migration isolated with `alembic_version_doc_service` table - Main backend: httpx proxy routers for /api/documents/* and /api/documents/categories/*, admin settings API at /api/settings/* - Runtime config in /config/doc_service_config.json (shared Docker volume); api_key masking on reads; atomic write with os.replace() - Frontend: DocumentsPage, DocumentAdminSettingsPage, updated AppsPage launcher hub, simplified Nav (removed Settings link), new routes - docker-compose: doc-service service, doc_data + app_config volumes, removed internal:true from backend-net for outbound AI API calls - Fix pre-commit hook: probe Docker socket path so git subprocess picks up Docker Desktop on macOS - Fix security_check.py: use sys.executable for bandit so venv python is used instead of system python Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
from app.services.ai.base import AIProvider
|
||||
|
||||
|
||||
def get_provider(ai_config: dict) -> AIProvider:
    """Build the AIProvider selected by the 'provider' key of *ai_config*.

    ``ai_config`` is the 'ai' section of doc_service_config.json, loaded fresh
    per processing job. When no provider is named, ``"anthropic"`` is used.
    The settings handed to the provider are the sub-dict stored under the
    provider's own name, or ``{}`` when that sub-dict is absent.

    Raises:
        ValueError: If the configured provider name is not recognised.
    """
    selected = ai_config.get("provider", "anthropic")
    backend_settings = ai_config.get(selected, {})

    # Provider modules are imported lazily, inside the branch that needs them.
    if selected == "anthropic":
        from app.services.ai.anthropic_provider import AnthropicProvider

        return AnthropicProvider(backend_settings)

    if selected in ("ollama", "lmstudio"):
        # Both backends share the OpenAI-compatible client implementation.
        from app.services.ai.openai_compat import OpenAICompatProvider

        return OpenAICompatProvider(backend_settings)

    raise ValueError(f"Unknown AI provider: {selected!r}")
|
||||
|
||||
|
||||
# Public names of this module: the provider base class (imported above) and the factory.
__all__ = ["AIProvider", "get_provider"]
|
||||
@@ -0,0 +1,31 @@
|
||||
import json
|
||||
|
||||
from anthropic import AsyncAnthropic
|
||||
|
||||
from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
|
||||
|
||||
|
||||
class AnthropicProvider(AIProvider):
    """AIProvider implementation that classifies documents through the Anthropic Messages API."""

    def __init__(self, config: dict) -> None:
        """Create the async client from the provider settings.

        ``config["api_key"]`` is required (a missing key raises KeyError);
        ``config["model"]`` is optional and falls back to a built-in default.
        """
        api_key = config["api_key"]
        self._client = AsyncAnthropic(api_key=api_key)
        self._model = config.get("model", "claude-haiku-4-5-20251001")

    async def classify_document(self, text: str) -> dict:
        """Send the document text to the model and return the parsed JSON reply.

        Only the first 100,000 characters of *text* are included in the prompt.
        The reply is read from the first content block of the response.
        """
        user_message = {
            "role": "user",
            "content": USER_PROMPT_TEMPLATE.format(text=text[:100_000]),
        }
        reply = await self._client.messages.create(
            model=self._model,
            max_tokens=2048,
            system=SYSTEM_PROMPT,
            messages=[user_message],
        )
        first_block = reply.content[0]
        return _parse_json(first_block.text.strip())
|
||||
|
||||
|
||||
def _parse_json(raw: str) -> dict:
|
||||
# Strip accidental markdown fences despite explicit instruction not to include them
|
||||
if raw.startswith("```"):
|
||||
raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
|
||||
return json.loads(raw)
|
||||
@@ -0,0 +1,31 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
# System message for classification requests: instructs the model to answer
# with a bare JSON object only.
SYSTEM_PROMPT = (
    "You are a financial document analysis assistant. "
    "Given the text extracted from a PDF document, return ONLY a JSON object "
    "with no markdown, no code fences, and no explanation."
)

# User message template listing the keys the model must return. ``{text}`` is
# the only placeholder and is filled with str.format(), so any literal brace
# added to this template must be doubled.
USER_PROMPT_TEMPLATE = """Analyze the following document text and return a JSON object with exactly these keys:
document_type (one of: invoice, bill, receipt, order, expense, revenue, unknown),
total_amount (string or null),
currency (string or null),
vendor_name (string or null),
customer_name (string or null),
billing_address (string or null),
customer_address (string or null),
invoice_number (string or null),
invoice_date (string or null),
due_date (string or null),
tags (array of strings),
line_items (array of objects, each with keys: description, amount).

Document text:
{text}"""
|
||||
|
||||
|
||||
class AIProvider(ABC):
    """Abstract interface for a backend that classifies document text."""

    @abstractmethod
    async def classify_document(self, text: str) -> dict:
        """Return structured extraction dict from document text.

        Subclasses must override this coroutine; the base implementation
        does nothing and returns None.
        """
|
||||
@@ -0,0 +1,36 @@
|
||||
"""
|
||||
OpenAI-compatible provider for Ollama and LM Studio.
|
||||
Both expose an OpenAI-compatible /v1/chat/completions endpoint.
|
||||
"""
|
||||
import json
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
|
||||
|
||||
|
||||
class OpenAICompatProvider(AIProvider):
    """AIProvider implementation for servers exposing an OpenAI-compatible chat-completions API."""

    def __init__(self, config: dict) -> None:
        """Create the async client from the provider settings.

        Args:
            config: Provider settings. ``base_url`` and ``model`` are required
                (a missing key raises KeyError). ``api_key`` is optional.
        """
        self._client = AsyncOpenAI(
            base_url=config["base_url"],
            # Fall back to the placeholder when the key is missing OR empty:
            # an empty key would be sent as an empty bearer token, which the
            # HTTP layer can reject as an illegal header value.
            api_key=config.get("api_key") or "not-required",
        )
        self._model = config["model"]

    async def classify_document(self, text: str) -> dict:
        """Send the document text to the model and return the parsed JSON reply.

        Only the first 100,000 characters of *text* are included in the prompt.

        Raises:
            json.JSONDecodeError: If the reply is empty or not valid JSON.
        """
        response = await self._client.chat.completions.create(
            model=self._model,
            temperature=0,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PROMPT_TEMPLATE.format(text=text[:100_000])},
            ],
        )
        # The SDK types message.content as optional; treat a missing reply like
        # an empty one so the failure is a JSON decode error, not an AttributeError.
        content = response.choices[0].message.content
        raw = (content or "").strip()
        return _parse_json(raw)
|
||||
|
||||
|
||||
def _parse_json(raw: str) -> dict:
|
||||
if raw.startswith("```"):
|
||||
raw = raw.split("\n", 1)[1].rsplit("```", 1)[0]
|
||||
return json.loads(raw)
|
||||
@@ -0,0 +1,44 @@
|
||||
"""
|
||||
Reads doc_service_config.json from the shared config volume.
|
||||
Caches the result for 30 seconds to avoid hitting the filesystem on every request.
|
||||
Uses asyncio.to_thread so the synchronous file read doesn't block the event loop.
|
||||
"""
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
# Configuration returned when the config file does not exist. The "ai" section
# names the active provider and holds one settings dict per supported provider.
_DEFAULT_CONFIG: dict = {
    "ai": {
        "provider": "anthropic",
        "anthropic": {"api_key": "", "model": "claude-haiku-4-5-20251001"},
        "ollama": {"base_url": "http://localhost:11434/v1", "model": "llama3.2", "api_key": "ollama"},
        "lmstudio": {"base_url": "http://localhost:1234/v1", "model": "local-model", "api_key": ""},
    },
    # 20 MiB; presumably the upload size limit — enforced outside this module.
    "documents": {"max_pdf_bytes": 20 * 1024 * 1024},
}

# Most recently loaded config, or None before the first load.
_cache: dict | None = None
# time.monotonic() timestamp associated with the cached value.
_cache_at: float = 0.0
# Seconds a cached config is served before the file is read again.
_CACHE_TTL = 30.0
|
||||
|
||||
|
||||
def _read_config_sync() -> dict:
    """Read the doc-service configuration from ``settings.CONFIG_PATH``.

    This is a blocking call.

    Returns:
        The parsed JSON document, or an independent deep copy of
        ``_DEFAULT_CONFIG`` when the config file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    import copy  # local import keeps this fix self-contained

    config_path = Path(settings.CONFIG_PATH)
    try:
        # Open directly instead of exists()+open(): the file can disappear
        # between the two calls.
        with open(config_path, encoding="utf-8") as fh:
            return json.load(fh)
    except (FileNotFoundError, NotADirectoryError):
        # Deep copy so callers that mutate the nested provider dicts cannot
        # corrupt the module-level defaults (a shallow copy shares them).
        return copy.deepcopy(_DEFAULT_CONFIG)
|
||||
|
||||
|
||||
async def load_doc_config() -> dict:
    """Return the doc-service config, re-reading the file once the cache is stale.

    A cached value younger than ``_CACHE_TTL`` seconds is returned as-is (the
    same dict object). Otherwise the file is read in a worker thread via
    ``asyncio.to_thread`` and the result replaces the cache. The cache
    timestamp is the monotonic time taken before the read started.
    """
    global _cache, _cache_at

    started = time.monotonic()
    needs_reload = _cache is None or (started - _cache_at) >= _CACHE_TTL
    if not needs_reload:
        return _cache

    # The file read is synchronous, so keep it off the event loop.
    fresh = await asyncio.to_thread(_read_config_sync)
    _cache, _cache_at = fresh, started
    return fresh
|
||||
@@ -0,0 +1,27 @@
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
|
||||
import aiofiles
|
||||
|
||||
from app.core.config import settings
|
||||
|
||||
|
||||
def get_upload_path(user_id: str, doc_id: str) -> Path:
    """Return ``<settings.DATA_DIR>/<user_id>/<doc_id>.pdf``.

    The per-user directory (and any missing parents) is created as a side
    effect; the PDF file itself is not created.
    """
    owner_dir = Path(settings.DATA_DIR).joinpath(user_id)
    owner_dir.mkdir(parents=True, exist_ok=True)
    return owner_dir.joinpath(f"{doc_id}.pdf")
|
||||
|
||||
|
||||
async def save_upload(file_data: bytes, user_id: str, doc_id: str) -> Path:
    """Write *file_data* to the upload path for this user/document and return that path."""
    target = get_upload_path(user_id, doc_id)
    async with aiofiles.open(target, mode="wb") as out:
        await out.write(file_data)
    return target
|
||||
|
||||
|
||||
def delete_file(file_path: str) -> None:
    """Remove *file_path* on a best-effort basis.

    A path that does not exist is ignored. Any other OS-level failure is
    logged as a warning and swallowed: a failed deletion must not turn the
    calling request into a 500.

    Args:
        file_path: Path of the file to remove.
    """
    import logging  # local import keeps this fix self-contained

    try:
        Path(file_path).unlink(missing_ok=True)
    except OSError:
        # Record the failure instead of discarding it silently, but never raise.
        logging.getLogger(__name__).warning(
            "Could not delete file %s", file_path, exc_info=True
        )
|
||||
Reference in New Issue
Block a user