Add PDF document service with AI extraction and per-app settings

- New `features/doc-service` FastAPI microservice: PDF upload, async
  text extraction (pdfplumber), AI classification via Anthropic/Ollama/
  LM Studio, per-user categories, file download
- Alembic migration isolated with `alembic_version_doc_service` table
- Main backend: httpx proxy routers for /api/documents/* and
  /api/documents/categories/*, admin settings API at /api/settings/*
- Runtime config in /config/doc_service_config.json (shared Docker
  volume); api_key masking on reads; atomic write with os.replace()
- Frontend: DocumentsPage, DocumentAdminSettingsPage, updated AppsPage
  launcher hub, simplified Nav (removed Settings link), new routes
- docker-compose: doc-service service, doc_data + app_config volumes,
  removed internal:true from backend-net for outbound AI API calls
- Fix pre-commit hook: probe Docker socket path so git subprocess picks
  up Docker Desktop on macOS
- Fix security_check.py: use sys.executable for bandit so venv python
  is used instead of system python

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-04-14 05:28:11 +02:00
parent d423bea134
commit 0d34867a69
52 changed files with 2500 additions and 28 deletions
+117
View File
@@ -0,0 +1,117 @@
"""
Per-service runtime config helpers.
Config files live on the shared `app_config` Docker volume at /config/.
Each service has its own JSON file, e.g. /config/doc_service_config.json.
Atomic write pattern: write to .tmp in same dir, then os.replace() so
doc-service never reads a partial file.
"""
import json
import os
from pathlib import Path
from typing import Any
from pydantic import BaseModel
# Directory holding the per-service JSON config files. Overridable through the
# APP_CONFIG_DIR environment variable; defaults to /config.
_CONFIG_DIR = Path(os.environ.get("APP_CONFIG_DIR", "/config"))
# ── Config schemas ─────────────────────────────────────────────────────────────
class AnthropicConfig(BaseModel):
    """Settings for the Anthropic provider."""

    # API key; an empty string by default.
    api_key: str = ""
    # Model identifier used when the config does not name one.
    model: str = "claude-haiku-4-5-20251001"
class OllamaConfig(BaseModel):
    """Settings for the Ollama provider."""

    # NOTE(review): the default host "192.168.1.x" is a placeholder, not a
    # usable address; it has to be replaced with a real host before use.
    base_url: str = "http://192.168.1.x:11434/v1"
    model: str = "llama3.2"
    # NOTE(review): presumably a dummy value for a server that does not check
    # keys; confirm.
    api_key: str = "ollama"
class LMStudioConfig(BaseModel):
    """Settings for the LM Studio provider."""

    # NOTE(review): the default host "192.168.1.x" is a placeholder, not a
    # usable address; it has to be replaced with a real host before use.
    base_url: str = "http://192.168.1.x:1234/v1"
    model: str = "local-model"
    # API key; an empty string by default.
    api_key: str = ""
class AIConfig(BaseModel):
    """Active AI provider plus the settings of every known provider."""

    # Name of the active provider. Typed as a plain str, so the value is not
    # restricted to the provider fields declared below by this model.
    provider: str = "anthropic"
    anthropic: AnthropicConfig = AnthropicConfig()
    ollama: OllamaConfig = OllamaConfig()
    lmstudio: LMStudioConfig = LMStudioConfig()
class DocumentsConfig(BaseModel):
    """Document handling limits."""

    # PDF size limit in bytes; defaults to 20 MiB. Where the limit is enforced
    # is not visible in this module.
    max_pdf_bytes: int = 20 * 1024 * 1024
class DocServiceConfig(BaseModel):
    """Root schema of the doc_service config file: AI and document settings."""

    ai: AIConfig = AIConfig()
    documents: DocumentsConfig = DocumentsConfig()
# ── Masking ────────────────────────────────────────────────────────────────────
def _mask_key(key: str) -> str:
if not key or len(key) <= 8:
return "••••"
return key[:7] + "••••"
def _mask_config(data: dict) -> dict:
"""Return a copy of data with api_key fields masked."""
import copy
masked = copy.deepcopy(data)
ai = masked.get("ai", {})
for provider in ("anthropic", "ollama", "lmstudio"):
if provider in ai and "api_key" in ai[provider]:
ai[provider]["api_key"] = _mask_key(ai[provider]["api_key"])
return masked
# ── Load / Save ────────────────────────────────────────────────────────────────
def _config_path(service: str) -> Path:
    """Build the path of the JSON config file belonging to ``service``."""
    filename = f"{service}_config.json"
    return _CONFIG_DIR / filename
def load_service_config(service: str) -> dict:
    """Read the raw JSON config of ``service`` from disk.

    When the file does not exist yet, the defaults of ``DocServiceConfig`` are
    returned for "doc_service" and an empty dict for any other service.
    """
    path = _config_path(service)
    if path.exists():
        with path.open() as handle:
            return json.load(handle)
    # No file yet: fall back to defaults.
    return DocServiceConfig().model_dump() if service == "doc_service" else {}
def save_service_config(service: str, data: dict) -> None:
    """Atomically write ``data`` as the JSON config file of ``service``.

    The payload is written to a uniquely named temp file in the target
    directory and then moved over the real file with ``os.replace()``, so a
    reader never observes a partially written config.

    Raises:
        TypeError / ValueError: if ``data`` is not JSON-serialisable.
        OSError: if the directory or file cannot be written.
    """
    path = _config_path(service)
    path.parent.mkdir(parents=True, exist_ok=True)
    # Serialise first so a serialisation error leaves no temp file behind.
    payload = json.dumps(data, indent=2)
    # A unique name per call: with a shared ".tmp" name, two concurrent saves
    # would write into the same file and one could be renamed into place while
    # the other is still writing to it.
    tmp = path.with_name(f"{path.name}.{os.getpid()}.{os.urandom(4).hex()}.tmp")
    try:
        tmp.write_text(payload, encoding="utf-8")
        os.replace(tmp, path)
    except BaseException:
        # Best-effort cleanup; the original error is what the caller must see.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
def load_doc_service_config() -> DocServiceConfig:
    """Load the doc_service config and validate it into a ``DocServiceConfig``."""
    return DocServiceConfig.model_validate(load_service_config("doc_service"))
def save_doc_service_config(config: DocServiceConfig) -> None:
    """Dump ``config`` to a plain dict and persist it as the doc_service config."""
    payload = config.model_dump()
    save_service_config("doc_service", payload)
def load_doc_service_config_masked() -> dict:
    """Return the raw doc_service config with provider API keys masked."""
    return _mask_config(load_service_config("doc_service"))
def _merge_api_key(new_key: str, existing_key: str) -> str:
"""If new_key is empty or a masked value, keep the existing key."""
if not new_key or "••••" in new_key:
return existing_key
return new_key
+11 -1
View File
@@ -2,7 +2,8 @@ from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.core.config import settings
from app.routers import admin, auth, profile, users
from app.routers import admin, auth, categories_proxy, documents_proxy, profile, users
from app.routers import settings as settings_router
app = FastAPI(title=settings.PROJECT_NAME, version="0.1.0")
@@ -18,6 +19,15 @@ app.include_router(auth.router, prefix="/api/auth", tags=["auth"])
app.include_router(users.router, prefix="/api/users", tags=["users"])
app.include_router(profile.router, prefix="/api/profile", tags=["profile"])
app.include_router(admin.router, prefix="/api/admin", tags=["admin"])
app.include_router(settings_router.router, prefix="/api/settings", tags=["settings"])
# categories_proxy MUST be registered before documents_proxy —
# otherwise /api/documents/{path:path} swallows /api/documents/categories/*
app.include_router(
categories_proxy.router,
prefix="/api/documents/categories",
tags=["categories"],
)
app.include_router(documents_proxy.router, prefix="/api/documents", tags=["documents"])
@app.get("/api/health")
+80
View File
@@ -0,0 +1,80 @@
"""
Proxy /api/documents/categories/* → doc-service:8001/categories/*.
Must be registered BEFORE the documents catch-all proxy in main.py,
otherwise /api/documents/{path:path} swallows category requests.
"""
import os
import httpx
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import StreamingResponse
from app.deps import get_current_user
from app.models.user import User
# Base URL of the doc-service; overridable via the DOC_SERVICE_URL env var.
DOC_SERVICE_URL = os.environ.get("DOC_SERVICE_URL", "http://doc-service:8001")
# Module-level client shared by every request handled by this router.
_client = httpx.AsyncClient(base_url=DOC_SERVICE_URL, timeout=30.0)
router = APIRouter()
# Lowercase header names that are dropped when relaying requests and responses.
# "host" is listed as well, so the caller's Host value is not forwarded to
# doc-service.
_HOP_BY_HOP = frozenset(
    [
        "connection",
        "keep-alive",
        "proxy-authenticate",
        "proxy-authorization",
        "te",
        "trailers",
        "transfer-encoding",
        "upgrade",
        "host",
    ]
)
def _forward_headers(request: Request, user_id: str) -> dict:
    """Build the header dict sent upstream for ``request``.

    Headers listed in ``_HOP_BY_HOP`` are left out, and ``x-user-id`` is set
    to ``user_id``.

    NOTE(review): a caller-supplied x-user-id is only overwritten when the
    incoming header name is already lowercase; confirm the framework
    normalises header names.
    """
    forwarded = {}
    for name, value in request.headers.items():
        if name.lower() in _HOP_BY_HOP:
            continue
        forwarded[name] = value
    forwarded["x-user-id"] = user_id
    return forwarded
@router.api_route("", methods=["GET", "POST"])
@router.api_route("/{path:path}", methods=["GET", "POST", "PUT", "PATCH", "DELETE"])
async def proxy_categories(
    request: Request,
    current_user: User = Depends(get_current_user),
    path: str = "",
) -> StreamingResponse:
    """Relay a categories request to doc-service and return its response.

    The request body is read fully, sent to ``/categories[/<path>]`` on the
    doc-service with the caller's headers (minus ``_HOP_BY_HOP``) plus an
    ``x-user-id`` header, and the upstream status, headers and body are
    passed back to the caller.

    Raises:
        HTTPException: 502 when the doc-service cannot be reached.
    """
    url = f"/categories/{path}" if path else "/categories"
    headers = _forward_headers(request, str(current_user.id))
    body = await request.body()
    try:
        response = await _client.request(
            method=request.method,
            url=url,
            headers=headers,
            content=body,
            # multi_items() keeps repeated keys (?a=1&a=2); dict() would
            # silently keep only the last value of each key.
            params=request.query_params.multi_items(),
        )
    except httpx.RequestError as exc:
        raise HTTPException(
            status_code=502, detail=f"doc-service unreachable: {exc}"
        ) from exc

    dropped = set(_HOP_BY_HOP)
    if "content-encoding" in response.headers:
        # response.content has already been decoded by httpx, so the upstream
        # content-encoding and the (compressed) content-length no longer
        # describe the bytes being returned.
        # NOTE(review): assumes the encoding is one httpx decodes; confirm if
        # doc-service can emit other encodings.
        dropped |= {"content-encoding", "content-length"}
    resp_headers = {
        k: v for k, v in response.headers.items() if k.lower() not in dropped
    }
    return StreamingResponse(
        content=iter([response.content]),
        status_code=response.status_code,
        headers=resp_headers,
        media_type=response.headers.get("content-type"),
    )
+84
View File
@@ -0,0 +1,84 @@
"""
Proxy all /api/documents/* requests to doc-service:8001/documents/*.
Uses a module-level AsyncClient for connection pooling.
Strips hop-by-hop headers that must not be forwarded.
File downloads (/file endpoint) are relayed as well; note that request and response bodies are fully buffered in memory rather than streamed.
"""
import os
import httpx
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import StreamingResponse
from app.deps import get_current_user
from app.models.user import User
# Base URL of the doc-service; overridable via the DOC_SERVICE_URL env var.
DOC_SERVICE_URL = os.environ.get("DOC_SERVICE_URL", "http://doc-service:8001")
# Module-level client — reused across requests for connection pooling
_client = httpx.AsyncClient(base_url=DOC_SERVICE_URL, timeout=120.0)
router = APIRouter()
# Lowercase header names that are dropped when relaying requests and responses.
# "host" is listed as well, so the caller's Host value is not forwarded to
# doc-service.
_HOP_BY_HOP = frozenset(
    [
        "connection",
        "keep-alive",
        "proxy-authenticate",
        "proxy-authorization",
        "te",
        "trailers",
        "transfer-encoding",
        "upgrade",
        "host",
    ]
)
def _forward_headers(request: Request, user_id: str) -> dict:
    """Build the header dict sent upstream for ``request``.

    Headers listed in ``_HOP_BY_HOP`` are left out, and ``x-user-id`` is set
    to ``user_id``.

    NOTE(review): a caller-supplied x-user-id is only overwritten when the
    incoming header name is already lowercase; confirm the framework
    normalises header names.
    """
    forwarded = {}
    for name, value in request.headers.items():
        if name.lower() in _HOP_BY_HOP:
            continue
        forwarded[name] = value
    forwarded["x-user-id"] = user_id
    return forwarded
@router.api_route("/{path:path}", methods=["GET", "POST", "PUT", "PATCH", "DELETE"])
async def proxy_documents(
    path: str,
    request: Request,
    current_user: User = Depends(get_current_user),
) -> StreamingResponse:
    """Relay a documents request to doc-service and return its response.

    The request body is read fully, sent to ``/documents[/<path>]`` on the
    doc-service with the caller's headers (minus ``_HOP_BY_HOP``) plus an
    ``x-user-id`` header, and the upstream status, headers and body are
    passed back to the caller.

    Raises:
        HTTPException: 502 when the doc-service cannot be reached.
    """
    url = f"/documents/{path}" if path else "/documents"
    headers = _forward_headers(request, str(current_user.id))

    # The whole body (including multipart uploads) is buffered in memory
    # before it is forwarded; nothing is streamed here.
    body = await request.body()

    try:
        response = await _client.request(
            method=request.method,
            url=url,
            headers=headers,
            content=body,
            # multi_items() keeps repeated keys (?a=1&a=2); dict() would
            # silently keep only the last value of each key.
            params=request.query_params.multi_items(),
        )
    except httpx.RequestError as exc:
        raise HTTPException(
            status_code=502, detail=f"doc-service unreachable: {exc}"
        ) from exc

    # Strip hop-by-hop headers from the response.
    dropped = set(_HOP_BY_HOP)
    if "content-encoding" in response.headers:
        # response.content has already been decoded by httpx, so the upstream
        # content-encoding and the (compressed) content-length no longer
        # describe the bytes being returned.
        # NOTE(review): assumes the encoding is one httpx decodes; confirm if
        # doc-service can emit other encodings.
        dropped |= {"content-encoding", "content-length"}
    resp_headers = {
        k: v for k, v in response.headers.items() if k.lower() not in dropped
    }

    return StreamingResponse(
        content=iter([response.content]),
        status_code=response.status_code,
        headers=resp_headers,
        media_type=response.headers.get("content-type"),
    )
+155
View File
@@ -0,0 +1,155 @@
"""
Admin-only settings API for per-service runtime configuration.
All endpoints require the caller to be an admin (Depends(get_current_admin)).
Config files live on the shared app_config volume (/config/).
"""
import asyncio
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from app.core.app_config import (
DocServiceConfig,
_merge_api_key,
load_doc_service_config,
load_doc_service_config_masked,
save_doc_service_config,
)
from app.deps import get_current_admin
from app.models.user import User
router = APIRouter()
# ── Pydantic request bodies ────────────────────────────────────────────────────
class AIProviderUpdate(BaseModel):
    """Request body of PATCH /documents/ai.

    ``provider`` is required. Every other field defaults to an empty string,
    which the endpoint treats as "leave the stored value unchanged".
    """

    provider: str
    anthropic_api_key: str = ""
    anthropic_model: str = ""
    ollama_base_url: str = ""
    ollama_model: str = ""
    ollama_api_key: str = ""
    lmstudio_base_url: str = ""
    lmstudio_model: str = ""
    lmstudio_api_key: str = ""
class LimitsUpdate(BaseModel):
    """Request body of PATCH /documents/limits."""

    # PDF size limit in megabytes; the endpoint accepts values from 1 to 200.
    max_pdf_mb: int
# ── Documents settings ─────────────────────────────────────────────────────────
@router.get("/documents")
async def get_documents_settings(
    _: User = Depends(get_current_admin),
) -> dict:
    """Return the doc_service config with provider API keys masked."""
    # The config is read from disk with blocking I/O; run it in a worker
    # thread (as the save path already does) so the event loop is not stalled.
    return await asyncio.to_thread(load_doc_service_config_masked)
@router.patch("/documents/ai")
async def update_documents_ai(
    body: AIProviderUpdate,
    _: User = Depends(get_current_admin),
) -> dict:
    """Update the AI provider settings and return the masked config.

    ``body.provider`` always replaces the stored provider. Every other field
    is applied only when non-empty. API keys additionally go through
    ``_merge_api_key``, so a masked value sent back by the client keeps the
    stored key.

    Raises:
        HTTPException: 422 when ``body.provider`` is not a known provider.
    """
    valid_providers = ("anthropic", "ollama", "lmstudio")
    if body.provider not in valid_providers:
        raise HTTPException(status_code=422, detail=f"provider must be one of {valid_providers}")

    # Config reads and writes are blocking file I/O; keep all of them off the
    # event loop, not just the save.
    config = await asyncio.to_thread(load_doc_service_config)
    ai = config.ai
    ai.provider = body.provider

    # Anthropic
    if body.anthropic_api_key:
        ai.anthropic.api_key = _merge_api_key(
            body.anthropic_api_key, ai.anthropic.api_key
        )
    if body.anthropic_model:
        ai.anthropic.model = body.anthropic_model

    # Ollama
    if body.ollama_base_url:
        ai.ollama.base_url = body.ollama_base_url
    if body.ollama_model:
        ai.ollama.model = body.ollama_model
    if body.ollama_api_key:
        ai.ollama.api_key = _merge_api_key(body.ollama_api_key, ai.ollama.api_key)

    # LM Studio
    if body.lmstudio_base_url:
        ai.lmstudio.base_url = body.lmstudio_base_url
    if body.lmstudio_model:
        ai.lmstudio.model = body.lmstudio_model
    if body.lmstudio_api_key:
        ai.lmstudio.api_key = _merge_api_key(
            body.lmstudio_api_key, ai.lmstudio.api_key
        )

    await asyncio.to_thread(save_doc_service_config, config)
    return await asyncio.to_thread(load_doc_service_config_masked)
@router.post("/documents/ai/test")
async def test_documents_ai(
    _: User = Depends(get_current_admin),
) -> dict:
    """Test the configured AI connection with a minimal prompt.

    Returns ``{"ok": True, "provider": ..., "response": ...}`` on success and
    ``{"ok": False, "provider": ..., "error": ...}`` when the provider call
    fails for any reason.

    Raises:
        HTTPException: 422 when the stored provider name is not known.
    """
    from app.core.app_config import load_service_config

    raw = await asyncio.to_thread(load_service_config, "doc_service")
    ai_cfg = raw.get("ai", {})
    provider_name = ai_cfg.get("provider", "anthropic")

    # Reject unknown providers before entering the try block: raised inside
    # it, the HTTPException would be caught by the broad handler below and
    # turned into a 200 response.
    if provider_name not in ("anthropic", "ollama", "lmstudio"):
        raise HTTPException(status_code=422, detail=f"Unknown provider: {provider_name}")

    try:
        if provider_name == "anthropic":
            import anthropic

            client = anthropic.AsyncAnthropic(api_key=ai_cfg["anthropic"]["api_key"])
            msg = await client.messages.create(
                model=ai_cfg["anthropic"].get("model", "claude-haiku-4-5-20251001"),
                max_tokens=16,
                messages=[{"role": "user", "content": "Reply with: ok"}],
            )
            return {"ok": True, "provider": provider_name, "response": msg.content[0].text}

        # Remaining providers (ollama, lmstudio) go through the OpenAI client.
        import openai

        pcfg = ai_cfg[provider_name]
        client = openai.AsyncOpenAI(
            base_url=pcfg["base_url"],
            api_key=pcfg.get("api_key") or "none",
        )
        resp = await client.chat.completions.create(
            model=pcfg["model"],
            messages=[{"role": "user", "content": "Reply with: ok"}],
            max_tokens=16,
            temperature=0,
        )
        return {
            "ok": True,
            "provider": provider_name,
            "response": resp.choices[0].message.content,
        }
    except Exception as exc:
        # Deliberately broad: this endpoint reports any failure of the
        # connectivity test to the admin instead of erroring out.
        return {"ok": False, "provider": provider_name, "error": str(exc)}
@router.patch("/documents/limits")
async def update_documents_limits(
    body: LimitsUpdate,
    _: User = Depends(get_current_admin),
) -> dict:
    """Update the PDF size limit and return the masked config.

    ``body.max_pdf_mb`` is given in megabytes and stored as bytes.

    Raises:
        HTTPException: 422 when ``max_pdf_mb`` is outside 1..200.
    """
    if body.max_pdf_mb < 1 or body.max_pdf_mb > 200:
        raise HTTPException(status_code=422, detail="max_pdf_mb must be between 1 and 200")

    # Config reads and writes are blocking file I/O; keep all of them off the
    # event loop, not just the save.
    config = await asyncio.to_thread(load_doc_service_config)
    config.documents.max_pdf_bytes = body.max_pdf_mb * 1024 * 1024
    await asyncio.to_thread(save_doc_service_config, config)
    return await asyncio.to_thread(load_doc_service_config_masked)