From 0d34867a69992c46a3d3d5292418a783566cb85f Mon Sep 17 00:00:00 2001 From: curo1305 Date: Tue, 14 Apr 2026 05:28:11 +0200 Subject: [PATCH] Add PDF document service with AI extraction and per-app settings - New `features/doc-service` FastAPI microservice: PDF upload, async text extraction (pdfplumber), AI classification via Anthropic/Ollama/ LM Studio, per-user categories, file download - Alembic migration isolated with `alembic_version_doc_service` table - Main backend: httpx proxy routers for /api/documents/* and /api/documents/categories/*, admin settings API at /api/settings/* - Runtime config in /config/doc_service_config.json (shared Docker volume); api_key masking on reads; atomic write with os.replace() - Frontend: DocumentsPage, DocumentAdminSettingsPage, updated AppsPage launcher hub, simplified Nav (removed Settings link), new routes - docker-compose: doc-service service, doc_data + app_config volumes, removed internal:true from backend-net for outbound AI API calls - Fix pre-commit hook: probe Docker socket path so git subprocess picks up Docker Desktop on macOS - Fix security_check.py: use sys.executable for bandit so venv python is used instead of system python Co-Authored-By: Claude Sonnet 4.6 --- .githooks/pre-commit | 8 + .gitignore | 1 + TODO.md | 12 + backend/app/core/app_config.py | 117 ++++++ backend/app/main.py | 12 +- backend/app/routers/categories_proxy.py | 80 ++++ backend/app/routers/documents_proxy.py | 84 ++++ backend/app/routers/settings.py | 155 ++++++++ backend/pyproject.toml | 4 +- changelog/2026-04-14_doc-service.md | 58 +++ docker-compose.dev.yml | 5 + docker-compose.yml | 30 +- features/doc-service/.env.example | 3 + features/doc-service/Dockerfile | 34 ++ features/doc-service/alembic.ini | 45 +++ features/doc-service/alembic/env.py | 55 +++ features/doc-service/alembic/script.py.mako | 25 ++ .../versions/0001_create_doc_tables.py | 79 ++++ features/doc-service/app/__init__.py | 0 features/doc-service/app/core/__init__.py | 0 
features/doc-service/app/core/config.py | 14 + features/doc-service/app/database.py | 16 + features/doc-service/app/deps.py | 12 + features/doc-service/app/main.py | 17 + features/doc-service/app/models/__init__.py | 5 + features/doc-service/app/models/category.py | 22 ++ .../app/models/category_assignment.py | 20 + features/doc-service/app/models/document.py | 31 ++ features/doc-service/app/routers/__init__.py | 0 .../doc-service/app/routers/categories.py | 80 ++++ features/doc-service/app/routers/documents.py | 304 ++++++++++++++ features/doc-service/app/schemas/__init__.py | 0 features/doc-service/app/schemas/category.py | 20 + features/doc-service/app/schemas/document.py | 39 ++ features/doc-service/app/services/__init__.py | 0 .../doc-service/app/services/ai/__init__.py | 23 ++ .../app/services/ai/anthropic_provider.py | 31 ++ features/doc-service/app/services/ai/base.py | 31 ++ .../app/services/ai/openai_compat.py | 36 ++ .../doc-service/app/services/config_reader.py | 44 +++ features/doc-service/app/services/storage.py | 27 ++ features/doc-service/pyproject.toml | 35 ++ features/doc-service/scripts/start.sh | 8 + features/doc-service/scripts/start_dev.sh | 8 + frontend/src/App.tsx | 9 +- frontend/src/api/client.ts | 110 ++++++ frontend/src/components/Nav.tsx | 11 +- frontend/src/pages/AppsPage.tsx | 86 +++- .../src/pages/DocumentAdminSettingsPage.tsx | 298 ++++++++++++++ frontend/src/pages/DocumentsPage.tsx | 370 ++++++++++++++++++ frontend/src/pages/SettingsPage.tsx | 12 - scripts/security_check.py | 2 +- 52 files changed, 2500 insertions(+), 28 deletions(-) create mode 100644 backend/app/core/app_config.py create mode 100644 backend/app/routers/categories_proxy.py create mode 100644 backend/app/routers/documents_proxy.py create mode 100644 backend/app/routers/settings.py create mode 100644 changelog/2026-04-14_doc-service.md create mode 100644 features/doc-service/.env.example create mode 100644 features/doc-service/Dockerfile create mode 100644 
features/doc-service/alembic.ini create mode 100644 features/doc-service/alembic/env.py create mode 100644 features/doc-service/alembic/script.py.mako create mode 100644 features/doc-service/alembic/versions/0001_create_doc_tables.py create mode 100644 features/doc-service/app/__init__.py create mode 100644 features/doc-service/app/core/__init__.py create mode 100644 features/doc-service/app/core/config.py create mode 100644 features/doc-service/app/database.py create mode 100644 features/doc-service/app/deps.py create mode 100644 features/doc-service/app/main.py create mode 100644 features/doc-service/app/models/__init__.py create mode 100644 features/doc-service/app/models/category.py create mode 100644 features/doc-service/app/models/category_assignment.py create mode 100644 features/doc-service/app/models/document.py create mode 100644 features/doc-service/app/routers/__init__.py create mode 100644 features/doc-service/app/routers/categories.py create mode 100644 features/doc-service/app/routers/documents.py create mode 100644 features/doc-service/app/schemas/__init__.py create mode 100644 features/doc-service/app/schemas/category.py create mode 100644 features/doc-service/app/schemas/document.py create mode 100644 features/doc-service/app/services/__init__.py create mode 100644 features/doc-service/app/services/ai/__init__.py create mode 100644 features/doc-service/app/services/ai/anthropic_provider.py create mode 100644 features/doc-service/app/services/ai/base.py create mode 100644 features/doc-service/app/services/ai/openai_compat.py create mode 100644 features/doc-service/app/services/config_reader.py create mode 100644 features/doc-service/app/services/storage.py create mode 100644 features/doc-service/pyproject.toml create mode 100644 features/doc-service/scripts/start.sh create mode 100644 features/doc-service/scripts/start_dev.sh create mode 100644 frontend/src/pages/DocumentAdminSettingsPage.tsx create mode 100644 frontend/src/pages/DocumentsPage.tsx 
delete mode 100644 frontend/src/pages/SettingsPage.tsx diff --git a/.githooks/pre-commit b/.githooks/pre-commit index 7f06559..df59018 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -4,6 +4,14 @@ REPO_ROOT="$(git rev-parse --show-toplevel)" +# Resolve Docker socket — the git hook environment may not inherit the active +# Docker context, so we probe common socket paths explicitly. +if [ -S "/Users/$USER/.docker/run/docker.sock" ]; then + export DOCKER_HOST="unix:///Users/$USER/.docker/run/docker.sock" +elif [ -S "/var/run/docker.sock" ]; then + export DOCKER_HOST="unix:///var/run/docker.sock" +fi + # Collect staged files on the host and pass them into the container as arguments STAGED=$(git diff --cached --name-only --diff-filter=ACM) diff --git a/.gitignore b/.gitignore index bf55016..a64342b 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ frontend/dist/ # OS .DS_Store +resume.txt diff --git a/TODO.md b/TODO.md index 6b4ebdc..e4f032a 100644 --- a/TODO.md +++ b/TODO.md @@ -19,6 +19,18 @@ - [ ] **Permissions registry** — admin-managed table that controls which apps each user can access. Schema: `user_app_permissions (user_id FK, app_key)`. Admin UI lets the admin grant/revoke per-app access per user. The Apps page only shows apps the current user has been granted access to. 
+## PDF Documents app (`features/doc-service`) + +- [x] **doc-service container** — FastAPI microservice on `backend-net`; never exposed to host or frontend directly +- [x] **PDF upload + async extraction** — background task with pdfplumber + pluggable AI (Anthropic / Ollama / LM Studio) +- [x] **Per-app settings page** — `/apps/documents/settings/admin`; AI provider config, max file size; admin only +- [x] **Per-user categories** — create/rename/delete categories; assign multiple categories per document +- [x] **Alembic isolation** — `alembic_version_doc_service` version table; no collision with main backend migrations +- [x] **Runtime config file** — `/config/doc_service_config.json` on shared Docker volume; editable from frontend; 30s TTL cache in doc-service +- [ ] **Re-process document** — UI button to re-trigger AI extraction on an existing document (after changing AI provider/model) +- [ ] **Bulk category operations** — assign/remove a category from multiple documents at once +- [ ] **Search / filter documents** — filter by status, document type, category, date range + ## Frontend features - [x] **Logout button** — visible when logged in, clears token and redirects to `/login` diff --git a/backend/app/core/app_config.py b/backend/app/core/app_config.py new file mode 100644 index 0000000..c97345c --- /dev/null +++ b/backend/app/core/app_config.py @@ -0,0 +1,117 @@ +""" +Per-service runtime config helpers. + +Config files live on the shared `app_config` Docker volume at /config/. +Each service has its own JSON file, e.g. /config/doc_service_config.json. + +Atomic write pattern: write to .tmp in same dir, then os.replace() so +doc-service never reads a partial file. 
+""" +import json +import os +from pathlib import Path +from typing import Any + +from pydantic import BaseModel + +_CONFIG_DIR = Path(os.environ.get("APP_CONFIG_DIR", "/config")) + +# ── Config schemas ───────────────────────────────────────────────────────────── + + +class AnthropicConfig(BaseModel): + api_key: str = "" + model: str = "claude-haiku-4-5-20251001" + + +class OllamaConfig(BaseModel): + base_url: str = "http://192.168.1.x:11434/v1" + model: str = "llama3.2" + api_key: str = "ollama" + + +class LMStudioConfig(BaseModel): + base_url: str = "http://192.168.1.x:1234/v1" + model: str = "local-model" + api_key: str = "" + + +class AIConfig(BaseModel): + provider: str = "anthropic" + anthropic: AnthropicConfig = AnthropicConfig() + ollama: OllamaConfig = OllamaConfig() + lmstudio: LMStudioConfig = LMStudioConfig() + + +class DocumentsConfig(BaseModel): + max_pdf_bytes: int = 20 * 1024 * 1024 + + +class DocServiceConfig(BaseModel): + ai: AIConfig = AIConfig() + documents: DocumentsConfig = DocumentsConfig() + + +# ── Masking ──────────────────────────────────────────────────────────────────── + +def _mask_key(key: str) -> str: + if not key or len(key) <= 8: + return "••••" + return key[:7] + "••••" + + +def _mask_config(data: dict) -> dict: + """Return a copy of data with api_key fields masked.""" + import copy + masked = copy.deepcopy(data) + ai = masked.get("ai", {}) + for provider in ("anthropic", "ollama", "lmstudio"): + if provider in ai and "api_key" in ai[provider]: + ai[provider]["api_key"] = _mask_key(ai[provider]["api_key"]) + return masked + + +# ── Load / Save ──────────────────────────────────────────────────────────────── + +def _config_path(service: str) -> Path: + return _CONFIG_DIR / f"{service}_config.json" + + +def load_service_config(service: str) -> dict: + path = _config_path(service) + if not path.exists(): + # Return default config if file doesn't exist yet + if service == "doc_service": + return DocServiceConfig().model_dump() + 
return {} + with path.open() as f: + return json.load(f) + + +def save_service_config(service: str, data: dict) -> None: + path = _config_path(service) + path.parent.mkdir(parents=True, exist_ok=True) + tmp = path.with_suffix(".tmp") + tmp.write_text(json.dumps(data, indent=2)) + os.replace(tmp, path) + + +def load_doc_service_config() -> DocServiceConfig: + raw = load_service_config("doc_service") + return DocServiceConfig.model_validate(raw) + + +def save_doc_service_config(config: DocServiceConfig) -> None: + save_service_config("doc_service", config.model_dump()) + + +def load_doc_service_config_masked() -> dict: + raw = load_service_config("doc_service") + return _mask_config(raw) + + +def _merge_api_key(new_key: str, existing_key: str) -> str: + """If new_key is empty or a masked value, keep the existing key.""" + if not new_key or "••••" in new_key: + return existing_key + return new_key diff --git a/backend/app/main.py b/backend/app/main.py index 0e34990..76f5849 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -2,7 +2,8 @@ from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from app.core.config import settings -from app.routers import admin, auth, profile, users +from app.routers import admin, auth, categories_proxy, documents_proxy, profile, users +from app.routers import settings as settings_router app = FastAPI(title=settings.PROJECT_NAME, version="0.1.0") @@ -18,6 +19,15 @@ app.include_router(auth.router, prefix="/api/auth", tags=["auth"]) app.include_router(users.router, prefix="/api/users", tags=["users"]) app.include_router(profile.router, prefix="/api/profile", tags=["profile"]) app.include_router(admin.router, prefix="/api/admin", tags=["admin"]) +app.include_router(settings_router.router, prefix="/api/settings", tags=["settings"]) +# categories_proxy MUST be registered before documents_proxy — +# otherwise /api/documents/{path:path} swallows /api/documents/categories/* +app.include_router( + 
categories_proxy.router, + prefix="/api/documents/categories", + tags=["categories"], +) +app.include_router(documents_proxy.router, prefix="/api/documents", tags=["documents"]) @app.get("/api/health") diff --git a/backend/app/routers/categories_proxy.py b/backend/app/routers/categories_proxy.py new file mode 100644 index 0000000..4534f5a --- /dev/null +++ b/backend/app/routers/categories_proxy.py @@ -0,0 +1,80 @@ +""" +Proxy /api/documents/categories/* → doc-service:8001/categories/*. + +Must be registered BEFORE the documents catch-all proxy in main.py, +otherwise /api/documents/{path:path} swallows category requests. +""" +import os + +import httpx +from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi.responses import StreamingResponse + +from app.deps import get_current_user +from app.models.user import User + +DOC_SERVICE_URL = os.environ.get("DOC_SERVICE_URL", "http://doc-service:8001") + +_client = httpx.AsyncClient(base_url=DOC_SERVICE_URL, timeout=30.0) + +router = APIRouter() + +_HOP_BY_HOP = frozenset( + [ + "connection", + "keep-alive", + "proxy-authenticate", + "proxy-authorization", + "te", + "trailers", + "transfer-encoding", + "upgrade", + "host", + ] +) + + +def _forward_headers(request: Request, user_id: str) -> dict: + headers = { + k: v + for k, v in request.headers.items() + if k.lower() not in _HOP_BY_HOP + } + headers["x-user-id"] = user_id + return headers + + +@router.api_route("", methods=["GET", "POST"]) +@router.api_route("/{path:path}", methods=["GET", "POST", "PUT", "PATCH", "DELETE"]) +async def proxy_categories( + request: Request, + current_user: User = Depends(get_current_user), + path: str = "", +) -> StreamingResponse: + url = f"/categories/{path}" if path else "/categories" + headers = _forward_headers(request, str(current_user.id)) + body = await request.body() + + try: + response = await _client.request( + method=request.method, + url=url, + headers=headers, + content=body, + 
params=dict(request.query_params), + ) + except httpx.RequestError as exc: + raise HTTPException(status_code=502, detail=f"doc-service unreachable: {exc}") + + resp_headers = { + k: v + for k, v in response.headers.items() + if k.lower() not in _HOP_BY_HOP + } + + return StreamingResponse( + content=iter([response.content]), + status_code=response.status_code, + headers=resp_headers, + media_type=response.headers.get("content-type"), + ) diff --git a/backend/app/routers/documents_proxy.py b/backend/app/routers/documents_proxy.py new file mode 100644 index 0000000..4d8f697 --- /dev/null +++ b/backend/app/routers/documents_proxy.py @@ -0,0 +1,84 @@ +""" +Proxy all /api/documents/* requests to doc-service:8001/documents/*. + +Uses a module-level AsyncClient for connection pooling. +Strips hop-by-hop headers that must not be forwarded. +Responses, including /file downloads, are fully buffered in memory (not streamed). +""" +import os + +import httpx +from fastapi import APIRouter, Depends, HTTPException, Request +from fastapi.responses import StreamingResponse + +from app.deps import get_current_user +from app.models.user import User + +DOC_SERVICE_URL = os.environ.get("DOC_SERVICE_URL", "http://doc-service:8001") + +# Module-level client — reused across requests for connection pooling +_client = httpx.AsyncClient(base_url=DOC_SERVICE_URL, timeout=120.0) + +router = APIRouter() + +_HOP_BY_HOP = frozenset( + [ + "connection", + "keep-alive", + "proxy-authenticate", + "proxy-authorization", + "te", + "trailers", + "transfer-encoding", + "upgrade", + "host", + ] +) + + +def _forward_headers(request: Request, user_id: str) -> dict: + headers = { + k: v + for k, v in request.headers.items() + if k.lower() not in _HOP_BY_HOP + } + headers["x-user-id"] = user_id + return headers + + +@router.api_route("/{path:path}", methods=["GET", "POST", "PUT", "PATCH", "DELETE"]) +async def proxy_documents( + path: str, + request: Request, + current_user: User = Depends(get_current_user), +) -> StreamingResponse: + url = 
f"/documents/{path}" if path else "/documents" + headers = _forward_headers(request, str(current_user.id)) + + # The whole request body (including multipart uploads) is read into memory and forwarded as-is + body = await request.body() + + try: + response = await _client.request( + method=request.method, + url=url, + headers=headers, + content=body, + params=dict(request.query_params), + ) + except httpx.RequestError as exc: + raise HTTPException(status_code=502, detail=f"doc-service unreachable: {exc}") + + # Strip hop-by-hop from response headers + resp_headers = { + k: v + for k, v in response.headers.items() + if k.lower() not in _HOP_BY_HOP + } + + return StreamingResponse( + content=iter([response.content]), + status_code=response.status_code, + headers=resp_headers, + media_type=response.headers.get("content-type"), + ) diff --git a/backend/app/routers/settings.py b/backend/app/routers/settings.py new file mode 100644 index 0000000..bbfdf91 --- /dev/null +++ b/backend/app/routers/settings.py @@ -0,0 +1,155 @@ +""" +Admin-only settings API for per-service runtime configuration. + +All endpoints require the caller to be an admin (Depends(get_current_admin)). +Config files live on the shared app_config volume (/config/). 
+""" +import asyncio + +from fastapi import APIRouter, Depends, HTTPException +from pydantic import BaseModel + +from app.core.app_config import ( + DocServiceConfig, + _merge_api_key, + load_doc_service_config, + load_doc_service_config_masked, + save_doc_service_config, +) +from app.deps import get_current_admin +from app.models.user import User + +router = APIRouter() + + +# ── Pydantic request bodies ──────────────────────────────────────────────────── + + +class AIProviderUpdate(BaseModel): + provider: str + anthropic_api_key: str = "" + anthropic_model: str = "" + ollama_base_url: str = "" + ollama_model: str = "" + ollama_api_key: str = "" + lmstudio_base_url: str = "" + lmstudio_model: str = "" + lmstudio_api_key: str = "" + + +class LimitsUpdate(BaseModel): + max_pdf_mb: int + + +# ── Documents settings ───────────────────────────────────────────────────────── + + +@router.get("/documents") +async def get_documents_settings( + _: User = Depends(get_current_admin), +) -> dict: + return load_doc_service_config_masked() + + +@router.patch("/documents/ai") +async def update_documents_ai( + body: AIProviderUpdate, + _: User = Depends(get_current_admin), +) -> dict: + valid_providers = ("anthropic", "ollama", "lmstudio") + if body.provider not in valid_providers: + raise HTTPException(status_code=422, detail=f"provider must be one of {valid_providers}") + + config = load_doc_service_config() + + config.ai.provider = body.provider + + # Anthropic + if body.anthropic_api_key: + config.ai.anthropic.api_key = _merge_api_key( + body.anthropic_api_key, config.ai.anthropic.api_key + ) + if body.anthropic_model: + config.ai.anthropic.model = body.anthropic_model + + # Ollama + if body.ollama_base_url: + config.ai.ollama.base_url = body.ollama_base_url + if body.ollama_model: + config.ai.ollama.model = body.ollama_model + if body.ollama_api_key: + config.ai.ollama.api_key = _merge_api_key(body.ollama_api_key, config.ai.ollama.api_key) + + # LM Studio + if 
body.lmstudio_base_url: + config.ai.lmstudio.base_url = body.lmstudio_base_url + if body.lmstudio_model: + config.ai.lmstudio.model = body.lmstudio_model + if body.lmstudio_api_key: + config.ai.lmstudio.api_key = _merge_api_key( + body.lmstudio_api_key, config.ai.lmstudio.api_key + ) + + await asyncio.to_thread(save_doc_service_config, config) + return load_doc_service_config_masked() + + +@router.post("/documents/ai/test") +async def test_documents_ai( + _: User = Depends(get_current_admin), +) -> dict: + """Test the configured AI connection with a minimal prompt.""" + from app.core.app_config import load_service_config + + raw = await asyncio.to_thread(load_service_config, "doc_service") + ai_cfg = raw.get("ai", {}) + provider_name = ai_cfg.get("provider", "anthropic") + + try: + if provider_name == "anthropic": + import anthropic + client = anthropic.AsyncAnthropic(api_key=ai_cfg["anthropic"]["api_key"]) + msg = await client.messages.create( + model=ai_cfg["anthropic"].get("model", "claude-haiku-4-5-20251001"), + max_tokens=16, + messages=[{"role": "user", "content": "Reply with: ok"}], + ) + return {"ok": True, "provider": provider_name, "response": msg.content[0].text} + + elif provider_name in ("ollama", "lmstudio"): + import openai + pcfg = ai_cfg[provider_name] + client = openai.AsyncOpenAI( + base_url=pcfg["base_url"], + api_key=pcfg.get("api_key") or "none", + ) + resp = await client.chat.completions.create( + model=pcfg["model"], + messages=[{"role": "user", "content": "Reply with: ok"}], + max_tokens=16, + temperature=0, + ) + return { + "ok": True, + "provider": provider_name, + "response": resp.choices[0].message.content, + } + else: + raise HTTPException(status_code=422, detail=f"Unknown provider: {provider_name}") + + except Exception as exc: + return {"ok": False, "provider": provider_name, "error": str(exc)} + + +@router.patch("/documents/limits") +async def update_documents_limits( + body: LimitsUpdate, + _: User = Depends(get_current_admin), +) 
-> dict: + if body.max_pdf_mb < 1 or body.max_pdf_mb > 200: + raise HTTPException(status_code=422, detail="max_pdf_mb must be between 1 and 200") + + config = load_doc_service_config() + config.documents.max_pdf_bytes = body.max_pdf_mb * 1024 * 1024 + await asyncio.to_thread(save_doc_service_config, config) + return load_doc_service_config_masked() diff --git a/backend/pyproject.toml b/backend/pyproject.toml index b866301..b450db1 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -17,13 +17,15 @@ dependencies = [ "python-jose[cryptography]>=3.3", "bcrypt>=4.0", "python-multipart>=0.0.9", + "httpx>=0.27", + "anthropic>=0.28", + "openai>=1.0", ] [project.optional-dependencies] dev = [ "pytest>=8", "pytest-asyncio>=0.23", - "httpx>=0.27", "ruff>=0.4", ] diff --git a/changelog/2026-04-14_doc-service.md b/changelog/2026-04-14_doc-service.md new file mode 100644 index 0000000..8f7dc37 --- /dev/null +++ b/changelog/2026-04-14_doc-service.md @@ -0,0 +1,58 @@ +# 2026-04-14 — PDF Document Service + +**Timestamp:** 2026-04-14T00:00:00+00:00 + +## Summary + +Added `features/doc-service` — a FastAPI microservice that accepts PDF uploads, extracts text with pdfplumber, and uses a pluggable AI provider (Anthropic, Ollama, or LM Studio) to classify and extract structured data. Integrated it into the main backend via httpx proxy routers. Added an admin settings UI at `/apps/documents/settings/admin`. Updated the frontend route tree, Nav, and AppsPage. 
+ +## Files Added + +- `features/doc-service/Dockerfile` — UID 1001, pre-creates `/data/documents` and `/config` +- `features/doc-service/pyproject.toml` — service dependencies +- `features/doc-service/alembic.ini` — separate `alembic_version_doc_service` table +- `features/doc-service/.env.example` +- `features/doc-service/scripts/start.sh` — migrations + uvicorn +- `features/doc-service/scripts/start_dev.sh` — migrations + uvicorn --reload +- `features/doc-service/alembic/env.py` — async migrations, VERSION_TABLE isolation +- `features/doc-service/alembic/versions/0001_create_doc_tables.py` — documents, document_categories, document_category_assignments +- `features/doc-service/app/main.py` — no CORS (internal service) +- `features/doc-service/app/core/config.py` — DATABASE_URL, DATA_DIR, CONFIG_PATH settings +- `features/doc-service/app/database.py` — async engine, AsyncSessionLocal, Base +- `features/doc-service/app/deps.py` — get_user_id from X-User-Id header +- `features/doc-service/app/models/document.py` — Document ORM model +- `features/doc-service/app/models/category.py` — DocumentCategory ORM model +- `features/doc-service/app/models/category_assignment.py` — CategoryAssignment composite PK +- `features/doc-service/app/models/__init__.py` +- `features/doc-service/app/schemas/document.py` — DocumentOut, DocumentStatusOut, DocumentTypeUpdate, CategoryOut +- `features/doc-service/app/schemas/category.py` — CategoryCreate, CategoryOut, CategoryUpdate +- `features/doc-service/app/routers/documents.py` — upload, list, get, status, patch type, delete, file download, category assignment +- `features/doc-service/app/routers/categories.py` — CRUD for DocumentCategory +- `features/doc-service/app/services/storage.py` — aiofiles write, path helpers, delete +- `features/doc-service/app/services/config_reader.py` — load_doc_config() with 30s TTL cache +- `features/doc-service/app/services/ai/__init__.py` — get_provider() factory +- 
`features/doc-service/app/services/ai/base.py` — AIProvider ABC, shared prompts +- `features/doc-service/app/services/ai/anthropic_provider.py` — AnthropicProvider +- `features/doc-service/app/services/ai/openai_compat.py` — OpenAICompatProvider (Ollama + LM Studio) +- `backend/app/core/app_config.py` — DocServiceConfig Pydantic model, load/save with atomic write, api_key masking +- `backend/app/routers/settings.py` — GET/PATCH /api/settings/documents/*, admin only +- `backend/app/routers/documents_proxy.py` — httpx proxy to doc-service /documents/* +- `backend/app/routers/categories_proxy.py` — httpx proxy to doc-service /categories/* +- `frontend/src/pages/DocumentsPage.tsx` — upload, list, status polling, categories, file download +- `frontend/src/pages/DocumentAdminSettingsPage.tsx` — AI provider config, connection test, upload limits + +## Files Modified + +- `backend/app/main.py` — registered settings_router, categories_proxy (before!), documents_proxy +- `backend/pyproject.toml` — moved httpx to main deps, added anthropic>=0.28, openai>=1.0 +- `frontend/src/App.tsx` — added /apps/documents and /apps/documents/settings/admin routes, removed /settings +- `frontend/src/components/Nav.tsx` — removed Settings link, added Profile link, logo links to / +- `frontend/src/pages/AppsPage.tsx` — replaced stub with app launcher card grid +- `frontend/src/api/client.ts` — added documents, categories, and settings API functions +- `docker-compose.yml` — added doc-service service, doc_data + app_config volumes, removed internal:true from backend-net, added app_config volume to backend +- `docker-compose.dev.yml` — added doc-service dev override with --reload +- `TODO.md` — added PDF Documents app section + +## Files Deleted + +- `frontend/src/pages/SettingsPage.tsx` — stub replaced by per-app settings pages diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 694d741..e058e68 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -22,3 +22,8 @@ 
services: volumes: - ./frontend:/app - /app/node_modules + + doc-service: + command: sh scripts/start_dev.sh + volumes: + - ./features/doc-service:/app diff --git a/docker-compose.yml b/docker-compose.yml index bf739ba..3abdd97 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -30,6 +30,30 @@ services: env_file: ./backend/.env environment: DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-password}@db:5432/${POSTGRES_DB:-destroying_sap} + DOC_SERVICE_URL: http://doc-service:8001 + volumes: + - app_config:/config + depends_on: + db: + condition: service_healthy + networks: + - backend-net + + # ── Doc service (PDF extraction) ──────────────────────────────────────────── + doc-service: + build: + context: ./features/doc-service + dockerfile: Dockerfile + network: host + user: "1001:1001" + restart: unless-stopped + environment: + DATABASE_URL: postgresql+asyncpg://${POSTGRES_USER:-postgres}:${POSTGRES_PASSWORD:-password}@db:5432/${POSTGRES_DB:-destroying_sap} + DATA_DIR: /data/documents + CONFIG_PATH: /config/doc_service_config.json + volumes: + - doc_data:/data/documents + - app_config:/config depends_on: db: condition: service_healthy @@ -54,10 +78,12 @@ services: volumes: postgres_data: + doc_data: # PDF files persisted across restarts + app_config: # Per-service runtime config JSON files networks: - # Internal-only: db ↔ backend ↔ frontend reverse proxy. No host routing. + # backend-net: db ↔ backend ↔ doc-service. No host ports bound. + # internal:true removed — doc-service needs outbound access for cloud AI providers. backend-net: - internal: true # External-facing: only the frontend binds a host port through this network. 
frontend-net: diff --git a/features/doc-service/.env.example b/features/doc-service/.env.example new file mode 100644 index 0000000..129e4ce --- /dev/null +++ b/features/doc-service/.env.example @@ -0,0 +1,3 @@ +DATABASE_URL=postgresql+asyncpg://postgres:password@db:5432/destroying_sap +DATA_DIR=/data/documents +CONFIG_PATH=/config/doc_service_config.json diff --git a/features/doc-service/Dockerfile b/features/doc-service/Dockerfile new file mode 100644 index 0000000..6aa6f7a --- /dev/null +++ b/features/doc-service/Dockerfile @@ -0,0 +1,34 @@ +# ── Stage 1: dependency installation ───────────────────────────────────────── +FROM python:3.12-slim AS builder + +WORKDIR /app + +RUN pip install --upgrade pip + +COPY pyproject.toml . +RUN pip install --prefix=/install . + +# ── Stage 2: runtime ────────────────────────────────────────────────────────── +FROM python:3.12-slim + +# Create non-root user (UID/GID 1001) +RUN groupadd --gid 1001 appuser && \ + useradd --uid 1001 --gid 1001 --no-create-home --shell /bin/sh appuser + +# Pre-create data and config dirs with correct ownership. +# Named volumes mounted over these paths will inherit ownership on first creation. +RUN mkdir -p /data/documents /config && chown -R appuser:appuser /data /config + +WORKDIR /app + +COPY --from=builder /install /usr/local +COPY --chown=appuser:appuser app ./app +COPY --chown=appuser:appuser alembic ./alembic +COPY --chown=appuser:appuser alembic.ini . +COPY --chown=appuser:appuser scripts ./scripts + +USER appuser + +EXPOSE 8001 + +CMD ["sh", "scripts/start.sh"] diff --git a/features/doc-service/alembic.ini b/features/doc-service/alembic.ini new file mode 100644 index 0000000..57ee4ad --- /dev/null +++ b/features/doc-service/alembic.ini @@ -0,0 +1,45 @@ +[alembic] +script_location = alembic +prepend_sys_path = . 
+version_path_separator = os +sqlalchemy.url = postgresql+asyncpg://postgres:password@localhost:5432/destroying_sap + +# Use a separate version table so this service's migrations don't collide +# with the main backend's alembic_version table in the shared postgres instance. +version_table = alembic_version_doc_service + +[post_write_hooks] + +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/features/doc-service/alembic/env.py b/features/doc-service/alembic/env.py new file mode 100644 index 0000000..571eba2 --- /dev/null +++ b/features/doc-service/alembic/env.py @@ -0,0 +1,55 @@ +import asyncio +from logging.config import fileConfig + +from alembic import context +from sqlalchemy.ext.asyncio import create_async_engine + +from app.core.config import settings +from app.database import Base +import app.models # noqa: F401 — registers Document, DocumentCategory, CategoryAssignment + +config = context.config +config.set_main_option("sqlalchemy.url", settings.DATABASE_URL) + +if config.config_file_name: + fileConfig(config.config_file_name) + +target_metadata = Base.metadata + +# Separate version table — must not collide with the main backend's alembic_version table. 
+VERSION_TABLE = "alembic_version_doc_service" + + +def run_migrations_offline(): + context.configure( + url=settings.DATABASE_URL, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + version_table=VERSION_TABLE, + ) + with context.begin_transaction(): + context.run_migrations() + + +def do_run_migrations(connection): + context.configure( + connection=connection, + target_metadata=target_metadata, + version_table=VERSION_TABLE, + ) + with context.begin_transaction(): + context.run_migrations() + + +async def run_migrations_online(): + engine = create_async_engine(settings.DATABASE_URL) + async with engine.connect() as conn: + await conn.run_sync(do_run_migrations) + await engine.dispose() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + asyncio.run(run_migrations_online()) diff --git a/features/doc-service/alembic/script.py.mako b/features/doc-service/alembic/script.py.mako new file mode 100644 index 0000000..17dcba0 --- /dev/null +++ b/features/doc-service/alembic/script.py.mako @@ -0,0 +1,25 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/features/doc-service/alembic/versions/0001_create_doc_tables.py b/features/doc-service/alembic/versions/0001_create_doc_tables.py new file mode 100644 index 0000000..358da8f --- /dev/null +++ b/features/doc-service/alembic/versions/0001_create_doc_tables.py @@ -0,0 +1,79 @@ +"""create 
document tables + +Revision ID: 0001 +Revises: +Create Date: 2026-04-14 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + +revision: str = "0001" +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + "documents", + sa.Column("id", sa.String(), primary_key=True), + sa.Column("user_id", sa.String(), nullable=False), + sa.Column("filename", sa.String(), nullable=False), + sa.Column("file_path", sa.String(), nullable=False), + sa.Column("file_size", sa.Integer(), nullable=False), + sa.Column("status", sa.String(), nullable=False), + sa.Column("document_type", sa.String(), nullable=True), + sa.Column("raw_text", sa.Text(), nullable=True), + sa.Column("extracted_data", sa.Text(), nullable=True), + sa.Column("tags", sa.Text(), nullable=True), + sa.Column("error_message", sa.String(500), nullable=True), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.Column("processed_at", sa.DateTime(timezone=True), nullable=True), + ) + op.create_index("ix_documents_user_id", "documents", ["user_id"]) + + op.create_table( + "document_categories", + sa.Column("id", sa.String(), primary_key=True), + sa.Column("user_id", sa.String(), nullable=False), + sa.Column("name", sa.String(128), nullable=False), + sa.Column( + "created_at", + sa.DateTime(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + ) + op.create_index("ix_document_categories_user_id", "document_categories", ["user_id"]) + + op.create_table( + "document_category_assignments", + sa.Column( + "document_id", + sa.String(), + sa.ForeignKey("documents.id", ondelete="CASCADE"), + primary_key=True, + ), + sa.Column( + "category_id", + sa.String(), + sa.ForeignKey("document_categories.id", ondelete="CASCADE"), + primary_key=True, + ), + ) + + +def 
downgrade() -> None:
+    op.drop_table("document_category_assignments")
+    op.drop_index("ix_document_categories_user_id", "document_categories")
+    op.drop_table("document_categories")
+    op.drop_index("ix_documents_user_id", "documents")
+    op.drop_table("documents")
diff --git a/features/doc-service/app/__init__.py b/features/doc-service/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/features/doc-service/app/core/__init__.py b/features/doc-service/app/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/features/doc-service/app/core/config.py b/features/doc-service/app/core/config.py
new file mode 100644
index 0000000..0582c09
--- /dev/null
+++ b/features/doc-service/app/core/config.py
@@ -0,0 +1,15 @@
+from pydantic_settings import BaseSettings
+
+
+class Settings(BaseSettings):
+    """Doc-service settings; each field can be overridden by an environment variable or .env."""
+    PROJECT_NAME: str = "doc-service"
+    DATABASE_URL: str = "postgresql+asyncpg://postgres:password@db:5432/destroying_sap"
+    DATA_DIR: str = "/data/documents"
+    CONFIG_PATH: str = "/config/doc_service_config.json"
+
+    class Config:
+        env_file = ".env"
+
+
+settings = Settings()
diff --git a/features/doc-service/app/database.py b/features/doc-service/app/database.py
new file mode 100644
index 0000000..d9b3829
--- /dev/null
+++ b/features/doc-service/app/database.py
@@ -0,0 +1,19 @@
+from collections.abc import AsyncIterator
+
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
+from sqlalchemy.orm import DeclarativeBase
+
+from app.core.config import settings
+
+engine = create_async_engine(settings.DATABASE_URL, echo=False)
+AsyncSessionLocal = async_sessionmaker(engine, expire_on_commit=False)
+
+
+class Base(DeclarativeBase):
+    """Declarative base that the doc-service ORM models inherit from."""
+
+
+async def get_db() -> AsyncIterator[AsyncSession]:
+    """FastAPI dependency: yield one session and close it when the dependency is torn down."""
+    async with AsyncSessionLocal() as session:
+        yield session
diff --git a/features/doc-service/app/deps.py b/features/doc-service/app/deps.py
new file mode 100644
index 0000000..9ca9600
--- /dev/null
+++ b/features/doc-service/app/deps.py
@@ -0,0 +1,12 @@
+from fastapi import Header,
HTTPException + + +async def get_user_id(x_user_id: str = Header(...)) -> str: + """ + Extract the user identity injected by the main backend proxy. + The main backend validates the JWT and forwards the user ID via this header. + Doc-service trusts it because it is only reachable from backend on backend-net. + """ + if not x_user_id: + raise HTTPException(status_code=400, detail="Missing X-User-Id header") + return x_user_id diff --git a/features/doc-service/app/main.py b/features/doc-service/app/main.py new file mode 100644 index 0000000..66685bd --- /dev/null +++ b/features/doc-service/app/main.py @@ -0,0 +1,17 @@ +from fastapi import FastAPI + +from app.core.config import settings +from app.routers import categories, documents + +app = FastAPI(title=settings.PROJECT_NAME) + +# No CORS — this service is only reachable from the main backend on backend-net. +# All browser traffic goes through the main backend proxy. + +app.include_router(documents.router, prefix="/documents", tags=["documents"]) +app.include_router(categories.router, prefix="/categories", tags=["categories"]) + + +@app.get("/health") +def health(): + return {"status": "ok"} diff --git a/features/doc-service/app/models/__init__.py b/features/doc-service/app/models/__init__.py new file mode 100644 index 0000000..b8c5e5b --- /dev/null +++ b/features/doc-service/app/models/__init__.py @@ -0,0 +1,5 @@ +from app.models.document import Document +from app.models.category import DocumentCategory +from app.models.category_assignment import CategoryAssignment + +__all__ = ["Document", "DocumentCategory", "CategoryAssignment"] diff --git a/features/doc-service/app/models/category.py b/features/doc-service/app/models/category.py new file mode 100644 index 0000000..e1b7fd4 --- /dev/null +++ b/features/doc-service/app/models/category.py @@ -0,0 +1,22 @@ +import uuid +from datetime import datetime + +from sqlalchemy import DateTime, String, func +from sqlalchemy.orm import Mapped, mapped_column, relationship + 
+from app.database import Base + + +class DocumentCategory(Base): + __tablename__ = "document_categories" + + id: Mapped[str] = mapped_column(String, primary_key=True, default=lambda: str(uuid.uuid4())) + user_id: Mapped[str] = mapped_column(String, nullable=False, index=True) + name: Mapped[str] = mapped_column(String(128), nullable=False) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + + assignments: Mapped[list["CategoryAssignment"]] = relationship( + "CategoryAssignment", back_populates="category", cascade="all, delete-orphan" + ) diff --git a/features/doc-service/app/models/category_assignment.py b/features/doc-service/app/models/category_assignment.py new file mode 100644 index 0000000..c61d307 --- /dev/null +++ b/features/doc-service/app/models/category_assignment.py @@ -0,0 +1,20 @@ +from sqlalchemy import ForeignKey, String +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.database import Base + + +class CategoryAssignment(Base): + __tablename__ = "document_category_assignments" + + document_id: Mapped[str] = mapped_column( + String, ForeignKey("documents.id", ondelete="CASCADE"), primary_key=True + ) + category_id: Mapped[str] = mapped_column( + String, ForeignKey("document_categories.id", ondelete="CASCADE"), primary_key=True + ) + + document: Mapped["Document"] = relationship("Document", back_populates="category_assignments") + category: Mapped["DocumentCategory"] = relationship( + "DocumentCategory", back_populates="assignments" + ) diff --git a/features/doc-service/app/models/document.py b/features/doc-service/app/models/document.py new file mode 100644 index 0000000..601a9e1 --- /dev/null +++ b/features/doc-service/app/models/document.py @@ -0,0 +1,31 @@ +import uuid +from datetime import datetime + +from sqlalchemy import DateTime, Integer, String, Text, func +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from app.database import 
Base + + +class Document(Base): + __tablename__ = "documents" + + id: Mapped[str] = mapped_column(String, primary_key=True, default=lambda: str(uuid.uuid4())) + user_id: Mapped[str] = mapped_column(String, nullable=False, index=True) + filename: Mapped[str] = mapped_column(String, nullable=False) + file_path: Mapped[str] = mapped_column(String, nullable=False) + file_size: Mapped[int] = mapped_column(Integer, nullable=False) + status: Mapped[str] = mapped_column(String, nullable=False, default="pending") + document_type: Mapped[str | None] = mapped_column(String, nullable=True) + raw_text: Mapped[str | None] = mapped_column(Text, nullable=True) + extracted_data: Mapped[str | None] = mapped_column(Text, nullable=True) # JSON string + tags: Mapped[str | None] = mapped_column(Text, nullable=True) # JSON array string + error_message: Mapped[str | None] = mapped_column(String(500), nullable=True) + created_at: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) + processed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True) + + category_assignments: Mapped[list["CategoryAssignment"]] = relationship( + "CategoryAssignment", back_populates="document", cascade="all, delete-orphan" + ) diff --git a/features/doc-service/app/routers/__init__.py b/features/doc-service/app/routers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/features/doc-service/app/routers/categories.py b/features/doc-service/app/routers/categories.py new file mode 100644 index 0000000..1392cda --- /dev/null +++ b/features/doc-service/app/routers/categories.py @@ -0,0 +1,80 @@ +from fastapi import APIRouter, Depends, HTTPException +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.database import get_db +from app.deps import get_user_id +from app.models.category import DocumentCategory +from app.schemas.category import CategoryCreate, CategoryOut, 
CategoryUpdate + +router = APIRouter() + + +@router.get("", response_model=list[CategoryOut]) +async def list_categories( + user_id: str = Depends(get_user_id), + db: AsyncSession = Depends(get_db), +) -> list[DocumentCategory]: + result = await db.execute( + select(DocumentCategory) + .where(DocumentCategory.user_id == user_id) + .order_by(DocumentCategory.name) + ) + return result.scalars().all() + + +@router.post("", response_model=CategoryOut, status_code=201) +async def create_category( + body: CategoryCreate, + user_id: str = Depends(get_user_id), + db: AsyncSession = Depends(get_db), +) -> DocumentCategory: + name = body.name.strip() + if not name: + raise HTTPException(status_code=422, detail="Category name cannot be empty") + cat = DocumentCategory(user_id=user_id, name=name[:128]) + db.add(cat) + await db.commit() + await db.refresh(cat) + return cat + + +@router.patch("/{cat_id}", response_model=CategoryOut) +async def rename_category( + cat_id: str, + body: CategoryUpdate, + user_id: str = Depends(get_user_id), + db: AsyncSession = Depends(get_db), +) -> DocumentCategory: + cat = await _get_user_cat(cat_id, user_id, db) + name = body.name.strip() + if not name: + raise HTTPException(status_code=422, detail="Category name cannot be empty") + cat.name = name[:128] + await db.commit() + await db.refresh(cat) + return cat + + +@router.delete("/{cat_id}", status_code=204) +async def delete_category( + cat_id: str, + user_id: str = Depends(get_user_id), + db: AsyncSession = Depends(get_db), +) -> None: + cat = await _get_user_cat(cat_id, user_id, db) + await db.delete(cat) + await db.commit() + + +async def _get_user_cat(cat_id: str, user_id: str, db: AsyncSession) -> DocumentCategory: + result = await db.execute( + select(DocumentCategory).where( + DocumentCategory.id == cat_id, + DocumentCategory.user_id == user_id, + ) + ) + cat = result.scalar_one_or_none() + if cat is None: + raise HTTPException(status_code=404, detail="Category not found") + return cat 
diff --git a/features/doc-service/app/routers/documents.py b/features/doc-service/app/routers/documents.py new file mode 100644 index 0000000..5ec252e --- /dev/null +++ b/features/doc-service/app/routers/documents.py @@ -0,0 +1,304 @@ +import asyncio +import json +import uuid +from datetime import datetime, timezone + +import aiofiles +import pdfplumber +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, UploadFile +from fastapi.responses import StreamingResponse +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm import selectinload + +from app.database import AsyncSessionLocal, get_db +from app.deps import get_user_id +from app.models.category import DocumentCategory +from app.models.category_assignment import CategoryAssignment +from app.models.document import Document +from app.schemas.document import DocumentOut, DocumentStatusOut, DocumentTypeUpdate +from app.services.ai import get_provider +from app.services.config_reader import load_doc_config +from app.services.storage import delete_file, get_upload_path, save_upload + +router = APIRouter() + +_DEFAULT_MAX_BYTES = 20 * 1024 * 1024 + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +async def _get_user_doc(doc_id: str, user_id: str, db: AsyncSession) -> Document: + result = await db.execute( + select(Document) + .where(Document.id == doc_id, Document.user_id == user_id) + .options( + selectinload(Document.category_assignments) + .selectinload(CategoryAssignment.category) + ) + ) + doc = result.scalar_one_or_none() + if doc is None: + raise HTTPException(status_code=404, detail="Document not found") + return doc + + +def _doc_with_categories(doc: Document) -> DocumentOut: + from app.schemas.document import CategoryOut + cats = [CategoryOut(id=a.category.id, name=a.category.name) for a in doc.category_assignments] + return DocumentOut( + id=doc.id, + user_id=doc.user_id, + filename=doc.filename, + 
file_size=doc.file_size, + status=doc.status, + document_type=doc.document_type, + extracted_data=doc.extracted_data, + tags=doc.tags, + error_message=doc.error_message, + created_at=doc.created_at, + processed_at=doc.processed_at, + categories=cats, + ) + + +def _extract_pdf_text(file_path: str) -> str: + """Synchronous — must be called via asyncio.to_thread.""" + text_parts = [] + with pdfplumber.open(file_path) as pdf: + for page in pdf.pages: + page_text = page.extract_text() + if page_text: + text_parts.append(page_text) + return "\n".join(text_parts) + + +# ── Background processing ───────────────────────────────────────────────────── + +async def process_document(doc_id: str) -> None: + """ + Runs after the upload response is sent. + Opens its own DB session — never use the request's Depends session here. + Loads AI config fresh from the config file so settings changes apply without restart. + """ + async with AsyncSessionLocal() as db: + doc = await db.get(Document, doc_id) + if doc is None: + return + + doc.status = "processing" + await db.commit() + + try: + text = await asyncio.to_thread(_extract_pdf_text, doc.file_path) + config = await load_doc_config() + provider = get_provider(config["ai"]) + result = await provider.classify_document(text) + + doc.raw_text = text[:500_000] # cap stored text at 500k chars + doc.extracted_data = json.dumps(result) + doc.document_type = result.get("document_type", "unknown") + doc.tags = json.dumps(result.get("tags", [])) + doc.status = "done" + doc.processed_at = datetime.now(timezone.utc) + except Exception as exc: + doc.status = "failed" + doc.error_message = str(exc)[:500] + + await db.commit() + + +# ── Routes ──────────────────────────────────────────────────────────────────── + +@router.post("/upload", response_model=DocumentOut, status_code=202) +async def upload_document( + file: UploadFile, + background_tasks: BackgroundTasks, + user_id: str = Depends(get_user_id), + db: AsyncSession = Depends(get_db), +) -> 
DocumentOut:
+    """
+    Accept a PDF upload, store it on disk, create its DB row and queue background processing.
+
+    Responds 415 for non-PDF uploads and 413 when the file exceeds the configured size limit.
+    """
+    if file.content_type not in ("application/pdf", "application/octet-stream"):
+        if not (file.filename or "").lower().endswith(".pdf"):
+            raise HTTPException(status_code=415, detail="Only PDF files are accepted")
+
+    config = await load_doc_config()
+    max_bytes = config.get("documents", {}).get("max_pdf_bytes", _DEFAULT_MAX_BYTES)
+
+    # Read at most one byte past the limit: enough to detect an oversized upload without
+    # pulling the whole (possibly huge) body into memory first.
+    file_data = await file.read(max_bytes + 1)
+    if len(file_data) > max_bytes:
+        raise HTTPException(
+            status_code=413,
+            detail=f"File exceeds maximum size of {max_bytes // (1024*1024)} MB",
+        )
+
+    doc_id = str(uuid.uuid4())
+    dest = await save_upload(file_data, user_id, doc_id)
+
+    doc = Document(
+        id=doc_id,
+        user_id=user_id,
+        filename=file.filename or "upload.pdf",
+        file_path=str(dest),
+        file_size=len(file_data),
+        status="pending",
+    )
+    db.add(doc)
+    try:
+        await db.commit()
+    except Exception:
+        # Do not leave an orphaned file behind when the row could not be persisted.
+        delete_file(str(dest))
+        raise
+    await db.refresh(doc)
+
+    background_tasks.add_task(process_document, doc_id)
+
+    # A new document cannot have categories yet, so build the response from column
+    # attributes only (DocumentOut.categories defaults to []). Going through
+    # _doc_with_categories() would read the never-loaded `category_assignments`
+    # relationship; NOTE(review): under AsyncSession that implicit lazy load is expected
+    # to raise MissingGreenlet — confirm against the SQLAlchemy asyncio docs.
+    return DocumentOut.model_validate(doc)
+
+
+@router.get("", response_model=list[DocumentOut])
+async def list_documents(
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> list[DocumentOut]:
+    """Return all documents of the calling user, newest first, with their categories."""
+    result = await db.execute(
+        select(Document)
+        .where(Document.user_id == user_id)
+        .options(
+            selectinload(Document.category_assignments)
+            .selectinload(CategoryAssignment.category)
+        )
+        .order_by(Document.created_at.desc())
+    )
+    return [_doc_with_categories(d) for d in result.scalars().all()]
+
+
+@router.get("/{doc_id}", response_model=DocumentOut)
+async def get_document(
+    doc_id: str,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> DocumentOut:
+    """Return one document of the calling user (404 if it does not exist or is not theirs)."""
+    doc = await _get_user_doc(doc_id, user_id, db)
+    return _doc_with_categories(doc)
+
+
+@router.get("/{doc_id}/status", response_model=DocumentStatusOut)
+async def get_document_status(
+    doc_id: str,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> Document:
+    result = await db.execute(
+        select(Document).where(Document.id == doc_id,
Document.user_id == user_id)
+    )
+    doc = result.scalar_one_or_none()
+    if doc is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+    return doc
+
+
+@router.patch("/{doc_id}/type", response_model=DocumentOut)
+async def update_document_type(
+    doc_id: str,
+    body: DocumentTypeUpdate,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> DocumentOut:
+    """Overwrite the document type of one of the caller's documents."""
+    doc = await _get_user_doc(doc_id, user_id, db)
+    doc.document_type = body.document_type
+    await db.commit()
+    # No refresh() here: the session factory uses expire_on_commit=False, so the object and
+    # its eagerly loaded category_assignments are still populated after the commit.
+    # NOTE(review): a full refresh() expires that relationship again and the next access
+    # would need an implicit lazy load, which AsyncSession does not allow — confirm.
+    return _doc_with_categories(doc)
+
+
+@router.delete("/{doc_id}", status_code=204)
+async def delete_document(
+    doc_id: str,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> None:
+    """Delete one of the caller's documents: first the DB row, then the stored file."""
+    result = await db.execute(
+        select(Document).where(Document.id == doc_id, Document.user_id == user_id)
+    )
+    doc = result.scalar_one_or_none()
+    if doc is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+    # Remove the row first: if the commit fails the file is still on disk and the row stays
+    # valid, whereas deleting the file first could leave a row pointing at nothing.
+    file_path = doc.file_path
+    await db.delete(doc)
+    await db.commit()
+    delete_file(file_path)
+
+
+@router.get("/{doc_id}/file")
+async def download_file(
+    doc_id: str,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> StreamingResponse:
+    """Stream the stored PDF of one of the caller's documents (404 if it is not theirs)."""
+    from urllib.parse import quote  # function-scope import: only needed for the header below
+
+    result = await db.execute(
+        select(Document).where(Document.id == doc_id, Document.user_id == user_id)
+    )
+    doc = result.scalar_one_or_none()
+    if doc is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    async def file_generator():
+        async with aiofiles.open(doc.file_path, "rb") as f:
+            while chunk := await f.read(64 * 1024):
+                yield chunk
+
+    # The filename is user-supplied. Quotes or control characters would corrupt the header,
+    # and characters outside Latin-1 cannot be encoded in a header value at all, so send an
+    # ASCII-only fallback plus the percent-encoded original name (RFC 6266 `filename*`).
+    ascii_name = "".join(
+        ch if ch.isascii() and (ch.isalnum() or ch in "._- ") else "_" for ch in doc.filename
+    )
+    encoded_name = quote(doc.filename, safe="")
+    disposition = f"inline; filename=\"{ascii_name}\"; filename*=UTF-8''{encoded_name}"
+
+    return StreamingResponse(
+        file_generator(),
+        media_type="application/pdf",
+        headers={"Content-Disposition": disposition},
+    )
+
+
+# ── Category assignment ───────────────────────────────────────────────────────
+
+@router.post("/{doc_id}/categories/{cat_id}", status_code=204)
+async def assign_category(
+    doc_id: str,
+    cat_id: str,
+    user_id: str =
Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> None:
+    # Verify both belong to this user
+    doc_result = await db.execute(
+        select(Document).where(Document.id == doc_id, Document.user_id == user_id)
+    )
+    if doc_result.scalar_one_or_none() is None:
+        raise HTTPException(status_code=404, detail="Document not found")
+
+    cat_result = await db.execute(
+        select(DocumentCategory).where(
+            DocumentCategory.id == cat_id, DocumentCategory.user_id == user_id
+        )
+    )
+    if cat_result.scalar_one_or_none() is None:
+        raise HTTPException(status_code=404, detail="Category not found")
+
+    # Upsert — ignore if already assigned
+    existing = await db.execute(
+        select(CategoryAssignment).where(
+            CategoryAssignment.document_id == doc_id,
+            CategoryAssignment.category_id == cat_id,
+        )
+    )
+    if existing.scalar_one_or_none() is None:
+        db.add(CategoryAssignment(document_id=doc_id, category_id=cat_id))
+        await db.commit()
+
+
+@router.delete("/{doc_id}/categories/{cat_id}", status_code=204)
+async def remove_category(
+    doc_id: str,
+    cat_id: str,
+    user_id: str = Depends(get_user_id),
+    db: AsyncSession = Depends(get_db),
+) -> None:
+    """
+    Remove a category from one of the caller's documents.
+
+    Idempotent: responds 204 whether or not a matching assignment existed.
+    """
+    # Join to the owning document so a caller can only delete assignments on their own
+    # documents; filtering by document_id/category_id alone would let any user remove
+    # another user's assignment just by knowing the two IDs.
+    result = await db.execute(
+        select(CategoryAssignment)
+        .join(Document, Document.id == CategoryAssignment.document_id)
+        .where(
+            CategoryAssignment.document_id == doc_id,
+            CategoryAssignment.category_id == cat_id,
+            Document.user_id == user_id,
+        )
+    )
+    assignment = result.scalar_one_or_none()
+    if assignment:
+        await db.delete(assignment)
+        await db.commit()
diff --git a/features/doc-service/app/schemas/__init__.py b/features/doc-service/app/schemas/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/features/doc-service/app/schemas/category.py b/features/doc-service/app/schemas/category.py
new file mode 100644
index 0000000..9d498e5
--- /dev/null
+++ b/features/doc-service/app/schemas/category.py
@@ -0,0 +1,20 @@
+from datetime import datetime
+
+from pydantic import BaseModel
+
+
+class CategoryOut(BaseModel):
+    id: str
+    user_id: str
+    name: str
+    created_at: datetime
+
+    model_config =
{"from_attributes": True}
+
+
+class CategoryCreate(BaseModel):
+    name: str
+
+
+class CategoryUpdate(BaseModel):
+    name: str
diff --git a/features/doc-service/app/schemas/document.py b/features/doc-service/app/schemas/document.py
new file mode 100644
index 0000000..b00c506
--- /dev/null
+++ b/features/doc-service/app/schemas/document.py
@@ -0,0 +1,46 @@
+from datetime import datetime
+
+from pydantic import BaseModel
+
+
+class CategoryOut(BaseModel):
+    """Category reference (id and name only) embedded in a document response."""
+    id: str
+    name: str
+    model_config = {"from_attributes": True}
+
+
+class DocumentOut(BaseModel):
+    """Full document representation returned by the document endpoints."""
+    id: str
+    user_id: str
+    filename: str
+    file_size: int
+    status: str
+    document_type: str | None
+    extracted_data: str | None  # JSON string — frontend calls JSON.parse()
+    tags: str | None  # JSON array string
+    error_message: str | None
+    created_at: datetime
+    processed_at: datetime | None
+    categories: list[CategoryOut] = []
+
+    model_config = {"from_attributes": True}
+
+
+class DocumentStatusOut(BaseModel):
+    """Lightweight polling payload for GET /documents/{doc_id}/status."""
+    id: str
+    status: str
+    # The frontend's DocumentStatusOut type declares this field, so it must be serialized
+    # here; without it the AI-assigned type never reaches a client polling the status.
+    document_type: str | None
+    error_message: str | None
+    processed_at: datetime | None
+
+    model_config = {"from_attributes": True}
+
+
+class DocumentTypeUpdate(BaseModel):
+    """Body of PATCH /documents/{doc_id}/type."""
+    document_type: str
diff --git a/features/doc-service/app/services/__init__.py b/features/doc-service/app/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/features/doc-service/app/services/ai/__init__.py b/features/doc-service/app/services/ai/__init__.py
new file mode 100644
index 0000000..73ece4f
--- /dev/null
+++ b/features/doc-service/app/services/ai/__init__.py
@@ -0,0 +1,23 @@
+from app.services.ai.base import AIProvider
+
+
+def get_provider(ai_config: dict) -> AIProvider:
+    """
+    Factory: return an AIProvider instance based on the 'provider' key in the AI config section.
+    ai_config is the 'ai' section of doc_service_config.json, loaded fresh per processing job.
+    """
+    provider_name = ai_config.get("provider", "anthropic")
+    provider_cfg = ai_config.get(provider_name, {})
+
+    match provider_name:
+        case "anthropic":
+            from app.services.ai.anthropic_provider import AnthropicProvider
+            return AnthropicProvider(provider_cfg)
+        case "ollama" | "lmstudio":
+            from app.services.ai.openai_compat import OpenAICompatProvider
+            return OpenAICompatProvider(provider_cfg)
+        case _:
+            raise ValueError(f"Unknown AI provider: {provider_name!r}")
+
+
+__all__ = ["AIProvider", "get_provider"]
diff --git a/features/doc-service/app/services/ai/anthropic_provider.py b/features/doc-service/app/services/ai/anthropic_provider.py
new file mode 100644
index 0000000..c3bbaec
--- /dev/null
+++ b/features/doc-service/app/services/ai/anthropic_provider.py
@@ -0,0 +1,45 @@
+import json
+import re
+
+from anthropic import AsyncAnthropic
+
+from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
+
+# Opening markdown fence with an optional language tag, e.g. "```json" plus the line break.
+_FENCE_OPEN = re.compile(r"^```[ \t]*[\w-]*\s*")
+
+
+class AnthropicProvider(AIProvider):
+    """AIProvider implementation that calls the Anthropic Messages API."""
+
+    def __init__(self, config: dict) -> None:
+        # Fail fast with a readable message when the key is missing or empty.
+        api_key = config.get("api_key")
+        if not api_key:
+            raise ValueError("Anthropic API key is not configured")
+        self._client = AsyncAnthropic(api_key=api_key)
+        self._model = config.get("model", "claude-haiku-4-5-20251001")
+
+    async def classify_document(self, text: str) -> dict:
+        """Send the first 100k characters of `text` to the model and parse its JSON reply."""
+        message = await self._client.messages.create(
+            model=self._model,
+            max_tokens=2048,
+            system=SYSTEM_PROMPT,
+            messages=[{
+                "role": "user",
+                "content": USER_PROMPT_TEMPLATE.format(text=text[:100_000]),
+            }],
+        )
+        raw = message.content[0].text.strip()
+        return _parse_json(raw)
+
+
+def _parse_json(raw: str) -> dict:
+    """Parse the model reply as JSON, tolerating an accidental markdown code fence."""
+    raw = raw.strip()
+    if raw.startswith("```"):
+        # Strip the opening fence (with optional language tag) and cut at the closing fence.
+        # Unlike splitting on the first newline, this also handles a fence on a single line.
+        raw = _FENCE_OPEN.sub("", raw, count=1).rsplit("```", 1)[0]
+    return json.loads(raw)
diff --git a/features/doc-service/app/services/ai/base.py b/features/doc-service/app/services/ai/base.py
new file mode 100644
index 0000000..2beb541
--- /dev/null
+++ b/features/doc-service/app/services/ai/base.py
@@ -0,0 +1,31 @@
+from abc import ABC, abstractmethod
+
+SYSTEM_PROMPT =
( + "You are a financial document analysis assistant. " + "Given the text extracted from a PDF document, return ONLY a JSON object " + "with no markdown, no code fences, and no explanation." +) + +USER_PROMPT_TEMPLATE = """Analyze the following document text and return a JSON object with exactly these keys: +document_type (one of: invoice, bill, receipt, order, expense, revenue, unknown), +total_amount (string or null), +currency (string or null), +vendor_name (string or null), +customer_name (string or null), +billing_address (string or null), +customer_address (string or null), +invoice_number (string or null), +invoice_date (string or null), +due_date (string or null), +tags (array of strings), +line_items (array of objects, each with keys: description, amount). + +Document text: +{text}""" + + +class AIProvider(ABC): + @abstractmethod + async def classify_document(self, text: str) -> dict: + """Return structured extraction dict from document text.""" + ... diff --git a/features/doc-service/app/services/ai/openai_compat.py b/features/doc-service/app/services/ai/openai_compat.py new file mode 100644 index 0000000..241a430 --- /dev/null +++ b/features/doc-service/app/services/ai/openai_compat.py @@ -0,0 +1,36 @@ +""" +OpenAI-compatible provider for Ollama and LM Studio. +Both expose an OpenAI-compatible /v1/chat/completions endpoint. 
+"""
+import json
+import re
+
+from openai import AsyncOpenAI
+
+from app.services.ai.base import AIProvider, SYSTEM_PROMPT, USER_PROMPT_TEMPLATE
+
+# Opening markdown fence with an optional language tag, e.g. "```json" plus the line break.
+_FENCE_OPEN = re.compile(r"^```[ \t]*[\w-]*\s*")
+
+
+class OpenAICompatProvider(AIProvider):
+    """AIProvider implementation for servers exposing an OpenAI-compatible chat API."""
+
+    def __init__(self, config: dict) -> None:
+        self._client = AsyncOpenAI(
+            base_url=config["base_url"],
+            # `or` instead of a .get() default: the key may be present but empty, and the
+            # placeholder must still apply in that case.
+            api_key=config.get("api_key") or "not-required",
+        )
+        self._model = config["model"]
+
+    async def classify_document(self, text: str) -> dict:
+        """Send the first 100k characters of `text` to the model and parse its JSON reply."""
+        response = await self._client.chat.completions.create(
+            model=self._model,
+            temperature=0,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": USER_PROMPT_TEMPLATE.format(text=text[:100_000])},
+            ],
+        )
+        content = response.choices[0].message.content
+        if content is None:
+            # Report a readable error instead of "'NoneType' object has no attribute 'strip'".
+            raise ValueError("AI provider returned an empty response")
+        return _parse_json(content.strip())
+
+
+def _parse_json(raw: str) -> dict:
+    """Parse the model reply as JSON, tolerating an accidental markdown code fence."""
+    raw = raw.strip()
+    if raw.startswith("```"):
+        # Strip the opening fence (with optional language tag) and cut at the closing fence.
+        # Unlike splitting on the first newline, this also handles a fence on a single line.
+        raw = _FENCE_OPEN.sub("", raw, count=1).rsplit("```", 1)[0]
+    return json.loads(raw)
diff --git a/features/doc-service/app/services/config_reader.py b/features/doc-service/app/services/config_reader.py
new file mode 100644
index 0000000..536f481
--- /dev/null
+++ b/features/doc-service/app/services/config_reader.py
@@ -0,0 +1,44 @@
+"""
+Reads doc_service_config.json from the shared config volume.
+Caches the result for 30 seconds to avoid hitting the filesystem on every request.
+Uses asyncio.to_thread so the synchronous file read doesn't block the event loop.
+"""
+import asyncio
+import copy
+import json
+import time
+from pathlib import Path
+
+from app.core.config import settings
+
+_DEFAULT_CONFIG: dict = {
+    "ai": {
+        "provider": "anthropic",
+        "anthropic": {"api_key": "", "model": "claude-haiku-4-5-20251001"},
+        "ollama": {"base_url": "http://localhost:11434/v1", "model": "llama3.2", "api_key": "ollama"},
+        "lmstudio": {"base_url": "http://localhost:1234/v1", "model": "local-model", "api_key": ""},
+    },
+    "documents": {"max_pdf_bytes": 20 * 1024 * 1024},
+}
+
+_cache: dict | None = None
+_cache_at: float = 0.0
+_CACHE_TTL = 30.0
+
+
+def _with_defaults(defaults: dict, loaded: dict) -> dict:
+    """Return a deep copy of `defaults` recursively overlaid with the values from `loaded`."""
+    merged = copy.deepcopy(defaults)
+    for key, value in loaded.items():
+        if isinstance(value, dict) and isinstance(merged.get(key), dict):
+            merged[key] = _with_defaults(merged[key], value)
+        else:
+            merged[key] = value
+    return merged
+
+
+def _read_config_sync() -> dict:
+    """
+    Load the config file and fill in any missing keys from _DEFAULT_CONFIG.
+
+    Returns a deep copy of the defaults when the file does not exist, so callers can never
+    mutate the nested module-level default dicts. Raises ValueError when the file does not
+    contain a JSON object.
+    """
+    path = Path(settings.CONFIG_PATH)
+    try:
+        # Open directly instead of exists()+open(): no window in which the file can vanish.
+        with open(path, encoding="utf-8") as f:
+            loaded = json.load(f)
+    except FileNotFoundError:
+        return copy.deepcopy(_DEFAULT_CONFIG)
+    if not isinstance(loaded, dict):
+        raise ValueError(f"{path} must contain a JSON object")
+    # Overlay onto the defaults so a partial file (e.g. without an "ai" section) still
+    # yields every key that callers index directly.
+    return _with_defaults(_DEFAULT_CONFIG, loaded)
+
+
+async def load_doc_config() -> dict:
+    """Return the config, re-reading the file at most once per _CACHE_TTL seconds."""
+    global _cache, _cache_at
+    now = time.monotonic()
+    if _cache is not None and (now - _cache_at) < _CACHE_TTL:
+        return _cache
+    data = await asyncio.to_thread(_read_config_sync)
+    _cache = data
+    _cache_at = now
+    return data
diff --git a/features/doc-service/app/services/storage.py b/features/doc-service/app/services/storage.py
new file mode 100644
index 0000000..45ea418
--- /dev/null
+++ b/features/doc-service/app/services/storage.py
@@ -0,0 +1,27 @@
+import asyncio
+from pathlib import Path
+
+import aiofiles
+
+from app.core.config import settings
+
+
+def get_upload_path(user_id: str, doc_id: str) -> Path:
+    """Return /data/documents/{user_id}/{doc_id}.pdf, creating the directory if needed."""
+    user_dir = Path(settings.DATA_DIR) / user_id
+    user_dir.mkdir(parents=True, exist_ok=True)
+    return user_dir / f"{doc_id}.pdf"
+
+
+async def save_upload(file_data: bytes, user_id: str, doc_id: str) -> Path:
+    dest = get_upload_path(user_id, doc_id)
+    async with aiofiles.open(dest, "wb") as f:
+        await f.write(file_data)
+    return dest
+
+
+def delete_file(file_path: str) -> None:
+    try:
+        Path(file_path).unlink(missing_ok=True)
+    except OSError:
+        pass  # log but do not raise — deletion
failure must not 500 diff --git a/features/doc-service/pyproject.toml b/features/doc-service/pyproject.toml new file mode 100644 index 0000000..77df723 --- /dev/null +++ b/features/doc-service/pyproject.toml @@ -0,0 +1,35 @@ +[build-system] +requires = ["setuptools>=45"] +build-backend = "setuptools.build_meta" + +[project] +name = "doc-service" +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "fastapi>=0.111", + "uvicorn[standard]>=0.29", + "sqlalchemy[asyncio]>=2.0", + "asyncpg>=0.29", + "alembic>=1.13", + "pydantic-settings>=2.2", + "anthropic>=0.28", + "openai>=1.0", + "pdfplumber>=0.11", + "aiofiles>=23.0", + "python-multipart>=0.0.9", +] + +[project.optional-dependencies] +dev = [ + "pytest>=8", + "pytest-asyncio>=0.23", + "httpx>=0.27", + "ruff>=0.4", +] + +[tool.pytest.ini_options] +asyncio_mode = "auto" + +[tool.ruff] +line-length = 100 diff --git a/features/doc-service/scripts/start.sh b/features/doc-service/scripts/start.sh new file mode 100644 index 0000000..6139098 --- /dev/null +++ b/features/doc-service/scripts/start.sh @@ -0,0 +1,8 @@ +#!/bin/sh +set -e + +echo "[doc-service] running migrations..." +alembic upgrade head + +echo "[doc-service] starting uvicorn..." +exec uvicorn app.main:app --host 0.0.0.0 --port 8001 diff --git a/features/doc-service/scripts/start_dev.sh b/features/doc-service/scripts/start_dev.sh new file mode 100644 index 0000000..7e9c1c3 --- /dev/null +++ b/features/doc-service/scripts/start_dev.sh @@ -0,0 +1,8 @@ +#!/bin/sh +set -e + +echo "[doc-service] running migrations..." +alembic upgrade head + +echo "[doc-service] starting uvicorn (dev)..." 
/**
 * Issue `GET /documents` and resolve with the response body.
 *
 * NOTE(review): the body is typed as `DocumentOut[]` on the assumption that
 * the endpoint returns an array of documents — confirm against the backend
 * schema.
 *
 * @returns The response body.
 */
export const listDocuments = (): Promise<DocumentOut[]> =>
  api.get<DocumentOut[]>("/documents").then((r) => r.data);
/**
 * Fetch a document's file as a blob and hand it to the browser as a download.
 *
 * @param id - Identifier interpolated into `/documents/{id}/file`.
 * @param filename - Name suggested to the browser for the saved file.
 * @returns A promise that settles once the download has been triggered.
 */
export const downloadDocument = async (id: string, filename: string): Promise<void> => {
  const response = await api.get<Blob>(`/documents/${id}/file`, { responseType: "blob" });
  const url = URL.createObjectURL(response.data);
  const anchor = document.createElement("a");
  anchor.href = url;
  anchor.download = filename;
  anchor.style.display = "none";
  // Some browsers ignore click() on an anchor that is not part of the document.
  document.body.appendChild(anchor);
  try {
    anchor.click();
  } finally {
    anchor.remove();
    // Revoke on the next tick so the browser has started reading the blob
    // before the URL goes away; revoking synchronously can cancel the download.
    setTimeout(() => URL.revokeObjectURL(url), 0);
  }
};
/**
 * Send the AI provider settings with `PATCH /settings/documents/ai` and
 * resolve with the response body.
 *
 * NOTE(review): the response type argument was unreadable in the source; it is
 * restored here as `Record<string, unknown>` — confirm against the settings
 * endpoint.
 *
 * @param data - Provider selection and per-provider fields to update.
 * @returns The response body.
 */
export const updateDocumentAISettings = (
  data: AIProviderUpdate
): Promise<Record<string, unknown>> =>
  api.patch<Record<string, unknown>>("/settings/documents/ai", data).then((r) => r.data);