Add generic plugin architecture and watch-directory feature

Introduces a manifest contract so feature containers self-describe their
settings (JSON Schema + access rules). Backend and frontend gain generic
plugin proxy and dynamic Extensions UI with zero feature-specific code.

Doc-service is the first plugin consumer: exposes /plugin/manifest and
/plugin/settings, adds a watchdog-based file watcher that auto-ingests
PDFs from a mounted directory, maps subfolders to categories, supports
AI-suggested folder/filename (user-confirmed), and enforces a no-remove
policy. Access is gated by is_superuser or doc-service-admin group.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-04-18 02:09:50 +02:00
parent 2d7207b62f
commit 00466a9801
29 changed files with 1373 additions and 52 deletions
+3
View File
@@ -22,5 +22,8 @@ resume.txt
# Test fixtures — drop PDFs here for local testing, never commit them
features/doc-service/tests/pdfs/*.pdf
# Feature branch test stacks — never commit these
docker-compose.feat-*.yml
# Don't sync .un files
*.un~
+134 -8
View File
@@ -78,12 +78,13 @@ docker compose up --build -d
├── .githooks/pre-commit ← Runs scripts/security_check.py before every commit
├── scripts/security_check.py ← Static analysis: secrets, weak crypto, SQLi, JWT
├── changelog/YYYY-MM-DD_<slug>.md ← Per-date change logs
├── dev-watch/ ← Dev bind-mount for file watcher testing (.gitkeep only)
├── backend/ ← FastAPI gateway (port 8000, internal)
│ ├── app/
│ │ ├── main.py ← App factory, router registration, lifespan (health loop)
│ │ ├── database.py ← AsyncEngine, AsyncSessionLocal, Base
│ │ ├── deps.py ← get_current_user, get_current_admin
│ │ ├── deps.py ← get_current_user, get_current_admin, check_plugin_access
│ │ ├── core/
│ │ │ ├── config.py ← All settings via pydantic-settings (reads .env)
│ │ │ ├── security.py ← JWT sign/verify (RS256), bcrypt hash/verify
@@ -106,10 +107,11 @@ docker compose up --build -d
│ │ │ ├── groups.py ← Group CRUD + member management (admin-only)
│ │ │ ├── settings.py ← AI, doc limits, system prompts, appearance, themes (admin-only)
│ │ │ ├── services.py ← GET /services (health status)
│ │ │ ├── plugins.py ← Generic plugin proxy (GET/PATCH /api/plugins/*)
│ │ │ ├── categories_proxy.py ← Transparent proxy → doc-service /categories/*
│ │ │ └── documents_proxy.py ← Transparent proxy → doc-service /documents/*
│ │ └── services/
│ │ └── service_health.py ← Background 30s health-check loop
│ │ └── service_health.py ← Background 30s health-check loop; caches /plugin/manifest per service
│ ├── alembic/
│ │ ├── env.py ← Async migration runner
│ │ └── versions/ ← Migration chain (see Migrations section)
@@ -133,7 +135,7 @@ docker compose up --build -d
│ │
│ └── doc-service/ ← PDF extraction microservice (port 8001, internal)
│ ├── app/
│ │ ├── main.py
│ │ ├── main.py ← FastAPI, lifespan (file watcher start/stop)
│ │ ├── database.py ← Same PostgreSQL instance as backend
│ │ ├── deps.py ← get_user_id (reads x-user-id header)
│ │ ├── models/
@@ -144,13 +146,16 @@ docker compose up --build -d
│ │ │ ├── document.py ← DocumentOut, DocumentPage, DocumentStatusOut, etc.
│ │ │ └── category.py ← CategoryOut, CategoryCreate, CategoryUpdate
│ │ ├── routers/
│ │ │ ├── documents.py ← Full document CRUD + file serving + reprocess
│ │ │ ── categories.py ← Category CRUD
│ │ │ ├── documents.py ← Full document CRUD + file serving + reprocess + suggestion endpoints
│ │ │ ── categories.py ← Category CRUD (includes watch-owned categories)
│ │ │ └── plugin.py ← GET /plugin/manifest, GET+PATCH /plugin/settings
│ │ └── services/
│ │ ├── storage.py ← File I/O
│ │ ├── ai_client.py ← classify_document() → ai-service:8010/chat
│ │ ── config_reader.py
│ │ ── config_reader.py ← Config load/save including storage/watch settings
│ │ └── file_watcher.py ← watchdog-based PDF watcher + startup scan + ingestion
│ ├── alembic/versions/ ← Doc-service migration chain
│ │ └── 0003_add_watch_columns.py ← source, watch_path, suggested_folder, suggested_filename
│ ├── Dockerfile
│ └── STATUS.md
@@ -164,10 +169,12 @@ docker compose up --build -d
│ │ └── useTheme.ts ← Theme toggle
│ ├── components/
│ │ ├── AppShell.tsx ← Layout: Sidebar + scrollable main
│ │ ├── Sidebar.tsx ← Collapsible nav (icons ↔ icons+labels)
│ │ ├── Sidebar.tsx ← Collapsible nav; "Extensions" section auto-populated from /api/plugins
│ │ ├── ThemeToggle.tsx ← Light/dark mode toggle
│ │ ├── PluginSchemaForm.tsx ← JSON Schema → React form (boolean/string/number/readOnly)
│ │ └── ui/ ← shadcn/ui components (Button, Input, …)
│ ├── pages/ ← One file per route (see Routes section)
│ │ └── PluginSettingsPage.tsx ← Generic plugin settings page driven by manifest
│ ├── lib/utils.ts ← cn() = clsx + tailwind-merge
│ └── styles/theme.css ← CSS custom properties, Tailwind setup
├── vite.config.ts ← /api/* proxied to backend:8000
@@ -283,6 +290,10 @@ Unique constraint: `(group_id, user_id)`
| `error_message` | String(500) | nullable | |
| `created_at` | DateTime(tz) | server_default=now() | |
| `processed_at` | DateTime(tz) | nullable | |
| `source` | String(16) | default="upload" | "upload" or "watch" |
| `watch_path` | String | nullable | original absolute path in watch directory |
| `suggested_folder` | String(128) | nullable | AI-suggested category (pending user confirm) |
| `suggested_filename` | String(500) | nullable | AI-suggested title/rename (pending user confirm) |
**`document_categories`**
@@ -318,6 +329,7 @@ Unique constraint: `(group_id, user_id)`
|--------|------|
| `0001` | `create_doc_tables` |
| `0002` | `add_document_title` |
| `0003` | `add_watch_columns` |
---
@@ -407,6 +419,10 @@ Unique constraint: `(group_id, user_id)`
| GET | `/api/documents/{id}/file` | Download PDF (streaming) |
| POST | `/api/documents/{id}/categories/{cat_id}` | Assign category |
| DELETE | `/api/documents/{id}/categories/{cat_id}` | Remove category |
| POST | `/api/documents/{id}/suggestions/folder/confirm` | Confirm AI folder suggestion |
| POST | `/api/documents/{id}/suggestions/folder/reject` | Reject AI folder suggestion |
| POST | `/api/documents/{id}/suggestions/filename/confirm` | Confirm AI filename suggestion |
| POST | `/api/documents/{id}/suggestions/filename/reject` | Reject AI filename suggestion |
### Categories (`/api/documents/categories/*`) — authenticated, proxied to doc-service
@@ -417,6 +433,17 @@ Unique constraint: `(group_id, user_id)`
| PATCH | `/api/documents/categories/{id}` | Rename |
| DELETE | `/api/documents/categories/{id}` | Delete (204) |
### Plugins (`/api/plugins`) — authenticated, auth-per-plugin
| Method | Path | Description |
|--------|------|-------------|
| GET | `/api/plugins` | List plugins accessible to current user |
| GET | `/api/plugins/{id}/manifest` | Plugin manifest with settings JSON Schema (auth-gated) |
| GET | `/api/plugins/{id}/settings` | Proxy to feature `/plugin/settings` (auth-gated) |
| PATCH | `/api/plugins/{id}/settings` | Proxy to feature `/plugin/settings` (auth-gated) |
Auth: is_superuser OR member of group listed in manifest `required_groups`. Returns 404 (not 403) to hide existence.
### AI-service (internal only — not exposed to browser)
| Method | Path | Description |
@@ -442,6 +469,7 @@ Unique constraint: `(group_id, user_id)`
| `/apps/ai/settings/admin` | `AIAdminSettingsPage` | AdminRoute |
| `/profile` | `ProfilePage` | PrivateRoute |
| `/settings` | `SettingsPage` | PrivateRoute |
| `/settings/plugins/:id` | `PluginSettingsPage` | PrivateRoute (auth enforced per-plugin by backend) |
| `/admin` | `AdminPage` (→ `/admin/users`) | AdminRoute |
| `/admin/users` | `AdminUsersPage` | AdminRoute |
| `/admin/groups` | `AdminGroupsPage` | AdminRoute |
@@ -566,6 +594,9 @@ Adding a new API call:
["categories"] // document categories
["documents", params] // document list (params object for cache isolation)
["document", id] // single document
["plugins"] // accessible plugin list (filtered by user access)
["plugin-manifest", id] // plugin manifest (cached)
["plugin-settings", id] // plugin current settings
```
**Mutation pattern**:
@@ -708,7 +739,7 @@ Use `validation_alias` when the ORM field name differs from the JSON key (e.g.,
| `db` | postgres:16-alpine | 5432 | 70:70 | `postgres_data` | backend-net |
| `backend` | python:3.12-slim | 8000 | 1001:1001 | `app_config` | backend-net |
| `ai-service` | python:3.12-slim | 8010 | 1001:1001 | `app_config` | backend-net |
| `doc-service` | python:3.12-slim | 8001 | 1001:1001 | `doc_data`, `app_config` | backend-net |
| `doc-service` | python:3.12-slim | 8001 | 1001:1001 | `doc_data`, `watch_data`, `app_config` | backend-net |
| `frontend` | nginx-unprivileged:alpine | 8080 | 1001:1001 | — | backend-net, frontend-net |
### Volumes
@@ -717,6 +748,7 @@ Use `validation_alias` when the ORM field name differs from the JSON key (e.g.,
|--------|-----------|---------|
| `postgres_data` | `/var/lib/postgresql/data` | PostgreSQL data |
| `doc_data` | `/data/documents` | Uploaded PDF files |
| `watch_data` | `/data/watch` | Watch directory (bind-mount NAS/Nextcloud via docker-compose.override.yml) |
| `app_config` | `/config` | Per-service runtime config JSON files |
### Networks
@@ -816,6 +848,100 @@ Always run `git push` immediately after every `git commit`.
---
### Feature branch & isolated test environment
Every non-trivial implementation (anything beyond a one-line fix or doc change) **must** follow this workflow:
#### 1 — Create a feature branch
After the planning phase is approved, branch off `main`:
```bash
git checkout main && git pull
git checkout -b feat/<slug> # e.g. feat/color-mode, feat/admin-appearance
```
#### 2 — Spin up an isolated Docker stack for the feature
A dedicated compose stack runs alongside the main dev stack so both can be tested independently.
**Find the next free port** (main dev stack owns 5173):
```bash
for port in $(seq 5174 5200); do
lsof -iTCP:$port -sTCP:LISTEN -t &>/dev/null || { echo "$port"; break; }
done
```
Use the first free port returned (call it `$PORT`).
**Create a per-feature override file** at `docker-compose.feat-<slug>.yml` (gitignored):
```yaml
# docker-compose.feat-<slug>.yml — feature test stack, never committed to main
services:
frontend:
ports:
- "$PORT:8080" # e.g. 5174:8080
container_name: frontend-<slug>
backend:
container_name: backend-<slug>
doc-service:
container_name: doc-service-<slug>
ai-service:
container_name: ai-service-<slug>
db:
container_name: db-<slug>
networks:
backend-net:
name: backend-net-<slug>
frontend-net:
name: frontend-net-<slug>
```
**Start the feature stack**:
```bash
docker compose -f docker-compose.yml \
-f docker-compose.dev.yml \
-f docker-compose.feat-<slug>.yml \
--project-name <slug> up --build
```
The feature frontend is now reachable at `http://localhost:$PORT`.
The main dev stack continues running unaffected on `:5173`.
#### 3 — Develop on the feature branch
All code changes happen on `feat/<slug>`. Commit and push normally:
```bash
git add <files>
git commit -m "feat: <description>"
git push -u origin feat/<slug>
```
#### 4 — Confirm functionality
Before merging, verify all of the following on `http://localhost:$PORT`:
- [ ] Login and registration work end-to-end
- [ ] The specific feature works as intended
- [ ] No regressions visible in the UI
- [ ] Backend logs show no unexpected errors: `docker compose -p <slug> logs backend`
- [ ] Migrations (if any) applied cleanly: `docker compose -p <slug> exec backend alembic upgrade head`
#### 5 — Merge to main
Once all checks pass:
```bash
git checkout main
git merge --no-ff feat/<slug> -m "Merge feat/<slug>: <description>"
git push
git branch -d feat/<slug>
git push origin --delete feat/<slug>
```
#### 6 — Tear down the feature stack
```bash
docker compose -f docker-compose.yml \
-f docker-compose.dev.yml \
-f docker-compose.feat-<slug>.yml \
--project-name <slug> down --volumes --remove-orphans
rm docker-compose.feat-<slug>.yml
```
---
### Infrastructure change protocol
After **any** change to Dockerfiles, `docker-compose*.yml`, `nginx.conf`, or setup scripts:
+16
View File
@@ -82,6 +82,21 @@ All `/api/documents/*` and `/api/documents/categories/*` requests are transparen
- Strips hop-by-hop headers + `content-length`, `accept-encoding`, `content-type`
- Returns `Response` (not `StreamingResponse`) to avoid content-length/chunked conflicts
### Plugin system (`/api/plugins`)
Generic extension/plugin infrastructure — **zero feature-specific code in backend**. Feature containers self-describe via `GET /plugin/manifest`.
| Method | Path | Auth | Description |
|--------|------|------|-------------|
| `GET` | `/api/plugins` | user | List plugins accessible to current user |
| `GET` | `/api/plugins/{id}/manifest` | user | Cached manifest for a plugin (404 if not accessible) |
| `GET` | `/api/plugins/{id}/settings` | user | Proxy to feature `GET /plugin/settings` |
| `PATCH` | `/api/plugins/{id}/settings` | user | Proxy to feature `PATCH /plugin/settings` |
Access is controlled by the manifest: `allow_superuser` for admins; `required_groups` for group members. `check_plugin_access(plugin_id, user, db)` in `deps.py` enforces this.
During each health poll, `service_health.py` also fetches `GET /plugin/manifest` from healthy services and caches it. New feature containers that expose `/plugin/manifest` automatically appear in the Extensions sidebar — no backend code changes required.
### Database models
| Model | Table | Notes |
@@ -137,6 +152,7 @@ Browser (port 5173 dev / 80 prod)
## Future work
- [x] Groups system: `groups`, `group_memberships` tables; admin CRUD; add/remove members
- [x] Generic plugin infrastructure: manifest contract, `/api/plugins` proxy router, `check_plugin_access`
- [ ] App permissions registry: `group_app_permissions` table; AppsPage filtered by group grants
- [ ] Doc sharing via group membership
- [ ] App permissions registry: `user_app_permissions (user_id, app_key)`; AppsPage filtered by grants
+41
View File
@@ -43,3 +43,44 @@ async def get_current_admin(
detail="Not found",
)
return current_user
async def check_plugin_access(
plugin_id: str,
current_user: User,
db: AsyncSession,
) -> bool:
"""
Return True if the user may access the given plugin's settings.
Access is granted when any of these conditions holds:
1. The user is a superuser AND the manifest allows superuser access.
2. The user is a member of one of the groups listed in manifest.access.required_groups.
Returns False (not raises) so callers can decide how to respond.
"""
from app.models.group import Group, GroupMembership
from app.services.service_health import get_cached_manifest
manifest = get_cached_manifest(plugin_id)
if manifest is None:
return False
access = manifest.get("access", {})
if current_user.is_superuser and access.get("allow_superuser", True):
return True
for group_name in access.get("required_groups", []):
result = await db.execute(
select(GroupMembership)
.join(Group, Group.id == GroupMembership.group_id)
.where(
Group.name == group_name,
GroupMembership.user_id == current_user.id,
)
)
if result.scalar_one_or_none() is not None:
return True
return False
+2 -1
View File
@@ -6,7 +6,7 @@ from fastapi.middleware.cors import CORSMiddleware
from app.core.app_config import seed_builtin_themes
from app.core.config import settings
from app.routers import admin, auth, categories_proxy, documents_proxy, groups, profile, services, users
from app.routers import admin, auth, categories_proxy, documents_proxy, groups, plugins, profile, services, users
from app.routers import settings as settings_router
from app.services.service_health import check_all, health_check_loop, register_services
@@ -46,6 +46,7 @@ app.include_router(admin.router, prefix="/api/admin", tags=["admin"])
app.include_router(groups.router, prefix="/api/admin/groups", tags=["admin"])
app.include_router(settings_router.router, prefix="/api/settings", tags=["settings"])
app.include_router(services.router, prefix="/api/services", tags=["services"])
app.include_router(plugins.router, prefix="/api/plugins", tags=["plugins"])
# categories_proxy MUST be registered before documents_proxy —
# otherwise /api/documents/{path:path} swallows /api/documents/categories/*
app.include_router(
+125
View File
@@ -0,0 +1,125 @@
"""
Generic plugin proxy.
Feature containers advertise themselves via GET /plugin/manifest. The backend
health-poller caches those manifests. This router exposes them to the browser
through auth-gated endpoints so the frontend never needs to know about specific
features.
Routes:
GET /api/plugins → list accessible plugins for current user
GET /api/plugins/{id}/manifest → cached manifest (404 if not accessible)
GET /api/plugins/{id}/settings → proxy to feature /plugin/settings
PATCH /api/plugins/{id}/settings → proxy to feature /plugin/settings
"""
import httpx
from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import Response
from sqlalchemy.ext.asyncio import AsyncSession
from app.database import get_db
from app.deps import check_plugin_access, get_current_user
from app.models.user import User
from app.services.service_health import _REGISTRY, get_cached_manifest, get_service_url
router = APIRouter()
_HOP_BY_HOP = frozenset([
"connection",
"keep-alive",
"proxy-authenticate",
"proxy-authorization",
"te",
"trailers",
"transfer-encoding",
"upgrade",
"host",
"accept-encoding",
])
_STRIP_RESPONSE = frozenset([*_HOP_BY_HOP, "content-length", "content-type"])
async def _proxy(plugin_id: str, method: str, path: str, body: bytes | None,
content_type: str | None = None) -> Response:
"""Forward a request to the feature container's plugin endpoint."""
url = get_service_url(plugin_id)
if url is None:
raise HTTPException(status_code=404, detail="Not found")
headers: dict[str, str] = {}
if content_type:
headers["content-type"] = content_type
try:
async with httpx.AsyncClient(base_url=url, timeout=30.0) as client:
resp = await client.request(method, path, content=body, headers=headers)
except httpx.RequestError as exc:
raise HTTPException(status_code=502, detail=f"Plugin service unreachable: {exc}")
resp_headers = {k: v for k, v in resp.headers.items() if k.lower() not in _STRIP_RESPONSE}
return Response(
content=resp.content,
status_code=resp.status_code,
headers=resp_headers,
media_type=resp.headers.get("content-type", "application/json"),
)
@router.get("")
async def list_plugins(
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> list[dict]:
"""Return the list of plugins the current user may access."""
accessible = []
for svc in _REGISTRY:
manifest = get_cached_manifest(svc.id)
if manifest is None:
continue
if await check_plugin_access(svc.id, current_user, db):
accessible.append({
"id": manifest["id"],
"name": manifest["name"],
"icon": manifest.get("icon", "package"),
"version": manifest.get("version", ""),
})
return accessible
@router.get("/{plugin_id}/manifest")
async def get_plugin_manifest(
plugin_id: str,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> dict:
if not await check_plugin_access(plugin_id, current_user, db):
raise HTTPException(status_code=404, detail="Not found")
manifest = get_cached_manifest(plugin_id)
if manifest is None:
raise HTTPException(status_code=404, detail="Not found")
return manifest
@router.get("/{plugin_id}/settings")
async def get_plugin_settings(
plugin_id: str,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> Response:
if not await check_plugin_access(plugin_id, current_user, db):
raise HTTPException(status_code=404, detail="Not found")
return await _proxy(plugin_id, "GET", "/plugin/settings", None)
@router.patch("/{plugin_id}/settings")
async def update_plugin_settings(
plugin_id: str,
request: Request,
current_user: User = Depends(get_current_user),
db: AsyncSession = Depends(get_db),
) -> Response:
if not await check_plugin_access(plugin_id, current_user, db):
raise HTTPException(status_code=404, detail="Not found")
body = await request.body()
content_type = request.headers.get("content-type", "application/json")
return await _proxy(plugin_id, "PATCH", "/plugin/settings", body, content_type)
+40 -3
View File
@@ -2,8 +2,9 @@
Background health-checker for registered feature services.
Polls each service's /health endpoint every POLL_INTERVAL seconds and stores
the result in an in-memory dict. The REST layer reads from that dict — no DB,
no blocking calls on the request path.
the result in an in-memory dict. Also fetches /plugin/manifest when available
and caches it so the plugin proxy can serve it without per-request network calls.
The REST layer reads from that dict — no DB, no blocking calls on the request path.
"""
import asyncio
import logging
@@ -35,10 +36,13 @@ _REGISTRY: list[ServiceDefinition] = []
# id → True/False/None (None = not yet checked)
_health: dict[str, bool | None] = {}
# id → plugin manifest dict, or None if the service has no plugin manifest
_manifests: dict[str, dict | None] = {}
def register_services(doc_service_url: str, ai_service_url: str) -> None:
"""Called once during app startup to populate the registry from config."""
global _REGISTRY, _health
global _REGISTRY, _health, _manifests
_REGISTRY = [
ServiceDefinition(
@@ -62,6 +66,7 @@ def register_services(doc_service_url: str, ai_service_url: str) -> None:
]
_health = {svc.id: None for svc in _REGISTRY}
_manifests = {svc.id: None for svc in _REGISTRY}
logger.info("Service registry initialised with %d services", len(_REGISTRY))
@@ -88,6 +93,25 @@ async def _check_service(svc: ServiceDefinition) -> None:
else:
logger.warning("Service %s is now UNHEALTHY", svc.id)
# Opportunistically fetch plugin manifest when the service is healthy
if healthy:
await _fetch_manifest(svc)
async def _fetch_manifest(svc: ServiceDefinition) -> None:
"""Try to GET /plugin/manifest from the service; cache result (or None)."""
url = f"{svc.internal_url}/plugin/manifest"
try:
async with httpx.AsyncClient(timeout=5.0) as client:
resp = await client.get(url)
if resp.status_code == 200:
_manifests[svc.id] = resp.json()
else:
_manifests[svc.id] = None
except Exception:
# Service doesn't have a plugin manifest — not an error
_manifests[svc.id] = None
async def check_all() -> None:
"""Run health checks for all registered services concurrently."""
@@ -125,3 +149,16 @@ def get_all_statuses() -> list[dict]:
}
for svc in _REGISTRY
]
def get_cached_manifest(service_id: str) -> dict | None:
"""Return the cached plugin manifest for a service, or None if unavailable."""
return _manifests.get(service_id)
def get_service_url(service_id: str) -> str | None:
"""Return the internal URL for a registered service, or None if unknown."""
for svc in _REGISTRY:
if svc.id == service_id:
return svc.internal_url
return None
@@ -0,0 +1,44 @@
# 2026-04-18 — Generic Plugin Architecture + Watch Directory Feature
**Timestamp:** 2026-04-18T00:00:00Z
## Summary
Implemented a generic plugin/extension infrastructure that allows feature containers to self-describe their settings via a manifest contract, with no feature-specific code required in the backend or frontend. Built the watch-directory feature entirely inside the doc-service container as the first plugin consumer.
## Files Added
| File | Description |
|------|-------------|
| `backend/app/routers/plugins.py` | Generic plugin proxy: `GET/PATCH /api/plugins`, `/api/plugins/{id}/manifest`, `/api/plugins/{id}/settings` |
| `frontend/src/components/PluginSchemaForm.tsx` | JSON Schema → React form renderer (boolean/string/number/readOnly) |
| `frontend/src/pages/PluginSettingsPage.tsx` | Generic plugin settings page driven by manifest |
| `features/doc-service/app/routers/plugin.py` | Doc-service plugin endpoints: `/plugin/manifest`, `/plugin/settings` |
| `features/doc-service/app/services/file_watcher.py` | watchdog-based PDF watcher with startup scan, folder-to-category mapping, no-remove policy |
| `features/doc-service/alembic/versions/0003_add_watch_columns.py` | Migration: source, watch_path, suggested_folder, suggested_filename |
| `dev-watch/.gitkeep` | Dev bind-mount directory for local file watcher testing |
## Files Modified
| File | Description |
|------|-------------|
| `backend/app/services/service_health.py` | Also fetches and caches `/plugin/manifest` from healthy services |
| `backend/app/deps.py` | Added `check_plugin_access(plugin_id, user, db)` helper |
| `backend/app/main.py` | Mounted `/api/plugins` router |
| `frontend/src/api/client.ts` | Added plugin API functions and suggestion confirm/reject functions; extended `DocumentOut` with new fields |
| `frontend/src/components/Sidebar.tsx` | Added dynamic "Extensions" section populated from `/api/plugins` |
| `frontend/src/App.tsx` | Added `/settings/plugins/:id` route |
| `features/doc-service/app/models/document.py` | Added 4 new columns: source, watch_path, suggested_folder, suggested_filename |
| `features/doc-service/app/schemas/document.py` | Exposed 4 new fields in `DocumentOut` |
| `features/doc-service/app/services/config_reader.py` | Added storage config defaults, `get_storage_config()`, `save_storage_config()` |
| `features/doc-service/app/routers/documents.py` | Watch-user visibility (`OR user_id = "watch"`); 4 suggestion endpoints |
| `features/doc-service/app/routers/categories.py` | Watch-owned categories included in list |
| `features/doc-service/app/main.py` | Lifespan watcher start/stop; plugin router mounted |
| `features/doc-service/pyproject.toml` | Added `watchdog>=4.0` |
| `features/doc-service/Dockerfile` | Pre-create `/data/watch` |
| `docker-compose.yml` | Added `watch_data` named volume; mounted to doc-service |
| `docker-compose.dev.yml` | Dev bind-mount `./dev-watch:/data/watch` |
| `CLAUDE.md` | Updated all affected sections (models, migrations, endpoints, routes, tree, query keys, volumes) |
| `backend/STATUS.md` | Plugin system section added |
| `features/doc-service/STATUS.md` | Watch feature, plugin endpoints, migration 0003, updated architecture diagram |
| `frontend/STATUS.md` | Extensions sidebar, PluginSchemaForm, PluginSettingsPage, new API functions |
+1
View File
@@ -0,0 +1 @@
# Watch directory for development testing
+1
View File
@@ -34,3 +34,4 @@ services:
env_file: ./features/doc-service/.env
volumes:
- ./features/doc-service:/app
- ./dev-watch:/data/watch # bind-mount local folder for easy testing
+2
View File
@@ -70,6 +70,7 @@ services:
AI_SERVICE_URL: http://ai-service:8010
volumes:
- doc_data:/data/documents
- watch_data:/data/watch
- app_config:/config
depends_on:
db:
@@ -98,6 +99,7 @@ services:
volumes:
postgres_data:
doc_data: # PDF files persisted across restarts
watch_data: # Watch directory — bind-mount your NAS/Nextcloud here via docker-compose.override.yml
app_config: # Per-service runtime config JSON files
networks:
+1 -1
View File
@@ -17,7 +17,7 @@ RUN groupadd --gid 1001 appuser && \
# Pre-create data and config dirs with correct ownership.
# Named volumes mounted over these paths will inherit ownership on first creation.
RUN mkdir -p /data/documents /config && chown -R appuser:appuser /data /config
RUN mkdir -p /data/documents /data/watch /config && chown -R appuser:appuser /data /config
WORKDIR /app
+70 -28
View File
@@ -2,11 +2,11 @@
## What it is
PDF document management microservice. Handles upload, storage, async AI-powered extraction, tagging, categorisation, and retrieval of PDF documents on a per-user basis.
PDF document management microservice. Handles upload, storage, async AI-powered extraction, tagging, categorisation, and retrieval of PDF documents on a per-user basis. Also supports automatic ingestion from a mounted watch directory (NAS, Nextcloud, Syncthing, etc.).
Port: `8001` (internal only, not exposed to host). All traffic arrives via the backend proxy (`backend/app/routers/documents_proxy.py`), which injects the authenticated `x-user-id` header.
Database: shared PostgreSQL instance, isolated via `alembic_version_doc_service` Alembic version table. Storage: `/data/documents/` (Docker named volume `doc_data`).
Database: shared PostgreSQL instance, isolated via `alembic_version_doc_service` Alembic version table. Storage: `/data/documents/` (Docker named volume `doc_data`). Watch directory: `/data/watch` (named volume `watch_data` in prod; bind-mount in dev via `docker-compose.dev.yml`).
---
@@ -31,13 +31,25 @@ Database: shared PostgreSQL instance, isolated via `alembic_version_doc_service`
| `PATCH` | `/documents/{id}/type` | Update document type |
| `PATCH` | `/documents/{id}/tags` | Replace tag list (dedup, preserve order) |
| `PATCH` | `/documents/{id}/title` | Update editable title |
| `GET` | `/documents/categories` | List all categories for the user |
| `GET` | `/documents/categories` | List all categories (user + watch) |
| `POST` | `/documents/categories` | Create a category; triggers re-analysis of documents in similar categories |
| `POST` | `/documents/{id}/reprocess` | Reset status to pending and re-run AI extraction; 409 if already pending/processing |
| `PATCH` | `/documents/categories/{id}` | Rename a category |
| `DELETE` | `/documents/categories/{id}` | Delete a category |
| `POST` | `/documents/{id}/categories/{cat_id}` | Assign category to document |
| `DELETE` | `/documents/{id}/categories/{cat_id}` | Remove category from document |
| `POST` | `/documents/{id}/suggestions/folder/confirm` | Apply AI folder suggestion → create/find category + assign |
| `POST` | `/documents/{id}/suggestions/folder/reject` | Clear AI folder suggestion |
| `POST` | `/documents/{id}/suggestions/filename/confirm` | Apply AI filename suggestion → set title |
| `POST` | `/documents/{id}/suggestions/filename/reject` | Clear AI filename suggestion |
### Plugin endpoints (internal — backend calls only)
| Method | Path | Description |
|--------|------|-------------|
| `GET` | `/plugin/manifest` | Static manifest: metadata, JSON Schema for settings, access rules |
| `GET` | `/plugin/settings` | Current watch/storage config values |
| `PATCH` | `/plugin/settings` | Update watch/storage config (persisted to `/config/doc_service_config.json`) |
### Pagination & filtering (`GET /documents`)
@@ -59,21 +71,27 @@ Response: `{ items: [...], total: N, page: N, pages: N }`
### Document schema
```
id UUID
user_id string (from x-user-id header)
filename original filename
title AI-suggested editable title (nullable)
file_size bytes
status pending | processing | done | failed
document_type AI-classified type (nullable)
extracted_data JSON string — all AI-extracted fields
tags JSON array string — editable tags
error_message set if status=failed
created_at upload timestamp
processed_at when extraction finished
categories many-to-many via category_assignments
id UUID
user_id string (from x-user-id header; "watch" for watch-ingested docs)
filename original filename
title AI-suggested editable title (nullable)
file_size bytes
status pending | processing | done | failed
document_type AI-classified type (nullable)
extracted_data JSON string — all AI-extracted fields
tags JSON array string — editable tags
error_message set if status=failed
created_at upload timestamp
processed_at when extraction finished
source "upload" (default) or "watch"
watch_path original absolute path in watch directory (nullable)
suggested_folder AI-suggested category name, pending user confirm (nullable)
suggested_filename AI-suggested title/rename, pending user confirm (nullable)
categories many-to-many via category_assignments
```
Watch-ingested documents (`user_id = "watch"`) are visible to all authenticated users.
### AI extraction (via ai-service)
System prompt and user prompt template are loaded at runtime from `doc_service_config.json` (`system_prompts` key). Defaults are built into the service and used as fallback if the config key is absent. Changes made via the AI Settings UI take effect within 30 seconds (config cache TTL).
@@ -93,12 +111,25 @@ Prompt sends the first 50 000 chars of extracted text. Expected JSON response in
```
Env override: `DOC_MAX_PDF_MB`
### Watch directory feature
Controlled via plugin settings (UI accessible to superusers and `doc-service-admin` group members):
- `watch_enabled` — toggle file watching (default: false)
- `watch_path` — mount point (read-only, `/data/watch`; override via Docker volume)
- `ai_folder_suggestion` — AI suggests a category for each ingested doc (user confirms)
- `ai_folder_default` — default category when AI suggestion is disabled
- `ai_rename_suggestion` — AI suggests a title for each ingested doc (user confirms)
On startup scan, the watcher walks the watch directory and ingests any PDFs not already in the database (idempotency check by `watch_path`). Subfolders are automatically mapped to categories (e.g. `watch/invoices/bill.pdf` → category "invoices"). No-remove policy: deleting a file from the watch directory does not delete the document record.
### Database migrations
| Revision | Description |
|----------|-------------|
| 0001 | Initial schema (documents, categories, category_assignments) |
| 0002 | Add `title` column to documents |
| 0003 | Add `source`, `watch_path`, `suggested_folder`, `suggested_filename` columns |
Run automatically on container start via `alembic upgrade head`.
@@ -109,18 +140,26 @@ Run automatically on container start via `alembic upgrade head`.
```
backend (proxy) → doc-service:8001
documents.py router
┌────────┴────────┐
upload list/get/patch
save_upload() pdfplumber extraction
│ │
Document(status=pending) ai_client.classify_document()
│ │
BackgroundTask ai-service:8010/chat
│ │
process_document() JSON result → update doc row
┌────────────┼────────────────────┐
documents.py categories.py plugin.py
│ │ (internal only)
┌────────┴────────┐
upload list/get/patch/suggest
save_upload() pdfplumber extraction
│ │
Document(status=pending) ai_client.classify_document()
│ │
BackgroundTask ai-service:8010/chat
│ │
process_document() JSON result → update doc row
file_watcher.py (watchdog Observer, daemon thread)
├── _PdfEventHandler.on_created / on_moved
│ └── asyncio.run_coroutine_threadsafe(ingest_file, loop)
└── _scan_existing() on startup (catches offline gaps)
```
---
@@ -140,6 +179,8 @@ backend (proxy) → doc-service:8001
## Future work
- [x] `POST /documents/{id}/reprocess` — re-run AI extraction
- [x] Watch directory feature with file watcher, startup scan, folder-to-category mapping, AI suggestion toggles
- [x] Plugin manifest endpoint (`/plugin/manifest`, `/plugin/settings`) for generic settings UI
- [ ] Advanced filter: query `extracted_data` JSON fields (vendor, due_date, amount) — requires PostgreSQL `jsonb` column or indexed virtual columns
- [ ] Bulk operations endpoint
- [ ] Document sharing via groups (blocked on groups/permissions system in backend)
@@ -147,3 +188,4 @@ backend (proxy) → doc-service:8001
- [ ] Rate limiting on upload endpoint
- [ ] Soft delete with restore
- [ ] Category rename / delete with cascade handling
- [ ] Frontend UI for suggestion badges (suggested_folder / suggested_filename confirm/reject buttons)
@@ -0,0 +1,30 @@
"""add watch directory columns to documents
Revision ID: 0003
Revises: 0002
Create Date: 2026-04-18
"""
from typing import Sequence, Union
import sqlalchemy as sa
from alembic import op
revision: str = "0003"
down_revision: Union[str, None] = "0002"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
op.add_column("documents", sa.Column("source", sa.String(16), nullable=False, server_default="upload"))
op.add_column("documents", sa.Column("watch_path", sa.String(), nullable=True))
op.add_column("documents", sa.Column("suggested_folder", sa.String(128), nullable=True))
op.add_column("documents", sa.Column("suggested_filename", sa.String(500), nullable=True))
def downgrade() -> None:
op.drop_column("documents", "suggested_filename")
op.drop_column("documents", "suggested_folder")
op.drop_column("documents", "watch_path")
op.drop_column("documents", "source")
+31 -1
View File
@@ -1,15 +1,45 @@
import asyncio
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI
from app.core.config import settings
from app.routers import categories, documents
from app.routers import plugin as plugin_router
app = FastAPI(title=settings.PROJECT_NAME)
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
loop = asyncio.get_running_loop()
watcher = None
try:
from app.services.config_reader import get_storage_config
storage_config = await get_storage_config()
if storage_config.get("watch_enabled"):
from app.services.file_watcher import FileWatcherService
watcher = FileWatcherService(loop)
await watcher.start(storage_config["watch_path"], storage_config)
except Exception as exc:
logger.warning("[doc-service] File watcher could not start: %s", exc)
yield
if watcher is not None:
await watcher.stop()
app = FastAPI(title=settings.PROJECT_NAME, lifespan=lifespan)
# No CORS — this service is only reachable from the main backend on backend-net.
# All browser traffic goes through the main backend proxy.
app.include_router(documents.router, prefix="/documents", tags=["documents"])
app.include_router(categories.router, prefix="/categories", tags=["categories"])
app.include_router(plugin_router.router, prefix="/plugin", tags=["plugin"])
@app.get("/health")
@@ -27,6 +27,12 @@ class Document(Base):
)
processed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
# Watch-directory ingestion fields (migration 0003)
source: Mapped[str] = mapped_column(String(16), nullable=False, default="upload")
watch_path: Mapped[str | None] = mapped_column(String, nullable=True)
suggested_folder: Mapped[str | None] = mapped_column(String(128), nullable=True)
suggested_filename: Mapped[str | None] = mapped_column(String(500), nullable=True)
category_assignments: Mapped[list["CategoryAssignment"]] = relationship(
"CategoryAssignment", back_populates="document", cascade="all, delete-orphan"
)
@@ -5,6 +5,8 @@ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import or_
from app.database import AsyncSessionLocal, get_db
from app.deps import get_user_id
from app.models.category import DocumentCategory
@@ -15,6 +17,9 @@ from app.services.ai_client import classify_document
router = APIRouter()
# Sentinel user_id for watch-ingested categories — must match documents.py
_WATCH_USER_ID = "watch"
_SIMILARITY_THRESHOLD = 0.4
@@ -81,9 +86,10 @@ async def list_categories(
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> list[DocumentCategory]:
# Include watch-ingested categories so they appear in the sidebar/filter
result = await db.execute(
select(DocumentCategory)
.where(DocumentCategory.user_id == user_id)
.where(or_(DocumentCategory.user_id == user_id, DocumentCategory.user_id == _WATCH_USER_ID))
.order_by(DocumentCategory.name)
)
return result.scalars().all()
+106 -6
View File
@@ -26,13 +26,21 @@ router = APIRouter()
_DEFAULT_MAX_BYTES = 20 * 1024 * 1024
# Sentinel user_id used for watch-directory-ingested documents.
# These documents are visible to all authenticated users.
_WATCH_USER_ID = "watch"
# ── Helpers ───────────────────────────────────────────────────────────────────
async def _get_user_doc(doc_id: str, user_id: str, db: AsyncSession) -> Document:
"""Fetch a document owned by user_id OR a watch-ingested document (visible to all)."""
result = await db.execute(
select(Document)
.where(Document.id == doc_id, Document.user_id == user_id)
.where(
Document.id == doc_id,
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
)
.options(
selectinload(Document.category_assignments)
.selectinload(CategoryAssignment.category)
@@ -61,6 +69,10 @@ def _doc_with_categories(doc: Document) -> DocumentOut:
created_at=doc.created_at,
processed_at=doc.processed_at,
categories=cats,
source=doc.source,
watch_path=doc.watch_path,
suggested_folder=doc.suggested_folder,
suggested_filename=doc.suggested_filename,
)
@@ -183,7 +195,8 @@ async def list_documents(
sort_expr = sort_col.desc() if order == "desc" else sort_col.asc()
# Build filter conditions once and reuse for both count + items queries.
conditions = [Document.user_id == user_id]
# Watch-ingested documents (user_id = "watch") are visible to all users.
conditions = [or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID)]
if status:
conditions.append(Document.status == status)
if document_type:
@@ -247,7 +260,10 @@ async def get_document_status(
db: AsyncSession = Depends(get_db),
) -> Document:
result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
select(Document).where(
Document.id == doc_id,
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
)
)
doc = result.scalar_one_or_none()
if doc is None:
@@ -347,7 +363,10 @@ async def download_file(
db: AsyncSession = Depends(get_db),
) -> StreamingResponse:
result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
select(Document).where(
Document.id == doc_id,
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
)
)
doc = result.scalar_one_or_none()
if doc is None:
@@ -374,9 +393,12 @@ async def assign_category(
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
# Verify both belong to this user
# Verify the document is accessible (own or watch-ingested)
doc_result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
select(Document).where(
Document.id == doc_id,
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
)
)
if doc_result.scalar_one_or_none() is None:
raise HTTPException(status_code=404, detail="Document not found")
@@ -418,3 +440,81 @@ async def remove_category(
if assignment:
await db.delete(assignment)
await db.commit()
# ── AI suggestion confirmation ────────────────────────────────────────────────
# These endpoints allow users to confirm or reject AI suggestions on
# watch-ingested documents. No disk mutations — suggestions only update the DB.
@router.post("/{doc_id}/suggestions/folder/confirm", status_code=204)
async def confirm_folder_suggestion(
doc_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
doc = await _get_user_doc(doc_id, user_id, db)
if not doc.suggested_folder:
raise HTTPException(status_code=400, detail="No folder suggestion pending")
# Find or create the suggested category under the watch sentinel user
cat_result = await db.execute(
select(DocumentCategory).where(
DocumentCategory.user_id == _WATCH_USER_ID,
DocumentCategory.name == doc.suggested_folder,
)
)
cat = cat_result.scalar_one_or_none()
if cat is None:
cat = DocumentCategory(user_id=_WATCH_USER_ID, name=doc.suggested_folder[:128])
db.add(cat)
await db.commit()
await db.refresh(cat)
# Assign if not already assigned
exists = await db.execute(
select(CategoryAssignment).where(
CategoryAssignment.document_id == doc_id,
CategoryAssignment.category_id == cat.id,
)
)
if exists.scalar_one_or_none() is None:
db.add(CategoryAssignment(document_id=doc_id, category_id=cat.id))
doc.suggested_folder = None
await db.commit()
@router.post("/{doc_id}/suggestions/folder/reject", status_code=204)
async def reject_folder_suggestion(
doc_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
doc = await _get_user_doc(doc_id, user_id, db)
doc.suggested_folder = None
await db.commit()
@router.post("/{doc_id}/suggestions/filename/confirm", status_code=204)
async def confirm_filename_suggestion(
doc_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
doc = await _get_user_doc(doc_id, user_id, db)
if not doc.suggested_filename:
raise HTTPException(status_code=400, detail="No filename suggestion pending")
doc.title = doc.suggested_filename
doc.suggested_filename = None
await db.commit()
@router.post("/{doc_id}/suggestions/filename/reject", status_code=204)
async def reject_filename_suggestion(
doc_id: str,
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
doc = await _get_user_doc(doc_id, user_id, db)
doc.suggested_filename = None
await db.commit()
@@ -0,0 +1,97 @@
"""
Plugin manifest and settings endpoints for doc-service.
These are internal-only — they are called by the main backend's generic plugin
proxy, never directly by the browser. No authentication is applied here because
the backend enforces access control before forwarding the request.
Endpoints:
GET /plugin/manifest → static manifest with JSON Schema for settings
GET /plugin/settings → current storage config values
PATCH /plugin/settings → update storage config (partial update)
"""
from fastapi import APIRouter
from pydantic import BaseModel
from app.services.config_reader import get_storage_config, save_storage_config
router = APIRouter()
_MANIFEST: dict = {
"id": "doc-service",
"name": "Document Service",
"icon": "file-text",
"version": "1.0",
"access": {
"allow_superuser": True,
"required_groups": ["doc-service-admin"],
},
"settings_schema": {
"type": "object",
"title": "Storage & Watch",
"properties": {
"watch_enabled": {
"type": "boolean",
"title": "Enable file watching",
"description": (
"Automatically ingest PDF files added to the mounted watch directory. "
"Requires a service restart to take effect after toggling."
),
},
"watch_path": {
"type": "string",
"title": "Watch path",
"readOnly": True,
"description": "Configured via Docker volume mount — edit docker-compose to change.",
},
"ai_folder_suggestion": {
"type": "boolean",
"title": "AI folder suggestion",
"description": (
"AI suggests a category for each ingested document. "
"You must confirm the suggestion before it is applied."
),
},
"ai_folder_default": {
"type": "string",
"title": "Default import category",
"description": "Category assigned automatically when AI folder suggestion is disabled.",
},
"ai_rename_suggestion": {
"type": "boolean",
"title": "AI rename suggestion",
"description": (
"AI suggests a document title for each ingested file. "
"You must confirm before it is applied."
),
},
},
},
}
class StorageSettingsUpdate(BaseModel):
watch_enabled: bool | None = None
ai_folder_suggestion: bool | None = None
ai_folder_default: str | None = None
ai_rename_suggestion: bool | None = None
# watch_path is intentionally excluded — it cannot be changed via API
@router.get("/manifest")
async def get_manifest() -> dict:
return _MANIFEST
@router.get("/settings")
async def get_settings() -> dict:
return await get_storage_config()
@router.patch("/settings")
async def update_settings(body: StorageSettingsUpdate) -> dict:
update = body.model_dump(exclude_none=True)
if "ai_folder_default" in update:
update["ai_folder_default"] = update["ai_folder_default"][:128].strip() or "imports"
await save_storage_config(update)
return await get_storage_config()
@@ -23,6 +23,10 @@ class DocumentOut(BaseModel):
created_at: datetime
processed_at: datetime | None
categories: list[CategoryOut] = []
source: str = "upload"
watch_path: str | None = None
suggested_folder: str | None = None
suggested_filename: str | None = None
model_config = {"from_attributes": True}
@@ -14,6 +14,14 @@ from pathlib import Path
from app.core.config import settings
_DEFAULT_STORAGE_CONFIG: dict = {
"watch_enabled": False,
"watch_path": "/data/watch",
"ai_folder_suggestion": False,
"ai_folder_default": "imports",
"ai_rename_suggestion": False,
}
_DEFAULT_SYSTEM_PROMPT = (
"You are a financial document analysis assistant. "
"Given the text extracted from a PDF document, return ONLY a JSON object "
@@ -43,6 +51,7 @@ _DEFAULT_USER_TEMPLATE = (
_DEFAULT_CONFIG: dict = {
"documents": {"max_pdf_bytes": 20 * 1024 * 1024},
"storage": _DEFAULT_STORAGE_CONFIG,
"system_prompts": {
"system": _DEFAULT_SYSTEM_PROMPT,
"user_template": _DEFAULT_USER_TEMPLATE,
@@ -64,6 +73,25 @@ def _read_config_sync() -> dict:
return _apply_env_overrides(base)
def _read_config_sync_raw() -> dict:
"""Read without env overrides — used when we need to write back to disk."""
path = Path(settings.CONFIG_PATH)
if not path.exists():
return deepcopy(_DEFAULT_CONFIG)
with open(path) as f:
return json.load(f)
def _write_config_sync(config: dict) -> None:
"""Atomically write config JSON to disk."""
path = Path(settings.CONFIG_PATH)
tmp = path.with_suffix(".tmp")
tmp.parent.mkdir(parents=True, exist_ok=True)
with open(tmp, "w") as f:
json.dump(config, f, indent=2)
os.replace(tmp, path)
def _apply_env_overrides(config: dict) -> dict:
cfg = deepcopy(config)
docs = cfg.setdefault("documents", {})
@@ -84,3 +112,22 @@ async def load_doc_config() -> dict:
_cache = data
_cache_at = now
return data
async def get_storage_config() -> dict:
"""Return storage config block, filling in defaults for any missing keys."""
config = await load_doc_config()
result = deepcopy(_DEFAULT_STORAGE_CONFIG)
result.update(config.get("storage", {}))
return result
async def save_storage_config(data: dict) -> None:
"""Merge data into the storage config block and persist to disk."""
global _cache, _cache_at
raw = await asyncio.to_thread(_read_config_sync_raw)
raw.setdefault("storage", {}).update(data)
await asyncio.to_thread(_write_config_sync, raw)
# Invalidate cache so next read picks up the new values
_cache = None
_cache_at = 0.0
@@ -0,0 +1,256 @@
"""
File-system watcher for the watch directory.
Uses the watchdog library to monitor a configured directory for new PDF files.
When a PDF is detected, it is automatically ingested into the document service
(copied to /data/documents, a DB record is created, and the AI pipeline runs).
Key design decisions:
- No-remove policy: on_deleted and on_moved events are intentionally ignored.
The watcher never deletes, moves, or modifies files on the watched volume.
- Watch documents use user_id="watch" as a sentinel so they are visible to
all authenticated users in the document list.
- Subfolder names map to categories: a file at invoices/bill.pdf is assigned
to a "invoices" category (auto-created if needed).
- Suggestions: if ai_folder_suggestion or ai_rename_suggestion are enabled,
the relevant fields are set on the document after AI processing so users
can confirm/reject from the UI.
- Thread → async bridge: watchdog runs in a daemon thread; asyncio coroutines
are dispatched from that thread via run_coroutine_threadsafe.
"""
import asyncio
import json
import logging
import uuid
from pathlib import Path
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
from app.database import AsyncSessionLocal
from app.models.category import DocumentCategory
from app.models.category_assignment import CategoryAssignment
from app.models.document import Document
from app.services.storage import save_upload
logger = logging.getLogger(__name__)
# Must match _WATCH_USER_ID in app/routers/documents.py
WATCH_USER_ID = "watch"
# ── Ingestion logic ───────────────────────────────────────────────────────────
async def ingest_file(path_str: str, watch_root: Path, config: dict) -> None:
"""
Ingest a single PDF file from the watch directory.
Idempotent: skips files that already have a non-failed document record.
"""
from sqlalchemy import select
path = Path(path_str)
if not path.exists() or not path.is_file():
return
async with AsyncSessionLocal() as db:
# Idempotency check — skip if already tracked (and not failed)
existing_result = await db.execute(
select(Document).where(Document.watch_path == path_str)
)
existing = existing_result.scalar_one_or_none()
if existing is not None and existing.status != "failed":
return
# Determine category from the first subfolder component
try:
rel = path.relative_to(watch_root)
folder_name = rel.parts[0] if len(rel.parts) > 1 else None
except ValueError:
folder_name = None
# Read file bytes
try:
file_data = path.read_bytes()
except OSError as exc:
logger.warning("[watcher] Cannot read %s: %s", path_str, exc)
return
# Save a copy to /data/documents/watch/{doc_id}.pdf
doc_id = existing.id if existing is not None else str(uuid.uuid4())
dest = await save_upload(file_data, WATCH_USER_ID, doc_id)
if existing is not None:
# Re-ingest a previously failed document
existing.file_path = str(dest)
existing.file_size = len(file_data)
existing.status = "pending"
existing.error_message = None
await db.commit()
else:
doc = Document(
id=doc_id,
user_id=WATCH_USER_ID,
source="watch",
watch_path=path_str,
filename=path.name,
file_path=str(dest),
file_size=len(file_data),
status="pending",
)
db.add(doc)
await db.commit()
# Auto-assign category from subfolder name
if folder_name:
cat_result = await db.execute(
select(DocumentCategory).where(
DocumentCategory.user_id == WATCH_USER_ID,
DocumentCategory.name == folder_name,
)
)
cat = cat_result.scalar_one_or_none()
if cat is None:
cat = DocumentCategory(user_id=WATCH_USER_ID, name=folder_name[:128])
db.add(cat)
await db.commit()
await db.refresh(cat)
exists_assign = await db.execute(
select(CategoryAssignment).where(
CategoryAssignment.document_id == doc_id,
CategoryAssignment.category_id == cat.id,
)
)
if exists_assign.scalar_one_or_none() is None:
db.add(CategoryAssignment(document_id=doc_id, category_id=cat.id))
await db.commit()
# Run AI pipeline (opens its own session internally)
from app.routers.documents import process_document
await process_document(doc_id)
# Set AI suggestions if enabled
if config.get("ai_folder_suggestion") or config.get("ai_rename_suggestion"):
await _apply_suggestions(doc_id, config)
async def _apply_suggestions(doc_id: str, config: dict) -> None:
"""Populate suggested_folder / suggested_filename after AI processing."""
from sqlalchemy import select
async with AsyncSessionLocal() as db:
result = await db.execute(select(Document).where(Document.id == doc_id))
doc = result.scalar_one_or_none()
if doc is None or doc.status != "done" or not doc.extracted_data:
return
try:
extracted = json.loads(doc.extracted_data)
except Exception:
return
changed = False
if config.get("ai_folder_suggestion"):
suggestions = extracted.get("suggested_categories", [])
if suggestions:
doc.suggested_folder = str(suggestions[0])[:128]
changed = True
if config.get("ai_rename_suggestion"):
title = extracted.get("title")
if title:
doc.suggested_filename = str(title)[:500]
changed = True
if changed:
await db.commit()
# ── Watchdog event handler ────────────────────────────────────────────────────
class _PdfEventHandler(FileSystemEventHandler):
def __init__(
self,
watch_root: Path,
loop: asyncio.AbstractEventLoop,
config: dict,
) -> None:
super().__init__()
self._watch_root = watch_root
self._loop = loop
self._config = config
def _dispatch_ingest(self, path_str: str) -> None:
if path_str.lower().endswith(".pdf"):
asyncio.run_coroutine_threadsafe(
ingest_file(path_str, self._watch_root, self._config),
self._loop,
)
def on_created(self, event): # type: ignore[override]
if not event.is_directory:
self._dispatch_ingest(event.src_path)
def on_moved(self, event): # type: ignore[override]
# Handles atomic rename/move (e.g. Nextcloud or Syncthing completing a sync)
if not event.is_directory:
self._dispatch_ingest(event.dest_path)
# on_deleted / on_modified: intentionally not overridden — no-remove policy
# ── Service ───────────────────────────────────────────────────────────────────
class FileWatcherService:
"""Manages the watchdog Observer lifecycle within the FastAPI lifespan."""
def __init__(self, loop: asyncio.AbstractEventLoop) -> None:
self._loop = loop
self._observer: Observer | None = None
self._watch_root: Path | None = None
self._config: dict = {}
async def start(self, watch_path: str, config: dict) -> None:
self._watch_root = Path(watch_path)
self._config = config
if not self._watch_root.exists():
logger.warning(
"[watcher] Watch path %s does not exist — file watching disabled",
watch_path,
)
return
handler = _PdfEventHandler(self._watch_root, self._loop, config)
self._observer = Observer()
self._observer.schedule(handler, watch_path, recursive=True)
self._observer.start()
logger.info("[watcher] started, watching %s", watch_path)
# Run startup scan as a background task so startup is not blocked
asyncio.create_task(self._scan_existing())
async def _scan_existing(self) -> None:
"""Ingest any PDFs already present in the watch directory."""
if self._watch_root is None:
return
logger.info("[watcher] scanning existing files in %s", self._watch_root)
count = 0
for pdf_path in sorted(self._watch_root.rglob("*.pdf")):
try:
await ingest_file(str(pdf_path), self._watch_root, self._config)
count += 1
except Exception as exc:
logger.warning("[watcher] scan error for %s: %s", pdf_path, exc)
logger.info("[watcher] startup scan complete — processed %d file(s)", count)
async def stop(self) -> None:
if self._observer is not None:
self._observer.stop()
await asyncio.to_thread(self._observer.join)
self._observer = None
logger.info("[watcher] stopped")
+1
View File
@@ -17,6 +17,7 @@ dependencies = [
"pdfplumber>=0.11",
"aiofiles>=23.0",
"python-multipart>=0.0.9",
"watchdog>=4.0",
]
[project.optional-dependencies]
+20 -2
View File
@@ -23,6 +23,7 @@ All API calls go through `src/api/client.ts` (single Axios instance, JWT injecte
| `/admin/groups` | `AdminGroupsPage` | Admin only |
| `/profile` | `ProfilePage` | Required |
| `/settings` | `SettingsPage` (placeholder) | Required |
| `/settings/plugins/:id` | `PluginSettingsPage` | Required (per-plugin access control) |
`PrivateRoute` redirects to `/login` when no token. `AdminRoute` redirects to `/` when not admin.
@@ -60,6 +61,12 @@ Cards are rendered dynamically from `GET /api/services` (polled every 30 s via T
- Sections auto-open when navigating to their route
- In collapsed (icons-only) mode, clicking the Apps icon navigates to `/apps`
**Extensions** section (dynamic):
- Populated from `GET /api/plugins` (polled via TanStack Query, `retry: false`)
- Only shown when the user has access to at least one plugin
- Each entry links to `/settings/plugins/:id`
- No code changes needed to add future plugin-enabled feature containers
### Documents page (`/apps/documents`)
**Upload:** PDF file input, 202 response, error display.
@@ -140,6 +147,10 @@ Key functions:
| `removeCategory(docId, catId)` | Remove |
| `updateDocumentTags(id, tags)` | `PATCH /documents/{id}/tags` |
| `updateDocumentTitle(id, title)` | `PATCH /documents/{id}/title` |
| `confirmFolderSuggestion(docId)` | `POST /documents/{id}/suggestions/folder/confirm` |
| `rejectFolderSuggestion(docId)` | `POST /documents/{id}/suggestions/folder/reject` |
| `confirmFilenameSuggestion(docId)` | `POST /documents/{id}/suggestions/filename/confirm` |
| `rejectFilenameSuggestion(docId)` | `POST /documents/{id}/suggestions/filename/reject` |
| `getAISettings()` | `GET /settings/ai` (masked) |
| `updateAISettings(data)` | `PATCH /settings/ai` |
| `testAIConnection()` | `POST /settings/ai/test` |
@@ -152,6 +163,10 @@ Key functions:
| `adminAddGroupMember(gId, uId)` | `POST /admin/groups/{gId}/members/{uId}` |
| `adminRemoveGroupMember(gId, uId)` | `DELETE /admin/groups/{gId}/members/{uId}` |
| `updateDocumentLimits(data)` | `PATCH /settings/documents/limits` |
| `getPlugins()` | `GET /plugins` — list accessible plugins |
| `getPluginManifest(id)` | `GET /plugins/{id}/manifest` |
| `getPluginSettings(id)` | `GET /plugins/{id}/settings` |
| `updatePluginSettings(id, data)` | `PATCH /plugins/{id}/settings` |
---
@@ -168,8 +183,10 @@ Key functions:
| Component | Path | Description |
|-----------|------|-------------|
| `AppShell` | `src/components/AppShell.tsx` | Layout wrapper: Sidebar + scrollable main content |
| `Sidebar` | `src/components/Sidebar.tsx` | Collapsible left nav (icons-only ↔ icons+labels) |
| `Sidebar` | `src/components/Sidebar.tsx` | Collapsible left nav; includes dynamic "Extensions" section |
| `ThemeToggle` | `src/components/ThemeToggle.tsx` | Sun/moon ghost icon button; persists to localStorage |
| `PluginSchemaForm` | `src/components/PluginSchemaForm.tsx` | JSON Schema → React form (boolean/string/number/readOnly fields) |
| `PluginSettingsPage` | `src/pages/PluginSettingsPage.tsx` | Generic plugin settings page (manifest-driven) |
| `Button` | `src/components/ui/button.tsx` | shadcn/ui Button (default, ghost, outline, destructive) |
| `Input` | `src/components/ui/input.tsx` | shadcn/ui Input |
@@ -188,10 +205,11 @@ Key functions:
- [x] UI component library: shadcn/ui + Tailwind CSS — installed and wired up
- [x] AppShell + Sidebar replacing inline Nav component
- [x] Light/dark theme context with OS preference detection
- [x] Generic plugin infrastructure: Extensions sidebar section, PluginSchemaForm, PluginSettingsPage
- [ ] Suggestion badges in DocumentsPage for `suggested_folder` / `suggested_filename` (confirm/reject buttons)
- [ ] Toast notification system (upload success, save feedback, errors)
- [ ] Loading skeletons
- [ ] `POST /queue/jobs` integration — show AI processing queue status / progress per document
- [ ] Re-process document button (`POST /documents/{id}/reprocess` — needs backend endpoint first)
- [ ] Advanced filter: extracted data fields (vendor, due date, amount) — needs backend support
- [x] Groups admin UI — list, create, edit, delete, add/remove members
- [ ] App permissions UI per group (blocked on backend group_app_permissions)
+2
View File
@@ -15,6 +15,7 @@ import DocumentsPage from "./pages/DocumentsPage";
import DocumentAdminSettingsPage from "./pages/DocumentAdminSettingsPage";
import AIAdminSettingsPage from "./pages/AIAdminSettingsPage";
import SettingsPage from "./pages/SettingsPage";
import PluginSettingsPage from "./pages/PluginSettingsPage";
function PrivateRoute({ children }: { children: React.ReactNode }) {
const { token } = useAuth();
@@ -55,6 +56,7 @@ export default function App() {
/>
<Route path="/profile" element={<PrivateRoute><ProfilePage /></PrivateRoute>} />
<Route path="/settings" element={<PrivateRoute><SettingsPage /></PrivateRoute>} />
<Route path="/settings/plugins/:id" element={<PrivateRoute><PluginSettingsPage /></PrivateRoute>} />
<Route path="/admin" element={<AdminRoute><AdminPage /></AdminRoute>} />
<Route path="/admin/users" element={<AdminRoute><AdminUsersPage /></AdminRoute>} />
<Route path="/admin/groups" element={<AdminRoute><AdminGroupsPage /></AdminRoute>} />
+60
View File
@@ -107,6 +107,10 @@ export interface DocumentOut {
created_at: string;
processed_at: string | null;
categories: CategoryOut[];
source: string;
watch_path: string | null;
suggested_folder: string | null;
suggested_filename: string | null;
}
export interface DocumentPage {
@@ -371,3 +375,59 @@ export const updateSystemPrompt = (
api
.patch<SystemPromptsData>(`/settings/system-prompts/${serviceId}`, data)
.then((r) => r.data);
// --- Document suggestions (watch-ingested documents) ---
export const confirmFolderSuggestion = (docId: string) =>
api.post(`/documents/${docId}/suggestions/folder/confirm`);
export const rejectFolderSuggestion = (docId: string) =>
api.post(`/documents/${docId}/suggestions/folder/reject`);
export const confirmFilenameSuggestion = (docId: string) =>
api.post(`/documents/${docId}/suggestions/filename/confirm`);
export const rejectFilenameSuggestion = (docId: string) =>
api.post(`/documents/${docId}/suggestions/filename/reject`);
// --- Plugins ---
export interface PluginOut {
id: string;
name: string;
icon: string;
version: string;
}
export interface PluginSchemaProperty {
type: string;
title: string;
description?: string;
readOnly?: boolean;
}
export interface PluginManifest {
id: string;
name: string;
icon: string;
version: string;
access: {
allow_superuser: boolean;
required_groups: string[];
};
settings_schema: {
type: string;
title?: string;
properties: Record<string, PluginSchemaProperty>;
};
}
export const getPlugins = () =>
api.get<PluginOut[]>("/plugins").then((r) => r.data);
export const getPluginManifest = (id: string) =>
api.get<PluginManifest>(`/plugins/${id}/manifest`).then((r) => r.data);
export const getPluginSettings = (id: string) =>
api.get<Record<string, unknown>>(`/plugins/${id}/settings`).then((r) => r.data);
export const updatePluginSettings = (id: string, data: Record<string, unknown>) =>
api.patch<Record<string, unknown>>(`/plugins/${id}/settings`, data).then((r) => r.data);
@@ -0,0 +1,119 @@
import { useState, useEffect } from "react";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { cn } from "@/lib/utils";
import type { PluginSchemaProperty } from "@/api/client";
interface PluginSchema {
type: string;
title?: string;
properties: Record<string, PluginSchemaProperty>;
}
interface PluginSchemaFormProps {
schema: PluginSchema;
values: Record<string, unknown>;
onSave: (values: Record<string, unknown>) => void;
isPending?: boolean;
isError?: boolean;
isSuccess?: boolean;
}
function Toggle({ checked, onChange }: { checked: boolean; onChange: (v: boolean) => void }) {
return (
<button
type="button"
role="switch"
aria-checked={checked}
onClick={() => onChange(!checked)}
className={cn(
"relative inline-flex h-6 w-11 shrink-0 items-center rounded-full transition-colors focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-primary",
checked ? "bg-primary" : "bg-muted/60 border border-border"
)}
>
<span
className={cn(
"inline-block h-4 w-4 rounded-full bg-white shadow-sm transition-transform",
checked ? "translate-x-6" : "translate-x-1"
)}
/>
</button>
);
}
export default function PluginSchemaForm({
schema,
values,
onSave,
isPending,
isError,
isSuccess,
}: PluginSchemaFormProps) {
const [form, setForm] = useState<Record<string, unknown>>(values);
useEffect(() => {
setForm(values);
}, [values]);
const setField = (key: string, value: unknown) => {
setForm((prev) => ({ ...prev, [key]: value }));
};
return (
<div className="space-y-6">
{Object.entries(schema.properties).map(([key, prop]) => (
<div key={key} className="space-y-1.5">
<div className="flex items-center justify-between gap-4">
<div className="min-w-0">
<p className="text-sm font-medium text-foreground">{prop.title}</p>
{prop.description && (
<p className="text-xs text-muted mt-0.5">{prop.description}</p>
)}
</div>
{prop.type === "boolean" && !prop.readOnly && (
<Toggle
checked={Boolean(form[key])}
onChange={(v) => setField(key, v)}
/>
)}
</div>
{prop.type === "string" && prop.readOnly && (
<p className="text-sm text-muted font-mono bg-muted/20 px-3 py-1.5 rounded-md border border-border">
{String(form[key] ?? "")}
</p>
)}
{prop.type === "string" && !prop.readOnly && (
<Input
value={String(form[key] ?? "")}
onChange={(e) => setField(key, e.target.value)}
className="h-9"
/>
)}
{prop.type === "number" && !prop.readOnly && (
<Input
type="number"
value={String(form[key] ?? "")}
onChange={(e) => setField(key, Number(e.target.value))}
className="h-9"
/>
)}
</div>
))}
<div className="flex items-center gap-3 pt-2">
<Button onClick={() => onSave(form)} disabled={isPending} size="sm">
{isPending ? "Saving…" : "Save changes"}
</Button>
{isError && (
<span className="text-sm text-destructive">Failed to save. Please try again.</span>
)}
{isSuccess && !isPending && (
<span className="text-sm text-green-600 dark:text-green-400">Saved successfully.</span>
)}
</div>
</div>
);
}
+44 -1
View File
@@ -16,11 +16,12 @@ import {
Users,
UsersRound,
Palette,
Puzzle,
} from "lucide-react";
import { Button } from "@/components/ui/button";
import ThemeToggle from "@/components/ThemeToggle";
import { useAuth } from "@/hooks/useAuth";
import { getMe, listCategories } from "@/api/client";
import { getMe, getPlugins, listCategories } from "@/api/client";
import { cn } from "@/lib/utils";
export default function Sidebar() {
@@ -56,6 +57,14 @@ export default function Sidebar() {
enabled: appsOpen && docsOpen && !!user,
});
const { data: plugins = [] } = useQuery({
queryKey: ["plugins"],
queryFn: getPlugins,
enabled: !!user,
// Empty array on 404/error — regular users simply see no plugins
retry: false,
});
const navItemClass = (isActive: boolean) =>
cn(
"flex items-center rounded-lg transition-colors",
@@ -209,6 +218,40 @@ export default function Sidebar() {
)}
</NavLink>
{/* Extensions — visible only when the user has accessible plugins */}
{plugins.length > 0 && (
<div>
{sidebarExpanded ? (
<>
<div className="px-3 py-1.5">
<span className="text-xs font-semibold uppercase tracking-wider text-muted">
Extensions
</span>
</div>
<div className="space-y-0.5">
{plugins.map((plugin) => (
<NavLink
key={plugin.id}
to={`/settings/plugins/${plugin.id}`}
className={({ isActive }) => subItemClass(isActive)}
>
<Puzzle className="h-4 w-4 shrink-0" />
<span className="whitespace-nowrap truncate">{plugin.name}</span>
</NavLink>
))}
</div>
</>
) : (
<NavLink
to={`/settings/plugins/${plugins[0].id}`}
className={({ isActive }) => navItemClass(isActive)}
>
<Puzzle className="h-5 w-5 shrink-0" />
</NavLink>
)}
</div>
)}
{/* Admin — expandable */}
{user?.is_admin && (
<div>
+63
View File
@@ -0,0 +1,63 @@
import { useParams } from "react-router-dom";
import { useQuery, useMutation, useQueryClient } from "@tanstack/react-query";
import { getPluginManifest, getPluginSettings, updatePluginSettings } from "@/api/client";
import PluginSchemaForm from "@/components/PluginSchemaForm";
export default function PluginSettingsPage() {
const { id } = useParams<{ id: string }>();
const queryClient = useQueryClient();
const { data: manifest, isLoading: manifestLoading, isError: manifestError } = useQuery({
queryKey: ["plugin-manifest", id],
queryFn: () => getPluginManifest(id!),
enabled: !!id,
retry: false,
});
const { data: settings, isLoading: settingsLoading } = useQuery({
queryKey: ["plugin-settings", id],
queryFn: () => getPluginSettings(id!),
enabled: !!id && !!manifest,
});
const mutation = useMutation({
mutationFn: (values: Record<string, unknown>) => updatePluginSettings(id!, values),
onSuccess: () => {
queryClient.invalidateQueries({ queryKey: ["plugin-settings", id] });
},
});
if (manifestLoading || settingsLoading) {
return <p className="text-sm text-muted p-6">Loading</p>;
}
if (manifestError || !manifest) {
return (
<p className="text-sm text-destructive p-6">
Plugin not found or you do not have access to its settings.
</p>
);
}
return (
<div className="max-w-xl p-6 space-y-6">
<div>
<h1 className="text-xl font-semibold text-foreground">
{manifest.settings_schema.title ?? manifest.name}
</h1>
<p className="text-sm text-muted mt-1">
{manifest.name} · v{manifest.version}
</p>
</div>
<PluginSchemaForm
schema={manifest.settings_schema}
values={settings ?? {}}
onSave={(values) => mutation.mutate(values)}
isPending={mutation.isPending}
isError={mutation.isError}
isSuccess={mutation.isSuccess}
/>
</div>
);
}