Add generic plugin architecture and watch-directory feature

Introduces a manifest contract so feature containers self-describe their
settings (JSON Schema + access rules). Backend and frontend gain generic
plugin proxy and dynamic Extensions UI with zero feature-specific code.

Doc-service is the first plugin consumer: exposes /plugin/manifest and
/plugin/settings, adds a watchdog-based file watcher that auto-ingests
PDFs from a mounted directory, maps subfolders to categories, supports
AI-suggested folder/filename (user-confirmed), and enforces a no-remove
policy. Access is gated by is_superuser or doc-service-admin group.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
curo1305
2026-04-18 02:09:50 +02:00
parent 2d7207b62f
commit 00466a9801
29 changed files with 1373 additions and 52 deletions
+31 -1
View File
@@ -1,15 +1,45 @@
import asyncio
import logging
from contextlib import asynccontextmanager
from fastapi import FastAPI
from app.core.config import settings
from app.routers import categories, documents
from app.routers import plugin as plugin_router
app = FastAPI(title=settings.PROJECT_NAME)
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run the optional watch-directory file watcher for the app's lifetime.

    On startup the storage config is read; when ``watch_enabled`` is truthy a
    ``FileWatcherService`` is created and started on ``watch_path``. Any
    failure while starting the watcher is logged and swallowed so the API
    still comes up without file watching.

    On shutdown the watcher (if one was created) is stopped. The stop call
    sits in a ``finally`` so it also runs when the lifespan context is left
    via an exception or cancellation, not only on a clean exit.
    """
    loop = asyncio.get_running_loop()
    watcher = None
    try:
        # Imported lazily so a broken watcher dependency cannot prevent startup.
        from app.services.config_reader import get_storage_config

        storage_config = await get_storage_config()
        if storage_config.get("watch_enabled"):
            from app.services.file_watcher import FileWatcherService

            watcher = FileWatcherService(loop)
            await watcher.start(storage_config["watch_path"], storage_config)
    except Exception as exc:
        # Best-effort feature: the service must keep working without the watcher.
        logger.warning("[doc-service] File watcher could not start: %s", exc)

    try:
        yield
    finally:
        if watcher is not None:
            await watcher.stop()
# The lifespan handler starts the optional watch-directory watcher on startup
# and stops it on shutdown.
app = FastAPI(title=settings.PROJECT_NAME, lifespan=lifespan)
# No CORS — this service is only reachable from the main backend on backend-net.
# All browser traffic goes through the main backend proxy.
app.include_router(documents.router, prefix="/documents", tags=["documents"])
app.include_router(categories.router, prefix="/categories", tags=["categories"])
# Plugin manifest/settings endpoints, mounted under /plugin.
app.include_router(plugin_router.router, prefix="/plugin", tags=["plugin"])
@app.get("/health")
@@ -27,6 +27,12 @@ class Document(Base):
)
processed_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
# Watch-directory ingestion fields (migration 0003)
source: Mapped[str] = mapped_column(String(16), nullable=False, default="upload")
watch_path: Mapped[str | None] = mapped_column(String, nullable=True)
suggested_folder: Mapped[str | None] = mapped_column(String(128), nullable=True)
suggested_filename: Mapped[str | None] = mapped_column(String(500), nullable=True)
category_assignments: Mapped[list["CategoryAssignment"]] = relationship(
"CategoryAssignment", back_populates="document", cascade="all, delete-orphan"
)
@@ -5,6 +5,8 @@ from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import or_
from app.database import AsyncSessionLocal, get_db
from app.deps import get_user_id
from app.models.category import DocumentCategory
@@ -15,6 +17,9 @@ from app.services.ai_client import classify_document
router = APIRouter()
# Sentinel user_id for watch-ingested categories — must match documents.py
_WATCH_USER_ID = "watch"
_SIMILARITY_THRESHOLD = 0.4
@@ -81,9 +86,10 @@ async def list_categories(
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> list[DocumentCategory]:
# Include watch-ingested categories so they appear in the sidebar/filter
result = await db.execute(
select(DocumentCategory)
.where(DocumentCategory.user_id == user_id)
.where(or_(DocumentCategory.user_id == user_id, DocumentCategory.user_id == _WATCH_USER_ID))
.order_by(DocumentCategory.name)
)
return result.scalars().all()
+106 -6
View File
@@ -26,13 +26,21 @@ router = APIRouter()
_DEFAULT_MAX_BYTES = 20 * 1024 * 1024
# Sentinel user_id used for watch-directory-ingested documents.
# These documents are visible to all authenticated users.
_WATCH_USER_ID = "watch"
# ── Helpers ───────────────────────────────────────────────────────────────────
async def _get_user_doc(doc_id: str, user_id: str, db: AsyncSession) -> Document:
"""Fetch a document owned by user_id OR a watch-ingested document (visible to all)."""
result = await db.execute(
select(Document)
.where(Document.id == doc_id, Document.user_id == user_id)
.where(
Document.id == doc_id,
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
)
.options(
selectinload(Document.category_assignments)
.selectinload(CategoryAssignment.category)
@@ -61,6 +69,10 @@ def _doc_with_categories(doc: Document) -> DocumentOut:
created_at=doc.created_at,
processed_at=doc.processed_at,
categories=cats,
source=doc.source,
watch_path=doc.watch_path,
suggested_folder=doc.suggested_folder,
suggested_filename=doc.suggested_filename,
)
@@ -183,7 +195,8 @@ async def list_documents(
sort_expr = sort_col.desc() if order == "desc" else sort_col.asc()
# Build filter conditions once and reuse for both count + items queries.
conditions = [Document.user_id == user_id]
# Watch-ingested documents (user_id = "watch") are visible to all users.
conditions = [or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID)]
if status:
conditions.append(Document.status == status)
if document_type:
@@ -247,7 +260,10 @@ async def get_document_status(
db: AsyncSession = Depends(get_db),
) -> Document:
result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
select(Document).where(
Document.id == doc_id,
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
)
)
doc = result.scalar_one_or_none()
if doc is None:
@@ -347,7 +363,10 @@ async def download_file(
db: AsyncSession = Depends(get_db),
) -> StreamingResponse:
result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
select(Document).where(
Document.id == doc_id,
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
)
)
doc = result.scalar_one_or_none()
if doc is None:
@@ -374,9 +393,12 @@ async def assign_category(
user_id: str = Depends(get_user_id),
db: AsyncSession = Depends(get_db),
) -> None:
# Verify both belong to this user
# Verify the document is accessible (own or watch-ingested)
doc_result = await db.execute(
select(Document).where(Document.id == doc_id, Document.user_id == user_id)
select(Document).where(
Document.id == doc_id,
or_(Document.user_id == user_id, Document.user_id == _WATCH_USER_ID),
)
)
if doc_result.scalar_one_or_none() is None:
raise HTTPException(status_code=404, detail="Document not found")
@@ -418,3 +440,81 @@ async def remove_category(
if assignment:
await db.delete(assignment)
await db.commit()
# ── AI suggestion confirmation ────────────────────────────────────────────────
# These endpoints allow users to confirm or reject AI suggestions on
# watch-ingested documents. No disk mutations — suggestions only update the DB.
@router.post("/{doc_id}/suggestions/folder/confirm", status_code=204)
async def confirm_folder_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Accept the pending AI folder suggestion for a document.

    The suggested name becomes a category owned by the watch sentinel user
    (created on first use), the document is assigned to it, and the pending
    suggestion is cleared. Only the database is touched.

    Raises:
        HTTPException: 400 when the document has no pending folder suggestion.
    """
    doc = await _get_user_doc(doc_id, user_id, db)
    if not doc.suggested_folder:
        raise HTTPException(status_code=400, detail="No folder suggestion pending")

    # Truncate once so the lookup and the creation below agree on the name;
    # otherwise an over-long suggestion would never match the stored category.
    category_name = doc.suggested_folder[:128]

    # Find or create the suggested category under the watch sentinel user
    cat_result = await db.execute(
        select(DocumentCategory).where(
            DocumentCategory.user_id == _WATCH_USER_ID,
            DocumentCategory.name == category_name,
        )
    )
    cat = cat_result.scalar_one_or_none()
    if cat is None:
        cat = DocumentCategory(user_id=_WATCH_USER_ID, name=category_name)
        db.add(cat)
        # Flush (not commit) to obtain cat.id while keeping category creation,
        # assignment and suggestion clearing in a single transaction.
        await db.flush()

    # Assign if not already assigned
    exists = await db.execute(
        select(CategoryAssignment).where(
            CategoryAssignment.document_id == doc_id,
            CategoryAssignment.category_id == cat.id,
        )
    )
    if exists.scalar_one_or_none() is None:
        db.add(CategoryAssignment(document_id=doc_id, category_id=cat.id))

    doc.suggested_folder = None
    await db.commit()
@router.post("/{doc_id}/suggestions/folder/reject", status_code=204)
async def reject_folder_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Dismiss the pending AI folder suggestion for a document.

    The document is resolved through ``_get_user_doc`` (own or watch-ingested)
    and its ``suggested_folder`` is cleared; no category is created or assigned.
    """
    document = await _get_user_doc(doc_id, user_id, db)
    document.suggested_folder = None
    await db.commit()
@router.post("/{doc_id}/suggestions/filename/confirm", status_code=204)
async def confirm_filename_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Accept the pending AI filename suggestion as the document title.

    Only the database record changes: the suggestion is copied into ``title``
    and then cleared.

    Raises:
        HTTPException: 400 when the document has no pending filename suggestion.
    """
    document = await _get_user_doc(doc_id, user_id, db)
    proposed_title = document.suggested_filename
    if not proposed_title:
        raise HTTPException(status_code=400, detail="No filename suggestion pending")

    document.title = proposed_title
    document.suggested_filename = None
    await db.commit()
@router.post("/{doc_id}/suggestions/filename/reject", status_code=204)
async def reject_filename_suggestion(
    doc_id: str,
    user_id: str = Depends(get_user_id),
    db: AsyncSession = Depends(get_db),
) -> None:
    """Dismiss the pending AI filename suggestion for a document.

    The document is resolved through ``_get_user_doc`` (own or watch-ingested)
    and its ``suggested_filename`` is cleared; the title is left untouched.
    """
    document = await _get_user_doc(doc_id, user_id, db)
    document.suggested_filename = None
    await db.commit()
@@ -0,0 +1,97 @@
"""
Plugin manifest and settings endpoints for doc-service.
These are internal-only — they are called by the main backend's generic plugin
proxy, never directly by the browser. No authentication is applied here because
the backend enforces access control before forwarding the request.
Endpoints:
GET /plugin/manifest → static manifest with JSON Schema for settings
GET /plugin/settings → current storage config values
PATCH /plugin/settings → update storage config (partial update)
"""
from fastapi import APIRouter
from pydantic import BaseModel
from app.services.config_reader import get_storage_config, save_storage_config
router = APIRouter()
# Static plugin manifest returned by GET /plugin/manifest.
# It carries the plugin identity, the access rules and a JSON Schema that
# describes the editable storage/watch settings.
_MANIFEST: dict = {
    "id": "doc-service",
    "name": "Document Service",
    "icon": "file-text",
    "version": "1.0",
    # Access rules: superuser flag plus the group list for this plugin.
    "access": {
        "allow_superuser": True,
        "required_groups": ["doc-service-admin"],
    },
    # JSON Schema for the settings form; property names mirror the keys of
    # the storage config block served by GET /plugin/settings.
    "settings_schema": {
        "type": "object",
        "title": "Storage & Watch",
        "properties": {
            "watch_enabled": {
                "type": "boolean",
                "title": "Enable file watching",
                "description": (
                    "Automatically ingest PDF files added to the mounted watch directory. "
                    "Requires a service restart to take effect after toggling."
                ),
            },
            # Read-only in the schema; StorageSettingsUpdate also omits it.
            "watch_path": {
                "type": "string",
                "title": "Watch path",
                "readOnly": True,
                "description": "Configured via Docker volume mount — edit docker-compose to change.",
            },
            "ai_folder_suggestion": {
                "type": "boolean",
                "title": "AI folder suggestion",
                "description": (
                    "AI suggests a category for each ingested document. "
                    "You must confirm the suggestion before it is applied."
                ),
            },
            "ai_folder_default": {
                "type": "string",
                "title": "Default import category",
                "description": "Category assigned automatically when AI folder suggestion is disabled.",
            },
            "ai_rename_suggestion": {
                "type": "boolean",
                "title": "AI rename suggestion",
                "description": (
                    "AI suggests a document title for each ingested file. "
                    "You must confirm before it is applied."
                ),
            },
        },
    },
}
# Partial-update payload for PATCH /plugin/settings.
# Every field is optional; a field left as None is dropped by the endpoint
# (model_dump(exclude_none=True)) and therefore keeps its stored value.
class StorageSettingsUpdate(BaseModel):
    # Toggle for the watch-directory watcher.
    watch_enabled: bool | None = None
    # Toggle for AI category suggestions on ingested documents.
    ai_folder_suggestion: bool | None = None
    # Name of the default import category.
    ai_folder_default: str | None = None
    # Toggle for AI title suggestions on ingested documents.
    ai_rename_suggestion: bool | None = None
    # watch_path is intentionally excluded — it cannot be changed via API
@router.get("/manifest")
async def get_manifest() -> dict:
    """Serve the static plugin manifest (identity, access rules, settings schema)."""
    manifest = _MANIFEST
    return manifest
@router.get("/settings")
async def get_settings() -> dict:
    """Return the current storage config values as provided by get_storage_config()."""
    current = await get_storage_config()
    return current
@router.patch("/settings")
async def update_settings(body: StorageSettingsUpdate) -> dict:
    """Apply a partial update to the storage config and return the new values.

    Fields left as ``None`` in the request are ignored. ``ai_folder_default``
    is normalised to a trimmed name of at most 128 characters, falling back to
    ``"imports"`` when nothing remains.
    """
    update = body.model_dump(exclude_none=True)
    if "ai_folder_default" in update:
        # Strip BEFORE truncating: otherwise leading whitespace eats into the
        # 128-char budget and a valid name could collapse to the fallback.
        # The trailing rstrip removes a space exposed by the cut.
        cleaned = update["ai_folder_default"].strip()[:128].rstrip()
        update["ai_folder_default"] = cleaned or "imports"
    await save_storage_config(update)
    return await get_storage_config()
@@ -23,6 +23,10 @@ class DocumentOut(BaseModel):
created_at: datetime
processed_at: datetime | None
categories: list[CategoryOut] = []
source: str = "upload"
watch_path: str | None = None
suggested_folder: str | None = None
suggested_filename: str | None = None
model_config = {"from_attributes": True}
@@ -14,6 +14,14 @@ from pathlib import Path
from app.core.config import settings
# Defaults for the "storage" config block; get_storage_config() overlays the
# persisted values on top of a copy of this dict.
_DEFAULT_STORAGE_CONFIG: dict = {
    # File watching is opt-in.
    "watch_enabled": False,
    # Default location of the watched directory.
    "watch_path": "/data/watch",
    # AI category suggestions are off by default.
    "ai_folder_suggestion": False,
    # Default import category name.
    # NOTE(review): confirm where this key is consumed during ingestion.
    "ai_folder_default": "imports",
    # AI title suggestions are off by default.
    "ai_rename_suggestion": False,
}
_DEFAULT_SYSTEM_PROMPT = (
"You are a financial document analysis assistant. "
"Given the text extracted from a PDF document, return ONLY a JSON object "
@@ -43,6 +51,7 @@ _DEFAULT_USER_TEMPLATE = (
_DEFAULT_CONFIG: dict = {
"documents": {"max_pdf_bytes": 20 * 1024 * 1024},
"storage": _DEFAULT_STORAGE_CONFIG,
"system_prompts": {
"system": _DEFAULT_SYSTEM_PROMPT,
"user_template": _DEFAULT_USER_TEMPLATE,
@@ -64,6 +73,25 @@ def _read_config_sync() -> dict:
return _apply_env_overrides(base)
def _read_config_sync_raw() -> dict:
    """Read the config file without env overrides — used before writing back to disk.

    Returns:
        The parsed JSON document, or a deep copy of ``_DEFAULT_CONFIG`` when
        the config file does not exist.
    """
    path = Path(settings.CONFIG_PATH)
    try:
        # Open directly instead of exists()+open(): no window in which the
        # file can vanish between the check and the read. JSON is read as
        # UTF-8 regardless of the process locale.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return deepcopy(_DEFAULT_CONFIG)
def _write_config_sync(config: dict) -> None:
    """Atomically write config JSON to disk.

    The document is serialised to a sibling ``.tmp`` file which is then moved
    over the real path with ``os.replace``, so readers never observe a
    half-written file. If serialisation or the replace fails, the temporary
    file is removed and the original error is re-raised.
    """
    path = Path(settings.CONFIG_PATH)
    tmp = path.with_suffix(".tmp")
    tmp.parent.mkdir(parents=True, exist_ok=True)
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=2)
        os.replace(tmp, path)
    except BaseException:
        # Do not leave a stale temp file next to the config on failure.
        tmp.unlink(missing_ok=True)
        raise
def _apply_env_overrides(config: dict) -> dict:
cfg = deepcopy(config)
docs = cfg.setdefault("documents", {})
@@ -84,3 +112,22 @@ async def load_doc_config() -> dict:
_cache = data
_cache_at = now
return data
async def get_storage_config() -> dict:
    """Return the storage settings, with defaults supplied for absent keys.

    Starts from a copy of ``_DEFAULT_STORAGE_CONFIG`` and overlays whatever
    the loaded config holds under ``"storage"``.
    """
    loaded = await load_doc_config()
    stored_values = loaded.get("storage", {})
    merged = deepcopy(_DEFAULT_STORAGE_CONFIG)
    merged.update(stored_values)
    return merged
async def save_storage_config(data: dict) -> None:
    """Persist ``data`` into the ``"storage"`` block of the on-disk config.

    The raw file (without env overrides) is read, the storage block is
    updated in place with ``data``, and the result is written back. Both disk
    operations run in a worker thread. The module cache is reset afterwards.
    """
    global _cache, _cache_at
    on_disk = await asyncio.to_thread(_read_config_sync_raw)
    storage_block = on_disk.setdefault("storage", {})
    storage_block.update(data)
    await asyncio.to_thread(_write_config_sync, on_disk)
    # Drop the cached config so the next load re-reads the file
    _cache, _cache_at = None, 0.0
@@ -0,0 +1,256 @@
"""
File-system watcher for the watch directory.
Uses the watchdog library to monitor a configured directory for new PDF files.
When a PDF is detected, it is automatically ingested into the document service
(copied to /data/documents, a DB record is created, and the AI pipeline runs).
Key design decisions:
- No-remove policy: on_deleted and on_modified events are intentionally ignored;
on_moved is handled only to ingest the destination of an atomic rename.
The watcher never deletes, moves, or modifies files on the watched volume.
- Watch documents use user_id="watch" as a sentinel so they are visible to
all authenticated users in the document list.
- Subfolder names map to categories: a file at invoices/bill.pdf is assigned
to an "invoices" category (auto-created if needed).
- Suggestions: if ai_folder_suggestion or ai_rename_suggestion are enabled,
the relevant fields are set on the document after AI processing so users
can confirm/reject from the UI.
- Thread → async bridge: watchdog runs in a daemon thread; asyncio coroutines
are dispatched from that thread via run_coroutine_threadsafe.
"""
import asyncio
import json
import logging
import uuid
from pathlib import Path
from watchdog.events import FileSystemEventHandler
from watchdog.observers import Observer
from app.database import AsyncSessionLocal
from app.models.category import DocumentCategory
from app.models.category_assignment import CategoryAssignment
from app.models.document import Document
from app.services.storage import save_upload
logger = logging.getLogger(__name__)
# Must match _WATCH_USER_ID in app/routers/documents.py
WATCH_USER_ID = "watch"
# ── Ingestion logic ───────────────────────────────────────────────────────────
async def ingest_file(path_str: str, watch_root: Path, config: dict) -> None:
"""
Ingest a single PDF file from the watch directory.
Idempotent: skips files that already have a non-failed document record.
"""
from sqlalchemy import select
path = Path(path_str)
if not path.exists() or not path.is_file():
return
async with AsyncSessionLocal() as db:
# Idempotency check — skip if already tracked (and not failed)
existing_result = await db.execute(
select(Document).where(Document.watch_path == path_str)
)
existing = existing_result.scalar_one_or_none()
if existing is not None and existing.status != "failed":
return
# Determine category from the first subfolder component
try:
rel = path.relative_to(watch_root)
folder_name = rel.parts[0] if len(rel.parts) > 1 else None
except ValueError:
folder_name = None
# Read file bytes
try:
file_data = path.read_bytes()
except OSError as exc:
logger.warning("[watcher] Cannot read %s: %s", path_str, exc)
return
# Save a copy to /data/documents/watch/{doc_id}.pdf
doc_id = existing.id if existing is not None else str(uuid.uuid4())
dest = await save_upload(file_data, WATCH_USER_ID, doc_id)
if existing is not None:
# Re-ingest a previously failed document
existing.file_path = str(dest)
existing.file_size = len(file_data)
existing.status = "pending"
existing.error_message = None
await db.commit()
else:
doc = Document(
id=doc_id,
user_id=WATCH_USER_ID,
source="watch",
watch_path=path_str,
filename=path.name,
file_path=str(dest),
file_size=len(file_data),
status="pending",
)
db.add(doc)
await db.commit()
# Auto-assign category from subfolder name
if folder_name:
cat_result = await db.execute(
select(DocumentCategory).where(
DocumentCategory.user_id == WATCH_USER_ID,
DocumentCategory.name == folder_name,
)
)
cat = cat_result.scalar_one_or_none()
if cat is None:
cat = DocumentCategory(user_id=WATCH_USER_ID, name=folder_name[:128])
db.add(cat)
await db.commit()
await db.refresh(cat)
exists_assign = await db.execute(
select(CategoryAssignment).where(
CategoryAssignment.document_id == doc_id,
CategoryAssignment.category_id == cat.id,
)
)
if exists_assign.scalar_one_or_none() is None:
db.add(CategoryAssignment(document_id=doc_id, category_id=cat.id))
await db.commit()
# Run AI pipeline (opens its own session internally)
from app.routers.documents import process_document
await process_document(doc_id)
# Set AI suggestions if enabled
if config.get("ai_folder_suggestion") or config.get("ai_rename_suggestion"):
await _apply_suggestions(doc_id, config)
async def _apply_suggestions(doc_id: str, config: dict) -> None:
    """Populate suggested_folder / suggested_filename after AI processing.

    Nothing is written unless the document exists, has status ``"done"`` and
    carries ``extracted_data`` that parses to a JSON object. The folder
    suggestion is the first entry of ``suggested_categories``; the filename
    suggestion is ``title``. Values are truncated to 128 and 500 characters.

    Args:
        doc_id: ID of the document that was just processed.
        config: Storage config; reads ``ai_folder_suggestion`` and
            ``ai_rename_suggestion``.
    """
    from sqlalchemy import select

    async with AsyncSessionLocal() as db:
        result = await db.execute(select(Document).where(Document.id == doc_id))
        doc = result.scalar_one_or_none()
        if doc is None or doc.status != "done" or not doc.extracted_data:
            return

        try:
            extracted = json.loads(doc.extracted_data)
        except (TypeError, ValueError):
            # Not a string, or not valid JSON (JSONDecodeError is a ValueError).
            return
        if not isinstance(extracted, dict):
            # AI output is not guaranteed to be an object; nothing to read from.
            return

        changed = False

        if config.get("ai_folder_suggestion"):
            suggestions = extracted.get("suggested_categories")
            # Require a real list: indexing a bare string would yield its
            # first character, and a dict would raise KeyError.
            if isinstance(suggestions, list) and suggestions and suggestions[0]:
                doc.suggested_folder = str(suggestions[0])[:128]
                changed = True

        if config.get("ai_rename_suggestion"):
            title = extracted.get("title")
            if title:
                doc.suggested_filename = str(title)[:500]
                changed = True

        if changed:
            await db.commit()
# ── Watchdog event handler ────────────────────────────────────────────────────
class _PdfEventHandler(FileSystemEventHandler):
    """Watchdog handler that schedules ingestion of new PDF files.

    Event callbacks run on the watchdog thread; ingestion is handed over to
    the asyncio loop with ``run_coroutine_threadsafe``. Only creation and move
    events are handled.
    """

    def __init__(
        self,
        watch_root: Path,
        loop: asyncio.AbstractEventLoop,
        config: dict,
    ) -> None:
        super().__init__()
        self._watch_root = watch_root
        self._loop = loop
        self._config = config

    def _dispatch_ingest(self, path_str: str) -> None:
        """Schedule ``ingest_file`` on the event loop for ``.pdf`` paths (case-insensitive)."""
        if not path_str.lower().endswith(".pdf"):
            return
        future = asyncio.run_coroutine_threadsafe(
            ingest_file(path_str, self._watch_root, self._config),
            self._loop,
        )
        # Without a callback the future is dropped and any ingestion error
        # would vanish silently.
        future.add_done_callback(
            lambda fut: self._log_ingest_failure(path_str, fut)
        )

    @staticmethod
    def _log_ingest_failure(path_str: str, future) -> None:
        """Log the exception (if any) raised by a dispatched ingestion."""
        if future.cancelled():
            return
        exc = future.exception()
        if exc is not None:
            logger.error(
                "[watcher] ingest failed for %s: %s", path_str, exc, exc_info=exc
            )

    def on_created(self, event):  # type: ignore[override]
        if not event.is_directory:
            self._dispatch_ingest(event.src_path)

    def on_moved(self, event):  # type: ignore[override]
        # Handles atomic rename/move (e.g. Nextcloud or Syncthing completing a sync)
        if not event.is_directory:
            self._dispatch_ingest(event.dest_path)

    # on_deleted / on_modified: intentionally not overridden — no-remove policy
# ── Service ───────────────────────────────────────────────────────────────────
class FileWatcherService:
    """Manages the watchdog Observer lifecycle within the FastAPI lifespan."""

    def __init__(self, loop: asyncio.AbstractEventLoop) -> None:
        self._loop = loop
        self._observer: Observer | None = None
        self._watch_root: Path | None = None
        self._config: dict = {}
        # Strong reference to the startup-scan task; the event loop itself
        # only keeps weak references to tasks.
        self._scan_task: asyncio.Task | None = None

    async def start(self, watch_path: str, config: dict) -> None:
        """Start observing ``watch_path`` recursively and kick off the startup scan.

        If the path does not exist a warning is logged and nothing is started.
        """
        self._watch_root = Path(watch_path)
        self._config = config

        if not self._watch_root.exists():
            logger.warning(
                "[watcher] Watch path %s does not exist — file watching disabled",
                watch_path,
            )
            return

        handler = _PdfEventHandler(self._watch_root, self._loop, config)
        self._observer = Observer()
        self._observer.schedule(handler, watch_path, recursive=True)
        self._observer.start()
        logger.info("[watcher] started, watching %s", watch_path)

        # Run startup scan as a background task so startup is not blocked
        self._scan_task = asyncio.create_task(self._scan_existing())

    @staticmethod
    def _list_pdfs(root: Path) -> list[Path]:
        """Return all paths under ``root`` whose name ends in .pdf, sorted.

        The match is case-insensitive so the startup scan accepts the same
        files as the event handler (e.g. ``SCAN.PDF``).
        """
        return sorted(p for p in root.rglob("*") if p.name.lower().endswith(".pdf"))

    async def _scan_existing(self) -> None:
        """Ingest any PDFs already present in the watch directory."""
        if self._watch_root is None:
            return
        logger.info("[watcher] scanning existing files in %s", self._watch_root)

        # Walk the tree in a worker thread; a large directory would otherwise
        # block the event loop.
        try:
            pdf_paths = await asyncio.to_thread(self._list_pdfs, self._watch_root)
        except OSError as exc:
            logger.warning("[watcher] cannot scan %s: %s", self._watch_root, exc)
            return

        count = 0
        for pdf_path in pdf_paths:
            try:
                await ingest_file(str(pdf_path), self._watch_root, self._config)
                count += 1
            except Exception as exc:
                # One bad file must not abort the rest of the scan.
                logger.warning("[watcher] scan error for %s: %s", pdf_path, exc)
        logger.info("[watcher] startup scan complete — processed %d file(s)", count)

    async def stop(self) -> None:
        """Cancel a running startup scan and stop the observer thread."""
        scan_task = self._scan_task
        self._scan_task = None
        if scan_task is not None and not scan_task.done():
            scan_task.cancel()
            try:
                await scan_task
            except asyncio.CancelledError:
                pass

        if self._observer is not None:
            self._observer.stop()
            # join() blocks, so wait for the observer thread off the loop.
            await asyncio.to_thread(self._observer.join)
            self._observer = None
            logger.info("[watcher] stopped")